author		Ingo Molnar <mingo@elte.hu>	2009-01-10 20:42:53 -0500
committer	Ingo Molnar <mingo@elte.hu>	2009-01-10 20:42:53 -0500
commit		506c10f26c481b7f8ef27c1c79290f68989b2e9e (patch)
tree		03de82e812f00957aa6276dac2fe51c3358e88d7 /fs
parent		e1df957670aef74ffd9a4ad93e6d2c90bf6b4845 (diff)
parent		c59765042f53a79a7a65585042ff463b69cb248c (diff)

Merge commit 'v2.6.29-rc1' into perfcounters/core

Conflicts:
	include/linux/kernel_stat.h
Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig | 137
-rw-r--r-- fs/Kconfig.binfmt | 2
-rw-r--r-- fs/Makefile | 8
-rw-r--r-- fs/affs/file.c | 2
-rw-r--r-- fs/affs/inode.c | 3
-rw-r--r-- fs/afs/write.c | 2
-rw-r--r-- fs/aio.c | 100
-rw-r--r-- fs/anon_inodes.c | 7
-rw-r--r-- fs/autofs/inode.c | 2
-rw-r--r-- fs/autofs4/autofs_i.h | 2
-rw-r--r-- fs/autofs4/dev-ioctl.c | 75
-rw-r--r-- fs/autofs4/expire.c | 4
-rw-r--r-- fs/autofs4/inode.c | 18
-rw-r--r-- fs/autofs4/waitq.c | 8
-rw-r--r-- fs/bad_inode.c | 6
-rw-r--r-- fs/befs/linuxvfs.c | 5
-rw-r--r-- fs/bfs/inode.c | 45
-rw-r--r-- fs/binfmt_aout.c | 81
-rw-r--r-- fs/binfmt_elf.c | 12
-rw-r--r-- fs/binfmt_elf_fdpic.c | 35
-rw-r--r-- fs/binfmt_flat.c | 34
-rw-r--r-- fs/binfmt_misc.c | 5
-rw-r--r-- fs/bio-integrity.c | 2
-rw-r--r-- fs/bio.c | 356
-rw-r--r-- fs/block_dev.c | 42
-rw-r--r-- fs/btrfs/Makefile | 25
-rw-r--r-- fs/btrfs/acl.c | 351
-rw-r--r-- fs/btrfs/async-thread.c | 419
-rw-r--r-- fs/btrfs/async-thread.h | 101
-rw-r--r-- fs/btrfs/btrfs_inode.h | 131
-rw-r--r-- fs/btrfs/compat.h | 7
-rw-r--r-- fs/btrfs/compression.c | 709
-rw-r--r-- fs/btrfs/compression.h | 47
-rw-r--r-- fs/btrfs/crc32c.h | 29
-rw-r--r-- fs/btrfs/ctree.c | 3953
-rw-r--r-- fs/btrfs/ctree.h | 2129
-rw-r--r-- fs/btrfs/dir-item.c | 386
-rw-r--r-- fs/btrfs/disk-io.c | 2343
-rw-r--r-- fs/btrfs/disk-io.h | 102
-rw-r--r-- fs/btrfs/export.c | 203
-rw-r--r-- fs/btrfs/export.h | 19
-rw-r--r-- fs/btrfs/extent-tree.c | 5986
-rw-r--r-- fs/btrfs/extent_io.c | 3717
-rw-r--r-- fs/btrfs/extent_io.h | 269
-rw-r--r-- fs/btrfs/extent_map.c | 351
-rw-r--r-- fs/btrfs/extent_map.h | 62
-rw-r--r-- fs/btrfs/file-item.c | 831
-rw-r--r-- fs/btrfs/file.c | 1288
-rw-r--r-- fs/btrfs/free-space-cache.c | 495
-rw-r--r-- fs/btrfs/hash.h | 27
-rw-r--r-- fs/btrfs/inode-item.c | 206
-rw-r--r-- fs/btrfs/inode-map.c | 144
-rw-r--r-- fs/btrfs/inode.c | 5035
-rw-r--r-- fs/btrfs/ioctl.c | 1132
-rw-r--r-- fs/btrfs/ioctl.h | 67
-rw-r--r-- fs/btrfs/locking.c | 88
-rw-r--r-- fs/btrfs/locking.h | 27
-rw-r--r-- fs/btrfs/ordered-data.c | 730
-rw-r--r-- fs/btrfs/ordered-data.h | 158
-rw-r--r-- fs/btrfs/orphan.c | 67
-rw-r--r-- fs/btrfs/print-tree.c | 216
-rw-r--r-- fs/btrfs/print-tree.h | 23
-rw-r--r-- fs/btrfs/ref-cache.c | 230
-rw-r--r-- fs/btrfs/ref-cache.h | 77
-rw-r--r-- fs/btrfs/root-tree.c | 366
-rw-r--r-- fs/btrfs/struct-funcs.c | 139
-rw-r--r-- fs/btrfs/super.c | 722
-rw-r--r-- fs/btrfs/sysfs.c | 269
-rw-r--r-- fs/btrfs/transaction.c | 1097
-rw-r--r-- fs/btrfs/transaction.h | 106
-rw-r--r-- fs/btrfs/tree-defrag.c | 147
-rw-r--r-- fs/btrfs/tree-log.c | 2898
-rw-r--r-- fs/btrfs/tree-log.h | 41
-rw-r--r-- fs/btrfs/version.h | 4
-rw-r--r-- fs/btrfs/version.sh | 43
-rw-r--r-- fs/btrfs/volumes.c | 3218
-rw-r--r-- fs/btrfs/volumes.h | 162
-rw-r--r-- fs/btrfs/xattr.c | 322
-rw-r--r-- fs/btrfs/xattr.h | 39
-rw-r--r-- fs/btrfs/zlib.c | 632
-rw-r--r-- fs/buffer.c | 98
-rw-r--r-- fs/char_dev.c | 2
-rw-r--r-- fs/cifs/Makefile | 2
-rw-r--r-- fs/cifs/cifsfs.c | 7
-rw-r--r-- fs/cifs/cifsfs.h | 1
-rw-r--r-- fs/cifs/fcntl.c | 118
-rw-r--r-- fs/cifs/file.c | 2
-rw-r--r-- fs/cifs/inode.c | 2
-rw-r--r-- fs/coda/file.c | 12
-rw-r--r-- fs/coda/sysctl.c | 5
-rw-r--r-- fs/compat.c | 6
-rw-r--r-- fs/configfs/inode.c | 3
-rw-r--r-- fs/cramfs/inode.c | 2
-rw-r--r-- fs/dcache.c | 37
-rw-r--r-- fs/dcookies.c | 28
-rw-r--r-- fs/debugfs/file.c | 32
-rw-r--r-- fs/debugfs/inode.c | 3
-rw-r--r-- fs/devpts/inode.c | 472
-rw-r--r-- fs/direct-io.c | 13
-rw-r--r-- fs/dlm/ast.c | 56
-rw-r--r-- fs/dlm/ast.h | 4
-rw-r--r-- fs/dlm/debug_fs.c | 310
-rw-r--r-- fs/dlm/dir.c | 18
-rw-r--r-- fs/dlm/dlm_internal.h | 4
-rw-r--r-- fs/dlm/lock.c | 31
-rw-r--r-- fs/dlm/lowcomms.c | 8
-rw-r--r-- fs/dlm/memory.c | 6
-rw-r--r-- fs/dlm/midcomms.c | 2
-rw-r--r-- fs/dlm/netlink.c | 1
-rw-r--r-- fs/dlm/user.c | 4
-rw-r--r-- fs/dlm/user.h | 2
-rw-r--r-- fs/dquot.c | 438
-rw-r--r-- fs/ecryptfs/crypto.c | 514
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 105
-rw-r--r-- fs/ecryptfs/file.c | 45
-rw-r--r-- fs/ecryptfs/inode.c | 303
-rw-r--r-- fs/ecryptfs/keystore.c | 651
-rw-r--r-- fs/ecryptfs/main.c | 126
-rw-r--r-- fs/ecryptfs/messaging.c | 4
-rw-r--r-- fs/ecryptfs/miscdev.c | 18
-rw-r--r-- fs/ecryptfs/mmap.c | 2
-rw-r--r-- fs/exec.c | 86
-rw-r--r-- fs/ext2/ialloc.c | 14
-rw-r--r-- fs/ext2/inode.c | 9
-rw-r--r-- fs/ext2/ioctl.c | 3
-rw-r--r-- fs/ext2/namei.c | 15
-rw-r--r-- fs/ext2/super.c | 10
-rw-r--r-- fs/ext3/hash.c | 77
-rw-r--r-- fs/ext3/ialloc.c | 14
-rw-r--r-- fs/ext3/inode.c | 9
-rw-r--r-- fs/ext3/ioctl.c | 3
-rw-r--r-- fs/ext3/namei.c | 33
-rw-r--r-- fs/ext3/super.c | 104
-rw-r--r-- fs/ext4/balloc.c | 293
-rw-r--r-- fs/ext4/bitmap.c | 5
-rw-r--r-- fs/ext4/dir.c | 10
-rw-r--r-- fs/ext4/ext4.h | 158
-rw-r--r-- fs/ext4/ext4_extents.h | 5
-rw-r--r-- fs/ext4/ext4_i.h | 16
-rw-r--r-- fs/ext4/ext4_jbd2.c | 83
-rw-r--r-- fs/ext4/ext4_jbd2.h | 87
-rw-r--r-- fs/ext4/ext4_sb.h | 12
-rw-r--r-- fs/ext4/extents.c | 62
-rw-r--r-- fs/ext4/file.c | 3
-rw-r--r-- fs/ext4/hash.c | 77
-rw-r--r-- fs/ext4/ialloc.c | 330
-rw-r--r-- fs/ext4/inode.c | 322
-rw-r--r-- fs/ext4/ioctl.c | 2
-rw-r--r-- fs/ext4/mballoc.c | 629
-rw-r--r-- fs/ext4/mballoc.h | 71
-rw-r--r-- fs/ext4/migrate.c | 19
-rw-r--r-- fs/ext4/namei.c | 113
-rw-r--r-- fs/ext4/resize.c | 113
-rw-r--r-- fs/ext4/super.c | 686
-rw-r--r-- fs/ext4/xattr.c | 25
-rw-r--r-- fs/fat/dir.c | 1
-rw-r--r-- fs/fat/inode.c | 2
-rw-r--r-- fs/fat/namei_vfat.c | 2
-rw-r--r-- fs/file_table.c | 10
-rw-r--r-- fs/filesystems.c | 23
-rw-r--r-- fs/freevxfs/vxfs_inode.c | 4
-rw-r--r-- fs/fs-writeback.c | 92
-rw-r--r-- fs/fuse/control.c | 6
-rw-r--r-- fs/fuse/dev.c | 113
-rw-r--r-- fs/fuse/dir.c | 48
-rw-r--r-- fs/fuse/file.c | 461
-rw-r--r-- fs/fuse/fuse_i.h | 83
-rw-r--r-- fs/fuse/inode.c | 157
-rw-r--r-- fs/gfs2/Kconfig | 2
-rw-r--r-- fs/gfs2/Makefile | 2
-rw-r--r-- fs/gfs2/acl.c | 2
-rw-r--r-- fs/gfs2/bmap.c | 77
-rw-r--r-- fs/gfs2/bmap.h | 34
-rw-r--r-- fs/gfs2/daemon.c | 136
-rw-r--r-- fs/gfs2/daemon.h | 17
-rw-r--r-- fs/gfs2/dir.c | 62
-rw-r--r-- fs/gfs2/dir.h | 1
-rw-r--r-- fs/gfs2/eattr.c | 40
-rw-r--r-- fs/gfs2/glock.c | 303
-rw-r--r-- fs/gfs2/glock.h | 2
-rw-r--r-- fs/gfs2/glops.c | 56
-rw-r--r-- fs/gfs2/incore.h | 55
-rw-r--r-- fs/gfs2/inode.c | 53
-rw-r--r-- fs/gfs2/inode.h | 13
-rw-r--r-- fs/gfs2/locking/dlm/mount.c | 12
-rw-r--r-- fs/gfs2/locking/dlm/sysfs.c | 16
-rw-r--r-- fs/gfs2/main.c | 15
-rw-r--r-- fs/gfs2/mount.c | 29
-rw-r--r-- fs/gfs2/ops_address.c | 35
-rw-r--r-- fs/gfs2/ops_dentry.c | 2
-rw-r--r-- fs/gfs2/ops_dentry.h | 17
-rw-r--r-- fs/gfs2/ops_export.c | 5
-rw-r--r-- fs/gfs2/ops_file.c | 24
-rw-r--r-- fs/gfs2/ops_fstype.c | 125
-rw-r--r-- fs/gfs2/ops_fstype.h | 19
-rw-r--r-- fs/gfs2/ops_inode.c | 75
-rw-r--r-- fs/gfs2/ops_inode.h | 25
-rw-r--r-- fs/gfs2/ops_super.c | 165
-rw-r--r-- fs/gfs2/ops_super.h | 17
-rw-r--r-- fs/gfs2/quota.c | 113
-rw-r--r-- fs/gfs2/quota.h | 24
-rw-r--r-- fs/gfs2/recovery.c | 48
-rw-r--r-- fs/gfs2/recovery.h | 14
-rw-r--r-- fs/gfs2/rgrp.c | 58
-rw-r--r-- fs/gfs2/super.c | 246
-rw-r--r-- fs/gfs2/super.h | 13
-rw-r--r-- fs/gfs2/sys.c | 66
-rw-r--r-- fs/gfs2/sys.h | 4
-rw-r--r-- fs/gfs2/util.c | 1
-rw-r--r-- fs/gfs2/util.h | 1
-rw-r--r-- fs/hostfs/hostfs_kern.c | 2
-rw-r--r-- fs/hugetlbfs/inode.c | 13
-rw-r--r-- fs/inode.c | 273
-rw-r--r-- fs/ioctl.c | 90
-rw-r--r-- fs/ioprio.c | 3
-rw-r--r-- fs/isofs/inode.c | 6
-rw-r--r-- fs/jbd/commit.c | 15
-rw-r--r-- fs/jbd/transaction.c | 39
-rw-r--r-- fs/jbd2/checkpoint.c | 24
-rw-r--r-- fs/jbd2/commit.c | 67
-rw-r--r-- fs/jbd2/journal.c | 143
-rw-r--r-- fs/jbd2/transaction.c | 107
-rw-r--r-- fs/jffs2/compr_rubin.c | 120
-rw-r--r-- fs/jffs2/erase.c | 5
-rw-r--r-- fs/jffs2/file.c | 2
-rw-r--r-- fs/jffs2/nodelist.h | 3
-rw-r--r-- fs/jfs/inode.c | 8
-rw-r--r-- fs/jfs/jfs_imap.c | 10
-rw-r--r-- fs/jfs/jfs_inode.c | 29
-rw-r--r-- fs/jfs/namei.c | 24
-rw-r--r-- fs/jfs/super.c | 10
-rw-r--r-- fs/libfs.c | 7
-rw-r--r-- fs/lockd/clntlock.c | 23
-rw-r--r-- fs/lockd/clntproc.c | 7
-rw-r--r-- fs/lockd/host.c | 180
-rw-r--r-- fs/lockd/mon.c | 569
-rw-r--r-- fs/lockd/svc.c | 78
-rw-r--r-- fs/lockd/svc4proc.c | 13
-rw-r--r-- fs/lockd/svcproc.c | 13
-rw-r--r-- fs/lockd/svcsubs.c | 1
-rw-r--r-- fs/lockd/xdr.c | 5
-rw-r--r-- fs/lockd/xdr4.c | 5
-rw-r--r-- fs/minix/dir.c | 2
-rw-r--r-- fs/mpage.c | 6
-rw-r--r-- fs/namei.c | 160
-rw-r--r-- fs/namespace.c | 2
-rw-r--r-- fs/ncpfs/getopt.c | 1
-rw-r--r-- fs/ncpfs/ioctl.c | 2
-rw-r--r-- fs/nfs/callback.c | 36
-rw-r--r-- fs/nfs/client.c | 95
-rw-r--r-- fs/nfs/delegation.c | 260
-rw-r--r-- fs/nfs/delegation.h | 33
-rw-r--r-- fs/nfs/dir.c | 24
-rw-r--r-- fs/nfs/file.c | 2
-rw-r--r-- fs/nfs/inode.c | 13
-rw-r--r-- fs/nfs/internal.h | 14
-rw-r--r-- fs/nfs/mount_clnt.c | 34
-rw-r--r-- fs/nfs/nfs4_fs.h | 32
-rw-r--r-- fs/nfs/nfs4proc.c | 431
-rw-r--r-- fs/nfs/nfs4renewd.c | 22
-rw-r--r-- fs/nfs/nfs4state.c | 415
-rw-r--r-- fs/nfs/nfs4xdr.c | 1235
-rw-r--r-- fs/nfs/nfsroot.c | 27
-rw-r--r-- fs/nfs/read.c | 6
-rw-r--r-- fs/nfs/super.c | 44
-rw-r--r-- fs/nfs_common/nfsacl.c | 4
-rw-r--r-- fs/nfsctl.c | 5
-rw-r--r-- fs/nfsd/auth.c | 4
-rw-r--r-- fs/nfsd/nfs4callback.c | 12
-rw-r--r-- fs/nfsd/nfs4proc.c | 5
-rw-r--r-- fs/nfsd/nfs4recover.c | 2
-rw-r--r-- fs/nfsd/nfs4state.c | 91
-rw-r--r-- fs/nfsd/nfs4xdr.c | 2
-rw-r--r-- fs/nfsd/nfsctl.c | 479
-rw-r--r-- fs/nfsd/nfsfh.c | 36
-rw-r--r-- fs/nfsd/nfsproc.c | 1
-rw-r--r-- fs/nfsd/vfs.c | 9
-rw-r--r-- fs/notify/Kconfig | 2
-rw-r--r-- fs/notify/Makefile | 2
-rw-r--r-- fs/notify/dnotify/Kconfig | 10
-rw-r--r-- fs/notify/dnotify/Makefile | 1
-rw-r--r-- fs/notify/dnotify/dnotify.c (renamed from fs/dnotify.c) | 3
-rw-r--r-- fs/notify/inotify/Kconfig | 27
-rw-r--r-- fs/notify/inotify/Makefile | 2
-rw-r--r-- fs/notify/inotify/inotify.c (renamed from fs/inotify.c) | 0
-rw-r--r-- fs/notify/inotify/inotify_user.c (renamed from fs/inotify_user.c) | 4
-rw-r--r-- fs/ntfs/inode.c | 3
-rw-r--r-- fs/ocfs2/Makefile | 7
-rw-r--r-- fs/ocfs2/acl.c | 479
-rw-r--r-- fs/ocfs2/acl.h | 58
-rw-r--r-- fs/ocfs2/alloc.c | 712
-rw-r--r-- fs/ocfs2/alloc.h | 30
-rw-r--r-- fs/ocfs2/aops.c | 59
-rw-r--r-- fs/ocfs2/blockcheck.c | 477
-rw-r--r-- fs/ocfs2/blockcheck.h | 82
-rw-r--r-- fs/ocfs2/buffer_head_io.c | 32
-rw-r--r-- fs/ocfs2/buffer_head_io.h | 27
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r-- fs/ocfs2/cluster/masklog.c | 1
-rw-r--r-- fs/ocfs2/cluster/masklog.h | 1
-rw-r--r-- fs/ocfs2/dir.c | 399
-rw-r--r-- fs/ocfs2/dir.h | 2
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 52
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h | 3
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c | 53
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 1
-rw-r--r-- fs/ocfs2/dlm/dlmfs.c | 2
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 42
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c | 3
-rw-r--r-- fs/ocfs2/dlmglue.c | 172
-rw-r--r-- fs/ocfs2/dlmglue.h | 19
-rw-r--r-- fs/ocfs2/extent_map.c | 96
-rw-r--r-- fs/ocfs2/extent_map.h | 24
-rw-r--r-- fs/ocfs2/file.c | 211
-rw-r--r-- fs/ocfs2/file.h | 3
-rw-r--r-- fs/ocfs2/inode.c | 175
-rw-r--r-- fs/ocfs2/inode.h | 18
-rw-r--r-- fs/ocfs2/journal.c | 364
-rw-r--r-- fs/ocfs2/journal.h | 128
-rw-r--r-- fs/ocfs2/localalloc.c | 26
-rw-r--r-- fs/ocfs2/namei.c | 318
-rw-r--r-- fs/ocfs2/ocfs2.h | 46
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 213
-rw-r--r-- fs/ocfs2/ocfs2_jbd_compat.h | 82
-rw-r--r-- fs/ocfs2/ocfs2_lockid.h | 5
-rw-r--r-- fs/ocfs2/quota.h | 119
-rw-r--r-- fs/ocfs2/quota_global.c | 1025
-rw-r--r-- fs/ocfs2/quota_local.c | 1253
-rw-r--r-- fs/ocfs2/resize.c | 76
-rw-r--r-- fs/ocfs2/slot_map.c | 4
-rw-r--r-- fs/ocfs2/suballoc.c | 363
-rw-r--r-- fs/ocfs2/suballoc.h | 18
-rw-r--r-- fs/ocfs2/super.c | 328
-rw-r--r-- fs/ocfs2/symlink.c | 2
-rw-r--r-- fs/ocfs2/xattr.c | 2984
-rw-r--r-- fs/ocfs2/xattr.h | 45
-rw-r--r-- fs/omfs/inode.c | 1
-rw-r--r-- fs/open.c | 7
-rw-r--r-- fs/openpromfs/inode.c | 3
-rw-r--r-- fs/partitions/check.c | 12
-rw-r--r-- fs/pipe.c | 7
-rw-r--r-- fs/proc/base.c | 235
-rw-r--r-- fs/proc/generic.c | 8
-rw-r--r-- fs/proc/inode.c | 3
-rw-r--r-- fs/proc/internal.h | 2
-rw-r--r-- fs/proc/meminfo.c | 6
-rw-r--r-- fs/proc/nommu.c | 71
-rw-r--r-- fs/proc/proc_net.c | 2
-rw-r--r-- fs/proc/proc_sysctl.c | 1
-rw-r--r-- fs/proc/root.c | 8
-rw-r--r-- fs/proc/stat.c | 7
-rw-r--r-- fs/proc/task_mmu.c | 8
-rw-r--r-- fs/proc/task_nommu.c | 122
-rw-r--r-- fs/proc/vmcore.c | 2
-rw-r--r-- fs/quota.c | 11
-rw-r--r-- fs/quota_tree.c | 645
-rw-r--r-- fs/quota_tree.h | 25
-rw-r--r-- fs/quota_v1.c | 28
-rw-r--r-- fs/quota_v2.c | 631
-rw-r--r-- fs/quotaio_v1.h | 33
-rw-r--r-- fs/quotaio_v2.h | 60
-rw-r--r-- fs/ramfs/file-nommu.c | 21
-rw-r--r-- fs/ramfs/inode.c | 1
-rw-r--r-- fs/read_write.c | 13
-rw-r--r-- fs/reiserfs/inode.c | 30
-rw-r--r-- fs/reiserfs/namei.c | 8
-rw-r--r-- fs/reiserfs/super.c | 20
-rw-r--r-- fs/romfs/inode.c | 13
-rw-r--r-- fs/select.c | 76
-rw-r--r-- fs/seq_file.c | 13
-rw-r--r-- fs/smbfs/file.c | 2
-rw-r--r-- fs/splice.c | 1
-rw-r--r-- fs/squashfs/Makefile | 8
-rw-r--r-- fs/squashfs/block.c | 274
-rw-r--r-- fs/squashfs/cache.c | 412
-rw-r--r-- fs/squashfs/dir.c | 235
-rw-r--r-- fs/squashfs/export.c | 155
-rw-r--r-- fs/squashfs/file.c | 502
-rw-r--r-- fs/squashfs/fragment.c | 98
-rw-r--r-- fs/squashfs/id.c | 94
-rw-r--r-- fs/squashfs/inode.c | 346
-rw-r--r-- fs/squashfs/namei.c | 242
-rw-r--r-- fs/squashfs/squashfs.h | 90
-rw-r--r-- fs/squashfs/squashfs_fs.h | 381
-rw-r--r-- fs/squashfs/squashfs_fs_i.h | 45
-rw-r--r-- fs/squashfs/squashfs_fs_sb.h | 76
-rw-r--r-- fs/squashfs/super.c | 440
-rw-r--r-- fs/squashfs/symlink.c | 118
-rw-r--r-- fs/stat.c | 2
-rw-r--r-- fs/super.c | 12
-rw-r--r-- fs/sync.c | 50
-rw-r--r-- fs/sysfs/inode.c | 3
-rw-r--r-- fs/sysv/inode.c | 6
-rw-r--r-- fs/ubifs/Kconfig | 2
-rw-r--r-- fs/ubifs/budget.c | 212
-rw-r--r-- fs/ubifs/commit.c | 25
-rw-r--r-- fs/ubifs/compress.c | 18
-rw-r--r-- fs/ubifs/debug.c | 265
-rw-r--r-- fs/ubifs/debug.h | 117
-rw-r--r-- fs/ubifs/file.c | 17
-rw-r--r-- fs/ubifs/gc.c | 2
-rw-r--r-- fs/ubifs/ioctl.c | 2
-rw-r--r-- fs/ubifs/journal.c | 6
-rw-r--r-- fs/ubifs/key.h | 32
-rw-r--r-- fs/ubifs/lprops.c | 14
-rw-r--r-- fs/ubifs/lpt.c | 45
-rw-r--r-- fs/ubifs/lpt_commit.c | 210
-rw-r--r-- fs/ubifs/orphan.c | 2
-rw-r--r-- fs/ubifs/replay.c | 15
-rw-r--r-- fs/ubifs/sb.c | 20
-rw-r--r-- fs/ubifs/shrinker.c | 2
-rw-r--r-- fs/ubifs/super.c | 260
-rw-r--r-- fs/ubifs/tnc.c | 31
-rw-r--r-- fs/ubifs/tnc_commit.c | 9
-rw-r--r-- fs/ubifs/ubifs-media.h | 7
-rw-r--r-- fs/ubifs/ubifs.h | 111
-rw-r--r-- fs/xattr.c | 2
-rw-r--r-- fs/xfs/Makefile | 6
-rw-r--r-- fs/xfs/linux-2.6/sv.h | 22
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c | 66
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.h | 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 87
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.h | 30
-rw-r--r-- fs/xfs/linux-2.6/xfs_cred.h | 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_export.c | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c | 189
-rw-r--r-- fs/xfs/linux-2.6/xfs_fs_subr.c | 23
-rw-r--r-- fs/xfs/linux-2.6/xfs_globals.c | 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_globals.h | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.c | 238
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.h | 82
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl32.c | 851
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl32.h | 214
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c | 122
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.h | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_linux.h | 13
-rw-r--r-- fs/xfs/linux-2.6/xfs_lrw.c | 50
-rw-r--r-- fs/xfs/linux-2.6/xfs_stats.c | 6
-rw-r--r-- fs/xfs/linux-2.6/xfs_stats.h | 65
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c | 894
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.h | 15
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c | 762
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.h | 55
-rw-r--r-- fs/xfs/linux-2.6/xfs_sysctl.c | 11
-rw-r--r-- fs/xfs/linux-2.6/xfs_sysctl.h | 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_vfs.h | 77
-rw-r--r-- fs/xfs/linux-2.6/xfs_vnode.c | 145
-rw-r--r-- fs/xfs/linux-2.6/xfs_vnode.h | 72
-rw-r--r-- fs/xfs/quota/xfs_dquot.c | 39
-rw-r--r-- fs/xfs/quota/xfs_dquot.h | 4
-rw-r--r-- fs/xfs/quota/xfs_dquot_item.c | 45
-rw-r--r-- fs/xfs/quota/xfs_qm.c | 57
-rw-r--r-- fs/xfs/quota/xfs_qm.h | 3
-rw-r--r-- fs/xfs/quota/xfs_qm_bhv.c | 5
-rw-r--r-- fs/xfs/quota/xfs_qm_syscalls.c | 151
-rw-r--r-- fs/xfs/support/debug.c | 39
-rw-r--r-- fs/xfs/support/debug.h | 2
-rw-r--r-- fs/xfs/support/ktrace.c | 9
-rw-r--r-- fs/xfs/xfs.h | 2
-rw-r--r-- fs/xfs/xfs_acl.c | 2
-rw-r--r-- fs/xfs/xfs_ag.h | 15
-rw-r--r-- fs/xfs/xfs_alloc.c | 264
-rw-r--r-- fs/xfs/xfs_alloc.h | 27
-rw-r--r-- fs/xfs/xfs_alloc_btree.c | 2387
-rw-r--r-- fs/xfs/xfs_alloc_btree.h | 107
-rw-r--r-- fs/xfs/xfs_arch.h | 39
-rw-r--r-- fs/xfs/xfs_bit.h | 3
-rw-r--r-- fs/xfs/xfs_bmap.c | 410
-rw-r--r-- fs/xfs/xfs_bmap.h | 72
-rw-r--r-- fs/xfs/xfs_bmap_btree.c | 2617
-rw-r--r-- fs/xfs/xfs_bmap_btree.h | 171
-rw-r--r-- fs/xfs/xfs_btree.c | 3596
-rw-r--r-- fs/xfs/xfs_btree.h | 392
-rw-r--r-- fs/xfs/xfs_btree_trace.c | 249
-rw-r--r-- fs/xfs/xfs_btree_trace.h | 116
-rw-r--r-- fs/xfs/xfs_buf_item.c | 45
-rw-r--r-- fs/xfs/xfs_clnt.h | 105
-rw-r--r-- fs/xfs/xfs_da_btree.h | 24
-rw-r--r-- fs/xfs/xfs_dfrag.c | 8
-rw-r--r-- fs/xfs/xfs_dfrag.h | 2
-rw-r--r-- fs/xfs/xfs_dinode.h | 148
-rw-r--r-- fs/xfs/xfs_dir2_sf.h | 7
-rw-r--r-- fs/xfs/xfs_dmops.c | 5
-rw-r--r-- fs/xfs/xfs_error.c | 15
-rw-r--r-- fs/xfs/xfs_error.h | 12
-rw-r--r-- fs/xfs/xfs_extfree_item.c | 45
-rw-r--r-- fs/xfs/xfs_fs.h | 26
-rw-r--r-- fs/xfs/xfs_fsops.c | 41
-rw-r--r-- fs/xfs/xfs_fsops.h | 2
-rw-r--r-- fs/xfs/xfs_ialloc.c | 449
-rw-r--r-- fs/xfs/xfs_ialloc.h | 31
-rw-r--r-- fs/xfs/xfs_ialloc_btree.c | 2193
-rw-r--r-- fs/xfs/xfs_ialloc_btree.h | 111
-rw-r--r-- fs/xfs/xfs_iget.c | 735
-rw-r--r-- fs/xfs/xfs_imap.h | 40
-rw-r--r-- fs/xfs/xfs_inode.c | 587
-rw-r--r-- fs/xfs/xfs_inode.h | 375
-rw-r--r-- fs/xfs/xfs_inode_item.c | 45
-rw-r--r-- fs/xfs/xfs_inode_item.h | 41
-rw-r--r-- fs/xfs/xfs_iomap.c | 28
-rw-r--r-- fs/xfs/xfs_itable.c | 102
-rw-r--r-- fs/xfs/xfs_itable.h | 14
-rw-r--r-- fs/xfs/xfs_log.c | 81
-rw-r--r-- fs/xfs/xfs_log.h | 4
-rw-r--r-- fs/xfs/xfs_log_priv.h | 48
-rw-r--r-- fs/xfs/xfs_log_recover.c | 416
-rw-r--r-- fs/xfs/xfs_mount.c | 81
-rw-r--r-- fs/xfs/xfs_mount.h | 73
-rw-r--r-- fs/xfs/xfs_qmops.c | 5
-rw-r--r-- fs/xfs/xfs_quota.h | 8
-rw-r--r-- fs/xfs/xfs_rename.c | 61
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 41
-rw-r--r-- fs/xfs/xfs_rw.c | 2
-rw-r--r-- fs/xfs/xfs_sb.h | 167
-rw-r--r-- fs/xfs/xfs_trans.c | 22
-rw-r--r-- fs/xfs/xfs_trans.h | 322
-rw-r--r-- fs/xfs/xfs_trans_ail.c | 362
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 7
-rw-r--r-- fs/xfs/xfs_trans_inode.c | 30
-rw-r--r-- fs/xfs/xfs_trans_item.c | 10
-rw-r--r-- fs/xfs/xfs_trans_priv.h | 98
-rw-r--r-- fs/xfs/xfs_utils.c | 12
-rw-r--r-- fs/xfs/xfs_vfsops.c | 757
-rw-r--r-- fs/xfs/xfs_vfsops.h | 16
-rw-r--r-- fs/xfs/xfs_vnodeops.c | 354
-rw-r--r-- fs/xfs/xfs_vnodeops.h | 10
526 files changed, 77358 insertions, 22493 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a7eca3..51307b0fdf0f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -189,6 +189,8 @@ config OCFS2_FS
 	select CONFIGFS_FS
 	select JBD2
 	select CRC32
+	select QUOTA
+	select QUOTA_TREE
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
 	  system with many similarities to ext3. It supports 64 bit inode
@@ -258,56 +260,37 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
-config OCFS2_COMPAT_JBD
-	bool "Use JBD for compatibility"
+config OCFS2_FS_POSIX_ACL
+	bool "OCFS2 POSIX Access Control Lists"
 	depends on OCFS2_FS
+	select FS_POSIX_ACL
 	default n
-	select JBD
 	help
-	  The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
-	  is backwards compatible with JBD. It is safe to say N here.
-	  However, if you really want to use the original JBD, say Y here.
-
-endif # BLOCK
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
 
-config DNOTIFY
-	bool "Dnotify support"
-	default y
+config BTRFS_FS
+	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+	depends on EXPERIMENTAL
+	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
 	help
-	  Dnotify is a directory-based per-fd file change notification system
-	  that uses signals to communicate events to user-space. There exist
-	  superior alternatives, but some applications may still rely on
-	  dnotify.
-
-	  If unsure, say Y.
-
-config INOTIFY
-	bool "Inotify file change notification support"
-	default y
-	---help---
-	  Say Y here to enable inotify support. Inotify is a file change
-	  notification system and a replacement for dnotify. Inotify fixes
-	  numerous shortcomings in dnotify and introduces several new features
-	  including multiple file events, one-shot support, and unmount
-	  notification.
+	  Btrfs is a new filesystem with extents, writable snapshotting,
+	  support for multiple devices and many more features.
 
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
+	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+	  FINALIZED. You should say N here unless you are interested in
+	  testing Btrfs with non-critical data.
 
-	  If unsure, say Y.
+	  To compile this file system support as a module, choose M here. The
+	  module will be called btrfs.
 
-config INOTIFY_USER
-	bool "Inotify support for userspace"
-	depends on INOTIFY
-	default y
-	---help---
-	  Say Y here to enable inotify support for userspace, including the
-	  associated system calls. Inotify allows monitoring of both files and
-	  directories via a single open fd. Events are read from the file
-	  descriptor, which is also select()- and poll()-able.
+	  If unsure, say N.
 
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
+endif # BLOCK
 
-	  If unsure, say Y.
+source "fs/notify/Kconfig"
 
 config QUOTA
 	bool "Quota support"
@@ -340,6 +323,10 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+# Generic support for tree structured quota files. Seleted when needed.
+config QUOTA_TREE
+	tristate
+
 config QFMT_V1
 	tristate "Old quota format support"
 	depends on QUOTA
@@ -351,6 +338,7 @@ config QFMT_V1
 config QFMT_V2
 	tristate "Quota format v2 support"
 	depends on QUOTA
+	select QUOTA_TREE
 	help
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
 	  need this functionality say Y here.
@@ -752,7 +740,20 @@ config CONFIGFS_FS
 
 endmenu
 
-menu "Miscellaneous filesystems"
+menuconfig MISC_FILESYSTEMS
+	bool "Miscellaneous filesystems"
+	default y
+	---help---
+	  Say Y here to get to see options for various miscellaneous
+	  filesystems, such as filesystems that came from other
+	  operating systems.
+
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled; if unsure, say Y here.
+
+if MISC_FILESYSTEMS
 
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
@@ -931,6 +932,58 @@ config CRAMFS
 
 	  If unsure, say N.
 
+config SQUASHFS
+	tristate "SquashFS 4.0 - Squashed file system support"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for SquashFS 4.0 (a Compressed
+	  Read-Only File System). Squashfs is a highly compressed read-only
+	  filesystem for Linux. It uses zlib compression to compress both
+	  files, inodes and directories. Inodes in the system are very small
+	  and all blocks are packed to minimise data overhead. Block sizes
+	  greater than 4K are supported up to a maximum of 1 Mbytes (default
+	  block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
+	  (larger than 4GB), full uid/gid information, hard links and
+	  timestamps.
+
+	  Squashfs is intended for general read-only filesystem use, for
+	  archival use (i.e. in cases where a .tar.gz file may be used), and in
+	  embedded systems where low overhead is needed. Further information
+	  and tools are available from http://squashfs.sourceforge.net.
+
+	  If you want to compile this as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want),
+	  say M here and read <file:Documentation/modules.txt>. The module
+	  will be called squashfs. Note that the root file system (the one
+	  containing the directory /) cannot be compiled as a module.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+
+	bool "Additional option for memory-constrained systems"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here allows you to specify cache size.
+
+	  If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+	int "Number of fragments cached" if SQUASHFS_EMBEDDED
+	depends on SQUASHFS
+	default "3"
+	help
+	  By default SquashFS caches the last 3 fragments read from
+	  the filesystem. Increasing this amount may mean SquashFS
+	  has to re-read fragments less often from disk, at the expense
+	  of extra system memory. Decreasing this amount will mean
+	  SquashFS uses less memory at the expense of extra reads from disk.
+
+	  Note there must be at least one cached fragment. Anything
+	  much more than three will probably not make much difference.
+
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
 	depends on BLOCK
@@ -1122,7 +1175,7 @@ config UFS_DEBUG
 	  Y here. This will result in _many_ additional debugging messages to be
 	  written to the system log.
 
-endmenu
+endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
 	bool "Network File Systems"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index ce9fb3fbfae4..bb4cc5b8abc8 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -43,7 +43,7 @@ config BINFMT_ELF_FDPIC
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
 	default n
-	depends on BINFMT_ELF
+	depends on BINFMT_ELF && ELF_CORE
 	help
 	  ELF core dump files describe each memory mapping of the crashed
 	  process, and can contain or omit the memory contents of each one.
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe6f0c4..38bc735c67ad 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -20,8 +20,7 @@ obj-y += no-block.o
 endif
 
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
-obj-$(CONFIG_INOTIFY)		+= inotify.o
-obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
+obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
@@ -55,10 +54,9 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
 
-obj-$(CONFIG_DNOTIFY)		+= dnotify.o
-
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
 obj-$(CONFIG_SYSFS)		+= sysfs/
@@ -76,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
+obj-$(CONFIG_SQUASHFS)		+= squashfs/
 obj-y				+= ramfs/
 obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
@@ -121,4 +120,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6e..9246cb4aa018 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	}
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac16..3c4ec7d864c4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 		goto bad_inode;
 #else
 		inode->i_mode |= S_IFDIR;
-		inode->i_op = NULL;
-		inode->i_fop = NULL;
+		/* ... and leave ->i_op and ->i_fop pointing to empty */
 		break;
 #endif
 	case ST_LINKFILE:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35fc..3fb36d433621 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	candidate->state = AFS_WBACK_PENDING;
 	init_waitqueue_head(&candidate->waitq);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		kfree(candidate);
 		return -ENOMEM;
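
Both affs and afs hunks above pick up the same VFS API change: __grab_cache_page() is replaced by grab_cache_page_write_begin(), which additionally takes the ->write_begin() flags so the page allocation can honour flags such as AOP_FLAG_NOFS. A minimal sketch of the calling pattern, using a hypothetical myfs_write_begin() with error handling trimmed:

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;	/* page covering pos */
	struct page *page;

	/* forward the AOP flags so NOFS callers avoid fs recursion */
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	return 0;
}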
diff --git a/fs/aio.c b/fs/aio.c
index f658441d5666..d6f89d3c15e8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx)
 	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
 } while(0)
 
+static void ctx_rcu_free(struct rcu_head *head)
+{
+	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+	unsigned nr_events = ctx->max_reqs;
+
+	kmem_cache_free(kioctx_cachep, ctx);
+
+	if (nr_events) {
+		spin_lock(&aio_nr_lock);
+		BUG_ON(aio_nr - nr_events > aio_nr);
+		aio_nr -= nr_events;
+		spin_unlock(&aio_nr_lock);
+	}
+}
 
 /* __put_ioctx
  *	Called when the last user of an aio context has gone away,
@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx)
  */
 static void __put_ioctx(struct kioctx *ctx)
 {
-	unsigned nr_events = ctx->max_reqs;
-
 	BUG_ON(ctx->reqs_active);
 
 	cancel_delayed_work(&ctx->wq);
@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx)
 	mmdrop(ctx->mm);
 	ctx->mm = NULL;
 	pr_debug("__put_ioctx: freeing %p\n", ctx);
-	kmem_cache_free(kioctx_cachep, ctx);
-
-	if (nr_events) {
-		spin_lock(&aio_nr_lock);
-		BUG_ON(aio_nr - nr_events > aio_nr);
-		aio_nr -= nr_events;
-		spin_unlock(&aio_nr_lock);
-	}
+	call_rcu(&ctx->rcu_head, ctx_rcu_free);
 }
 
 #define get_ioctx(kioctx) do { \
@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm;
 	struct kioctx *ctx;
+	int did_sync = 0;
 
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		goto out_freectx;
 
 	/* limit the number of system wide aios */
-	spin_lock(&aio_nr_lock);
-	if (aio_nr + ctx->max_reqs > aio_max_nr ||
-	    aio_nr + ctx->max_reqs < aio_nr)
-		ctx->max_reqs = 0;
-	else
-		aio_nr += ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	do {
+		spin_lock_bh(&aio_nr_lock);
+		if (aio_nr + nr_events > aio_max_nr ||
+		    aio_nr + nr_events < aio_nr)
+			ctx->max_reqs = 0;
+		else
+			aio_nr += ctx->max_reqs;
+		spin_unlock_bh(&aio_nr_lock);
+		if (ctx->max_reqs || did_sync)
+			break;
+
+		/* wait for rcu callbacks to have completed before giving up */
+		synchronize_rcu();
+		did_sync = 1;
+		ctx->max_reqs = nr_events;
+	} while (1);
+
 	if (ctx->max_reqs == 0)
 		goto out_cleanup;
 
 	/* now link into global list. */
-	write_lock(&mm->ioctx_list_lock);
-	ctx->next = mm->ioctx_list;
-	mm->ioctx_list = ctx;
-	write_unlock(&mm->ioctx_list_lock);
+	spin_lock(&mm->ioctx_lock);
+	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+	spin_unlock(&mm->ioctx_lock);
 
 	dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx *ctx = mm->ioctx_list;
-	mm->ioctx_list = NULL;
-	while (ctx) {
-		struct kioctx *next = ctx->next;
-		ctx->next = NULL;
+	struct kioctx *ctx;
+
+	while (!hlist_empty(&mm->ioctx_list)) {
+		ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
+		hlist_del_rcu(&ctx->list);
+
 		aio_cancel_all(ctx);
 
 		wait_for_all_aios(ctx);
@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm)
 			atomic_read(&ctx->users), ctx->dead,
 			ctx->reqs_active);
 		put_ioctx(ctx);
-		ctx = next;
 	}
 }
 
@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req)
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
-	struct kioctx *ioctx;
-	struct mm_struct *mm;
+	struct mm_struct *mm = current->mm;
+	struct kioctx *ctx = NULL;
+	struct hlist_node *n;
 
-	mm = current->mm;
-	read_lock(&mm->ioctx_list_lock);
-	for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
-		if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
-			get_ioctx(ioctx);
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
+		if (ctx->user_id == ctx_id && !ctx->dead) {
+			get_ioctx(ctx);
 			break;
 		}
-	read_unlock(&mm->ioctx_list_lock);
+	}
 
-	return ioctx;
+	rcu_read_unlock();
+	return ctx;
 }
 
 /*
@@ -1215,19 +1232,14 @@ out:
 static void io_destroy(struct kioctx *ioctx)
 {
 	struct mm_struct *mm = current->mm;
-	struct kioctx **tmp;
 	int was_dead;
 
 	/* delete the entry from the list is someone else hasn't already */
-	write_lock(&mm->ioctx_list_lock);
+	spin_lock(&mm->ioctx_lock);
 	was_dead = ioctx->dead;
 	ioctx->dead = 1;
-	for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx;
-	     tmp = &(*tmp)->next)
-		;
-	if (*tmp)
-		*tmp = ioctx->next;
-	write_unlock(&mm->ioctx_list_lock);
+	hlist_del_rcu(&ioctx->list);
+	spin_unlock(&mm->ioctx_lock);
 
 	dprintk("aio_release(%p)\n", ioctx);
 	if (likely(!was_dead))
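
The aio changes above convert the per-mm ioctx list from a rwlock-protected singly linked list to an RCU hlist: lookups walk the list under rcu_read_lock(), updaters serialize on mm->ioctx_lock, and a kioctx is only freed after a grace period via call_rcu(). A sketch of the same pattern with illustrative names (struct item, item_lookup() and friends are not from the patch; the four-argument hlist_for_each_entry_rcu() mirrors the signature used in the diff):

struct item {
	unsigned long		id;
	struct hlist_node	list;
	struct rcu_head		rcu_head;
};

static HLIST_HEAD(table);		/* single bucket, for brevity */
static DEFINE_SPINLOCK(table_lock);	/* taken by updaters only */

static struct item *item_lookup(unsigned long id)
{
	struct item *it, *found = NULL;
	struct hlist_node *n;

	rcu_read_lock();		/* readers take no lock */
	hlist_for_each_entry_rcu(it, n, &table, list) {
		if (it->id == id) {
			/* a real user takes a reference here, as
			 * lookup_ioctx() does with get_ioctx() */
			found = it;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

static void item_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct item, rcu_head));
}

static void item_del(struct item *it)
{
	spin_lock(&table_lock);
	hlist_del_rcu(&it->list);	/* readers may still see it ... */
	spin_unlock(&table_lock);
	call_rcu(&it->rcu_head, item_free_rcu); /* ... until a grace period */
}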
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c16d9be1b017..3bbdb9d02376 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	if (IS_ERR(anon_inode_inode))
 		return -ENODEV;
 
+	if (fops->owner && !try_module_get(fops->owner))
+		return -ENOENT;
+
 	error = get_unused_fd_flags(flags);
 	if (error < 0)
-		return error;
+		goto err_module;
 	fd = error;
 
 	/*
@@ -128,6 +131,8 @@ err_dput:
 	dput(dentry);
 err_put_unused_fd:
 	put_unused_fd(fd);
+err_module:
+	module_put(fops->owner);
 	return error;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
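
The anon_inodes fix pins the module that provides the file_operations before the fd is installed: try_module_get() fails once the owning module has begun unloading, and the new err_module label drops the reference on the error path. Note that module_put() accepts a NULL module as a no-op, which is why fops->owner is not re-checked there. The pattern reduced to its core (install_file() and setup_the_file() are illustrative stand-ins):

static int install_file(const struct file_operations *fops)
{
	int error;

	/* refuse to hand out a file backed by a dying module */
	if (fops->owner && !try_module_get(fops->owner))
		return -ENOENT;

	error = setup_the_file();	/* stand-in for the real work */
	if (error < 0)
		goto err_module;
	return 0;

err_module:
	module_put(fops->owner);	/* NULL owner is a no-op */
	return error;
}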
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c60..e1734f2d6e26 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
 	inode->i_nlink = 2;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blocks = 0;
 
 	if (ino == AUTOFS_ROOT_INO) {
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 		inode->i_op = &autofs_root_inode_operations;
 		inode->i_fop = &autofs_root_operations;
-		inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
 		goto done;
 	}
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e54..a76803108d06 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
 #define AUTOFS_DEV_IOCTL_IOC_FIRST	(AUTOFS_DEV_IOCTL_VERSION)
 #define AUTOFS_DEV_IOCTL_IOC_COUNT	(AUTOFS_IOC_COUNT - 11)
 
-#define AUTOFS_TYPE_TRIGGER	(AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
-
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8df..025e105bffea 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 
 /*
  * Check sanity of parameter control fields and if a path is present
- * check that it has a "/" and is terminated.
+ * check that it is terminated and contains at least one "/".
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = check_name(param->path);
+		err = invalid_str(param->path,
+				 (void *) ((size_t) param + param->size));
 		if (err) {
-			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
-				    cmd);
+			AUTOFS_WARN(
+			  "path string terminator missing for cmd(0x%08x)",
+			  cmd);
 			goto out;
 		}
 
-		err = invalid_str(param->path,
-				 (void *) ((size_t) param + param->size));
+		err = check_name(param->path);
 		if (err) {
 			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
 				    cmd);
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
 				     struct autofs_sb_info *sbi,
 				     struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->version;
+	param->protover.version = sbi->version;
 	return 0;
 }
 
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
 					struct autofs_sb_info *sbi,
 					struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->sub_version;
+	param->protosubver.sub_version = sbi->sub_version;
 	return 0;
 }
 
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
 	int err, fd;
 
 	/* param->path has already been checked */
-	if (!param->arg1)
+	if (!param->openmount.devid)
 		return -EINVAL;
 
 	param->ioctlfd = -1;
 
 	path = param->path;
-	devid = param->arg1;
+	devid = param->openmount.devid;
 
 	err = 0;
 	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
 {
 	autofs_wqt_t token;
 
-	token = (autofs_wqt_t) param->arg1;
+	token = (autofs_wqt_t) param->ready.token;
 	return autofs4_wait_release(sbi, token, 0);
 }
 
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
 	autofs_wqt_t token;
 	int status;
 
-	token = (autofs_wqt_t) param->arg1;
-	status = param->arg2 ? param->arg2 : -ENOENT;
+	token = (autofs_wqt_t) param->fail.token;
+	status = param->fail.status ? param->fail.status : -ENOENT;
 	return autofs4_wait_release(sbi, token, status);
 }
 
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 	int pipefd;
 	int err = 0;
 
-	if (param->arg1 == -1)
+	if (param->setpipefd.pipefd == -1)
 		return -EINVAL;
 
-	pipefd = param->arg1;
+	pipefd = param->setpipefd.pipefd;
 
 	mutex_lock(&sbi->wq_mutex);
 	if (!sbi->catatonic) {
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
 {
 	unsigned long timeout;
 
-	timeout = param->arg1;
-	param->arg1 = sbi->exp_timeout / HZ;
+	timeout = param->timeout.timeout;
+	param->timeout.timeout = sbi->exp_timeout / HZ;
 	sbi->exp_timeout = timeout * HZ;
 	return 0;
 }
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	path = param->path;
 	devid = sbi->sb->s_dev;
 
-	param->arg1 = param->arg2 = -1;
+	param->requester.uid = param->requester.gid = -1;
 
 	/* Get nameidata of the parent directory */
 	err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(nd.path.dentry);
 		spin_lock(&sbi->fs_lock);
-		param->arg1 = ino->uid;
-		param->arg2 = ino->gid;
+		param->requester.uid = ino->uid;
+		param->requester.gid = ino->gid;
 		spin_unlock(&sbi->fs_lock);
 	}
 
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 	int err = -EAGAIN;
 	int how;
 
-	how = param->arg1;
+	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
 	else
 		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
 				      struct autofs_sb_info *sbi,
 				      struct autofs_dev_ioctl *param)
 {
-	param->arg1 = 0;
+	param->askumount.may_umount = 0;
 	if (may_umount(fp->f_path.mnt))
-		param->arg1 = 1;
+		param->askumount.may_umount = 1;
 	return 0;
 }
 
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	struct nameidata nd;
 	const char *path;
 	unsigned int type;
+	unsigned int devid, magic;
 	int err = -ENOENT;
 
 	if (param->size <= sizeof(*param)) {
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	}
 
 	path = param->path;
-	type = param->arg1;
+	type = param->ismountpoint.in.type;
 
-	param->arg1 = 0;
-	param->arg2 = 0;
+	param->ismountpoint.out.devid = devid = 0;
+	param->ismountpoint.out.magic = magic = 0;
 
 	if (!fp || param->ioctlfd == -1) {
-		if (type == AUTOFS_TYPE_ANY) {
+		if (autofs_type_any(type)) {
 			struct super_block *sb;
 
 			err = path_lookup(path, LOOKUP_FOLLOW, &nd);
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out;
 
 			sb = nd.path.dentry->d_sb;
-			param->arg1 = new_encode_dev(sb->s_dev);
+			devid = new_encode_dev(sb->s_dev);
 		} else {
 			struct autofs_info *ino;
 
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out_release;
 
 			ino = autofs4_dentry_ino(nd.path.dentry);
-			param->arg1 = autofs4_get_dev(ino->sbi);
+			devid = autofs4_get_dev(ino->sbi);
 		}
 
 		err = 0;
 		if (nd.path.dentry->d_inode &&
 		    nd.path.mnt->mnt_root == nd.path.dentry) {
 			err = 1;
-			param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+			magic = nd.path.dentry->d_inode->i_sb->s_magic;
 		}
 	} else {
-		dev_t devid = new_encode_dev(sbi->sb->s_dev);
+		dev_t dev = autofs4_get_dev(sbi);
 
 		err = path_lookup(path, LOOKUP_PARENT, &nd);
 		if (err)
 			goto out;
 
-		err = autofs_dev_ioctl_find_super(&nd, devid);
+		err = autofs_dev_ioctl_find_super(&nd, dev);
 		if (err)
 			goto out_release;
 
-		param->arg1 = autofs4_get_dev(sbi);
+		devid = dev;
 
 		err = have_submounts(nd.path.dentry);
 
 		if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
 			if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
 				struct inode *inode = nd.path.dentry->d_inode;
-				param->arg2 = inode->i_sb->s_magic;
+				magic = inode->i_sb->s_magic;
 			}
 		}
 	}
 
+	param->ismountpoint.out.devid = devid;
+	param->ismountpoint.out.magic = magic;
+
 out_release:
 	path_put(&nd.path);
 out:
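
Throughout the dev-ioctl changes the untyped param->arg1/arg2 slots give way to named per-command members (param->timeout.timeout, param->requester.uid, param->ismountpoint.out.devid, ...), which makes each command's in/out contract explicit. In the real header these live in a union inside struct autofs_dev_ioctl; the sketch below is a reduced, hypothetical version showing only the shape of the idea:

/* Hypothetical, reduced illustration; not the real autofs header. */
struct demo_dev_ioctl {
	__u32 ver_major;
	__u32 ver_minor;
	__u32 size;		/* total size, including the path */
	__s32 ioctlfd;

	/* one named member per command instead of generic arg1/arg2 */
	union {
		struct { __u32 version; } protover;
		struct { __u64 token; } ready;
		struct { __u64 token; __s32 status; } fail;
		struct { __u64 timeout; } timeout;
		struct { __u32 uid; __u32 gid; } requester;
		struct {
			struct { __u32 type; } in;
			struct { __u32 devid; __u32 magic; } out;
		} ismountpoint;
	};

	char path[0];
};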
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c0..e3bd50776f9e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
 		/* This is an autofs submount, we can't expire it */
-		if (sbi->type == AUTOFS_TYPE_INDIRECT)
+		if (autofs_type_indirect(sbi->type))
 			goto done;
 
 		/*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	if (arg && get_user(do_now, arg))
 		return -EFAULT;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
 	else
 		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef4..716e12b627b2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
 
-	if (sbi->type & AUTOFS_TYPE_OFFSET)
+	if (autofs_type_offset(sbi->type))
 		seq_printf(m, ",offset");
-	else if (sbi->type & AUTOFS_TYPE_DIRECT)
+	else if (autofs_type_direct(sbi->type))
 		seq_printf(m, ",direct");
 	else
 		seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 			*maxproto = option;
 			break;
 		case Opt_indirect:
-			*type = AUTOFS_TYPE_INDIRECT;
+			set_autofs_type_indirect(type);
 			break;
 		case Opt_direct:
-			*type = AUTOFS_TYPE_DIRECT;
+			set_autofs_type_direct(type);
 			break;
 		case Opt_offset:
-			*type = AUTOFS_TYPE_OFFSET;
+			set_autofs_type_offset(type);
 			break;
 		default:
 			return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
-	sbi->type = AUTOFS_TYPE_INDIRECT;
+	set_autofs_type_indirect(&sbi->type);
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
 	mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	}
 
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
+	root_inode->i_op = autofs_type_trigger(sbi->type) ?
 			&autofs4_direct_root_inode_operations :
 			&autofs4_indirect_root_inode_operations;
 
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
 		inode->i_gid = sb->s_root->d_inode->i_gid;
-	} else {
-		inode->i_uid = 0;
-		inode->i_gid = 0;
 	}
-	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	if (S_ISDIR(inf->mode)) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb3..eeb246845909 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		 * is very similar for indirect mounts except only dentrys
 		 * in the root of the autofs file system may be negative.
 		 */
-		if (sbi->type & AUTOFS_TYPE_TRIGGER)
+		if (autofs_type_trigger(sbi->type))
 			return -ENOENT;
 		else if (!IS_ROOT(dentry->d_parent))
 			return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		return -ENOMEM;
 
 	/* If this is a direct mount request create a dummy name */
-	if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
 		qstr.len = sprintf(name, "%p", dentry);
 	else {
 		qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 			type = autofs_ptype_expire_multi;
 	} else {
 		if (notify == NFY_MOUNT)
-			type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+			type = autofs_type_trigger(sbi->type) ?
 				autofs_ptype_missing_direct :
 				autofs_ptype_missing_indirect;
 		else
-			type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+			type = autofs_type_trigger(sbi->type) ?
 				autofs_ptype_expire_direct :
 				autofs_ptype_expire_indirect;
 	}
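
All of the autofs4 hunks above replace open-coded tests against the AUTOFS_TYPE_* flags with small predicate and setter helpers (autofs_type_trigger(), set_autofs_type_indirect(), and so on). Their definitions are not part of this diff; a plausible sketch of what such helpers look like, assuming simple integer type codes:

/* Illustrative only; the real values and helpers live in the autofs headers. */
static inline unsigned int autofs_type_indirect(unsigned int type)
{
	return type == AUTOFS_TYPE_INDIRECT;
}

static inline unsigned int autofs_type_trigger(unsigned int type)
{
	/* direct and offset mounts both act as mount "triggers" */
	return type == AUTOFS_TYPE_DIRECT || type == AUTOFS_TYPE_OFFSET;
}

static inline void set_autofs_type_indirect(unsigned int *type)
{
	*type = AUTOFS_TYPE_INDIRECT;
}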
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c03b1b..a05287a23f62 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
 	return -EIO;
 }
 
-static int bad_file_dir_notify(struct file *file, unsigned long arg)
-{
-	return -EIO;
-}
-
 static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
 	.check_flags	= bad_file_check_flags,
-	.dir_notify	= bad_file_dir_notify,
 	.flock		= bad_file_flock,
 	.splice_write	= bad_file_splice_write,
 	.splice_read	= bad_file_splice_read,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b6dfee37c7b7..d06cb023ad02 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
378 inode->i_size = 0; 378 inode->i_size = 0;
379 inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; 379 inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
380 strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, 380 strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
381 BEFS_SYMLINK_LEN); 381 BEFS_SYMLINK_LEN - 1);
382 befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
382 } else { 383 } else {
383 int num_blks; 384 int num_blks;
384 385
@@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
477 kfree(link); 478 kfree(link);
478 befs_error(sb, "Failed to read entire long symlink"); 479 befs_error(sb, "Failed to read entire long symlink");
479 link = ERR_PTR(-EIO); 480 link = ERR_PTR(-EIO);
481 } else {
482 link[len - 1] = '\0';
480 } 483 }
481 } else { 484 } else {
482 link = befs_ino->i_data.symlink; 485 link = befs_ino->i_data.symlink;
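
Both befs hunks close the same class of hole: strncpy() does not NUL-terminate the destination when the source fills it, and a long symlink read from disk may carry no terminator at all. A minimal sketch of the resulting pattern, with size standing in for BEFS_SYMLINK_LEN or any other fixed buffer length:

	#include <string.h>

	/* copy at most size-1 bytes and always NUL-terminate; strncpy()
	 * alone leaves dst unterminated when src fills it completely */
	static void copy_symlink(char *dst, const char *src, size_t size)
	{
		strncpy(dst, src, size - 1);
		dst[size - 1] = '\0';
	}
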
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee012..cc4062d12ca2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
213{ 213{
214 struct bfs_sb_info *info = BFS_SB(s); 214 struct bfs_sb_info *info = BFS_SB(s);
215 215
216 if (!info)
217 return;
218
216 brelse(info->si_sbh); 219 brelse(info->si_sbh);
217 mutex_destroy(&info->bfs_lock); 220 mutex_destroy(&info->bfs_lock);
218 kfree(info->si_imap); 221 kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
327 unsigned i, imap_len; 330 unsigned i, imap_len;
328 struct bfs_sb_info *info; 331 struct bfs_sb_info *info;
329 long ret = -EINVAL; 332 long ret = -EINVAL;
333 unsigned long i_sblock, i_eblock, i_eoff, s_size;
330 334
331 info = kzalloc(sizeof(*info), GFP_KERNEL); 335 info = kzalloc(sizeof(*info), GFP_KERNEL);
332 if (!info) 336 if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
350 354
351 s->s_magic = BFS_MAGIC; 355 s->s_magic = BFS_MAGIC;
352 info->si_sbh = bh; 356 info->si_sbh = bh;
357
358 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
359 printf("Superblock is corrupted\n");
360 goto out;
361 }
362
353 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / 363 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
354 sizeof(struct bfs_inode) 364 sizeof(struct bfs_inode)
355 + BFS_ROOT_INO - 1; 365 + BFS_ROOT_INO - 1;
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
380 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; 390 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
381 info->si_freei = 0; 391 info->si_freei = 0;
382 info->si_lf_eblk = 0; 392 info->si_lf_eblk = 0;
393
394 /* can we read the last block? */
395 bh = sb_bread(s, info->si_blocks - 1);
396 if (!bh) {
397 printf("Last block not available: %lu\n", info->si_blocks - 1);
398 iput(inode);
399 ret = -EIO;
400 kfree(info->si_imap);
401 goto out;
402 }
403 brelse(bh);
404
383 bh = NULL; 405 bh = NULL;
384 for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { 406 for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
385 struct bfs_inode *di; 407 struct bfs_inode *di;
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
397 419
398 di = (struct bfs_inode *)bh->b_data + off; 420 di = (struct bfs_inode *)bh->b_data + off;
399 421
422 /* test if filesystem is not corrupted */
423
424 i_eoff = le32_to_cpu(di->i_eoffset);
425 i_sblock = le32_to_cpu(di->i_sblock);
426 i_eblock = le32_to_cpu(di->i_eblock);
427 s_size = le32_to_cpu(bfs_sb->s_end);
428
429 if (i_sblock > info->si_blocks ||
430 i_eblock > info->si_blocks ||
431 i_sblock > i_eblock ||
432 i_eoff > s_size ||
433 i_sblock * BFS_BSIZE > i_eoff) {
434
435 printf("Inode 0x%08x corrupted\n", i);
436
437 brelse(bh);
438 s->s_root = NULL;
439 kfree(info->si_imap);
440 kfree(info);
441 s->s_fs_info = NULL;
442 return -EIO;
443 }
444
400 if (!di->i_ino) { 445 if (!di->i_ino) {
401 info->si_freei++; 446 info->si_freei++;
402 continue; 447 continue;
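
The bfs changes stop trusting on-disk metadata blindly: a NULL s_fs_info is tolerated in put_super, the superblock's start/end blocks must be ordered, the last data block must be readable, and every inode's extent is range-checked. A hedged sketch of that per-inode test factored into a predicate (field names follow the diff; BFS_BSIZE is the 512-byte bfs block size):

	static int bfs_inode_is_sane(unsigned long i_sblock, unsigned long i_eblock,
				     unsigned long i_eoff, unsigned long si_blocks,
				     unsigned long s_size)
	{
		/* extent blocks must lie inside the device and be ordered,
		 * and the end offset must stay within the superblock's s_end */
		return i_sblock <= si_blocks && i_eblock <= si_blocks &&
		       i_sblock <= i_eblock && i_eoff <= s_size &&
		       i_sblock * BFS_BSIZE <= i_eoff;
	}
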
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f1f3f4192a60..b639dcf7c778 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -95,92 +95,55 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
95 int has_dumped = 0; 95 int has_dumped = 0;
96 unsigned long dump_start, dump_size; 96 unsigned long dump_start, dump_size;
97 struct user dump; 97 struct user dump;
98#if defined(__alpha__) 98#ifdef __alpha__
99# define START_DATA(u) (u.start_data) 99# define START_DATA(u) (u.start_data)
100#elif defined(__arm__) 100#else
101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code)
102#elif defined(__sparc__)
103# define START_DATA(u) (u.u_tsize)
104#elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__)
105# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
106#endif 102#endif
107#ifdef __sparc__
108# define START_STACK(u) ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
109#else
110# define START_STACK(u) (u.start_stack) 103# define START_STACK(u) (u.start_stack)
111#endif
112 104
113 fs = get_fs(); 105 fs = get_fs();
114 set_fs(KERNEL_DS); 106 set_fs(KERNEL_DS);
115 has_dumped = 1; 107 has_dumped = 1;
116 current->flags |= PF_DUMPCORE; 108 current->flags |= PF_DUMPCORE;
117 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); 109 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
118#ifndef __sparc__
119 dump.u_ar0 = offsetof(struct user, regs); 110 dump.u_ar0 = offsetof(struct user, regs);
120#endif
121 dump.signal = signr; 111 dump.signal = signr;
122 aout_dump_thread(regs, &dump); 112 aout_dump_thread(regs, &dump);
123 113
124/* If the size of the dump file exceeds the rlimit, then see what would happen 114/* If the size of the dump file exceeds the rlimit, then see what would happen
125 if we wrote the stack, but not the data area. */ 115 if we wrote the stack, but not the data area. */
126#ifdef __sparc__
127 if ((dump.u_dsize + dump.u_ssize) > limit)
128 dump.u_dsize = 0;
129#else
130 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) 116 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit)
131 dump.u_dsize = 0; 117 dump.u_dsize = 0;
132#endif
133 118
134/* Make sure we have enough room to write the stack and data areas. */ 119/* Make sure we have enough room to write the stack and data areas. */
135#ifdef __sparc__
136 if (dump.u_ssize > limit)
137 dump.u_ssize = 0;
138#else
139 if ((dump.u_ssize + 1) * PAGE_SIZE > limit) 120 if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
140 dump.u_ssize = 0; 121 dump.u_ssize = 0;
141#endif
142 122
143/* make sure we actually have a data and stack area to dump */ 123/* make sure we actually have a data and stack area to dump */
144 set_fs(USER_DS); 124 set_fs(USER_DS);
145#ifdef __sparc__
146 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize))
147 dump.u_dsize = 0;
148 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize))
149 dump.u_ssize = 0;
150#else
151 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 125 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
152 dump.u_dsize = 0; 126 dump.u_dsize = 0;
153 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 127 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
154 dump.u_ssize = 0; 128 dump.u_ssize = 0;
155#endif
156 129
157 set_fs(KERNEL_DS); 130 set_fs(KERNEL_DS);
158/* struct user */ 131/* struct user */
159 DUMP_WRITE(&dump,sizeof(dump)); 132 DUMP_WRITE(&dump,sizeof(dump));
160/* Now dump all of the user data. Include malloced stuff as well */ 133/* Now dump all of the user data. Include malloced stuff as well */
161#ifndef __sparc__
162 DUMP_SEEK(PAGE_SIZE); 134 DUMP_SEEK(PAGE_SIZE);
163#endif
164/* now we start writing out the user space info */ 135/* now we start writing out the user space info */
165 set_fs(USER_DS); 136 set_fs(USER_DS);
166/* Dump the data area */ 137/* Dump the data area */
167 if (dump.u_dsize != 0) { 138 if (dump.u_dsize != 0) {
168 dump_start = START_DATA(dump); 139 dump_start = START_DATA(dump);
169#ifdef __sparc__
170 dump_size = dump.u_dsize;
171#else
172 dump_size = dump.u_dsize << PAGE_SHIFT; 140 dump_size = dump.u_dsize << PAGE_SHIFT;
173#endif
174 DUMP_WRITE(dump_start,dump_size); 141 DUMP_WRITE(dump_start,dump_size);
175 } 142 }
176/* Now prepare to dump the stack area */ 143/* Now prepare to dump the stack area */
177 if (dump.u_ssize != 0) { 144 if (dump.u_ssize != 0) {
178 dump_start = START_STACK(dump); 145 dump_start = START_STACK(dump);
179#ifdef __sparc__
180 dump_size = dump.u_ssize;
181#else
182 dump_size = dump.u_ssize << PAGE_SHIFT; 146 dump_size = dump.u_ssize << PAGE_SHIFT;
183#endif
184 DUMP_WRITE(dump_start,dump_size); 147 DUMP_WRITE(dump_start,dump_size);
185 } 148 }
186/* Finally dump the task struct. Not used by gdb, but could be useful */ 149/* Finally dump the task struct. Not used by gdb, but could be useful */
@@ -205,29 +168,24 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
205 int envc = bprm->envc; 168 int envc = bprm->envc;
206 169
207 sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); 170 sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
208#ifdef __sparc__
209 /* This imposes the proper stack alignment for a new process. */
210 sp = (void __user *) (((unsigned long) sp) & ~7);
211 if ((envc+argc+3)&1) --sp;
212#endif
213#ifdef __alpha__ 171#ifdef __alpha__
214/* whee.. test-programs are so much fun. */ 172/* whee.. test-programs are so much fun. */
215 put_user(0, --sp); 173 put_user(0, --sp);
216 put_user(0, --sp); 174 put_user(0, --sp);
217 if (bprm->loader) { 175 if (bprm->loader) {
218 put_user(0, --sp); 176 put_user(0, --sp);
219 put_user(0x3eb, --sp); 177 put_user(1003, --sp);
220 put_user(bprm->loader, --sp); 178 put_user(bprm->loader, --sp);
221 put_user(0x3ea, --sp); 179 put_user(1002, --sp);
222 } 180 }
223 put_user(bprm->exec, --sp); 181 put_user(bprm->exec, --sp);
224 put_user(0x3e9, --sp); 182 put_user(1001, --sp);
225#endif 183#endif
226 sp -= envc+1; 184 sp -= envc+1;
227 envp = (char __user * __user *) sp; 185 envp = (char __user * __user *) sp;
228 sp -= argc+1; 186 sp -= argc+1;
229 argv = (char __user * __user *) sp; 187 argv = (char __user * __user *) sp;
230#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__) 188#ifndef __alpha__
231 put_user((unsigned long) envp,--sp); 189 put_user((unsigned long) envp,--sp);
232 put_user((unsigned long) argv,--sp); 190 put_user((unsigned long) argv,--sp);
233#endif 191#endif
@@ -300,13 +258,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
300 return retval; 258 return retval;
301 259
302 /* OK, This is the point of no return */ 260 /* OK, This is the point of no return */
303#if defined(__alpha__) 261#ifdef __alpha__
304 SET_AOUT_PERSONALITY(bprm, ex); 262 SET_AOUT_PERSONALITY(bprm, ex);
305#elif defined(__sparc__)
306 set_personality(PER_SUNOS);
307#if !defined(__sparc_v9__)
308 memcpy(&current->thread.core_exec, &ex, sizeof(struct exec));
309#endif
310#else 263#else
311 set_personality(PER_LINUX); 264 set_personality(PER_LINUX);
312#endif 265#endif
@@ -322,24 +275,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
322 275
323 install_exec_creds(bprm); 276 install_exec_creds(bprm);
324 current->flags &= ~PF_FORKNOEXEC; 277 current->flags &= ~PF_FORKNOEXEC;
325#ifdef __sparc__
326 if (N_MAGIC(ex) == NMAGIC) {
327 loff_t pos = fd_offset;
328 /* Fuck me plenty... */
329 /* <AOL></AOL> */
330 down_write(&current->mm->mmap_sem);
331 error = do_brk(N_TXTADDR(ex), ex.a_text);
332 up_write(&current->mm->mmap_sem);
333 bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
334 ex.a_text, &pos);
335 down_write(&current->mm->mmap_sem);
336 error = do_brk(N_DATADDR(ex), ex.a_data);
337 up_write(&current->mm->mmap_sem);
338 bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex),
339 ex.a_data, &pos);
340 goto beyond_if;
341 }
342#endif
343 278
344 if (N_MAGIC(ex) == OMAGIC) { 279 if (N_MAGIC(ex) == OMAGIC) {
345 unsigned long text_addr, map_size; 280 unsigned long text_addr, map_size;
@@ -347,7 +282,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
347 282
348 text_addr = N_TXTADDR(ex); 283 text_addr = N_TXTADDR(ex);
349 284
350#if defined(__alpha__) || defined(__sparc__) 285#ifdef __alpha__
351 pos = fd_offset; 286 pos = fd_offset;
352 map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; 287 map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
353#else 288#else
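
One easily misread detail in the alpha branch of create_aout_tables(): the loader markers changed radix only, not value, so the words pushed on the stack are identical. A trivial standalone check:

	#include <assert.h>

	int main(void)
	{
		/* 0x3e9..0x3eb are the same markers written in decimal */
		assert(0x3e9 == 1001 && 0x3ea == 1002 && 0x3eb == 1003);
		return 0;
	}
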
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
152 elf_addr_t __user *sp; 152 elf_addr_t __user *sp;
153 elf_addr_t __user *u_platform; 153 elf_addr_t __user *u_platform;
154 elf_addr_t __user *u_base_platform; 154 elf_addr_t __user *u_base_platform;
155 elf_addr_t __user *u_rand_bytes;
155 const char *k_platform = ELF_PLATFORM; 156 const char *k_platform = ELF_PLATFORM;
156 const char *k_base_platform = ELF_BASE_PLATFORM; 157 const char *k_base_platform = ELF_BASE_PLATFORM;
158 unsigned char k_rand_bytes[16];
157 int items; 159 int items;
158 elf_addr_t *elf_info; 160 elf_addr_t *elf_info;
159 int ei_index = 0; 161 int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
196 return -EFAULT; 198 return -EFAULT;
197 } 199 }
198 200
201 /*
202 * Generate 16 random bytes for userspace PRNG seeding.
203 */
204 get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
205 u_rand_bytes = (elf_addr_t __user *)
206 STACK_ALLOC(p, sizeof(k_rand_bytes));
207 if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
208 return -EFAULT;
209
199 /* Create the ELF interpreter info */ 210 /* Create the ELF interpreter info */
200 elf_info = (elf_addr_t *)current->mm->saved_auxv; 211 elf_info = (elf_addr_t *)current->mm->saved_auxv;
201 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ 212 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
228 NEW_AUX_ENT(AT_GID, cred->gid); 239 NEW_AUX_ENT(AT_GID, cred->gid);
229 NEW_AUX_ENT(AT_EGID, cred->egid); 240 NEW_AUX_ENT(AT_EGID, cred->egid);
230 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); 241 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
242 NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
231 NEW_AUX_ENT(AT_EXECFN, bprm->exec); 243 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
232 if (k_platform) { 244 if (k_platform) {
233 NEW_AUX_ENT(AT_PLATFORM, 245 NEW_AUX_ENT(AT_PLATFORM,
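
The new AT_RANDOM entry hands every freshly exec'd process 16 kernel-generated random bytes on its stack, letting userspace seed a PRNG without an extra syscall. A hedged userspace sketch of consuming it, assuming a libc that exposes getauxval(3):

	#include <stdio.h>
	#include <sys/auxv.h>

	int main(void)
	{
		/* AT_RANDOM points at the 16 bytes create_elf_tables() copied in */
		const unsigned char *r = (const unsigned char *)getauxval(AT_RANDOM);
		int i;

		if (!r)
			return 1;
		for (i = 0; i < 16; i++)
			printf("%02x", r[i]);
		printf("\n");
		return 0;
	}
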
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e37..f3e72c5c19f5 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
168 struct elf_fdpic_params exec_params, interp_params; 168 struct elf_fdpic_params exec_params, interp_params;
169 struct elf_phdr *phdr; 169 struct elf_phdr *phdr;
170 unsigned long stack_size, entryaddr; 170 unsigned long stack_size, entryaddr;
171#ifndef CONFIG_MMU
172 unsigned long fullsize;
173#endif
174#ifdef ELF_FDPIC_PLAT_INIT 171#ifdef ELF_FDPIC_PLAT_INIT
175 unsigned long dynaddr; 172 unsigned long dynaddr;
176#endif 173#endif
@@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
390 goto error_kill; 387 goto error_kill;
391 } 388 }
392 389
393 /* expand the stack mapping to use up the entire allocation granule */
394 fullsize = kobjsize((char *) current->mm->start_brk);
395 if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
396 fullsize, 0, 0)))
397 stack_size = fullsize;
398 up_write(&current->mm->mmap_sem); 390 up_write(&current->mm->mmap_sem);
399 391
400 current->mm->brk = current->mm->start_brk; 392 current->mm->brk = current->mm->start_brk;
@@ -1567,11 +1559,9 @@ end_coredump:
1567static int elf_fdpic_dump_segments(struct file *file, size_t *size, 1559static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1568 unsigned long *limit, unsigned long mm_flags) 1560 unsigned long *limit, unsigned long mm_flags)
1569{ 1561{
1570 struct vm_list_struct *vml; 1562 struct vm_area_struct *vma;
1571
1572 for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
1573 struct vm_area_struct *vma = vml->vma;
1574 1563
1564 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1575 if (!maydump(vma, mm_flags)) 1565 if (!maydump(vma, mm_flags))
1576 continue; 1566 continue;
1577 1567
@@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1617 elf_fpxregset_t *xfpu = NULL; 1607 elf_fpxregset_t *xfpu = NULL;
1618#endif 1608#endif
1619 int thread_status_size = 0; 1609 int thread_status_size = 0;
1620#ifndef CONFIG_MMU
1621 struct vm_list_struct *vml;
1622#endif
1623 elf_addr_t *auxv; 1610 elf_addr_t *auxv;
1624 unsigned long mm_flags; 1611 unsigned long mm_flags;
1625 1612
@@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1685 fill_prstatus(prstatus, current, signr); 1672 fill_prstatus(prstatus, current, signr);
1686 elf_core_copy_regs(&prstatus->pr_reg, regs); 1673 elf_core_copy_regs(&prstatus->pr_reg, regs);
1687 1674
1688#ifdef CONFIG_MMU
1689 segs = current->mm->map_count; 1675 segs = current->mm->map_count;
1690#else
1691 segs = 0;
1692 for (vml = current->mm->context.vmlist; vml; vml = vml->next)
1693 segs++;
1694#endif
1695#ifdef ELF_CORE_EXTRA_PHDRS 1676#ifdef ELF_CORE_EXTRA_PHDRS
1696 segs += ELF_CORE_EXTRA_PHDRS; 1677 segs += ELF_CORE_EXTRA_PHDRS;
1697#endif 1678#endif
@@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1766 mm_flags = current->mm->flags; 1747 mm_flags = current->mm->flags;
1767 1748
1768 /* write program headers for segments dump */ 1749 /* write program headers for segments dump */
1769 for ( 1750 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1770#ifdef CONFIG_MMU
1771 vma = current->mm->mmap; vma; vma = vma->vm_next
1772#else
1773 vml = current->mm->context.vmlist; vml; vml = vml->next
1774#endif
1775 ) {
1776 struct elf_phdr phdr; 1751 struct elf_phdr phdr;
1777 size_t sz; 1752 size_t sz;
1778 1753
1779#ifndef CONFIG_MMU
1780 vma = vml->vma;
1781#endif
1782
1783 sz = vma->vm_end - vma->vm_start; 1754 sz = vma->vm_end - vma->vm_start;
1784 1755
1785 phdr.p_type = PT_LOAD; 1756 phdr.p_type = PT_LOAD;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7bbd5c6b3725..5cebf0b37798 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm,
417 unsigned long textpos = 0, datapos = 0, result; 417 unsigned long textpos = 0, datapos = 0, result;
418 unsigned long realdatastart = 0; 418 unsigned long realdatastart = 0;
419 unsigned long text_len, data_len, bss_len, stack_len, flags; 419 unsigned long text_len, data_len, bss_len, stack_len, flags;
420 unsigned long len, reallen, memp = 0; 420 unsigned long len, memp = 0;
421 unsigned long extra, rlim; 421 unsigned long memp_size, extra, rlim;
422 unsigned long *reloc = 0, *rp; 422 unsigned long *reloc = 0, *rp;
423 struct inode *inode; 423 struct inode *inode;
424 int i, rev, relocs = 0; 424 int i, rev, relocs = 0;
@@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm,
543 } 543 }
544 544
545 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); 545 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
546 len = PAGE_ALIGN(len);
546 down_write(&current->mm->mmap_sem); 547 down_write(&current->mm->mmap_sem);
547 realdatastart = do_mmap(0, 0, len, 548 realdatastart = do_mmap(0, 0, len,
548 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); 549 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
549 /* Remap to use all availabe slack region space */
550 if (realdatastart && (realdatastart < (unsigned long)-4096)) {
551 reallen = kobjsize((void *)realdatastart);
552 if (reallen > len) {
553 realdatastart = do_mremap(realdatastart, len,
554 reallen, MREMAP_FIXED, realdatastart);
555 }
556 }
557 up_write(&current->mm->mmap_sem); 550 up_write(&current->mm->mmap_sem);
558 551
559 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { 552 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
@@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm,
591 584
592 reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); 585 reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
593 memp = realdatastart; 586 memp = realdatastart;
594 587 memp_size = len;
595 } else { 588 } else {
596 589
597 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); 590 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
591 len = PAGE_ALIGN(len);
598 down_write(&current->mm->mmap_sem); 592 down_write(&current->mm->mmap_sem);
599 textpos = do_mmap(0, 0, len, 593 textpos = do_mmap(0, 0, len,
600 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); 594 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
601 /* Remap to use all availabe slack region space */
602 if (textpos && (textpos < (unsigned long) -4096)) {
603 reallen = kobjsize((void *)textpos);
604 if (reallen > len) {
605 textpos = do_mremap(textpos, len, reallen,
606 MREMAP_FIXED, textpos);
607 }
608 }
609 up_write(&current->mm->mmap_sem); 595 up_write(&current->mm->mmap_sem);
610 596
611 if (!textpos || textpos >= (unsigned long) -4096) { 597 if (!textpos || textpos >= (unsigned long) -4096) {
@@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm,
622 reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + 608 reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
623 MAX_SHARED_LIBS * sizeof(unsigned long)); 609 MAX_SHARED_LIBS * sizeof(unsigned long));
624 memp = textpos; 610 memp = textpos;
625 611 memp_size = len;
626#ifdef CONFIG_BINFMT_ZFLAT 612#ifdef CONFIG_BINFMT_ZFLAT
627 /* 613 /*
628 * load it all in and treat it like a RAM load from now on 614 * load it all in and treat it like a RAM load from now on
@@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm,
680 * set up the brk stuff, uses any slack left in data/bss/stack 666 * set up the brk stuff, uses any slack left in data/bss/stack
681 * allocation. We put the brk after the bss (between the bss 667 * allocation. We put the brk after the bss (between the bss
682 * and stack) like other platforms. 668 * and stack) like other platforms.
669 * Userspace code relies on the stack pointer starting out at
670 * an address right at the end of a page.
683 */ 671 */
684 current->mm->start_brk = datapos + data_len + bss_len; 672 current->mm->start_brk = datapos + data_len + bss_len;
685 current->mm->brk = (current->mm->start_brk + 3) & ~3; 673 current->mm->brk = (current->mm->start_brk + 3) & ~3;
686 current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; 674 current->mm->context.end_brk = memp + memp_size - stack_len;
687 } 675 }
688 676
689 if (flags & FLAT_FLAG_KTRACE) 677 if (flags & FLAT_FLAG_KTRACE)
@@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm,
790 778
791 /* zero the BSS, BRK and stack areas */ 779 /* zero the BSS, BRK and stack areas */
792 memset((void*)(datapos + data_len), 0, bss_len + 780 memset((void*)(datapos + data_len), 0, bss_len +
793 (memp + kobjsize((void *) memp) - stack_len - /* end brk */ 781 (memp + memp_size - stack_len - /* end brk */
794 libinfo->lib_list[id].start_brk) + /* start brk */ 782 libinfo->lib_list[id].start_brk) + /* start brk */
795 stack_len); 783 stack_len);
796 784
797 return 0; 785 return 0;
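
With the NOMMU mremap trick gone, load_flat_file() page-aligns the mapping length up front and carries it in memp_size rather than re-deriving it from kobjsize(); per the new comment, userspace expects the initial stack pointer to sit right at the end of a page. PAGE_ALIGN is the usual round-up helper; a sketch assuming its conventional definition (PAGE_SIZE a power of two):

	/* round len up to the next page boundary; with 4096-byte pages,
	 * PAGE_ALIGN(1) == 4096, PAGE_ALIGN(4096) == 4096,
	 * PAGE_ALIGN(4097) == 8192 */
	#define PAGE_ALIGN(len)	(((len) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
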
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b3..c4e83537ead7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
496 496
497 if (inode) { 497 if (inode) {
498 inode->i_mode = mode; 498 inode->i_mode = mode;
499 inode->i_uid = 0;
500 inode->i_gid = 0;
501 inode->i_blocks = 0;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = 499 inode->i_atime = inode->i_mtime = inode->i_ctime =
503 current_fs_time(inode->i_sb); 500 current_fs_time(inode->i_sb);
504 } 501 }
@@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = {
652static ssize_t 649static ssize_t
653bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) 650bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
654{ 651{
655 char *s = enabled ? "enabled" : "disabled"; 652 char *s = enabled ? "enabled\n" : "disabled\n";
656 653
657 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); 654 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
658} 655}
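
Appending the newline in the kernel means a plain cat of the status file now prints a clean line. A minimal userspace read, assuming binfmt_misc is mounted at its usual location:

	#include <stdio.h>

	int main(void)
	{
		char buf[16] = "";
		FILE *f = fopen("/proc/sys/fs/binfmt_misc/status", "r");

		if (!f)
			return 1;
		if (fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);	/* "enabled\n" or "disabled\n" */
		fclose(f);
		return 0;
	}
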
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 19caf7c962ac..77ebc3c263d6 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -111,7 +111,7 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
111 && bip->bip_buf != NULL) 111 && bip->bip_buf != NULL)
112 kfree(bip->bip_buf); 112 kfree(bip->bip_buf);
113 113
114 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); 114 bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
115 mempool_free(bip, bs->bio_integrity_pool); 115 mempool_free(bip, bs->bio_integrity_pool);
116 116
117 bio->bi_integrity = NULL; 117 bio->bi_integrity = NULL;
diff --git a/fs/bio.c b/fs/bio.c
index df99c882b807..062299acbccd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -31,7 +31,11 @@
31 31
32DEFINE_TRACE(block_split); 32DEFINE_TRACE(block_split);
33 33
34static struct kmem_cache *bio_slab __read_mostly; 34/*
35 * Test patch to inline a certain number of bi_io_vec's inside the bio
36 * itself, to shrink a bio data allocation from two mempool calls to one
37 */
38#define BIO_INLINE_VECS 4
35 39
36static mempool_t *bio_split_pool __read_mostly; 40static mempool_t *bio_split_pool __read_mostly;
37 41
@@ -40,9 +44,8 @@ static mempool_t *bio_split_pool __read_mostly;
40 * break badly! cannot be bigger than what you can fit into an 44 * break badly! cannot be bigger than what you can fit into an
41 * unsigned short 45 * unsigned short
42 */ 46 */
43
44#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 47#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
45static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 48struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
46 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 49 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
47}; 50};
48#undef BV 51#undef BV
@@ -53,12 +56,121 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
53 */ 56 */
54struct bio_set *fs_bio_set; 57struct bio_set *fs_bio_set;
55 58
59/*
60 * Our slab pool management
61 */
62struct bio_slab {
63 struct kmem_cache *slab;
64 unsigned int slab_ref;
65 unsigned int slab_size;
66 char name[8];
67};
68static DEFINE_MUTEX(bio_slab_lock);
69static struct bio_slab *bio_slabs;
70static unsigned int bio_slab_nr, bio_slab_max;
71
72static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
73{
74 unsigned int sz = sizeof(struct bio) + extra_size;
75 struct kmem_cache *slab = NULL;
76 struct bio_slab *bslab;
77 unsigned int i, entry = -1;
78
79 mutex_lock(&bio_slab_lock);
80
81 i = 0;
82 while (i < bio_slab_nr) {
83 struct bio_slab *bslab = &bio_slabs[i];
84
85 if (!bslab->slab && entry == -1)
86 entry = i;
87 else if (bslab->slab_size == sz) {
88 slab = bslab->slab;
89 bslab->slab_ref++;
90 break;
91 }
92 i++;
93 }
94
95 if (slab)
96 goto out_unlock;
97
98 if (bio_slab_nr == bio_slab_max && entry == -1) {
99 bio_slab_max <<= 1;
100 bio_slabs = krealloc(bio_slabs,
101 bio_slab_max * sizeof(struct bio_slab),
102 GFP_KERNEL);
103 if (!bio_slabs)
104 goto out_unlock;
105 }
106 if (entry == -1)
107 entry = bio_slab_nr++;
108
109 bslab = &bio_slabs[entry];
110
111 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
112 slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
113 if (!slab)
114 goto out_unlock;
115
116 printk("bio: create slab <%s> at %d\n", bslab->name, entry);
117 bslab->slab = slab;
118 bslab->slab_ref = 1;
119 bslab->slab_size = sz;
120out_unlock:
121 mutex_unlock(&bio_slab_lock);
122 return slab;
123}
124
125static void bio_put_slab(struct bio_set *bs)
126{
127 struct bio_slab *bslab = NULL;
128 unsigned int i;
129
130 mutex_lock(&bio_slab_lock);
131
132 for (i = 0; i < bio_slab_nr; i++) {
133 if (bs->bio_slab == bio_slabs[i].slab) {
134 bslab = &bio_slabs[i];
135 break;
136 }
137 }
138
139 if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
140 goto out;
141
142 WARN_ON(!bslab->slab_ref);
143
144 if (--bslab->slab_ref)
145 goto out;
146
147 kmem_cache_destroy(bslab->slab);
148 bslab->slab = NULL;
149
150out:
151 mutex_unlock(&bio_slab_lock);
152}
153
56unsigned int bvec_nr_vecs(unsigned short idx) 154unsigned int bvec_nr_vecs(unsigned short idx)
57{ 155{
58 return bvec_slabs[idx].nr_vecs; 156 return bvec_slabs[idx].nr_vecs;
59} 157}
60 158
61struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) 159void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
160{
161 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
162
163 if (idx == BIOVEC_MAX_IDX)
164 mempool_free(bv, bs->bvec_pool);
165 else {
166 struct biovec_slab *bvs = bvec_slabs + idx;
167
168 kmem_cache_free(bvs->slab, bv);
169 }
170}
171
172struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
173 struct bio_set *bs)
62{ 174{
63 struct bio_vec *bvl; 175 struct bio_vec *bvl;
64 176
@@ -67,60 +179,85 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
67 * If not, this is a bio_kmalloc() allocation and just do a 179 * If not, this is a bio_kmalloc() allocation and just do a
68 * kzalloc() for the exact number of vecs right away. 180 * kzalloc() for the exact number of vecs right away.
69 */ 181 */
70 if (bs) { 182 if (!bs)
183 bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
184
185 /*
186 * see comment near bvec_array define!
187 */
188 switch (nr) {
189 case 1:
190 *idx = 0;
191 break;
192 case 2 ... 4:
193 *idx = 1;
194 break;
195 case 5 ... 16:
196 *idx = 2;
197 break;
198 case 17 ... 64:
199 *idx = 3;
200 break;
201 case 65 ... 128:
202 *idx = 4;
203 break;
204 case 129 ... BIO_MAX_PAGES:
205 *idx = 5;
206 break;
207 default:
208 return NULL;
209 }
210
211 /*
212 * idx now points to the pool we want to allocate from. only the
213 * 1-vec entry pool is mempool backed.
214 */
215 if (*idx == BIOVEC_MAX_IDX) {
216fallback:
217 bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
218 } else {
219 struct biovec_slab *bvs = bvec_slabs + *idx;
220 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
221
71 /* 222 /*
72 * see comment near bvec_array define! 223 * Make this allocation restricted and don't dump info on
224 * allocation failures, since we'll fallback to the mempool
225 * in case of failure.
73 */ 226 */
74 switch (nr) { 227 __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
75 case 1:
76 *idx = 0;
77 break;
78 case 2 ... 4:
79 *idx = 1;
80 break;
81 case 5 ... 16:
82 *idx = 2;
83 break;
84 case 17 ... 64:
85 *idx = 3;
86 break;
87 case 65 ... 128:
88 *idx = 4;
89 break;
90 case 129 ... BIO_MAX_PAGES:
91 *idx = 5;
92 break;
93 default:
94 return NULL;
95 }
96 228
97 /* 229 /*
98 * idx now points to the pool we want to allocate from 230 * Try a slab allocation. If this fails and __GFP_WAIT
231 * is set, retry with the 1-entry mempool
99 */ 232 */
100 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 233 bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
101 if (bvl) 234 if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
102 memset(bvl, 0, 235 *idx = BIOVEC_MAX_IDX;
103 bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); 236 goto fallback;
104 } else 237 }
105 bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); 238 }
106 239
107 return bvl; 240 return bvl;
108} 241}
109 242
110void bio_free(struct bio *bio, struct bio_set *bio_set) 243void bio_free(struct bio *bio, struct bio_set *bs)
111{ 244{
112 if (bio->bi_io_vec) { 245 void *p;
113 const int pool_idx = BIO_POOL_IDX(bio);
114
115 BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
116 246
117 mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); 247 if (bio_has_allocated_vec(bio))
118 } 248 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
119 249
120 if (bio_integrity(bio)) 250 if (bio_integrity(bio))
121 bio_integrity_free(bio, bio_set); 251 bio_integrity_free(bio, bs);
252
253 /*
254 * If we have front padding, adjust the bio pointer before freeing
255 */
256 p = bio;
257 if (bs->front_pad)
258 p -= bs->front_pad;
122 259
123 mempool_free(bio, bio_set->bio_pool); 260 mempool_free(p, bs->bio_pool);
124} 261}
125 262
126/* 263/*
@@ -133,7 +270,8 @@ static void bio_fs_destructor(struct bio *bio)
133 270
134static void bio_kmalloc_destructor(struct bio *bio) 271static void bio_kmalloc_destructor(struct bio *bio)
135{ 272{
136 kfree(bio->bi_io_vec); 273 if (bio_has_allocated_vec(bio))
274 kfree(bio->bi_io_vec);
137 kfree(bio); 275 kfree(bio);
138} 276}
139 277
@@ -157,16 +295,20 @@ void bio_init(struct bio *bio)
157 * for a &struct bio to become free. If a %NULL @bs is passed in, we will 295 * for a &struct bio to become free. If a %NULL @bs is passed in, we will
158 * fall back to just using @kmalloc to allocate the required memory. 296 * fall back to just using @kmalloc to allocate the required memory.
159 * 297 *
 160 * allocate bio and iovecs from the memory pools specified by the 298 * Note that the caller must set ->bi_destructor on successful return
161 * bio_set structure, or @kmalloc if none given. 299 * of a bio, to do the appropriate freeing of the bio once the reference
300 * count drops to zero.
162 **/ 301 **/
163struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
164{ 303{
165 struct bio *bio; 304 struct bio *bio = NULL;
305
306 if (bs) {
307 void *p = mempool_alloc(bs->bio_pool, gfp_mask);
166 308
167 if (bs) 309 if (p)
168 bio = mempool_alloc(bs->bio_pool, gfp_mask); 310 bio = p + bs->front_pad;
169 else 311 } else
170 bio = kmalloc(sizeof(*bio), gfp_mask); 312 bio = kmalloc(sizeof(*bio), gfp_mask);
171 313
172 if (likely(bio)) { 314 if (likely(bio)) {
@@ -176,7 +318,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
176 if (likely(nr_iovecs)) { 318 if (likely(nr_iovecs)) {
177 unsigned long uninitialized_var(idx); 319 unsigned long uninitialized_var(idx);
178 320
179 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 321 if (nr_iovecs <= BIO_INLINE_VECS) {
322 idx = 0;
323 bvl = bio->bi_inline_vecs;
324 nr_iovecs = BIO_INLINE_VECS;
325 } else {
326 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
327 bs);
328 nr_iovecs = bvec_nr_vecs(idx);
329 }
180 if (unlikely(!bvl)) { 330 if (unlikely(!bvl)) {
181 if (bs) 331 if (bs)
182 mempool_free(bio, bs->bio_pool); 332 mempool_free(bio, bs->bio_pool);
@@ -186,7 +336,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
186 goto out; 336 goto out;
187 } 337 }
188 bio->bi_flags |= idx << BIO_POOL_OFFSET; 338 bio->bi_flags |= idx << BIO_POOL_OFFSET;
189 bio->bi_max_vecs = bvec_nr_vecs(idx); 339 bio->bi_max_vecs = nr_iovecs;
190 } 340 }
191 bio->bi_io_vec = bvl; 341 bio->bi_io_vec = bvl;
192 } 342 }
@@ -638,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
638 int i, ret; 788 int i, ret;
639 int nr_pages = 0; 789 int nr_pages = 0;
640 unsigned int len = 0; 790 unsigned int len = 0;
791 unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
641 792
642 for (i = 0; i < iov_count; i++) { 793 for (i = 0; i < iov_count; i++) {
643 unsigned long uaddr; 794 unsigned long uaddr;
@@ -664,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
664 bio->bi_rw |= (!write_to_vm << BIO_RW); 815 bio->bi_rw |= (!write_to_vm << BIO_RW);
665 816
666 ret = 0; 817 ret = 0;
667 i = 0; 818
819 if (map_data) {
820 nr_pages = 1 << map_data->page_order;
821 i = map_data->offset / PAGE_SIZE;
822 }
668 while (len) { 823 while (len) {
669 unsigned int bytes; 824 unsigned int bytes = PAGE_SIZE;
670 825
671 if (map_data) 826 bytes -= offset;
672 bytes = 1U << (PAGE_SHIFT + map_data->page_order);
673 else
674 bytes = PAGE_SIZE;
675 827
676 if (bytes > len) 828 if (bytes > len)
677 bytes = len; 829 bytes = len;
678 830
679 if (map_data) { 831 if (map_data) {
680 if (i == map_data->nr_entries) { 832 if (i == map_data->nr_entries * nr_pages) {
681 ret = -ENOMEM; 833 ret = -ENOMEM;
682 break; 834 break;
683 } 835 }
684 page = map_data->pages[i++]; 836
685 } else 837 page = map_data->pages[i / nr_pages];
838 page += (i % nr_pages);
839
840 i++;
841 } else {
686 page = alloc_page(q->bounce_gfp | gfp_mask); 842 page = alloc_page(q->bounce_gfp | gfp_mask);
687 if (!page) { 843 if (!page) {
688 ret = -ENOMEM; 844 ret = -ENOMEM;
689 break; 845 break;
846 }
690 } 847 }
691 848
692 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) 849 if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
693 break; 850 break;
694 851
695 len -= bytes; 852 len -= bytes;
853 offset = 0;
696 } 854 }
697 855
698 if (ret) 856 if (ret)
@@ -701,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
701 /* 859 /*
702 * success 860 * success
703 */ 861 */
704 if (!write_to_vm) { 862 if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
705 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); 863 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
706 if (ret) 864 if (ret)
707 goto cleanup; 865 goto cleanup;
@@ -1346,30 +1504,18 @@ EXPORT_SYMBOL(bio_sector_offset);
1346 */ 1504 */
1347static int biovec_create_pools(struct bio_set *bs, int pool_entries) 1505static int biovec_create_pools(struct bio_set *bs, int pool_entries)
1348{ 1506{
1349 int i; 1507 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1350 1508
1351 for (i = 0; i < BIOVEC_NR_POOLS; i++) { 1509 bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
1352 struct biovec_slab *bp = bvec_slabs + i; 1510 if (!bs->bvec_pool)
1353 mempool_t **bvp = bs->bvec_pools + i; 1511 return -ENOMEM;
1354 1512
1355 *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
1356 if (!*bvp)
1357 return -ENOMEM;
1358 }
1359 return 0; 1513 return 0;
1360} 1514}
1361 1515
1362static void biovec_free_pools(struct bio_set *bs) 1516static void biovec_free_pools(struct bio_set *bs)
1363{ 1517{
1364 int i; 1518 mempool_destroy(bs->bvec_pool);
1365
1366 for (i = 0; i < BIOVEC_NR_POOLS; i++) {
1367 mempool_t *bvp = bs->bvec_pools[i];
1368
1369 if (bvp)
1370 mempool_destroy(bvp);
1371 }
1372
1373} 1519}
1374 1520
1375void bioset_free(struct bio_set *bs) 1521void bioset_free(struct bio_set *bs)
@@ -1379,25 +1525,49 @@ void bioset_free(struct bio_set *bs)
1379 1525
1380 bioset_integrity_free(bs); 1526 bioset_integrity_free(bs);
1381 biovec_free_pools(bs); 1527 biovec_free_pools(bs);
1528 bio_put_slab(bs);
1382 1529
1383 kfree(bs); 1530 kfree(bs);
1384} 1531}
1385 1532
1386struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) 1533/**
1534 * bioset_create - Create a bio_set
1535 * @pool_size: Number of bio and bio_vecs to cache in the mempool
1536 * @front_pad: Number of bytes to allocate in front of the returned bio
1537 *
1538 * Description:
1539 * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
1540 * to ask for a number of bytes to be allocated in front of the bio.
1541 * Front pad allocation is useful for embedding the bio inside
1542 * another structure, to avoid allocating extra data to go with the bio.
1543 * Note that the bio must be embedded at the END of that structure always,
1544 * or things will break badly.
1545 */
1546struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1387{ 1547{
1388 struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL); 1548 unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1549 struct bio_set *bs;
1389 1550
1551 bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1390 if (!bs) 1552 if (!bs)
1391 return NULL; 1553 return NULL;
1392 1554
1393 bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab); 1555 bs->front_pad = front_pad;
1556
1557 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
1558 if (!bs->bio_slab) {
1559 kfree(bs);
1560 return NULL;
1561 }
1562
1563 bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
1394 if (!bs->bio_pool) 1564 if (!bs->bio_pool)
1395 goto bad; 1565 goto bad;
1396 1566
1397 if (bioset_integrity_create(bs, bio_pool_size)) 1567 if (bioset_integrity_create(bs, pool_size))
1398 goto bad; 1568 goto bad;
1399 1569
1400 if (!biovec_create_pools(bs, bvec_pool_size)) 1570 if (!biovec_create_pools(bs, pool_size))
1401 return bs; 1571 return bs;
1402 1572
1403bad: 1573bad:
@@ -1421,12 +1591,16 @@ static void __init biovec_init_slabs(void)
1421 1591
1422static int __init init_bio(void) 1592static int __init init_bio(void)
1423{ 1593{
1424 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 1594 bio_slab_max = 2;
1595 bio_slab_nr = 0;
1596 bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
1597 if (!bio_slabs)
1598 panic("bio: can't allocate bios\n");
1425 1599
1426 bio_integrity_init_slab(); 1600 bio_integrity_init_slab();
1427 biovec_init_slabs(); 1601 biovec_init_slabs();
1428 1602
1429 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); 1603 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
1430 if (!fs_bio_set) 1604 if (!fs_bio_set)
1431 panic("bio: can't allocate bios\n"); 1605 panic("bio: can't allocate bios\n");
1432 1606
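
Taken together, the fs/bio.c changes drop the single global bio slab in favour of per-bio_set slabs sized for caller-specified front padding, inline up to BIO_INLINE_VECS vectors in the bio itself, and mempool-back only the largest biovec slab. The front_pad argument is what lets a driver embed a bio at the end of its own structure and recover the container with container_of(). A hedged sketch of that usage pattern (my_request, my_bio_set and my_init are illustrative names, not kernel API; per the updated bio_alloc_bioset() comment, a real caller must also set ->bi_destructor):

	#include <linux/bio.h>
	#include <linux/init.h>

	struct my_request {
		unsigned long start_time;	/* caller-private state ... */
		struct bio bio;			/* ... the bio must come last */
	};

	static struct bio_set *my_bio_set;

	static int __init my_init(void)
	{
		/* ask for our private bytes to be allocated in front of the bio */
		my_bio_set = bioset_create(BIO_POOL_SIZE,
					   offsetof(struct my_request, bio));
		return my_bio_set ? 0 : -ENOMEM;
	}

	static struct my_request *my_alloc(gfp_t gfp)
	{
		struct bio *bio = bio_alloc_bioset(gfp, 1, my_bio_set);

		return bio ? container_of(bio, struct my_request, bio) : NULL;
	}
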
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99e0ae1a4c78..b3c1efff5e1d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -285,6 +285,8 @@ static void init_once(void *foo)
285 INIT_LIST_HEAD(&bdev->bd_holder_list); 285 INIT_LIST_HEAD(&bdev->bd_holder_list);
286#endif 286#endif
287 inode_init_once(&ei->vfs_inode); 287 inode_init_once(&ei->vfs_inode);
288 /* Initialize mutex for freeze. */
289 mutex_init(&bdev->bd_fsfreeze_mutex);
288} 290}
289 291
290static inline void __bd_forget(struct inode *inode) 292static inline void __bd_forget(struct inode *inode)
@@ -326,12 +328,13 @@ static struct file_system_type bd_type = {
326 .kill_sb = kill_anon_super, 328 .kill_sb = kill_anon_super,
327}; 329};
328 330
329static struct vfsmount *bd_mnt __read_mostly; 331struct super_block *blockdev_superblock __read_mostly;
330struct super_block *blockdev_superblock;
331 332
332void __init bdev_cache_init(void) 333void __init bdev_cache_init(void)
333{ 334{
334 int err; 335 int err;
336 struct vfsmount *bd_mnt;
337
335 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 338 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
336 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 339 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
337 SLAB_MEM_SPREAD|SLAB_PANIC), 340 SLAB_MEM_SPREAD|SLAB_PANIC),
@@ -373,7 +376,7 @@ struct block_device *bdget(dev_t dev)
373 struct block_device *bdev; 376 struct block_device *bdev;
374 struct inode *inode; 377 struct inode *inode;
375 378
376 inode = iget5_locked(bd_mnt->mnt_sb, hash(dev), 379 inode = iget5_locked(blockdev_superblock, hash(dev),
377 bdev_test, bdev_set, &dev); 380 bdev_test, bdev_set, &dev);
378 381
379 if (!inode) 382 if (!inode)
@@ -463,7 +466,7 @@ void bd_forget(struct inode *inode)
463 466
464 spin_lock(&bdev_lock); 467 spin_lock(&bdev_lock);
465 if (inode->i_bdev) { 468 if (inode->i_bdev) {
466 if (inode->i_sb != blockdev_superblock) 469 if (!sb_is_blkdev_sb(inode->i_sb))
467 bdev = inode->i_bdev; 470 bdev = inode->i_bdev;
468 __bd_forget(inode); 471 __bd_forget(inode);
469 } 472 }
@@ -1004,6 +1007,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1004 } 1007 }
1005 1008
1006 lock_kernel(); 1009 lock_kernel();
1010 restart:
1007 1011
1008 ret = -ENXIO; 1012 ret = -ENXIO;
1009 disk = get_gendisk(bdev->bd_dev, &partno); 1013 disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1024,6 +1028,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1024 1028
1025 if (disk->fops->open) { 1029 if (disk->fops->open) {
1026 ret = disk->fops->open(bdev, mode); 1030 ret = disk->fops->open(bdev, mode);
1031 if (ret == -ERESTARTSYS) {
1032 /* Lost a race with 'disk' being
1033 * deleted, try again.
1034 * See md.c
1035 */
1036 disk_put_part(bdev->bd_part);
1037 bdev->bd_part = NULL;
1038 module_put(disk->fops->owner);
1039 put_disk(disk);
1040 bdev->bd_disk = NULL;
1041 mutex_unlock(&bdev->bd_mutex);
1042 goto restart;
1043 }
1027 if (ret) 1044 if (ret)
1028 goto out_clear; 1045 goto out_clear;
1029 } 1046 }
@@ -1219,6 +1236,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1219 return blkdev_ioctl(bdev, mode, cmd, arg); 1236 return blkdev_ioctl(bdev, mode, cmd, arg);
1220} 1237}
1221 1238
1239/*
1240 * Try to release a page associated with block device when the system
1241 * is under memory pressure.
1242 */
1243static int blkdev_releasepage(struct page *page, gfp_t wait)
1244{
1245 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1246
1247 if (super && super->s_op->bdev_try_to_free_page)
1248 return super->s_op->bdev_try_to_free_page(super, page, wait);
1249
1250 return try_to_free_buffers(page);
1251}
1252
1222static const struct address_space_operations def_blk_aops = { 1253static const struct address_space_operations def_blk_aops = {
1223 .readpage = blkdev_readpage, 1254 .readpage = blkdev_readpage,
1224 .writepage = blkdev_writepage, 1255 .writepage = blkdev_writepage,
@@ -1226,6 +1257,7 @@ static const struct address_space_operations def_blk_aops = {
1226 .write_begin = blkdev_write_begin, 1257 .write_begin = blkdev_write_begin,
1227 .write_end = blkdev_write_end, 1258 .write_end = blkdev_write_end,
1228 .writepages = generic_writepages, 1259 .writepages = generic_writepages,
1260 .releasepage = blkdev_releasepage,
1229 .direct_IO = blkdev_direct_IO, 1261 .direct_IO = blkdev_direct_IO,
1230}; 1262};
1231 1263
@@ -1261,7 +1293,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1261 1293
1262/** 1294/**
1263 * lookup_bdev - lookup a struct block_device by name 1295 * lookup_bdev - lookup a struct block_device by name
1264 * @path: special file representing the block device 1296 * @pathname: special file representing the block device
1265 * 1297 *
1266 * Get a reference to the blockdevice at @pathname in the current 1298 * Get a reference to the blockdevice at @pathname in the current
1267 * namespace if possible and return it. Return ERR_PTR(error) 1299 * namespace if possible and return it. Return ERR_PTR(error)
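
blkdev_releasepage() gives the filesystem mounted on a block device a veto over freeing that device's page-cache pages under memory pressure, via the new s_op->bdev_try_to_free_page hook. A hedged sketch of a filesystem wiring it up (myfs_* names are illustrative; a journaling filesystem would first release its own pins on the page's buffers):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	static int myfs_bdev_try_to_free_page(struct super_block *sb,
					      struct page *page, gfp_t wait)
	{
		/* drop fs-private references to the page's buffers here,
		 * then fall back to generic buffer-head freeing */
		return try_to_free_buffers(page);
	}

	static const struct super_operations myfs_sops = {
		/* ... other operations ... */
		.bdev_try_to_free_page	= myfs_bdev_try_to_free_page,
	};
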
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
1ifneq ($(KERNELRELEASE),)
2# kbuild part of makefile
3
4obj-$(CONFIG_BTRFS_FS) := btrfs.o
5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 file-item.o inode-item.o inode-map.o disk-io.o \
7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o
12else
13
14# Normal Makefile
15
16KERNELDIR := /lib/modules/`uname -r`/build
17all:
18 $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
19
20modules_install:
21 $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
22clean:
23 $(MAKE) -C $(KERNELDIR) M=`pwd` clean
24
25endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..1d53b62dbba5
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/string.h>
21#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h>
24#include <linux/sched.h>
25
26#include "ctree.h"
27#include "btrfs_inode.h"
28#include "xattr.h"
29
30#ifdef CONFIG_FS_POSIX_ACL
31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{
45 int size;
46 const char *name;
47 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl;
49
50 switch (type) {
51 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break;
55 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break;
59 default:
60 return ERR_PTR(-EINVAL);
61 }
62
63 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED)
65 acl = posix_acl_dup(*p_acl);
66 spin_unlock(&inode->i_lock);
67
68 if (acl)
69 return acl;
70
71
72 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS);
75 if (!value)
76 return ERR_PTR(-ENOMEM);
77 size = __btrfs_getxattr(inode, name, value, size);
78 if (size > 0) {
79 acl = posix_acl_from_xattr(value, size);
80 btrfs_update_cached_acl(inode, p_acl, acl);
81 }
82 kfree(value);
83 } else if (size == -ENOENT) {
84 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl);
86 }
87
88 return acl;
89}
90
91static int btrfs_xattr_get_acl(struct inode *inode, int type,
92 void *value, size_t size)
93{
94 struct posix_acl *acl;
95 int ret = 0;
96
97 acl = btrfs_get_acl(inode, type);
98
99 if (IS_ERR(acl))
100 return PTR_ERR(acl);
101 if (acl == NULL)
102 return -ENODATA;
103 ret = posix_acl_to_xattr(acl, value, size);
104 posix_acl_release(acl);
105
106 return ret;
107}
108
109/*
110 * Needs to be called with fs_mutex held
111 */
112static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
113{
114 int ret, size = 0;
115 const char *name;
116 struct posix_acl **p_acl;
117 char *value = NULL;
118 mode_t mode;
119
120 if (acl) {
121 ret = posix_acl_valid(acl);
122 if (ret < 0)
123 return ret;
124 ret = 0;
125 }
126
127 switch (type) {
128 case ACL_TYPE_ACCESS:
129 mode = inode->i_mode;
130 ret = posix_acl_equiv_mode(acl, &mode);
131 if (ret < 0)
132 return ret;
133 ret = 0;
134 inode->i_mode = mode;
135 name = POSIX_ACL_XATTR_ACCESS;
136 p_acl = &BTRFS_I(inode)->i_acl;
137 break;
138 case ACL_TYPE_DEFAULT:
139 if (!S_ISDIR(inode->i_mode))
140 return acl ? -EINVAL : 0;
141 name = POSIX_ACL_XATTR_DEFAULT;
142 p_acl = &BTRFS_I(inode)->i_default_acl;
143 break;
144 default:
145 return -EINVAL;
146 }
147
148 if (acl) {
149 size = posix_acl_xattr_size(acl->a_count);
150 value = kmalloc(size, GFP_NOFS);
151 if (!value) {
152 ret = -ENOMEM;
153 goto out;
154 }
155
156 ret = posix_acl_to_xattr(acl, value, size);
157 if (ret < 0)
158 goto out;
159 }
160
161 ret = __btrfs_setxattr(inode, name, value, size, 0);
162
163out:
164 kfree(value);
165
166 if (!ret)
167 btrfs_update_cached_acl(inode, p_acl, acl);
168
169 return ret;
170}
171
172static int btrfs_xattr_set_acl(struct inode *inode, int type,
173 const void *value, size_t size)
174{
175 int ret = 0;
176 struct posix_acl *acl = NULL;
177
178 if (value) {
179 acl = posix_acl_from_xattr(value, size);
180 if (acl == NULL) {
181 value = NULL;
182 size = 0;
183 } else if (IS_ERR(acl)) {
184 return PTR_ERR(acl);
185 }
186 }
187
188 ret = btrfs_set_acl(inode, acl, type);
189
190 posix_acl_release(acl);
191
192 return ret;
193}
194
195
196static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
197 void *value, size_t size)
198{
199 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
200}
201
202static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
203 const void *value, size_t size, int flags)
204{
205 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
206}
207
208static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
209 void *value, size_t size)
210{
211 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
212}
213
214static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
215 const void *value, size_t size, int flags)
216{
217 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
218}
219
220int btrfs_check_acl(struct inode *inode, int mask)
221{
222 struct posix_acl *acl;
223 int error = -EAGAIN;
224
225 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
226
227 if (IS_ERR(acl))
228 return PTR_ERR(acl);
229 if (acl) {
230 error = posix_acl_permission(inode, acl, mask);
231 posix_acl_release(acl);
232 }
233
234 return error;
235}
236
237/*
238 * btrfs_init_acl is already generally called under fs_mutex, so the locking
239 * stuff has been fixed to work with that. If the locking stuff changes, we
240 * need to re-evaluate the acl locking stuff.
241 */
242int btrfs_init_acl(struct inode *inode, struct inode *dir)
243{
244 struct posix_acl *acl = NULL;
245 int ret = 0;
246
247 /* this happens with subvols */
248 if (!dir)
249 return 0;
250
251 if (!S_ISLNK(inode->i_mode)) {
252 if (IS_POSIXACL(dir)) {
253 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
254 if (IS_ERR(acl))
255 return PTR_ERR(acl);
256 }
257
258 if (!acl)
259 inode->i_mode &= ~current->fs->umask;
260 }
261
262 if (IS_POSIXACL(dir) && acl) {
263 struct posix_acl *clone;
264 mode_t mode;
265
266 if (S_ISDIR(inode->i_mode)) {
267 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
268 if (ret)
269 goto failed;
270 }
271 clone = posix_acl_clone(acl, GFP_NOFS);
272 ret = -ENOMEM;
273 if (!clone)
274 goto failed;
275
276 mode = inode->i_mode;
277 ret = posix_acl_create_masq(clone, &mode);
278 if (ret >= 0) {
279 inode->i_mode = mode;
280 if (ret > 0) {
281 /* we need an acl */
282 ret = btrfs_set_acl(inode, clone,
283 ACL_TYPE_ACCESS);
284 }
285 }
286 }
287failed:
288 posix_acl_release(acl);
289
290 return ret;
291}
292
293int btrfs_acl_chmod(struct inode *inode)
294{
295 struct posix_acl *acl, *clone;
296 int ret = 0;
297
298 if (S_ISLNK(inode->i_mode))
299 return -EOPNOTSUPP;
300
301 if (!IS_POSIXACL(inode))
302 return 0;
303
304 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
305 if (IS_ERR(acl) || !acl)
306 return PTR_ERR(acl);
307
308 clone = posix_acl_clone(acl, GFP_KERNEL);
309 posix_acl_release(acl);
310 if (!clone)
311 return -ENOMEM;
312
313 ret = posix_acl_chmod_masq(clone, inode->i_mode);
314 if (!ret)
315 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
316
317 posix_acl_release(clone);
318
319 return ret;
320}
321
322struct xattr_handler btrfs_xattr_acl_default_handler = {
323 .prefix = POSIX_ACL_XATTR_DEFAULT,
324 .get = btrfs_xattr_acl_default_get,
325 .set = btrfs_xattr_acl_default_set,
326};
327
328struct xattr_handler btrfs_xattr_acl_access_handler = {
329 .prefix = POSIX_ACL_XATTR_ACCESS,
330 .get = btrfs_xattr_acl_access_get,
331 .set = btrfs_xattr_acl_access_set,
332};
333
334#else /* CONFIG_FS_POSIX_ACL */
335
336int btrfs_acl_chmod(struct inode *inode)
337{
338 return 0;
339}
340
341int btrfs_init_acl(struct inode *inode, struct inode *dir)
342{
343 return 0;
344}
345
346int btrfs_check_acl(struct inode *inode, int mask)
347{
348 return 0;
349}
350
351#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..8e2fec05dbe0
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/kthread.h>
21#include <linux/list.h>
22#include <linux/spinlock.h>
23#include <linux/freezer.h>
24#include "async-thread.h"
25
26#define WORK_QUEUED_BIT 0
27#define WORK_DONE_BIT 1
28#define WORK_ORDER_DONE_BIT 2
29
30/*
31 * container for the kthread task pointer and the list of pending work.
32 * One of these is allocated per thread.
33 */
34struct btrfs_worker_thread {
35 /* pool we belong to */
36 struct btrfs_workers *workers;
37
38 /* list of struct btrfs_work that are waiting for service */
39 struct list_head pending;
40
41 /* list of worker threads from struct btrfs_workers */
42 struct list_head worker_list;
43
44 /* kthread */
45 struct task_struct *task;
46
47 /* number of things on the pending list */
48 atomic_t num_pending;
49
50 unsigned long sequence;
51
52 /* protects the pending list. */
53 spinlock_t lock;
54
55 /* set to non-zero when this thread is already awake and kicking */
56 int working;
57
58 /* are we currently idle */
59 int idle;
60};
61
62/*
63 * helper function to move a thread onto the idle list after it
64 * has finished some requests.
65 */
66static void check_idle_worker(struct btrfs_worker_thread *worker)
67{
68 if (!worker->idle && atomic_read(&worker->num_pending) <
69 worker->workers->idle_thresh / 2) {
70 unsigned long flags;
71 spin_lock_irqsave(&worker->workers->lock, flags);
72 worker->idle = 1;
73 list_move(&worker->worker_list, &worker->workers->idle_list);
74 spin_unlock_irqrestore(&worker->workers->lock, flags);
75 }
76}
77
78/*
79 * helper function to move a thread off the idle list after new
80 * pending work is added.
81 */
82static void check_busy_worker(struct btrfs_worker_thread *worker)
83{
84 if (worker->idle && atomic_read(&worker->num_pending) >=
85 worker->workers->idle_thresh) {
86 unsigned long flags;
87 spin_lock_irqsave(&worker->workers->lock, flags);
88 worker->idle = 0;
89 list_move_tail(&worker->worker_list,
90 &worker->workers->worker_list);
91 spin_unlock_irqrestore(&worker->workers->lock, flags);
92 }
93}
94
95static noinline int run_ordered_completions(struct btrfs_workers *workers,
96 struct btrfs_work *work)
97{
98 unsigned long flags;
99
100 if (!workers->ordered)
101 return 0;
102
103 set_bit(WORK_DONE_BIT, &work->flags);
104
105 spin_lock_irqsave(&workers->lock, flags);
106
107 while (!list_empty(&workers->order_list)) {
108 work = list_entry(workers->order_list.next,
109 struct btrfs_work, order_list);
110
111 if (!test_bit(WORK_DONE_BIT, &work->flags))
112 break;
113
114 /* we are going to call the ordered done function, but
115 * we leave the work item on the list as a barrier so
116 * that later work items that are done don't have their
117 * functions called before this one returns
118 */
119 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
120 break;
121
122 spin_unlock_irqrestore(&workers->lock, flags);
123
124 work->ordered_func(work);
125
126 /* now take the lock again and call the freeing code */
127 spin_lock_irqsave(&workers->lock, flags);
128 list_del(&work->order_list);
129 work->ordered_free(work);
130 }
131
132 spin_unlock_irqrestore(&workers->lock, flags);
133 return 0;
134}
135
136/*
137 * main loop for servicing work items
138 */
139static int worker_loop(void *arg)
140{
141 struct btrfs_worker_thread *worker = arg;
142 struct list_head *cur;
143 struct btrfs_work *work;
144 do {
145 spin_lock_irq(&worker->lock);
146 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list);
149 list_del(&work->list);
150 clear_bit(WORK_QUEUED_BIT, &work->flags);
151
152 work->worker = worker;
153 spin_unlock_irq(&worker->lock);
154
155 work->func(work);
156
157 atomic_dec(&worker->num_pending);
158 /*
159 * unless this is an ordered work queue,
160 * 'work' was probably freed by func above.
161 */
162 run_ordered_completions(worker->workers, work);
163
164 spin_lock_irq(&worker->lock);
165 check_idle_worker(worker);
166
167 }
168 worker->working = 0;
169 if (freezing(current)) {
170 refrigerator();
171 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop())
175 schedule();
176 __set_current_state(TASK_RUNNING);
177 }
178 } while (!kthread_should_stop());
179 return 0;
180}
181
182/*
183 * this will wait for all the worker threads to shutdown
184 */
185int btrfs_stop_workers(struct btrfs_workers *workers)
186{
187 struct list_head *cur;
188 struct btrfs_worker_thread *worker;
189
190 list_splice_init(&workers->idle_list, &workers->worker_list);
191 while (!list_empty(&workers->worker_list)) {
192 cur = workers->worker_list.next;
193 worker = list_entry(cur, struct btrfs_worker_thread,
194 worker_list);
195 kthread_stop(worker->task);
196 list_del(&worker->worker_list);
197 kfree(worker);
198 }
199 return 0;
200}
201
202/*
203 * simple init on struct btrfs_workers
204 */
205void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
206{
207 workers->num_workers = 0;
208 INIT_LIST_HEAD(&workers->worker_list);
209 INIT_LIST_HEAD(&workers->idle_list);
210 INIT_LIST_HEAD(&workers->order_list);
211 spin_lock_init(&workers->lock);
212 workers->max_workers = max;
213 workers->idle_thresh = 32;
214 workers->name = name;
215 workers->ordered = 0;
216}
217
218/*
219 * starts new worker threads. This does not enforce the max worker
220 * count in case you need to temporarily go past it.
221 */
222int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
223{
224 struct btrfs_worker_thread *worker;
225 int ret = 0;
226 int i;
227
228 for (i = 0; i < num_workers; i++) {
229 worker = kzalloc(sizeof(*worker), GFP_NOFS);
230 if (!worker) {
231 ret = -ENOMEM;
232 goto fail;
233 }
234
235 INIT_LIST_HEAD(&worker->pending);
236 INIT_LIST_HEAD(&worker->worker_list);
237 spin_lock_init(&worker->lock);
238 atomic_set(&worker->num_pending, 0);
239 worker->task = kthread_run(worker_loop, worker,
240 "btrfs-%s-%d", workers->name,
241 workers->num_workers + i);
242 worker->workers = workers;
243 if (IS_ERR(worker->task)) {
244 kfree(worker);
245 ret = PTR_ERR(worker->task);
246 goto fail;
247 }
248
249 spin_lock_irq(&workers->lock);
250 list_add_tail(&worker->worker_list, &workers->idle_list);
251 worker->idle = 1;
252 workers->num_workers++;
253 spin_unlock_irq(&workers->lock);
254 }
255 return 0;
256fail:
257 btrfs_stop_workers(workers);
258 return ret;
259}
260
261/*
262 * run through the list and find a worker thread that doesn't have a lot
263 * to do right now. This can return NULL if we aren't yet at the thread
264 * count limit and all of the threads are busy.
265 */
266static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
267{
268 struct btrfs_worker_thread *worker;
269 struct list_head *next;
270 int enforce_min = workers->num_workers < workers->max_workers;
271
272 /*
273 * if we find an idle thread, don't move it to the end of the
274 * idle list. This improves the chance that the next submission
275 * will reuse the same thread, and maybe catch it while it is still
276 * working
277 */
278 if (!list_empty(&workers->idle_list)) {
279 next = workers->idle_list.next;
280 worker = list_entry(next, struct btrfs_worker_thread,
281 worker_list);
282 return worker;
283 }
284 if (enforce_min || list_empty(&workers->worker_list))
285 return NULL;
286
287 /*
288 * if we pick a busy task, move the task to the end of the list.
289 * hopefully this will keep things somewhat evenly balanced.
290 * Do the move in batches based on the sequence number. This groups
291 * requests submitted at roughly the same time onto the same worker.
292 */
293 next = workers->worker_list.next;
294 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
295 atomic_inc(&worker->num_pending);
296 worker->sequence++;
297
298 if (worker->sequence % workers->idle_thresh == 0)
299 list_move_tail(next, &workers->worker_list);
300 return worker;
301}
302
303/*
304 * selects a worker thread to take the next job. This will either find
305 * an idle worker, start a new worker up to the max count, or just return
306 * one of the existing busy workers.
307 */
308static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
309{
310 struct btrfs_worker_thread *worker;
311 unsigned long flags;
312
313again:
314 spin_lock_irqsave(&workers->lock, flags);
315 worker = next_worker(workers);
316 spin_unlock_irqrestore(&workers->lock, flags);
317
318 if (!worker) {
319 spin_lock_irqsave(&workers->lock, flags);
320 if (workers->num_workers >= workers->max_workers) {
321 struct list_head *fallback = NULL;
322 /*
323 * we have failed to find any free workers,
324 * so fall back to one of the existing ones
325 */
326 if (!list_empty(&workers->worker_list))
327 fallback = workers->worker_list.next;
328 if (!list_empty(&workers->idle_list))
329 fallback = workers->idle_list.next;
330 BUG_ON(!fallback);
331 worker = list_entry(fallback,
332 struct btrfs_worker_thread, worker_list);
333 spin_unlock_irqrestore(&workers->lock, flags);
334 } else {
335 spin_unlock_irqrestore(&workers->lock, flags);
336 /* we're below the limit, start another worker */
337 btrfs_start_workers(workers, 1);
338 goto again;
339 }
340 }
341 return worker;
342}
343
344/*
345 * btrfs_requeue_work just puts the work item back on the tail of the list
346 * it was taken from. It is intended for use with long-running work functions
347 * that make some progress and want to give the cpu up for others.
348 */
349int btrfs_requeue_work(struct btrfs_work *work)
350{
351 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags;
353
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out;
356
357 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending);
360
361 /* by definition we're busy, take ourselves off the idle
362 * list
363 */
364 if (worker->idle) {
365 spin_lock_irqsave(&worker->workers->lock, flags);
366 worker->idle = 0;
367 list_move_tail(&worker->worker_list,
368 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 }
371
372 spin_unlock_irqrestore(&worker->lock, flags);
373
374out:
375 return 0;
376}
377
378/*
379 * places a struct btrfs_work into the pending queue of one of the kthreads
380 */
381int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
382{
383 struct btrfs_worker_thread *worker;
384 unsigned long flags;
385 int wake = 0;
386
387 /* don't requeue something already on a list */
388 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
389 goto out;
390
391 worker = find_worker(workers);
392 if (workers->ordered) {
393 spin_lock_irqsave(&workers->lock, flags);
394 list_add_tail(&work->order_list, &workers->order_list);
395 spin_unlock_irqrestore(&workers->lock, flags);
396 } else {
397 INIT_LIST_HEAD(&work->order_list);
398 }
399
400 spin_lock_irqsave(&worker->lock, flags);
401 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404
405 /*
406 * avoid calling into wake_up_process if this thread has already
407 * been kicked
408 */
409 if (!worker->working)
410 wake = 1;
411 worker->working = 1;
412
413 spin_unlock_irqrestore(&worker->lock, flags);
414
415 if (wake)
416 wake_up_process(worker->task);
417out:
418 return 0;
419}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..31be4ed8b63e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_
21
22struct btrfs_worker_thread;
23
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work {
39 /*
40 * func should be set to the function you want called
41 * your work struct is passed as the only arg
42 *
43 * ordered_func must be set for work sent to an ordered work queue,
44 * and it is called to complete a given work item in the same
45 * order they were sent to the queue.
46 */
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
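/*
 * Illustrative sketch of the embedding model described above; this is
 * not part of the original patch, and "my_job" and "my_func" are
 * made-up names. The point is the embedded btrfs_work member and the
 * container_of() call that recovers the enclosing struct:
 *
 *	struct my_job {
 *		int payload;
 *		struct btrfs_work work;
 *	};
 *
 *	static void my_func(struct btrfs_work *work)
 *	{
 *		struct my_job *job = container_of(work, struct my_job, work);
 *		// service job->payload, then free the containing struct
 *		kfree(job);
 *	}
 */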
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 /* max number of workers allowed; callers may change this directly */
68 int max_workers;
69
70 /* once a worker has this many requests or fewer, it is idle */
71 int idle_thresh;
72
73 /* force completions in the order they were queued */
74 int ordered;
75
76 /* list with all the work threads. The workers on the idle list
77 * may be actively servicing jobs, but they haven't yet hit the
78 * idle thresh limit above.
79 */
80 struct list_head worker_list;
81 struct list_head idle_list;
82
83 /*
84 * when operating in ordered mode, this maintains the list
85 * of work items waiting for completion
86 */
87 struct list_head order_list;
88
89 /* lock for finding the next worker thread to queue on */
90 spinlock_t lock;
91
92 /* extra name for this worker, used to build the kthread names */
93 char *name;
94};
95
96int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
97int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
98int btrfs_stop_workers(struct btrfs_workers *workers);
99void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
100int btrfs_requeue_work(struct btrfs_work *work);
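/*
 * Typical lifecycle, sketched for illustration only ("pool", "job" and
 * "my_func" are hypothetical names, not from the original patch): init
 * the pool with a thread cap, seed it with one kthread, queue embedded
 * work items, and stop it when done.
 *
 *	struct btrfs_workers pool;
 *
 *	btrfs_init_workers(&pool, "example", 4);
 *	btrfs_start_workers(&pool, 1);
 *	job->work.func = my_func;
 *	job->work.flags = 0;
 *	btrfs_queue_worker(&pool, &job->work);
 *	...
 *	btrfs_stop_workers(&pool);
 *
 * find_worker() in async-thread.c spawns additional kthreads on demand
 * up to the cap passed to btrfs_init_workers().
 */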
101#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..a8c9693b75ac
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,131 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_I__
20#define __BTRFS_I__
21
22#include "extent_map.h"
23#include "extent_io.h"
24#include "ordered-data.h"
25
26/* in memory btrfs inode */
27struct btrfs_inode {
28 /* which subvolume this inode belongs to */
29 struct btrfs_root *root;
30
31 /* key used to find this inode on disk. This is used by the code
32 * to read in roots of subvolumes
33 */
34 struct btrfs_key location;
35
36 /* the extent_tree has caches of all the extent mappings to disk */
37 struct extent_map_tree extent_tree;
38
39 /* the io_tree does range state (DIRTY, LOCKED etc) */
40 struct extent_io_tree io_tree;
41
42 /* special utility tree used to record which mirrors have already been
43 * tried when checksums fail for a given block
44 */
45 struct extent_io_tree io_failure_tree;
46
47 /* held while inserting or deleting extents from files */
48 struct mutex extent_mutex;
49
50 /* held while logging the inode in tree-log.c */
51 struct mutex log_mutex;
52
53 /* used to order data wrt metadata */
54 struct btrfs_ordered_inode_tree ordered_tree;
55
56 /* standard acl pointers */
57 struct posix_acl *i_acl;
58 struct posix_acl *i_default_acl;
59
60 /* for keeping track of orphaned inodes */
61 struct list_head i_orphan;
62
63 /* list of all the delalloc inodes in the FS. There are times we need
64 * to write all the delalloc pages to disk, and this list is used
65 * to walk them all.
66 */
67 struct list_head delalloc_inodes;
68
69 /* full 64 bit generation number, struct vfs_inode doesn't have a big
70 * enough field for this.
71 */
72 u64 generation;
73
74 /* sequence number for NFS changes */
75 u64 sequence;
76
77 /*
78 * transid of the trans_handle that last modified this inode
79 */
80 u64 last_trans;
81 /*
82 * transid that last logged this inode
83 */
84 u64 logged_trans;
85
86 /*
87 * trans that last made a change that should be fully fsync'd. This
88 * gets reset to zero each time the inode is logged
89 */
90 u64 log_dirty_trans;
91
92 /* total number of bytes pending delalloc, used by stat to calc the
93 * real block usage of the file
94 */
95 u64 delalloc_bytes;
96
97 /*
98 * the size of the file stored in the metadata on disk. data=ordered
99 * means the in-memory i_size might be larger than the size on disk
100 * because not all the blocks are written yet.
101 */
102 u64 disk_i_size;
103
104 /* flags field from the on disk inode */
105 u32 flags;
106
107 /*
108 * if this is a directory then index_cnt is the counter for the index
109 * number for new files that are created
110 */
111 u64 index_cnt;
112
113 /* the start of block group preferred for allocations. */
114 u64 block_group;
115
116 struct inode vfs_inode;
117};
118
119static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
120{
121 return container_of(inode, struct btrfs_inode, vfs_inode);
122}
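/*
 * For illustration (not in the original patch): given a VFS inode that
 * belongs to btrfs, the owning subvolume is reached with
 *
 *	struct btrfs_root *root = BTRFS_I(inode)->root;
 */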
123
124static inline void btrfs_i_size_write(struct inode *inode, u64 size)
125{
126 inode->i_size = size;
127 BTRFS_I(inode)->disk_i_size = size;
128}
129
130
131#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..7c4503ef6efd
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,7 @@
1#ifndef _COMPAT_H_
2#define _COMPAT_H_
3
4#define btrfs_drop_nlink(inode) drop_nlink(inode)
5#define btrfs_inc_nlink(inode) inc_nlink(inode)
6
7#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..ee848d8585d9
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,709 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h>
37#include "compat.h"
38#include "ctree.h"
39#include "disk-io.h"
40#include "transaction.h"
41#include "btrfs_inode.h"
42#include "volumes.h"
43#include "ordered-data.h"
44#include "compression.h"
45#include "extent_io.h"
46#include "extent_map.h"
47
48struct compressed_bio {
49 /* number of bios pending for this compressed extent */
50 atomic_t pending_bios;
51
52 /* the pages with the compressed data on them */
53 struct page **compressed_pages;
54
55 /* inode that owns this data */
56 struct inode *inode;
57
58 /* starting offset in the inode for our pages */
59 u64 start;
60
61 /* number of bytes in the inode we're working on */
62 unsigned long len;
63
64 /* number of bytes on disk */
65 unsigned long compressed_len;
66
67 /* number of compressed pages in the array */
68 unsigned long nr_pages;
69
70 /* IO errors */
71 int errors;
72 int mirror_num;
73
74 /* for reads, this is the bio we are copying the data into */
75 struct bio *orig_bio;
76
77 /*
78 * the start of a variable length array of checksums only
79 * used by reads
80 */
81 u32 sums;
82};
83
84static inline int compressed_bio_size(struct btrfs_root *root,
85 unsigned long disk_size)
86{
87 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
88 return sizeof(struct compressed_bio) +
89 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
90 csum_size;
91}
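/*
 * Worked example, for illustration only: assuming 4K sectors and the
 * 4-byte crc32c checksums btrfs uses here, a 128K compressed extent
 * covers 32 sectors, so this allocates sizeof(struct compressed_bio)
 * plus 32 * 4 bytes, with the checksum array growing out of the "sums"
 * member above.
 */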
92
93static struct bio *compressed_bio_alloc(struct block_device *bdev,
94 u64 first_byte, gfp_t gfp_flags)
95{
96 struct bio *bio;
97 int nr_vecs;
98
99 nr_vecs = bio_get_nr_vecs(bdev);
100 bio = bio_alloc(gfp_flags, nr_vecs);
101
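 /* on allocation failure in reclaim context, retry with progressively
  * smaller bios (annotation added for illustration) */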
102 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
103 while (!bio && (nr_vecs /= 2))
104 bio = bio_alloc(gfp_flags, nr_vecs);
105 }
106
107 if (bio) {
108 bio->bi_size = 0;
109 bio->bi_bdev = bdev;
110 bio->bi_sector = first_byte >> 9;
111 }
112 return bio;
113}
114
115static int check_compressed_csum(struct inode *inode,
116 struct compressed_bio *cb,
117 u64 disk_start)
118{
119 int ret;
120 struct btrfs_root *root = BTRFS_I(inode)->root;
121 struct page *page;
122 unsigned long i;
123 char *kaddr;
124 u32 csum;
125 u32 *cb_sum = &cb->sums;
126
127 if (btrfs_test_flag(inode, NODATASUM))
128 return 0;
129
130 for (i = 0; i < cb->nr_pages; i++) {
131 page = cb->compressed_pages[i];
132 csum = ~(u32)0;
133
134 kaddr = kmap_atomic(page, KM_USER0);
135 csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
136 btrfs_csum_final(csum, (char *)&csum);
137 kunmap_atomic(kaddr, KM_USER0);
138
139 if (csum != *cb_sum) {
140 printk(KERN_INFO "btrfs csum failed ino %lu "
141 "extent %llu csum %u "
142 "wanted %u mirror %d\n", inode->i_ino,
143 (unsigned long long)disk_start,
144 csum, *cb_sum, cb->mirror_num);
145 ret = -EIO;
146 goto fail;
147 }
148 cb_sum++;
149
150 }
151 ret = 0;
152fail:
153 return ret;
154}
155
156/* when we finish reading compressed pages from the disk, we
157 * decompress them and then run the bio end_io routines on the
158 * decompressed pages (in the inode address space).
159 *
160 * This allows the checksumming and other IO error handling routines
161 * to work normally
162 *
163 * The compressed pages are freed here, and it must be run
164 * in process context
165 */
166static void end_compressed_bio_read(struct bio *bio, int err)
167{
168 struct extent_io_tree *tree;
169 struct compressed_bio *cb = bio->bi_private;
170 struct inode *inode;
171 struct page *page;
172 unsigned long index;
173 int ret;
174
175 if (err)
176 cb->errors = 1;
177
178 /* if there are more bios still pending for this compressed
179 * extent, just exit
180 */
181 if (!atomic_dec_and_test(&cb->pending_bios))
182 goto out;
183
184 inode = cb->inode;
185 ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
186 if (ret)
187 goto csum_failed;
188
189 /* ok, we're the last bio for this extent, let's start
190 * the decompression.
191 */
192 tree = &BTRFS_I(inode)->io_tree;
193 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
194 cb->start,
195 cb->orig_bio->bi_io_vec,
196 cb->orig_bio->bi_vcnt,
197 cb->compressed_len);
198csum_failed:
199 if (ret)
200 cb->errors = 1;
201
202 /* release the compressed pages */
203 index = 0;
204 for (index = 0; index < cb->nr_pages; index++) {
205 page = cb->compressed_pages[index];
206 page->mapping = NULL;
207 page_cache_release(page);
208 }
209
210 /* do io completion on the original bio */
211 if (cb->errors) {
212 bio_io_error(cb->orig_bio);
213 } else {
214 int bio_index = 0;
215 struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
216
217 /*
218 * we have verified the checksum already, set page
219 * checked so the end_io handlers know about it
220 */
221 while (bio_index < cb->orig_bio->bi_vcnt) {
222 SetPageChecked(bvec->bv_page);
223 bvec++;
224 bio_index++;
225 }
226 bio_endio(cb->orig_bio, 0);
227 }
228
229 /* finally free the cb struct */
230 kfree(cb->compressed_pages);
231 kfree(cb);
232out:
233 bio_put(bio);
234}
235
236/*
237 * Clear the writeback bits on all of the file
238 * pages for a compressed write
239 */
240static noinline int end_compressed_writeback(struct inode *inode, u64 start,
241 unsigned long ram_size)
242{
243 unsigned long index = start >> PAGE_CACHE_SHIFT;
244 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
245 struct page *pages[16];
246 unsigned long nr_pages = end_index - index + 1;
247 int i;
248 int ret;
249
250 while (nr_pages > 0) {
251 ret = find_get_pages_contig(inode->i_mapping, index,
252 min_t(unsigned long,
253 nr_pages, ARRAY_SIZE(pages)), pages);
254 if (ret == 0) {
255 nr_pages -= 1;
256 index += 1;
257 continue;
258 }
259 for (i = 0; i < ret; i++) {
260 end_page_writeback(pages[i]);
261 page_cache_release(pages[i]);
262 }
263 nr_pages -= ret;
264 index += ret;
265 }
266 /* the inode may be gone now */
267 return 0;
268}
269
270/*
271 * do the cleanup once all the compressed pages hit the disk.
272 * This will clear writeback on the file pages and free the compressed
273 * pages.
274 *
275 * This also calls the writeback end hooks for the file pages so that
276 * metadata and checksums can be updated in the file.
277 */
278static void end_compressed_bio_write(struct bio *bio, int err)
279{
280 struct extent_io_tree *tree;
281 struct compressed_bio *cb = bio->bi_private;
282 struct inode *inode;
283 struct page *page;
284 unsigned long index;
285
286 if (err)
287 cb->errors = 1;
288
289 /* if there are more bios still pending for this compressed
290 * extent, just exit
291 */
292 if (!atomic_dec_and_test(&cb->pending_bios))
293 goto out;
294
295 /* ok, we're the last bio for this extent, step one is to
296 * call back into the FS and do all the end_io operations
297 */
298 inode = cb->inode;
299 tree = &BTRFS_I(inode)->io_tree;
300 cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
301 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
302 cb->start,
303 cb->start + cb->len - 1,
304 NULL, 1);
305 cb->compressed_pages[0]->mapping = NULL;
306
307 end_compressed_writeback(inode, cb->start, cb->len);
308 /* note, our inode could be gone now */
309
310 /*
311 * release the compressed pages, these came from alloc_page and
312 * are not attached to the inode at all
313 */
314 index = 0;
315 for (index = 0; index < cb->nr_pages; index++) {
316 page = cb->compressed_pages[index];
317 page->mapping = NULL;
318 page_cache_release(page);
319 }
320
321 /* finally free the cb struct */
322 kfree(cb->compressed_pages);
323 kfree(cb);
324out:
325 bio_put(bio);
326}
327
328/*
329 * worker function to build and submit bios for previously compressed pages.
330 * The corresponding pages in the inode should be marked for writeback
331 * and the compressed pages should have a reference on them for dropping
332 * when the IO is complete.
333 *
334 * This also checksums the file bytes and gets things ready for
335 * the end io hooks.
336 */
337int btrfs_submit_compressed_write(struct inode *inode, u64 start,
338 unsigned long len, u64 disk_start,
339 unsigned long compressed_len,
340 struct page **compressed_pages,
341 unsigned long nr_pages)
342{
343 struct bio *bio = NULL;
344 struct btrfs_root *root = BTRFS_I(inode)->root;
345 struct compressed_bio *cb;
346 unsigned long bytes_left;
347 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
348 int page_index = 0;
349 struct page *page;
350 u64 first_byte = disk_start;
351 struct block_device *bdev;
352 int ret;
353
354 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
355 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
356 atomic_set(&cb->pending_bios, 0);
357 cb->errors = 0;
358 cb->inode = inode;
359 cb->start = start;
360 cb->len = len;
361 cb->mirror_num = 0;
362 cb->compressed_pages = compressed_pages;
363 cb->compressed_len = compressed_len;
364 cb->orig_bio = NULL;
365 cb->nr_pages = nr_pages;
366
367 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
368
369 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
370 bio->bi_private = cb;
371 bio->bi_end_io = end_compressed_bio_write;
372 atomic_inc(&cb->pending_bios);
373
374 /* create and submit bios for the compressed pages */
375 bytes_left = compressed_len;
376 for (page_index = 0; page_index < cb->nr_pages; page_index++) {
377 page = compressed_pages[page_index];
378 page->mapping = inode->i_mapping;
379 if (bio->bi_size)
380 ret = io_tree->ops->merge_bio_hook(page, 0,
381 PAGE_CACHE_SIZE,
382 bio, 0);
383 else
384 ret = 0;
385
386 page->mapping = NULL;
387 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
388 PAGE_CACHE_SIZE) {
389 bio_get(bio);
390
391 /*
392 * inc the count before we submit the bio so
393 * we know the end IO handler won't happen before
394 * we inc the count. Otherwise, the cb might get
395 * freed before we're done setting it up
396 */
397 atomic_inc(&cb->pending_bios);
398 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
399 BUG_ON(ret);
400
401 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
402 BUG_ON(ret);
403
404 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
405 BUG_ON(ret);
406
407 bio_put(bio);
408
409 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
410 bio->bi_private = cb;
411 bio->bi_end_io = end_compressed_bio_write;
412 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
413 }
414 if (bytes_left < PAGE_CACHE_SIZE) {
415 printk("bytes left %lu compress len %lu nr %lu\n",
416 bytes_left, cb->compressed_len, cb->nr_pages);
417 }
418 bytes_left -= PAGE_CACHE_SIZE;
419 first_byte += PAGE_CACHE_SIZE;
420 cond_resched();
421 }
422 bio_get(bio);
423
424 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
425 BUG_ON(ret);
426
427 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
428 BUG_ON(ret);
429
430 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
431 BUG_ON(ret);
432
433 bio_put(bio);
434 return 0;
435}
436
437static noinline int add_ra_bio_pages(struct inode *inode,
438 u64 compressed_end,
439 struct compressed_bio *cb)
440{
441 unsigned long end_index;
442 unsigned long page_index;
443 u64 last_offset;
444 u64 isize = i_size_read(inode);
445 int ret;
446 struct page *page;
447 unsigned long nr_pages = 0;
448 struct extent_map *em;
449 struct address_space *mapping = inode->i_mapping;
450 struct pagevec pvec;
451 struct extent_map_tree *em_tree;
452 struct extent_io_tree *tree;
453 u64 end;
454 int misses = 0;
455
456 page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
457 last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
458 em_tree = &BTRFS_I(inode)->extent_tree;
459 tree = &BTRFS_I(inode)->io_tree;
460
461 if (isize == 0)
462 return 0;
463
464 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
465
466 pagevec_init(&pvec, 0);
467 while (last_offset < compressed_end) {
468 page_index = last_offset >> PAGE_CACHE_SHIFT;
469
470 if (page_index > end_index)
471 break;
472
473 rcu_read_lock();
474 page = radix_tree_lookup(&mapping->page_tree, page_index);
475 rcu_read_unlock();
476 if (page) {
477 misses++;
478 if (misses > 4)
479 break;
480 goto next;
481 }
482
483 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
484 if (!page)
485 break;
486
487 page->index = page_index;
488 /*
489 * what we want to do here is call add_to_page_cache_lru,
490 * but that isn't exported, so we reproduce it here
491 */
492 if (add_to_page_cache(page, mapping,
493 page->index, GFP_NOFS)) {
494 page_cache_release(page);
495 goto next;
496 }
497
498 /* open coding of lru_cache_add, also not exported */
499 page_cache_get(page);
500 if (!pagevec_add(&pvec, page))
501 __pagevec_lru_add_file(&pvec);
502
503 end = last_offset + PAGE_CACHE_SIZE - 1;
504 /*
505 * at this point, we have a locked page in the page cache
506 * for these bytes in the file. But, we have to make
507 * sure they map to this compressed extent on disk.
508 */
509 set_page_extent_mapped(page);
510 lock_extent(tree, last_offset, end, GFP_NOFS);
511 spin_lock(&em_tree->lock);
512 em = lookup_extent_mapping(em_tree, last_offset,
513 PAGE_CACHE_SIZE);
514 spin_unlock(&em_tree->lock);
515
516 if (!em || last_offset < em->start ||
517 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
518 (em->block_start >> 9) != cb->orig_bio->bi_sector) {
519 free_extent_map(em);
520 unlock_extent(tree, last_offset, end, GFP_NOFS);
521 unlock_page(page);
522 page_cache_release(page);
523 break;
524 }
525 free_extent_map(em);
526
527 if (page->index == end_index) {
528 char *userpage;
529 size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
530
531 if (zero_offset) {
532 int zeros;
533 zeros = PAGE_CACHE_SIZE - zero_offset;
534 userpage = kmap_atomic(page, KM_USER0);
535 memset(userpage + zero_offset, 0, zeros);
536 flush_dcache_page(page);
537 kunmap_atomic(userpage, KM_USER0);
538 }
539 }
540
541 ret = bio_add_page(cb->orig_bio, page,
542 PAGE_CACHE_SIZE, 0);
543
544 if (ret == PAGE_CACHE_SIZE) {
545 nr_pages++;
546 page_cache_release(page);
547 } else {
548 unlock_extent(tree, last_offset, end, GFP_NOFS);
549 unlock_page(page);
550 page_cache_release(page);
551 break;
552 }
553next:
554 last_offset += PAGE_CACHE_SIZE;
555 }
556 if (pagevec_count(&pvec))
557 __pagevec_lru_add_file(&pvec);
558 return 0;
559}
560
561/*
562 * for a compressed read, the bio we get passed has all the inode pages
563 * in it. We don't actually do IO on those pages but allocate new ones
564 * to hold the compressed pages on disk.
565 *
566 * bio->bi_sector points to the compressed extent on disk
567 * bio->bi_io_vec points to all of the inode pages
568 * bio->bi_vcnt is a count of pages
569 *
570 * After the compressed pages are read, we copy the bytes into the
571 * bio we were passed and then call the bio end_io calls
572 */
573int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
574 int mirror_num, unsigned long bio_flags)
575{
576 struct extent_io_tree *tree;
577 struct extent_map_tree *em_tree;
578 struct compressed_bio *cb;
579 struct btrfs_root *root = BTRFS_I(inode)->root;
580 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
581 unsigned long compressed_len;
582 unsigned long nr_pages;
583 unsigned long page_index;
584 struct page *page;
585 struct block_device *bdev;
586 struct bio *comp_bio;
587 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
588 u64 em_len;
589 u64 em_start;
590 struct extent_map *em;
591 int ret;
592 u32 *sums;
593
594 tree = &BTRFS_I(inode)->io_tree;
595 em_tree = &BTRFS_I(inode)->extent_tree;
596
597 /* we need the actual starting offset of this extent in the file */
598 spin_lock(&em_tree->lock);
599 em = lookup_extent_mapping(em_tree,
600 page_offset(bio->bi_io_vec->bv_page),
601 PAGE_CACHE_SIZE);
602 spin_unlock(&em_tree->lock);
603
604 compressed_len = em->block_len;
605 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
606 atomic_set(&cb->pending_bios, 0);
607 cb->errors = 0;
608 cb->inode = inode;
609 cb->mirror_num = mirror_num;
610 sums = &cb->sums;
611
612 cb->start = em->orig_start;
613 em_len = em->len;
614 em_start = em->start;
615
616 free_extent_map(em);
617 em = NULL;
618
619 cb->len = uncompressed_len;
620 cb->compressed_len = compressed_len;
621 cb->orig_bio = bio;
622
623 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
624 PAGE_CACHE_SIZE;
625 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
626 GFP_NOFS);
627 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
628
629 for (page_index = 0; page_index < nr_pages; page_index++) {
630 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
631 __GFP_HIGHMEM);
632 }
633 cb->nr_pages = nr_pages;
634
635 add_ra_bio_pages(inode, em_start + em_len, cb);
636
637 /* include any pages we added in add_ra_bio_pages */
638 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
639 cb->len = uncompressed_len;
640
641 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
642 comp_bio->bi_private = cb;
643 comp_bio->bi_end_io = end_compressed_bio_read;
644 atomic_inc(&cb->pending_bios);
645
646 for (page_index = 0; page_index < nr_pages; page_index++) {
647 page = cb->compressed_pages[page_index];
648 page->mapping = inode->i_mapping;
649 page->index = em_start >> PAGE_CACHE_SHIFT;
650
651 if (comp_bio->bi_size)
652 ret = tree->ops->merge_bio_hook(page, 0,
653 PAGE_CACHE_SIZE,
654 comp_bio, 0);
655 else
656 ret = 0;
657
658 page->mapping = NULL;
659 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
660 PAGE_CACHE_SIZE) {
661 bio_get(comp_bio);
662
663 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
664 BUG_ON(ret);
665
666 /*
667 * inc the count before we submit the bio so
668 * we know the end IO handler won't happen before
669 * we inc the count. Otherwise, the cb might get
670 * freed before we're done setting it up
671 */
672 atomic_inc(&cb->pending_bios);
673
674 if (!btrfs_test_flag(inode, NODATASUM)) {
675 btrfs_lookup_bio_sums(root, inode, comp_bio,
676 sums);
677 }
678 sums += (comp_bio->bi_size + root->sectorsize - 1) /
679 root->sectorsize;
680
681 ret = btrfs_map_bio(root, READ, comp_bio,
682 mirror_num, 0);
683 BUG_ON(ret);
684
685 bio_put(comp_bio);
686
687 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
688 GFP_NOFS);
689 comp_bio->bi_private = cb;
690 comp_bio->bi_end_io = end_compressed_bio_read;
691
692 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
693 }
694 cur_disk_byte += PAGE_CACHE_SIZE;
695 }
696 bio_get(comp_bio);
697
698 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
699 BUG_ON(ret);
700
701 if (!btrfs_test_flag(inode, NODATASUM))
702 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
703
704 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
705 BUG_ON(ret);
706
707 bio_put(comp_bio);
708 return 0;
709}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_
21
22int btrfs_zlib_decompress(unsigned char *data_in,
23 struct page *dest_page,
24 unsigned long start_byte,
25 size_t srclen, size_t destlen);
26int btrfs_zlib_compress_pages(struct address_space *mapping,
27 u64 start, unsigned long len,
28 struct page **pages,
29 unsigned long nr_dest_pages,
30 unsigned long *out_pages,
31 unsigned long *total_in,
32 unsigned long *total_out,
33 unsigned long max_out);
34int btrfs_zlib_decompress_biovec(struct page **pages_in,
35 u64 disk_start,
36 struct bio_vec *bvec,
37 int vcnt,
38 size_t srclen);
39void btrfs_zlib_exit(void);
40int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start,
42 unsigned long compressed_len,
43 struct page **compressed_pages,
44 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags);
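/*
 * Sketch, for illustration only (not from the original patch): filling
 * one page from the start of a compressed stream of srclen bytes could
 * look like
 *
 *	ret = btrfs_zlib_decompress(data_in, page, 0, srclen,
 *				    PAGE_CACHE_SIZE);
 *
 * with a nonzero return indicating a corrupt or truncated stream.
 */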
47#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..6e1b3de36700
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <linux/crc32c.h>
22
23/*
24 * this file used to do more for selecting the HW version of crc32c,
25 * perhaps it will one day again soon.
26 */
27#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
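/*
 * Usage sketch, illustrative only: callers seed the checksum with all
 * ones, as check_compressed_csum() in compression.c does, e.g.
 *
 *	u32 crc = btrfs_crc32c(~(u32)0, buf, len);
 */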
28#endif
29
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..9e46c0776816
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3953 @@
1/*
2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
26static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 *root, struct btrfs_path *path, int level);
28static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_key *ins_key,
30 struct btrfs_path *path, int data_size, int extend);
31static int push_node_left(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct extent_buffer *dst,
33 struct extent_buffer *src, int empty);
34static int balance_node_right(struct btrfs_trans_handle *trans,
35 struct btrfs_root *root,
36 struct extent_buffer *dst_buf,
37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p)
59{
60 btrfs_release_path(NULL, p);
61 kmem_cache_free(btrfs_path_cachep, p);
62}
63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that have no locks or extent buffers held.
69 */
70noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71{
72 int i;
73
74 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75 p->slots[i] = 0;
76 if (!p->nodes[i])
77 continue;
78 if (p->locks[i]) {
79 btrfs_tree_unlock(p->nodes[i]);
80 p->locks[i] = 0;
81 }
82 free_extent_buffer(p->nodes[i]);
83 p->nodes[i] = NULL;
84 }
85}
86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98{
99 struct extent_buffer *eb;
100 spin_lock(&root->node_lock);
101 eb = root->node;
102 extent_buffer_get(eb);
103 spin_unlock(&root->node_lock);
104 return eb;
105}
106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112{
113 struct extent_buffer *eb;
114
115 while (1) {
116 eb = btrfs_root_node(root);
117 btrfs_tree_lock(eb);
118
119 spin_lock(&root->node_lock);
120 if (eb == root->node) {
121 spin_unlock(&root->node_lock);
122 break;
123 }
124 spin_unlock(&root->node_lock);
125
126 btrfs_tree_unlock(eb);
127 free_extent_buffer(eb);
128 }
129 return eb;
130}
131
132/* cow-only roots (everything not a reference counted cow subvolume) just get
133 * put onto a simple dirty list. transaction.c walks this to make sure they
134 * get properly updated on disk.
135 */
136static void add_root_to_dirty_list(struct btrfs_root *root)
137{
138 if (root->track_dirty && list_empty(&root->dirty_list)) {
139 list_add(&root->dirty_list,
140 &root->fs_info->dirty_cowonly_roots);
141 }
142}
143
144/*
145 * used by snapshot creation to make a copy of a root for a tree with
146 * a given objectid. The buffer with the new root node is returned in
147 * cow_ret, and this func returns zero on success or a negative error code.
148 */
149int btrfs_copy_root(struct btrfs_trans_handle *trans,
150 struct btrfs_root *root,
151 struct extent_buffer *buf,
152 struct extent_buffer **cow_ret, u64 new_root_objectid)
153{
154 struct extent_buffer *cow;
155 u32 nritems;
156 int ret = 0;
157 int level;
158 struct btrfs_root *new_root;
159
160 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161 if (!new_root)
162 return -ENOMEM;
163
164 memcpy(new_root, root, sizeof(*new_root));
165 new_root->root_key.objectid = new_root_objectid;
166
167 WARN_ON(root->ref_cows && trans->transid !=
168 root->fs_info->running_transaction->transid);
169 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171 level = btrfs_header_level(buf);
172 nritems = btrfs_header_nritems(buf);
173
174 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175 new_root_objectid, trans->transid,
176 level, buf->start, 0);
177 if (IS_ERR(cow)) {
178 kfree(new_root);
179 return PTR_ERR(cow);
180 }
181
182 copy_extent_buffer(cow, buf, 0, 0, cow->len);
183 btrfs_set_header_bytenr(cow, cow->start);
184 btrfs_set_header_generation(cow, trans->transid);
185 btrfs_set_header_owner(cow, new_root_objectid);
186 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188 write_extent_buffer(cow, root->fs_info->fsid,
189 (unsigned long)btrfs_header_fsid(cow),
190 BTRFS_FSID_SIZE);
191
192 WARN_ON(btrfs_header_generation(buf) > trans->transid);
193 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 kfree(new_root);
195
196 if (ret)
197 return ret;
198
199 btrfs_mark_buffer_dirty(cow);
200 *cow_ret = cow;
201 return 0;
202}
203
204/*
205 * does the dirty work in cow of a single block. The parent block (if
206 * supplied) is updated to point to the new cow copy. The new buffer is marked
207 * dirty and returned locked. If you modify the block it needs to be marked
208 * dirty again.
209 *
210 * search_start -- an allocation hint for the new block
211 *
212 * empty_size -- a hint that you plan on doing more cow. This is the size in
213 * bytes the allocator should try to find free next to the block it returns.
214 * This is just a hint and may be ignored by the allocator.
215 *
216 * prealloc_dest -- if you have already reserved a destination for the cow,
217 * this uses that block instead of allocating a new one.
218 * btrfs_alloc_reserved_extent is used to finish the allocation.
219 */
220static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 struct btrfs_root *root,
222 struct extent_buffer *buf,
223 struct extent_buffer *parent, int parent_slot,
224 struct extent_buffer **cow_ret,
225 u64 search_start, u64 empty_size,
226 u64 prealloc_dest)
227{
228 u64 parent_start;
229 struct extent_buffer *cow;
230 u32 nritems;
231 int ret = 0;
232 int level;
233 int unlock_orig = 0;
234
235 if (*cow_ret == buf)
236 unlock_orig = 1;
237
238 WARN_ON(!btrfs_tree_locked(buf));
239
240 if (parent)
241 parent_start = parent->start;
242 else
243 parent_start = 0;
244
245 WARN_ON(root->ref_cows && trans->transid !=
246 root->fs_info->running_transaction->transid);
247 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249 level = btrfs_header_level(buf);
250 nritems = btrfs_header_nritems(buf);
251
252 if (prealloc_dest) {
253 struct btrfs_key ins;
254
255 ins.objectid = prealloc_dest;
256 ins.offset = buf->len;
257 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260 root->root_key.objectid,
261 trans->transid, level, &ins);
262 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len);
265 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start,
268 root->root_key.objectid,
269 trans->transid, level,
270 search_start, empty_size);
271 }
272 if (IS_ERR(cow))
273 return PTR_ERR(cow);
274
275 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid);
278 btrfs_set_header_owner(cow, root->root_key.objectid);
279 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281 write_extent_buffer(cow, root->fs_info->fsid,
282 (unsigned long)btrfs_header_fsid(cow),
283 BTRFS_FSID_SIZE);
284
285 WARN_ON(btrfs_header_generation(buf) > trans->transid);
286 if (btrfs_header_generation(buf) != trans->transid) {
287 u32 nr_extents;
288 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 if (ret)
290 return ret;
291
292 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293 WARN_ON(ret);
294 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295 /*
296 * There are only two places that can drop reference to
297 * tree blocks owned by living reloc trees, one is here,
298 * the other place is btrfs_drop_subtree. In both places,
299 * we check reference count while tree block is locked.
300 * Furthermore, if reference count is one, it won't get
301 * increased by someone else.
302 */
303 u32 refs;
304 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305 buf->len, &refs);
306 BUG_ON(ret);
307 if (refs == 1) {
308 ret = btrfs_update_ref(trans, root, buf, cow,
309 0, nritems);
310 clean_tree_block(trans, root, buf);
311 } else {
312 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313 }
314 BUG_ON(ret);
315 } else {
316 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317 if (ret)
318 return ret;
319 clean_tree_block(trans, root, buf);
320 }
321
322 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324 WARN_ON(ret);
325 }
326
327 if (buf == root->node) {
328 WARN_ON(parent && parent != buf);
329
330 spin_lock(&root->node_lock);
331 root->node = cow;
332 extent_buffer_get(cow);
333 spin_unlock(&root->node_lock);
334
335 if (buf != root->commit_root) {
336 btrfs_free_extent(trans, root, buf->start,
337 buf->len, buf->start,
338 root->root_key.objectid,
339 btrfs_header_generation(buf),
340 level, 1);
341 }
342 free_extent_buffer(buf);
343 add_root_to_dirty_list(root);
344 } else {
345 btrfs_set_node_blockptr(parent, parent_slot,
346 cow->start);
347 WARN_ON(trans->transid == 0);
348 btrfs_set_node_ptr_generation(parent, parent_slot,
349 trans->transid);
350 btrfs_mark_buffer_dirty(parent);
351 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352 btrfs_free_extent(trans, root, buf->start, buf->len,
353 parent_start, btrfs_header_owner(parent),
354 btrfs_header_generation(parent), level, 1);
355 }
356 if (unlock_orig)
357 btrfs_tree_unlock(buf);
358 free_extent_buffer(buf);
359 btrfs_mark_buffer_dirty(cow);
360 *cow_ret = cow;
361 return 0;
362}
363
364/*
365 * cows a single block, see __btrfs_cow_block for the real work.
366 * This version of it has extra checks so that a block isn't cow'd more than
367 * once per transaction, as long as it hasn't been written yet
368 */
369noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
370 struct btrfs_root *root, struct extent_buffer *buf,
371 struct extent_buffer *parent, int parent_slot,
372 struct extent_buffer **cow_ret, u64 prealloc_dest)
373{
374 u64 search_start;
375 int ret;
376
377 if (trans->transaction != root->fs_info->running_transaction) {
378 printk(KERN_CRIT "trans %llu running %llu\n",
379 (unsigned long long)trans->transid,
380 (unsigned long long)
381 root->fs_info->running_transaction->transid);
382 WARN_ON(1);
383 }
384 if (trans->transid != root->fs_info->generation) {
385 printk(KERN_CRIT "trans %llu running %llu\n",
386 (unsigned long long)trans->transid,
387 (unsigned long long)root->fs_info->generation);
388 WARN_ON(1);
389 }
390
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest);
398 return 0;
399 }
400 spin_unlock(&root->fs_info->hash_lock);
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
402 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest);
405 return ret;
406}
407
408/*
409 * helper function for defrag to decide if two blocks pointed to by a
410 * node are actually close by
411 */
412static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
413{
414 if (blocknr < other && other - (blocknr + blocksize) < 32768)
415 return 1;
416 if (blocknr > other && blocknr - (other + blocksize) < 32768)
417 return 1;
418 return 0;
419}
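/*
 * Worked example, illustrative only: with 4K blocks, blocknr 0 and
 * other 36864 leave a gap of exactly 32768 bytes and are not close,
 * while other 36863 would be.
 */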
420
421/*
422 * compare two keys in a memcmp fashion
423 */
424static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
425{
426 struct btrfs_key k1;
427
428 btrfs_disk_key_to_cpu(&k1, disk);
429
430 if (k1.objectid > k2->objectid)
431 return 1;
432 if (k1.objectid < k2->objectid)
433 return -1;
434 if (k1.type > k2->type)
435 return 1;
436 if (k1.type < k2->type)
437 return -1;
438 if (k1.offset > k2->offset)
439 return 1;
440 if (k1.offset < k2->offset)
441 return -1;
442 return 0;
443}
444
445/*
446 * same as comp_keys only with two btrfs_key's
447 */
448static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
449{
450 if (k1->objectid > k2->objectid)
451 return 1;
452 if (k1->objectid < k2->objectid)
453 return -1;
454 if (k1->type > k2->type)
455 return 1;
456 if (k1->type < k2->type)
457 return -1;
458 if (k1->offset > k2->offset)
459 return 1;
460 if (k1->offset < k2->offset)
461 return -1;
462 return 0;
463}
464
465/*
466 * this is used by the defrag code to go through all the
467 * leaves pointed to by a node and reallocate them so that
468 * disk order is close to key order
469 */
470int btrfs_realloc_node(struct btrfs_trans_handle *trans,
471 struct btrfs_root *root, struct extent_buffer *parent,
472 int start_slot, int cache_only, u64 *last_ret,
473 struct btrfs_key *progress)
474{
475 struct extent_buffer *cur;
476 u64 blocknr;
477 u64 gen;
478 u64 search_start = *last_ret;
479 u64 last_block = 0;
480 u64 other;
481 u32 parent_nritems;
482 int end_slot;
483 int i;
484 int err = 0;
485 int parent_level;
486 int uptodate;
487 u32 blocksize;
488 int progress_passed = 0;
489 struct btrfs_disk_key disk_key;
490
491 parent_level = btrfs_header_level(parent);
492 if (cache_only && parent_level != 1)
493 return 0;
494
495 if (trans->transaction != root->fs_info->running_transaction)
496 WARN_ON(1);
497 if (trans->transid != root->fs_info->generation)
498 WARN_ON(1);
499
500 parent_nritems = btrfs_header_nritems(parent);
501 blocksize = btrfs_level_size(root, parent_level - 1);
502 end_slot = parent_nritems;
503
504 if (parent_nritems == 1)
505 return 0;
506
507 for (i = start_slot; i < end_slot; i++) {
508 int close = 1;
509
510 if (!parent->map_token) {
511 map_extent_buffer(parent,
512 btrfs_node_key_ptr_offset(i),
513 sizeof(struct btrfs_key_ptr),
514 &parent->map_token, &parent->kaddr,
515 &parent->map_start, &parent->map_len,
516 KM_USER1);
517 }
518 btrfs_node_key(parent, &disk_key, i);
519 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
520 continue;
521
522 progress_passed = 1;
523 blocknr = btrfs_node_blockptr(parent, i);
524 gen = btrfs_node_ptr_generation(parent, i);
525 if (last_block == 0)
526 last_block = blocknr;
527
528 if (i > 0) {
529 other = btrfs_node_blockptr(parent, i - 1);
530 close = close_blocks(blocknr, other, blocksize);
531 }
532 if (!close && i < end_slot - 2) {
533 other = btrfs_node_blockptr(parent, i + 1);
534 close = close_blocks(blocknr, other, blocksize);
535 }
536 if (close) {
537 last_block = blocknr;
538 continue;
539 }
540 if (parent->map_token) {
541 unmap_extent_buffer(parent, parent->map_token,
542 KM_USER1);
543 parent->map_token = NULL;
544 }
545
546 cur = btrfs_find_tree_block(root, blocknr, blocksize);
547 if (cur)
548 uptodate = btrfs_buffer_uptodate(cur, gen);
549 else
550 uptodate = 0;
551 if (!cur || !uptodate) {
552 if (cache_only) {
553 free_extent_buffer(cur);
554 continue;
555 }
556 if (!cur) {
557 cur = read_tree_block(root, blocknr,
558 blocksize, gen);
559 } else if (!uptodate) {
560 btrfs_read_buffer(cur, gen);
561 }
562 }
563 if (search_start == 0)
564 search_start = last_block;
565
566 btrfs_tree_lock(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start,
569 min(16 * blocksize,
570 (end_slot - i) * blocksize), 0);
571 if (err) {
572 btrfs_tree_unlock(cur);
573 free_extent_buffer(cur);
574 break;
575 }
576 search_start = cur->start;
577 last_block = cur->start;
578 *last_ret = search_start;
579 btrfs_tree_unlock(cur);
580 free_extent_buffer(cur);
581 }
582 if (parent->map_token) {
583 unmap_extent_buffer(parent, parent->map_token,
584 KM_USER1);
585 parent->map_token = NULL;
586 }
587 return err;
588}
589
590/*
591 * The leaf data grows from end-to-front in the node.
592  * this returns the offset of the start of the last item,
593  * which is the current top of the leaf data stack
594 */
595static inline unsigned int leaf_data_end(struct btrfs_root *root,
596 struct extent_buffer *leaf)
597{
598 u32 nr = btrfs_header_nritems(leaf);
599 if (nr == 0)
600 return BTRFS_LEAF_DATA_SIZE(root);
601 return btrfs_item_offset_nr(leaf, nr - 1);
602}
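/*
 * Worked example (hypothetical numbers): in a leaf with a 4096 byte data
 * area holding two items of 100 and 50 data bytes, item 0 sits at offset
 * 3996 and item 1 at 3946, so leaf_data_end() returns 3946.  An empty
 * leaf returns the full BTRFS_LEAF_DATA_SIZE(root).
 */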
603
604/*
605  * extra debugging checks to make sure all the keys in a node are
606  * well formed and in the proper order
607 */
608static int check_node(struct btrfs_root *root, struct btrfs_path *path,
609 int level)
610{
611 struct extent_buffer *parent = NULL;
612 struct extent_buffer *node = path->nodes[level];
613 struct btrfs_disk_key parent_key;
614 struct btrfs_disk_key node_key;
615 int parent_slot;
616 int slot;
617 struct btrfs_key cpukey;
618 u32 nritems = btrfs_header_nritems(node);
619
620 if (path->nodes[level + 1])
621 parent = path->nodes[level + 1];
622
623 slot = path->slots[level];
624 BUG_ON(nritems == 0);
625 if (parent) {
626 parent_slot = path->slots[level + 1];
627 btrfs_node_key(parent, &parent_key, parent_slot);
628 btrfs_node_key(node, &node_key, 0);
629 BUG_ON(memcmp(&parent_key, &node_key,
630 sizeof(struct btrfs_disk_key)));
631 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
632 btrfs_header_bytenr(node));
633 }
634 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
635 if (slot != 0) {
636 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
637 btrfs_node_key(node, &node_key, slot);
638 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
639 }
640 if (slot < nritems - 1) {
641 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
642 btrfs_node_key(node, &node_key, slot);
643 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
644 }
645 return 0;
646}
647
648/*
649 * extra checking to make sure all the items in a leaf are
650 * well formed and in the proper order
651 */
652static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
653 int level)
654{
655 struct extent_buffer *leaf = path->nodes[level];
656 struct extent_buffer *parent = NULL;
657 int parent_slot;
658 struct btrfs_key cpukey;
659 struct btrfs_disk_key parent_key;
660 struct btrfs_disk_key leaf_key;
661 int slot = path->slots[0];
662
663 u32 nritems = btrfs_header_nritems(leaf);
664
665 if (path->nodes[level + 1])
666 parent = path->nodes[level + 1];
667
668 if (nritems == 0)
669 return 0;
670
671 if (parent) {
672 parent_slot = path->slots[level + 1];
673 btrfs_node_key(parent, &parent_key, parent_slot);
674 btrfs_item_key(leaf, &leaf_key, 0);
675
676 BUG_ON(memcmp(&parent_key, &leaf_key,
677 sizeof(struct btrfs_disk_key)));
678 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
679 btrfs_header_bytenr(leaf));
680 }
681 if (slot != 0 && slot < nritems - 1) {
682 btrfs_item_key(leaf, &leaf_key, slot);
683 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
684 if (comp_keys(&leaf_key, &cpukey) <= 0) {
685 btrfs_print_leaf(root, leaf);
686 printk(KERN_CRIT "slot %d offset bad key\n", slot);
687 BUG_ON(1);
688 }
689 if (btrfs_item_offset_nr(leaf, slot - 1) !=
690 btrfs_item_end_nr(leaf, slot)) {
691 btrfs_print_leaf(root, leaf);
692 printk(KERN_CRIT "slot %d offset bad\n", slot);
693 BUG_ON(1);
694 }
695 }
696 if (slot < nritems - 1) {
697 btrfs_item_key(leaf, &leaf_key, slot);
698 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
699 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
700 if (btrfs_item_offset_nr(leaf, slot) !=
701 btrfs_item_end_nr(leaf, slot + 1)) {
702 btrfs_print_leaf(root, leaf);
703 printk(KERN_CRIT "slot %d offset bad\n", slot);
704 BUG_ON(1);
705 }
706 }
707 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
708 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
709 return 0;
710}
711
712static noinline int check_block(struct btrfs_root *root,
713 struct btrfs_path *path, int level)
714{
715 	return 0;	/* the expensive checks below are currently disabled */
716 if (level == 0)
717 return check_leaf(root, path, level);
718 return check_node(root, path, level);
719}
720
721/*
722 * search for key in the extent_buffer. The items start at offset p,
723 * and they are item_size apart. There are 'max' items in p.
724 *
725 * the slot in the array is returned via slot, and it points to
726 * the place where you would insert key if it is not found in
727 * the array.
728 *
729 * slot may point to max if the key is bigger than all of the keys
730 */
731static noinline int generic_bin_search(struct extent_buffer *eb,
732 unsigned long p,
733 int item_size, struct btrfs_key *key,
734 int max, int *slot)
735{
736 int low = 0;
737 int high = max;
738 int mid;
739 int ret;
740 struct btrfs_disk_key *tmp = NULL;
741 struct btrfs_disk_key unaligned;
742 unsigned long offset;
743 char *map_token = NULL;
744 char *kaddr = NULL;
745 unsigned long map_start = 0;
746 unsigned long map_len = 0;
747 int err;
748
749 while (low < high) {
750 mid = (low + high) / 2;
751 offset = p + mid * item_size;
752
753 if (!map_token || offset < map_start ||
754 (offset + sizeof(struct btrfs_disk_key)) >
755 map_start + map_len) {
756 if (map_token) {
757 unmap_extent_buffer(eb, map_token, KM_USER0);
758 map_token = NULL;
759 }
760
761 err = map_private_extent_buffer(eb, offset,
762 sizeof(struct btrfs_disk_key),
763 &map_token, &kaddr,
764 &map_start, &map_len, KM_USER0);
765
766 if (!err) {
767 tmp = (struct btrfs_disk_key *)(kaddr + offset -
768 map_start);
769 } else {
770 read_extent_buffer(eb, &unaligned,
771 offset, sizeof(unaligned));
772 tmp = &unaligned;
773 }
774
775 } else {
776 tmp = (struct btrfs_disk_key *)(kaddr + offset -
777 map_start);
778 }
779 ret = comp_keys(tmp, key);
780
781 if (ret < 0)
782 low = mid + 1;
783 else if (ret > 0)
784 high = mid;
785 else {
786 *slot = mid;
787 if (map_token)
788 unmap_extent_buffer(eb, map_token, KM_USER0);
789 return 0;
790 }
791 }
792 *slot = low;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 1;
796}
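/*
 * Example (hypothetical keys): searching [2, 4, 6] for 5 returns 1 with
 * *slot == 2, the insertion point; searching for 4 returns 0 with
 * *slot == 1.
 */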
797
798/*
799 * simple bin_search frontend that does the right thing for
800 * leaves vs nodes
801 */
802static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
803 int level, int *slot)
804{
805 if (level == 0) {
806 return generic_bin_search(eb,
807 offsetof(struct btrfs_leaf, items),
808 sizeof(struct btrfs_item),
809 key, btrfs_header_nritems(eb),
810 slot);
811 } else {
812 return generic_bin_search(eb,
813 offsetof(struct btrfs_node, ptrs),
814 sizeof(struct btrfs_key_ptr),
815 key, btrfs_header_nritems(eb),
816 slot);
817 }
818 	return -1;	/* not reached */
819}
820
821 /* given a node and slot number, this reads the block it points to.  The
822 * extent buffer is returned with a reference taken (but unlocked).
823 * NULL is returned on error.
824 */
825static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
826 struct extent_buffer *parent, int slot)
827{
828 int level = btrfs_header_level(parent);
829 if (slot < 0)
830 return NULL;
831 if (slot >= btrfs_header_nritems(parent))
832 return NULL;
833
834 BUG_ON(level == 0);
835
836 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
837 btrfs_level_size(root, level - 1),
838 btrfs_node_ptr_generation(parent, slot));
839}
840
841/*
842 * node level balancing, used to make sure nodes are in proper order for
843 * item deletion. We balance from the top down, so we have to make sure
844  * that a deletion won't leave a node completely empty later on.
845 */
846static noinline int balance_level(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_path *path, int level)
849{
850 struct extent_buffer *right = NULL;
851 struct extent_buffer *mid;
852 struct extent_buffer *left = NULL;
853 struct extent_buffer *parent = NULL;
854 int ret = 0;
855 int wret;
856 int pslot;
857 int orig_slot = path->slots[level];
858 int err_on_enospc = 0;
859 u64 orig_ptr;
860
861 if (level == 0)
862 return 0;
863
864 mid = path->nodes[level];
865 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867
868 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
869
870 	if (level < BTRFS_MAX_LEVEL - 1)
871 		parent = path->nodes[level + 1];
872 	pslot = parent ? path->slots[level + 1] : 0; /* don't index past slots[] */
873
874 /*
875 * deal with the case where there is only one pointer in the root
876 * by promoting the node below to a root
877 */
878 if (!parent) {
879 struct extent_buffer *child;
880
881 if (btrfs_header_nritems(mid) != 1)
882 return 0;
883
884 /* promote the child to a root */
885 		child = read_node_slot(root, mid, 0);
886 		BUG_ON(!child);	/* check the read result before locking */
887 		btrfs_tree_lock(child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret);
890
891 spin_lock(&root->node_lock);
892 root->node = child;
893 spin_unlock(&root->node_lock);
894
895 ret = btrfs_update_extent_ref(trans, root, child->start,
896 mid->start, child->start,
897 root->root_key.objectid,
898 trans->transid, level - 1);
899 BUG_ON(ret);
900
901 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child);
903 path->locks[level] = 0;
904 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid);
906 btrfs_tree_unlock(mid);
907 /* once for the path */
908 free_extent_buffer(mid);
909 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
910 mid->start, root->root_key.objectid,
911 btrfs_header_generation(mid),
912 level, 1);
913 /* once for the root ptr */
914 free_extent_buffer(mid);
915 return ret;
916 }
917 if (btrfs_header_nritems(mid) >
918 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
919 return 0;
920
921 if (btrfs_header_nritems(mid) < 2)
922 err_on_enospc = 1;
923
924 left = read_node_slot(root, parent, pslot - 1);
925 if (left) {
926 btrfs_tree_lock(left);
927 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0);
929 if (wret) {
930 ret = wret;
931 goto enospc;
932 }
933 }
934 right = read_node_slot(root, parent, pslot + 1);
935 if (right) {
936 btrfs_tree_lock(right);
937 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0);
939 if (wret) {
940 ret = wret;
941 goto enospc;
942 }
943 }
944
945 /* first, try to make some room in the middle buffer */
946 if (left) {
947 orig_slot += btrfs_header_nritems(left);
948 wret = push_node_left(trans, root, left, mid, 1);
949 if (wret < 0)
950 ret = wret;
951 if (btrfs_header_nritems(mid) < 2)
952 err_on_enospc = 1;
953 }
954
955 /*
956 * then try to empty the right most buffer into the middle
957 */
958 if (right) {
959 wret = push_node_left(trans, root, mid, right, 1);
960 if (wret < 0 && wret != -ENOSPC)
961 ret = wret;
962 if (btrfs_header_nritems(right) == 0) {
963 u64 bytenr = right->start;
964 u64 generation = btrfs_header_generation(parent);
965 u32 blocksize = right->len;
966
967 clean_tree_block(trans, root, right);
968 btrfs_tree_unlock(right);
969 free_extent_buffer(right);
970 right = NULL;
971 wret = del_ptr(trans, root, path, level + 1, pslot +
972 1);
973 if (wret)
974 ret = wret;
975 wret = btrfs_free_extent(trans, root, bytenr,
976 blocksize, parent->start,
977 btrfs_header_owner(parent),
978 generation, level, 1);
979 if (wret)
980 ret = wret;
981 } else {
982 struct btrfs_disk_key right_key;
983 btrfs_node_key(right, &right_key, 0);
984 btrfs_set_node_key(parent, &right_key, pslot + 1);
985 btrfs_mark_buffer_dirty(parent);
986 }
987 }
988 if (btrfs_header_nritems(mid) == 1) {
989 /*
990 * we're not allowed to leave a node with one item in the
991 * tree during a delete. A deletion from lower in the tree
992 * could try to delete the only pointer in this node.
993 * So, pull some keys from the left.
994 * There has to be a left pointer at this point because
995 * otherwise we would have pulled some pointers from the
996 * right
997 */
998 BUG_ON(!left);
999 wret = balance_node_right(trans, root, mid, left);
1000 if (wret < 0) {
1001 ret = wret;
1002 goto enospc;
1003 }
1004 if (wret == 1) {
1005 wret = push_node_left(trans, root, left, mid, 1);
1006 if (wret < 0)
1007 ret = wret;
1008 }
1009 BUG_ON(wret == 1);
1010 }
1011 if (btrfs_header_nritems(mid) == 0) {
1012 /* we've managed to empty the middle node, drop it */
1013 u64 root_gen = btrfs_header_generation(parent);
1014 u64 bytenr = mid->start;
1015 u32 blocksize = mid->len;
1016
1017 clean_tree_block(trans, root, mid);
1018 btrfs_tree_unlock(mid);
1019 free_extent_buffer(mid);
1020 mid = NULL;
1021 wret = del_ptr(trans, root, path, level + 1, pslot);
1022 if (wret)
1023 ret = wret;
1024 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1025 parent->start,
1026 btrfs_header_owner(parent),
1027 root_gen, level, 1);
1028 if (wret)
1029 ret = wret;
1030 } else {
1031 /* update the parent key to reflect our changes */
1032 struct btrfs_disk_key mid_key;
1033 btrfs_node_key(mid, &mid_key, 0);
1034 btrfs_set_node_key(parent, &mid_key, pslot);
1035 btrfs_mark_buffer_dirty(parent);
1036 }
1037
1038 /* update the path */
1039 if (left) {
1040 if (btrfs_header_nritems(left) > orig_slot) {
1041 extent_buffer_get(left);
1042 /* left was locked after cow */
1043 path->nodes[level] = left;
1044 path->slots[level + 1] -= 1;
1045 path->slots[level] = orig_slot;
1046 if (mid) {
1047 btrfs_tree_unlock(mid);
1048 free_extent_buffer(mid);
1049 }
1050 } else {
1051 orig_slot -= btrfs_header_nritems(left);
1052 path->slots[level] = orig_slot;
1053 }
1054 }
1055 /* double check we haven't messed things up */
1056 check_block(root, path, level);
1057 if (orig_ptr !=
1058 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1059 BUG();
1060enospc:
1061 if (right) {
1062 btrfs_tree_unlock(right);
1063 free_extent_buffer(right);
1064 }
1065 if (left) {
1066 if (path->nodes[level] != left)
1067 btrfs_tree_unlock(left);
1068 free_extent_buffer(left);
1069 }
1070 return ret;
1071}
1072
1073/* Node balancing for insertion. Here we only split or push nodes around
1074 * when they are completely full. This is also done top down, so we
1075 * have to be pessimistic.
1076 */
1077static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path, int level)
1080{
1081 struct extent_buffer *right = NULL;
1082 struct extent_buffer *mid;
1083 struct extent_buffer *left = NULL;
1084 struct extent_buffer *parent = NULL;
1085 int ret = 0;
1086 int wret;
1087 int pslot;
1088 int orig_slot = path->slots[level];
1089 u64 orig_ptr;
1090
1091 if (level == 0)
1092 return 1;
1093
1094 mid = path->nodes[level];
1095 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1096 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1097
1098 	if (level < BTRFS_MAX_LEVEL - 1)
1099 		parent = path->nodes[level + 1];
1100 	pslot = parent ? path->slots[level + 1] : 0; /* don't index past slots[] */
1101
1102 if (!parent)
1103 return 1;
1104
1105 left = read_node_slot(root, parent, pslot - 1);
1106
1107 /* first, try to make some room in the middle buffer */
1108 if (left) {
1109 u32 left_nr;
1110
1111 btrfs_tree_lock(left);
1112 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1;
1115 } else {
1116 ret = btrfs_cow_block(trans, root, left, parent,
1117 pslot - 1, &left, 0);
1118 if (ret)
1119 wret = 1;
1120 else {
1121 wret = push_node_left(trans, root,
1122 left, mid, 0);
1123 }
1124 }
1125 if (wret < 0)
1126 ret = wret;
1127 if (wret == 0) {
1128 struct btrfs_disk_key disk_key;
1129 orig_slot += left_nr;
1130 btrfs_node_key(mid, &disk_key, 0);
1131 btrfs_set_node_key(parent, &disk_key, pslot);
1132 btrfs_mark_buffer_dirty(parent);
1133 if (btrfs_header_nritems(left) > orig_slot) {
1134 path->nodes[level] = left;
1135 path->slots[level + 1] -= 1;
1136 path->slots[level] = orig_slot;
1137 btrfs_tree_unlock(mid);
1138 free_extent_buffer(mid);
1139 } else {
1140 orig_slot -=
1141 btrfs_header_nritems(left);
1142 path->slots[level] = orig_slot;
1143 btrfs_tree_unlock(left);
1144 free_extent_buffer(left);
1145 }
1146 return 0;
1147 }
1148 btrfs_tree_unlock(left);
1149 free_extent_buffer(left);
1150 }
1151 right = read_node_slot(root, parent, pslot + 1);
1152
1153 /*
1154 * then try to empty the right most buffer into the middle
1155 */
1156 if (right) {
1157 u32 right_nr;
1158 btrfs_tree_lock(right);
1159 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1;
1162 } else {
1163 ret = btrfs_cow_block(trans, root, right,
1164 parent, pslot + 1,
1165 &right, 0);
1166 if (ret)
1167 wret = 1;
1168 else {
1169 wret = balance_node_right(trans, root,
1170 right, mid);
1171 }
1172 }
1173 if (wret < 0)
1174 ret = wret;
1175 if (wret == 0) {
1176 struct btrfs_disk_key disk_key;
1177
1178 btrfs_node_key(right, &disk_key, 0);
1179 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1180 btrfs_mark_buffer_dirty(parent);
1181
1182 if (btrfs_header_nritems(mid) <= orig_slot) {
1183 path->nodes[level] = right;
1184 path->slots[level + 1] += 1;
1185 path->slots[level] = orig_slot -
1186 btrfs_header_nritems(mid);
1187 btrfs_tree_unlock(mid);
1188 free_extent_buffer(mid);
1189 } else {
1190 btrfs_tree_unlock(right);
1191 free_extent_buffer(right);
1192 }
1193 return 0;
1194 }
1195 btrfs_tree_unlock(right);
1196 free_extent_buffer(right);
1197 }
1198 return 1;
1199}
1200
1201/*
1202 * readahead one full node of leaves, finding things that are close
1203  * to the block in 'slot', and triggering readahead on them.
1204 */
1205static noinline void reada_for_search(struct btrfs_root *root,
1206 struct btrfs_path *path,
1207 int level, int slot, u64 objectid)
1208{
1209 struct extent_buffer *node;
1210 struct btrfs_disk_key disk_key;
1211 u32 nritems;
1212 u64 search;
1213 u64 lowest_read;
1214 u64 highest_read;
1215 u64 nread = 0;
1216 int direction = path->reada;
1217 struct extent_buffer *eb;
1218 u32 nr;
1219 u32 blocksize;
1220 u32 nscan = 0;
1221
1222 if (level != 1)
1223 return;
1224
1225 if (!path->nodes[level])
1226 return;
1227
1228 node = path->nodes[level];
1229
1230 search = btrfs_node_blockptr(node, slot);
1231 blocksize = btrfs_level_size(root, level - 1);
1232 eb = btrfs_find_tree_block(root, search, blocksize);
1233 if (eb) {
1234 free_extent_buffer(eb);
1235 return;
1236 }
1237
1238 highest_read = search;
1239 lowest_read = search;
1240
1241 nritems = btrfs_header_nritems(node);
1242 nr = slot;
1243 while (1) {
1244 if (direction < 0) {
1245 if (nr == 0)
1246 break;
1247 nr--;
1248 } else if (direction > 0) {
1249 nr++;
1250 if (nr >= nritems)
1251 break;
1252 }
1253 if (path->reada < 0 && objectid) {
1254 btrfs_node_key(node, &disk_key, nr);
1255 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1256 break;
1257 }
1258 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) ||
1260 (search < lowest_read && lowest_read - search <= 16384) ||
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize;
1265 }
1266 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1268 break;
1269
1270 if (nread > (256 * 1024) || nscan > 128)
1271 break;
1272
1273 if (search < lowest_read)
1274 lowest_read = search;
1275 if (search > highest_read)
1276 highest_read = search;
1277 }
1278}
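/*
 * Illustrative effect of the limits above: with path->reada == 1 the
 * scan stops once 64K has been read ahead or 32 slots inspected; only
 * more aggressive settings (reada >= 2) continue to the hard limits of
 * 256K and 128 slots.
 */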
1279
1280/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because
1283 * operations on the tree might require changing key pointers higher up in the
1284 * tree.
1285 *
1286 * callers might also have set path->keep_locks, which tells this code to keep
1287 * the lock if the path points to the last slot in the block. This is part of
1288 * walking through the tree, and selecting the next slot in the higher block.
1289 *
1290 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1291 * if lowest_unlock is 1, level 0 won't be unlocked
1292 */
1293static noinline void unlock_up(struct btrfs_path *path, int level,
1294 int lowest_unlock)
1295{
1296 int i;
1297 int skip_level = level;
1298 int no_skips = 0;
1299 struct extent_buffer *t;
1300
1301 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1302 if (!path->nodes[i])
1303 break;
1304 if (!path->locks[i])
1305 break;
1306 if (!no_skips && path->slots[i] == 0) {
1307 skip_level = i + 1;
1308 continue;
1309 }
1310 if (!no_skips && path->keep_locks) {
1311 u32 nritems;
1312 t = path->nodes[i];
1313 nritems = btrfs_header_nritems(t);
1314 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1315 skip_level = i + 1;
1316 continue;
1317 }
1318 }
1319 if (skip_level < i && i >= lowest_unlock)
1320 no_skips = 1;
1321
1322 t = path->nodes[i];
1323 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1324 btrfs_tree_unlock(t);
1325 path->locks[i] = 0;
1326 }
1327 }
1328}
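/*
 * Example (illustrative): if the search lands in slot 0 of a leaf,
 * inserting a new smallest key would have to update the key pointer in
 * the parent, so the parent (and any further slot-0 ancestors) stays
 * locked; with a leaf slot greater than 0, everything above
 * lowest_unlock can be released.
 */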
1329
1330/*
1331  * look for key in the tree.  path is filled in with nodes along the way.
1332 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0)
1334 *
1335 * If the key isn't found, the path points to the slot where it should
1336 * be inserted, and 1 is returned. If there are other errors during the
1337 * search a negative error number is returned.
1338 *
1339 * if ins_len > 0, nodes and leaves will be split as we walk down the
1340 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1341 * possible)
1342 */
1343int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1344 *root, struct btrfs_key *key, struct btrfs_path *p, int
1345 ins_len, int cow)
1346{
1347 struct extent_buffer *b;
1348 struct extent_buffer *tmp;
1349 int slot;
1350 int ret;
1351 int level;
1352 int should_reada = p->reada;
1353 int lowest_unlock = 1;
1354 int blocksize;
1355 u8 lowest_level = 0;
1356 u64 blocknr;
1357 u64 gen;
1358 struct btrfs_key prealloc_block;
1359
1360 lowest_level = p->lowest_level;
1361 WARN_ON(lowest_level && ins_len > 0);
1362 WARN_ON(p->nodes[0] != NULL);
1363
1364 if (ins_len < 0)
1365 lowest_unlock = 2;
1366
1367 prealloc_block.objectid = 0;
1368
1369again:
1370 if (p->skip_locking)
1371 b = btrfs_root_node(root);
1372 else
1373 b = btrfs_lock_root_node(root);
1374
1375 while (b) {
1376 level = btrfs_header_level(b);
1377
1378 /*
1379 * setup the path here so we can release it under lock
1380 * contention with the cow code
1381 */
1382 p->nodes[level] = b;
1383 if (!p->skip_locking)
1384 p->locks[level] = 1;
1385
1386 if (cow) {
1387 int wret;
1388
1389 			/* check whether we can skip the cow for this block */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done;
1396 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398
1399 /* ok, we have to cow, is our old prealloc the right
1400 * size?
1401 */
1402 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) {
1404 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid,
1406 prealloc_block.offset);
1407 prealloc_block.objectid = 0;
1408 }
1409
1410 /*
1411 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held.
1413 */
1414 if (level > 1 && !prealloc_block.objectid &&
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len;
1417 u64 hint = b->start;
1418
1419 btrfs_release_path(root, p);
1420 ret = btrfs_reserve_extent(trans, root,
1421 size, size, 0,
1422 hint, (u64)-1,
1423 &prealloc_block, 0);
1424 BUG_ON(ret);
1425 goto again;
1426 }
1427
1428 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1],
1430 p->slots[level + 1],
1431 &b, prealloc_block.objectid);
1432 prealloc_block.objectid = 0;
1433 if (wret) {
1434 free_extent_buffer(b);
1435 ret = wret;
1436 goto done;
1437 }
1438 }
1439cow_done:
1440 BUG_ON(!cow && ins_len);
1441 if (level != btrfs_header_level(b))
1442 WARN_ON(1);
1443 level = btrfs_header_level(b);
1444
1445 p->nodes[level] = b;
1446 if (!p->skip_locking)
1447 p->locks[level] = 1;
1448
1449 ret = check_block(root, p, level);
1450 if (ret) {
1451 ret = -1;
1452 goto done;
1453 }
1454
1455 ret = bin_search(b, key, level, &slot);
1456 if (level != 0) {
1457 if (ret && slot > 0)
1458 slot -= 1;
1459 p->slots[level] = slot;
1460 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level);
1464 BUG_ON(sret > 0);
1465 if (sret) {
1466 ret = sret;
1467 goto done;
1468 }
1469 b = p->nodes[level];
1470 slot = p->slots[level];
1471 } else if (ins_len < 0) {
1472 int sret = balance_level(trans, root, p,
1473 level);
1474 if (sret) {
1475 ret = sret;
1476 goto done;
1477 }
1478 b = p->nodes[level];
1479 if (!b) {
1480 btrfs_release_path(NULL, p);
1481 goto again;
1482 }
1483 slot = p->slots[level];
1484 BUG_ON(btrfs_header_nritems(b) == 1);
1485 }
1486 unlock_up(p, level, lowest_unlock);
1487
1488 /* this is only true while dropping a snapshot */
1489 if (level == lowest_level) {
1490 ret = 0;
1491 goto done;
1492 }
1493
1494 blocknr = btrfs_node_blockptr(b, slot);
1495 gen = btrfs_node_ptr_generation(b, slot);
1496 blocksize = btrfs_level_size(root, level - 1);
1497
1498 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1499 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1500 b = tmp;
1501 } else {
1502 /*
1503 * reduce lock contention at high levels
1504 * of the btree by dropping locks before
1505 * we read.
1506 */
1507 if (level > 1) {
1508 btrfs_release_path(NULL, p);
1509 if (tmp)
1510 free_extent_buffer(tmp);
1511 if (should_reada)
1512 reada_for_search(root, p,
1513 level, slot,
1514 key->objectid);
1515
1516 tmp = read_tree_block(root, blocknr,
1517 blocksize, gen);
1518 if (tmp)
1519 free_extent_buffer(tmp);
1520 goto again;
1521 } else {
1522 if (tmp)
1523 free_extent_buffer(tmp);
1524 if (should_reada)
1525 reada_for_search(root, p,
1526 level, slot,
1527 key->objectid);
1528 b = read_node_slot(root, b, slot);
1529 }
1530 }
1531 if (!p->skip_locking)
1532 btrfs_tree_lock(b);
1533 } else {
1534 p->slots[level] = slot;
1535 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0);
1539 BUG_ON(sret > 0);
1540 if (sret) {
1541 ret = sret;
1542 goto done;
1543 }
1544 }
1545 if (!p->search_for_split)
1546 unlock_up(p, level, lowest_unlock);
1547 goto done;
1548 }
1549 }
1550 ret = 1;
1551done:
1552 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid,
1555 prealloc_block.offset);
1556 }
1557
1558 return ret;
1559}
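/*
 * Typical caller pattern, as an illustrative sketch only (inode_objectid
 * is a made-up variable, not something defined in this file):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key;
 *	int ret;
 *
 *	key.objectid = inode_objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 *
 * ret == 0 means the item is at path->nodes[0], slot path->slots[0];
 * ret == 1 means path points at the slot where the key would be
 * inserted.  Either way the caller releases the path with
 * btrfs_free_path() when done.
 */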
1560
1561int btrfs_merge_path(struct btrfs_trans_handle *trans,
1562 struct btrfs_root *root,
1563 struct btrfs_key *node_keys,
1564 u64 *nodes, int lowest_level)
1565{
1566 struct extent_buffer *eb;
1567 struct extent_buffer *parent;
1568 struct btrfs_key key;
1569 u64 bytenr;
1570 u64 generation;
1571 u32 blocksize;
1572 int level;
1573 int slot;
1574 int key_match;
1575 int ret;
1576
1577 eb = btrfs_lock_root_node(root);
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret);
1580
1581 parent = eb;
1582 while (1) {
1583 level = btrfs_header_level(parent);
1584 if (level == 0 || level <= lowest_level)
1585 break;
1586
1587 ret = bin_search(parent, &node_keys[lowest_level], level,
1588 &slot);
1589 if (ret && slot > 0)
1590 slot--;
1591
1592 bytenr = btrfs_node_blockptr(parent, slot);
1593 if (nodes[level - 1] == bytenr)
1594 break;
1595
1596 blocksize = btrfs_level_size(root, level - 1);
1597 generation = btrfs_node_ptr_generation(parent, slot);
1598 btrfs_node_key_to_cpu(eb, &key, slot);
1599 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1600
1601 if (generation == trans->transid) {
1602 eb = read_tree_block(root, bytenr, blocksize,
1603 generation);
1604 btrfs_tree_lock(eb);
1605 }
1606
1607 /*
1608 * if node keys match and node pointer hasn't been modified
1609 * in the running transaction, we can merge the path. for
1610 		 * blocks owned by reloc trees, the node pointer check is
1611 * skipped, this is because these blocks are fully controlled
1612 * by the space balance code, no one else can modify them.
1613 */
1614 if (!nodes[level - 1] || !key_match ||
1615 (generation == trans->transid &&
1616 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1617 if (level == 1 || level == lowest_level + 1) {
1618 if (generation == trans->transid) {
1619 btrfs_tree_unlock(eb);
1620 free_extent_buffer(eb);
1621 }
1622 break;
1623 }
1624
1625 if (generation != trans->transid) {
1626 eb = read_tree_block(root, bytenr, blocksize,
1627 generation);
1628 btrfs_tree_lock(eb);
1629 }
1630
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1632 &eb, 0);
1633 BUG_ON(ret);
1634
1635 if (root->root_key.objectid ==
1636 BTRFS_TREE_RELOC_OBJECTID) {
1637 if (!nodes[level - 1]) {
1638 nodes[level - 1] = eb->start;
1639 memcpy(&node_keys[level - 1], &key,
1640 sizeof(node_keys[0]));
1641 } else {
1642 WARN_ON(1);
1643 }
1644 }
1645
1646 btrfs_tree_unlock(parent);
1647 free_extent_buffer(parent);
1648 parent = eb;
1649 continue;
1650 }
1651
1652 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1653 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1654 btrfs_mark_buffer_dirty(parent);
1655
1656 ret = btrfs_inc_extent_ref(trans, root,
1657 nodes[level - 1],
1658 blocksize, parent->start,
1659 btrfs_header_owner(parent),
1660 btrfs_header_generation(parent),
1661 level - 1);
1662 BUG_ON(ret);
1663
1664 /*
1665 * If the block was created in the running transaction,
1666 * it's possible this is the last reference to it, so we
1667 * should drop the subtree.
1668 */
1669 if (generation == trans->transid) {
1670 ret = btrfs_drop_subtree(trans, root, eb, parent);
1671 BUG_ON(ret);
1672 btrfs_tree_unlock(eb);
1673 free_extent_buffer(eb);
1674 } else {
1675 ret = btrfs_free_extent(trans, root, bytenr,
1676 blocksize, parent->start,
1677 btrfs_header_owner(parent),
1678 btrfs_header_generation(parent),
1679 level - 1, 1);
1680 BUG_ON(ret);
1681 }
1682 break;
1683 }
1684 btrfs_tree_unlock(parent);
1685 free_extent_buffer(parent);
1686 return 0;
1687}
1688
1689/*
1690 * adjust the pointers going up the tree, starting at level
1691  * making sure the right key of each node points to 'key'.
1692 * This is used after shifting pointers to the left, so it stops
1693 * fixing up pointers when a given leaf/node is not in slot 0 of the
1694 * higher levels
1695 *
1696 * If this fails to write a tree block, it returns -1, but continues
1697 * fixing up the blocks in ram so the tree is consistent.
1698 */
1699static int fixup_low_keys(struct btrfs_trans_handle *trans,
1700 struct btrfs_root *root, struct btrfs_path *path,
1701 struct btrfs_disk_key *key, int level)
1702{
1703 int i;
1704 int ret = 0;
1705 struct extent_buffer *t;
1706
1707 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1708 int tslot = path->slots[i];
1709 if (!path->nodes[i])
1710 break;
1711 t = path->nodes[i];
1712 btrfs_set_node_key(t, key, tslot);
1713 btrfs_mark_buffer_dirty(path->nodes[i]);
1714 if (tslot != 0)
1715 break;
1716 }
1717 return ret;
1718}
1719
1720/*
1721 * update item key.
1722 *
1723  * This function isn't completely safe.  It's the caller's responsibility
1724  * to ensure that the new key won't break the sort order
1725 */
1726int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1727 struct btrfs_root *root, struct btrfs_path *path,
1728 struct btrfs_key *new_key)
1729{
1730 struct btrfs_disk_key disk_key;
1731 struct extent_buffer *eb;
1732 int slot;
1733
1734 eb = path->nodes[0];
1735 slot = path->slots[0];
1736 if (slot > 0) {
1737 btrfs_item_key(eb, &disk_key, slot - 1);
1738 if (comp_keys(&disk_key, new_key) >= 0)
1739 return -1;
1740 }
1741 if (slot < btrfs_header_nritems(eb) - 1) {
1742 btrfs_item_key(eb, &disk_key, slot + 1);
1743 if (comp_keys(&disk_key, new_key) <= 0)
1744 return -1;
1745 }
1746
1747 btrfs_cpu_key_to_disk(&disk_key, new_key);
1748 btrfs_set_item_key(eb, &disk_key, slot);
1749 btrfs_mark_buffer_dirty(eb);
1750 if (slot == 0)
1751 fixup_low_keys(trans, root, path, &disk_key, 1);
1752 return 0;
1753}
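/*
 * Example (made-up keys): in a leaf holding (5, 1, 0), (5, 1, 4096) and
 * (5, 1, 8192) at slots 0-2, the slot 1 key may be changed to anything
 * strictly between its neighbours, e.g. (5, 1, 6000); (5, 1, 9000) would
 * break the ordering and is rejected with -1.
 */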
1754
1755/*
1756 * try to push data from one node into the next node left in the
1757 * tree.
1758 *
1759 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1760 * error, and > 0 if there was no room in the left hand block.
1761 */
1762static int push_node_left(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root, struct extent_buffer *dst,
1764 struct extent_buffer *src, int empty)
1765{
1766 int push_items = 0;
1767 int src_nritems;
1768 int dst_nritems;
1769 int ret = 0;
1770
1771 src_nritems = btrfs_header_nritems(src);
1772 dst_nritems = btrfs_header_nritems(dst);
1773 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1774 WARN_ON(btrfs_header_generation(src) != trans->transid);
1775 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1776
1777 if (!empty && src_nritems <= 8)
1778 return 1;
1779
1780 if (push_items <= 0)
1781 return 1;
1782
1783 if (empty) {
1784 push_items = min(src_nritems, push_items);
1785 if (push_items < src_nritems) {
1786 /* leave at least 8 pointers in the node if
1787 * we aren't going to empty it
1788 */
1789 if (src_nritems - push_items < 8) {
1790 if (push_items <= 8)
1791 return 1;
1792 push_items -= 8;
1793 }
1794 }
1795 } else
1796 push_items = min(src_nritems - 8, push_items);
1797
1798 copy_extent_buffer(dst, src,
1799 btrfs_node_key_ptr_offset(dst_nritems),
1800 btrfs_node_key_ptr_offset(0),
1801 push_items * sizeof(struct btrfs_key_ptr));
1802
1803 if (push_items < src_nritems) {
1804 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1805 btrfs_node_key_ptr_offset(push_items),
1806 (src_nritems - push_items) *
1807 sizeof(struct btrfs_key_ptr));
1808 }
1809 btrfs_set_header_nritems(src, src_nritems - push_items);
1810 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1811 btrfs_mark_buffer_dirty(src);
1812 btrfs_mark_buffer_dirty(dst);
1813
1814 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1815 BUG_ON(ret);
1816
1817 return ret;
1818}
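/*
 * Worked example (hypothetical sizes): if BTRFS_NODEPTRS_PER_BLOCK(root)
 * is 121, dst holds 100 pointers and src holds 30, push_items starts at
 * 21; the non-empty case caps it at src_nritems - 8 == 22, so 21
 * pointers move left and 9 remain in src.
 */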
1819
1820/*
1821 * try to push data from one node into the next node right in the
1822 * tree.
1823 *
1824 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1825 * error, and > 0 if there was no room in the right hand block.
1826 *
1827 * this will only push up to 1/2 the contents of the left node over
1828 */
1829static int balance_node_right(struct btrfs_trans_handle *trans,
1830 struct btrfs_root *root,
1831 struct extent_buffer *dst,
1832 struct extent_buffer *src)
1833{
1834 int push_items = 0;
1835 int max_push;
1836 int src_nritems;
1837 int dst_nritems;
1838 int ret = 0;
1839
1840 WARN_ON(btrfs_header_generation(src) != trans->transid);
1841 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1842
1843 src_nritems = btrfs_header_nritems(src);
1844 dst_nritems = btrfs_header_nritems(dst);
1845 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1846 if (push_items <= 0)
1847 return 1;
1848
1849 if (src_nritems < 4)
1850 return 1;
1851
1852 max_push = src_nritems / 2 + 1;
1853 /* don't try to empty the node */
1854 if (max_push >= src_nritems)
1855 return 1;
1856
1857 if (max_push < push_items)
1858 push_items = max_push;
1859
1860 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1861 btrfs_node_key_ptr_offset(0),
1862 (dst_nritems) *
1863 sizeof(struct btrfs_key_ptr));
1864
1865 copy_extent_buffer(dst, src,
1866 btrfs_node_key_ptr_offset(0),
1867 btrfs_node_key_ptr_offset(src_nritems - push_items),
1868 push_items * sizeof(struct btrfs_key_ptr));
1869
1870 btrfs_set_header_nritems(src, src_nritems - push_items);
1871 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1872
1873 btrfs_mark_buffer_dirty(src);
1874 btrfs_mark_buffer_dirty(dst);
1875
1876 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1877 BUG_ON(ret);
1878
1879 return ret;
1880}
1881
1882/*
1883 * helper function to insert a new root level in the tree.
1884 * A new node is allocated, and a single item is inserted to
1885 * point to the existing root
1886 *
1887 * returns zero on success or < 0 on failure.
1888 */
1889static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1890 struct btrfs_root *root,
1891 struct btrfs_path *path, int level)
1892{
1893 u64 lower_gen;
1894 struct extent_buffer *lower;
1895 struct extent_buffer *c;
1896 struct extent_buffer *old;
1897 struct btrfs_disk_key lower_key;
1898 int ret;
1899
1900 BUG_ON(path->nodes[level]);
1901 BUG_ON(path->nodes[level-1] != root->node);
1902
1903 lower = path->nodes[level-1];
1904 if (level == 1)
1905 btrfs_item_key(lower, &lower_key, 0);
1906 else
1907 btrfs_node_key(lower, &lower_key, 0);
1908
1909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1910 root->root_key.objectid, trans->transid,
1911 level, root->node->start, 0);
1912 if (IS_ERR(c))
1913 return PTR_ERR(c);
1914
1915 memset_extent_buffer(c, 0, 0, root->nodesize);
1916 btrfs_set_header_nritems(c, 1);
1917 btrfs_set_header_level(c, level);
1918 btrfs_set_header_bytenr(c, c->start);
1919 btrfs_set_header_generation(c, trans->transid);
1920 btrfs_set_header_owner(c, root->root_key.objectid);
1921
1922 write_extent_buffer(c, root->fs_info->fsid,
1923 (unsigned long)btrfs_header_fsid(c),
1924 BTRFS_FSID_SIZE);
1925
1926 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1927 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1928 BTRFS_UUID_SIZE);
1929
1930 btrfs_set_node_key(c, &lower_key, 0);
1931 btrfs_set_node_blockptr(c, 0, lower->start);
1932 lower_gen = btrfs_header_generation(lower);
1933 WARN_ON(lower_gen != trans->transid);
1934
1935 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1936
1937 btrfs_mark_buffer_dirty(c);
1938
1939 spin_lock(&root->node_lock);
1940 old = root->node;
1941 root->node = c;
1942 spin_unlock(&root->node_lock);
1943
1944 ret = btrfs_update_extent_ref(trans, root, lower->start,
1945 lower->start, c->start,
1946 root->root_key.objectid,
1947 trans->transid, level - 1);
1948 BUG_ON(ret);
1949
1950 /* the super has an extra ref to root->node */
1951 free_extent_buffer(old);
1952
1953 add_root_to_dirty_list(root);
1954 extent_buffer_get(c);
1955 path->nodes[level] = c;
1956 path->locks[level] = 1;
1957 path->slots[level] = 0;
1958 return 0;
1959}
1960
1961/*
1962 * worker function to insert a single pointer in a node.
1963 * the node should have enough room for the pointer already
1964 *
1965 * slot and level indicate where you want the key to go, and
1966 * blocknr is the block the key points to.
1967 *
1968 * returns zero on success and < 0 on any error
1969 */
1970static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
1971 *root, struct btrfs_path *path, struct btrfs_disk_key
1972 *key, u64 bytenr, int slot, int level)
1973{
1974 struct extent_buffer *lower;
1975 int nritems;
1976
1977 BUG_ON(!path->nodes[level]);
1978 lower = path->nodes[level];
1979 nritems = btrfs_header_nritems(lower);
1980 if (slot > nritems)
1981 BUG();
1982 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
1983 BUG();
1984 if (slot != nritems) {
1985 memmove_extent_buffer(lower,
1986 btrfs_node_key_ptr_offset(slot + 1),
1987 btrfs_node_key_ptr_offset(slot),
1988 (nritems - slot) * sizeof(struct btrfs_key_ptr));
1989 }
1990 btrfs_set_node_key(lower, key, slot);
1991 btrfs_set_node_blockptr(lower, slot, bytenr);
1992 WARN_ON(trans->transid == 0);
1993 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1994 btrfs_set_header_nritems(lower, nritems + 1);
1995 btrfs_mark_buffer_dirty(lower);
1996 return 0;
1997}
1998
1999/*
2000 * split the node at the specified level in path in two.
2001 * The path is corrected to point to the appropriate node after the split
2002 *
2003 * Before splitting this tries to make some room in the node by pushing
2004 * left and right, if either one works, it returns right away.
2005 *
2006 * returns 0 on success and < 0 on failure
2007 */
2008static noinline int split_node(struct btrfs_trans_handle *trans,
2009 struct btrfs_root *root,
2010 struct btrfs_path *path, int level)
2011{
2012 struct extent_buffer *c;
2013 struct extent_buffer *split;
2014 struct btrfs_disk_key disk_key;
2015 int mid;
2016 int ret;
2017 int wret;
2018 u32 c_nritems;
2019
2020 c = path->nodes[level];
2021 WARN_ON(btrfs_header_generation(c) != trans->transid);
2022 if (c == root->node) {
2023 		/* trying to split the root, let's make a new one */
2024 ret = insert_new_root(trans, root, path, level + 1);
2025 if (ret)
2026 return ret;
2027 } else {
2028 ret = push_nodes_for_insert(trans, root, path, level);
2029 c = path->nodes[level];
2030 if (!ret && btrfs_header_nritems(c) <
2031 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2032 return 0;
2033 if (ret < 0)
2034 return ret;
2035 }
2036
2037 c_nritems = btrfs_header_nritems(c);
2038
2039 split = btrfs_alloc_free_block(trans, root, root->nodesize,
2040 path->nodes[level + 1]->start,
2041 root->root_key.objectid,
2042 trans->transid, level, c->start, 0);
2043 if (IS_ERR(split))
2044 return PTR_ERR(split);
2045
2046 btrfs_set_header_flags(split, btrfs_header_flags(c));
2047 btrfs_set_header_level(split, btrfs_header_level(c));
2048 btrfs_set_header_bytenr(split, split->start);
2049 btrfs_set_header_generation(split, trans->transid);
2050 btrfs_set_header_owner(split, root->root_key.objectid);
2051 btrfs_set_header_flags(split, 0);
2052 write_extent_buffer(split, root->fs_info->fsid,
2053 (unsigned long)btrfs_header_fsid(split),
2054 BTRFS_FSID_SIZE);
2055 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2056 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2057 BTRFS_UUID_SIZE);
2058
2059 mid = (c_nritems + 1) / 2;
2060
2061 copy_extent_buffer(split, c,
2062 btrfs_node_key_ptr_offset(0),
2063 btrfs_node_key_ptr_offset(mid),
2064 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2065 btrfs_set_header_nritems(split, c_nritems - mid);
2066 btrfs_set_header_nritems(c, mid);
2067 ret = 0;
2068
2069 btrfs_mark_buffer_dirty(c);
2070 btrfs_mark_buffer_dirty(split);
2071
2072 btrfs_node_key(split, &disk_key, 0);
2073 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2074 path->slots[level + 1] + 1,
2075 level + 1);
2076 if (wret)
2077 ret = wret;
2078
2079 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2080 BUG_ON(ret);
2081
2082 if (path->slots[level] >= mid) {
2083 path->slots[level] -= mid;
2084 btrfs_tree_unlock(c);
2085 free_extent_buffer(c);
2086 path->nodes[level] = split;
2087 path->slots[level + 1] += 1;
2088 } else {
2089 btrfs_tree_unlock(split);
2090 free_extent_buffer(split);
2091 }
2092 return ret;
2093}
2094
2095/*
2096 * how many bytes are required to store the items in a leaf. start
2097 * and nr indicate which items in the leaf to check. This totals up the
2098 * space used both by the item structs and the item data
2099 */
2100static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2101{
2102 int data_len;
2103 int nritems = btrfs_header_nritems(l);
2104 int end = min(nritems, start + nr) - 1;
2105
2106 if (!nr)
2107 return 0;
2108 data_len = btrfs_item_end_nr(l, start);
2109 data_len = data_len - btrfs_item_offset_nr(l, end);
2110 data_len += sizeof(struct btrfs_item) * nr;
2111 WARN_ON(data_len < 0);
2112 return data_len;
2113}
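/*
 * Worked example (hypothetical leaf): with a 4096 byte data area and two
 * items of 100 and 50 data bytes, leaf_space_used(l, 0, 2) is
 * (4096 - 3946) + 2 * sizeof(struct btrfs_item) (25 bytes each in this
 * format) = 150 + 50 = 200, so btrfs_leaf_free_space() below would
 * report 4096 - 200 = 3896 bytes.
 */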
2114
2115/*
2116 * The space between the end of the leaf items and
2117 * the start of the leaf data. IOW, how much room
2118 * the leaf has left for both items and data
2119 */
2120noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2121 struct extent_buffer *leaf)
2122{
2123 int nritems = btrfs_header_nritems(leaf);
2124 int ret;
2125 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2126 if (ret < 0) {
2127 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2128 "used %d nritems %d\n",
2129 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2130 leaf_space_used(leaf, 0, nritems), nritems);
2131 }
2132 return ret;
2133}
2134
2135/*
2136 * push some data in the path leaf to the right, trying to free up at
2137  * least data_size bytes.
2138 *
2139 * returns 1 if the push failed because the other node didn't have enough
2140 * room, 0 if everything worked out and < 0 if there were major errors.
2141 */
2142static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2143 *root, struct btrfs_path *path, int data_size,
2144 int empty)
2145{
2146 struct extent_buffer *left = path->nodes[0];
2147 struct extent_buffer *right;
2148 struct extent_buffer *upper;
2149 struct btrfs_disk_key disk_key;
2150 int slot;
2151 u32 i;
2152 int free_space;
2153 int push_space = 0;
2154 int push_items = 0;
2155 struct btrfs_item *item;
2156 u32 left_nritems;
2157 u32 nr;
2158 u32 right_nritems;
2159 u32 data_end;
2160 u32 this_item_size;
2161 int ret;
2162
2163 slot = path->slots[1];
2164 if (!path->nodes[1])
2165 return 1;
2166
2167 upper = path->nodes[1];
2168 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1;
2170
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2172
2173 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right);
2175 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size)
2177 goto out_unlock;
2178
2179 /* cow and double check */
2180 ret = btrfs_cow_block(trans, root, right, upper,
2181 slot + 1, &right, 0);
2182 if (ret)
2183 goto out_unlock;
2184
2185 free_space = btrfs_leaf_free_space(root, right);
2186 if (free_space < data_size)
2187 goto out_unlock;
2188
2189 left_nritems = btrfs_header_nritems(left);
2190 if (left_nritems == 0)
2191 goto out_unlock;
2192
2193 if (empty)
2194 nr = 0;
2195 else
2196 nr = 1;
2197
2198 if (path->slots[0] >= left_nritems)
2199 push_space += data_size;
2200
2201 i = left_nritems - 1;
2202 while (i >= nr) {
2203 item = btrfs_item_nr(left, i);
2204
2205 if (!empty && push_items > 0) {
2206 if (path->slots[0] > i)
2207 break;
2208 if (path->slots[0] == i) {
2209 int space = btrfs_leaf_free_space(root, left);
2210 if (space + push_space * 2 > free_space)
2211 break;
2212 }
2213 }
2214
2215 if (path->slots[0] == i)
2216 push_space += data_size;
2217
2218 if (!left->map_token) {
2219 map_extent_buffer(left, (unsigned long)item,
2220 sizeof(struct btrfs_item),
2221 &left->map_token, &left->kaddr,
2222 &left->map_start, &left->map_len,
2223 KM_USER1);
2224 }
2225
2226 this_item_size = btrfs_item_size(left, item);
2227 if (this_item_size + sizeof(*item) + push_space > free_space)
2228 break;
2229
2230 push_items++;
2231 push_space += this_item_size + sizeof(*item);
2232 if (i == 0)
2233 break;
2234 i--;
2235 }
2236 if (left->map_token) {
2237 unmap_extent_buffer(left, left->map_token, KM_USER1);
2238 left->map_token = NULL;
2239 }
2240
2241 if (push_items == 0)
2242 goto out_unlock;
2243
2244 if (!empty && push_items == left_nritems)
2245 WARN_ON(1);
2246
2247 /* push left to right */
2248 right_nritems = btrfs_header_nritems(right);
2249
2250 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2251 push_space -= leaf_data_end(root, left);
2252
2253 /* make room in the right data area */
2254 data_end = leaf_data_end(root, right);
2255 memmove_extent_buffer(right,
2256 btrfs_leaf_data(right) + data_end - push_space,
2257 btrfs_leaf_data(right) + data_end,
2258 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2259
2260 /* copy from the left data area */
2261 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2262 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2263 btrfs_leaf_data(left) + leaf_data_end(root, left),
2264 push_space);
2265
2266 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2267 btrfs_item_nr_offset(0),
2268 right_nritems * sizeof(struct btrfs_item));
2269
2270 /* copy the items from left to right */
2271 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2272 btrfs_item_nr_offset(left_nritems - push_items),
2273 push_items * sizeof(struct btrfs_item));
2274
2275 /* update the item pointers */
2276 right_nritems += push_items;
2277 btrfs_set_header_nritems(right, right_nritems);
2278 push_space = BTRFS_LEAF_DATA_SIZE(root);
2279 for (i = 0; i < right_nritems; i++) {
2280 item = btrfs_item_nr(right, i);
2281 if (!right->map_token) {
2282 map_extent_buffer(right, (unsigned long)item,
2283 sizeof(struct btrfs_item),
2284 &right->map_token, &right->kaddr,
2285 &right->map_start, &right->map_len,
2286 KM_USER1);
2287 }
2288 push_space -= btrfs_item_size(right, item);
2289 btrfs_set_item_offset(right, item, push_space);
2290 }
2291
2292 if (right->map_token) {
2293 unmap_extent_buffer(right, right->map_token, KM_USER1);
2294 right->map_token = NULL;
2295 }
2296 left_nritems -= push_items;
2297 btrfs_set_header_nritems(left, left_nritems);
2298
2299 if (left_nritems)
2300 btrfs_mark_buffer_dirty(left);
2301 btrfs_mark_buffer_dirty(right);
2302
2303 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2304 BUG_ON(ret);
2305
2306 btrfs_item_key(right, &disk_key, 0);
2307 btrfs_set_node_key(upper, &disk_key, slot + 1);
2308 btrfs_mark_buffer_dirty(upper);
2309
2310 /* then fixup the leaf pointer in the path */
2311 if (path->slots[0] >= left_nritems) {
2312 path->slots[0] -= left_nritems;
2313 if (btrfs_header_nritems(path->nodes[0]) == 0)
2314 clean_tree_block(trans, root, path->nodes[0]);
2315 btrfs_tree_unlock(path->nodes[0]);
2316 free_extent_buffer(path->nodes[0]);
2317 path->nodes[0] = right;
2318 path->slots[1] += 1;
2319 } else {
2320 btrfs_tree_unlock(right);
2321 free_extent_buffer(right);
2322 }
2323 return 0;
2324
2325out_unlock:
2326 btrfs_tree_unlock(right);
2327 free_extent_buffer(right);
2328 return 1;
2329}
2330
2331/*
2332 * push some data in the path leaf to the left, trying to free up at
2333 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2334 */
2335static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2336 *root, struct btrfs_path *path, int data_size,
2337 int empty)
2338{
2339 struct btrfs_disk_key disk_key;
2340 struct extent_buffer *right = path->nodes[0];
2341 struct extent_buffer *left;
2342 int slot;
2343 int i;
2344 int free_space;
2345 int push_space = 0;
2346 int push_items = 0;
2347 struct btrfs_item *item;
2348 u32 old_left_nritems;
2349 u32 right_nritems;
2350 u32 nr;
2351 int ret = 0;
2352 int wret;
2353 u32 this_item_size;
2354 u32 old_left_item_size;
2355
2356 slot = path->slots[1];
2357 if (slot == 0)
2358 return 1;
2359 if (!path->nodes[1])
2360 return 1;
2361
2362 right_nritems = btrfs_header_nritems(right);
2363 if (right_nritems == 0)
2364 return 1;
2365
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2367
2368 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left);
2370 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) {
2372 ret = 1;
2373 goto out;
2374 }
2375
2376 /* cow and double check */
2377 ret = btrfs_cow_block(trans, root, left,
2378 path->nodes[1], slot - 1, &left, 0);
2379 if (ret) {
2380 /* we hit -ENOSPC, but it isn't fatal here */
2381 ret = 1;
2382 goto out;
2383 }
2384
2385 free_space = btrfs_leaf_free_space(root, left);
2386 if (free_space < data_size) {
2387 ret = 1;
2388 goto out;
2389 }
2390
2391 if (empty)
2392 nr = right_nritems;
2393 else
2394 nr = right_nritems - 1;
2395
2396 for (i = 0; i < nr; i++) {
2397 item = btrfs_item_nr(right, i);
2398 if (!right->map_token) {
2399 map_extent_buffer(right, (unsigned long)item,
2400 sizeof(struct btrfs_item),
2401 &right->map_token, &right->kaddr,
2402 &right->map_start, &right->map_len,
2403 KM_USER1);
2404 }
2405
2406 if (!empty && push_items > 0) {
2407 if (path->slots[0] < i)
2408 break;
2409 if (path->slots[0] == i) {
2410 int space = btrfs_leaf_free_space(root, right);
2411 if (space + push_space * 2 > free_space)
2412 break;
2413 }
2414 }
2415
2416 if (path->slots[0] == i)
2417 push_space += data_size;
2418
2419 this_item_size = btrfs_item_size(right, item);
2420 if (this_item_size + sizeof(*item) + push_space > free_space)
2421 break;
2422
2423 push_items++;
2424 push_space += this_item_size + sizeof(*item);
2425 }
2426
2427 if (right->map_token) {
2428 unmap_extent_buffer(right, right->map_token, KM_USER1);
2429 right->map_token = NULL;
2430 }
2431
2432 if (push_items == 0) {
2433 ret = 1;
2434 goto out;
2435 }
2436 if (!empty && push_items == btrfs_header_nritems(right))
2437 WARN_ON(1);
2438
2439 /* push data from right to left */
2440 copy_extent_buffer(left, right,
2441 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2442 btrfs_item_nr_offset(0),
2443 push_items * sizeof(struct btrfs_item));
2444
2445 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2446 btrfs_item_offset_nr(right, push_items - 1);
2447
2448 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2449 leaf_data_end(root, left) - push_space,
2450 btrfs_leaf_data(right) +
2451 btrfs_item_offset_nr(right, push_items - 1),
2452 push_space);
2453 old_left_nritems = btrfs_header_nritems(left);
2454 BUG_ON(old_left_nritems <= 0);
2455
2456 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2457 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2458 u32 ioff;
2459
2460 item = btrfs_item_nr(left, i);
2461 if (!left->map_token) {
2462 map_extent_buffer(left, (unsigned long)item,
2463 sizeof(struct btrfs_item),
2464 &left->map_token, &left->kaddr,
2465 &left->map_start, &left->map_len,
2466 KM_USER1);
2467 }
2468
2469 ioff = btrfs_item_offset(left, item);
2470 btrfs_set_item_offset(left, item,
2471 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2472 }
2473 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2474 if (left->map_token) {
2475 unmap_extent_buffer(left, left->map_token, KM_USER1);
2476 left->map_token = NULL;
2477 }
2478
2479 /* fixup right node */
2480 if (push_items > right_nritems) {
2481 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2482 right_nritems);
2483 WARN_ON(1);
2484 }
2485
2486 if (push_items < right_nritems) {
2487 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2488 leaf_data_end(root, right);
2489 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2490 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2491 btrfs_leaf_data(right) +
2492 leaf_data_end(root, right), push_space);
2493
2494 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2495 btrfs_item_nr_offset(push_items),
2496 (btrfs_header_nritems(right) - push_items) *
2497 sizeof(struct btrfs_item));
2498 }
2499 right_nritems -= push_items;
2500 btrfs_set_header_nritems(right, right_nritems);
2501 push_space = BTRFS_LEAF_DATA_SIZE(root);
2502 for (i = 0; i < right_nritems; i++) {
2503 item = btrfs_item_nr(right, i);
2504
2505 if (!right->map_token) {
2506 map_extent_buffer(right, (unsigned long)item,
2507 sizeof(struct btrfs_item),
2508 &right->map_token, &right->kaddr,
2509 &right->map_start, &right->map_len,
2510 KM_USER1);
2511 }
2512
2513 push_space = push_space - btrfs_item_size(right, item);
2514 btrfs_set_item_offset(right, item, push_space);
2515 }
2516 if (right->map_token) {
2517 unmap_extent_buffer(right, right->map_token, KM_USER1);
2518 right->map_token = NULL;
2519 }
2520
2521 btrfs_mark_buffer_dirty(left);
2522 if (right_nritems)
2523 btrfs_mark_buffer_dirty(right);
2524
2525 ret = btrfs_update_ref(trans, root, right, left,
2526 old_left_nritems, push_items);
2527 BUG_ON(ret);
2528
2529 btrfs_item_key(right, &disk_key, 0);
2530 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2531 if (wret)
2532 ret = wret;
2533
2534 /* then fixup the leaf pointer in the path */
2535 if (path->slots[0] < push_items) {
2536 path->slots[0] += old_left_nritems;
2537 if (btrfs_header_nritems(path->nodes[0]) == 0)
2538 clean_tree_block(trans, root, path->nodes[0]);
2539 btrfs_tree_unlock(path->nodes[0]);
2540 free_extent_buffer(path->nodes[0]);
2541 path->nodes[0] = left;
2542 path->slots[1] -= 1;
2543 } else {
2544 btrfs_tree_unlock(left);
2545 free_extent_buffer(left);
2546 path->slots[0] -= push_items;
2547 }
2548 BUG_ON(path->slots[0] < 0);
2549 return ret;
2550out:
2551 btrfs_tree_unlock(left);
2552 free_extent_buffer(left);
2553 return ret;
2554}
2555
2556/*
2557 * split the path's leaf in two, making sure there is at least data_size
2558 * available for the resulting leaf level of the path.
2559 *
2560 * returns 0 if all went well and < 0 on failure.
2561 */
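/*
 * The split first tries to push items into a neighboring leaf; if
 * that frees enough room no new block is needed.  Otherwise a new
 * right leaf is allocated and the items past the split point 'mid'
 * are moved into it.  Inserts that land exactly at the first or last
 * slot are special cased: the new leaf is left empty and the caller
 * inserts into it directly.
 */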
2562static noinline int split_leaf(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root,
2564 struct btrfs_key *ins_key,
2565 struct btrfs_path *path, int data_size,
2566 int extend)
2567{
2568 struct extent_buffer *l;
2569 u32 nritems;
2570 int mid;
2571 int slot;
2572 struct extent_buffer *right;
2573 int data_copy_size;
2574 int rt_data_off;
2575 int i;
2576 int ret = 0;
2577 int wret;
2578 int double_split;
2579 int num_doubles = 0;
2580 struct btrfs_disk_key disk_key;
2581
2582 /* first try to make some room by pushing left and right */
2583 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2584 wret = push_leaf_right(trans, root, path, data_size, 0);
2585 if (wret < 0)
2586 return wret;
2587 if (wret) {
2588 wret = push_leaf_left(trans, root, path, data_size, 0);
2589 if (wret < 0)
2590 return wret;
2591 }
2592 l = path->nodes[0];
2593
2594 /* did the pushes work? */
2595 if (btrfs_leaf_free_space(root, l) >= data_size)
2596 return 0;
2597 }
2598
2599 if (!path->nodes[1]) {
2600 ret = insert_new_root(trans, root, path, 1);
2601 if (ret)
2602 return ret;
2603 }
2604again:
2605 double_split = 0;
2606 l = path->nodes[0];
2607 slot = path->slots[0];
2608 nritems = btrfs_header_nritems(l);
2609 mid = (nritems + 1) / 2;
2610
2611 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2612 path->nodes[1]->start,
2613 root->root_key.objectid,
2614 trans->transid, 0, l->start, 0);
2615 if (IS_ERR(right)) {
2616 BUG_ON(1);
2617 return PTR_ERR(right);
2618 }
2619
2620 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2621 btrfs_set_header_bytenr(right, right->start);
2622 btrfs_set_header_generation(right, trans->transid);
2623 btrfs_set_header_owner(right, root->root_key.objectid);
2624 btrfs_set_header_level(right, 0);
2625 write_extent_buffer(right, root->fs_info->fsid,
2626 (unsigned long)btrfs_header_fsid(right),
2627 BTRFS_FSID_SIZE);
2628
2629 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2630 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2631 BTRFS_UUID_SIZE);
2632 if (mid <= slot) {
2633 if (nritems == 1 ||
2634 leaf_space_used(l, mid, nritems - mid) + data_size >
2635 BTRFS_LEAF_DATA_SIZE(root)) {
2636 if (slot >= nritems) {
2637 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2638 btrfs_set_header_nritems(right, 0);
2639 wret = insert_ptr(trans, root, path,
2640 &disk_key, right->start,
2641 path->slots[1] + 1, 1);
2642 if (wret)
2643 ret = wret;
2644
2645 btrfs_tree_unlock(path->nodes[0]);
2646 free_extent_buffer(path->nodes[0]);
2647 path->nodes[0] = right;
2648 path->slots[0] = 0;
2649 path->slots[1] += 1;
2650 btrfs_mark_buffer_dirty(right);
2651 return ret;
2652 }
2653 mid = slot;
2654 if (mid != nritems &&
2655 leaf_space_used(l, mid, nritems - mid) +
2656 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2657 double_split = 1;
2658 }
2659 }
2660 } else {
2661 if (leaf_space_used(l, 0, mid) + data_size >
2662 BTRFS_LEAF_DATA_SIZE(root)) {
2663 if (!extend && data_size && slot == 0) {
2664 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2665 btrfs_set_header_nritems(right, 0);
2666 wret = insert_ptr(trans, root, path,
2667 &disk_key,
2668 right->start,
2669 path->slots[1], 1);
2670 if (wret)
2671 ret = wret;
2672 btrfs_tree_unlock(path->nodes[0]);
2673 free_extent_buffer(path->nodes[0]);
2674 path->nodes[0] = right;
2675 path->slots[0] = 0;
2676 if (path->slots[1] == 0) {
2677 wret = fixup_low_keys(trans, root,
2678 path, &disk_key, 1);
2679 if (wret)
2680 ret = wret;
2681 }
2682 btrfs_mark_buffer_dirty(right);
2683 return ret;
2684 } else if ((extend || !data_size) && slot == 0) {
2685 mid = 1;
2686 } else {
2687 mid = slot;
2688 if (mid != nritems &&
2689 leaf_space_used(l, mid, nritems - mid) +
2690 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2691 double_split = 1;
2692 }
2693 }
2694 }
2695 }
2696 nritems = nritems - mid;
2697 btrfs_set_header_nritems(right, nritems);
2698 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2699
2700 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2701 btrfs_item_nr_offset(mid),
2702 nritems * sizeof(struct btrfs_item));
2703
2704 copy_extent_buffer(right, l,
2705 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2706 data_copy_size, btrfs_leaf_data(l) +
2707 leaf_data_end(root, l), data_copy_size);
2708
2709 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2710 btrfs_item_end_nr(l, mid);
2711
2712 for (i = 0; i < nritems; i++) {
2713 struct btrfs_item *item = btrfs_item_nr(right, i);
2714 u32 ioff;
2715
2716 if (!right->map_token) {
2717 map_extent_buffer(right, (unsigned long)item,
2718 sizeof(struct btrfs_item),
2719 &right->map_token, &right->kaddr,
2720 &right->map_start, &right->map_len,
2721 KM_USER1);
2722 }
2723
2724 ioff = btrfs_item_offset(right, item);
2725 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2726 }
2727
2728 if (right->map_token) {
2729 unmap_extent_buffer(right, right->map_token, KM_USER1);
2730 right->map_token = NULL;
2731 }
2732
2733 btrfs_set_header_nritems(l, mid);
2734 ret = 0;
2735 btrfs_item_key(right, &disk_key, 0);
2736 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2737 path->slots[1] + 1, 1);
2738 if (wret)
2739 ret = wret;
2740
2741 btrfs_mark_buffer_dirty(right);
2742 btrfs_mark_buffer_dirty(l);
2743 BUG_ON(path->slots[0] != slot);
2744
2745 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2746 BUG_ON(ret);
2747
2748 if (mid <= slot) {
2749 btrfs_tree_unlock(path->nodes[0]);
2750 free_extent_buffer(path->nodes[0]);
2751 path->nodes[0] = right;
2752 path->slots[0] -= mid;
2753 path->slots[1] += 1;
2754 } else {
2755 btrfs_tree_unlock(right);
2756 free_extent_buffer(right);
2757 }
2758
2759 BUG_ON(path->slots[0] < 0);
2760
2761 if (double_split) {
2762 BUG_ON(num_doubles != 0);
2763 num_doubles++;
2764 goto again;
2765 }
2766 return ret;
2767}
2768
2769/*
2770 * This function splits a single item into two items,
2771 * giving 'new_key' to the new item and splitting the
2772 * old one at split_offset (from the start of the item).
2773 *
2774 * The path may be released by this operation. After
2775 * the split, the path is pointing to the old item. The
2776 * new item is going to be in the same node as the old one.
2777 *
2778 * Note, the item being split must be small enough to live alone on
2779 * a tree block with room for one extra struct btrfs_item
2780 *
2781 * This allows us to split the item in place, keeping a lock on the
2782 * leaf the entire time.
2783 */
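/*
 * For example, splitting a 100 byte item at split_offset 40 leaves
 * the original item holding bytes 0-39 and creates a new item, keyed
 * by new_key, holding bytes 40-99 of the original data.
 */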
2784int btrfs_split_item(struct btrfs_trans_handle *trans,
2785 struct btrfs_root *root,
2786 struct btrfs_path *path,
2787 struct btrfs_key *new_key,
2788 unsigned long split_offset)
2789{
2790 u32 item_size;
2791 struct extent_buffer *leaf;
2792 struct btrfs_key orig_key;
2793 struct btrfs_item *item;
2794 struct btrfs_item *new_item;
2795 int ret = 0;
2796 int slot;
2797 u32 nritems;
2798 u32 orig_offset;
2799 struct btrfs_disk_key disk_key;
2800 char *buf;
2801
2802 leaf = path->nodes[0];
2803 btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
2804 if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
2805 goto split;
2806
2807 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2808 btrfs_release_path(root, path);
2809
2810 path->search_for_split = 1;
2811 path->keep_locks = 1;
2812
2813 ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
2814 path->search_for_split = 0;
2815
2816 /* if our item isn't there or got smaller, return now */
2817 if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
2818 path->slots[0])) {
2819 path->keep_locks = 0;
2820 return -EAGAIN;
2821 }
2822
2823 ret = split_leaf(trans, root, &orig_key, path,
2824 sizeof(struct btrfs_item), 1);
2825 path->keep_locks = 0;
2826 BUG_ON(ret);
2827
2828 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830
2831split:
2832 item = btrfs_item_nr(leaf, path->slots[0]);
2833 orig_offset = btrfs_item_offset(leaf, item);
2834 item_size = btrfs_item_size(leaf, item);
2835
2836
2837 buf = kmalloc(item_size, GFP_NOFS);
if (!buf)
return -ENOMEM;
2838 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
2839 path->slots[0]), item_size);
2840 slot = path->slots[0] + 1;
2841 leaf = path->nodes[0];
2842
2843 nritems = btrfs_header_nritems(leaf);
2844
2845 if (slot != nritems) {
2846 /* shift the items */
2847 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
2848 btrfs_item_nr_offset(slot),
2849 (nritems - slot) * sizeof(struct btrfs_item));
2850
2851 }
2852
2853 btrfs_cpu_key_to_disk(&disk_key, new_key);
2854 btrfs_set_item_key(leaf, &disk_key, slot);
2855
2856 new_item = btrfs_item_nr(leaf, slot);
2857
2858 btrfs_set_item_offset(leaf, new_item, orig_offset);
2859 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
2860
2861 btrfs_set_item_offset(leaf, item,
2862 orig_offset + item_size - split_offset);
2863 btrfs_set_item_size(leaf, item, split_offset);
2864
2865 btrfs_set_header_nritems(leaf, nritems + 1);
2866
2867 /* write the data for the start of the original item */
2868 write_extent_buffer(leaf, buf,
2869 btrfs_item_ptr_offset(leaf, path->slots[0]),
2870 split_offset);
2871
2872 /* write the data for the new item */
2873 write_extent_buffer(leaf, buf + split_offset,
2874 btrfs_item_ptr_offset(leaf, slot),
2875 item_size - split_offset);
2876 btrfs_mark_buffer_dirty(leaf);
2877
2878 ret = 0;
2879 if (btrfs_leaf_free_space(root, leaf) < 0) {
2880 btrfs_print_leaf(root, leaf);
2881 BUG();
2882 }
2883 kfree(buf);
2884 return ret;
2885}
2886
2887/*
2888 * make the item pointed to by the path smaller. new_size indicates
2889 * how small to make it, and from_end tells us if we just chop bytes
2890 * off the end of the item or if we shift the item to chop bytes off
2891 * the front.
2892 */
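/*
 * When trimming from the front, the surviving data is slid toward
 * the end of the leaf and the key offset is bumped by the number of
 * bytes removed, so the key still describes where the remaining data
 * starts.
 */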
2893int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2894 struct btrfs_root *root,
2895 struct btrfs_path *path,
2896 u32 new_size, int from_end)
2897{
2898 int ret = 0;
2899 int slot;
2900 int slot_orig;
2901 struct extent_buffer *leaf;
2902 struct btrfs_item *item;
2903 u32 nritems;
2904 unsigned int data_end;
2905 unsigned int old_data_start;
2906 unsigned int old_size;
2907 unsigned int size_diff;
2908 int i;
2909
2910 slot_orig = path->slots[0];
2911 leaf = path->nodes[0];
2912 slot = path->slots[0];
2913
2914 old_size = btrfs_item_size_nr(leaf, slot);
2915 if (old_size == new_size)
2916 return 0;
2917
2918 nritems = btrfs_header_nritems(leaf);
2919 data_end = leaf_data_end(root, leaf);
2920
2921 old_data_start = btrfs_item_offset_nr(leaf, slot);
2922
2923 size_diff = old_size - new_size;
2924
2925 BUG_ON(slot < 0);
2926 BUG_ON(slot >= nritems);
2927
2928 /*
2929 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2930 */
2931 /* first correct the data pointers */
2932 for (i = slot; i < nritems; i++) {
2933 u32 ioff;
2934 item = btrfs_item_nr(leaf, i);
2935
2936 if (!leaf->map_token) {
2937 map_extent_buffer(leaf, (unsigned long)item,
2938 sizeof(struct btrfs_item),
2939 &leaf->map_token, &leaf->kaddr,
2940 &leaf->map_start, &leaf->map_len,
2941 KM_USER1);
2942 }
2943
2944 ioff = btrfs_item_offset(leaf, item);
2945 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2946 }
2947
2948 if (leaf->map_token) {
2949 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2950 leaf->map_token = NULL;
2951 }
2952
2953 /* shift the data */
2954 if (from_end) {
2955 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2956 data_end + size_diff, btrfs_leaf_data(leaf) +
2957 data_end, old_data_start + new_size - data_end);
2958 } else {
2959 struct btrfs_disk_key disk_key;
2960 u64 offset;
2961
2962 btrfs_item_key(leaf, &disk_key, slot);
2963
2964 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2965 unsigned long ptr;
2966 struct btrfs_file_extent_item *fi;
2967
2968 fi = btrfs_item_ptr(leaf, slot,
2969 struct btrfs_file_extent_item);
2970 fi = (struct btrfs_file_extent_item *)(
2971 (unsigned long)fi - size_diff);
2972
2973 if (btrfs_file_extent_type(leaf, fi) ==
2974 BTRFS_FILE_EXTENT_INLINE) {
2975 ptr = btrfs_item_ptr_offset(leaf, slot);
2976 memmove_extent_buffer(leaf, ptr,
2977 (unsigned long)fi,
2978 offsetof(struct btrfs_file_extent_item,
2979 disk_bytenr));
2980 }
2981 }
2982
2983 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2984 data_end + size_diff, btrfs_leaf_data(leaf) +
2985 data_end, old_data_start - data_end);
2986
2987 offset = btrfs_disk_key_offset(&disk_key);
2988 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2989 btrfs_set_item_key(leaf, &disk_key, slot);
2990 if (slot == 0)
2991 fixup_low_keys(trans, root, path, &disk_key, 1);
2992 }
2993
2994 item = btrfs_item_nr(leaf, slot);
2995 btrfs_set_item_size(leaf, item, new_size);
2996 btrfs_mark_buffer_dirty(leaf);
2997
2998 ret = 0;
2999 if (btrfs_leaf_free_space(root, leaf) < 0) {
3000 btrfs_print_leaf(root, leaf);
3001 BUG();
3002 }
3003 return ret;
3004}
3005
3006/*
3007 * make the item pointed to by the path bigger, data_size is the new size.
3008 */
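/*
 * Growing an item means its data (and the data of every item after
 * it) slides toward the front of the data area, so all of the
 * affected item offsets shrink by data_size.
 */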
3009int btrfs_extend_item(struct btrfs_trans_handle *trans,
3010 struct btrfs_root *root, struct btrfs_path *path,
3011 u32 data_size)
3012{
3013 int ret = 0;
3014 int slot;
3015 int slot_orig;
3016 struct extent_buffer *leaf;
3017 struct btrfs_item *item;
3018 u32 nritems;
3019 unsigned int data_end;
3020 unsigned int old_data;
3021 unsigned int old_size;
3022 int i;
3023
3024 slot_orig = path->slots[0];
3025 leaf = path->nodes[0];
3026
3027 nritems = btrfs_header_nritems(leaf);
3028 data_end = leaf_data_end(root, leaf);
3029
3030 if (btrfs_leaf_free_space(root, leaf) < data_size) {
3031 btrfs_print_leaf(root, leaf);
3032 BUG();
3033 }
3034 slot = path->slots[0];
3035 old_data = btrfs_item_end_nr(leaf, slot);
3036
3037 BUG_ON(slot < 0);
3038 if (slot >= nritems) {
3039 btrfs_print_leaf(root, leaf);
3040 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3041 slot, nritems);
3042 BUG_ON(1);
3043 }
3044
3045 /*
3046 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3047 */
3048 /* first correct the data pointers */
3049 for (i = slot; i < nritems; i++) {
3050 u32 ioff;
3051 item = btrfs_item_nr(leaf, i);
3052
3053 if (!leaf->map_token) {
3054 map_extent_buffer(leaf, (unsigned long)item,
3055 sizeof(struct btrfs_item),
3056 &leaf->map_token, &leaf->kaddr,
3057 &leaf->map_start, &leaf->map_len,
3058 KM_USER1);
3059 }
3060 ioff = btrfs_item_offset(leaf, item);
3061 btrfs_set_item_offset(leaf, item, ioff - data_size);
3062 }
3063
3064 if (leaf->map_token) {
3065 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3066 leaf->map_token = NULL;
3067 }
3068
3069 /* shift the data */
3070 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3071 data_end - data_size, btrfs_leaf_data(leaf) +
3072 data_end, old_data - data_end);
3073
3074 data_end = old_data;
3075 old_size = btrfs_item_size_nr(leaf, slot);
3076 item = btrfs_item_nr(leaf, slot);
3077 btrfs_set_item_size(leaf, item, old_size + data_size);
3078 btrfs_mark_buffer_dirty(leaf);
3079
3080 ret = 0;
3081 if (btrfs_leaf_free_space(root, leaf) < 0) {
3082 btrfs_print_leaf(root, leaf);
3083 BUG();
3084 }
3085 return ret;
3086}
3087
3088/*
3089 * Given a key and some data, insert items into the tree.
3090 * This does all the path init required, making room in the tree if needed.
3091 * Returns the number of keys that were inserted.
3092 */
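/*
 * Note that fewer than nr items may be inserted: the batch is
 * trimmed to what fits in one leaf, and when inserting in the middle
 * of a leaf it is further trimmed to the keys that sort before the
 * item already occupying the slot.
 */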
3093int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3094 struct btrfs_root *root,
3095 struct btrfs_path *path,
3096 struct btrfs_key *cpu_key, u32 *data_size,
3097 int nr)
3098{
3099 struct extent_buffer *leaf;
3100 struct btrfs_item *item;
3101 int ret = 0;
3102 int slot;
3103 int i;
3104 u32 nritems;
3105 u32 total_data = 0;
3106 u32 total_size = 0;
3107 unsigned int data_end;
3108 struct btrfs_disk_key disk_key;
3109 struct btrfs_key found_key;
3110
3111 for (i = 0; i < nr; i++) {
3112 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3113 BTRFS_LEAF_DATA_SIZE(root)) {
3114 nr = i;
3115 break;
3116 }
3117 total_data += data_size[i];
3118 total_size += data_size[i] + sizeof(struct btrfs_item);
3119 }
3120 BUG_ON(nr == 0);
3121
3122 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3123 if (ret == 0)
3124 return -EEXIST;
3125 if (ret < 0)
3126 goto out;
3127
3128 leaf = path->nodes[0];
3129
3130 nritems = btrfs_header_nritems(leaf);
3131 data_end = leaf_data_end(root, leaf);
3132
3133 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3134 for (i = nr - 1; i >= 0; i--) {
3135 total_data -= data_size[i];
3136 total_size -= data_size[i] + sizeof(struct btrfs_item);
3137 if (total_size < btrfs_leaf_free_space(root, leaf))
3138 break;
3139 }
3140 nr = i;
3141 }
3142
3143 slot = path->slots[0];
3144 BUG_ON(slot < 0);
3145
3146 if (slot != nritems) {
3147 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3148
3149 item = btrfs_item_nr(leaf, slot);
3150 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3151
3152 /* figure out how many keys we can insert in here */
3153 total_data = data_size[0];
3154 for (i = 1; i < nr; i++) {
3155 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3156 break;
3157 total_data += data_size[i];
3158 }
3159 nr = i;
3160
3161 if (old_data < data_end) {
3162 btrfs_print_leaf(root, leaf);
3163 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3164 slot, old_data, data_end);
3165 BUG_ON(1);
3166 }
3167 /*
3168 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3169 */
3170 /* first correct the data pointers */
3171 WARN_ON(leaf->map_token);
3172 for (i = slot; i < nritems; i++) {
3173 u32 ioff;
3174
3175 item = btrfs_item_nr(leaf, i);
3176 if (!leaf->map_token) {
3177 map_extent_buffer(leaf, (unsigned long)item,
3178 sizeof(struct btrfs_item),
3179 &leaf->map_token, &leaf->kaddr,
3180 &leaf->map_start, &leaf->map_len,
3181 KM_USER1);
3182 }
3183
3184 ioff = btrfs_item_offset(leaf, item);
3185 btrfs_set_item_offset(leaf, item, ioff - total_data);
3186 }
3187 if (leaf->map_token) {
3188 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3189 leaf->map_token = NULL;
3190 }
3191
3192 /* shift the items */
3193 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3194 btrfs_item_nr_offset(slot),
3195 (nritems - slot) * sizeof(struct btrfs_item));
3196
3197 /* shift the data */
3198 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3199 data_end - total_data, btrfs_leaf_data(leaf) +
3200 data_end, old_data - data_end);
3201 data_end = old_data;
3202 } else {
3203 /*
3204 * this sucks, but it has to be done: if we are inserting at
3205 * the end of the leaf, only insert 1 of the items, since we
3206 * have no way of knowing what's on the next leaf and we'd have
3207 * to drop our current locks to figure it out
3208 */
3209 nr = 1;
3210 }
3211
3212 /* setup the item for the new data */
3213 for (i = 0; i < nr; i++) {
3214 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3215 btrfs_set_item_key(leaf, &disk_key, slot + i);
3216 item = btrfs_item_nr(leaf, slot + i);
3217 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3218 data_end -= data_size[i];
3219 btrfs_set_item_size(leaf, item, data_size[i]);
3220 }
3221 btrfs_set_header_nritems(leaf, nritems + nr);
3222 btrfs_mark_buffer_dirty(leaf);
3223
3224 ret = 0;
3225 if (slot == 0) {
3226 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3227 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3228 }
3229
3230 if (btrfs_leaf_free_space(root, leaf) < 0) {
3231 btrfs_print_leaf(root, leaf);
3232 BUG();
3233 }
3234out:
3235 if (!ret)
3236 ret = nr;
3237 return ret;
3238}
3239
3240/*
3241 * Given a key and some data, insert items into the tree.
3242 * This does all the path init required, making room in the tree if needed.
3243 */
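/*
 * The items are created empty: only the keys and the offset/size
 * headers are filled in, and the caller is expected to write the
 * actual item data afterwards (see btrfs_insert_item below).
 */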
3244int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3245 struct btrfs_root *root,
3246 struct btrfs_path *path,
3247 struct btrfs_key *cpu_key, u32 *data_size,
3248 int nr)
3249{
3250 struct extent_buffer *leaf;
3251 struct btrfs_item *item;
3252 int ret = 0;
3253 int slot;
3254 int slot_orig;
3255 int i;
3256 u32 nritems;
3257 u32 total_size = 0;
3258 u32 total_data = 0;
3259 unsigned int data_end;
3260 struct btrfs_disk_key disk_key;
3261
3262 for (i = 0; i < nr; i++)
3263 total_data += data_size[i];
3264
3265 total_size = total_data + (nr * sizeof(struct btrfs_item));
3266 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3267 if (ret == 0)
3268 return -EEXIST;
3269 if (ret < 0)
3270 goto out;
3271
3272 slot_orig = path->slots[0];
3273 leaf = path->nodes[0];
3274
3275 nritems = btrfs_header_nritems(leaf);
3276 data_end = leaf_data_end(root, leaf);
3277
3278 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3279 btrfs_print_leaf(root, leaf);
3280 printk(KERN_CRIT "not enough freespace need %u have %d\n",
3281 total_size, btrfs_leaf_free_space(root, leaf));
3282 BUG();
3283 }
3284
3285 slot = path->slots[0];
3286 BUG_ON(slot < 0);
3287
3288 if (slot != nritems) {
3289 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3290
3291 if (old_data < data_end) {
3292 btrfs_print_leaf(root, leaf);
3293 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3294 slot, old_data, data_end);
3295 BUG_ON(1);
3296 }
3297 /*
3298 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3299 */
3300 /* first correct the data pointers */
3301 WARN_ON(leaf->map_token);
3302 for (i = slot; i < nritems; i++) {
3303 u32 ioff;
3304
3305 item = btrfs_item_nr(leaf, i);
3306 if (!leaf->map_token) {
3307 map_extent_buffer(leaf, (unsigned long)item,
3308 sizeof(struct btrfs_item),
3309 &leaf->map_token, &leaf->kaddr,
3310 &leaf->map_start, &leaf->map_len,
3311 KM_USER1);
3312 }
3313
3314 ioff = btrfs_item_offset(leaf, item);
3315 btrfs_set_item_offset(leaf, item, ioff - total_data);
3316 }
3317 if (leaf->map_token) {
3318 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3319 leaf->map_token = NULL;
3320 }
3321
3322 /* shift the items */
3323 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3324 btrfs_item_nr_offset(slot),
3325 (nritems - slot) * sizeof(struct btrfs_item));
3326
3327 /* shift the data */
3328 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3329 data_end - total_data, btrfs_leaf_data(leaf) +
3330 data_end, old_data - data_end);
3331 data_end = old_data;
3332 }
3333
3334 /* setup the item for the new data */
3335 for (i = 0; i < nr; i++) {
3336 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3337 btrfs_set_item_key(leaf, &disk_key, slot + i);
3338 item = btrfs_item_nr(leaf, slot + i);
3339 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3340 data_end -= data_size[i];
3341 btrfs_set_item_size(leaf, item, data_size[i]);
3342 }
3343 btrfs_set_header_nritems(leaf, nritems + nr);
3344 btrfs_mark_buffer_dirty(leaf);
3345
3346 ret = 0;
3347 if (slot == 0) {
3348 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3349 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3350 }
3351
3352 if (btrfs_leaf_free_space(root, leaf) < 0) {
3353 btrfs_print_leaf(root, leaf);
3354 BUG();
3355 }
3356out:
3357 return ret;
3358}
3359
3360/*
3361 * Given a key and some data, insert an item into the tree.
3362 * This does all the path init required, making room in the tree if needed.
3363 */
3364int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3365 *root, struct btrfs_key *cpu_key, void *data, u32
3366 data_size)
3367{
3368 int ret = 0;
3369 struct btrfs_path *path;
3370 struct extent_buffer *leaf;
3371 unsigned long ptr;
3372
3373 path = btrfs_alloc_path();
3374 BUG_ON(!path);
3375 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3376 if (!ret) {
3377 leaf = path->nodes[0];
3378 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3379 write_extent_buffer(leaf, data, ptr, data_size);
3380 btrfs_mark_buffer_dirty(leaf);
3381 }
3382 btrfs_free_path(path);
3383 return ret;
3384}
3385
3386/*
3387 * delete the pointer from a given node.
3388 *
3389 * the tree should have been previously balanced so the deletion does not
3390 * empty a node.
3391 */
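/*
 * the one exception is the root itself: if deleting the last pointer
 * would empty the root node, the root is demoted to an empty leaf
 * instead
 */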
3392static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3393 struct btrfs_path *path, int level, int slot)
3394{
3395 struct extent_buffer *parent = path->nodes[level];
3396 u32 nritems;
3397 int ret = 0;
3398 int wret;
3399
3400 nritems = btrfs_header_nritems(parent);
3401 if (slot != nritems - 1) {
3402 memmove_extent_buffer(parent,
3403 btrfs_node_key_ptr_offset(slot),
3404 btrfs_node_key_ptr_offset(slot + 1),
3405 sizeof(struct btrfs_key_ptr) *
3406 (nritems - slot - 1));
3407 }
3408 nritems--;
3409 btrfs_set_header_nritems(parent, nritems);
3410 if (nritems == 0 && parent == root->node) {
3411 BUG_ON(btrfs_header_level(root->node) != 1);
3412 /* just turn the root into a leaf and break */
3413 btrfs_set_header_level(root->node, 0);
3414 } else if (slot == 0) {
3415 struct btrfs_disk_key disk_key;
3416
3417 btrfs_node_key(parent, &disk_key, 0);
3418 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3419 if (wret)
3420 ret = wret;
3421 }
3422 btrfs_mark_buffer_dirty(parent);
3423 return ret;
3424}
3425
3426/*
3427 * a helper function to delete the leaf pointed to by path->slots[1] and
3428 * path->nodes[1]. bytenr is the node block pointer, but since the callers
3429 * already know it, it is faster to have them pass it down than to
3430 * read it out of the node again.
3431 *
3432 * This deletes the pointer in path->nodes[1] and frees the leaf
3433 * block extent. zero is returned if it all worked out, < 0 otherwise.
3434 *
3435 * The path must have already been setup for deleting the leaf, including
3436 * all the proper balancing. path->nodes[1] must be locked.
3437 */
3438noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3439 struct btrfs_root *root,
3440 struct btrfs_path *path, u64 bytenr)
3441{
3442 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3444
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret)
3447 return ret;
3448
3449 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0),
3451 path->nodes[1]->start,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1);
3454 return ret;
3455}
3456/*
3457 * delete the item at the leaf level in path. If that empties
3458 * the leaf, remove it from the tree
3459 */
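/*
 * this also rebalances: a leaf left using less than a quarter of its
 * data area has its remaining items pushed into the neighboring
 * leaves, and is freed if that empties it
 */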
3460int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3461 struct btrfs_path *path, int slot, int nr)
3462{
3463 struct extent_buffer *leaf;
3464 struct btrfs_item *item;
3465 int last_off;
3466 int dsize = 0;
3467 int ret = 0;
3468 int wret;
3469 int i;
3470 u32 nritems;
3471
3472 leaf = path->nodes[0];
3473 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3474
3475 for (i = 0; i < nr; i++)
3476 dsize += btrfs_item_size_nr(leaf, slot + i);
3477
3478 nritems = btrfs_header_nritems(leaf);
3479
3480 if (slot + nr != nritems) {
3481 int data_end = leaf_data_end(root, leaf);
3482
3483 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3484 data_end + dsize,
3485 btrfs_leaf_data(leaf) + data_end,
3486 last_off - data_end);
3487
3488 for (i = slot + nr; i < nritems; i++) {
3489 u32 ioff;
3490
3491 item = btrfs_item_nr(leaf, i);
3492 if (!leaf->map_token) {
3493 map_extent_buffer(leaf, (unsigned long)item,
3494 sizeof(struct btrfs_item),
3495 &leaf->map_token, &leaf->kaddr,
3496 &leaf->map_start, &leaf->map_len,
3497 KM_USER1);
3498 }
3499 ioff = btrfs_item_offset(leaf, item);
3500 btrfs_set_item_offset(leaf, item, ioff + dsize);
3501 }
3502
3503 if (leaf->map_token) {
3504 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3505 leaf->map_token = NULL;
3506 }
3507
3508 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3509 btrfs_item_nr_offset(slot + nr),
3510 sizeof(struct btrfs_item) *
3511 (nritems - slot - nr));
3512 }
3513 btrfs_set_header_nritems(leaf, nritems - nr);
3514 nritems -= nr;
3515
3516 /* delete the leaf if we've emptied it */
3517 if (nritems == 0) {
3518 if (leaf == root->node) {
3519 btrfs_set_header_level(leaf, 0);
3520 } else {
3521 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3522 BUG_ON(ret);
3523 }
3524 } else {
3525 int used = leaf_space_used(leaf, 0, nritems);
3526 if (slot == 0) {
3527 struct btrfs_disk_key disk_key;
3528
3529 btrfs_item_key(leaf, &disk_key, 0);
3530 wret = fixup_low_keys(trans, root, path,
3531 &disk_key, 1);
3532 if (wret)
3533 ret = wret;
3534 }
3535
3536 /* delete the leaf if it is mostly empty */
3537 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3538 /* push_leaf_left fixes the path.
3539 * make sure the path still points to our leaf
3540 * for possible call to del_ptr below
3541 */
3542 slot = path->slots[1];
3543 extent_buffer_get(leaf);
3544
3545 wret = push_leaf_left(trans, root, path, 1, 1);
3546 if (wret < 0 && wret != -ENOSPC)
3547 ret = wret;
3548
3549 if (path->nodes[0] == leaf &&
3550 btrfs_header_nritems(leaf)) {
3551 wret = push_leaf_right(trans, root, path, 1, 1);
3552 if (wret < 0 && wret != -ENOSPC)
3553 ret = wret;
3554 }
3555
3556 if (btrfs_header_nritems(leaf) == 0) {
3557 path->slots[1] = slot;
3558 ret = btrfs_del_leaf(trans, root, path,
3559 leaf->start);
3560 BUG_ON(ret);
3561 free_extent_buffer(leaf);
3562 } else {
3563 /* if we're still in the path, make sure
3564 * we're dirty. Otherwise, one of the
3565 * push_leaf functions must have already
3566 * dirtied this buffer
3567 */
3568 if (path->nodes[0] == leaf)
3569 btrfs_mark_buffer_dirty(leaf);
3570 free_extent_buffer(leaf);
3571 }
3572 } else {
3573 btrfs_mark_buffer_dirty(leaf);
3574 }
3575 }
3576 return ret;
3577}
3578
3579/*
3580 * search the tree again to find a leaf with lesser keys
3581 * returns 0 if it found something or 1 if there are no lesser leaves.
3582 * returns < 0 on io errors.
3583 *
3584 * This may release the path, and so you may lose any locks held at the
3585 * time you call it.
3586 */
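/*
 * works by cooking up a key strictly smaller than the first key in
 * the current leaf and searching for it again from the root
 */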
3587int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3588{
3589 struct btrfs_key key;
3590 struct btrfs_disk_key found_key;
3591 int ret;
3592
3593 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3594
3595 if (key.offset > 0)
3596 key.offset--;
3597 else if (key.type > 0)
3598 key.type--;
3599 else if (key.objectid > 0)
3600 key.objectid--;
3601 else
3602 return 1;
3603
3604 btrfs_release_path(root, path);
3605 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3606 if (ret < 0)
3607 return ret;
3608 btrfs_item_key(path->nodes[0], &found_key, 0);
3609 ret = comp_keys(&found_key, &key);
3610 if (ret < 0)
3611 return 0;
3612 return 1;
3613}
3614
3615/*
3616 * A helper function to walk down the tree starting at min_key, and looking
3617 * for nodes or leaves that are either in cache or have a minimum
3618 * transaction id. This is used by the btree defrag code, and tree logging
3619 *
3620 * This does not cow, but it does stuff the starting key it finds back
3621 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3622 * key and get a writable path.
3623 *
3624 * This does lock as it descends, and path->keep_locks should be set
3625 * to 1 by the caller.
3626 *
3627 * This honors path->lowest_level to prevent descent past a given level
3628 * of the tree.
3629 *
3630 * min_trans indicates the oldest transaction that you are interested
3631 * in walking through. Any nodes or leaves older than min_trans are
3632 * skipped over (without reading them).
3633 *
3634 * returns zero if something useful was found, < 0 on error and 1 if there
3635 * was nothing in the tree that matched the search criteria.
3636 */
3637int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3638 struct btrfs_key *max_key,
3639 struct btrfs_path *path, int cache_only,
3640 u64 min_trans)
3641{
3642 struct extent_buffer *cur;
3643 struct btrfs_key found_key;
3644 int slot;
3645 int sret;
3646 u32 nritems;
3647 int level;
3648 int ret = 1;
3649
3650 WARN_ON(!path->keep_locks);
3651again:
3652 cur = btrfs_lock_root_node(root);
3653 level = btrfs_header_level(cur);
3654 WARN_ON(path->nodes[level]);
3655 path->nodes[level] = cur;
3656 path->locks[level] = 1;
3657
3658 if (btrfs_header_generation(cur) < min_trans) {
3659 ret = 1;
3660 goto out;
3661 }
3662 while (1) {
3663 nritems = btrfs_header_nritems(cur);
3664 level = btrfs_header_level(cur);
3665 sret = bin_search(cur, min_key, level, &slot);
3666
3667 /* at the lowest level, we're done, setup the path and exit */
3668 if (level == path->lowest_level) {
3669 if (slot >= nritems)
3670 goto find_next_key;
3671 ret = 0;
3672 path->slots[level] = slot;
3673 btrfs_item_key_to_cpu(cur, &found_key, slot);
3674 goto out;
3675 }
3676 if (sret && slot > 0)
3677 slot--;
3678 /*
3679 * check this node pointer against the cache_only and
3680 * min_trans parameters. If it isn't in cache or is too
3681 * old, skip to the next one.
3682 */
3683 while (slot < nritems) {
3684 u64 blockptr;
3685 u64 gen;
3686 struct extent_buffer *tmp;
3687 struct btrfs_disk_key disk_key;
3688
3689 blockptr = btrfs_node_blockptr(cur, slot);
3690 gen = btrfs_node_ptr_generation(cur, slot);
3691 if (gen < min_trans) {
3692 slot++;
3693 continue;
3694 }
3695 if (!cache_only)
3696 break;
3697
3698 if (max_key) {
3699 btrfs_node_key(cur, &disk_key, slot);
3700 if (comp_keys(&disk_key, max_key) >= 0) {
3701 ret = 1;
3702 goto out;
3703 }
3704 }
3705
3706 tmp = btrfs_find_tree_block(root, blockptr,
3707 btrfs_level_size(root, level - 1));
3708
3709 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3710 free_extent_buffer(tmp);
3711 break;
3712 }
3713 if (tmp)
3714 free_extent_buffer(tmp);
3715 slot++;
3716 }
3717find_next_key:
3718 /*
3719 * we didn't find a candidate key in this node, walk forward
3720 * and find another one
3721 */
3722 if (slot >= nritems) {
3723 path->slots[level] = slot;
3724 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans);
3726 if (sret == 0) {
3727 btrfs_release_path(root, path);
3728 goto again;
3729 } else {
3730 goto out;
3731 }
3732 }
3733 /* save our key for returning back */
3734 btrfs_node_key_to_cpu(cur, &found_key, slot);
3735 path->slots[level] = slot;
3736 if (level == path->lowest_level) {
3737 ret = 0;
3738 unlock_up(path, level, 1);
3739 goto out;
3740 }
3741 cur = read_node_slot(root, cur, slot);
3742
3743 btrfs_tree_lock(cur);
3744 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1);
3747 }
3748out:
3749 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key));
3751 return ret;
3752}
3753
3754/*
3755 * this is similar to btrfs_next_leaf, but does not try to preserve
3756 * and fixup the path. It looks for and returns the next key in the
3757 * tree based on the current path and the cache_only and min_trans
3758 * parameters.
3759 *
3760 * 0 is returned if another key is found, < 0 if there are any errors
3761 * and 1 is returned if there are no higher keys in the tree
3762 *
3763 * path->keep_locks should be set to 1 on the search made before
3764 * calling this function.
3765 */
3766int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3767 struct btrfs_key *key, int lowest_level,
3768 int cache_only, u64 min_trans)
3769{
3770 int level = lowest_level;
3771 int slot;
3772 struct extent_buffer *c;
3773
3774 WARN_ON(!path->keep_locks);
3775 while (level < BTRFS_MAX_LEVEL) {
3776 if (!path->nodes[level])
3777 return 1;
3778
3779 slot = path->slots[level] + 1;
3780 c = path->nodes[level];
3781next:
3782 if (slot >= btrfs_header_nritems(c)) {
3783 level++;
3784 if (level == BTRFS_MAX_LEVEL)
3785 return 1;
3786 continue;
3787 }
3788 if (level == 0)
3789 btrfs_item_key_to_cpu(c, key, slot);
3790 else {
3791 u64 blockptr = btrfs_node_blockptr(c, slot);
3792 u64 gen = btrfs_node_ptr_generation(c, slot);
3793
3794 if (cache_only) {
3795 struct extent_buffer *cur;
3796 cur = btrfs_find_tree_block(root, blockptr,
3797 btrfs_level_size(root, level - 1));
3798 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3799 slot++;
3800 if (cur)
3801 free_extent_buffer(cur);
3802 goto next;
3803 }
3804 free_extent_buffer(cur);
3805 }
3806 if (gen < min_trans) {
3807 slot++;
3808 goto next;
3809 }
3810 btrfs_node_key_to_cpu(c, key, slot);
3811 }
3812 return 0;
3813 }
3814 return 1;
3815}
3816
3817/*
3818 * search the tree again to find a leaf with greater keys
3819 * returns 0 if it found something or 1 if there are no greater leaves.
3820 * returns < 0 on io errors.
3821 */
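/*
 * works by re-searching for the last key in the current leaf, then
 * walking up until a node with a right sibling pointer is found and
 * descending along the leftmost edge of that sibling
 */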
3822int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3823{
3824 int slot;
3825 int level = 1;
3826 struct extent_buffer *c;
3827 struct extent_buffer *next = NULL;
3828 struct btrfs_key key;
3829 u32 nritems;
3830 int ret;
3831
3832 nritems = btrfs_header_nritems(path->nodes[0]);
3833 if (nritems == 0)
3834 return 1;
3835
3836 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3837
3838 btrfs_release_path(root, path);
3839 path->keep_locks = 1;
3840 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3841 path->keep_locks = 0;
3842
3843 if (ret < 0)
3844 return ret;
3845
3846 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /*
3848 * by releasing the path above we dropped all our locks. A balance
3849 * could have added more items next to the key that used to be
3850 * at the very end of the block. So, check again here and
3851 * advance the path if there are now more items available.
3852 */
3853 if (nritems > 0 && path->slots[0] < nritems - 1) {
3854 path->slots[0]++;
3855 goto done;
3856 }
3857
3858 while (level < BTRFS_MAX_LEVEL) {
3859 if (!path->nodes[level])
3860 return 1;
3861
3862 slot = path->slots[level] + 1;
3863 c = path->nodes[level];
3864 if (slot >= btrfs_header_nritems(c)) {
3865 level++;
3866 if (level == BTRFS_MAX_LEVEL)
3867 return 1;
3868 continue;
3869 }
3870
3871 if (next) {
3872 btrfs_tree_unlock(next);
3873 free_extent_buffer(next);
3874 }
3875
3876 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada)
3878 reada_for_search(root, path, level, slot, 0);
3879
3880 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next);
3884 }
3885 break;
3886 }
3887 path->slots[level] = slot;
3888 while (1) {
3889 level--;
3890 c = path->nodes[level];
3891 if (path->locks[level])
3892 btrfs_tree_unlock(c);
3893 free_extent_buffer(c);
3894 path->nodes[level] = next;
3895 path->slots[level] = 0;
3896 if (!path->skip_locking)
3897 path->locks[level] = 1;
3898 if (!level)
3899 break;
3900 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next);
3906 }
3907 }
3908done:
3909 unlock_up(path, 0, 1);
3910 return 0;
3911}
3912
3913/*
3914 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3915 * searching until it gets past min_objectid or finds an item of 'type'
3916 *
3917 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3918 */
3919int btrfs_previous_item(struct btrfs_root *root,
3920 struct btrfs_path *path, u64 min_objectid,
3921 int type)
3922{
3923 struct btrfs_key found_key;
3924 struct extent_buffer *leaf;
3925 u32 nritems;
3926 int ret;
3927
3928 while (1) {
3929 if (path->slots[0] == 0) {
3930 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0)
3932 return ret;
3933 } else {
3934 path->slots[0]--;
3935 }
3936 leaf = path->nodes[0];
3937 nritems = btrfs_header_nritems(leaf);
3938 if (nritems == 0)
3939 return 1;
3940 if (path->slots[0] == nritems)
3941 path->slots[0]--;
3942
3943 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3944 if (found_key.type == type)
3945 return 0;
3946 if (found_key.objectid < min_objectid)
3947 break;
3948 if (found_key.objectid == min_objectid &&
3949 found_key.type < type)
3950 break;
3951 }
3952 return 1;
3953}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..eee060f88113
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2129 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "extent_io.h"
31#include "extent_map.h"
32#include "async-thread.h"
33
34struct btrfs_trans_handle;
35struct btrfs_transaction;
36extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep;
39extern struct kmem_cache *btrfs_path_cachep;
40struct btrfs_ordered_sum;
41
42#define BTRFS_MAGIC "_BHRfS_M"
43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#ifdef CONFIG_LOCKDEP
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51
52/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL
54
55/* stores information about which extents are in use, and reference counts */
56#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
57
58/*
59 * chunk tree stores translations from logical -> physical block numbering
60 * the super block points to the chunk tree
61 */
62#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
63
64/*
65 * stores information about which areas of a given device are in use.
66 * one per device. The tree of tree roots points to the device tree
67 */
68#define BTRFS_DEV_TREE_OBJECTID 4ULL
69
70/* one per subvolume, storing files and directories */
71#define BTRFS_FS_TREE_OBJECTID 5ULL
72
73/* directory objectid inside the root tree */
74#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
75
76/* holds checksums of all the data extents */
77#define BTRFS_CSUM_TREE_OBJECTID 7ULL
78
79/* orphan objectid for tracking unlinked/truncated files */
80#define BTRFS_ORPHAN_OBJECTID -5ULL
81
82/* does write ahead logging to speed up fsyncs */
83#define BTRFS_TREE_LOG_OBJECTID -6ULL
84#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
85
86/* for space balancing */
87#define BTRFS_TREE_RELOC_OBJECTID -8ULL
88#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
89
90/*
91 * extent checksums all have this objectid
92 * this allows them to share the logging tree
93 * for fsyncs
94 */
95#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
96
97/* dummy objectid represents multiple objectids */
98#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
99
100/*
101 * All files have objectids in this range.
102 */
103#define BTRFS_FIRST_FREE_OBJECTID 256ULL
104#define BTRFS_LAST_FREE_OBJECTID -256ULL
105#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
106
107
108/*
109 * the device items go into the chunk tree. The key is in the form
110 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
111 */
112#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
113
114/*
115 * we can actually store much bigger names, but let's not confuse the rest
116 * of linux
117 */
118#define BTRFS_NAME_LEN 255
119
120/* 32 bytes in various csum fields */
121#define BTRFS_CSUM_SIZE 32
122
123/* csum types */
124#define BTRFS_CSUM_TYPE_CRC32 0
125
126static int btrfs_csum_sizes[] = { 4, 0 };
127
128/* four bytes for CRC32 */
129#define BTRFS_EMPTY_DIR_SIZE 0
130
131#define BTRFS_FT_UNKNOWN 0
132#define BTRFS_FT_REG_FILE 1
133#define BTRFS_FT_DIR 2
134#define BTRFS_FT_CHRDEV 3
135#define BTRFS_FT_BLKDEV 4
136#define BTRFS_FT_FIFO 5
137#define BTRFS_FT_SOCK 6
138#define BTRFS_FT_SYMLINK 7
139#define BTRFS_FT_XATTR 8
140#define BTRFS_FT_MAX 9
141
142/*
143 * the key defines the order in the tree, and so it also defines (optimal)
144 * block layout. objectid corresponds to the inode number. The flags
145 * tell us things about the object, and act as a kind of stream selector.
146 * so for a given inode, keys with flags of 1 might refer to the inode
147 * data, flags of 2 may point to file data in the btree and flags == 3
148 * may point to extents.
149 *
150 * offset is the starting byte offset for this key in the stream.
151 *
152 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
153 * in cpu native order. Otherwise they are identical and their sizes
154 * should be the same (ie both packed)
155 */
156struct btrfs_disk_key {
157 __le64 objectid;
158 u8 type;
159 __le64 offset;
160} __attribute__ ((__packed__));
161
162struct btrfs_key {
163 u64 objectid;
164 u8 type;
165 u64 offset;
166} __attribute__ ((__packed__));
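/*
 * keys compare by objectid first, then type, then offset, so all the
 * items for one object sort together, grouped by stream: e.g.
 * (256 INODE_ITEM 0) < (256 EXTENT_DATA 0) < (256 EXTENT_DATA 4096)
 * < (257 INODE_ITEM 0)
 */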
167
168struct btrfs_mapping_tree {
169 struct extent_map_tree map_tree;
170};
171
172#define BTRFS_UUID_SIZE 16
173struct btrfs_dev_item {
174 /* the internal btrfs device id */
175 __le64 devid;
176
177 /* size of the device */
178 __le64 total_bytes;
179
180 /* bytes used */
181 __le64 bytes_used;
182
183 /* optimal io alignment for this device */
184 __le32 io_align;
185
186 /* optimal io width for this device */
187 __le32 io_width;
188
189 /* minimal io size for this device */
190 __le32 sector_size;
191
192 /* type and info about this device */
193 __le64 type;
194
195 /* expected generation for this device */
196 __le64 generation;
197
198 /*
199 * starting byte of this partition on the device,
200 * to allow for stripe alignment in the future
201 */
202 __le64 start_offset;
203
204 /* grouping information for allocation decisions */
205 __le32 dev_group;
206
207 /* seek speed 0-100 where 100 is fastest */
208 u8 seek_speed;
209
210 /* bandwidth 0-100 where 100 is fastest */
211 u8 bandwidth;
212
213 /* btrfs generated uuid for this device */
214 u8 uuid[BTRFS_UUID_SIZE];
215
216 /* uuid of FS who owns this device */
217 u8 fsid[BTRFS_UUID_SIZE];
218} __attribute__ ((__packed__));
219
220struct btrfs_stripe {
221 __le64 devid;
222 __le64 offset;
223 u8 dev_uuid[BTRFS_UUID_SIZE];
224} __attribute__ ((__packed__));
225
226struct btrfs_chunk {
227 /* size of this chunk in bytes */
228 __le64 length;
229
230 /* objectid of the root referencing this chunk */
231 __le64 owner;
232
233 __le64 stripe_len;
234 __le64 type;
235
236 /* optimal io alignment for this chunk */
237 __le32 io_align;
238
239 /* optimal io width for this chunk */
240 __le32 io_width;
241
242 /* minimal io size for this chunk */
243 __le32 sector_size;
244
245 /* 2^16 stripes is quite a lot; a second limit is the size of a single
246 * item in the btree
247 */
248 __le16 num_stripes;
249
250 /* sub stripes only matter for raid10 */
251 __le16 sub_stripes;
252 struct btrfs_stripe stripe;
253 /* additional stripes go here */
254} __attribute__ ((__packed__));
255
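/*
 * one struct btrfs_stripe is embedded in struct btrfs_chunk itself,
 * hence the num_stripes - 1 in the size calculation below
 */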
256static inline unsigned long btrfs_chunk_item_size(int num_stripes)
257{
258 BUG_ON(num_stripes == 0);
259 return sizeof(struct btrfs_chunk) +
260 sizeof(struct btrfs_stripe) * (num_stripes - 1);
261}
262
263#define BTRFS_FSID_SIZE 16
264#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
265
266/*
267 * every tree block (leaf or node) starts with this header.
268 */
269struct btrfs_header {
270 /* these first four must match the super block */
271 u8 csum[BTRFS_CSUM_SIZE];
272 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
273 __le64 bytenr; /* which block this node is supposed to live in */
274 __le64 flags;
275
276 /* allowed to be different from the super from here on down */
277 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
278 __le64 generation;
279 __le64 owner;
280 __le32 nritems;
281 u8 level;
282} __attribute__ ((__packed__));
283
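/*
 * sizing helpers: a node holds as many key/blockptr pairs as fit
 * after the header, a leaf's data area is everything after the
 * header, and an inline file extent must fit in that area together
 * with its item header and btrfs_file_extent_item metadata
 */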
284#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
285 sizeof(struct btrfs_header)) / \
286 sizeof(struct btrfs_key_ptr))
287#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
288#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
289#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
290 sizeof(struct btrfs_item) - \
291 sizeof(struct btrfs_file_extent_item))
292
293#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
294
295/*
296 * this is a very generous portion of the super block, giving us
297 * room to translate 14 chunks with 3 stripes each.
298 */
299#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
300#define BTRFS_LABEL_SIZE 256
301
302/*
303 * the super block basically lists the main trees of the FS
304 * it currently lacks any block count etc etc
305 */
306struct btrfs_super_block {
307 u8 csum[BTRFS_CSUM_SIZE];
308 /* the first 4 fields must match struct btrfs_header */
309 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
310 __le64 bytenr; /* this block number */
311 __le64 flags;
312
313 /* allowed to be different from the btrfs_header from here on down */
314 __le64 magic;
315 __le64 generation;
316 __le64 root;
317 __le64 chunk_root;
318 __le64 log_root;
319
320 /* this will help find the new super based on the log root */
321 __le64 log_root_transid;
322 __le64 total_bytes;
323 __le64 bytes_used;
324 __le64 root_dir_objectid;
325 __le64 num_devices;
326 __le32 sectorsize;
327 __le32 nodesize;
328 __le32 leafsize;
329 __le32 stripesize;
330 __le32 sys_chunk_array_size;
331 __le64 chunk_root_generation;
332 __le64 compat_flags;
333 __le64 compat_ro_flags;
334 __le64 incompat_flags;
335 __le16 csum_type;
336 u8 root_level;
337 u8 chunk_root_level;
338 u8 log_root_level;
339 struct btrfs_dev_item dev_item;
340
341 char label[BTRFS_LABEL_SIZE];
342
343 /* future expansion */
344 __le64 reserved[32];
345 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
346} __attribute__ ((__packed__));
347
348/*
349 * Compat flags that we support. If any incompat flags are set other than the
350 * ones specified below then we will fail to mount
351 */
352#define BTRFS_FEATURE_COMPAT_SUPP 0x0
353#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0
354#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0
355
356/*
357 * A leaf is full of items. offset and size tell us where to find
358 * the item in the leaf (relative to the start of the data area)
359 */
360struct btrfs_item {
361 struct btrfs_disk_key key;
362 __le32 offset;
363 __le32 size;
364} __attribute__ ((__packed__));
365
366/*
367 * leaves have an item area and a data area:
368 * [item0, item1....itemN] [free space] [dataN...data1, data0]
369 *
370 * The data is separate from the items to get the keys closer together
371 * during searches.
372 */
373struct btrfs_leaf {
374 struct btrfs_header header;
375 struct btrfs_item items[];
376} __attribute__ ((__packed__));
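/*
 * example: in a leaf with two items, data0 sits at the very end of
 * the data area and data1 is packed just below it, so the free space
 * is the gap between the last struct btrfs_item and the lowest data
 * offset
 */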
377
378/*
379 * all non-leaf blocks are nodes, they hold only keys and pointers to
380 * other blocks
381 */
382struct btrfs_key_ptr {
383 struct btrfs_disk_key key;
384 __le64 blockptr;
385 __le64 generation;
386} __attribute__ ((__packed__));
387
388struct btrfs_node {
389 struct btrfs_header header;
390 struct btrfs_key_ptr ptrs[];
391} __attribute__ ((__packed__));
392
393/*
394 * btrfs_paths remember the path taken from the root down to the leaf.
395 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
396 * to any other levels that are present.
397 *
398 * The slots array records the index of the item or block pointer
399 * used while walking the tree.
400 */
401struct btrfs_path {
402 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
403 int slots[BTRFS_MAX_LEVEL];
404 /* if there is real range locking, this locks field will change */
405 int locks[BTRFS_MAX_LEVEL];
406 int reada;
407 /* keep some upper locks as we walk down */
408 int keep_locks;
409 int skip_locking;
410 int lowest_level;
411
412 /*
413 * set by btrfs_split_item, tells search_slot to keep all locks
414 * and to force calls to keep space in the nodes
415 */
416 int search_for_split;
417};
418
419/*
420 * items in the extent btree are used to record the objectid of the
421 * owner of the block and the number of references
422 */
423struct btrfs_extent_item {
424 __le32 refs;
425} __attribute__ ((__packed__));
426
427struct btrfs_extent_ref {
428 __le64 root;
429 __le64 generation;
430 __le64 objectid;
431 __le32 num_refs;
432} __attribute__ ((__packed__));
433
434/* dev extents record space in use on individual devices. The owner
435 * field points back to the chunk allocation mapping tree that allocated
436 * the extent. The chunk tree uuid field is a way to double check the owner
437 */
438struct btrfs_dev_extent {
439 __le64 chunk_tree;
440 __le64 chunk_objectid;
441 __le64 chunk_offset;
442 __le64 length;
443 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
444} __attribute__ ((__packed__));
445
446struct btrfs_inode_ref {
447 __le64 index;
448 __le16 name_len;
449 /* name goes here */
450} __attribute__ ((__packed__));
451
452struct btrfs_timespec {
453 __le64 sec;
454 __le32 nsec;
455} __attribute__ ((__packed__));
456
457typedef enum {
458 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type;
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468
469struct btrfs_inode_item {
470 /* nfs style generation number */
471 __le64 generation;
472 /* transid that last touched this inode */
473 __le64 transid;
474 __le64 size;
475 __le64 nbytes;
476 __le64 block_group;
477 __le32 nlink;
478 __le32 uid;
479 __le32 gid;
480 __le32 mode;
481 __le64 rdev;
482 __le64 flags;
483
484 /* modification sequence number for NFS */
485 __le64 sequence;
486
487 /*
488 * a little future expansion, for more than this we can
489 * just grow the inode item and version it
490 */
491 __le64 reserved[4];
492 struct btrfs_timespec atime;
493 struct btrfs_timespec ctime;
494 struct btrfs_timespec mtime;
495 struct btrfs_timespec otime;
496} __attribute__ ((__packed__));
497
498struct btrfs_dir_log_item {
499 __le64 end;
500} __attribute__ ((__packed__));
501
502struct btrfs_dir_item {
503 struct btrfs_disk_key location;
504 __le64 transid;
505 __le16 data_len;
506 __le16 name_len;
507 u8 type;
508} __attribute__ ((__packed__));
509
510struct btrfs_root_item {
511 struct btrfs_inode_item inode;
512 __le64 generation;
513 __le64 root_dirid;
514 __le64 bytenr;
515 __le64 byte_limit;
516 __le64 bytes_used;
517 __le64 last_snapshot;
518 __le64 flags;
519 __le32 refs;
520 struct btrfs_disk_key drop_progress;
521 u8 drop_level;
522 u8 level;
523} __attribute__ ((__packed__));
524
525/*
526 * this is used for both forward and backward root refs
527 */
528struct btrfs_root_ref {
529 __le64 dirid;
530 __le64 sequence;
531 __le16 name_len;
532} __attribute__ ((__packed__));
533
534#define BTRFS_FILE_EXTENT_INLINE 0
535#define BTRFS_FILE_EXTENT_REG 1
536#define BTRFS_FILE_EXTENT_PREALLOC 2
537
538struct btrfs_file_extent_item {
539 /*
540 * transaction id that created this extent
541 */
542 __le64 generation;
543 /*
544 * max number of bytes to hold this extent in ram
545 * when we split a compressed extent we can't know how big
546 * each of the resulting pieces will be. So, this is
547 * an upper limit on the size of the extent in ram instead of
548 * an exact limit.
549 */
550 __le64 ram_bytes;
551
552 /*
553 * 32 bits for the various ways we might encode the data,
554 * including compression and encryption. If any of these
555 * are set to something a given disk format doesn't understand
556 * it is treated like an incompat flag for reading and writing,
557 * but not for stat.
558 */
559 u8 compression;
560 u8 encryption;
561 __le16 other_encoding; /* spare for later use */
562
563 /* are we inline data or a real extent? */
564 u8 type;
565
566 /*
 567	 * disk space consumed by the extent; checksum blocks are included
 568	 * in these numbers.
569 */
570 __le64 disk_bytenr;
571 __le64 disk_num_bytes;
572 /*
 573	 * the logical offset, in file blocks (no csums), that this extent
 574	 * record is for. This allows a file extent to point into the
 575	 * middle of an existing extent on disk, sharing it between two
 576	 * snapshots (useful if some bytes in the middle of the extent
 577	 * have changed).
578 */
579 __le64 offset;
580 /*
581 * the logical number of file blocks (no csums included). This
582 * always reflects the size uncompressed and without encoding.
583 */
584 __le64 num_bytes;
585
586} __attribute__ ((__packed__));
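
/*
 * Illustrative sketch of reading one of these out of a leaf
 * (btrfs_item_ptr() and the accessors used here are defined later in
 * this file):
 *
 *	struct btrfs_file_extent_item *fi;
 *	u64 len;
 *
 *	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 *	if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE)
 *		len = btrfs_file_extent_inline_len(leaf, fi);
 *	else
 *		len = btrfs_file_extent_num_bytes(leaf, fi);
 */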
587
588struct btrfs_csum_item {
589 u8 csum;
590} __attribute__ ((__packed__));
591
592/* different types of block groups (and chunks) */
593#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
594#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
595#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
596#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
597#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
598#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
599#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
600
601struct btrfs_block_group_item {
602 __le64 used;
603 __le64 chunk_objectid;
604 __le64 flags;
605} __attribute__ ((__packed__));
606
607struct btrfs_space_info {
608 u64 flags;
609 u64 total_bytes;
610 u64 bytes_used;
611 u64 bytes_pinned;
612 u64 bytes_reserved;
613 u64 bytes_readonly;
614 int full;
615 int force_alloc;
616 struct list_head list;
617
618 /* for block groups in our same type */
619 struct list_head block_groups;
620 spinlock_t lock;
621 struct rw_semaphore groups_sem;
622};
623
624struct btrfs_free_space {
625 struct rb_node bytes_index;
626 struct rb_node offset_index;
627 u64 offset;
628 u64 bytes;
629};
630
631struct btrfs_block_group_cache {
632 struct btrfs_key key;
633 struct btrfs_block_group_item item;
634 spinlock_t lock;
635 struct mutex alloc_mutex;
636 struct mutex cache_mutex;
637 u64 pinned;
638 u64 reserved;
639 u64 flags;
640 int cached;
641 int ro;
642 int dirty;
643
644 struct btrfs_space_info *space_info;
645
646 /* free space cache stuff */
647 struct rb_root free_space_bytes;
648 struct rb_root free_space_offset;
649
650 /* block group cache stuff */
651 struct rb_node cache_node;
652
653 /* for block groups in the same raid type */
654 struct list_head list;
655
656 /* usage count */
657 atomic_t count;
658};
659
660struct btrfs_leaf_ref_tree {
661 struct rb_root root;
662 struct list_head list;
663 spinlock_t lock;
664};
665
666struct btrfs_device;
667struct btrfs_fs_devices;
668struct btrfs_fs_info {
669 u8 fsid[BTRFS_FSID_SIZE];
670 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
671 struct btrfs_root *extent_root;
672 struct btrfs_root *tree_root;
673 struct btrfs_root *chunk_root;
674 struct btrfs_root *dev_root;
675 struct btrfs_root *fs_root;
676 struct btrfs_root *csum_root;
677
678 /* the log root tree is a directory of all the other log roots */
679 struct btrfs_root *log_root_tree;
680 struct radix_tree_root fs_roots_radix;
681
682 /* block group cache stuff */
683 spinlock_t block_group_cache_lock;
684 struct rb_root block_group_cache_tree;
685
686 struct extent_io_tree pinned_extents;
687 struct extent_io_tree pending_del;
688 struct extent_io_tree extent_ins;
689
690 /* logical->physical extent mapping */
691 struct btrfs_mapping_tree mapping_tree;
692
693 u64 generation;
694 u64 last_trans_committed;
695 u64 last_trans_new_blockgroup;
696 u64 open_ioctl_trans;
697 unsigned long mount_opt;
698 u64 max_extent;
699 u64 max_inline;
700 u64 alloc_start;
701 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707
708 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit;
710 struct block_device *__bdev;
711 struct super_block *sb;
712 struct inode *btree_inode;
713 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex;
716 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex;
718 struct mutex cleaner_mutex;
719 struct mutex extent_ins_mutex;
720 struct mutex pinned_mutex;
721 struct mutex chunk_mutex;
722 struct mutex drop_mutex;
723 struct mutex volume_mutex;
724 struct mutex tree_reloc_mutex;
725 struct list_head trans_list;
726 struct list_head hashers;
727 struct list_head dead_roots;
728
729 atomic_t nr_async_submits;
730 atomic_t async_submit_draining;
731 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737
738 /*
739 * this is used by the balancing code to wait for all the pending
740 * ordered extents
741 */
742 spinlock_t ordered_extent_lock;
743 struct list_head ordered_extents;
744 struct list_head delalloc_inodes;
745
746 /*
747 * there is a pool of worker threads for checksumming during writes
748 * and a pool for checksumming after reads. This is because readers
749 * can run with FS locks held, and the writers may be waiting for
750 * those locks. We don't want ordering in the pending list to cause
751 * deadlocks, and so the two are serviced separately.
752 *
753 * A third pool does submit_bio to avoid deadlocking with the other
 754	 * two.
755 */
756 struct btrfs_workers workers;
757 struct btrfs_workers delalloc_workers;
758 struct btrfs_workers endio_workers;
759 struct btrfs_workers endio_meta_workers;
760 struct btrfs_workers endio_meta_write_workers;
761 struct btrfs_workers endio_write_workers;
762 struct btrfs_workers submit_workers;
763 /*
764 * fixup workers take dirty pages that didn't properly go through
 765	 * the cow mechanism and make them safe to write. This happens
 766	 * on the sys_munmap call path.
767 */
768 struct btrfs_workers fixup_workers;
769 struct task_struct *transaction_kthread;
770 struct task_struct *cleaner_kthread;
771 int thread_pool_size;
772
 773	/* tree relocation fields */
774 struct list_head dead_reloc_roots;
775 struct btrfs_leaf_ref_tree reloc_ref_tree;
776 struct btrfs_leaf_ref_tree shared_ref_tree;
777
778 struct kobject super_kobj;
779 struct completion kobj_unregister;
780 int do_barriers;
781 int closing;
782 int log_root_recovering;
783 atomic_t throttles;
784 atomic_t throttle_gen;
785
786 u64 total_pinned;
787 struct list_head dirty_cowonly_roots;
788
789 struct btrfs_fs_devices *fs_devices;
790 struct list_head space_info;
791 spinlock_t delalloc_lock;
792 spinlock_t new_trans_lock;
793 u64 delalloc_bytes;
794 u64 last_alloc;
795 u64 last_data_alloc;
796
797 spinlock_t ref_cache_lock;
798 u64 total_ref_cache_size;
799
800 u64 avail_data_alloc_bits;
801 u64 avail_metadata_alloc_bits;
802 u64 avail_system_alloc_bits;
803 u64 data_alloc_profile;
804 u64 metadata_alloc_profile;
805 u64 system_alloc_profile;
806
807 void *bdev_holder;
808};
809
810/*
 811 * in ram representation of a tree. extent_root is used for all block
 812 * allocations and holds the extent tree itself.
813 */
814struct btrfs_dirty_root;
815struct btrfs_root {
816 struct extent_buffer *node;
817
818 /* the node lock is held while changing the node pointer */
819 spinlock_t node_lock;
820
821 struct extent_buffer *commit_root;
822 struct btrfs_leaf_ref_tree *ref_tree;
823 struct btrfs_leaf_ref_tree ref_tree_struct;
824 struct btrfs_dirty_root *dirty_root;
825 struct btrfs_root *log_root;
826 struct btrfs_root *reloc_root;
827
828 struct btrfs_root_item root_item;
829 struct btrfs_key root_key;
830 struct btrfs_fs_info *fs_info;
831 struct extent_io_tree dirty_log_pages;
832
833 struct kobject root_kobj;
834 struct completion kobj_unregister;
835 struct mutex objectid_mutex;
836 struct mutex log_mutex;
837
838 u64 objectid;
839 u64 last_trans;
840
841 /* data allocations are done in sectorsize units */
842 u32 sectorsize;
843
844 /* node allocations are done in nodesize units */
845 u32 nodesize;
846
847 /* leaf allocations are done in leafsize units */
848 u32 leafsize;
849
850 u32 stripesize;
851
852 u32 type;
853 u64 highest_inode;
854 u64 last_inode_alloc;
855 int ref_cows;
856 int track_dirty;
857 u64 defrag_trans_start;
858 struct btrfs_key defrag_progress;
859 struct btrfs_key defrag_max;
860 int defrag_running;
861 int defrag_level;
862 char *name;
863 int in_sysfs;
864
865 /* the dirty list is only used by non-reference counted roots */
866 struct list_head dirty_list;
867
868 spinlock_t list_lock;
869 struct list_head dead_list;
870 struct list_head orphan_list;
871
872 /*
873 * right now this just gets used so that a root has its own devid
874 * for stat. It may be used for more later
875 */
876 struct super_block anon_super;
877};
878
879/*
881 * inode items have the data typically returned from stat and store other
882 * info about object characteristics. There is one for every file and dir in
883 * the FS
884 */
885#define BTRFS_INODE_ITEM_KEY 1
886#define BTRFS_INODE_REF_KEY 12
887#define BTRFS_XATTR_ITEM_KEY 24
888#define BTRFS_ORPHAN_ITEM_KEY 48
889/* reserve 2-15 close to the inode for later flexibility */
890
891/*
892 * dir items are the name -> inode pointers in a directory. There is one
893 * for every name in a directory.
894 */
895#define BTRFS_DIR_LOG_ITEM_KEY 60
896#define BTRFS_DIR_LOG_INDEX_KEY 72
897#define BTRFS_DIR_ITEM_KEY 84
898#define BTRFS_DIR_INDEX_KEY 96
899/*
900 * extent data is for file data
901 */
902#define BTRFS_EXTENT_DATA_KEY 108
903
904/*
905 * extent csums are stored in a separate tree and hold csums for
906 * an entire extent on disk.
907 */
908#define BTRFS_EXTENT_CSUM_KEY 128
909
910/*
 911 * root items point to tree roots. They are typically in the root
912 * tree used by the super block to find all the other trees
913 */
914#define BTRFS_ROOT_ITEM_KEY 132
915
916/*
917 * root backrefs tie subvols and snapshots to the directory entries that
918 * reference them
919 */
920#define BTRFS_ROOT_BACKREF_KEY 144
921
922/*
923 * root refs make a fast index for listing all of the snapshots and
924 * subvolumes referenced by a given root. They point directly to the
925 * directory item in the root that references the subvol
926 */
927#define BTRFS_ROOT_REF_KEY 156
928
929/*
930 * extent items are in the extent map tree. These record which blocks
931 * are used, and how many references there are to each block
932 */
933#define BTRFS_EXTENT_ITEM_KEY 168
934#define BTRFS_EXTENT_REF_KEY 180
935
936/*
 937 * block groups give us hints into the extent allocation trees: which
 938 * blocks are free, and so on.
939 */
940#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
941
942#define BTRFS_DEV_EXTENT_KEY 204
943#define BTRFS_DEV_ITEM_KEY 216
944#define BTRFS_CHUNK_ITEM_KEY 228
945
946/*
947 * string items are for debugging. They just store a short string of
948 * data in the FS
949 */
950#define BTRFS_STRING_ITEM_KEY 253
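
/*
 * Example (illustrative only; example_inode_item_key is not a btrfs
 * helper): all items belonging to an inode share the inode number as
 * their objectid, so the inode item itself is looked up with a key
 * built like this:
 */
static inline void example_inode_item_key(struct btrfs_key *key, u64 ino)
{
	key->objectid = ino;
	key->type = BTRFS_INODE_ITEM_KEY;
	key->offset = 0;
}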
951
952#define BTRFS_MOUNT_NODATASUM (1 << 0)
953#define BTRFS_MOUNT_NODATACOW (1 << 1)
954#define BTRFS_MOUNT_NOBARRIER (1 << 2)
955#define BTRFS_MOUNT_SSD (1 << 3)
956#define BTRFS_MOUNT_DEGRADED (1 << 4)
957#define BTRFS_MOUNT_COMPRESS (1 << 5)
958
959#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
960#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
961#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
962 BTRFS_MOUNT_##opt)
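
/*
 * Example (illustrative only; example_ssd_enabled is not a btrfs
 * helper): the token pasting above means callers name just the flag,
 * so btrfs_test_opt(root, SSD) tests BTRFS_MOUNT_SSD in
 * root->fs_info->mount_opt:
 */
static inline int example_ssd_enabled(struct btrfs_root *root)
{
	return btrfs_test_opt(root, SSD) != 0;
}
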
963/*
964 * Inode flags
965 */
966#define BTRFS_INODE_NODATASUM (1 << 0)
967#define BTRFS_INODE_NODATACOW (1 << 1)
968#define BTRFS_INODE_READONLY (1 << 2)
969#define BTRFS_INODE_NOCOMPRESS (1 << 3)
970#define BTRFS_INODE_PREALLOC (1 << 4)
971#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
972 ~BTRFS_INODE_##flag)
973#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
974 BTRFS_INODE_##flag)
975#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
976 BTRFS_INODE_##flag)
977/* some macros to generate set/get funcs for the struct fields. This
 978 * assumes there is a lefoo_to_cpu for every type, so let's make a simple
979 * one for u8:
980 */
981#define le8_to_cpu(v) (v)
982#define cpu_to_le8(v) (v)
983#define __le8 u8
984
985#define read_eb_member(eb, ptr, type, member, result) ( \
986 read_extent_buffer(eb, (char *)(result), \
987 ((unsigned long)(ptr)) + \
988 offsetof(type, member), \
989 sizeof(((type *)0)->member)))
990
991#define write_eb_member(eb, ptr, type, member, result) ( \
992 write_extent_buffer(eb, (char *)(result), \
993 ((unsigned long)(ptr)) + \
994 offsetof(type, member), \
995 sizeof(((type *)0)->member)))
996
997#ifndef BTRFS_SETGET_FUNCS
998#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
999u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
1000void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1001#endif
1002
1003#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1004static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1005{ \
1006 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1007 u##bits res = le##bits##_to_cpu(p->member); \
1008 kunmap_atomic(p, KM_USER0); \
1009 return res; \
1010} \
1011static inline void btrfs_set_##name(struct extent_buffer *eb, \
1012 u##bits val) \
1013{ \
1014 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1015 p->member = cpu_to_le##bits(val); \
1016 kunmap_atomic(p, KM_USER0); \
1017}
1018
1019#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
1020static inline u##bits btrfs_##name(type *s) \
1021{ \
1022 return le##bits##_to_cpu(s->member); \
1023} \
1024static inline void btrfs_set_##name(type *s, u##bits val) \
1025{ \
1026 s->member = cpu_to_le##bits(val); \
1027}
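
/*
 * For reference, a single invocation such as
 *
 *	BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
 *
 * (used further down in this file) expands to the pair
 *
 *	static inline u8 btrfs_root_level(struct btrfs_root_item *s);
 *	static inline void btrfs_set_root_level(struct btrfs_root_item *s,
 *						u8 val);
 *
 * which read and write s->level through le8_to_cpu()/cpu_to_le8().  The
 * non-stack BTRFS_SETGET_FUNCS variants instead operate on data that
 * still lives inside an extent_buffer.
 */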
1028
1029BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
1030BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
1031BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
1032BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
1033BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
1034BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
1035 start_offset, 64);
1036BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
1037BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
1038BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
1039BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
1040BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
1041BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
1042
1043BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
1044BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
1045 total_bytes, 64);
1046BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
1047 bytes_used, 64);
1048BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
1049 io_align, 32);
1050BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
1051 io_width, 32);
1052BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
1053 sector_size, 32);
1054BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
1055BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
1056 dev_group, 32);
1057BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
1058 seek_speed, 8);
1059BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
1060 bandwidth, 8);
1061BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
1062 generation, 64);
1063
1064static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
1065{
1066 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
1067}
1068
1069static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
1070{
1071 return (char *)d + offsetof(struct btrfs_dev_item, fsid);
1072}
1073
1074BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
1075BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
1076BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
1077BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
1078BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
1079BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
1080BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
1081BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
1082BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
1083BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
1084BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
1085
1086static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
1087{
1088 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
1089}
1090
1091BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
1092BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
1093BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
1094 stripe_len, 64);
1095BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
1096 io_align, 32);
1097BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
1098 io_width, 32);
1099BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
1100 sector_size, 32);
1101BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
1102BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
1103 num_stripes, 16);
1104BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
1105 sub_stripes, 16);
1106BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
1107BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
1108
1109static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
1110 int nr)
1111{
1112 unsigned long offset = (unsigned long)c;
1113 offset += offsetof(struct btrfs_chunk, stripe);
1114 offset += nr * sizeof(struct btrfs_stripe);
1115 return (struct btrfs_stripe *)offset;
1116}
1117
1118static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
1119{
1120 return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
1121}
1122
1123static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1124 struct btrfs_chunk *c, int nr)
1125{
1126 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1127}
1128
1129static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1130 struct btrfs_chunk *c, int nr,
1131 u64 val)
1132{
1133 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1134}
1135
1136static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1137 struct btrfs_chunk *c, int nr)
1138{
1139 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1140}
1141
1142static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1143 struct btrfs_chunk *c, int nr,
1144 u64 val)
1145{
1146 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1147}
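
/*
 * Example (illustrative only; example_stripes_on_dev is not a btrfs
 * helper): chunk items are variable length, so stripes are reached
 * through btrfs_stripe_nr() rather than a fixed-size array:
 */
static inline int example_stripes_on_dev(struct extent_buffer *eb,
					 struct btrfs_chunk *chunk,
					 u64 devid)
{
	int num = btrfs_chunk_num_stripes(eb, chunk);
	int i, found = 0;

	for (i = 0; i < num; i++)
		if (btrfs_stripe_devid_nr(eb, chunk, i) == devid)
			found++;
	return found;
}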
1148
1149/* struct btrfs_block_group_item */
1150BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1151 used, 64);
1152BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
1153 used, 64);
1154BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
1155 struct btrfs_block_group_item, chunk_objectid, 64);
1156
1157BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
1158 struct btrfs_block_group_item, chunk_objectid, 64);
1159BTRFS_SETGET_FUNCS(disk_block_group_flags,
1160 struct btrfs_block_group_item, flags, 64);
1161BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1162 struct btrfs_block_group_item, flags, 64);
1163
1164/* struct btrfs_inode_ref */
1165BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1166BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1167
1168/* struct btrfs_inode_item */
1169BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1170BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
1171BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1172BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1173BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
1174BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1175BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1176BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1177BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1178BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1179BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1180BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
1181
1182static inline struct btrfs_timespec *
1183btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1184{
1185 unsigned long ptr = (unsigned long)inode_item;
1186 ptr += offsetof(struct btrfs_inode_item, atime);
1187 return (struct btrfs_timespec *)ptr;
1188}
1189
1190static inline struct btrfs_timespec *
1191btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1192{
1193 unsigned long ptr = (unsigned long)inode_item;
1194 ptr += offsetof(struct btrfs_inode_item, mtime);
1195 return (struct btrfs_timespec *)ptr;
1196}
1197
1198static inline struct btrfs_timespec *
1199btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1200{
1201 unsigned long ptr = (unsigned long)inode_item;
1202 ptr += offsetof(struct btrfs_inode_item, ctime);
1203 return (struct btrfs_timespec *)ptr;
1204}
1205
1206static inline struct btrfs_timespec *
1207btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1208{
1209 unsigned long ptr = (unsigned long)inode_item;
1210 ptr += offsetof(struct btrfs_inode_item, otime);
1211 return (struct btrfs_timespec *)ptr;
1212}
1213
1214BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1215BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1216
1217/* struct btrfs_dev_extent */
1218BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1219 chunk_tree, 64);
1220BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1221 chunk_objectid, 64);
1222BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1223 chunk_offset, 64);
1224BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1225
1226static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1227{
1228 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1229 return (u8 *)((unsigned long)dev + ptr);
1230}
1231
1232/* struct btrfs_extent_ref */
1233BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1234BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1235BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1236BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1237
1238BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1239BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1240 generation, 64);
1241BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1242 objectid, 64);
1243BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1244 num_refs, 32);
1245
1246/* struct btrfs_extent_item */
1247BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1248BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1249 refs, 32);
1250
1251/* struct btrfs_node */
1252BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1253BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1254
1255static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1256{
1257 unsigned long ptr;
1258 ptr = offsetof(struct btrfs_node, ptrs) +
1259 sizeof(struct btrfs_key_ptr) * nr;
1260 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1261}
1262
1263static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1264 int nr, u64 val)
1265{
1266 unsigned long ptr;
1267 ptr = offsetof(struct btrfs_node, ptrs) +
1268 sizeof(struct btrfs_key_ptr) * nr;
1269 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1270}
1271
1272static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1273{
1274 unsigned long ptr;
1275 ptr = offsetof(struct btrfs_node, ptrs) +
1276 sizeof(struct btrfs_key_ptr) * nr;
1277 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1278}
1279
1280static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1281 int nr, u64 val)
1282{
1283 unsigned long ptr;
1284 ptr = offsetof(struct btrfs_node, ptrs) +
1285 sizeof(struct btrfs_key_ptr) * nr;
1286 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1287}
1288
1289static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1290{
1291 return offsetof(struct btrfs_node, ptrs) +
1292 sizeof(struct btrfs_key_ptr) * nr;
1293}
1294
1295void btrfs_node_key(struct extent_buffer *eb,
1296 struct btrfs_disk_key *disk_key, int nr);
1297
1298static inline void btrfs_set_node_key(struct extent_buffer *eb,
1299 struct btrfs_disk_key *disk_key, int nr)
1300{
1301 unsigned long ptr;
1302 ptr = btrfs_node_key_ptr_offset(nr);
1303 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1304 struct btrfs_key_ptr, key, disk_key);
1305}
1306
1307/* struct btrfs_item */
1308BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1309BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1310
1311static inline unsigned long btrfs_item_nr_offset(int nr)
1312{
1313 return offsetof(struct btrfs_leaf, items) +
1314 sizeof(struct btrfs_item) * nr;
1315}
1316
1317static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
1318 int nr)
1319{
1320 return (struct btrfs_item *)btrfs_item_nr_offset(nr);
1321}
1322
1323static inline u32 btrfs_item_end(struct extent_buffer *eb,
1324 struct btrfs_item *item)
1325{
1326 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1327}
1328
1329static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1330{
1331 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1332}
1333
1334static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1335{
1336 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1337}
1338
1339static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1340{
1341 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1342}
1343
1344static inline void btrfs_item_key(struct extent_buffer *eb,
1345 struct btrfs_disk_key *disk_key, int nr)
1346{
1347 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1348 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1349}
1350
1351static inline void btrfs_set_item_key(struct extent_buffer *eb,
1352 struct btrfs_disk_key *disk_key, int nr)
1353{
1354 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1355 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1356}
1357
1358BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1359
1360/* struct btrfs_root_ref */
1363BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
1364BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
1365BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
1366
1367/* struct btrfs_dir_item */
1368BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1369BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1370BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1371BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1372
1373static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1374 struct btrfs_dir_item *item,
1375 struct btrfs_disk_key *key)
1376{
1377 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1378}
1379
1380static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1381 struct btrfs_dir_item *item,
1382 struct btrfs_disk_key *key)
1383{
1384 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1385}
1386
1387/* struct btrfs_disk_key */
1388BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1389 objectid, 64);
1390BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1391BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1392
1393static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1394 struct btrfs_disk_key *disk)
1395{
1396 cpu->offset = le64_to_cpu(disk->offset);
1397 cpu->type = disk->type;
1398 cpu->objectid = le64_to_cpu(disk->objectid);
1399}
1400
1401static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1402 struct btrfs_key *cpu)
1403{
1404 disk->offset = cpu_to_le64(cpu->offset);
1405 disk->type = cpu->type;
1406 disk->objectid = cpu_to_le64(cpu->objectid);
1407}
1408
1409static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1410 struct btrfs_key *key, int nr)
1411{
1412 struct btrfs_disk_key disk_key;
1413 btrfs_node_key(eb, &disk_key, nr);
1414 btrfs_disk_key_to_cpu(key, &disk_key);
1415}
1416
1417static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1418 struct btrfs_key *key, int nr)
1419{
1420 struct btrfs_disk_key disk_key;
1421 btrfs_item_key(eb, &disk_key, nr);
1422 btrfs_disk_key_to_cpu(key, &disk_key);
1423}
1424
1425static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1426 struct btrfs_dir_item *item,
1427 struct btrfs_key *key)
1428{
1429 struct btrfs_disk_key disk_key;
1430 btrfs_dir_item_key(eb, item, &disk_key);
1431 btrfs_disk_key_to_cpu(key, &disk_key);
1432}
1433
1435static inline u8 btrfs_key_type(struct btrfs_key *key)
1436{
1437 return key->type;
1438}
1439
1440static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1441{
1442 key->type = val;
1443}
1444
1445/* struct btrfs_header */
1446BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1447BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1448 generation, 64);
1449BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1450BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1451BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1452BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1453
1454static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1455{
1456 return (btrfs_header_flags(eb) & flag) == flag;
1457}
1458
1459static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1460{
1461 u64 flags = btrfs_header_flags(eb);
1462 btrfs_set_header_flags(eb, flags | flag);
1463 return (flags & flag) == flag;
1464}
1465
1466static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1467{
1468 u64 flags = btrfs_header_flags(eb);
1469 btrfs_set_header_flags(eb, flags & ~flag);
1470 return (flags & flag) == flag;
1471}
1472
1473static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1474{
1475 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1476 return (u8 *)ptr;
1477}
1478
1479static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1480{
1481 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1482 return (u8 *)ptr;
1483}
1484
1485static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1486{
1487 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1488 return (u8 *)ptr;
1489}
1490
1491static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1492{
1493 unsigned long ptr = offsetof(struct btrfs_header, csum);
1494 return (u8 *)ptr;
1495}
1496
1497static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1498{
1499 return NULL;
1500}
1501
1502static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1503{
1504 return NULL;
1505}
1506
1507static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1508{
1509 return NULL;
1510}
1511
1512static inline int btrfs_is_leaf(struct extent_buffer *eb)
1513{
1514 return btrfs_header_level(eb) == 0;
1515}
1516
1517/* struct btrfs_root_item */
1518BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
1519 generation, 64);
1520BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1521BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1522BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1523
1524BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
1525 generation, 64);
1526BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1527BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1528BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1529BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1530BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
1531BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1532BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1533BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1534 last_snapshot, 64);
1535
1536/* struct btrfs_super_block */
1537
1538BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1539BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1540BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1541 generation, 64);
1542BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1543BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1544 struct btrfs_super_block, sys_chunk_array_size, 32);
1545BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
1546 struct btrfs_super_block, chunk_root_generation, 64);
1547BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1548 root_level, 8);
1549BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1550 chunk_root, 64);
1551BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1552 chunk_root_level, 8);
1553BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1554 log_root, 64);
1555BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
1556 log_root_transid, 64);
1557BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1558 log_root_level, 8);
1559BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1560 total_bytes, 64);
1561BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1562 bytes_used, 64);
1563BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1564 sectorsize, 32);
1565BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1566 nodesize, 32);
1567BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1568 leafsize, 32);
1569BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1570 stripesize, 32);
1571BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1572 root_dir_objectid, 64);
1573BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1574 num_devices, 64);
1575BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
1576 compat_flags, 64);
1577BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
1578 compat_flags, 64);
1579BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1580 incompat_flags, 64);
1581BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1582 csum_type, 16);
1583
1584static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1585{
1586 int t = btrfs_super_csum_type(s);
1587 BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
1588 return btrfs_csum_sizes[t];
1589}
1590
1591static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1592{
1593 return offsetof(struct btrfs_leaf, items);
1594}
1595
1596/* struct btrfs_file_extent_item */
1597BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1598
1599static inline unsigned long
1600btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
1601{
1602 unsigned long offset = (unsigned long)e;
1603 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1604 return offset;
1605}
1606
1607static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1608{
1609 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1610}
1611
1612BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1613 disk_bytenr, 64);
1614BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1615 generation, 64);
1616BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1617 disk_num_bytes, 64);
1618BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1619 offset, 64);
1620BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1621 num_bytes, 64);
1622BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
1623 ram_bytes, 64);
1624BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
1625 compression, 8);
1626BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
1627 encryption, 8);
1628BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
1629 other_encoding, 16);
1630
1631/* this returns the number of file bytes represented by the inline item.
1632 * If an item is compressed, this is the uncompressed size
1633 */
1634static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1635 struct btrfs_file_extent_item *e)
1636{
1637 return btrfs_file_extent_ram_bytes(eb, e);
1638}
1639
1640/*
1641 * this returns the number of bytes used by the item on disk, minus the
1642 * size of any extent headers. If a file is compressed on disk, this is
1643 * the compressed size
1644 */
1645static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
1646 struct btrfs_item *e)
1647{
1648 unsigned long offset;
1649 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1650 return btrfs_item_size(eb, e) - offset;
1651}
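
/*
 * Example (illustrative only; example_inline_savings is not a btrfs
 * helper): for an inline extent the two helpers above can disagree,
 * and the difference is the space saved by compression:
 */
static inline u32 example_inline_savings(struct extent_buffer *eb,
					 struct btrfs_file_extent_item *fi,
					 int slot)
{
	u32 ram = btrfs_file_extent_inline_len(eb, fi);
	u32 disk = btrfs_file_extent_inline_item_len(eb,
						     btrfs_item_nr(eb, slot));

	return ram - disk;	/* 0 unless the inline data is compressed */
}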
1652
1653static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1654{
1655 return sb->s_fs_info;
1656}
1657
1658static inline int btrfs_set_root_name(struct btrfs_root *root,
1659 const char *name, int len)
1660{
1661 /* if we already have a name just free it */
1662 kfree(root->name);
1663
1664 root->name = kmalloc(len+1, GFP_KERNEL);
1665 if (!root->name)
1666 return -ENOMEM;
1667
1668 memcpy(root->name, name, len);
1669 root->name[len] = '\0';
1670
1671 return 0;
1672}
1673
1674static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
1675{
1676 if (level == 0)
1677 return root->leafsize;
1678 return root->nodesize;
1679}
1680
1681/* helper macro to cast into the data area of the leaf. */
1682#define btrfs_item_ptr(leaf, slot, type) \
1683 ((type *)(btrfs_leaf_data(leaf) + \
1684 btrfs_item_offset_nr(leaf, slot)))
1685
1686#define btrfs_item_ptr_offset(leaf, slot) \
1687 ((unsigned long)(btrfs_leaf_data(leaf) + \
1688 btrfs_item_offset_nr(leaf, slot)))
1689
1690static inline struct dentry *fdentry(struct file *file)
1691{
1692 return file->f_path.dentry;
1693}
1694
1695/* extent-tree.c */
1696int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1697int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1698 struct btrfs_root *root, u64 bytenr,
1699 u64 num_bytes, u32 *refs);
1700int btrfs_update_pinned_extents(struct btrfs_root *root,
1701 u64 bytenr, u64 num, int pin);
1702int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1703 struct btrfs_root *root, struct extent_buffer *leaf);
1704int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1705 struct btrfs_root *root, u64 objectid, u64 bytenr);
1706int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1707 struct btrfs_root *root);
1708int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1709struct btrfs_block_group_cache *btrfs_lookup_block_group(
1710 struct btrfs_fs_info *info,
1711 u64 bytenr);
1712u64 btrfs_find_block_group(struct btrfs_root *root,
1713 u64 search_start, u64 search_hint, int owner);
1714struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1715 struct btrfs_root *root,
1716 u32 blocksize, u64 parent,
1717 u64 root_objectid,
1718 u64 ref_generation,
1719 int level,
1720 u64 hint,
1721 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes,
1728 u64 root_objectid, u64 ref_generation,
1729 u64 owner, u64 empty_size, u64 hint_byte,
1730 u64 search_end, struct btrfs_key *ins, u64 data);
1731int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1732 struct btrfs_root *root, u64 parent,
1733 u64 root_objectid, u64 ref_generation,
1734 u64 owner, struct btrfs_key *ins);
1735int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1736 struct btrfs_root *root, u64 parent,
1737 u64 root_objectid, u64 ref_generation,
1738 u64 owner, struct btrfs_key *ins);
1739int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1740 struct btrfs_root *root,
1741 u64 num_bytes, u64 min_alloc_size,
1742 u64 empty_size, u64 hint_byte,
1743 u64 search_end, struct btrfs_key *ins,
1744 u64 data);
1745int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1746 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1747 u32 *nr_extents);
1748int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1749 struct extent_buffer *buf, u32 nr_extents);
1750int btrfs_update_ref(struct btrfs_trans_handle *trans,
1751 struct btrfs_root *root, struct extent_buffer *orig_buf,
1752 struct extent_buffer *buf, int start_slot, int nr);
1753int btrfs_free_extent(struct btrfs_trans_handle *trans,
1754 struct btrfs_root *root,
1755 u64 bytenr, u64 num_bytes, u64 parent,
1756 u64 root_objectid, u64 ref_generation,
1757 u64 owner_objectid, int pin);
1758int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1759int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1760 struct btrfs_root *root,
1761 struct extent_io_tree *unpin);
1762int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root,
1764 u64 bytenr, u64 num_bytes, u64 parent,
1765 u64 root_objectid, u64 ref_generation,
1766 u64 owner_objectid);
1767int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root, u64 bytenr,
1769 u64 orig_parent, u64 parent,
1770 u64 root_objectid, u64 ref_generation,
1771 u64 owner_objectid);
1772int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root);
1774int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
1775int btrfs_free_block_groups(struct btrfs_fs_info *info);
1776int btrfs_read_block_groups(struct btrfs_root *root);
1777int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1778 struct btrfs_root *root, u64 bytes_used,
1779 u64 type, u64 chunk_objectid, u64 chunk_offset,
1780 u64 size);
1781int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1782 struct btrfs_root *root, u64 group_start);
1783int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1784int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
1785 struct btrfs_root *root);
1786int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1787int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1788 struct btrfs_root *root,
1789 struct extent_buffer *buf, u64 orig_start);
1790int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1794/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid,
1797 int type);
1798int btrfs_merge_path(struct btrfs_trans_handle *trans,
1799 struct btrfs_root *root,
1800 struct btrfs_key *node_keys,
1801 u64 *nodes, int lowest_level);
1802int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1803 struct btrfs_root *root, struct btrfs_path *path,
1804 struct btrfs_key *new_key);
1805struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1806struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1807int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1808 struct btrfs_key *key, int lowest_level,
1809 int cache_only, u64 min_trans);
1810int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1811 struct btrfs_key *max_key,
1812 struct btrfs_path *path, int cache_only,
1813 u64 min_trans);
1814int btrfs_cow_block(struct btrfs_trans_handle *trans,
1815 struct btrfs_root *root, struct extent_buffer *buf,
1816 struct extent_buffer *parent, int parent_slot,
1817 struct extent_buffer **cow_ret, u64 prealloc_dest);
1818int btrfs_copy_root(struct btrfs_trans_handle *trans,
1819 struct btrfs_root *root,
1820 struct extent_buffer *buf,
1821 struct extent_buffer **cow_ret, u64 new_root_objectid);
1822int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1823 *root, struct btrfs_path *path, u32 data_size);
1824int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1825 struct btrfs_root *root,
1826 struct btrfs_path *path,
1827 u32 new_size, int from_end);
1828int btrfs_split_item(struct btrfs_trans_handle *trans,
1829 struct btrfs_root *root,
1830 struct btrfs_path *path,
1831 struct btrfs_key *new_key,
1832 unsigned long split_offset);
1833int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1834 *root, struct btrfs_key *key, struct btrfs_path *p, int
1835 ins_len, int cow);
1836int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1837 struct btrfs_root *root, struct extent_buffer *parent,
1838 int start_slot, int cache_only, u64 *last_ret,
1839 struct btrfs_key *progress);
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p);
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root,
1848 struct btrfs_path *path, u64 bytenr);
1849static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root,
1851 struct btrfs_path *path)
1852{
1853 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1854}
1855
1856int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1857 *root, struct btrfs_key *key, void *data, u32 data_size);
1858int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
1859 struct btrfs_root *root,
1860 struct btrfs_path *path,
1861 struct btrfs_key *cpu_key, u32 *data_size,
1862 int nr);
1863int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1864 struct btrfs_root *root,
1865 struct btrfs_path *path,
1866 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1867
1868static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root,
1870 struct btrfs_path *path,
1871 struct btrfs_key *key,
1872 u32 data_size)
1873{
1874 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1875}
1876
1877int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1878int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1879int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1880int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1881 *root);
1882int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1883 struct btrfs_root *root,
1884 struct extent_buffer *node,
1885 struct extent_buffer *parent);
1886/* root-item.c */
1887int btrfs_find_root_ref(struct btrfs_root *tree_root,
1888 struct btrfs_path *path,
1889 u64 root_id, u64 ref_id);
1890int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
1891 struct btrfs_root *tree_root,
1892 u64 root_id, u8 type, u64 ref_id,
1893 u64 dirid, u64 sequence,
1894 const char *name, int name_len);
1895int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1896 struct btrfs_key *key);
1897int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1898 *root, struct btrfs_key *key, struct btrfs_root_item
1899 *item);
1900int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1901 *root, struct btrfs_key *key, struct btrfs_root_item
1902 *item);
1903int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1904 btrfs_root_item *item, struct btrfs_key *key);
1905int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1906 u64 *found_objectid);
1907int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1908 struct btrfs_root *latest_root);
1909/* dir-item.c */
1910int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
1911 struct btrfs_root *root, const char *name,
1912 int name_len, u64 dir,
1913 struct btrfs_key *location, u8 type, u64 index);
1914struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1915 struct btrfs_root *root,
1916 struct btrfs_path *path, u64 dir,
1917 const char *name, int name_len,
1918 int mod);
1919struct btrfs_dir_item *
1920btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root,
1922 struct btrfs_path *path, u64 dir,
1923 u64 objectid, const char *name, int name_len,
1924 int mod);
1925struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1926 struct btrfs_path *path,
1927 const char *name, int name_len);
1928int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1929 struct btrfs_root *root,
1930 struct btrfs_path *path,
1931 struct btrfs_dir_item *di);
1932int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root, const char *name,
1934 u16 name_len, const void *data, u16 data_len,
1935 u64 dir);
1936struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1937 struct btrfs_root *root,
1938 struct btrfs_path *path, u64 dir,
1939 const char *name, u16 name_len,
1940 int mod);
1941
1942/* orphan.c */
1943int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1944 struct btrfs_root *root, u64 offset);
1945int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1946 struct btrfs_root *root, u64 offset);
1947
1948/* inode-map.c */
1949int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1950 struct btrfs_root *fs_root,
1951 u64 dirid, u64 *objectid);
1952int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1953
1954/* inode-item.c */
1955int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1956 struct btrfs_root *root,
1957 const char *name, int name_len,
1958 u64 inode_objectid, u64 ref_objectid, u64 index);
1959int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1960 struct btrfs_root *root,
1961 const char *name, int name_len,
1962 u64 inode_objectid, u64 ref_objectid, u64 *index);
1963int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1964 struct btrfs_root *root,
1965 struct btrfs_path *path, u64 objectid);
1966int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1967 *root, struct btrfs_path *path,
1968 struct btrfs_key *location, int mod);
1969
1970/* file-item.c */
1971int btrfs_del_csums(struct btrfs_trans_handle *trans,
1972 struct btrfs_root *root, u64 bytenr, u64 len);
1973int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1974 struct bio *bio, u32 *dst);
1975int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1976 struct btrfs_root *root,
1977 u64 objectid, u64 pos,
1978 u64 disk_offset, u64 disk_num_bytes,
1979 u64 num_bytes, u64 offset, u64 ram_bytes,
1980 u8 compression, u8 encryption, u16 other_encoding);
1981int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1982 struct btrfs_root *root,
1983 struct btrfs_path *path, u64 objectid,
1984 u64 bytenr, int mod);
1985int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1986 struct btrfs_root *root,
1987 struct btrfs_ordered_sum *sums);
1988int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1989 struct bio *bio, u64 file_start, int contig);
1990int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
1991 u64 start, unsigned long len);
1992struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1993 struct btrfs_root *root,
1994 struct btrfs_path *path,
1995 u64 bytenr, int cow);
1996int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, struct btrfs_path *path,
1998 u64 isize);
1999int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
2000 u64 end, struct list_head *list);
2001/* inode.c */
2002
2003/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
2004#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
2005#define ClearPageChecked ClearPageFsMisc
2006#define SetPageChecked SetPageFsMisc
2007#define PageChecked PageFsMisc
2008#endif
2009
2010struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
2011int btrfs_set_inode_index(struct inode *dir, u64 *index);
2012int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2013 struct btrfs_root *root,
2014 struct inode *dir, struct inode *inode,
2015 const char *name, int name_len);
2016int btrfs_add_link(struct btrfs_trans_handle *trans,
2017 struct inode *parent_inode, struct inode *inode,
2018 const char *name, int name_len, int add_backref, u64 index);
2019int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2020 struct btrfs_root *root,
2021 struct inode *inode, u64 new_size,
2022 u32 min_type);
2023
2024int btrfs_start_delalloc_inodes(struct btrfs_root *root);
2025int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
2026int btrfs_writepages(struct address_space *mapping,
2027 struct writeback_control *wbc);
2028int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2029 struct btrfs_root *new_root, struct dentry *dentry,
2030 u64 new_dirid, u64 alloc_hint);
2031int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2032 size_t size, struct bio *bio, unsigned long bio_flags);
2033
2034unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode);
2042void btrfs_put_inode(struct inode *inode);
2043void btrfs_read_locked_inode(struct inode *inode);
2044int btrfs_write_inode(struct inode *inode, int wait);
2045void btrfs_dirty_inode(struct inode *inode);
2046struct inode *btrfs_alloc_inode(struct super_block *sb);
2047void btrfs_destroy_inode(struct inode *inode);
2048int btrfs_init_cachep(void);
2049void btrfs_destroy_cachep(void);
2050long btrfs_ioctl_trans_end(struct file *file);
2051struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2052 struct btrfs_root *root, int wait);
2053struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2054 struct btrfs_root *root);
2055struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2056 struct btrfs_root *root, int *is_new);
2057int btrfs_commit_write(struct file *file, struct page *page,
2058 unsigned from, unsigned to);
2059struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2060 size_t page_offset, u64 start, u64 end,
2061 int create);
2062int btrfs_update_inode(struct btrfs_trans_handle *trans,
2063 struct btrfs_root *root,
2064 struct inode *inode);
2065int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2066int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2067void btrfs_orphan_cleanup(struct btrfs_root *root);
2068int btrfs_cont_expand(struct inode *inode, loff_t size);
2069
2070/* ioctl.c */
2071long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2072
2073/* file.c */
2074int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2075int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2076 int skip_pinned);
2077int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2078extern struct file_operations btrfs_file_operations;
2079int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2080 struct btrfs_root *root, struct inode *inode,
2081 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
2082int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2083 struct btrfs_root *root,
2084 struct inode *inode, u64 start, u64 end);
2085int btrfs_release_file(struct inode *inode, struct file *file);
2086
2087/* tree-defrag.c */
2088int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2089 struct btrfs_root *root, int cache_only);
2090
2091/* sysfs.c */
2092int btrfs_init_sysfs(void);
2093void btrfs_exit_sysfs(void);
2094int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2095int btrfs_sysfs_add_root(struct btrfs_root *root);
2096void btrfs_sysfs_del_root(struct btrfs_root *root);
2097void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2098
2099/* xattr.c */
2100ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2101
2102/* super.c */
2103u64 btrfs_parse_size(char *str);
2104int btrfs_parse_options(struct btrfs_root *root, char *options);
2105int btrfs_sync_fs(struct super_block *sb, int wait);
2106
2107/* acl.c */
2108int btrfs_check_acl(struct inode *inode, int mask);
2109int btrfs_init_acl(struct inode *inode, struct inode *dir);
2110int btrfs_acl_chmod(struct inode *inode);
2111
2112/* free-space-cache.c */
2113int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2114 u64 bytenr, u64 size);
2115int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2116 u64 offset, u64 bytes);
2117int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2118 u64 bytenr, u64 size);
2119int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2120 u64 offset, u64 bytes);
2121void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2122 *block_group);
2123struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2124 *block_group, u64 offset,
2125 u64 bytes);
2126void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2127 u64 bytes);
2128u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2129#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..926a0b287a7d
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
24/*
25 * insert a name into a directory, handling overflow properly if there is a
26 * hash collision. data_size indicates how big the inserted item should be.
27 * On success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
30 * The name is not copied into the dir item; the caller has to do that.
31 */
32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
33 *trans,
34 struct btrfs_root *root,
35 struct btrfs_path *path,
36 struct btrfs_key *cpu_key,
37 u32 data_size,
38 const char *name,
39 int name_len)
40{
41 int ret;
42 char *ptr;
43 struct btrfs_item *item;
44 struct extent_buffer *leaf;
45
46 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
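	/*
	 * -EEXIST means another entry hashed to this same key.  If the
	 * existing item already holds this exact name we bail out;
	 * otherwise we grow the item and append the new entry at its end.
	 */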
47 if (ret == -EEXIST) {
48 struct btrfs_dir_item *di;
49 di = btrfs_match_dir_item_name(root, path, name, name_len);
50 if (di)
51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 }
55 if (ret < 0)
56 return ERR_PTR(ret);
57 WARN_ON(ret > 0);
58 leaf = path->nodes[0];
59 item = btrfs_item_nr(leaf, path->slots[0]);
60 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
61 BUG_ON(data_size > btrfs_item_size(leaf, item));
62 ptr += btrfs_item_size(leaf, item) - data_size;
63 return (struct btrfs_dir_item *)ptr;
64}
65
66/*
67 * xattrs work a lot like directories; this inserts an xattr item
68 * into the tree
69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name,
72 u16 name_len, const void *data, u16 data_len,
73 u64 dir)
74{
75 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location;
80 struct btrfs_disk_key disk_key;
81 struct extent_buffer *leaf;
82 u32 data_size;
83
84 key.objectid = dir;
85 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
86 key.offset = btrfs_name_hash(name, name_len);
87	if (name_len + data_len + sizeof(struct btrfs_dir_item) >
88	    BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
89		return -ENOSPC;
90	path = btrfs_alloc_path();
91	if (!path)
92		return -ENOMEM;
93
94 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
96 name, name_len);
97 /*
98	 * FIXME: at some point we should handle xattrs that are larger than
99	 * what we can fit in our leaf. We zero out 'location' because we
100	 * aren't pointing at anything else; that will change if we store the
101	 * xattr data in a separate inode.
102 */
103 BUG_ON(IS_ERR(dir_item));
104 memset(&location, 0, sizeof(location));
105
106 leaf = path->nodes[0];
107 btrfs_cpu_key_to_disk(&disk_key, &location);
108 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
109 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
110 btrfs_set_dir_name_len(leaf, dir_item, name_len);
111 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
112 btrfs_set_dir_data_len(leaf, dir_item, data_len);
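	/* the name sits right after the dir_item header, the value after the name */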
113 name_ptr = (unsigned long)(dir_item + 1);
114 data_ptr = (unsigned long)((char *)name_ptr + name_len);
115
116 write_extent_buffer(leaf, name, name_ptr, name_len);
117 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]);
119
120 btrfs_free_path(path);
121 return ret;
122}
123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
132 *root, const char *name, int name_len, u64 dir,
133 struct btrfs_key *location, u8 type, u64 index)
134{
135 int ret = 0;
136 int ret2 = 0;
137 struct btrfs_path *path;
138 struct btrfs_dir_item *dir_item;
139 struct extent_buffer *leaf;
140 unsigned long name_ptr;
141 struct btrfs_key key;
142 struct btrfs_disk_key disk_key;
143 u32 data_size;
144
145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len);
148	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
149 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len);
152 if (IS_ERR(dir_item)) {
153 ret = PTR_ERR(dir_item);
154 if (ret == -EEXIST)
155 goto second_insert;
156 goto out;
157 }
158
159 leaf = path->nodes[0];
160 btrfs_cpu_key_to_disk(&disk_key, location);
161 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
162 btrfs_set_dir_type(leaf, dir_item, type);
163 btrfs_set_dir_data_len(leaf, dir_item, 0);
164 btrfs_set_dir_name_len(leaf, dir_item, name_len);
165 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
166 name_ptr = (unsigned long)(dir_item + 1);
167
168 write_extent_buffer(leaf, name, name_ptr, name_len);
169 btrfs_mark_buffer_dirty(leaf);
170
171second_insert:
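	/*
	 * insert a second copy of the entry, keyed by the directory index
	 * number, so readdir can return entries in insertion order instead
	 * of hash order
	 */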
172 /* FIXME, use some real flag for selecting the extra index */
173 if (root == root->fs_info->tree_root) {
174 ret = 0;
175 goto out;
176 }
177 btrfs_release_path(root, path);
178
179 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
180 key.offset = index;
181 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
182 name, name_len);
183 if (IS_ERR(dir_item)) {
184 ret2 = PTR_ERR(dir_item);
185 goto out;
186 }
187 leaf = path->nodes[0];
188 btrfs_cpu_key_to_disk(&disk_key, location);
189 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
190 btrfs_set_dir_type(leaf, dir_item, type);
191 btrfs_set_dir_data_len(leaf, dir_item, 0);
192 btrfs_set_dir_name_len(leaf, dir_item, name_len);
193 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
194 name_ptr = (unsigned long)(dir_item + 1);
195 write_extent_buffer(leaf, name, name_ptr, name_len);
196 btrfs_mark_buffer_dirty(leaf);
197out:
198 btrfs_free_path(path);
199 if (ret)
200 return ret;
201 if (ret2)
202 return ret2;
203 return 0;
204}
205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us if the caller plans on deleting
209 * the item (use mod < 0) or changing its options (use mod > 0)
210 */
211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
212 struct btrfs_root *root,
213 struct btrfs_path *path, u64 dir,
214 const char *name, int name_len,
215 int mod)
216{
217 int ret;
218 struct btrfs_key key;
219 int ins_len = mod < 0 ? -1 : 0;
220 int cow = mod != 0;
221 struct btrfs_key found_key;
222 struct extent_buffer *leaf;
223
224 key.objectid = dir;
225 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
226
227 key.offset = btrfs_name_hash(name, name_len);
228
229 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
230 if (ret < 0)
231 return ERR_PTR(ret);
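	/*
	 * ret > 0 means the exact key wasn't found and the path points just
	 * past where it would be; step back and check the previous item
	 * before scanning it for our name.
	 */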
232 if (ret > 0) {
233 if (path->slots[0] == 0)
234 return NULL;
235 path->slots[0]--;
236 }
237
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
240
241 if (found_key.objectid != dir ||
242 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
243 found_key.offset != key.offset)
244 return NULL;
245
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us if the caller plans on deleting
252 * the item (use mod < 0) or changing its options (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name you were
255 * looking for.
256 */
257struct btrfs_dir_item *
258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct btrfs_path *path, u64 dir,
261 u64 objectid, const char *name, int name_len,
262 int mod)
263{
264 int ret;
265 struct btrfs_key key;
266 int ins_len = mod < 0 ? -1 : 0;
267 int cow = mod != 0;
268
269 key.objectid = dir;
270 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
271 key.offset = objectid;
272
273 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
274 if (ret < 0)
275 return ERR_PTR(ret);
276 if (ret > 0)
277 return ERR_PTR(-ENOENT);
278 return btrfs_match_dir_item_name(root, path, name, name_len);
279}
280
281struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 struct btrfs_root *root,
283 struct btrfs_path *path, u64 dir,
284 const char *name, u16 name_len,
285 int mod)
286{
287 int ret;
288 struct btrfs_key key;
289 int ins_len = mod < 0 ? -1 : 0;
290 int cow = mod != 0;
291 struct btrfs_key found_key;
292 struct extent_buffer *leaf;
293
294 key.objectid = dir;
295 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
296 key.offset = btrfs_name_hash(name, name_len);
297 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
298 if (ret < 0)
299 return ERR_PTR(ret);
300 if (ret > 0) {
301 if (path->slots[0] == 0)
302 return NULL;
303 path->slots[0]--;
304 }
305
306 leaf = path->nodes[0];
307 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
308
309 if (found_key.objectid != dir ||
310 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
311 found_key.offset != key.offset)
312 return NULL;
313
314 return btrfs_match_dir_item_name(root, path, name, name_len);
315}
316
317/*
318 * helper function to look at the directory item pointed to by 'path'.
319 * this walks through all the entries in a dir item and finds the one
320 * for a specific name.
321 */
322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
323 struct btrfs_path *path,
324 const char *name, int name_len)
325{
326 struct btrfs_dir_item *dir_item;
327 unsigned long name_ptr;
328 u32 total_len;
329 u32 cur = 0;
330 u32 this_len;
331 struct extent_buffer *leaf;
332
333 leaf = path->nodes[0];
334 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
335 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
336 while (cur < total_len) {
337 this_len = sizeof(*dir_item) +
338 btrfs_dir_name_len(leaf, dir_item) +
339 btrfs_dir_data_len(leaf, dir_item);
340 name_ptr = (unsigned long)(dir_item + 1);
341
342 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
343 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
344 return dir_item;
345
346 cur += this_len;
347 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
348 this_len);
349 }
350 return NULL;
351}
352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
358 struct btrfs_root *root,
359 struct btrfs_path *path,
360 struct btrfs_dir_item *di)
361{
362
363 struct extent_buffer *leaf;
364 u32 sub_item_len;
365 u32 item_len;
366 int ret = 0;
367
368 leaf = path->nodes[0];
369 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
370 btrfs_dir_data_len(leaf, di);
371 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
372 if (sub_item_len == item_len) {
373 ret = btrfs_del_item(trans, root, path);
374 } else {
375		/* shift the following entries down over the deleted one */
376 unsigned long ptr = (unsigned long)di;
377 unsigned long start;
378
379 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
380 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
381 item_len - (ptr + sub_item_len - start));
382 ret = btrfs_truncate_item(trans, root, path,
383 item_len - sub_item_len, 1);
384 }
385	return ret;
386}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..81a313874ae5
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h>
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29#include <linux/freezer.h>
30#include "compat.h"
31#include "crc32c.h"
32#include "ctree.h"
33#include "disk-io.h"
34#include "transaction.h"
35#include "btrfs_inode.h"
36#include "volumes.h"
37#include "print-tree.h"
38#include "async-thread.h"
39#include "locking.h"
40#include "ref-cache.h"
41#include "tree-log.h"
42
43static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work);
45
46/*
47 * end_io_wq structs are used to do processing in task context when an IO is
48 * complete. This is used during reads to verify checksums, and it is used
49 * by writes to insert metadata for new file extents after IO is complete.
50 */
51struct end_io_wq {
52 struct bio *bio;
53 bio_end_io_t *end_io;
54 void *private;
55 struct btrfs_fs_info *info;
56 int error;
57 int metadata;
58 struct list_head list;
59 struct btrfs_work work;
60};
61
62/*
63 * async submit bios are used to offload expensive checksumming
64 * onto the worker threads. They checksum file and metadata bios
65 * just before they are sent down the IO stack.
66 */
67struct async_submit_bio {
68 struct inode *inode;
69 struct bio *bio;
70 struct list_head list;
71 extent_submit_bio_hook_t *submit_bio_start;
72 extent_submit_bio_hook_t *submit_bio_done;
73 int rw;
74 int mirror_num;
75 unsigned long bio_flags;
76 struct btrfs_work work;
77};
78
79/*
80 * extents on the btree inode are pretty simple; there's one extent
81 * that covers the entire device
82 */
83static struct extent_map *btree_get_extent(struct inode *inode,
84 struct page *page, size_t page_offset, u64 start, u64 len,
85 int create)
86{
87 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
88 struct extent_map *em;
89 int ret;
90
91 spin_lock(&em_tree->lock);
92 em = lookup_extent_mapping(em_tree, start, len);
93 if (em) {
94 em->bdev =
95 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
96 spin_unlock(&em_tree->lock);
97 goto out;
98 }
99 spin_unlock(&em_tree->lock);
100
101 em = alloc_extent_map(GFP_NOFS);
102 if (!em) {
103 em = ERR_PTR(-ENOMEM);
104 goto out;
105 }
106 em->start = 0;
107 em->len = (u64)-1;
108 em->block_len = (u64)-1;
109 em->block_start = 0;
110 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
111
112 spin_lock(&em_tree->lock);
113 ret = add_extent_mapping(em_tree, em);
114 if (ret == -EEXIST) {
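		/*
		 * someone else raced in and added the mapping; use the
		 * existing one if we can still look it up
		 */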
115 u64 failed_start = em->start;
116 u64 failed_len = em->len;
117
118 free_extent_map(em);
119 em = lookup_extent_mapping(em_tree, start, len);
120 if (em) {
121 ret = 0;
122 } else {
123 em = lookup_extent_mapping(em_tree, failed_start,
124 failed_len);
125 ret = -EIO;
126 }
127 } else if (ret) {
128 free_extent_map(em);
129 em = NULL;
130 }
131 spin_unlock(&em_tree->lock);
132
133 if (ret)
134 em = ERR_PTR(ret);
135out:
136 return em;
137}
138
139u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
140{
141 return btrfs_crc32c(seed, data, len);
142}
143
144void btrfs_csum_final(u32 crc, char *result)
145{
146 *(__le32 *)result = ~cpu_to_le32(crc);
147}
148
149/*
150 * compute the csum for a btree block, and either verify it or write it
151 * into the csum field of the block.
152 */
153static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
154 int verify)
155{
156 u16 csum_size =
157 btrfs_super_csum_size(&root->fs_info->super_copy);
158 char *result = NULL;
159 unsigned long len;
160 unsigned long cur_len;
161 unsigned long offset = BTRFS_CSUM_SIZE;
162 char *map_token = NULL;
163 char *kaddr;
164 unsigned long map_start;
165 unsigned long map_len;
166 int err;
167 u32 crc = ~(u32)0;
168 unsigned long inline_result;
169
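	/*
	 * the csum is stored in the first BTRFS_CSUM_SIZE bytes of the
	 * block; the data to be summed starts just past it
	 */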
170 len = buf->len - offset;
171 while (len > 0) {
172 err = map_private_extent_buffer(buf, offset, 32,
173 &map_token, &kaddr,
174 &map_start, &map_len, KM_USER0);
175 if (err)
176 return 1;
177 cur_len = min(len, map_len - (offset - map_start));
178 crc = btrfs_csum_data(root, kaddr + offset - map_start,
179 crc, cur_len);
180 len -= cur_len;
181 offset += cur_len;
182 unmap_extent_buffer(buf, map_token, KM_USER0);
183 }
184 if (csum_size > sizeof(inline_result)) {
185		result = kzalloc(csum_size, GFP_NOFS);
186 if (!result)
187 return 1;
188 } else {
189 result = (char *)&inline_result;
190 }
191
192 btrfs_csum_final(crc, result);
193
194 if (verify) {
195 if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
196 u32 val;
197 u32 found = 0;
198 memcpy(&found, result, csum_size);
199
200 read_extent_buffer(buf, &val, 0, csum_size);
201 printk(KERN_INFO "btrfs: %s checksum verify failed "
202 "on %llu wanted %X found %X level %d\n",
203 root->fs_info->sb->s_id,
204 buf->start, val, found, btrfs_header_level(buf));
205 if (result != (char *)&inline_result)
206 kfree(result);
207 return 1;
208 }
209 } else {
210 write_extent_buffer(buf, result, 0, csum_size);
211 }
212 if (result != (char *)&inline_result)
213 kfree(result);
214 return 0;
215}
216
217/*
218 * we can't consider a given block up to date unless the transid of the
219 * block matches the transid in the parent node's pointer. This is how we
220 * detect blocks that either didn't get written at all or got written
221 * in the wrong place.
222 */
223static int verify_parent_transid(struct extent_io_tree *io_tree,
224 struct extent_buffer *eb, u64 parent_transid)
225{
226 int ret;
227
228 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
229 return 0;
230
231 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
232 if (extent_buffer_uptodate(io_tree, eb) &&
233 btrfs_header_generation(eb) == parent_transid) {
234 ret = 0;
235 goto out;
236 }
237	printk(KERN_INFO "parent transid verify failed on %llu wanted %llu found %llu\n",
238 (unsigned long long)eb->start,
239 (unsigned long long)parent_transid,
240 (unsigned long long)btrfs_header_generation(eb));
241 ret = 1;
242 clear_extent_buffer_uptodate(io_tree, eb);
243out:
244 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
245 GFP_NOFS);
246 return ret;
247}
248
249/*
250 * helper to read a given tree block, doing retries as required when
251 * the checksums don't match and we have alternate mirrors to try.
252 */
253static int btree_read_extent_buffer_pages(struct btrfs_root *root,
254 struct extent_buffer *eb,
255 u64 start, u64 parent_transid)
256{
257 struct extent_io_tree *io_tree;
258 int ret;
259 int num_copies = 0;
260 int mirror_num = 0;
261
262 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
263 while (1) {
264 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
265 btree_get_extent, mirror_num);
266 if (!ret &&
267 !verify_parent_transid(io_tree, eb, parent_transid))
268 return ret;
269
270 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
271 eb->start, eb->len);
272 if (num_copies == 1)
273 return ret;
274
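		/* this copy failed to verify, try the next mirror */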
275 mirror_num++;
276 if (mirror_num > num_copies)
277 return ret;
278 }
279 return -EIO;
280}
281
282/*
283 * checksum a dirty tree block before IO. This has extra checks to make sure
284 * we only fill in the checksum field in the first page of a multi-page block
285 */
286
287static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
288{
289 struct extent_io_tree *tree;
290 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
291 u64 found_start;
292 int found_level;
293 unsigned long len;
294 struct extent_buffer *eb;
295 int ret;
296
297 tree = &BTRFS_I(page->mapping->host)->io_tree;
298
299 if (page->private == EXTENT_PAGE_PRIVATE)
300 goto out;
301 if (!page->private)
302 goto out;
303 len = page->private >> 2;
304 WARN_ON(len == 0);
305
306 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
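	/*
	 * read in any pages of this block beyond the first so the whole
	 * block is in ram before the csum is computed
	 */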
307 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
308 btrfs_header_generation(eb));
309 BUG_ON(ret);
310 found_start = btrfs_header_bytenr(eb);
311 if (found_start != start) {
312 WARN_ON(1);
313 goto err;
314 }
315 if (eb->first_page != page) {
316 WARN_ON(1);
317 goto err;
318 }
319 if (!PageUptodate(page)) {
320 WARN_ON(1);
321 goto err;
322 }
323 found_level = btrfs_header_level(eb);
324
325 csum_tree_block(root, eb, 0);
326err:
327 free_extent_buffer(eb);
328out:
329 return 0;
330}
331
332static int check_tree_block_fsid(struct btrfs_root *root,
333 struct extent_buffer *eb)
334{
335 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
336 u8 fsid[BTRFS_UUID_SIZE];
337 int ret = 1;
338
339 read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
340 BTRFS_FSID_SIZE);
341 while (fs_devices) {
342 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
343 ret = 0;
344 break;
345 }
346 fs_devices = fs_devices->seed;
347 }
348 return ret;
349}
350
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state)
353{
354 struct extent_io_tree *tree;
355 u64 found_start;
356 int found_level;
357 unsigned long len;
358 struct extent_buffer *eb;
359 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
360 int ret = 0;
361
362 tree = &BTRFS_I(page->mapping->host)->io_tree;
363 if (page->private == EXTENT_PAGE_PRIVATE)
364 goto out;
365 if (!page->private)
366 goto out;
367
368 len = page->private >> 2;
369 WARN_ON(len == 0);
370
371 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
372
373 found_start = btrfs_header_bytenr(eb);
374 if (found_start != start) {
375 printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
376 (unsigned long long)found_start,
377 (unsigned long long)eb->start);
378 ret = -EIO;
379 goto err;
380 }
381 if (eb->first_page != page) {
382 printk(KERN_INFO "btrfs bad first page %lu %lu\n",
383 eb->first_page->index, page->index);
384 WARN_ON(1);
385 ret = -EIO;
386 goto err;
387 }
388 if (check_tree_block_fsid(root, eb)) {
389 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
390 (unsigned long long)eb->start);
391 ret = -EIO;
392 goto err;
393 }
394 found_level = btrfs_header_level(eb);
395
396 ret = csum_tree_block(root, eb, 1);
397 if (ret)
398 ret = -EIO;
399
400 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
401 end = eb->start + end - 1;
402err:
403 free_extent_buffer(eb);
404out:
405 return ret;
406}
407
408static void end_workqueue_bio(struct bio *bio, int err)
409{
410 struct end_io_wq *end_io_wq = bio->bi_private;
411 struct btrfs_fs_info *fs_info;
412
413 fs_info = end_io_wq->info;
414 end_io_wq->error = err;
415 end_io_wq->work.func = end_workqueue_fn;
416 end_io_wq->work.flags = 0;
417
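	/*
	 * hand the completion off to the right helper queue: writes and
	 * reads use separate pools, as do metadata and data bios
	 */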
418 if (bio->bi_rw & (1 << BIO_RW)) {
419 if (end_io_wq->metadata)
420 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
421 &end_io_wq->work);
422 else
423 btrfs_queue_worker(&fs_info->endio_write_workers,
424 &end_io_wq->work);
425 } else {
426 if (end_io_wq->metadata)
427 btrfs_queue_worker(&fs_info->endio_meta_workers,
428 &end_io_wq->work);
429 else
430 btrfs_queue_worker(&fs_info->endio_workers,
431 &end_io_wq->work);
432 }
433}
434
435int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
436 int metadata)
437{
438 struct end_io_wq *end_io_wq;
439 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
440 if (!end_io_wq)
441 return -ENOMEM;
442
443 end_io_wq->private = bio->bi_private;
444 end_io_wq->end_io = bio->bi_end_io;
445 end_io_wq->info = info;
446 end_io_wq->error = 0;
447 end_io_wq->bio = bio;
448 end_io_wq->metadata = metadata;
449
450 bio->bi_private = end_io_wq;
451 bio->bi_end_io = end_workqueue_bio;
452 return 0;
453}
454
455unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
456{
457 unsigned long limit = min_t(unsigned long,
458 info->workers.max_workers,
459 info->fs_devices->open_devices);
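	/* allow about 256 async bios in flight for each worker we can keep busy */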
460 return 256 * limit;
461}
462
463int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
464{
465 return atomic_read(&info->nr_async_bios) >
466 btrfs_async_submit_limit(info);
467}
468
469static void run_one_async_start(struct btrfs_work *work)
470{
471 struct btrfs_fs_info *fs_info;
472 struct async_submit_bio *async;
473
474 async = container_of(work, struct async_submit_bio, work);
475 fs_info = BTRFS_I(async->inode)->root->fs_info;
476 async->submit_bio_start(async->inode, async->rw, async->bio,
477 async->mirror_num, async->bio_flags);
478}
479
480static void run_one_async_done(struct btrfs_work *work)
481{
482 struct btrfs_fs_info *fs_info;
483 struct async_submit_bio *async;
484 int limit;
485
486 async = container_of(work, struct async_submit_bio, work);
487 fs_info = BTRFS_I(async->inode)->root->fs_info;
488
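	/*
	 * wake up anyone waiting for the async queue to drain once we've
	 * dropped below about 2/3 of the limit
	 */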
489 limit = btrfs_async_submit_limit(fs_info);
490 limit = limit * 2 / 3;
491
492 atomic_dec(&fs_info->nr_async_submits);
493
494 if (atomic_read(&fs_info->nr_async_submits) < limit &&
495 waitqueue_active(&fs_info->async_submit_wait))
496 wake_up(&fs_info->async_submit_wait);
497
498 async->submit_bio_done(async->inode, async->rw, async->bio,
499 async->mirror_num, async->bio_flags);
500}
501
502static void run_one_async_free(struct btrfs_work *work)
503{
504 struct async_submit_bio *async;
505
506 async = container_of(work, struct async_submit_bio, work);
507 kfree(async);
508}
509
510int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
511 int rw, struct bio *bio, int mirror_num,
512 unsigned long bio_flags,
513 extent_submit_bio_hook_t *submit_bio_start,
514 extent_submit_bio_hook_t *submit_bio_done)
515{
516 struct async_submit_bio *async;
517
518 async = kmalloc(sizeof(*async), GFP_NOFS);
519 if (!async)
520 return -ENOMEM;
521
522 async->inode = inode;
523 async->rw = rw;
524 async->bio = bio;
525 async->mirror_num = mirror_num;
526 async->submit_bio_start = submit_bio_start;
527 async->submit_bio_done = submit_bio_done;
528
529 async->work.func = run_one_async_start;
530 async->work.ordered_func = run_one_async_done;
531 async->work.ordered_free = run_one_async_free;
532
533 async->work.flags = 0;
534 async->bio_flags = bio_flags;
535
536 atomic_inc(&fs_info->nr_async_submits);
537 btrfs_queue_worker(&fs_info->workers, &async->work);
538#if 0
539 int limit = btrfs_async_submit_limit(fs_info);
540 if (atomic_read(&fs_info->nr_async_submits) > limit) {
541 wait_event_timeout(fs_info->async_submit_wait,
542 (atomic_read(&fs_info->nr_async_submits) < limit),
543 HZ/10);
544
545 wait_event_timeout(fs_info->async_submit_wait,
546 (atomic_read(&fs_info->nr_async_bios) < limit),
547 HZ/10);
548 }
549#endif
550 while (atomic_read(&fs_info->async_submit_draining) &&
551 atomic_read(&fs_info->nr_async_submits)) {
552 wait_event(fs_info->async_submit_wait,
553 (atomic_read(&fs_info->nr_async_submits) == 0));
554 }
555
556 return 0;
557}
558
559static int btree_csum_one_bio(struct bio *bio)
560{
561 struct bio_vec *bvec = bio->bi_io_vec;
562 int bio_index = 0;
563 struct btrfs_root *root;
564
565 WARN_ON(bio->bi_vcnt <= 0);
566 while (bio_index < bio->bi_vcnt) {
567 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
568 csum_dirty_buffer(root, bvec->bv_page);
569 bio_index++;
570 bvec++;
571 }
572 return 0;
573}
574
575static int __btree_submit_bio_start(struct inode *inode, int rw,
576 struct bio *bio, int mirror_num,
577 unsigned long bio_flags)
578{
579 /*
580	 * when we're called for a write, we're already in the async
581	 * submission context.  csum the bio now; it gets mapped and
	 * submitted later by __btree_submit_bio_done
582 */
583 btree_csum_one_bio(bio);
584 return 0;
585}
586
587static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
588 int mirror_num, unsigned long bio_flags)
589{
590 /*
591 * when we're called for a write, we're already in the async
592 * submission context. Just jump into btrfs_map_bio
593 */
594 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
595}
596
597static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
598 int mirror_num, unsigned long bio_flags)
599{
600 int ret;
601
602 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
603 bio, 1);
604 BUG_ON(ret);
605
606 if (!(rw & (1 << BIO_RW))) {
607 /*
608 * called for a read, do the setup so that checksum validation
609 * can happen in the async kernel threads
610 */
611 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
612 mirror_num, 0);
613 }
614 /*
615 * kthread helpers are used to submit writes so that checksumming
616 * can happen in parallel across all CPUs
617 */
618 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
619 inode, rw, bio, mirror_num, 0,
620 __btree_submit_bio_start,
621 __btree_submit_bio_done);
622}
623
624static int btree_writepage(struct page *page, struct writeback_control *wbc)
625{
626 struct extent_io_tree *tree;
627 tree = &BTRFS_I(page->mapping->host)->io_tree;
628
629 if (current->flags & PF_MEMALLOC) {
630 redirty_page_for_writepage(wbc, page);
631 unlock_page(page);
632 return 0;
633 }
634 return extent_write_full_page(tree, page, btree_get_extent, wbc);
635}
636
637static int btree_writepages(struct address_space *mapping,
638 struct writeback_control *wbc)
639{
640 struct extent_io_tree *tree;
641 tree = &BTRFS_I(mapping->host)->io_tree;
642 if (wbc->sync_mode == WB_SYNC_NONE) {
643 u64 num_dirty;
644 u64 start = 0;
645 unsigned long thresh = 32 * 1024 * 1024;
646
647 if (wbc->for_kupdate)
648 return 0;
649
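		/* don't bother writing back until enough btree pages are dirty */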
650 num_dirty = count_range_bits(tree, &start, (u64)-1,
651 thresh, EXTENT_DIRTY);
652 if (num_dirty < thresh)
653 return 0;
654 }
655 return extent_writepages(tree, mapping, btree_get_extent, wbc);
656}
657
658static int btree_readpage(struct file *file, struct page *page)
659{
660 struct extent_io_tree *tree;
661 tree = &BTRFS_I(page->mapping->host)->io_tree;
662 return extent_read_full_page(tree, page, btree_get_extent);
663}
664
665static int btree_releasepage(struct page *page, gfp_t gfp_flags)
666{
667 struct extent_io_tree *tree;
668 struct extent_map_tree *map;
669 int ret;
670
671 if (PageWriteback(page) || PageDirty(page))
672 return 0;
673
674 tree = &BTRFS_I(page->mapping->host)->io_tree;
675 map = &BTRFS_I(page->mapping->host)->extent_tree;
676
677 ret = try_release_extent_state(map, tree, page, gfp_flags);
678 if (!ret)
679 return 0;
680
681 ret = try_release_extent_buffer(tree, page);
682 if (ret == 1) {
683 ClearPagePrivate(page);
684 set_page_private(page, 0);
685 page_cache_release(page);
686 }
687
688 return ret;
689}
690
691static void btree_invalidatepage(struct page *page, unsigned long offset)
692{
693 struct extent_io_tree *tree;
694 tree = &BTRFS_I(page->mapping->host)->io_tree;
695 extent_invalidatepage(tree, page, offset);
696 btree_releasepage(page, GFP_NOFS);
697 if (PagePrivate(page)) {
698 printk(KERN_WARNING "btrfs warning page private not zero "
699 "on page %llu\n", (unsigned long long)page_offset(page));
700 ClearPagePrivate(page);
701 set_page_private(page, 0);
702 page_cache_release(page);
703 }
704}
705
706#if 0
707static int btree_writepage(struct page *page, struct writeback_control *wbc)
708{
709 struct buffer_head *bh;
710 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
711 struct buffer_head *head;
712 if (!page_has_buffers(page)) {
713 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
714 (1 << BH_Dirty)|(1 << BH_Uptodate));
715 }
716 head = page_buffers(page);
717 bh = head;
718 do {
719 if (buffer_dirty(bh))
720 csum_tree_block(root, bh, 0);
721 bh = bh->b_this_page;
722 } while (bh != head);
723 return block_write_full_page(page, btree_get_block, wbc);
724}
725#endif
726
727static struct address_space_operations btree_aops = {
728 .readpage = btree_readpage,
729 .writepage = btree_writepage,
730 .writepages = btree_writepages,
731 .releasepage = btree_releasepage,
732 .invalidatepage = btree_invalidatepage,
733 .sync_page = block_sync_page,
734};
735
736int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
737 u64 parent_transid)
738{
739 struct extent_buffer *buf = NULL;
740 struct inode *btree_inode = root->fs_info->btree_inode;
741 int ret = 0;
742
743 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
744 if (!buf)
745 return 0;
746 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
747 buf, 0, 0, btree_get_extent, 0);
748 free_extent_buffer(buf);
749 return ret;
750}
751
752struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
753 u64 bytenr, u32 blocksize)
754{
755 struct inode *btree_inode = root->fs_info->btree_inode;
756 struct extent_buffer *eb;
757 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
758 bytenr, blocksize, GFP_NOFS);
759 return eb;
760}
761
762struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
763 u64 bytenr, u32 blocksize)
764{
765 struct inode *btree_inode = root->fs_info->btree_inode;
766 struct extent_buffer *eb;
767
768 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
769 bytenr, blocksize, NULL, GFP_NOFS);
770 return eb;
771}
772
773
774int btrfs_write_tree_block(struct extent_buffer *buf)
775{
776 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
777 buf->start + buf->len - 1, WB_SYNC_ALL);
778}
779
780int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
781{
782 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
783 buf->start, buf->start + buf->len - 1);
784}
785
786struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
787 u32 blocksize, u64 parent_transid)
788{
789 struct extent_buffer *buf = NULL;
790 struct inode *btree_inode = root->fs_info->btree_inode;
791 struct extent_io_tree *io_tree;
792 int ret;
793
794 io_tree = &BTRFS_I(btree_inode)->io_tree;
795
796 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
797 if (!buf)
798 return NULL;
799
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801
802 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE;
804 else
805 WARN_ON(1);
806 return buf;
807
808}
809
810int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
811 struct extent_buffer *buf)
812{
813 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf));
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf);
819 }
820 return 0;
821}
822
823static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
824 u32 stripesize, struct btrfs_root *root,
825 struct btrfs_fs_info *fs_info,
826 u64 objectid)
827{
828 root->node = NULL;
829 root->commit_root = NULL;
830 root->ref_tree = NULL;
831 root->sectorsize = sectorsize;
832 root->nodesize = nodesize;
833 root->leafsize = leafsize;
834 root->stripesize = stripesize;
835 root->ref_cows = 0;
836 root->track_dirty = 0;
837
838 root->fs_info = fs_info;
839 root->objectid = objectid;
840 root->last_trans = 0;
841 root->highest_inode = 0;
842 root->last_inode_alloc = 0;
843 root->name = NULL;
844 root->in_sysfs = 0;
845
846 INIT_LIST_HEAD(&root->dirty_list);
847 INIT_LIST_HEAD(&root->orphan_list);
848 INIT_LIST_HEAD(&root->dead_list);
849 spin_lock_init(&root->node_lock);
850 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex);
853 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS);
855
856 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
857 root->ref_tree = &root->ref_tree_struct;
858
859 memset(&root->root_key, 0, sizeof(root->root_key));
860 memset(&root->root_item, 0, sizeof(root->root_item));
861 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
862 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
863 root->defrag_trans_start = fs_info->generation;
864 init_completion(&root->kobj_unregister);
865 root->defrag_running = 0;
866 root->defrag_level = 0;
867 root->root_key.objectid = objectid;
868 root->anon_super.s_root = NULL;
869 root->anon_super.s_dev = 0;
870 INIT_LIST_HEAD(&root->anon_super.s_list);
871 INIT_LIST_HEAD(&root->anon_super.s_instances);
872 init_rwsem(&root->anon_super.s_umount);
873
874 return 0;
875}
876
877static int find_and_setup_root(struct btrfs_root *tree_root,
878 struct btrfs_fs_info *fs_info,
879 u64 objectid,
880 struct btrfs_root *root)
881{
882 int ret;
883 u32 blocksize;
884 u64 generation;
885
886 __setup_root(tree_root->nodesize, tree_root->leafsize,
887 tree_root->sectorsize, tree_root->stripesize,
888 root, fs_info, objectid);
889 ret = btrfs_find_last_root(tree_root, objectid,
890 &root->root_item, &root->root_key);
891 BUG_ON(ret);
892
893 generation = btrfs_root_generation(&root->root_item);
894 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
895 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
896 blocksize, generation);
897 BUG_ON(!root->node);
898 return 0;
899}
900
901int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
902 struct btrfs_fs_info *fs_info)
903{
904 struct extent_buffer *eb;
905 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
906 u64 start = 0;
907 u64 end = 0;
908 int ret;
909
910 if (!log_root_tree)
911 return 0;
912
913 while (1) {
914 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
915 0, &start, &end, EXTENT_DIRTY);
916 if (ret)
917 break;
918
919 clear_extent_dirty(&log_root_tree->dirty_log_pages,
920 start, end, GFP_NOFS);
921 }
922 eb = fs_info->log_root_tree->node;
923
924 WARN_ON(btrfs_header_level(eb) != 0);
925 WARN_ON(btrfs_header_nritems(eb) != 0);
926
927 ret = btrfs_free_reserved_extent(fs_info->tree_root,
928 eb->start, eb->len);
929 BUG_ON(ret);
930
931 free_extent_buffer(eb);
932 kfree(fs_info->log_root_tree);
933 fs_info->log_root_tree = NULL;
934 return 0;
935}
936
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info)
939{
940 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root;
942
943 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root)
945 return -ENOMEM;
946
947 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize,
949 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
950
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
954 root->ref_cows = 0;
955
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0);
959
960 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start);
963 btrfs_set_header_generation(root->node, trans->transid);
964 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
965
966 write_extent_buffer(root->node, root->fs_info->fsid,
967 (unsigned long)btrfs_header_fsid(root->node),
968 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root;
972 return 0;
973}
974
975struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
976 struct btrfs_key *location)
977{
978 struct btrfs_root *root;
979 struct btrfs_fs_info *fs_info = tree_root->fs_info;
980 struct btrfs_path *path;
981 struct extent_buffer *l;
982 u64 highest_inode;
983 u64 generation;
984 u32 blocksize;
985 int ret = 0;
986
987 root = kzalloc(sizeof(*root), GFP_NOFS);
988 if (!root)
989 return ERR_PTR(-ENOMEM);
990 if (location->offset == (u64)-1) {
991 ret = find_and_setup_root(tree_root, fs_info,
992 location->objectid, root);
993 if (ret) {
994 kfree(root);
995 return ERR_PTR(ret);
996 }
997 goto insert;
998 }
999
1000 __setup_root(tree_root->nodesize, tree_root->leafsize,
1001 tree_root->sectorsize, tree_root->stripesize,
1002 root, fs_info, location->objectid);
1003
1004 path = btrfs_alloc_path();
1005 BUG_ON(!path);
1006 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1007 if (ret != 0) {
1008 if (ret > 0)
1009 ret = -ENOENT;
1010 goto out;
1011 }
1012 l = path->nodes[0];
1013 read_extent_buffer(l, &root->root_item,
1014 btrfs_item_ptr_offset(l, path->slots[0]),
1015 sizeof(root->root_item));
1016 memcpy(&root->root_key, location, sizeof(*location));
1017 ret = 0;
1018out:
1019 btrfs_release_path(root, path);
1020 btrfs_free_path(path);
1021 if (ret) {
1022 kfree(root);
1023 return ERR_PTR(ret);
1024 }
1025 generation = btrfs_root_generation(&root->root_item);
1026 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1027 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1028 blocksize, generation);
1029 BUG_ON(!root->node);
1030insert:
1031 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1032 root->ref_cows = 1;
1033 ret = btrfs_find_highest_inode(root, &highest_inode);
1034 if (ret == 0) {
1035 root->highest_inode = highest_inode;
1036 root->last_inode_alloc = highest_inode;
1037 }
1038 }
1039 return root;
1040}
1041
1042struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1043 u64 root_objectid)
1044{
1045 struct btrfs_root *root;
1046
1047 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1048 return fs_info->tree_root;
1049 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1050 return fs_info->extent_root;
1051
1052 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1053 (unsigned long)root_objectid);
1054 return root;
1055}
1056
1057struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1058 struct btrfs_key *location)
1059{
1060 struct btrfs_root *root;
1061 int ret;
1062
1063 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1064 return fs_info->tree_root;
1065 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1066 return fs_info->extent_root;
1067 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1068 return fs_info->chunk_root;
1069 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1070 return fs_info->dev_root;
1071 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1072 return fs_info->csum_root;
1073
1074 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1075 (unsigned long)location->objectid);
1076 if (root)
1077 return root;
1078
1079 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1080 if (IS_ERR(root))
1081 return root;
1082
1083 set_anon_super(&root->anon_super, NULL);
1084
1085 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1086 (unsigned long)root->root_key.objectid,
1087 root);
1088 if (ret) {
1089 free_extent_buffer(root->node);
1090 kfree(root);
1091 return ERR_PTR(ret);
1092 }
1093 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1094 ret = btrfs_find_dead_roots(fs_info->tree_root,
1095 root->root_key.objectid, root);
1096 BUG_ON(ret);
1097 btrfs_orphan_cleanup(root);
1098 }
1099 return root;
1100}
1101
1102struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1103 struct btrfs_key *location,
1104 const char *name, int namelen)
1105{
1106 struct btrfs_root *root;
1107 int ret;
1108
1109 root = btrfs_read_fs_root_no_name(fs_info, location);
1110 if (!root)
1111 return NULL;
1112
1113 if (root->in_sysfs)
1114 return root;
1115
1116 ret = btrfs_set_root_name(root, name, namelen);
1117 if (ret) {
1118 free_extent_buffer(root->node);
1119 kfree(root);
1120 return ERR_PTR(ret);
1121 }
1122#if 0
1123 ret = btrfs_sysfs_add_root(root);
1124 if (ret) {
1125 free_extent_buffer(root->node);
1126 kfree(root->name);
1127 kfree(root);
1128 return ERR_PTR(ret);
1129 }
1130#endif
1131 root->in_sysfs = 1;
1132 return root;
1133}
1134
1135static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device;
1141 struct backing_dev_info *bdi;
1142#if 0
1143 if ((bdi_bits & (1 << BDI_write_congested)) &&
1144 btrfs_congested_async(info, 0))
1145 return 1;
1146#endif
1147 list_for_each(cur, &info->fs_devices->devices) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev)
1150 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev);
1152 if (bdi && bdi_congested(bdi, bdi_bits)) {
1153 ret = 1;
1154 break;
1155 }
1156 }
1157 return ret;
1158}
1159
1160/*
1161 * this unplugs every device on the box; it is meant for the case
1162 * where page is null
1163 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{
1166 struct list_head *cur;
1167 struct btrfs_device *device;
1168 struct btrfs_fs_info *info;
1169
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev)
1174 continue;
1175
1176 bdi = blk_get_backing_dev_info(device->bdev);
1177 if (bdi->unplug_io_fn)
1178 bdi->unplug_io_fn(bdi, page);
1179 }
1180}
1181
1182static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1183{
1184 struct inode *inode;
1185 struct extent_map_tree *em_tree;
1186 struct extent_map *em;
1187 struct address_space *mapping;
1188 u64 offset;
1189
1190 /* the generic O_DIRECT read code does this */
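	/* the per-page path below is disabled for now; always unplug everything */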
1191 if (1 || !page) {
1192 __unplug_io_fn(bdi, page);
1193 return;
1194 }
1195
1196 /*
1197 * page->mapping may change at any time. Get a consistent copy
1198 * and use that for everything below
1199 */
1200 smp_mb();
1201 mapping = page->mapping;
1202 if (!mapping)
1203 return;
1204
1205 inode = mapping->host;
1206
1207 /*
1208 * don't do the expensive searching for a small number of
1209 * devices
1210 */
1211 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1212 __unplug_io_fn(bdi, page);
1213 return;
1214 }
1215
1216 offset = page_offset(page);
1217
1218 em_tree = &BTRFS_I(inode)->extent_tree;
1219 spin_lock(&em_tree->lock);
1220 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1221 spin_unlock(&em_tree->lock);
1222 if (!em) {
1223 __unplug_io_fn(bdi, page);
1224 return;
1225 }
1226
1227 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1228 free_extent_map(em);
1229 __unplug_io_fn(bdi, page);
1230 return;
1231 }
1232 offset = offset - em->start;
1233 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1234 em->block_start + offset, page);
1235 free_extent_map(em);
1236}
1237
1238static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1239{
1240 bdi_init(bdi);
1241 bdi->ra_pages = default_backing_dev_info.ra_pages;
1242 bdi->state = 0;
1243 bdi->capabilities = default_backing_dev_info.capabilities;
1244 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1245 bdi->unplug_io_data = info;
1246 bdi->congested_fn = btrfs_congested_fn;
1247 bdi->congested_data = info;
1248 return 0;
1249}
1250
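/*
 * a tree block can span multiple pages, and so multiple bios.  Decide if
 * this bio contains (or completes) everything needed to checksum the
 * block it touches.
 */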
1251static int bio_ready_for_csum(struct bio *bio)
1252{
1253 u64 length = 0;
1254 u64 buf_len = 0;
1255 u64 start = 0;
1256 struct page *page;
1257 struct extent_io_tree *io_tree = NULL;
1258 struct btrfs_fs_info *info = NULL;
1259 struct bio_vec *bvec;
1260 int i;
1261 int ret;
1262
1263 bio_for_each_segment(bvec, bio, i) {
1264 page = bvec->bv_page;
1265 if (page->private == EXTENT_PAGE_PRIVATE) {
1266 length += bvec->bv_len;
1267 continue;
1268 }
1269 if (!page->private) {
1270 length += bvec->bv_len;
1271 continue;
1272 }
1273 length = bvec->bv_len;
1274 buf_len = page->private >> 2;
1275 start = page_offset(page) + bvec->bv_offset;
1276 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1277 info = BTRFS_I(page->mapping->host)->root->fs_info;
1278 }
1279 /* are we fully contained in this bio? */
1280 if (buf_len <= length)
1281 return 1;
1282
1283 ret = extent_range_uptodate(io_tree, start + length,
1284 start + buf_len - 1);
1285	return ret;
1288}
1289
1290/*
1291 * called by the kthread helper functions to finally call the bio end_io
1292 * functions. This is where read checksum verification actually happens
1293 */
1294static void end_workqueue_fn(struct btrfs_work *work)
1295{
1296 struct bio *bio;
1297 struct end_io_wq *end_io_wq;
1298 struct btrfs_fs_info *fs_info;
1299 int error;
1300
1301 end_io_wq = container_of(work, struct end_io_wq, work);
1302 bio = end_io_wq->bio;
1303 fs_info = end_io_wq->info;
1304
1305 /* metadata bio reads are special because the whole tree block must
1306 * be checksummed at once. This makes sure the entire block is in
1307 * ram and up to date before trying to verify things. For
1308 * blocksize <= pagesize, it is basically a noop
1309 */
1310 if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
1311 !bio_ready_for_csum(bio)) {
1312 btrfs_queue_worker(&fs_info->endio_meta_workers,
1313 &end_io_wq->work);
1314 return;
1315 }
1316 error = end_io_wq->error;
1317 bio->bi_private = end_io_wq->private;
1318 bio->bi_end_io = end_io_wq->end_io;
1319 kfree(end_io_wq);
1320 bio_endio(bio, error);
1321}
1322
1323static int cleaner_kthread(void *arg)
1324{
1325 struct btrfs_root *root = arg;
1326
1327 do {
1328 smp_mb();
1329 if (root->fs_info->closing)
1330 break;
1331
1332 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1333 mutex_lock(&root->fs_info->cleaner_mutex);
1334 btrfs_clean_old_snapshots(root);
1335 mutex_unlock(&root->fs_info->cleaner_mutex);
1336
1337 if (freezing(current)) {
1338 refrigerator();
1339 } else {
1340 smp_mb();
1341 if (root->fs_info->closing)
1342 break;
1343 set_current_state(TASK_INTERRUPTIBLE);
1344 schedule();
1345 __set_current_state(TASK_RUNNING);
1346 }
1347 } while (!kthread_should_stop());
1348 return 0;
1349}
1350
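/*
 * kick off a commit of the running transaction roughly every 30 seconds
 */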
1351static int transaction_kthread(void *arg)
1352{
1353 struct btrfs_root *root = arg;
1354 struct btrfs_trans_handle *trans;
1355 struct btrfs_transaction *cur;
1356 unsigned long now;
1357 unsigned long delay;
1358 int ret;
1359
1360 do {
1361 smp_mb();
1362 if (root->fs_info->closing)
1363 break;
1364
1365 delay = HZ * 30;
1366 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1367 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1368
1369 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1370 printk(KERN_INFO "btrfs: total reference cache "
1371 "size %llu\n",
1372 root->fs_info->total_ref_cache_size);
1373 }
1374
1375 mutex_lock(&root->fs_info->trans_mutex);
1376 cur = root->fs_info->running_transaction;
1377 if (!cur) {
1378 mutex_unlock(&root->fs_info->trans_mutex);
1379 goto sleep;
1380 }
1381
1382 now = get_seconds();
1383 if (now < cur->start_time || now - cur->start_time < 30) {
1384 mutex_unlock(&root->fs_info->trans_mutex);
1385 delay = HZ * 5;
1386 goto sleep;
1387 }
1388 mutex_unlock(&root->fs_info->trans_mutex);
1389 trans = btrfs_start_transaction(root, 1);
1390 ret = btrfs_commit_transaction(trans, root);
1391sleep:
1392 wake_up_process(root->fs_info->cleaner_kthread);
1393 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1394
1395 if (freezing(current)) {
1396 refrigerator();
1397 } else {
1398 if (root->fs_info->closing)
1399 break;
1400 set_current_state(TASK_INTERRUPTIBLE);
1401 schedule_timeout(delay);
1402 __set_current_state(TASK_RUNNING);
1403 }
1404 } while (!kthread_should_stop());
1405 return 0;
1406}
1407
1408struct btrfs_root *open_ctree(struct super_block *sb,
1409 struct btrfs_fs_devices *fs_devices,
1410 char *options)
1411{
1412 u32 sectorsize;
1413 u32 nodesize;
1414 u32 leafsize;
1415 u32 blocksize;
1416 u32 stripesize;
1417 u64 generation;
1418 u64 features;
1419 struct btrfs_key location;
1420 struct buffer_head *bh;
1421 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1422 GFP_NOFS);
1423 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1424 GFP_NOFS);
1425 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1426 GFP_NOFS);
1427 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1428 GFP_NOFS);
1429 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1430 GFP_NOFS);
1431 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1432 GFP_NOFS);
1433 struct btrfs_root *log_tree_root;
1434
1435 int ret;
1436 int err = -EINVAL;
1437
1438 struct btrfs_super_block *disk_super;
1439
1440 if (!extent_root || !tree_root || !fs_info ||
1441 !chunk_root || !dev_root || !csum_root) {
1442 err = -ENOMEM;
1443 goto fail;
1444 }
1445 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1446 INIT_LIST_HEAD(&fs_info->trans_list);
1447 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock);
1454
1455 init_completion(&fs_info->kobj_unregister);
1456 fs_info->tree_root = tree_root;
1457 fs_info->extent_root = extent_root;
1458 fs_info->csum_root = csum_root;
1459 fs_info->chunk_root = chunk_root;
1460 fs_info->dev_root = dev_root;
1461 fs_info->fs_devices = fs_devices;
1462 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1463 INIT_LIST_HEAD(&fs_info->space_info);
1464 btrfs_mapping_init(&fs_info->mapping_tree);
1465 atomic_set(&fs_info->nr_async_submits, 0);
1466 atomic_set(&fs_info->async_delalloc_pages, 0);
1467 atomic_set(&fs_info->async_submit_draining, 0);
1468 atomic_set(&fs_info->nr_async_bios, 0);
1469 atomic_set(&fs_info->throttles, 0);
1470 atomic_set(&fs_info->throttle_gen, 0);
1471 fs_info->sb = sb;
1472 fs_info->max_extent = (u64)-1;
1473 fs_info->max_inline = 8192 * 1024;
1474 setup_bdi(fs_info, &fs_info->bdi);
1475 fs_info->btree_inode = new_inode(sb);
1476 fs_info->btree_inode->i_ino = 1;
1477 fs_info->btree_inode->i_nlink = 1;
1478
1479 fs_info->thread_pool_size = min_t(unsigned long,
1480 num_online_cpus() + 2, 8);
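	/* e.g. a 4-cpu box gets 6 pool threads; 6 or more cpus cap out at 8 */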
1481
1482 INIT_LIST_HEAD(&fs_info->ordered_extents);
1483 spin_lock_init(&fs_info->ordered_extent_lock);
1484
1485 sb->s_blocksize = 4096;
1486 sb->s_blocksize_bits = blksize_bits(4096);
1487
1488 /*
1489	 * we set the i_size on the btree inode to the max possible offset.
1490 * the real end of the address space is determined by all of
1491 * the devices in the system
1492 */
1493 fs_info->btree_inode->i_size = OFFSET_MAX;
1494 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1495 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1496
1497 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1498 fs_info->btree_inode->i_mapping,
1499 GFP_NOFS);
1500 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1501 GFP_NOFS);
1502
1503 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1504
1505 spin_lock_init(&fs_info->block_group_cache_lock);
1506 fs_info->block_group_cache_tree.rb_node = NULL;
1507
1508 extent_io_tree_init(&fs_info->pinned_extents,
1509 fs_info->btree_inode->i_mapping, GFP_NOFS);
1510 extent_io_tree_init(&fs_info->pending_del,
1511 fs_info->btree_inode->i_mapping, GFP_NOFS);
1512 extent_io_tree_init(&fs_info->extent_ins,
1513 fs_info->btree_inode->i_mapping, GFP_NOFS);
1514 fs_info->do_barriers = 1;
1515
1516 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1517 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1518 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1519
1520 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1521 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1522 sizeof(struct btrfs_key));
1523 insert_inode_hash(fs_info->btree_inode);
1524
1525 mutex_init(&fs_info->trans_mutex);
1526 mutex_init(&fs_info->tree_log_mutex);
1527 mutex_init(&fs_info->drop_mutex);
1528 mutex_init(&fs_info->extent_ins_mutex);
1529 mutex_init(&fs_info->pinned_mutex);
1530 mutex_init(&fs_info->chunk_mutex);
1531 mutex_init(&fs_info->transaction_kthread_mutex);
1532 mutex_init(&fs_info->cleaner_mutex);
1533 mutex_init(&fs_info->volume_mutex);
1534 mutex_init(&fs_info->tree_reloc_mutex);
1535 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542
1543 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1545
1546
1547 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1548 if (!bh)
1549 goto fail_iput;
1550
1551 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1552 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
1553 sizeof(fs_info->super_for_commit));
1554 brelse(bh);
1555
1556 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1557
1558 disk_super = &fs_info->super_copy;
1559 if (!btrfs_super_root(disk_super))
1560 goto fail_iput;
1561
1562 ret = btrfs_parse_options(tree_root, options);
1563 if (ret) {
1564 err = ret;
1565 goto fail_iput;
1566 }
1567
1568 features = btrfs_super_incompat_flags(disk_super) &
1569 ~BTRFS_FEATURE_INCOMPAT_SUPP;
1570 if (features) {
1571 printk(KERN_ERR "BTRFS: couldn't mount because of "
1572 "unsupported optional features (%Lx).\n",
1573 features);
1574 err = -EINVAL;
1575 goto fail_iput;
1576 }
1577
1578 features = btrfs_super_compat_ro_flags(disk_super) &
1579 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1580 if (!(sb->s_flags & MS_RDONLY) && features) {
1581 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1582 "unsupported option features (%Lx).\n",
1583 features);
1584 err = -EINVAL;
1585 goto fail_iput;
1586 }
1587
1588 /*
1589 * we need to start all the end_io workers up front because the
1590	 * queue work function gets called at interrupt time, and so the
1591	 * worker pools cannot be grown dynamically.
1592 */
1593 btrfs_init_workers(&fs_info->workers, "worker",
1594 fs_info->thread_pool_size);
1595
1596 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1597 fs_info->thread_pool_size);
1598
1599 btrfs_init_workers(&fs_info->submit_workers, "submit",
1600 min_t(u64, fs_devices->num_devices,
1601 fs_info->thread_pool_size));
1602
1603 /* a higher idle thresh on the submit workers makes it much more
1604	 * likely that bios will be sent down in a sane order to the
1605 * devices
1606 */
1607 fs_info->submit_workers.idle_thresh = 64;
1608
1609 fs_info->workers.idle_thresh = 16;
1610 fs_info->workers.ordered = 1;
1611
1612 fs_info->delalloc_workers.idle_thresh = 2;
1613 fs_info->delalloc_workers.ordered = 1;
1614
1615 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1616 btrfs_init_workers(&fs_info->endio_workers, "endio",
1617 fs_info->thread_pool_size);
1618 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1619 fs_info->thread_pool_size);
1620 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1621 "endio-meta-write", fs_info->thread_pool_size);
1622 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1623 fs_info->thread_pool_size);
1624
1625 /*
1626 * endios are largely parallel and should have a very
1627 * low idle thresh
1628 */
1629 fs_info->endio_workers.idle_thresh = 4;
1630 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632
1633 btrfs_start_workers(&fs_info->workers, 1);
1634 btrfs_start_workers(&fs_info->submit_workers, 1);
1635 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1636 btrfs_start_workers(&fs_info->fixup_workers, 1);
1637 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1638 btrfs_start_workers(&fs_info->endio_meta_workers,
1639 fs_info->thread_pool_size);
1640 btrfs_start_workers(&fs_info->endio_meta_write_workers,
1641 fs_info->thread_pool_size);
1642 btrfs_start_workers(&fs_info->endio_write_workers,
1643 fs_info->thread_pool_size);
1644
1645 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1646 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1647 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
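	/* with 4K pages that floor works out to 1024 pages (4MiB) of readahead */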
1648
1649 nodesize = btrfs_super_nodesize(disk_super);
1650 leafsize = btrfs_super_leafsize(disk_super);
1651 sectorsize = btrfs_super_sectorsize(disk_super);
1652 stripesize = btrfs_super_stripesize(disk_super);
1653 tree_root->nodesize = nodesize;
1654 tree_root->leafsize = leafsize;
1655 tree_root->sectorsize = sectorsize;
1656 tree_root->stripesize = stripesize;
1657
1658 sb->s_blocksize = sectorsize;
1659 sb->s_blocksize_bits = blksize_bits(sectorsize);
1660
1661 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1662 sizeof(disk_super->magic))) {
1663 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
1664 goto fail_sb_buffer;
1665 }
1666
1667 mutex_lock(&fs_info->chunk_mutex);
1668 ret = btrfs_read_sys_array(tree_root);
1669 mutex_unlock(&fs_info->chunk_mutex);
1670 if (ret) {
1671 printk(KERN_WARNING "btrfs: failed to read the system "
1672 "array on %s\n", sb->s_id);
1673 goto fail_sys_array;
1674 }
1675
1676 blocksize = btrfs_level_size(tree_root,
1677 btrfs_super_chunk_root_level(disk_super));
1678 generation = btrfs_super_chunk_root_generation(disk_super);
1679
1680 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1681 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1682
1683 chunk_root->node = read_tree_block(chunk_root,
1684 btrfs_super_chunk_root(disk_super),
1685 blocksize, generation);
1686 BUG_ON(!chunk_root->node);
1687
1688 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1689 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1690 BTRFS_UUID_SIZE);
1691
1692 mutex_lock(&fs_info->chunk_mutex);
1693 ret = btrfs_read_chunk_tree(chunk_root);
1694 mutex_unlock(&fs_info->chunk_mutex);
1695 if (ret) {
1696 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1697 sb->s_id);
1698 goto fail_chunk_root;
1699 }
1700
1701 btrfs_close_extra_devices(fs_devices);
1702
1703 blocksize = btrfs_level_size(tree_root,
1704 btrfs_super_root_level(disk_super));
1705 generation = btrfs_super_generation(disk_super);
1706
1707 tree_root->node = read_tree_block(tree_root,
1708 btrfs_super_root(disk_super),
1709 blocksize, generation);
1710 if (!tree_root->node)
1711 goto fail_chunk_root;
1712
1713
1714 ret = find_and_setup_root(tree_root, fs_info,
1715 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1716 if (ret)
1717 goto fail_tree_root;
1718 extent_root->track_dirty = 1;
1719
1720 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1;
1723
1724 if (ret)
1725 goto fail_extent_root;
1726
1727 ret = find_and_setup_root(tree_root, fs_info,
1728 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1729 if (ret)
1730 goto fail_extent_root;
1731
1732 csum_root->track_dirty = 1;
1733
1734 btrfs_read_block_groups(extent_root);
1735
1736 fs_info->generation = generation;
1737 fs_info->last_trans_committed = generation;
1738 fs_info->data_alloc_profile = (u64)-1;
1739 fs_info->metadata_alloc_profile = (u64)-1;
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread)
1744 goto fail_csum_root;
1745
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root,
1748 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread)
1750 goto fail_cleaner;
1751
1752 if (btrfs_super_log_root(disk_super) != 0) {
1753 u64 bytenr = btrfs_super_log_root(disk_super);
1754
1755 if (fs_devices->rw_devices == 0) {
1756 printk(KERN_WARNING "Btrfs log replay required "
1757 "on RO media\n");
1758 err = -EIO;
1759 goto fail_trans_kthread;
1760 }
1761 blocksize =
1762 btrfs_level_size(tree_root,
1763 btrfs_super_log_root_level(disk_super));
1764
1765 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1766 GFP_NOFS);
1767
1768 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1769 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1770
1771 log_tree_root->node = read_tree_block(tree_root, bytenr,
1772 blocksize,
1773 generation + 1);
1774 ret = btrfs_recover_log_trees(log_tree_root);
1775 BUG_ON(ret);
1776
1777 if (sb->s_flags & MS_RDONLY) {
1778 ret = btrfs_commit_super(tree_root);
1779 BUG_ON(ret);
1780 }
1781 }
1782
1783 if (!(sb->s_flags & MS_RDONLY)) {
1784 ret = btrfs_cleanup_reloc_trees(tree_root);
1785 BUG_ON(ret);
1786 }
1787
1788 location.objectid = BTRFS_FS_TREE_OBJECTID;
1789 location.type = BTRFS_ROOT_ITEM_KEY;
1790 location.offset = (u64)-1;
1791
1792 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1793 if (!fs_info->fs_root)
1794 goto fail_trans_kthread;
1795 return tree_root;
1796
1797fail_trans_kthread:
1798 kthread_stop(fs_info->transaction_kthread);
1799fail_cleaner:
1800 kthread_stop(fs_info->cleaner_kthread);
1801
1802 /*
1803 * make sure we're done with the btree inode before we stop our
1804 * kthreads
1805 */
1806 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1807 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1808
1809fail_csum_root:
1810 free_extent_buffer(csum_root->node);
1811fail_extent_root:
1812 free_extent_buffer(extent_root->node);
1813fail_tree_root:
1814 free_extent_buffer(tree_root->node);
1815fail_chunk_root:
1816 free_extent_buffer(chunk_root->node);
1817fail_sys_array:
1818 free_extent_buffer(dev_root->node);
1819fail_sb_buffer:
1820 btrfs_stop_workers(&fs_info->fixup_workers);
1821 btrfs_stop_workers(&fs_info->delalloc_workers);
1822 btrfs_stop_workers(&fs_info->workers);
1823 btrfs_stop_workers(&fs_info->endio_workers);
1824 btrfs_stop_workers(&fs_info->endio_meta_workers);
1825 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1826 btrfs_stop_workers(&fs_info->endio_write_workers);
1827 btrfs_stop_workers(&fs_info->submit_workers);
1828fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode);
1831fail:
1832 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1834
1835 kfree(extent_root);
1836 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info);
1839 kfree(chunk_root);
1840 kfree(dev_root);
1841 kfree(csum_root);
1842 return ERR_PTR(err);
1843}
1844
1845static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1846{
1847 char b[BDEVNAME_SIZE];
1848
1849 if (uptodate) {
1850 set_buffer_uptodate(bh);
1851 } else {
1852 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1853 printk(KERN_WARNING "lost page write due to "
1854 "I/O error on %s\n",
1855 bdevname(bh->b_bdev, b));
1856 }
1857		/* note, we don't set_buffer_write_io_error because we have
1858 * our own ways of dealing with the IO errors
1859 */
1860 clear_buffer_uptodate(bh);
1861 }
1862 unlock_buffer(bh);
1863 put_bh(bh);
1864}
1865
1866struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
1867{
1868 struct buffer_head *bh;
1869 struct buffer_head *latest = NULL;
1870 struct btrfs_super_block *super;
1871 int i;
1872 u64 transid = 0;
1873 u64 bytenr;
1874
1875 /* we would like to check all the supers, but that would make
1876 * a btrfs mount succeed after a mkfs from a different FS.
1877	 * So, until a special mount option is added to scan all
1878	 * BTRFS_SUPER_MIRROR_MAX supers, only the first one is checked
1879 */
1880 for (i = 0; i < 1; i++) {
1881 bytenr = btrfs_sb_offset(i);
1882 if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
1883 break;
1884 bh = __bread(bdev, bytenr / 4096, 4096);
1885 if (!bh)
1886 continue;
1887
1888 super = (struct btrfs_super_block *)bh->b_data;
1889 if (btrfs_super_bytenr(super) != bytenr ||
1890 strncmp((char *)(&super->magic), BTRFS_MAGIC,
1891 sizeof(super->magic))) {
1892 brelse(bh);
1893 continue;
1894 }
1895
1896 if (!latest || btrfs_super_generation(super) > transid) {
1897 brelse(latest);
1898 latest = bh;
1899 transid = btrfs_super_generation(super);
1900 } else {
1901 brelse(bh);
1902 }
1903 }
1904 return latest;
1905}
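
/*
 * A minimal standalone sketch of the selection rule used above, with the
 * candidate supers already read into an illustrative array: among the
 * mirrors whose bytenr and magic checks passed, keep the one carrying
 * the highest generation number.
 */
struct super_candidate {
	unsigned long long generation;
	int valid;			/* bytenr and magic checks passed */
};

static int pick_latest_super(const struct super_candidate *s, int n)
{
	int i, best = -1;
	unsigned long long best_gen = 0;

	for (i = 0; i < n; i++) {
		if (!s[i].valid)
			continue;
		if (best < 0 || s[i].generation > best_gen) {
			best = i;
			best_gen = s[i].generation;
		}
	}
	return best;			/* -1 when no mirror was valid */
}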
1906
1907static int write_dev_supers(struct btrfs_device *device,
1908 struct btrfs_super_block *sb,
1909 int do_barriers, int wait, int max_mirrors)
1910{
1911 struct buffer_head *bh;
1912 int i;
1913 int ret;
1914 int errors = 0;
1915 u32 crc;
1916 u64 bytenr;
1917 int last_barrier = 0;
1918
1919 if (max_mirrors == 0)
1920 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
1921
1922 /* make sure only the last submit_bh does a barrier */
1923 if (do_barriers) {
1924 for (i = 0; i < max_mirrors; i++) {
1925 bytenr = btrfs_sb_offset(i);
1926 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1927 device->total_bytes)
1928 break;
1929 last_barrier = i;
1930 }
1931 }
1932
1933 for (i = 0; i < max_mirrors; i++) {
1934 bytenr = btrfs_sb_offset(i);
1935 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1936 break;
1937
1938 if (wait) {
1939 bh = __find_get_block(device->bdev, bytenr / 4096,
1940 BTRFS_SUPER_INFO_SIZE);
1941 BUG_ON(!bh);
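			/*
			 * the wait == 0 pass left its __getblk reference
			 * held on this buffer, so dropping the lookup
			 * reference early still leaves the bh pinned for
			 * the wait below
			 */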
1942 brelse(bh);
1943 wait_on_buffer(bh);
1944 if (buffer_uptodate(bh)) {
1945 brelse(bh);
1946 continue;
1947 }
1948 } else {
1949 btrfs_set_super_bytenr(sb, bytenr);
1950
1951 crc = ~(u32)0;
1952 crc = btrfs_csum_data(NULL, (char *)sb +
1953 BTRFS_CSUM_SIZE, crc,
1954 BTRFS_SUPER_INFO_SIZE -
1955 BTRFS_CSUM_SIZE);
1956 btrfs_csum_final(crc, sb->csum);
1957
1958 bh = __getblk(device->bdev, bytenr / 4096,
1959 BTRFS_SUPER_INFO_SIZE);
1960 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1961
1962 set_buffer_uptodate(bh);
1963 get_bh(bh);
1964 lock_buffer(bh);
1965 bh->b_end_io = btrfs_end_buffer_write_sync;
1966 }
1967
1968 if (i == last_barrier && do_barriers && device->barriers) {
1969 ret = submit_bh(WRITE_BARRIER, bh);
1970 if (ret == -EOPNOTSUPP) {
1971 printk("btrfs: disabling barriers on dev %s\n",
1972 device->name);
1973 set_buffer_uptodate(bh);
1974 device->barriers = 0;
1975 get_bh(bh);
1976 lock_buffer(bh);
1977 ret = submit_bh(WRITE, bh);
1978 }
1979 } else {
1980 ret = submit_bh(WRITE, bh);
1981 }
1982
1983 if (!ret && wait) {
1984 wait_on_buffer(bh);
1985 if (!buffer_uptodate(bh))
1986 errors++;
1987 } else if (ret) {
1988 errors++;
1989 }
1990 if (wait)
1991 brelse(bh);
1992 }
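	/* succeed if at least one attempted mirror made it; fail only
	 * when every copy we tried to write hit an error
	 */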
1993 return errors < i ? 0 : -1;
1994}
1995
1996int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb;
2002 struct btrfs_dev_item *dev_item;
2003 int ret;
2004 int do_barriers;
2005 int max_errors;
2006 int total_errors = 0;
2007 u64 flags;
2008
2009 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
2010 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2011
2012 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) {
2017 total_errors++;
2018 continue;
2019 }
2020 if (!dev->in_fs_metadata || !dev->writeable)
2021 continue;
2022
2023 btrfs_set_stack_device_generation(dev_item, 0);
2024 btrfs_set_stack_device_type(dev_item, dev->type);
2025 btrfs_set_stack_device_id(dev_item, dev->devid);
2026 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
2027 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
2028 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
2029 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
2030 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
2031 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
2032 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2033
2034 flags = btrfs_super_flags(sb);
2035 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
2036
2037 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
2038 if (ret)
2039 total_errors++;
2040 }
2041 if (total_errors > max_errors) {
2042 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2043 total_errors);
2044 BUG();
2045 }
2046
2047 total_errors = 0;
2048 list_for_each(cur, head) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev)
2051 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable)
2053 continue;
2054
2055 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
2056 if (ret)
2057 total_errors++;
2058 }
2059 if (total_errors > max_errors) {
2060 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2061 total_errors);
2062 BUG();
2063 }
2064 return 0;
2065}
2066
2067int write_ctree_super(struct btrfs_trans_handle *trans,
2068 struct btrfs_root *root, int max_mirrors)
2069{
2070 int ret;
2071
2072 ret = write_all_supers(root, max_mirrors);
2073 return ret;
2074}
2075
2076int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2077{
2078 radix_tree_delete(&fs_info->fs_roots_radix,
2079 (unsigned long)root->root_key.objectid);
2080 if (root->anon_super.s_dev) {
2081 down_write(&root->anon_super.s_umount);
2082 kill_anon_super(&root->anon_super);
2083 }
2084 if (root->node)
2085 free_extent_buffer(root->node);
2086 if (root->commit_root)
2087 free_extent_buffer(root->commit_root);
2088 kfree(root->name);
2089 kfree(root);
2090 return 0;
2091}
2092
2093static int del_fs_roots(struct btrfs_fs_info *fs_info)
2094{
2095 int ret;
2096 struct btrfs_root *gang[8];
2097 int i;
2098
2099 while (1) {
2100 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2101 (void **)gang, 0,
2102 ARRAY_SIZE(gang));
2103 if (!ret)
2104 break;
2105 for (i = 0; i < ret; i++)
2106 btrfs_free_fs_root(fs_info, gang[i]);
2107 }
2108 return 0;
2109}
2110
2111int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2112{
2113 u64 root_objectid = 0;
2114 struct btrfs_root *gang[8];
2115 int i;
2116 int ret;
2117
2118 while (1) {
2119 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2120 (void **)gang, root_objectid,
2121 ARRAY_SIZE(gang));
2122 if (!ret)
2123 break;
2124 for (i = 0; i < ret; i++) {
2125 root_objectid = gang[i]->root_key.objectid;
2126 ret = btrfs_find_dead_roots(fs_info->tree_root,
2127 root_objectid, gang[i]);
2128 BUG_ON(ret);
2129 btrfs_orphan_cleanup(gang[i]);
2130 }
2131 root_objectid++;
2132 }
2133 return 0;
2134}
2135
2136int btrfs_commit_super(struct btrfs_root *root)
2137{
2138 struct btrfs_trans_handle *trans;
2139 int ret;
2140
2141 mutex_lock(&root->fs_info->cleaner_mutex);
2142 btrfs_clean_old_snapshots(root);
2143 mutex_unlock(&root->fs_info->cleaner_mutex);
2144 trans = btrfs_start_transaction(root, 1);
2145 ret = btrfs_commit_transaction(trans, root);
2146 BUG_ON(ret);
2147 /* run commit again to drop the original snapshot */
2148 trans = btrfs_start_transaction(root, 1);
2149 btrfs_commit_transaction(trans, root);
2150 ret = btrfs_write_and_wait_transaction(NULL, root);
2151 BUG_ON(ret);
2152
2153 ret = write_ctree_super(NULL, root, 0);
2154 return ret;
2155}
2156
2157int close_ctree(struct btrfs_root *root)
2158{
2159 struct btrfs_fs_info *fs_info = root->fs_info;
2160 int ret;
2161
2162 fs_info->closing = 1;
2163 smp_mb();
2164
2165 kthread_stop(root->fs_info->transaction_kthread);
2166 kthread_stop(root->fs_info->cleaner_kthread);
2167
2168 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2169 ret = btrfs_commit_super(root);
2170 if (ret)
2171 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2172 }
2173
2174 if (fs_info->delalloc_bytes) {
2175 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2176 fs_info->delalloc_bytes);
2177 }
2178 if (fs_info->total_ref_cache_size) {
2179 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
2180 (unsigned long long)fs_info->total_ref_cache_size);
2181 }
2182
2183 if (fs_info->extent_root->node)
2184 free_extent_buffer(fs_info->extent_root->node);
2185
2186 if (fs_info->tree_root->node)
2187 free_extent_buffer(fs_info->tree_root->node);
2188
2189 if (root->fs_info->chunk_root->node)
2190 free_extent_buffer(root->fs_info->chunk_root->node);
2191
2192 if (root->fs_info->dev_root->node)
2193 free_extent_buffer(root->fs_info->dev_root->node);
2194
2195 if (root->fs_info->csum_root->node)
2196 free_extent_buffer(root->fs_info->csum_root->node);
2197
2198 btrfs_free_block_groups(root->fs_info);
2199
2200 del_fs_roots(fs_info);
2201
2202 iput(fs_info->btree_inode);
2203
2204 btrfs_stop_workers(&fs_info->fixup_workers);
2205 btrfs_stop_workers(&fs_info->delalloc_workers);
2206 btrfs_stop_workers(&fs_info->workers);
2207 btrfs_stop_workers(&fs_info->endio_workers);
2208 btrfs_stop_workers(&fs_info->endio_meta_workers);
2209 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2210 btrfs_stop_workers(&fs_info->endio_write_workers);
2211 btrfs_stop_workers(&fs_info->submit_workers);
2212
2213#if 0
2214 while (!list_empty(&fs_info->hashers)) {
2215 struct btrfs_hasher *hasher;
2216 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2217 hashers);
2218 list_del(&hasher->hashers);
2219 crypto_free_hash(&fs_info->hash_tfm);
2220 kfree(hasher);
2221 }
2222#endif
2223 btrfs_close_devices(fs_info->fs_devices);
2224 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2225
2226 bdi_destroy(&fs_info->bdi);
2227
2228 kfree(fs_info->extent_root);
2229 kfree(fs_info->tree_root);
2230 kfree(fs_info->chunk_root);
2231 kfree(fs_info->dev_root);
2232 kfree(fs_info->csum_root);
2233 return 0;
2234}
2235
2236int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2237{
2238 int ret;
2239 struct inode *btree_inode = buf->first_page->mapping->host;
2240
2241 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
2242 if (!ret)
2243 return ret;
2244
2245 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
2246 parent_transid);
2247 return !ret;
2248}
2249
2250int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
2251{
2252 struct inode *btree_inode = buf->first_page->mapping->host;
2253 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
2254 buf);
2255}
2256
2257void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2258{
2259 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2260 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode;
2262
2263 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n",
2267 (unsigned long long)buf->start,
2268 (unsigned long long)transid,
2269 (unsigned long long)root->fs_info->generation);
2270 WARN_ON(1);
2271 }
2272 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
2273}
2274
2275void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2276{
2277 /*
2278 * looks as though older kernels can get into trouble with
2279	 * this code; they end up stuck in balance_dirty_pages forever
2280 */
2281 struct extent_io_tree *tree;
2282 u64 num_dirty;
2283 u64 start = 0;
2284 unsigned long thresh = 32 * 1024 * 1024;
2285 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2286
2287 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2288 return;
2289
2290 num_dirty = count_range_bits(tree, &start, (u64)-1,
2291 thresh, EXTENT_DIRTY);
2292 if (num_dirty > thresh) {
2293 balance_dirty_pages_ratelimited_nr(
2294 root->fs_info->btree_inode->i_mapping, 1);
2295 }
2296 return;
2297}
2298
2299int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2300{
2301 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2302 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE;
2306 return ret;
2307}
2308
2309int btree_lock_page_hook(struct page *page)
2310{
2311 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb;
2315 unsigned long len;
2316 u64 bytenr = page_offset(page);
2317
2318 if (page->private == EXTENT_PAGE_PRIVATE)
2319 goto out;
2320
2321 len = page->private >> 2;
2322 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2323 if (!eb)
2324 goto out;
2325
2326 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb);
2332out:
2333 lock_page(page);
2334 return 0;
2335}
2336
2337static struct extent_io_ops btree_extent_io_ops = {
2338 .write_cache_pages_lock_hook = btree_lock_page_hook,
2339 .readpage_end_io_hook = btree_readpage_end_io_hook,
2340 .submit_bio_hook = btree_submit_bio_hook,
2341 /* note we're sharing with inode.c for the merge bio hook */
2342 .merge_bio_hook = btrfs_merge_bio_hook,
2343};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..c0ff404c31b7
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,102 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24
25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12
27
28static inline u64 btrfs_sb_offset(int mirror)
29{
30 u64 start = 16 * 1024;
31 if (mirror)
32 return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
33 return BTRFS_SUPER_INFO_OFFSET;
34}
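
/*
 * Worked example of the offsets this yields (16K == 2^14):
 *   mirror 0: BTRFS_SUPER_INFO_OFFSET          =  64KiB
 *   mirror 1: 2^14 << (12 * 1) = 2^26          =  64MiB
 *   mirror 2: 2^14 << (12 * 2) = 2^38          = 256GiB
 * so each successive copy lands far away from the previous one.
 */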
35
36struct btrfs_device;
37struct btrfs_fs_devices;
38
39struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root, struct extent_buffer *buf);
47struct btrfs_root *open_ctree(struct super_block *sb,
48 struct btrfs_fs_devices *fs_devices,
49 char *options);
50int close_ctree(struct btrfs_root *root);
51int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
58 u64 root_objectid);
59struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
60 struct btrfs_key *location,
61 const char *name, int namelen);
62struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
63 struct btrfs_key *location);
64struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
65 struct btrfs_key *location);
66int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
67int btrfs_insert_dev_radix(struct btrfs_root *root,
68 struct block_device *bdev,
69 u64 device_id,
70 u64 block_start,
71 u64 num_blocks);
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root,
78 struct extent_buffer *buf);
79int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
80u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
81void btrfs_csum_final(u32 crc, char *result);
82int btrfs_open_device(struct btrfs_device *dev);
83int btrfs_verify_block_csum(struct btrfs_root *root,
84 struct extent_buffer *buf);
85int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
86 int metadata);
87int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
88 int rw, struct bio *bio, int mirror_num,
89 unsigned long bio_flags,
90 extent_submit_bio_hook_t *submit_bio_start,
91 extent_submit_bio_hook_t *submit_bio_done);
92
93int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
94unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
95int btrfs_write_tree_block(struct extent_buffer *buf);
96int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
97int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info);
101int btree_lock_page_hook(struct page *page);
102#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..85315d2c90de
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,203 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
10#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
11 parent_objectid) / 4)
12#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
13 parent_root_objectid) / 4)
14#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
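
/*
 * With the packed struct btrfs_fid from export.h these work out to file
 * handle lengths in 32-bit words: offsetof(parent_objectid) is 20 bytes
 * (5 words), offsetof(parent_root_objectid) is 32 bytes (8 words), and
 * sizeof(struct btrfs_fid) is 40 bytes (10 words).
 */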
15
16static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
17 int connectable)
18{
19 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
20 struct inode *inode = dentry->d_inode;
21 int len = *max_len;
22 int type;
23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
26 return 255;
27
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT;
30
31 fid->objectid = BTRFS_I(inode)->location.objectid;
32 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation;
34
35 if (connectable && !S_ISDIR(inode->i_mode)) {
36 struct inode *parent;
37 u64 parent_root_id;
38
39 spin_lock(&dentry->d_lock);
40
41 parent = dentry->d_parent->d_inode;
42 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
43 fid->parent_gen = parent->i_generation;
44 parent_root_id = BTRFS_I(parent)->root->objectid;
45
46 spin_unlock(&dentry->d_lock);
47
48 if (parent_root_id != fid->root_objectid) {
49 fid->parent_root_objectid = parent_root_id;
50 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
51 type = FILEID_BTRFS_WITH_PARENT_ROOT;
52 } else {
53 len = BTRFS_FID_SIZE_CONNECTABLE;
54 type = FILEID_BTRFS_WITH_PARENT;
55 }
56 }
57
58 *max_len = len;
59 return type;
60}
61
62static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
63 u64 root_objectid, u32 generation)
64{
65 struct btrfs_root *root;
66 struct inode *inode;
67 struct btrfs_key key;
68
69 key.objectid = root_objectid;
70 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
71 key.offset = (u64)-1;
72
73 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
74 if (IS_ERR(root))
75 return ERR_CAST(root);
76
77 key.objectid = objectid;
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0;
80
81 inode = btrfs_iget(sb, &key, root, NULL);
82 if (IS_ERR(inode))
83 return (void *)inode;
84
85 if (generation != inode->i_generation) {
86 iput(inode);
87 return ERR_PTR(-ESTALE);
88 }
89
90 return d_obtain_alias(inode);
91}
92
93static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
94 int fh_len, int fh_type)
95{
96 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
97 u64 objectid, root_objectid;
98 u32 generation;
99
100 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
101 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
102 return NULL;
103 root_objectid = fid->root_objectid;
104 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
105 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
106 return NULL;
107 root_objectid = fid->parent_root_objectid;
108 } else
109 return NULL;
110
111 objectid = fid->parent_objectid;
112 generation = fid->parent_gen;
113
114 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
115}
116
117static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
118 int fh_len, int fh_type)
119{
120 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
121 u64 objectid, root_objectid;
122 u32 generation;
123
124 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
125 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
126 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
127 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
128 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
129 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
130 return NULL;
131
132 objectid = fid->objectid;
133 root_objectid = fid->root_objectid;
134 generation = fid->gen;
135
136 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
137}
138
139static struct dentry *btrfs_get_parent(struct dentry *child)
140{
141 struct inode *dir = child->d_inode;
142 struct btrfs_root *root = BTRFS_I(dir)->root;
143 struct btrfs_key key;
144 struct btrfs_path *path;
145 struct extent_buffer *leaf;
146 int slot;
147 u64 objectid;
148 int ret;
149
150 path = btrfs_alloc_path();
151
152 key.objectid = dir->i_ino;
153 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
154 key.offset = (u64)-1;
155
156 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
157 if (ret < 0) {
158 /* Error */
159 btrfs_free_path(path);
160 return ERR_PTR(ret);
161 }
162 leaf = path->nodes[0];
163 slot = path->slots[0];
164 if (ret) {
165 /* btrfs_search_slot() returns the slot where we'd want to
166 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
167 The _real_ backref, telling us what the parent inode
168 _actually_ is, will be in the slot _before_ the one
169 that btrfs_search_slot() returns. */
170 if (!slot) {
171 /* Unless there is _no_ key in the tree before... */
172 btrfs_free_path(path);
173 return ERR_PTR(-EIO);
174 }
175 slot--;
176 }
177
178 btrfs_item_key_to_cpu(leaf, &key, slot);
179 btrfs_free_path(path);
180
181 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
182 return ERR_PTR(-EINVAL);
183
184 objectid = key.offset;
185
186 /* If we are already at the root of a subvol, return the real root */
187 if (objectid == dir->i_ino)
188 return dget(dir->i_sb->s_root);
189
190 /* Build a new key for the inode item */
191 key.objectid = objectid;
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0;
194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
196}
197
198const struct export_operations btrfs_export_ops = {
199 .encode_fh = btrfs_encode_fh,
200 .fh_to_dentry = btrfs_fh_to_dentry,
201 .fh_to_parent = btrfs_fh_to_parent,
202 .get_parent = btrfs_get_parent,
203};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
8struct btrfs_fid {
9 u64 objectid;
10 u64 root_objectid;
11 u32 gen;
12
13 u64 parent_objectid;
14 u32 parent_gen;
15
16 u64 parent_root_objectid;
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..293da650873f
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5986 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include <linux/version.h>
23#include "compat.h"
24#include "hash.h"
25#include "crc32c.h"
26#include "ctree.h"
27#include "disk-io.h"
28#include "print-tree.h"
29#include "transaction.h"
30#include "volumes.h"
31#include "locking.h"
32#include "ref-cache.h"
34
35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1
37#define PENDING_BACKREF_UPDATE 2
38
39struct pending_extent_op {
40 int type;
41 u64 bytenr;
42 u64 num_bytes;
43 u64 parent;
44 u64 orig_parent;
45 u64 generation;
46 u64 orig_generation;
47 int level;
48 struct list_head list;
49 int del;
50};
51
52static int finish_current_insert(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all);
54static int del_pending_extents(struct btrfs_trans_handle *trans,
55 struct btrfs_root *extent_root, int all);
56static int pin_down_bytes(struct btrfs_trans_handle *trans,
57 struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data);
59static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free);
63
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{
66 return (cache->flags & bits) == bits;
67}
68
69/*
70 * this adds the block group to the fs_info rb tree for the block group
71 * cache
72 */
73static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
74 struct btrfs_block_group_cache *block_group)
75{
76 struct rb_node **p;
77 struct rb_node *parent = NULL;
78 struct btrfs_block_group_cache *cache;
79
80 spin_lock(&info->block_group_cache_lock);
81 p = &info->block_group_cache_tree.rb_node;
82
83 while (*p) {
84 parent = *p;
85 cache = rb_entry(parent, struct btrfs_block_group_cache,
86 cache_node);
87 if (block_group->key.objectid < cache->key.objectid) {
88 p = &(*p)->rb_left;
89 } else if (block_group->key.objectid > cache->key.objectid) {
90 p = &(*p)->rb_right;
91 } else {
92 spin_unlock(&info->block_group_cache_lock);
93 return -EEXIST;
94 }
95 }
96
97 rb_link_node(&block_group->cache_node, parent, p);
98 rb_insert_color(&block_group->cache_node,
99 &info->block_group_cache_tree);
100 spin_unlock(&info->block_group_cache_lock);
101
102 return 0;
103}
104
105/*
106 * This will return the block group at or after bytenr if contains is 0, else
107 * it will return the block group that contains the bytenr
108 */
109static struct btrfs_block_group_cache *
110block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
111 int contains)
112{
113 struct btrfs_block_group_cache *cache, *ret = NULL;
114 struct rb_node *n;
115 u64 end, start;
116
117 spin_lock(&info->block_group_cache_lock);
118 n = info->block_group_cache_tree.rb_node;
119
120 while (n) {
121 cache = rb_entry(n, struct btrfs_block_group_cache,
122 cache_node);
123 end = cache->key.objectid + cache->key.offset - 1;
124 start = cache->key.objectid;
125
126 if (bytenr < start) {
127 if (!contains && (!ret || start < ret->key.objectid))
128 ret = cache;
129 n = n->rb_left;
130 } else if (bytenr > start) {
131 if (contains && bytenr <= end) {
132 ret = cache;
133 break;
134 }
135 n = n->rb_right;
136 } else {
137 ret = cache;
138 break;
139 }
140 }
141 if (ret)
142 atomic_inc(&ret->count);
143 spin_unlock(&info->block_group_cache_lock);
144
145 return ret;
146}
147
148/*
149 * this is only called by cache_block_group; since we could have freed
150 * extents, we need to check the pinned_extents for any extents that can't
151 * be used yet, because their free space is only released when the transaction commits.
152 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end)
155{
156 u64 extent_start, extent_end, size;
157 int ret;
158
159 mutex_lock(&info->pinned_mutex);
160 while (start < end) {
161 ret = find_first_extent_bit(&info->pinned_extents, start,
162 &extent_start, &extent_end,
163 EXTENT_DIRTY);
164 if (ret)
165 break;
166
167 if (extent_start == start) {
168 start = extent_end + 1;
169 } else if (extent_start > start && extent_start < end) {
170 size = extent_start - start;
171 ret = btrfs_add_free_space(block_group, start,
172 size);
173 BUG_ON(ret);
174 start = extent_end + 1;
175 } else {
176 break;
177 }
178 }
179
180 if (start < end) {
181 size = end - start;
182 ret = btrfs_add_free_space(block_group, start, size);
183 BUG_ON(ret);
184 }
185 mutex_unlock(&info->pinned_mutex);
186
187 return 0;
188}
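
/*
 * A minimal standalone sketch of the hole-walking idea above, assuming the
 * pinned ranges are handed in as a sorted, non-overlapping array (inclusive
 * ends) instead of an extent tree: report every sub-range of [start, end)
 * that no pinned range covers.
 */
struct pinned_range {
	unsigned long long start;
	unsigned long long end;		/* inclusive */
};

static void walk_free_holes(unsigned long long start, unsigned long long end,
			    const struct pinned_range *p, int n,
			    void (*add_free)(unsigned long long start,
					     unsigned long long len))
{
	int i;

	for (i = 0; i < n && start < end; i++) {
		if (p[i].end < start)
			continue;	/* pinned range entirely behind us */
		if (p[i].start >= end)
			break;		/* pinned range beyond the window */
		if (p[i].start > start)
			add_free(start, p[i].start - start);
		start = p[i].end + 1;	/* skip over the pinned range */
	}
	if (start < end)
		add_free(start, end - start);
}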
189
190static int remove_sb_from_cache(struct btrfs_root *root,
191 struct btrfs_block_group_cache *cache)
192{
193 u64 bytenr;
194 u64 *logical;
195 int stripe_len;
196 int i, nr, ret;
197
198 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
199 bytenr = btrfs_sb_offset(i);
200 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
201 cache->key.objectid, bytenr, 0,
202 &logical, &nr, &stripe_len);
203 BUG_ON(ret);
204 while (nr--) {
205 btrfs_remove_free_space(cache, logical[nr],
206 stripe_len);
207 }
208 kfree(logical);
209 }
210 return 0;
211}
212
213static int cache_block_group(struct btrfs_root *root,
214 struct btrfs_block_group_cache *block_group)
215{
216 struct btrfs_path *path;
217 int ret = 0;
218 struct btrfs_key key;
219 struct extent_buffer *leaf;
220 int slot;
221 u64 last;
222
223 if (!block_group)
224 return 0;
225
226 root = root->fs_info->extent_root;
227
228 if (block_group->cached)
229 return 0;
230
231 path = btrfs_alloc_path();
232 if (!path)
233 return -ENOMEM;
234
235 path->reada = 2;
236 /*
237 * we get into deadlocks with paths held by callers of this function.
238 * since the alloc_mutex is protecting things right now, just
239 * skip the locking here
240 */
241 path->skip_locking = 1;
242 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
243 key.objectid = last;
244 key.offset = 0;
245 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
247 if (ret < 0)
248 goto err;
249
250 while (1) {
251 leaf = path->nodes[0];
252 slot = path->slots[0];
253 if (slot >= btrfs_header_nritems(leaf)) {
254 ret = btrfs_next_leaf(root, path);
255 if (ret < 0)
256 goto err;
257 if (ret == 0)
258 continue;
259 else
260 break;
261 }
262 btrfs_item_key_to_cpu(leaf, &key, slot);
263 if (key.objectid < block_group->key.objectid)
264 goto next;
265
266 if (key.objectid >= block_group->key.objectid +
267 block_group->key.offset)
268 break;
269
270 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
271 add_new_free_space(block_group, root->fs_info, last,
272 key.objectid);
273
274 last = key.objectid + key.offset;
275 }
276next:
277 path->slots[0]++;
278 }
279
280 add_new_free_space(block_group, root->fs_info, last,
281 block_group->key.objectid +
282 block_group->key.offset);
283
284 remove_sb_from_cache(root, block_group);
285 block_group->cached = 1;
286 ret = 0;
287err:
288 btrfs_free_path(path);
289 return ret;
290}
291
292/*
293 * return the block group that starts at or after bytenr
294 */
295static struct btrfs_block_group_cache *
296btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
297{
298 struct btrfs_block_group_cache *cache;
299
300 cache = block_group_cache_tree_search(info, bytenr, 0);
301
302 return cache;
303}
304
305/*
306 * return the block group that contains the given bytenr
307 */
308struct btrfs_block_group_cache *btrfs_lookup_block_group(
309 struct btrfs_fs_info *info,
310 u64 bytenr)
311{
312 struct btrfs_block_group_cache *cache;
313
314 cache = block_group_cache_tree_search(info, bytenr, 1);
315
316 return cache;
317}
318
319static inline void put_block_group(struct btrfs_block_group_cache *cache)
320{
321 if (atomic_dec_and_test(&cache->count))
322 kfree(cache);
323}
324
325static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags)
327{
328 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found;
331 list_for_each(cur, head) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags)
334 return found;
335 }
336 return NULL;
337}
338
339static u64 div_factor(u64 num, int factor)
340{
341 if (factor == 10)
342 return num;
343 num *= factor;
344 do_div(num, 10);
345 return num;
346}
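
/* e.g. div_factor(1000, 9) == 900: "factor" selects tenths, so the caller
 * below treats a block group as reusable only while it is under 90% full
 */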
347
348u64 btrfs_find_block_group(struct btrfs_root *root,
349 u64 search_start, u64 search_hint, int owner)
350{
351 struct btrfs_block_group_cache *cache;
352 u64 used;
353 u64 last = max(search_hint, search_start);
354 u64 group_start = 0;
355 int full_search = 0;
356 int factor = 9;
357 int wrapped = 0;
358again:
359 while (1) {
360 cache = btrfs_lookup_first_block_group(root->fs_info, last);
361 if (!cache)
362 break;
363
364 spin_lock(&cache->lock);
365 last = cache->key.objectid + cache->key.offset;
366 used = btrfs_block_group_used(&cache->item);
367
368 if ((full_search || !cache->ro) &&
369 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
370 if (used + cache->pinned + cache->reserved <
371 div_factor(cache->key.offset, factor)) {
372 group_start = cache->key.objectid;
373 spin_unlock(&cache->lock);
374 put_block_group(cache);
375 goto found;
376 }
377 }
378 spin_unlock(&cache->lock);
379 put_block_group(cache);
380 cond_resched();
381 }
382 if (!wrapped) {
383 last = search_start;
384 wrapped = 1;
385 goto again;
386 }
387 if (!full_search && factor < 10) {
388 last = search_start;
389 full_search = 1;
390 factor = 10;
391 goto again;
392 }
393found:
394 return group_start;
395}
396
397/* simple helper to search for an existing extent at a given offset */
398int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
399{
400 int ret;
401 struct btrfs_key key;
402 struct btrfs_path *path;
403
404 path = btrfs_alloc_path();
405 BUG_ON(!path);
406 key.objectid = start;
407 key.offset = len;
408 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
409 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
410 0, 0);
411 btrfs_free_path(path);
412 return ret;
413}
414
415/*
416 * Back reference rules. Back refs have three main goals:
417 *
418 * 1) differentiate between all holders of references to an extent so that
419 * when a reference is dropped we can make sure it was a valid reference
420 * before freeing the extent.
421 *
422 * 2) Provide enough information to quickly find the holders of an extent
423 * if we notice a given block is corrupted or bad.
424 *
425 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
426 * maintenance. This is actually the same as #2, but with a slightly
427 * different use case.
428 *
429 * File extents can be referenced by:
430 *
431 * - multiple snapshots, subvolumes, or different generations in one subvol
432 * - different files inside a single subvolume
433 * - different offsets inside a file (bookend extents in file.c)
434 *
435 * The extent ref structure has fields for:
436 *
437 * - Objectid of the subvolume root
438 * - Generation number of the tree holding the reference
439 * - objectid of the file holding the reference
440 * - number of references held by the parent node (always 1 for tree blocks)
441 *
442 * A btree leaf may hold multiple references to a file extent. In most cases,
443 * these references are from the same file and the corresponding offsets inside
444 * the file are close together.
445 *
446 * When a file extent is allocated the fields are filled in:
447 * (root_key.objectid, trans->transid, inode objectid, 1)
448 *
449 * When a leaf is cow'd new references are added for every file extent found
450 * in the leaf. It looks similar to the create case, but trans->transid will
451 * be different when the block is cow'd.
452 *
453 * (root_key.objectid, trans->transid, inode objectid,
454 * number of references in the leaf)
455 *
456 * When a file extent is removed either during snapshot deletion or
457 * file truncation, we find the corresponding back reference and check
458 * the following fields:
459 *
460 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
461 * inode objectid)
462 *
463 * Btree extents can be referenced by:
464 *
465 * - Different subvolumes
466 * - Different generations of the same subvolume
467 *
468 * When a tree block is created, back references are inserted:
469 *
470 * (root->root_key.objectid, trans->transid, level, 1)
471 *
472 * When a tree block is cow'd, new back references are added for all the
473 * blocks it points to. If the tree block isn't in a reference counted root,
474 * the old back references are removed. These new back references are of
475 * the form (trans->transid will have increased since creation):
476 *
477 * (root->root_key.objectid, trans->transid, level, 1)
478 *
479 * When a backref is being deleted, the following fields are checked:
480 *
481 * if backref was for a tree root:
482 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
483 * else
484 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
485 *
486 * Back Reference Key composing:
487 *
488 * The key objectid corresponds to the first byte in the extent, the key
489 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
490 * byte of the parent extent. If an extent is a tree root, the key offset is set
491 * to the key objectid.
492 */
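
/*
 * A minimal sketch of the key composition described above, using a
 * simplified stand-in for struct btrfs_key (the real one lives in
 * ctree.h): objectid is the first byte of the extent, and offset is the
 * first byte of the parent, or the extent itself for a tree root.
 */
struct simple_backref_key {
	unsigned long long objectid;
	unsigned char type;		/* stands in for BTRFS_EXTENT_REF_KEY */
	unsigned long long offset;
};

static void compose_backref_key(struct simple_backref_key *key,
				unsigned char ref_key_type,
				unsigned long long extent_start,
				unsigned long long parent_start,
				int is_tree_root)
{
	key->objectid = extent_start;
	key->type = ref_key_type;
	key->offset = is_tree_root ? extent_start : parent_start;
}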
493
494static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
495 struct btrfs_root *root,
496 struct btrfs_path *path,
497 u64 bytenr, u64 parent,
498 u64 ref_root, u64 ref_generation,
499 u64 owner_objectid, int del)
500{
501 struct btrfs_key key;
502 struct btrfs_extent_ref *ref;
503 struct extent_buffer *leaf;
504 u64 ref_objectid;
505 int ret;
506
507 key.objectid = bytenr;
508 key.type = BTRFS_EXTENT_REF_KEY;
509 key.offset = parent;
510
511 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
512 if (ret < 0)
513 goto out;
514 if (ret > 0) {
515 ret = -ENOENT;
516 goto out;
517 }
518
519 leaf = path->nodes[0];
520 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
521 ref_objectid = btrfs_ref_objectid(leaf, ref);
522 if (btrfs_ref_root(leaf, ref) != ref_root ||
523 btrfs_ref_generation(leaf, ref) != ref_generation ||
524 (ref_objectid != owner_objectid &&
525 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
526 ret = -EIO;
527 WARN_ON(1);
528 goto out;
529 }
530 ret = 0;
531out:
532 return ret;
533}
534
535/*
536 * updates all the backrefs that are pending on update_list for the
537 * extent_root
538 */
539static noinline int update_backrefs(struct btrfs_trans_handle *trans,
540 struct btrfs_root *extent_root,
541 struct btrfs_path *path,
542 struct list_head *update_list)
543{
544 struct btrfs_key key;
545 struct btrfs_extent_ref *ref;
546 struct btrfs_fs_info *info = extent_root->fs_info;
547 struct pending_extent_op *op;
548 struct extent_buffer *leaf;
549 int ret = 0;
550 struct list_head *cur = update_list->next;
551 u64 ref_objectid;
552 u64 ref_root = extent_root->root_key.objectid;
553
554 op = list_entry(cur, struct pending_extent_op, list);
555
556search:
557 key.objectid = op->bytenr;
558 key.type = BTRFS_EXTENT_REF_KEY;
559 key.offset = op->orig_parent;
560
561 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
562 BUG_ON(ret);
563
564 leaf = path->nodes[0];
565
566loop:
567 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
568
569 ref_objectid = btrfs_ref_objectid(leaf, ref);
570
571 if (btrfs_ref_root(leaf, ref) != ref_root ||
572 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
573 (ref_objectid != op->level &&
574 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
575 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
576 "root %llu, owner %u\n",
577 (unsigned long long)op->bytenr,
578 (unsigned long long)op->orig_parent,
579 (unsigned long long)ref_root, op->level);
580 btrfs_print_leaf(extent_root, leaf);
581 BUG();
582 }
583
584 key.objectid = op->bytenr;
585 key.offset = op->parent;
586 key.type = BTRFS_EXTENT_REF_KEY;
587 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
588 BUG_ON(ret);
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590 btrfs_set_ref_generation(leaf, ref, op->generation);
591
592 cur = cur->next;
593
594 list_del_init(&op->list);
595 unlock_extent(&info->extent_ins, op->bytenr,
596 op->bytenr + op->num_bytes - 1, GFP_NOFS);
597 kfree(op);
598
599 if (cur == update_list) {
600 btrfs_mark_buffer_dirty(path->nodes[0]);
601 btrfs_release_path(extent_root, path);
602 goto out;
603 }
604
605 op = list_entry(cur, struct pending_extent_op, list);
606
607 path->slots[0]++;
608 while (path->slots[0] < btrfs_header_nritems(leaf)) {
609 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
610 if (key.objectid == op->bytenr &&
611 key.type == BTRFS_EXTENT_REF_KEY)
612 goto loop;
613 path->slots[0]++;
614 }
615
616 btrfs_mark_buffer_dirty(path->nodes[0]);
617 btrfs_release_path(extent_root, path);
618 goto search;
619
620out:
621 return 0;
622}
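
/*
 * For reference, the struct pending_extent_op fields used throughout
 * this file (the full definition lives earlier in extent-tree.c):
 *
 *	type                         PENDING_EXTENT_INSERT or
 *	                             PENDING_BACKREF_UPDATE
 *	bytenr, num_bytes            the extent the op applies to
 *	parent, orig_parent          new and old parent block start
 *	generation, orig_generation  new and old generation
 *	level                        tree level (doubles as the owner for
 *	                             tree blocks)
 *	list                         links the op into insert/update/delete
 *	                             lists
 *	del                          scratch flag; its meaning depends on
 *	                             the code path using it
 */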
623
624static noinline int insert_extents(struct btrfs_trans_handle *trans,
625 struct btrfs_root *extent_root,
626 struct btrfs_path *path,
627 struct list_head *insert_list, int nr)
628{
629 struct btrfs_key *keys;
630 u32 *data_size;
631 struct pending_extent_op *op;
632 struct extent_buffer *leaf;
633 struct list_head *cur = insert_list->next;
634 struct btrfs_fs_info *info = extent_root->fs_info;
635 u64 ref_root = extent_root->root_key.objectid;
636 int i = 0, last = 0, ret;
637 int total = nr * 2;
638
639 if (!nr)
640 return 0;
641
642 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
643 if (!keys)
644 return -ENOMEM;
645
646 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
647 if (!data_size) {
648 kfree(keys);
649 return -ENOMEM;
650 }
651
652 list_for_each_entry(op, insert_list, list) {
653 keys[i].objectid = op->bytenr;
654 keys[i].offset = op->num_bytes;
655 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
656 data_size[i] = sizeof(struct btrfs_extent_item);
657 i++;
658
659 keys[i].objectid = op->bytenr;
660 keys[i].offset = op->parent;
661 keys[i].type = BTRFS_EXTENT_REF_KEY;
662 data_size[i] = sizeof(struct btrfs_extent_ref);
663 i++;
664 }
665
666 op = list_entry(cur, struct pending_extent_op, list);
667 i = 0;
668 while (i < total) {
669 int c;
670 ret = btrfs_insert_some_items(trans, extent_root, path,
671 keys+i, data_size+i, total-i);
672 BUG_ON(ret < 0);
673
674 if (last && ret > 1)
675 BUG();
676
677 leaf = path->nodes[0];
678 for (c = 0; c < ret; c++) {
679 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
680
681 /*
682			 * if the first item we inserted was a backref, then
683			 * the EXTENT_ITEM will be at the odd values of c,
684			 * otherwise at the even values of c
685 */
686 if ((ref_first && (c % 2)) ||
687 (!ref_first && !(c % 2))) {
688 struct btrfs_extent_item *itm;
689
690 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
691 struct btrfs_extent_item);
692 btrfs_set_extent_refs(path->nodes[0], itm, 1);
693 op->del++;
694 } else {
695 struct btrfs_extent_ref *ref;
696
697 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
698 struct btrfs_extent_ref);
699 btrfs_set_ref_root(leaf, ref, ref_root);
700 btrfs_set_ref_generation(leaf, ref,
701 op->generation);
702 btrfs_set_ref_objectid(leaf, ref, op->level);
703 btrfs_set_ref_num_refs(leaf, ref, 1);
704 op->del++;
705 }
706
707 /*
708			 * we use del to see when it's ok to free up the
709			 * pending_extent_op.  In the case where we insert the
710			 * last item on the list in order to help do batching,
711			 * we must not free the extent op until we actually
712			 * insert the extent_item
713 */
714 if (op->del == 2) {
715 unlock_extent(&info->extent_ins, op->bytenr,
716 op->bytenr + op->num_bytes - 1,
717 GFP_NOFS);
718 cur = cur->next;
719 list_del_init(&op->list);
720 kfree(op);
721 if (cur != insert_list)
722 op = list_entry(cur,
723 struct pending_extent_op,
724 list);
725 }
726 }
727 btrfs_mark_buffer_dirty(leaf);
728 btrfs_release_path(extent_root, path);
729
730 /*
731		 * Ok, backrefs and items usually go right next to each other,
732		 * but if we could only insert 1 item that means that we
733		 * inserted at the end of a leaf, and we have no idea what may
734		 * be on the next leaf, so we just play it safe. In order to
735		 * try and help this case we insert the last thing on our
736		 * insert list so hopefully it will end up being the last
737		 * thing on the leaf and everything else will be before it,
738		 * which will let us insert a whole bunch of items at the same
739		 * time.
740 */
741 if (ret == 1 && !last && (i + ret < total)) {
742 /*
743 * last: where we will pick up the next time around
744 * i: our current key to insert, will be total - 1
745 * cur: the current op we are screwing with
746 * op: duh
747 */
748 last = i + ret;
749 i = total - 1;
750 cur = insert_list->prev;
751 op = list_entry(cur, struct pending_extent_op, list);
752 } else if (last) {
753 /*
754 * ok we successfully inserted the last item on the
755 * list, lets reset everything
756 *
757 * i: our current key to insert, so where we left off
758 * last time
759 * last: done with this
760 * cur: the op we are messing with
761 * op: duh
762 * total: since we inserted the last key, we need to
763			 *        decrement total so we don't overflow
764 */
765 i = last;
766 last = 0;
767 total--;
768 if (i < total) {
769 cur = insert_list->next;
770 op = list_entry(cur, struct pending_extent_op,
771 list);
772 }
773 } else {
774 i += ret;
775 }
776
777 cond_resched();
778 }
779 ret = 0;
780 kfree(keys);
781 kfree(data_size);
782 return ret;
783}
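
/*
 * Illustrative item layout produced by insert_extents() above: for each
 * pending op two items are inserted back to back in the extent tree,
 *
 *	(bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes)  -> struct btrfs_extent_item
 *	(bytenr, BTRFS_EXTENT_REF_KEY,  parent)     -> struct btrfs_extent_ref
 *
 * which is why the loop above has to check whether the ref or the
 * extent item landed first whenever a batch straddles a leaf boundary.
 */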
784
785static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
786 struct btrfs_root *root,
787 struct btrfs_path *path,
788 u64 bytenr, u64 parent,
789 u64 ref_root, u64 ref_generation,
790 u64 owner_objectid)
791{
792 struct btrfs_key key;
793 struct extent_buffer *leaf;
794 struct btrfs_extent_ref *ref;
795 u32 num_refs;
796 int ret;
797
798 key.objectid = bytenr;
799 key.type = BTRFS_EXTENT_REF_KEY;
800 key.offset = parent;
801
802 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
803 if (ret == 0) {
804 leaf = path->nodes[0];
805 ref = btrfs_item_ptr(leaf, path->slots[0],
806 struct btrfs_extent_ref);
807 btrfs_set_ref_root(leaf, ref, ref_root);
808 btrfs_set_ref_generation(leaf, ref, ref_generation);
809 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
810 btrfs_set_ref_num_refs(leaf, ref, 1);
811 } else if (ret == -EEXIST) {
812 u64 existing_owner;
813 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
814 leaf = path->nodes[0];
815 ref = btrfs_item_ptr(leaf, path->slots[0],
816 struct btrfs_extent_ref);
817 if (btrfs_ref_root(leaf, ref) != ref_root ||
818 btrfs_ref_generation(leaf, ref) != ref_generation) {
819 ret = -EIO;
820 WARN_ON(1);
821 goto out;
822 }
823
824 num_refs = btrfs_ref_num_refs(leaf, ref);
825 BUG_ON(num_refs == 0);
826 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
827
828 existing_owner = btrfs_ref_objectid(leaf, ref);
829 if (existing_owner != owner_objectid &&
830 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
831 btrfs_set_ref_objectid(leaf, ref,
832 BTRFS_MULTIPLE_OBJECTIDS);
833 }
834 ret = 0;
835 } else {
836 goto out;
837 }
838 btrfs_mark_buffer_dirty(path->nodes[0]);
839out:
840 btrfs_release_path(root, path);
841 return ret;
842}
843
844static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
845 struct btrfs_root *root,
846 struct btrfs_path *path)
847{
848 struct extent_buffer *leaf;
849 struct btrfs_extent_ref *ref;
850 u32 num_refs;
851 int ret = 0;
852
853 leaf = path->nodes[0];
854 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
855 num_refs = btrfs_ref_num_refs(leaf, ref);
856 BUG_ON(num_refs == 0);
857 num_refs -= 1;
858 if (num_refs == 0) {
859 ret = btrfs_del_item(trans, root, path);
860 } else {
861 btrfs_set_ref_num_refs(leaf, ref, num_refs);
862 btrfs_mark_buffer_dirty(leaf);
863 }
864 btrfs_release_path(root, path);
865 return ret;
866}
867
868#ifdef BIO_RW_DISCARD
869static void btrfs_issue_discard(struct block_device *bdev,
870 u64 start, u64 len)
871{
872 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
873}
874#endif
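
/*
 * Note: blkdev_issue_discard() takes 512-byte sectors, hence the ">> 9"
 * conversions above. For example, a 16KiB extent starting at byte
 * offset 1MiB becomes sector 2048 with a length of 32 sectors.
 */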
875
876static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
877 u64 num_bytes)
878{
879#ifdef BIO_RW_DISCARD
880 int ret;
881 u64 map_length = num_bytes;
882 struct btrfs_multi_bio *multi = NULL;
883
884 /* Tell the block device(s) that the sectors can be discarded */
885 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
886 bytenr, &map_length, &multi, 0);
887 if (!ret) {
888 struct btrfs_bio_stripe *stripe = multi->stripes;
889 int i;
890
891 if (map_length > num_bytes)
892 map_length = num_bytes;
893
894 for (i = 0; i < multi->num_stripes; i++, stripe++) {
895 btrfs_issue_discard(stripe->dev->bdev,
896 stripe->physical,
897 map_length);
898 }
899 kfree(multi);
900 }
901
902 return ret;
903#else
904 return 0;
905#endif
906}
907
908static noinline int free_extents(struct btrfs_trans_handle *trans,
909 struct btrfs_root *extent_root,
910 struct list_head *del_list)
911{
912 struct btrfs_fs_info *info = extent_root->fs_info;
913 struct btrfs_path *path;
914 struct btrfs_key key, found_key;
915 struct extent_buffer *leaf;
916 struct list_head *cur;
917 struct pending_extent_op *op;
918 struct btrfs_extent_item *ei;
919 int ret, num_to_del, extent_slot = 0, found_extent = 0;
920 u32 refs;
921 u64 bytes_freed = 0;
922
923 path = btrfs_alloc_path();
924 if (!path)
925 return -ENOMEM;
926 path->reada = 1;
927
928search:
929 /* search for the backref for the current ref we want to delete */
930 cur = del_list->next;
931 op = list_entry(cur, struct pending_extent_op, list);
932 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
933 op->orig_parent,
934 extent_root->root_key.objectid,
935 op->orig_generation, op->level, 1);
936 if (ret) {
937 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
938 "root %llu gen %llu owner %u\n",
939 (unsigned long long)op->bytenr,
940 (unsigned long long)extent_root->root_key.objectid,
941 (unsigned long long)op->orig_generation, op->level);
942 btrfs_print_leaf(extent_root, path->nodes[0]);
943 WARN_ON(1);
944 goto out;
945 }
946
947 extent_slot = path->slots[0];
948 num_to_del = 1;
949 found_extent = 0;
950
951 /*
952 * if we aren't the first item on the leaf we can move back one and see
953 * if our ref is right next to our extent item
954 */
955 if (likely(extent_slot)) {
956 extent_slot--;
957 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
958 extent_slot);
959 if (found_key.objectid == op->bytenr &&
960 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
961 found_key.offset == op->num_bytes) {
962 num_to_del++;
963 found_extent = 1;
964 }
965 }
966
967 /*
968 * if we didn't find the extent we need to delete the backref and then
969 * search for the extent item key so we can update its ref count
970 */
971 if (!found_extent) {
972 key.objectid = op->bytenr;
973 key.type = BTRFS_EXTENT_ITEM_KEY;
974 key.offset = op->num_bytes;
975
976 ret = remove_extent_backref(trans, extent_root, path);
977 BUG_ON(ret);
978 btrfs_release_path(extent_root, path);
979 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
980 BUG_ON(ret);
981 extent_slot = path->slots[0];
982 }
983
984 /* this is where we update the ref count for the extent */
985 leaf = path->nodes[0];
986 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
987 refs = btrfs_extent_refs(leaf, ei);
988 BUG_ON(refs == 0);
989 refs--;
990 btrfs_set_extent_refs(leaf, ei, refs);
991
992 btrfs_mark_buffer_dirty(leaf);
993
994 /*
995 * This extent needs deleting. The reason cur_slot is extent_slot +
996 * num_to_del is because extent_slot points to the slot where the extent
997 * is, and if the backref was not right next to the extent we will be
998 * deleting at least 1 item, and will want to start searching at the
999 * slot directly next to extent_slot. However if we did find the
1000	 * backref next to the extent item then we will be deleting at least 2
1001 * items and will want to start searching directly after the ref slot
1002 */
1003 if (!refs) {
1004 struct list_head *pos, *n, *end;
1005 int cur_slot = extent_slot+num_to_del;
1006 u64 super_used;
1007 u64 root_used;
1008
1009 path->slots[0] = extent_slot;
1010 bytes_freed = op->num_bytes;
1011
1012 mutex_lock(&info->pinned_mutex);
1013 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1014 op->num_bytes, op->level >=
1015 BTRFS_FIRST_FREE_OBJECTID);
1016 mutex_unlock(&info->pinned_mutex);
1017 BUG_ON(ret < 0);
1018 op->del = ret;
1019
1020 /*
1021 * we need to see if we can delete multiple things at once, so
1022		 * start looping through the list of extents we want to
1023		 * delete and see if their extent/backref pairs are right next
1024		 * to each other and the extents only have 1 ref
1025 */
1026 for (pos = cur->next; pos != del_list; pos = pos->next) {
1027 struct pending_extent_op *tmp;
1028
1029 tmp = list_entry(pos, struct pending_extent_op, list);
1030
1031 /* we only want to delete extent+ref at this stage */
1032 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1033 break;
1034
1035 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1036 if (found_key.objectid != tmp->bytenr ||
1037 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1038 found_key.offset != tmp->num_bytes)
1039 break;
1040
1041 /* check to make sure this extent only has one ref */
1042 ei = btrfs_item_ptr(leaf, cur_slot,
1043 struct btrfs_extent_item);
1044 if (btrfs_extent_refs(leaf, ei) != 1)
1045 break;
1046
1047 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1048 if (found_key.objectid != tmp->bytenr ||
1049 found_key.type != BTRFS_EXTENT_REF_KEY ||
1050 found_key.offset != tmp->orig_parent)
1051 break;
1052
1053 /*
1054 * the ref is right next to the extent, we can set the
1055 * ref count to 0 since we will delete them both now
1056 */
1057 btrfs_set_extent_refs(leaf, ei, 0);
1058
1059 /* pin down the bytes for this extent */
1060 mutex_lock(&info->pinned_mutex);
1061 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1062 tmp->num_bytes, tmp->level >=
1063 BTRFS_FIRST_FREE_OBJECTID);
1064 mutex_unlock(&info->pinned_mutex);
1065 BUG_ON(ret < 0);
1066
1067 /*
1068			 * use the del field to record whether we need to go
1069			 * ahead and free up the extent when we delete the item.
1070 */
1071 tmp->del = ret;
1072 bytes_freed += tmp->num_bytes;
1073
1074 num_to_del += 2;
1075 cur_slot += 2;
1076 }
1077 end = pos;
1078
1079 /* update the free space counters */
1080 spin_lock(&info->delalloc_lock);
1081 super_used = btrfs_super_bytes_used(&info->super_copy);
1082 btrfs_set_super_bytes_used(&info->super_copy,
1083 super_used - bytes_freed);
1084
1085 root_used = btrfs_root_used(&extent_root->root_item);
1086 btrfs_set_root_used(&extent_root->root_item,
1087 root_used - bytes_freed);
1088 spin_unlock(&info->delalloc_lock);
1089
1090 /* delete the items */
1091 ret = btrfs_del_items(trans, extent_root, path,
1092 path->slots[0], num_to_del);
1093 BUG_ON(ret);
1094
1095 /*
1096 * loop through the extents we deleted and do the cleanup work
1097 * on them
1098 */
1099 for (pos = cur, n = pos->next; pos != end;
1100 pos = n, n = pos->next) {
1101 struct pending_extent_op *tmp;
1102 tmp = list_entry(pos, struct pending_extent_op, list);
1103
1104 /*
1105			 * remember tmp->del tells us whether or not we pinned
1106 * down the extent
1107 */
1108 ret = update_block_group(trans, extent_root,
1109 tmp->bytenr, tmp->num_bytes, 0,
1110 tmp->del);
1111 BUG_ON(ret);
1112
1113 list_del_init(&tmp->list);
1114 unlock_extent(&info->extent_ins, tmp->bytenr,
1115 tmp->bytenr + tmp->num_bytes - 1,
1116 GFP_NOFS);
1117 kfree(tmp);
1118 }
1119 } else if (refs && found_extent) {
1120 /*
1121		 * the ref and extent were right next to each other, but the
1122 * extent still has a ref, so just free the backref and keep
1123 * going
1124 */
1125 ret = remove_extent_backref(trans, extent_root, path);
1126 BUG_ON(ret);
1127
1128 list_del_init(&op->list);
1129 unlock_extent(&info->extent_ins, op->bytenr,
1130 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1131 kfree(op);
1132 } else {
1133 /*
1134 * the extent has multiple refs and the backref we were looking
1135		 * for was not right next to it, so just unlock it and move on;
1136		 * we're good to go
1137 */
1138 list_del_init(&op->list);
1139 unlock_extent(&info->extent_ins, op->bytenr,
1140 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1141 kfree(op);
1142 }
1143
1144 btrfs_release_path(extent_root, path);
1145 if (!list_empty(del_list))
1146 goto search;
1147
1148out:
1149 btrfs_free_path(path);
1150 return ret;
1151}
1152
1153static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1154 struct btrfs_root *root, u64 bytenr,
1155 u64 orig_parent, u64 parent,
1156 u64 orig_root, u64 ref_root,
1157 u64 orig_generation, u64 ref_generation,
1158 u64 owner_objectid)
1159{
1160 int ret;
1161 struct btrfs_root *extent_root = root->fs_info->extent_root;
1162 struct btrfs_path *path;
1163
1164 if (root == root->fs_info->extent_root) {
1165 struct pending_extent_op *extent_op;
1166 u64 num_bytes;
1167
1168 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1169 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1170 mutex_lock(&root->fs_info->extent_ins_mutex);
1171 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1172 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1173 u64 priv;
1174 ret = get_state_private(&root->fs_info->extent_ins,
1175 bytenr, &priv);
1176 BUG_ON(ret);
1177 extent_op = (struct pending_extent_op *)
1178 (unsigned long)priv;
1179 BUG_ON(extent_op->parent != orig_parent);
1180 BUG_ON(extent_op->generation != orig_generation);
1181
1182 extent_op->parent = parent;
1183 extent_op->generation = ref_generation;
1184 } else {
1185 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1186 BUG_ON(!extent_op);
1187
1188 extent_op->type = PENDING_BACKREF_UPDATE;
1189 extent_op->bytenr = bytenr;
1190 extent_op->num_bytes = num_bytes;
1191 extent_op->parent = parent;
1192 extent_op->orig_parent = orig_parent;
1193 extent_op->generation = ref_generation;
1194 extent_op->orig_generation = orig_generation;
1195 extent_op->level = (int)owner_objectid;
1196 INIT_LIST_HEAD(&extent_op->list);
1197 extent_op->del = 0;
1198
1199 set_extent_bits(&root->fs_info->extent_ins,
1200 bytenr, bytenr + num_bytes - 1,
1201 EXTENT_WRITEBACK, GFP_NOFS);
1202 set_state_private(&root->fs_info->extent_ins,
1203 bytenr, (unsigned long)extent_op);
1204 }
1205 mutex_unlock(&root->fs_info->extent_ins_mutex);
1206 return 0;
1207 }
1208
1209 path = btrfs_alloc_path();
1210 if (!path)
1211 return -ENOMEM;
1212 ret = lookup_extent_backref(trans, extent_root, path,
1213 bytenr, orig_parent, orig_root,
1214 orig_generation, owner_objectid, 1);
1215 if (ret)
1216 goto out;
1217 ret = remove_extent_backref(trans, extent_root, path);
1218 if (ret)
1219 goto out;
1220 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1221 parent, ref_root, ref_generation,
1222 owner_objectid);
1223 BUG_ON(ret);
1224 finish_current_insert(trans, extent_root, 0);
1225 del_pending_extents(trans, extent_root, 0);
1226out:
1227 btrfs_free_path(path);
1228 return ret;
1229}
1230
1231int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1232 struct btrfs_root *root, u64 bytenr,
1233 u64 orig_parent, u64 parent,
1234 u64 ref_root, u64 ref_generation,
1235 u64 owner_objectid)
1236{
1237 int ret;
1238 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1239 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1240 return 0;
1241 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1242 parent, ref_root, ref_root,
1243 ref_generation, ref_generation,
1244 owner_objectid);
1245 return ret;
1246}
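
/*
 * The early return above (and the matching one in btrfs_inc_extent_ref()
 * below) skips tree log metadata blocks: the log tree is not reference
 * counted like ordinary subvolume trees, so only file data extents
 * (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) referenced from the log
 * need real backref updates.
 */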
1247
1248static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1249 struct btrfs_root *root, u64 bytenr,
1250 u64 orig_parent, u64 parent,
1251 u64 orig_root, u64 ref_root,
1252 u64 orig_generation, u64 ref_generation,
1253 u64 owner_objectid)
1254{
1255 struct btrfs_path *path;
1256 int ret;
1257 struct btrfs_key key;
1258 struct extent_buffer *l;
1259 struct btrfs_extent_item *item;
1260 u32 refs;
1261
1262 path = btrfs_alloc_path();
1263 if (!path)
1264 return -ENOMEM;
1265
1266 path->reada = 1;
1267 key.objectid = bytenr;
1268 key.type = BTRFS_EXTENT_ITEM_KEY;
1269 key.offset = (u64)-1;
1270
1271 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1272 0, 1);
1273 if (ret < 0)
1274 return ret;
1275 BUG_ON(ret == 0 || path->slots[0] == 0);
1276
1277 path->slots[0]--;
1278 l = path->nodes[0];
1279
1280 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1281 if (key.objectid != bytenr) {
1282 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1283 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
1284 (unsigned long long)bytenr,
1285 (unsigned long long)key.objectid);
1286 BUG();
1287 }
1288 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1289
1290 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1291 refs = btrfs_extent_refs(l, item);
1292 btrfs_set_extent_refs(l, item, refs + 1);
1293 btrfs_mark_buffer_dirty(path->nodes[0]);
1294
1295 btrfs_release_path(root->fs_info->extent_root, path);
1296
1297 path->reada = 1;
1298 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1299 path, bytenr, parent,
1300 ref_root, ref_generation,
1301 owner_objectid);
1302 BUG_ON(ret);
1303 finish_current_insert(trans, root->fs_info->extent_root, 0);
1304 del_pending_extents(trans, root->fs_info->extent_root, 0);
1305
1306 btrfs_free_path(path);
1307 return 0;
1308}
1309
1310int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1311 struct btrfs_root *root,
1312 u64 bytenr, u64 num_bytes, u64 parent,
1313 u64 ref_root, u64 ref_generation,
1314 u64 owner_objectid)
1315{
1316 int ret;
1317 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1318 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1319 return 0;
1320 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1321 0, ref_root, 0, ref_generation,
1322 owner_objectid);
1323 return ret;
1324}
1325
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1);
1330 del_pending_extents(trans, root->fs_info->extent_root, 1);
1331 return 0;
1332}
1333
1334int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1335 struct btrfs_root *root, u64 bytenr,
1336 u64 num_bytes, u32 *refs)
1337{
1338 struct btrfs_path *path;
1339 int ret;
1340 struct btrfs_key key;
1341 struct extent_buffer *l;
1342 struct btrfs_extent_item *item;
1343
1344 WARN_ON(num_bytes < root->sectorsize);
1345	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1346	path->reada = 1;
1347 key.objectid = bytenr;
1348 key.offset = num_bytes;
1349 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1350 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1351 0, 0);
1352 if (ret < 0)
1353 goto out;
1354 if (ret != 0) {
1355 btrfs_print_leaf(root, path->nodes[0]);
1356 printk(KERN_INFO "btrfs failed to find block number %llu\n",
1357 (unsigned long long)bytenr);
1358 BUG();
1359 }
1360 l = path->nodes[0];
1361 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1362 *refs = btrfs_extent_refs(l, item);
1363out:
1364 btrfs_free_path(path);
1365	return ret;
1366}
1367
1368int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1369 struct btrfs_root *root, u64 objectid, u64 bytenr)
1370{
1371 struct btrfs_root *extent_root = root->fs_info->extent_root;
1372 struct btrfs_path *path;
1373 struct extent_buffer *leaf;
1374 struct btrfs_extent_ref *ref_item;
1375 struct btrfs_key key;
1376 struct btrfs_key found_key;
1377 u64 ref_root;
1378 u64 last_snapshot;
1379 u32 nritems;
1380 int ret;
1381
1382 key.objectid = bytenr;
1383 key.offset = (u64)-1;
1384 key.type = BTRFS_EXTENT_ITEM_KEY;
1385
1386	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1387 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1388 if (ret < 0)
1389 goto out;
1390 BUG_ON(ret == 0);
1391
1392 ret = -ENOENT;
1393 if (path->slots[0] == 0)
1394 goto out;
1395
1396 path->slots[0]--;
1397 leaf = path->nodes[0];
1398 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1399
1400 if (found_key.objectid != bytenr ||
1401 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1402 goto out;
1403
1404 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1405 while (1) {
1406 leaf = path->nodes[0];
1407 nritems = btrfs_header_nritems(leaf);
1408 if (path->slots[0] >= nritems) {
1409 ret = btrfs_next_leaf(extent_root, path);
1410 if (ret < 0)
1411 goto out;
1412 if (ret == 0)
1413 continue;
1414 break;
1415 }
1416 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1417 if (found_key.objectid != bytenr)
1418 break;
1419
1420 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1421 path->slots[0]++;
1422 continue;
1423 }
1424
1425 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1426 struct btrfs_extent_ref);
1427 ref_root = btrfs_ref_root(leaf, ref_item);
1428 if ((ref_root != root->root_key.objectid &&
1429 ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1430 objectid != btrfs_ref_objectid(leaf, ref_item)) {
1431 ret = 1;
1432 goto out;
1433 }
1434 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1435 ret = 1;
1436 goto out;
1437 }
1438
1439 path->slots[0]++;
1440 }
1441 ret = 0;
1442out:
1443 btrfs_free_path(path);
1444 return ret;
1445}
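
/*
 * Return semantics of btrfs_cross_ref_exist() above: 1 means a cross
 * reference may exist (a ref from another root, another objectid, or a
 * generation old enough to be visible to a snapshot), 0 means all refs
 * belong to this root and are newer than the last snapshot, and a
 * negative value is an error (including -ENOENT when no extent item was
 * found at bytenr).
 */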
1446
1447int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1448 struct extent_buffer *buf, u32 nr_extents)
1449{
1450 struct btrfs_key key;
1451 struct btrfs_file_extent_item *fi;
1452 u64 root_gen;
1453 u32 nritems;
1454 int i;
1455 int level;
1456 int ret = 0;
1457 int shared = 0;
1458
1459 if (!root->ref_cows)
1460 return 0;
1461
1462 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1463 shared = 0;
1464 root_gen = root->root_key.offset;
1465 } else {
1466 shared = 1;
1467 root_gen = trans->transid - 1;
1468 }
1469
1470 level = btrfs_header_level(buf);
1471 nritems = btrfs_header_nritems(buf);
1472
1473 if (level == 0) {
1474 struct btrfs_leaf_ref *ref;
1475 struct btrfs_extent_info *info;
1476
1477 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1478 if (!ref) {
1479 ret = -ENOMEM;
1480 goto out;
1481 }
1482
1483 ref->root_gen = root_gen;
1484 ref->bytenr = buf->start;
1485 ref->owner = btrfs_header_owner(buf);
1486 ref->generation = btrfs_header_generation(buf);
1487 ref->nritems = nr_extents;
1488 info = ref->extents;
1489
1490 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1491 u64 disk_bytenr;
1492 btrfs_item_key_to_cpu(buf, &key, i);
1493 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1494 continue;
1495 fi = btrfs_item_ptr(buf, i,
1496 struct btrfs_file_extent_item);
1497 if (btrfs_file_extent_type(buf, fi) ==
1498 BTRFS_FILE_EXTENT_INLINE)
1499 continue;
1500 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1501 if (disk_bytenr == 0)
1502 continue;
1503
1504 info->bytenr = disk_bytenr;
1505 info->num_bytes =
1506 btrfs_file_extent_disk_num_bytes(buf, fi);
1507 info->objectid = key.objectid;
1508 info->offset = key.offset;
1509 info++;
1510 }
1511
1512 ret = btrfs_add_leaf_ref(root, ref, shared);
1513 if (ret == -EEXIST && shared) {
1514 struct btrfs_leaf_ref *old;
1515 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1516 BUG_ON(!old);
1517 btrfs_remove_leaf_ref(root, old);
1518 btrfs_free_leaf_ref(root, old);
1519 ret = btrfs_add_leaf_ref(root, ref, shared);
1520 }
1521 WARN_ON(ret);
1522 btrfs_free_leaf_ref(root, ref);
1523 }
1524out:
1525 return ret;
1526}
1527
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1530 u32 *nr_extents)
1531{
1532 u64 bytenr;
1533 u64 ref_root;
1534 u64 orig_root;
1535 u64 ref_generation;
1536 u64 orig_generation;
1537 u32 nritems;
1538 u32 nr_file_extents = 0;
1539 struct btrfs_key key;
1540 struct btrfs_file_extent_item *fi;
1541 int i;
1542 int level;
1543 int ret = 0;
1544 int faili = 0;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64);
1547
1548 ref_root = btrfs_header_owner(buf);
1549 ref_generation = btrfs_header_generation(buf);
1550 orig_root = btrfs_header_owner(orig_buf);
1551 orig_generation = btrfs_header_generation(orig_buf);
1552
1553 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf);
1555
1556 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref;
1558 } else {
1559 if (level == 0 &&
1560 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1561 goto out;
1562 if (level != 0 &&
1563 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1564 goto out;
1565 process_func = __btrfs_update_extent_ref;
1566 }
1567
1568 for (i = 0; i < nritems; i++) {
1569 cond_resched();
1570 if (level == 0) {
1571 btrfs_item_key_to_cpu(buf, &key, i);
1572 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1573 continue;
1574 fi = btrfs_item_ptr(buf, i,
1575 struct btrfs_file_extent_item);
1576 if (btrfs_file_extent_type(buf, fi) ==
1577 BTRFS_FILE_EXTENT_INLINE)
1578 continue;
1579 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1580 if (bytenr == 0)
1581 continue;
1582
1583 nr_file_extents++;
1584
1585 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start,
1587 orig_root, ref_root,
1588 orig_generation, ref_generation,
1589 key.objectid);
1590
1591 if (ret) {
1592 faili = i;
1593 WARN_ON(1);
1594 goto fail;
1595 }
1596 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start,
1600 orig_root, ref_root,
1601 orig_generation, ref_generation,
1602 level - 1);
1603 if (ret) {
1604 faili = i;
1605 WARN_ON(1);
1606 goto fail;
1607 }
1608 }
1609 }
1610out:
1611 if (nr_extents) {
1612 if (level == 0)
1613 *nr_extents = nr_file_extents;
1614 else
1615 *nr_extents = nritems;
1616 }
1617 return 0;
1618fail:
1619 WARN_ON(1);
1620 return ret;
1621}
1622
1623int btrfs_update_ref(struct btrfs_trans_handle *trans,
1624 struct btrfs_root *root, struct extent_buffer *orig_buf,
1625 struct extent_buffer *buf, int start_slot, int nr)
1626
1627{
1628 u64 bytenr;
1629 u64 ref_root;
1630 u64 orig_root;
1631 u64 ref_generation;
1632 u64 orig_generation;
1633 struct btrfs_key key;
1634 struct btrfs_file_extent_item *fi;
1635 int i;
1636 int ret;
1637 int slot;
1638 int level;
1639
1640 BUG_ON(start_slot < 0);
1641 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1642
1643 ref_root = btrfs_header_owner(buf);
1644 ref_generation = btrfs_header_generation(buf);
1645 orig_root = btrfs_header_owner(orig_buf);
1646 orig_generation = btrfs_header_generation(orig_buf);
1647 level = btrfs_header_level(buf);
1648
1649 if (!root->ref_cows) {
1650 if (level == 0 &&
1651 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1652 return 0;
1653 if (level != 0 &&
1654 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1655 return 0;
1656 }
1657
1658 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1659 cond_resched();
1660 if (level == 0) {
1661 btrfs_item_key_to_cpu(buf, &key, slot);
1662 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1663 continue;
1664 fi = btrfs_item_ptr(buf, slot,
1665 struct btrfs_file_extent_item);
1666 if (btrfs_file_extent_type(buf, fi) ==
1667 BTRFS_FILE_EXTENT_INLINE)
1668 continue;
1669 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1670 if (bytenr == 0)
1671 continue;
1672 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1673 orig_buf->start, buf->start,
1674 orig_root, ref_root,
1675 orig_generation, ref_generation,
1676 key.objectid);
1677 if (ret)
1678 goto fail;
1679 } else {
1680 bytenr = btrfs_node_blockptr(buf, slot);
1681 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1682 orig_buf->start, buf->start,
1683 orig_root, ref_root,
1684 orig_generation, ref_generation,
1685 level - 1);
1686 if (ret)
1687 goto fail;
1688 }
1689 }
1690 return 0;
1691fail:
1692 WARN_ON(1);
1693 return -1;
1694}
1695
1696static int write_one_cache_group(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root,
1698 struct btrfs_path *path,
1699 struct btrfs_block_group_cache *cache)
1700{
1701 int ret;
1702 int pending_ret;
1703 struct btrfs_root *extent_root = root->fs_info->extent_root;
1704 unsigned long bi;
1705 struct extent_buffer *leaf;
1706
1707 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1708 if (ret < 0)
1709 goto fail;
1710 BUG_ON(ret);
1711
1712 leaf = path->nodes[0];
1713 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1714 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1715 btrfs_mark_buffer_dirty(leaf);
1716 btrfs_release_path(extent_root, path);
1717fail:
1718 finish_current_insert(trans, extent_root, 0);
1719 pending_ret = del_pending_extents(trans, extent_root, 0);
1720 if (ret)
1721 return ret;
1722 if (pending_ret)
1723 return pending_ret;
1724 return 0;
1725
1726}
1727
1728int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1729 struct btrfs_root *root)
1730{
1731 struct btrfs_block_group_cache *cache, *entry;
1732 struct rb_node *n;
1733 int err = 0;
1734 int werr = 0;
1735 struct btrfs_path *path;
1736 u64 last = 0;
1737
1738 path = btrfs_alloc_path();
1739 if (!path)
1740 return -ENOMEM;
1741
1742 while (1) {
1743 cache = NULL;
1744 spin_lock(&root->fs_info->block_group_cache_lock);
1745 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1746 n; n = rb_next(n)) {
1747 entry = rb_entry(n, struct btrfs_block_group_cache,
1748 cache_node);
1749 if (entry->dirty) {
1750 cache = entry;
1751 break;
1752 }
1753 }
1754 spin_unlock(&root->fs_info->block_group_cache_lock);
1755
1756 if (!cache)
1757 break;
1758
1759 cache->dirty = 0;
1760 last += cache->key.offset;
1761
1762 err = write_one_cache_group(trans, root,
1763 path, cache);
1764 /*
1765 * if we fail to write the cache group, we want
1766 * to keep it marked dirty in hopes that a later
1767 * write will work
1768 */
1769 if (err) {
1770 werr = err;
1771 continue;
1772 }
1773 }
1774 btrfs_free_path(path);
1775 return werr;
1776}
1777
1778int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1779{
1780 struct btrfs_block_group_cache *block_group;
1781 int readonly = 0;
1782
1783 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1784 if (!block_group || block_group->ro)
1785 readonly = 1;
1786 if (block_group)
1787 put_block_group(block_group);
1788 return readonly;
1789}
1790
1791static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1792 u64 total_bytes, u64 bytes_used,
1793 struct btrfs_space_info **space_info)
1794{
1795 struct btrfs_space_info *found;
1796
1797 found = __find_space_info(info, flags);
1798 if (found) {
1799 spin_lock(&found->lock);
1800 found->total_bytes += total_bytes;
1801 found->bytes_used += bytes_used;
1802 found->full = 0;
1803 spin_unlock(&found->lock);
1804 *space_info = found;
1805 return 0;
1806 }
1807 found = kzalloc(sizeof(*found), GFP_NOFS);
1808 if (!found)
1809 return -ENOMEM;
1810
1811 list_add(&found->list, &info->space_info);
1812 INIT_LIST_HEAD(&found->block_groups);
1813 init_rwsem(&found->groups_sem);
1814 spin_lock_init(&found->lock);
1815 found->flags = flags;
1816 found->total_bytes = total_bytes;
1817 found->bytes_used = bytes_used;
1818 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0;
1821 found->full = 0;
1822 found->force_alloc = 0;
1823 *space_info = found;
1824 return 0;
1825}
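
/*
 * A hedged usage sketch (mirroring do_chunk_alloc() below): callers
 * either find the existing space_info for a set of flags or create a
 * fresh one on demand; total_bytes and bytes_used here are illustrative:
 *
 *	struct btrfs_space_info *sinfo;
 *	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA,
 *				total_bytes, bytes_used, &sinfo);
 *	if (ret)
 *		return ret;
 */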
1826
1827static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1828{
1829 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1830 BTRFS_BLOCK_GROUP_RAID1 |
1831 BTRFS_BLOCK_GROUP_RAID10 |
1832 BTRFS_BLOCK_GROUP_DUP);
1833 if (extra_flags) {
1834 if (flags & BTRFS_BLOCK_GROUP_DATA)
1835 fs_info->avail_data_alloc_bits |= extra_flags;
1836 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1837 fs_info->avail_metadata_alloc_bits |= extra_flags;
1838 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1839 fs_info->avail_system_alloc_bits |= extra_flags;
1840 }
1841}
1842
1843static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1844{
1845 spin_lock(&cache->space_info->lock);
1846 spin_lock(&cache->lock);
1847 if (!cache->ro) {
1848 cache->space_info->bytes_readonly += cache->key.offset -
1849 btrfs_block_group_used(&cache->item);
1850 cache->ro = 1;
1851 }
1852 spin_unlock(&cache->lock);
1853 spin_unlock(&cache->space_info->lock);
1854}
1855
1856u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1857{
1858 u64 num_devices = root->fs_info->fs_devices->rw_devices;
1859
1860 if (num_devices == 1)
1861 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1862 if (num_devices < 4)
1863 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1864
1865 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1866 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1867 BTRFS_BLOCK_GROUP_RAID10))) {
1868 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1869 }
1870
1871 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1872 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1873 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1874 }
1875
1876 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1877 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1878 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1879 (flags & BTRFS_BLOCK_GROUP_DUP)))
1880 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1881 return flags;
1882}
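
/*
 * Worked examples for btrfs_reduce_alloc_profile() above: with a single
 * rw device, (DATA | RAID1 | RAID10) reduces to plain DATA, since RAID1
 * and RAID0 are stripped for one device and RAID10 needs at least four.
 * With two devices, (METADATA | DUP | RAID1) reduces to
 * (METADATA | RAID1), because DUP gives way to RAID1.
 */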
1883
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force)
1887{
1888 struct btrfs_space_info *space_info;
1889 u64 thresh;
1890 int ret = 0;
1891
1892 mutex_lock(&extent_root->fs_info->chunk_mutex);
1893
1894 flags = btrfs_reduce_alloc_profile(extent_root, flags);
1895
1896 space_info = __find_space_info(extent_root->fs_info, flags);
1897 if (!space_info) {
1898 ret = update_space_info(extent_root->fs_info, flags,
1899 0, 0, &space_info);
1900 BUG_ON(ret);
1901 }
1902 BUG_ON(!space_info);
1903
1904 spin_lock(&space_info->lock);
1905 if (space_info->force_alloc) {
1906 force = 1;
1907 space_info->force_alloc = 0;
1908 }
1909 if (space_info->full) {
1910 spin_unlock(&space_info->lock);
1911 goto out;
1912 }
1913
1914 thresh = space_info->total_bytes - space_info->bytes_readonly;
1915 thresh = div_factor(thresh, 6);
1916 if (!force &&
1917 (space_info->bytes_used + space_info->bytes_pinned +
1918 space_info->bytes_reserved + alloc_bytes) < thresh) {
1919 spin_unlock(&space_info->lock);
1920 goto out;
1921 }
1922 spin_unlock(&space_info->lock);
1923
1924 ret = btrfs_alloc_chunk(trans, extent_root, flags);
1925 if (ret)
1926 space_info->full = 1;
1927out:
1928 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1929 return ret;
1930}
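
/*
 * On the threshold above: assuming div_factor(n, f) computes n * f / 10
 * as defined earlier in this file, a new chunk is only allocated once
 * used + pinned + reserved + alloc_bytes crosses 60% of the writable
 * space for this profile (or when the allocation is forced).
 */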
1931
1932static int update_block_group(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root,
1934 u64 bytenr, u64 num_bytes, int alloc,
1935 int mark_free)
1936{
1937 struct btrfs_block_group_cache *cache;
1938 struct btrfs_fs_info *info = root->fs_info;
1939 u64 total = num_bytes;
1940 u64 old_val;
1941 u64 byte_in_group;
1942
1943 while (total) {
1944 cache = btrfs_lookup_block_group(info, bytenr);
1945 if (!cache)
1946 return -1;
1947 byte_in_group = bytenr - cache->key.objectid;
1948 WARN_ON(byte_in_group > cache->key.offset);
1949
1950 spin_lock(&cache->space_info->lock);
1951 spin_lock(&cache->lock);
1952 cache->dirty = 1;
1953 old_val = btrfs_block_group_used(&cache->item);
1954 num_bytes = min(total, cache->key.offset - byte_in_group);
1955 if (alloc) {
1956 old_val += num_bytes;
1957 cache->space_info->bytes_used += num_bytes;
1958 if (cache->ro)
1959 cache->space_info->bytes_readonly -= num_bytes;
1960 btrfs_set_block_group_used(&cache->item, old_val);
1961 spin_unlock(&cache->lock);
1962 spin_unlock(&cache->space_info->lock);
1963 } else {
1964 old_val -= num_bytes;
1965 cache->space_info->bytes_used -= num_bytes;
1966 if (cache->ro)
1967 cache->space_info->bytes_readonly += num_bytes;
1968 btrfs_set_block_group_used(&cache->item, old_val);
1969 spin_unlock(&cache->lock);
1970 spin_unlock(&cache->space_info->lock);
1971 if (mark_free) {
1972 int ret;
1973
1974 ret = btrfs_discard_extent(root, bytenr,
1975 num_bytes);
1976 WARN_ON(ret);
1977
1978 ret = btrfs_add_free_space(cache, bytenr,
1979 num_bytes);
1980 WARN_ON(ret);
1981 }
1982 }
1983 put_block_group(cache);
1984 total -= num_bytes;
1985 bytenr += num_bytes;
1986 }
1987 return 0;
1988}
1989
1990static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1991{
1992 struct btrfs_block_group_cache *cache;
1993 u64 bytenr;
1994
1995 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1996 if (!cache)
1997 return 0;
1998
1999 bytenr = cache->key.objectid;
2000 put_block_group(cache);
2001
2002 return bytenr;
2003}
2004
2005int btrfs_update_pinned_extents(struct btrfs_root *root,
2006 u64 bytenr, u64 num, int pin)
2007{
2008 u64 len;
2009 struct btrfs_block_group_cache *cache;
2010 struct btrfs_fs_info *fs_info = root->fs_info;
2011
2012 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2013 if (pin) {
2014 set_extent_dirty(&fs_info->pinned_extents,
2015 bytenr, bytenr + num - 1, GFP_NOFS);
2016 } else {
2017 clear_extent_dirty(&fs_info->pinned_extents,
2018 bytenr, bytenr + num - 1, GFP_NOFS);
2019 }
2020 while (num > 0) {
2021 cache = btrfs_lookup_block_group(fs_info, bytenr);
2022 BUG_ON(!cache);
2023 len = min(num, cache->key.offset -
2024 (bytenr - cache->key.objectid));
2025 if (pin) {
2026 spin_lock(&cache->space_info->lock);
2027 spin_lock(&cache->lock);
2028 cache->pinned += len;
2029 cache->space_info->bytes_pinned += len;
2030 spin_unlock(&cache->lock);
2031 spin_unlock(&cache->space_info->lock);
2032 fs_info->total_pinned += len;
2033 } else {
2034 spin_lock(&cache->space_info->lock);
2035 spin_lock(&cache->lock);
2036 cache->pinned -= len;
2037 cache->space_info->bytes_pinned -= len;
2038 spin_unlock(&cache->lock);
2039 spin_unlock(&cache->space_info->lock);
2040 fs_info->total_pinned -= len;
2041 if (cache->cached)
2042 btrfs_add_free_space(cache, bytenr, len);
2043 }
2044 put_block_group(cache);
2045 bytenr += len;
2046 num -= len;
2047 }
2048 return 0;
2049}
2050
2051static int update_reserved_extents(struct btrfs_root *root,
2052 u64 bytenr, u64 num, int reserve)
2053{
2054 u64 len;
2055 struct btrfs_block_group_cache *cache;
2056 struct btrfs_fs_info *fs_info = root->fs_info;
2057
2058 while (num > 0) {
2059 cache = btrfs_lookup_block_group(fs_info, bytenr);
2060 BUG_ON(!cache);
2061 len = min(num, cache->key.offset -
2062 (bytenr - cache->key.objectid));
2063
2064 spin_lock(&cache->space_info->lock);
2065 spin_lock(&cache->lock);
2066 if (reserve) {
2067 cache->reserved += len;
2068 cache->space_info->bytes_reserved += len;
2069 } else {
2070 cache->reserved -= len;
2071 cache->space_info->bytes_reserved -= len;
2072 }
2073 spin_unlock(&cache->lock);
2074 spin_unlock(&cache->space_info->lock);
2075 put_block_group(cache);
2076 bytenr += len;
2077 num -= len;
2078 }
2079 return 0;
2080}
2081
2082int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2083{
2084 u64 last = 0;
2085 u64 start;
2086 u64 end;
2087 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2088 int ret;
2089
2090 mutex_lock(&root->fs_info->pinned_mutex);
2091 while (1) {
2092 ret = find_first_extent_bit(pinned_extents, last,
2093 &start, &end, EXTENT_DIRTY);
2094 if (ret)
2095 break;
2096 set_extent_dirty(copy, start, end, GFP_NOFS);
2097 last = end + 1;
2098 }
2099 mutex_unlock(&root->fs_info->pinned_mutex);
2100 return 0;
2101}
2102
2103int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2104 struct btrfs_root *root,
2105 struct extent_io_tree *unpin)
2106{
2107 u64 start;
2108 u64 end;
2109 int ret;
2110
2111 mutex_lock(&root->fs_info->pinned_mutex);
2112 while (1) {
2113 ret = find_first_extent_bit(unpin, 0, &start, &end,
2114 EXTENT_DIRTY);
2115 if (ret)
2116 break;
2117
2118 ret = btrfs_discard_extent(root, start, end + 1 - start);
2119
2120 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2121 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2122
2123 if (need_resched()) {
2124 mutex_unlock(&root->fs_info->pinned_mutex);
2125 cond_resched();
2126 mutex_lock(&root->fs_info->pinned_mutex);
2127 }
2128 }
2129 mutex_unlock(&root->fs_info->pinned_mutex);
2130 return ret;
2131}
2132
2133static int finish_current_insert(struct btrfs_trans_handle *trans,
2134 struct btrfs_root *extent_root, int all)
2135{
2136 u64 start;
2137 u64 end;
2138 u64 priv;
2139 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list;
2145 int ret;
2146 int num_inserts = 0, max_inserts;
2147
2148	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
2149 INIT_LIST_HEAD(&insert_list);
2150 INIT_LIST_HEAD(&update_list);
2151
2152 max_inserts = extent_root->leafsize /
2153 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2154 sizeof(struct btrfs_extent_ref) +
2155 sizeof(struct btrfs_extent_item));
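	/*
	 * illustrative arithmetic: assuming 4KiB leaves and the packed
	 * on-disk sizes (17-byte keys, 25-byte item headers, a 28-byte
	 * btrfs_extent_ref and a 4-byte btrfs_extent_item), this works
	 * out to 4096 / 116 = 35 pending inserts per batch
	 */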
2156again:
2157 mutex_lock(&info->extent_ins_mutex);
2158 while (1) {
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK);
2161 if (ret) {
2162 if (skipped && all && !num_inserts) {
2163 skipped = 0;
2164 search = 0;
2165 continue;
2166 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break;
2169 }
2170
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) {
2173 skipped = 1;
2174 search = end + 1;
2175 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex);
2177 cond_resched();
2178 mutex_lock(&info->extent_ins_mutex);
2179 }
2180 continue;
2181 }
2182
2183 ret = get_state_private(&info->extent_ins, start, &priv);
2184 BUG_ON(ret);
2185 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2186
2187 if (extent_op->type == PENDING_EXTENT_INSERT) {
2188 num_inserts++;
2189 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1;
2191 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex);
2193 break;
2194 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2196 list_add_tail(&extent_op->list, &update_list);
2197 search = end + 1;
2198 } else {
2199 BUG();
2200 }
2201 }
2202
2203 /*
2204	 * process the update list, clear the writeback bit for it, and if
2205	 * somebody marked this thing for deletion then just unlock it and be
2206	 * done; free_extents() will handle it
2207 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1,
2212 EXTENT_WRITEBACK, GFP_NOFS);
2213 if (extent_op->del) {
2214 list_del_init(&extent_op->list);
2215 unlock_extent(&info->extent_ins, extent_op->bytenr,
2216 extent_op->bytenr + extent_op->num_bytes
2217 - 1, GFP_NOFS);
2218 kfree(extent_op);
2219 }
2220 }
2221 mutex_unlock(&info->extent_ins_mutex);
2222
2223 /*
2224	 * if we still have things left on the update list, go ahead and
2225	 * update everything
2226 */
2227 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret);
2230 }
2231
2232 /*
2233	 * if no inserts need to be done, but we skipped some extents and we
2234	 * need to make sure everything is cleaned up, then reset everything
2235	 * and go back to the beginning
2236 */
2237 if (!num_inserts && all && skipped) {
2238 search = 0;
2239 skipped = 0;
2240 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list);
2242 goto again;
2243 } else if (!num_inserts) {
2244 goto out;
2245 }
2246
2247 /*
2248 * process the insert extents list. Again if we are deleting this
2249 * extent, then just unlock it, pin down the bytes if need be, and be
2250	 * done with it.  This saves us from having to actually insert the
2251	 * extent into the tree and then subsequently come along and delete it
2252 */
2253 mutex_lock(&info->extent_ins_mutex);
2254 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2255 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2256 extent_op->bytenr + extent_op->num_bytes - 1,
2257 EXTENT_WRITEBACK, GFP_NOFS);
2258 if (extent_op->del) {
2259 u64 used;
2260 list_del_init(&extent_op->list);
2261 unlock_extent(&info->extent_ins, extent_op->bytenr,
2262 extent_op->bytenr + extent_op->num_bytes
2263 - 1, GFP_NOFS);
2264
2265 mutex_lock(&extent_root->fs_info->pinned_mutex);
2266 ret = pin_down_bytes(trans, extent_root,
2267 extent_op->bytenr,
2268 extent_op->num_bytes, 0);
2269 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2270
2271 spin_lock(&info->delalloc_lock);
2272 used = btrfs_super_bytes_used(&info->super_copy);
2273 btrfs_set_super_bytes_used(&info->super_copy,
2274 used - extent_op->num_bytes);
2275 used = btrfs_root_used(&extent_root->root_item);
2276 btrfs_set_root_used(&extent_root->root_item,
2277 used - extent_op->num_bytes);
2278 spin_unlock(&info->delalloc_lock);
2279
2280 ret = update_block_group(trans, extent_root,
2281 extent_op->bytenr,
2282 extent_op->num_bytes,
2283 0, ret > 0);
2284 BUG_ON(ret);
2285 kfree(extent_op);
2286 num_inserts--;
2287 }
2288 }
2289 mutex_unlock(&info->extent_ins_mutex);
2290
2291 ret = insert_extents(trans, extent_root, path, &insert_list,
2292 num_inserts);
2293 BUG_ON(ret);
2294
2295 /*
2296 * if we broke out of the loop in order to insert stuff because we hit
2297 * the maximum number of inserts at a time we can handle, then loop
2298 * back and pick up where we left off
2299 */
2300 if (num_inserts == max_inserts) {
2301 INIT_LIST_HEAD(&insert_list);
2302 INIT_LIST_HEAD(&update_list);
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */
2312 if (all && skipped) {
2313 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list);
2315 search = 0;
2316 skipped = 0;
2317 num_inserts = 0;
2318 goto again;
2319 }
2320out:
2321 btrfs_free_path(path);
2322 return 0;
2323}
2324
2325static int pin_down_bytes(struct btrfs_trans_handle *trans,
2326 struct btrfs_root *root,
2327 u64 bytenr, u64 num_bytes, int is_data)
2328{
2329 int err = 0;
2330 struct extent_buffer *buf;
2331
2332 if (is_data)
2333 goto pinit;
2334
2335 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2336 if (!buf)
2337 goto pinit;
2338
2339 /* we can reuse a block if it hasn't been written
2340 * and it is from this transaction. We can't
2341 * reuse anything from the tree log root because
2342 * it has tiny sub-transactions.
2343 */
2344 if (btrfs_buffer_uptodate(buf, 0) &&
2345 btrfs_try_tree_lock(buf)) {
2346 u64 header_owner = btrfs_header_owner(buf);
2347 u64 header_transid = btrfs_header_generation(buf);
2348 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2349 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2350 header_transid == trans->transid &&
2351 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2352 clean_tree_block(NULL, root, buf);
2353 btrfs_tree_unlock(buf);
2354 free_extent_buffer(buf);
2355 return 1;
2356 }
2357 btrfs_tree_unlock(buf);
2358 }
2359 free_extent_buffer(buf);
2360pinit:
2361 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2362
2363 BUG_ON(err < 0);
2364 return 0;
2365}
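
/*
 * Return semantics of pin_down_bytes() above: 1 means the tree block
 * was clean, from this transaction, and could be dropped immediately
 * without pinning; 0 means the bytes were pinned until the transaction
 * commits. Callers stash this in op->del and feed it to
 * update_block_group() as the mark_free argument.
 */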
2366
2367/*
2368 * remove an extent from the root, returns 0 on success
2369 */
2370static int __free_extent(struct btrfs_trans_handle *trans,
2371 struct btrfs_root *root,
2372 u64 bytenr, u64 num_bytes, u64 parent,
2373 u64 root_objectid, u64 ref_generation,
2374 u64 owner_objectid, int pin, int mark_free)
2375{
2376 struct btrfs_path *path;
2377 struct btrfs_key key;
2378 struct btrfs_fs_info *info = root->fs_info;
2379 struct btrfs_root *extent_root = info->extent_root;
2380 struct extent_buffer *leaf;
2381 int ret;
2382 int extent_slot = 0;
2383 int found_extent = 0;
2384 int num_to_del = 1;
2385 struct btrfs_extent_item *ei;
2386 u32 refs;
2387
2388 key.objectid = bytenr;
2389 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2390 key.offset = num_bytes;
2391 path = btrfs_alloc_path();
2392 if (!path)
2393 return -ENOMEM;
2394
2395 path->reada = 1;
2396 ret = lookup_extent_backref(trans, extent_root, path,
2397 bytenr, parent, root_objectid,
2398 ref_generation, owner_objectid, 1);
2399 if (ret == 0) {
2400 struct btrfs_key found_key;
2401 extent_slot = path->slots[0];
2402 while (extent_slot > 0) {
2403 extent_slot--;
2404 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2405 extent_slot);
2406 if (found_key.objectid != bytenr)
2407 break;
2408 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2409 found_key.offset == num_bytes) {
2410 found_extent = 1;
2411 break;
2412 }
2413 if (path->slots[0] - extent_slot > 5)
2414 break;
2415 }
2416 if (!found_extent) {
2417 ret = remove_extent_backref(trans, extent_root, path);
2418 BUG_ON(ret);
2419 btrfs_release_path(extent_root, path);
2420 ret = btrfs_search_slot(trans, extent_root,
2421 &key, path, -1, 1);
2422 if (ret) {
2423 printk(KERN_ERR "umm, got %d back from search"
2424 ", was looking for %llu\n", ret,
2425 (unsigned long long)bytenr);
2426 btrfs_print_leaf(extent_root, path->nodes[0]);
2427 }
2428 BUG_ON(ret);
2429 extent_slot = path->slots[0];
2430 }
2431 } else {
2432 btrfs_print_leaf(extent_root, path->nodes[0]);
2433 WARN_ON(1);
2434 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2435 "root %llu gen %llu owner %llu\n",
2436 (unsigned long long)bytenr,
2437 (unsigned long long)root_objectid,
2438 (unsigned long long)ref_generation,
2439 (unsigned long long)owner_objectid);
2440 }
2441
2442 leaf = path->nodes[0];
2443 ei = btrfs_item_ptr(leaf, extent_slot,
2444 struct btrfs_extent_item);
2445 refs = btrfs_extent_refs(leaf, ei);
2446 BUG_ON(refs == 0);
2447 refs -= 1;
2448 btrfs_set_extent_refs(leaf, ei, refs);
2449
2450 btrfs_mark_buffer_dirty(leaf);
2451
2452 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2453 struct btrfs_extent_ref *ref;
2454 ref = btrfs_item_ptr(leaf, path->slots[0],
2455 struct btrfs_extent_ref);
2456 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2457 /* if the back ref and the extent are next to each other
2458 * they get deleted below in one shot
2459 */
2460 path->slots[0] = extent_slot;
2461 num_to_del = 2;
2462 } else if (found_extent) {
2463 /* otherwise delete the extent back ref */
2464 ret = remove_extent_backref(trans, extent_root, path);
2465 BUG_ON(ret);
2466 /* if refs are 0, we need to setup the path for deletion */
2467 if (refs == 0) {
2468 btrfs_release_path(extent_root, path);
2469 ret = btrfs_search_slot(trans, extent_root, &key, path,
2470 -1, 1);
2471 BUG_ON(ret);
2472 }
2473 }
2474
2475 if (refs == 0) {
2476 u64 super_used;
2477 u64 root_used;
2478
2479 if (pin) {
2480 mutex_lock(&root->fs_info->pinned_mutex);
2481 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2482 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2483 mutex_unlock(&root->fs_info->pinned_mutex);
2484 if (ret > 0)
2485 mark_free = 1;
2486 BUG_ON(ret < 0);
2487 }
2488 /* block accounting for super block */
2489 spin_lock(&info->delalloc_lock);
2490 super_used = btrfs_super_bytes_used(&info->super_copy);
2491 btrfs_set_super_bytes_used(&info->super_copy,
2492 super_used - num_bytes);
2493
2494 /* block accounting for root item */
2495 root_used = btrfs_root_used(&root->root_item);
2496 btrfs_set_root_used(&root->root_item,
2497 root_used - num_bytes);
2498 spin_unlock(&info->delalloc_lock);
2499 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2500 num_to_del);
2501 BUG_ON(ret);
2502 btrfs_release_path(extent_root, path);
2503
2504 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2505 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2506 BUG_ON(ret);
2507 }
2508
2509 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2510 mark_free);
2511 BUG_ON(ret);
2512 }
2513 btrfs_free_path(path);
2514 finish_current_insert(trans, extent_root, 0);
2515 return ret;
2516}
2517
2518/*
2519 * find all the blocks marked as pending in the radix tree and remove
2520 * them from the extent map
2521 */
2522static int del_pending_extents(struct btrfs_trans_handle *trans,
2523 struct btrfs_root *extent_root, int all)
2524{
2525 int ret;
2526 int err = 0;
2527 u64 start;
2528 u64 end;
2529 u64 priv;
2530 u64 search = 0;
2531 int nr = 0, skipped = 0;
2532 struct extent_io_tree *pending_del;
2533 struct extent_io_tree *extent_ins;
2534 struct pending_extent_op *extent_op;
2535 struct btrfs_fs_info *info = extent_root->fs_info;
2536 struct list_head delete_list;
2537
2538 INIT_LIST_HEAD(&delete_list);
2539 extent_ins = &extent_root->fs_info->extent_ins;
2540 pending_del = &extent_root->fs_info->pending_del;
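 /*
  * both trees reuse the EXTENT_WRITEBACK bit to flag ranges that have
  * a struct pending_extent_op attached via set_state_private(); the
  * records are produced by __btrfs_free_extent() and
  * __btrfs_alloc_reserved_extent() below
  */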
2541
2542again:
2543 mutex_lock(&info->extent_ins_mutex);
2544 while (1) {
2545 ret = find_first_extent_bit(pending_del, search, &start, &end,
2546 EXTENT_WRITEBACK);
2547 if (ret) {
2548 if (all && skipped && !nr) {
2549 search = 0;
2550 continue;
2551 }
2552 mutex_unlock(&info->extent_ins_mutex);
2553 break;
2554 }
2555
2556 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2557 if (!ret) {
2558 search = end+1;
2559 skipped = 1;
2560
2561 if (need_resched()) {
2562 mutex_unlock(&info->extent_ins_mutex);
2563 cond_resched();
2564 mutex_lock(&info->extent_ins_mutex);
2565 }
2566
2567 continue;
2568 }
2569 BUG_ON(ret < 0);
2570
2571 ret = get_state_private(pending_del, start, &priv);
2572 BUG_ON(ret);
2573 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2574
2575 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2576 GFP_NOFS);
2577 if (!test_range_bit(extent_ins, start, end,
2578 EXTENT_WRITEBACK, 0)) {
2579 list_add_tail(&extent_op->list, &delete_list);
2580 nr++;
2581 } else {
2582 kfree(extent_op);
2583
2584 ret = get_state_private(&info->extent_ins, start,
2585 &priv);
2586 BUG_ON(ret);
2587 extent_op = (struct pending_extent_op *)
2588 (unsigned long)priv;
2589
2590 clear_extent_bits(&info->extent_ins, start, end,
2591 EXTENT_WRITEBACK, GFP_NOFS);
2592
2593 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2594 list_add_tail(&extent_op->list, &delete_list);
2595 search = end + 1;
2596 nr++;
2597 continue;
2598 }
2599
2600 mutex_lock(&extent_root->fs_info->pinned_mutex);
2601 ret = pin_down_bytes(trans, extent_root, start,
2602 end + 1 - start, 0);
2603 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2604
2605 ret = update_block_group(trans, extent_root, start,
2606 end + 1 - start, 0, ret > 0);
2607
2608 unlock_extent(extent_ins, start, end, GFP_NOFS);
2609 BUG_ON(ret);
2610 kfree(extent_op);
2611 }
2612 if (ret)
2613 err = ret;
2614
2615 search = end + 1;
2616
2617 if (need_resched()) {
2618 mutex_unlock(&info->extent_ins_mutex);
2619 cond_resched();
2620 mutex_lock(&info->extent_ins_mutex);
2621 }
2622 }
2623
2624 if (nr) {
2625 ret = free_extents(trans, extent_root, &delete_list);
2626 BUG_ON(ret);
2627 }
2628
2629 if (all && skipped) {
2630 INIT_LIST_HEAD(&delete_list);
2631 search = 0;
2632 nr = 0;
2633 goto again;
2634 }
2635
2636 return err;
2637}
2638
2639/*
2640 * remove an extent from the root, returns 0 on success
2641 */
2642static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2643 struct btrfs_root *root,
2644 u64 bytenr, u64 num_bytes, u64 parent,
2645 u64 root_objectid, u64 ref_generation,
2646 u64 owner_objectid, int pin)
2647{
2648 struct btrfs_root *extent_root = root->fs_info->extent_root;
2649 int pending_ret;
2650 int ret;
2651
2652 WARN_ON(num_bytes < root->sectorsize);
2653 if (root == extent_root) {
2654 struct pending_extent_op *extent_op = NULL;
2655
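 /*
  * freeing a block that belongs to the extent tree itself would mean
  * modifying the extent tree in the middle of an extent tree update,
  * so the delete is only recorded here as a pending operation and is
  * carried out later by del_pending_extents()
  */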
2656 mutex_lock(&root->fs_info->extent_ins_mutex);
2657 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2658 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2659 u64 priv;
2660 ret = get_state_private(&root->fs_info->extent_ins,
2661 bytenr, &priv);
2662 BUG_ON(ret);
2663 extent_op = (struct pending_extent_op *)
2664 (unsigned long)priv;
2665
2666 extent_op->del = 1;
2667 if (extent_op->type == PENDING_EXTENT_INSERT) {
2668 mutex_unlock(&root->fs_info->extent_ins_mutex);
2669 return 0;
2670 }
2671 }
2672
2673 if (extent_op) {
2674 ref_generation = extent_op->orig_generation;
2675 parent = extent_op->orig_parent;
2676 }
2677
2678 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2679 BUG_ON(!extent_op);
2680
2681 extent_op->type = PENDING_EXTENT_DELETE;
2682 extent_op->bytenr = bytenr;
2683 extent_op->num_bytes = num_bytes;
2684 extent_op->parent = parent;
2685 extent_op->orig_parent = parent;
2686 extent_op->generation = ref_generation;
2687 extent_op->orig_generation = ref_generation;
2688 extent_op->level = (int)owner_objectid;
2689 INIT_LIST_HEAD(&extent_op->list);
2690 extent_op->del = 0;
2691
2692 set_extent_bits(&root->fs_info->pending_del,
2693 bytenr, bytenr + num_bytes - 1,
2694 EXTENT_WRITEBACK, GFP_NOFS);
2695 set_state_private(&root->fs_info->pending_del,
2696 bytenr, (unsigned long)extent_op);
2697 mutex_unlock(&root->fs_info->extent_ins_mutex);
2698 return 0;
2699 }
2700 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache;
2704
2705 /* btrfs_free_reserved_extent */
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0;
2712 }
2713 pin = 1;
2714 }
2715
2716 /* if data pin when any transaction has committed this */
2717 if (ref_generation != trans->transid)
2718 pin = 1;
2719
2720 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2721 root_objectid, ref_generation,
2722 owner_objectid, pin, pin == 0);
2723
2724 finish_current_insert(trans, root->fs_info->extent_root, 0);
2725 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
2726 return ret ? ret : pending_ret;
2727}
2728
2729int btrfs_free_extent(struct btrfs_trans_handle *trans,
2730 struct btrfs_root *root,
2731 u64 bytenr, u64 num_bytes, u64 parent,
2732 u64 root_objectid, u64 ref_generation,
2733 u64 owner_objectid, int pin)
2734{
2735 int ret;
2736
2737 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2738 root_objectid, ref_generation,
2739 owner_objectid, pin);
2740 return ret;
2741}
2742
2743static u64 stripe_align(struct btrfs_root *root, u64 val)
2744{
2745 u64 mask = ((u64)root->stripesize - 1);
2746 u64 ret = (val + mask) & ~mask;
2747 return ret;
2748}
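
/*
 * worked example (hypothetical values, assuming stripesize is a power
 * of two): with stripesize == 65536, mask == 0xffff, so
 *	stripe_align(root, 65536) == 65536
 *	stripe_align(root, 65537) == 131072
 * i.e. val is rounded up to the next stripe boundary
 */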
2749
2750/*
2751 * walks the btree of allocated extents and finds a hole of a given size.
2752 * The key ins is changed to record the hole:
2753 * ins->objectid == start of the hole (bytenr)
2754 * ins->flags == BTRFS_EXTENT_ITEM_KEY
2755 * ins->offset == size of the hole in bytes
2756 * Any available blocks before search_start are skipped.
2757 */
2758static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2759 struct btrfs_root *orig_root,
2760 u64 num_bytes, u64 empty_size,
2761 u64 search_start, u64 search_end,
2762 u64 hint_byte, struct btrfs_key *ins,
2763 u64 exclude_start, u64 exclude_nr,
2764 int data)
2765{
2766 int ret = 0;
2767 struct btrfs_root *root = orig_root->fs_info->extent_root;
2768 u64 total_needed = num_bytes;
2769 u64 *last_ptr = NULL;
2770 u64 last_wanted = 0;
2771 struct btrfs_block_group_cache *block_group = NULL;
2772 int chunk_alloc_done = 0;
2773 int empty_cluster = 2 * 1024 * 1024;
2774 int allowed_chunk_alloc = 0;
2775 struct list_head *head = NULL, *cur = NULL;
2776 int loop = 0;
2777 int extra_loop = 0;
2778 struct btrfs_space_info *space_info;
2779
2780 WARN_ON(num_bytes < root->sectorsize);
2781 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2782 ins->objectid = 0;
2783 ins->offset = 0;
2784
2785 if (orig_root->ref_cows || empty_size)
2786 allowed_chunk_alloc = 1;
2787
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024;
2791 }
2792
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2794 last_ptr = &root->fs_info->last_data_alloc;
2795
2796 if (last_ptr) {
2797 if (*last_ptr) {
2798 hint_byte = *last_ptr;
2799 last_wanted = *last_ptr;
2800 } else
2801 empty_size += empty_cluster;
2802 } else {
2803 empty_cluster = 0;
2804 }
2805 search_start = max(search_start, first_logical_byte(root, 0));
2806 search_start = max(search_start, hint_byte);
2807
2808 if (last_wanted && search_start != last_wanted) {
2809 last_wanted = 0;
2810 empty_size += empty_cluster;
2811 }
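 /*
  * at this point empty_size includes the empty_cluster padding whenever
  * there was no usable last-allocation hint, so small allocations still
  * end up clustered together
  */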
2812
2813 total_needed += empty_size;
2814 block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2815 if (!block_group)
2816 block_group = btrfs_lookup_first_block_group(root->fs_info,
2817 search_start);
2818 space_info = __find_space_info(root->fs_info, data);
2819
2820 down_read(&space_info->groups_sem);
2821 while (1) {
2822 struct btrfs_free_space *free_space;
2823 /*
2824 * the only way this happens is if our hint points to a block
2825 * group that's not of the proper type; while looping this
2826 * should never happen
2827 */
2828 if (empty_size)
2829 extra_loop = 1;
2830
2831 if (!block_group)
2832 goto new_group_no_lock;
2833
2834 if (unlikely(!block_group->cached)) {
2835 mutex_lock(&block_group->cache_mutex);
2836 ret = cache_block_group(root, block_group);
2837 mutex_unlock(&block_group->cache_mutex);
2838 if (ret)
2839 break;
2840 }
2841
2842 mutex_lock(&block_group->alloc_mutex);
2843 if (unlikely(!block_group_bits(block_group, data)))
2844 goto new_group;
2845
2846 if (unlikely(block_group->ro))
2847 goto new_group;
2848
2849 free_space = btrfs_find_free_space(block_group, search_start,
2850 total_needed);
2851 if (free_space) {
2852 u64 start = block_group->key.objectid;
2853 u64 end = block_group->key.objectid +
2854 block_group->key.offset;
2855
2856 search_start = stripe_align(root, free_space->offset);
2857
2858 /* move on to the next group */
2859 if (search_start + num_bytes >= search_end)
2860 goto new_group;
2861
2862 /* move on to the next group */
2863 if (search_start + num_bytes > end)
2864 goto new_group;
2865
2866 if (last_wanted && search_start != last_wanted) {
2867 total_needed += empty_cluster;
2868 empty_size += empty_cluster;
2869 last_wanted = 0;
2870 /*
2871 * if search_start is still in this block group
2872 * then we just re-search this block group
2873 */
2874 if (search_start >= start &&
2875 search_start < end) {
2876 mutex_unlock(&block_group->alloc_mutex);
2877 continue;
2878 }
2879
2880 /* else we go to the next block group */
2881 goto new_group;
2882 }
2883
2884 if (exclude_nr > 0 &&
2885 (search_start + num_bytes > exclude_start &&
2886 search_start < exclude_start + exclude_nr)) {
2887 search_start = exclude_start + exclude_nr;
2888 /*
2889 * if search_start is still in this block group
2890 * then we just re-search this block group
2891 */
2892 if (search_start >= start &&
2893 search_start < end) {
2894 mutex_unlock(&block_group->alloc_mutex);
2895 last_wanted = 0;
2896 continue;
2897 }
2898
2899 /* else we go to the next block group */
2900 goto new_group;
2901 }
2902
2903 ins->objectid = search_start;
2904 ins->offset = num_bytes;
2905
2906 btrfs_remove_free_space_lock(block_group, search_start,
2907 num_bytes);
2908 /* we are all good, let's return */
2909 mutex_unlock(&block_group->alloc_mutex);
2910 break;
2911 }
2912new_group:
2913 mutex_unlock(&block_group->alloc_mutex);
2914 put_block_group(block_group);
2915 block_group = NULL;
2916new_group_no_lock:
2917 /* don't try to compare new allocations against the
2918 * last allocation any more
2919 */
2920 last_wanted = 0;
2921
2922 /*
2923 * Here's how this works.
2924 * loop == 0: we were searching a block group via a hint
2925 * and didn't find anything, so we start at
2926 * the head of the block groups and keep searching
2927 * loop == 1: we're searching through all of the block groups
2928 * if we hit the head again we have searched
2929 * all of the block groups for this space and we
2930 * need to try and allocate; if we can't, error out.
2931 * loop == 2: we allocated more space and are looping through
2932 * all of the block groups again.
2933 */
2934 if (loop == 0) {
2935 head = &space_info->block_groups;
2936 cur = head->next;
2937 loop++;
2938 } else if (loop == 1 && cur == head) {
2939 int keep_going;
2940
2941 /* at this point we give up on the empty_size
2942 * allocations and just try to allocate the min
2943 * space.
2944 *
2945 * The extra_loop field was set if an empty_size
2946 * allocation was attempted above, and if it is
2947 * set we need to try the loop again without
2948 * the additional empty_size.
2949 */
2950 total_needed -= empty_size;
2951 empty_size = 0;
2952 keep_going = extra_loop;
2953 loop++;
2954
2955 if (allowed_chunk_alloc && !chunk_alloc_done) {
2956 up_read(&space_info->groups_sem);
2957 ret = do_chunk_alloc(trans, root, num_bytes +
2958 2 * 1024 * 1024, data, 1);
2959 down_read(&space_info->groups_sem);
2960 if (ret < 0)
2961 goto loop_check;
2962 head = &space_info->block_groups;
2963 /*
2964 * we've allocated a new chunk, keep
2965 * trying
2966 */
2967 keep_going = 1;
2968 chunk_alloc_done = 1;
2969 } else if (!allowed_chunk_alloc) {
2970 space_info->force_alloc = 1;
2971 }
2972loop_check:
2973 if (keep_going) {
2974 cur = head->next;
2975 extra_loop = 0;
2976 } else {
2977 break;
2978 }
2979 } else if (cur == head) {
2980 break;
2981 }
2982
2983 block_group = list_entry(cur, struct btrfs_block_group_cache,
2984 list);
2985 atomic_inc(&block_group->count);
2986
2987 search_start = block_group->key.objectid;
2988 cur = cur->next;
2989 }
2990
2991 /* we found what we needed */
2992 if (ins->objectid) {
2993 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2994 trans->block_group = block_group->key.objectid;
2995
2996 if (last_ptr)
2997 *last_ptr = ins->objectid + ins->offset;
2998 ret = 0;
2999 } else if (!ret) {
3000 printk(KERN_ERR "btrfs searching for %llu bytes, "
3001 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3002 (unsigned long long)total_needed,
3003 (unsigned long long)num_bytes,
3004 loop, allowed_chunk_alloc);
3005 ret = -ENOSPC;
3006 }
3007 if (block_group)
3008 put_block_group(block_group);
3009
3010 up_read(&space_info->groups_sem);
3011 return ret;
3012}
3013
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{
3016 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not ");
3023
3024 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n",
3030 (unsigned long long)cache->key.objectid,
3031 (unsigned long long)cache->key.offset,
3032 (unsigned long long)btrfs_block_group_used(&cache->item),
3033 (unsigned long long)cache->pinned,
3034 (unsigned long long)cache->reserved);
3035 btrfs_dump_free_space(cache, bytes);
3036 spin_unlock(&cache->lock);
3037 }
3038 up_read(&info->groups_sem);
3039}
3040
3041static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3042 struct btrfs_root *root,
3043 u64 num_bytes, u64 min_alloc_size,
3044 u64 empty_size, u64 hint_byte,
3045 u64 search_end, struct btrfs_key *ins,
3046 u64 data)
3047{
3048 int ret;
3049 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info;
3052
3053 if (data) {
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations
3071 */
3072 if (empty_size || root->ref_cows) {
3073 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3074 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3075 2 * 1024 * 1024,
3076 BTRFS_BLOCK_GROUP_METADATA |
3077 (info->metadata_alloc_profile &
3078 info->avail_metadata_alloc_bits), 0);
3079 }
3080 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3081 num_bytes + 2 * 1024 * 1024, data, 0);
3082 }
3083
3084 WARN_ON(num_bytes < root->sectorsize);
3085 ret = find_free_extent(trans, root, num_bytes, empty_size,
3086 search_start, search_end, hint_byte, ins,
3087 trans->alloc_exclude_start,
3088 trans->alloc_exclude_nr, data);
3089
3090 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3091 num_bytes = num_bytes >> 1;
3092 num_bytes = num_bytes & ~(root->sectorsize - 1);
3093 num_bytes = max(num_bytes, min_alloc_size);
3094 do_chunk_alloc(trans, root->fs_info->extent_root,
3095 num_bytes, data, 1);
3096 goto again;
3097 }
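 /*
  * example of the fallback above (hypothetical sizes): a 1MB request
  * with min_alloc_size == 256K and sectorsize == 4K that keeps hitting
  * -ENOSPC is retried at 512K and then 256K; a failure at 256K is not
  * retried again and instead trips the BUG() below
  */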
3098 if (ret) {
3099 struct btrfs_space_info *sinfo;
3100
3101 sinfo = __find_space_info(root->fs_info, data);
3102 printk(KERN_ERR "btrfs allocation failed flags %llu, "
3103 "wanted %llu\n", (unsigned long long)data,
3104 (unsigned long long)num_bytes);
3105 dump_space_info(sinfo, num_bytes);
3106 BUG();
3107 }
3108
3109 return ret;
3110}
3111
3112int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3113{
3114 struct btrfs_block_group_cache *cache;
3115 int ret = 0;
3116
3117 cache = btrfs_lookup_block_group(root->fs_info, start);
3118 if (!cache) {
3119 printk(KERN_ERR "Unable to find block group for %llu\n",
3120 (unsigned long long)start);
3121 return -ENOSPC;
3122 }
3123
3124 ret = btrfs_discard_extent(root, start, len);
3125
3126 btrfs_add_free_space(cache, start, len);
3127 put_block_group(cache);
3128 update_reserved_extents(root, start, len, 0);
3129
3130 return ret;
3131}
3132
3133int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3134 struct btrfs_root *root,
3135 u64 num_bytes, u64 min_alloc_size,
3136 u64 empty_size, u64 hint_byte,
3137 u64 search_end, struct btrfs_key *ins,
3138 u64 data)
3139{
3140 int ret;
3141 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3142 empty_size, hint_byte, search_end, ins,
3143 data);
3144 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3145 return ret;
3146}
3147
3148static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3149 struct btrfs_root *root, u64 parent,
3150 u64 root_objectid, u64 ref_generation,
3151 u64 owner, struct btrfs_key *ins)
3152{
3153 int ret;
3154 int pending_ret;
3155 u64 super_used;
3156 u64 root_used;
3157 u64 num_bytes = ins->offset;
3158 u32 sizes[2];
3159 struct btrfs_fs_info *info = root->fs_info;
3160 struct btrfs_root *extent_root = info->extent_root;
3161 struct btrfs_extent_item *extent_item;
3162 struct btrfs_extent_ref *ref;
3163 struct btrfs_path *path;
3164 struct btrfs_key keys[2];
3165
3166 if (parent == 0)
3167 parent = ins->objectid;
3168
3169 /* block accounting for super block */
3170 spin_lock(&info->delalloc_lock);
3171 super_used = btrfs_super_bytes_used(&info->super_copy);
3172 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3173
3174 /* block accounting for root item */
3175 root_used = btrfs_root_used(&root->root_item);
3176 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3177 spin_unlock(&info->delalloc_lock);
3178
3179 if (root == extent_root) {
3180 struct pending_extent_op *extent_op;
3181
3182 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3183 BUG_ON(!extent_op);
3184
3185 extent_op->type = PENDING_EXTENT_INSERT;
3186 extent_op->bytenr = ins->objectid;
3187 extent_op->num_bytes = ins->offset;
3188 extent_op->parent = parent;
3189 extent_op->orig_parent = 0;
3190 extent_op->generation = ref_generation;
3191 extent_op->orig_generation = 0;
3192 extent_op->level = (int)owner;
3193 INIT_LIST_HEAD(&extent_op->list);
3194 extent_op->del = 0;
3195
3196 mutex_lock(&root->fs_info->extent_ins_mutex);
3197 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3198 ins->objectid + ins->offset - 1,
3199 EXTENT_WRITEBACK, GFP_NOFS);
3200 set_state_private(&root->fs_info->extent_ins,
3201 ins->objectid, (unsigned long)extent_op);
3202 mutex_unlock(&root->fs_info->extent_ins_mutex);
3203 goto update_block;
3204 }
3205
3206 memcpy(&keys[0], ins, sizeof(*ins));
3207 keys[1].objectid = ins->objectid;
3208 keys[1].type = BTRFS_EXTENT_REF_KEY;
3209 keys[1].offset = parent;
3210 sizes[0] = sizeof(*extent_item);
3211 sizes[1] = sizeof(*ref);
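 /*
  * the extent item and its first backref are inserted as two adjacent
  * items in a single btrfs_insert_empty_items() call; this adjacency
  * is what allows __free_extent() to delete both items in one shot
  */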
3212
3213 path = btrfs_alloc_path();
3214 BUG_ON(!path);
3215
3216 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3217 sizes, 2);
3218 BUG_ON(ret);
3219
3220 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3221 struct btrfs_extent_item);
3222 btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3223 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3224 struct btrfs_extent_ref);
3225
3226 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3227 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3228 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3229 btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3230
3231 btrfs_mark_buffer_dirty(path->nodes[0]);
3232
3233 trans->alloc_exclude_start = 0;
3234 trans->alloc_exclude_nr = 0;
3235 btrfs_free_path(path);
3236 finish_current_insert(trans, extent_root, 0);
3237 pending_ret = del_pending_extents(trans, extent_root, 0);
3238
3239 if (ret)
3240 goto out;
3241 if (pending_ret) {
3242 ret = pending_ret;
3243 goto out;
3244 }
3245
3246update_block:
3247 ret = update_block_group(trans, root, ins->objectid,
3248 ins->offset, 1, 0);
3249 if (ret) {
3250 printk(KERN_ERR "btrfs update block group failed for %llu "
3251 "%llu\n", (unsigned long long)ins->objectid,
3252 (unsigned long long)ins->offset);
3253 BUG();
3254 }
3255out:
3256 return ret;
3257}
3258
3259int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, u64 parent,
3261 u64 root_objectid, u64 ref_generation,
3262 u64 owner, struct btrfs_key *ins)
3263{
3264 int ret;
3265
3266 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3267 return 0;
3268 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3269 ref_generation, owner, ins);
3270 update_reserved_extents(root, ins->objectid, ins->offset, 0);
3271 return ret;
3272}
3273
3274/*
3275 * this is used by the tree logging recovery code. It records that
3276 * an extent has been allocated and makes sure to clear the free
3277 * space cache bits as well
3278 */
3279int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *root, u64 parent,
3281 u64 root_objectid, u64 ref_generation,
3282 u64 owner, struct btrfs_key *ins)
3283{
3284 int ret;
3285 struct btrfs_block_group_cache *block_group;
3286
3287 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3288 mutex_lock(&block_group->cache_mutex);
3289 cache_block_group(root, block_group);
3290 mutex_unlock(&block_group->cache_mutex);
3291
3292 ret = btrfs_remove_free_space(block_group, ins->objectid,
3293 ins->offset);
3294 BUG_ON(ret);
3295 put_block_group(block_group);
3296 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3297 ref_generation, owner, ins);
3298 return ret;
3299}
3300
3301/*
3302 * finds a free extent and does all the dirty work required for allocation
3303 * returns the key for the extent through ins.
3305 *
3306 * returns 0 if everything worked, non-zero otherwise.
3307 */
3308int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root,
3310 u64 num_bytes, u64 parent, u64 min_alloc_size,
3311 u64 root_objectid, u64 ref_generation,
3312 u64 owner_objectid, u64 empty_size, u64 hint_byte,
3313 u64 search_end, struct btrfs_key *ins, u64 data)
3314{
3315 int ret;
3316
3317 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3318 min_alloc_size, empty_size, hint_byte,
3319 search_end, ins, data);
3320 BUG_ON(ret);
3321 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3322 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3323 root_objectid, ref_generation,
3324 owner_objectid, ins);
3325 BUG_ON(ret);
3326
3327 } else {
3328 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3329 }
3330 return ret;
3331}
3332
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize)
3336{
3337 struct extent_buffer *buf;
3338
3339 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3340 if (!buf)
3341 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid);
3343 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf);
3345 btrfs_set_buffer_uptodate(buf);
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS);
3349 } else {
3350 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3351 buf->start + buf->len - 1, GFP_NOFS);
3352 }
3353 trans->blocks_used++;
3354 return buf;
3355}
3356
3357/*
3358 * helper function to allocate a block for a given tree
3359 * returns the tree buffer or NULL.
3360 */
3361struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3362 struct btrfs_root *root,
3363 u32 blocksize, u64 parent,
3364 u64 root_objectid,
3365 u64 ref_generation,
3366 int level,
3367 u64 hint,
3368 u64 empty_size)
3369{
3370 struct btrfs_key ins;
3371 int ret;
3372 struct extent_buffer *buf;
3373
3374 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3375 root_objectid, ref_generation, level,
3376 empty_size, hint, (u64)-1, &ins, 0);
3377 if (ret) {
3378 BUG_ON(ret > 0);
3379 return ERR_PTR(ret);
3380 }
3381
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3383 return buf;
3384}
3385
3386int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3387 struct btrfs_root *root, struct extent_buffer *leaf)
3388{
3389 u64 leaf_owner;
3390 u64 leaf_generation;
3391 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi;
3393 int i;
3394 int nritems;
3395 int ret;
3396
3397 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf);
3401
3402 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr;
3404 cond_resched();
3405
3406 btrfs_item_key_to_cpu(leaf, &key, i);
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue;
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3410 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE)
3412 continue;
3413 /*
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3418 if (disk_bytenr == 0)
3419 continue;
3420
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation,
3424 key.objectid, 0);
3425 BUG_ON(ret);
3426
3427 atomic_inc(&root->fs_info->throttle_gen);
3428 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched();
3430 }
3431 return 0;
3432}
3433
3434static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3435 struct btrfs_root *root,
3436 struct btrfs_leaf_ref *ref)
3437{
3438 int i;
3439 int ret;
3440 struct btrfs_extent_info *info = ref->extents;
3441
3442 for (i = 0; i < ref->nritems; i++) {
3443 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation,
3446 info->objectid, 0);
3447
3448 atomic_inc(&root->fs_info->throttle_gen);
3449 wake_up(&root->fs_info->transaction_throttle);
3450 cond_resched();
3451
3452 BUG_ON(ret);
3453 info++;
3454 }
3455
3456 return 0;
3457}
3458
3459static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3460 u64 len, u32 *refs)
3461{
3462 int ret;
3463
3464 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3465 BUG_ON(ret);
3466
3467#if 0 /* some debugging code in case we see problems here */
3468 /* if the refs count is one, it won't get increased again. But
3469 * if the ref count is > 1, someone may be decreasing it at
3470 * the same time we are.
3471 */
3472 if (*refs != 1) {
3473 struct extent_buffer *eb = NULL;
3474 eb = btrfs_find_create_tree_block(root, start, len);
3475 if (eb)
3476 btrfs_tree_lock(eb);
3477
3478 mutex_lock(&root->fs_info->alloc_mutex);
3479 ret = lookup_extent_ref(NULL, root, start, len, refs);
3480 BUG_ON(ret);
3481 mutex_unlock(&root->fs_info->alloc_mutex);
3482
3483 if (eb) {
3484 btrfs_tree_unlock(eb);
3485 free_extent_buffer(eb);
3486 }
3487 if (*refs == 1) {
3488 printk(KERN_ERR "btrfs block %llu went down to one "
3489 "during drop_snap\n", (unsigned long long)start);
3490 }
3491
3492 }
3493#endif
3494
3495 cond_resched();
3496 return ret;
3497}
3498
3499/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes.
3502 */
3503static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3504 struct btrfs_root *root,
3505 struct btrfs_path *path, int *level)
3506{
3507 u64 root_owner;
3508 u64 root_gen;
3509 u64 bytenr;
3510 u64 ptr_gen;
3511 struct extent_buffer *next;
3512 struct extent_buffer *cur;
3513 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize;
3516 int ret;
3517 u32 refs;
3518
3519 WARN_ON(*level < 0);
3520 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3521 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
3522 path->nodes[*level]->len, &refs);
3523 BUG_ON(ret);
3524 if (refs > 1)
3525 goto out;
3526
3527 /*
3528 * walk down to the last node level and free all the leaves
3529 */
3530 while (*level >= 0) {
3531 WARN_ON(*level < 0);
3532 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3533 cur = path->nodes[*level];
3534
3535 if (btrfs_header_level(cur) != *level)
3536 WARN_ON(1);
3537
3538 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur))
3540 break;
3541 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret);
3544 break;
3545 }
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1);
3549
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret);
3552 if (refs != 1) {
3553 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent);
3555 root_gen = btrfs_header_generation(parent);
3556 path->slots[*level]++;
3557
3558 ret = __btrfs_free_extent(trans, root, bytenr,
3559 blocksize, parent->start,
3560 root_owner, root_gen,
3561 *level - 1, 1);
3562 BUG_ON(ret);
3563
3564 atomic_inc(&root->fs_info->throttle_gen);
3565 wake_up(&root->fs_info->transaction_throttle);
3566 cond_resched();
3567
3568 continue;
3569 }
3570 /*
3571 * at this point, we have a single ref, and since the
3572 * only place referencing this extent is a dead root
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */
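 /*
  * level 1 nodes get a shortcut: if the leaf's extent pointers
  * were cached in a btrfs_leaf_ref, the file extents can be
  * dropped via cache_drop_leaf_ref() without reading the leaf
  * block itself
  */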
3576 if (*level == 1) {
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away;
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]);
3613 path->nodes[*level-1] = next;
3614 *level = btrfs_header_level(next);
3615 path->slots[*level] = 0;
3616 cond_resched();
3617 }
3618out:
3619 WARN_ON(*level < 0);
3620 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3621
3622 if (path->nodes[*level] == root->node) {
3623 parent = path->nodes[*level];
3624 bytenr = path->nodes[*level]->start;
3625 } else {
3626 parent = path->nodes[*level + 1];
3627 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3628 }
3629
3630 blocksize = btrfs_level_size(root, *level);
3631 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent);
3633
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen,
3636 *level, 1);
3637 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL;
3639 *level += 1;
3640 BUG_ON(ret);
3641
3642 cond_resched();
3643 return 0;
3644}
3645
3646/*
3647 * helper function for drop_subtree, this function is similar to
3648 * walk_down_tree. The main difference is that it checks reference
3649 * counts while tree blocks are locked.
3650 */
3651static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3652 struct btrfs_root *root,
3653 struct btrfs_path *path, int *level)
3654{
3655 struct extent_buffer *next;
3656 struct extent_buffer *cur;
3657 struct extent_buffer *parent;
3658 u64 bytenr;
3659 u64 ptr_gen;
3660 u32 blocksize;
3661 u32 refs;
3662 int ret;
3663
3664 cur = path->nodes[*level];
3665 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3666 &refs);
3667 BUG_ON(ret);
3668 if (refs > 1)
3669 goto out;
3670
3671 while (*level >= 0) {
3672 cur = path->nodes[*level];
3673 if (*level == 0) {
3674 ret = btrfs_drop_leaf_ref(trans, root, cur);
3675 BUG_ON(ret);
3676 clean_tree_block(trans, root, cur);
3677 break;
3678 }
3679 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3680 clean_tree_block(trans, root, cur);
3681 break;
3682 }
3683
3684 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3685 blocksize = btrfs_level_size(root, *level - 1);
3686 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3687
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next);
3690
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs);
3693 BUG_ON(ret);
3694 if (refs > 1) {
3695 parent = path->nodes[*level];
3696 ret = btrfs_free_extent(trans, root, bytenr,
3697 blocksize, parent->start,
3698 btrfs_header_owner(parent),
3699 btrfs_header_generation(parent),
3700 *level - 1, 1);
3701 BUG_ON(ret);
3702 path->slots[*level]++;
3703 btrfs_tree_unlock(next);
3704 free_extent_buffer(next);
3705 continue;
3706 }
3707
3708 *level = btrfs_header_level(next);
3709 path->nodes[*level] = next;
3710 path->slots[*level] = 0;
3711 path->locks[*level] = 1;
3712 cond_resched();
3713 }
3714out:
3715 parent = path->nodes[*level + 1];
3716 bytenr = path->nodes[*level]->start;
3717 blocksize = path->nodes[*level]->len;
3718
3719 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3720 parent->start, btrfs_header_owner(parent),
3721 btrfs_header_generation(parent), *level, 1);
3722 BUG_ON(ret);
3723
3724 if (path->locks[*level]) {
3725 btrfs_tree_unlock(path->nodes[*level]);
3726 path->locks[*level] = 0;
3727 }
3728 free_extent_buffer(path->nodes[*level]);
3729 path->nodes[*level] = NULL;
3730 *level += 1;
3731 cond_resched();
3732 return 0;
3733}
3734
3735/*
3736 * helper for dropping snapshots. This walks back up the tree in the path
3737 * to find the first node higher up where we haven't yet gone through
3738 * all the slots
3739 */
3740static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3741 struct btrfs_root *root,
3742 struct btrfs_path *path,
3743 int *level, int max_level)
3744{
3745 u64 root_owner;
3746 u64 root_gen;
3747 struct btrfs_root_item *root_item = &root->root_item;
3748 int i;
3749 int slot;
3750 int ret;
3751
3752 for (i = *level; i < max_level && path->nodes[i]; i++) {
3753 slot = path->slots[i];
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key;
3757 node = path->nodes[i];
3758 path->slots[i]++;
3759 *level = i;
3760 WARN_ON(*level == 0);
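 /*
  * record the key we are about to descend into, so that a
  * later btrfs_drop_snapshot() call can resume the walk from
  * drop_progress/drop_level
  */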
3761 btrfs_node_key(node, &disk_key, path->slots[i]);
3762 memcpy(&root_item->drop_progress,
3763 &disk_key, sizeof(disk_key));
3764 root_item->drop_level = i;
3765 return 0;
3766 } else {
3767 struct extent_buffer *parent;
3768 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level];
3770 else
3771 parent = path->nodes[*level + 1];
3772
3773 root_owner = btrfs_header_owner(parent);
3774 root_gen = btrfs_header_generation(parent);
3775
3776 clean_tree_block(trans, root, path->nodes[*level]);
3777 ret = btrfs_free_extent(trans, root,
3778 path->nodes[*level]->start,
3779 path->nodes[*level]->len,
3780 parent->start, root_owner,
3781 root_gen, *level, 1);
3782 BUG_ON(ret);
3783 if (path->locks[*level]) {
3784 btrfs_tree_unlock(path->nodes[*level]);
3785 path->locks[*level] = 0;
3786 }
3787 free_extent_buffer(path->nodes[*level]);
3788 path->nodes[*level] = NULL;
3789 *level = i + 1;
3790 }
3791 }
3792 return 1;
3793}
3794
3795/*
3796 * drop the reference count on the tree rooted at 'root'. This traverses
3797 * the tree freeing any blocks that have a ref count of zero after being
3798 * decremented.
3799 */
3800int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3801 *root)
3802{
3803 int ret = 0;
3804 int wret;
3805 int level;
3806 struct btrfs_path *path;
3807 int i;
3808 int orig_level;
3809 struct btrfs_root_item *root_item = &root->root_item;
3810
3811 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3812 path = btrfs_alloc_path();
3813 BUG_ON(!path);
3814
3815 level = btrfs_header_level(root->node);
3816 orig_level = level;
3817 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3818 path->nodes[level] = root->node;
3819 extent_buffer_get(root->node);
3820 path->slots[level] = 0;
3821 } else {
3822 struct btrfs_key key;
3823 struct btrfs_disk_key found_key;
3824 struct extent_buffer *node;
3825
3826 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3827 level = root_item->drop_level;
3828 path->lowest_level = level;
3829 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3830 if (wret < 0) {
3831 ret = wret;
3832 goto out;
3833 }
3834 node = path->nodes[level];
3835 btrfs_node_key(node, &found_key, path->slots[level]);
3836 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3837 sizeof(found_key)));
3838 /*
3839 * unlock our path, this is safe because only this
3840 * function is allowed to delete this snapshot
3841 */
3842 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3843 if (path->nodes[i] && path->locks[i]) {
3844 path->locks[i] = 0;
3845 btrfs_tree_unlock(path->nodes[i]);
3846 }
3847 }
3848 }
3849 while (1) {
3850 wret = walk_down_tree(trans, root, path, &level);
3851 if (wret > 0)
3852 break;
3853 if (wret < 0)
3854 ret = wret;
3855
3856 wret = walk_up_tree(trans, root, path, &level,
3857 BTRFS_MAX_LEVEL);
3858 if (wret > 0)
3859 break;
3860 if (wret < 0)
3861 ret = wret;
3862 if (trans->transaction->in_commit) {
3863 ret = -EAGAIN;
3864 break;
3865 }
3866 atomic_inc(&root->fs_info->throttle_gen);
3867 wake_up(&root->fs_info->transaction_throttle);
3868 }
3869 for (i = 0; i <= orig_level; i++) {
3870 if (path->nodes[i]) {
3871 free_extent_buffer(path->nodes[i]);
3872 path->nodes[i] = NULL;
3873 }
3874 }
3875out:
3876 btrfs_free_path(path);
3877 return ret;
3878}
3879
3880int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3881 struct btrfs_root *root,
3882 struct extent_buffer *node,
3883 struct extent_buffer *parent)
3884{
3885 struct btrfs_path *path;
3886 int level;
3887 int parent_level;
3888 int ret = 0;
3889 int wret;
3890
3891 path = btrfs_alloc_path();
3892 BUG_ON(!path);
3893
3894 BUG_ON(!btrfs_tree_locked(parent));
3895 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent);
3899
3900 BUG_ON(!btrfs_tree_locked(node));
3901 level = btrfs_header_level(node);
3902 extent_buffer_get(node);
3903 path->nodes[level] = node;
3904 path->slots[level] = 0;
3905
3906 while (1) {
3907 wret = walk_down_subtree(trans, root, path, &level);
3908 if (wret < 0)
3909 ret = wret;
3910 if (wret != 0)
3911 break;
3912
3913 wret = walk_up_tree(trans, root, path, &level, parent_level);
3914 if (wret < 0)
3915 ret = wret;
3916 if (wret != 0)
3917 break;
3918 }
3919
3920 btrfs_free_path(path);
3921 return ret;
3922}
3923
3924static unsigned long calc_ra(unsigned long start, unsigned long last,
3925 unsigned long nr)
3926{
3927 return min(last, start + nr - 1);
3928}
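
/*
 * e.g. (hypothetical values) start == 100, nr == 32, last == 120:
 * min(120, 100 + 32 - 1) == 120, so the readahead window is clipped
 * to the last page index we actually care about
 */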
3929
3930static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3931 u64 len)
3932{
3933 u64 page_start;
3934 u64 page_end;
3935 unsigned long first_index;
3936 unsigned long last_index;
3937 unsigned long i;
3938 struct page *page;
3939 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3940 struct file_ra_state *ra;
3941 struct btrfs_ordered_extent *ordered;
3942 unsigned int total_read = 0;
3943 unsigned int total_dirty = 0;
3944 int ret = 0;
3945
3946 ra = kzalloc(sizeof(*ra), GFP_NOFS);
 if (!ra)
 	return -ENOMEM;
3947
3948 mutex_lock(&inode->i_mutex);
3949 first_index = start >> PAGE_CACHE_SHIFT;
3950 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3951
3952 /* make sure the dirty trick played by the caller works */
3953 ret = invalidate_inode_pages2_range(inode->i_mapping,
3954 first_index, last_index);
3955 if (ret)
3956 goto out_unlock;
3957
3958 file_ra_state_init(ra, inode->i_mapping);
3959
3960 for (i = first_index ; i <= last_index; i++) {
3961 if (total_read % ra->ra_pages == 0) {
3962 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3963 calc_ra(i, last_index, ra->ra_pages));
3964 }
3965 total_read++;
3966again:
3967 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3968 BUG_ON(1);
3969 page = grab_cache_page(inode->i_mapping, i);
3970 if (!page) {
3971 ret = -ENOMEM;
3972 goto out_unlock;
3973 }
3974 if (!PageUptodate(page)) {
3975 btrfs_readpage(NULL, page);
3976 lock_page(page);
3977 if (!PageUptodate(page)) {
3978 unlock_page(page);
3979 page_cache_release(page);
3980 ret = -EIO;
3981 goto out_unlock;
3982 }
3983 }
3984 wait_on_page_writeback(page);
3985
3986 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3987 page_end = page_start + PAGE_CACHE_SIZE - 1;
3988 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3989
3990 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3991 if (ordered) {
3992 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3993 unlock_page(page);
3994 page_cache_release(page);
3995 btrfs_start_ordered_extent(inode, ordered, 1);
3996 btrfs_put_ordered_extent(ordered);
3997 goto again;
3998 }
3999 set_page_extent_mapped(page);
4000
4001 if (i == first_index)
4002 set_extent_bits(io_tree, page_start, page_end,
4003 EXTENT_BOUNDARY, GFP_NOFS);
4004 btrfs_set_extent_delalloc(inode, page_start, page_end);
4005
4006 set_page_dirty(page);
4007 total_dirty++;
4008
4009 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4010 unlock_page(page);
4011 page_cache_release(page);
4012 }
4013
4014out_unlock:
4015 kfree(ra);
4016 mutex_unlock(&inode->i_mutex);
4017 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4018 return ret;
4019}
4020
4021static noinline int relocate_data_extent(struct inode *reloc_inode,
4022 struct btrfs_key *extent_key,
4023 u64 offset)
4024{
4025 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4026 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4027 struct extent_map *em;
4028 u64 start = extent_key->objectid - offset;
4029 u64 end = start + extent_key->offset - 1;
4030
4031 em = alloc_extent_map(GFP_NOFS);
4032 BUG_ON(!em || IS_ERR(em));
4033
4034 em->start = start;
4035 em->len = extent_key->offset;
4036 em->block_len = extent_key->offset;
4037 em->block_start = extent_key->objectid;
4038 em->bdev = root->fs_info->fs_devices->latest_bdev;
4039 set_bit(EXTENT_FLAG_PINNED, &em->flags);
4040
4041 /* setup extent map to cheat btrfs_readpage */
4042 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
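 /*
  * loop until the mapping sticks: add_extent_mapping() returns
  * -EEXIST while a cached extent overlaps the range, so the
  * overlap is dropped and the insert retried
  */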
4043 while (1) {
4044 int ret;
4045 spin_lock(&em_tree->lock);
4046 ret = add_extent_mapping(em_tree, em);
4047 spin_unlock(&em_tree->lock);
4048 if (ret != -EEXIST) {
4049 free_extent_map(em);
4050 break;
4051 }
4052 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4053 }
4054 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4055
4056 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4057}
4058
4059struct btrfs_ref_path {
4060 u64 extent_start;
4061 u64 nodes[BTRFS_MAX_LEVEL];
4062 u64 root_objectid;
4063 u64 root_generation;
4064 u64 owner_objectid;
4065 u32 num_refs;
4066 int lowest_level;
4067 int current_level;
4068 int shared_level;
4069
4070 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4071 u64 new_nodes[BTRFS_MAX_LEVEL];
4072};
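
/*
 * a rough field map, as used by __next_ref_path() below: extent_start
 * is the bytenr of the extent being traced, nodes[n] holds the bytenr
 * of the referencing tree block at level n on the current path toward
 * a root, and lowest_level/current_level/shared_level track the state
 * of the walk; node_keys[] and new_nodes[] are presumably filled in by
 * the relocation code once a complete path has been found
 */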
4073
4074struct disk_extent {
4075 u64 ram_bytes;
4076 u64 disk_bytenr;
4077 u64 disk_num_bytes;
4078 u64 offset;
4079 u64 num_bytes;
4080 u8 compression;
4081 u8 encryption;
4082 u16 other_encoding;
4083};
4084
4085static int is_cowonly_root(u64 root_objectid)
4086{
4087 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4088 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4089 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4090 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4091 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4092 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4093 return 1;
4094 return 0;
4095}
4096
4097static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4098 struct btrfs_root *extent_root,
4099 struct btrfs_ref_path *ref_path,
4100 int first_time)
4101{
4102 struct extent_buffer *leaf;
4103 struct btrfs_path *path;
4104 struct btrfs_extent_ref *ref;
4105 struct btrfs_key key;
4106 struct btrfs_key found_key;
4107 u64 bytenr;
4108 u32 nritems;
4109 int level;
4110 int ret = 1;
4111
4112 path = btrfs_alloc_path();
4113 if (!path)
4114 return -ENOMEM;
4115
4116 if (first_time) {
4117 ref_path->lowest_level = -1;
4118 ref_path->current_level = -1;
4119 ref_path->shared_level = -1;
4120 goto walk_up;
4121 }
4122walk_down:
4123 level = ref_path->current_level - 1;
4124 while (level >= -1) {
4125 u64 parent;
4126 if (level < ref_path->lowest_level)
4127 break;
4128
4129 if (level >= 0)
4130 bytenr = ref_path->nodes[level];
4131 else
4132 bytenr = ref_path->extent_start;
4133 BUG_ON(bytenr == 0);
4134
4135 parent = ref_path->nodes[level + 1];
4136 ref_path->nodes[level + 1] = 0;
4137 ref_path->current_level = level;
4138 BUG_ON(parent == 0);
4139
4140 key.objectid = bytenr;
4141 key.offset = parent + 1;
4142 key.type = BTRFS_EXTENT_REF_KEY;
4143
4144 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4145 if (ret < 0)
4146 goto out;
4147 BUG_ON(ret == 0);
4148
4149 leaf = path->nodes[0];
4150 nritems = btrfs_header_nritems(leaf);
4151 if (path->slots[0] >= nritems) {
4152 ret = btrfs_next_leaf(extent_root, path);
4153 if (ret < 0)
4154 goto out;
4155 if (ret > 0)
4156 goto next;
4157 leaf = path->nodes[0];
4158 }
4159
4160 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4161 if (found_key.objectid == bytenr &&
4162 found_key.type == BTRFS_EXTENT_REF_KEY) {
4163 if (level < ref_path->shared_level)
4164 ref_path->shared_level = level;
4165 goto found;
4166 }
4167next:
4168 level--;
4169 btrfs_release_path(extent_root, path);
4170 cond_resched();
4171 }
4172 /* reached lowest level */
4173 ret = 1;
4174 goto out;
4175walk_up:
4176 level = ref_path->current_level;
4177 while (level < BTRFS_MAX_LEVEL - 1) {
4178 u64 ref_objectid;
4179
4180 if (level >= 0)
4181 bytenr = ref_path->nodes[level];
4182 else
4183 bytenr = ref_path->extent_start;
4184
4185 BUG_ON(bytenr == 0);
4186
4187 key.objectid = bytenr;
4188 key.offset = 0;
4189 key.type = BTRFS_EXTENT_REF_KEY;
4190
4191 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4192 if (ret < 0)
4193 goto out;
4194
4195 leaf = path->nodes[0];
4196 nritems = btrfs_header_nritems(leaf);
4197 if (path->slots[0] >= nritems) {
4198 ret = btrfs_next_leaf(extent_root, path);
4199 if (ret < 0)
4200 goto out;
4201 if (ret > 0) {
4202 /* the extent was freed by someone */
4203 if (ref_path->lowest_level == level)
4204 goto out;
4205 btrfs_release_path(extent_root, path);
4206 goto walk_down;
4207 }
4208 leaf = path->nodes[0];
4209 }
4210
4211 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4212 if (found_key.objectid != bytenr ||
4213 found_key.type != BTRFS_EXTENT_REF_KEY) {
4214 /* the extent was freed by someone */
4215 if (ref_path->lowest_level == level) {
4216 ret = 1;
4217 goto out;
4218 }
4219 btrfs_release_path(extent_root, path);
4220 goto walk_down;
4221 }
4222found:
4223 ref = btrfs_item_ptr(leaf, path->slots[0],
4224 struct btrfs_extent_ref);
4225 ref_objectid = btrfs_ref_objectid(leaf, ref);
4226 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4227 if (first_time) {
4228 level = (int)ref_objectid;
4229 BUG_ON(level >= BTRFS_MAX_LEVEL);
4230 ref_path->lowest_level = level;
4231 ref_path->current_level = level;
4232 ref_path->nodes[level] = bytenr;
4233 } else {
4234 WARN_ON(ref_objectid != level);
4235 }
4236 } else {
4237 WARN_ON(level != -1);
4238 }
4239 first_time = 0;
4240
4241 if (ref_path->lowest_level == level) {
4242 ref_path->owner_objectid = ref_objectid;
4243 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4244 }
4245
4246 /*
4247 * the block is a tree root or the block isn't in a
4248 * reference counted tree.
4249 */
4250 if (found_key.objectid == found_key.offset ||
4251 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4252 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4253 ref_path->root_generation =
4254 btrfs_ref_generation(leaf, ref);
4255 if (level < 0) {
4256 /* special reference from the tree log */
4257 ref_path->nodes[0] = found_key.offset;
4258 ref_path->current_level = 0;
4259 }
4260 ret = 0;
4261 goto out;
4262 }
4263
4264 level++;
4265 BUG_ON(ref_path->nodes[level] != 0);
4266 ref_path->nodes[level] = found_key.offset;
4267 ref_path->current_level = level;
4268
4269 /*
4270 * the reference was created in the running transaction,
4271 * no need to continue walking up.
4272 */
4273 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4274 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4275 ref_path->root_generation =
4276 btrfs_ref_generation(leaf, ref);
4277 ret = 0;
4278 goto out;
4279 }
4280
4281 btrfs_release_path(extent_root, path);
4282 cond_resched();
4283 }
4284 /* reached max tree level, but no tree root found. */
4285 BUG();
4286out:
4287 btrfs_free_path(path);
4288 return ret;
4289}
4290
4291static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4292 struct btrfs_root *extent_root,
4293 struct btrfs_ref_path *ref_path,
4294 u64 extent_start)
4295{
4296 memset(ref_path, 0, sizeof(*ref_path));
4297 ref_path->extent_start = extent_start;
4298
4299 return __next_ref_path(trans, extent_root, ref_path, 1);
4300}
4301
4302static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4303 struct btrfs_root *extent_root,
4304 struct btrfs_ref_path *ref_path)
4305{
4306 return __next_ref_path(trans, extent_root, ref_path, 0);
4307}
4308
4309static noinline int get_new_locations(struct inode *reloc_inode,
4310 struct btrfs_key *extent_key,
4311 u64 offset, int no_fragment,
4312 struct disk_extent **extents,
4313 int *nr_extents)
4314{
4315 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4316 struct btrfs_path *path;
4317 struct btrfs_file_extent_item *fi;
4318 struct extent_buffer *leaf;
4319 struct disk_extent *exts = *extents;
4320 struct btrfs_key found_key;
4321 u64 cur_pos;
4322 u64 last_byte;
4323 u32 nritems;
4324 int nr = 0;
4325 int max = *nr_extents;
4326 int ret;
4327
4328 WARN_ON(!no_fragment && *extents);
4329 if (!exts) {
4330 max = 1;
4331 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4332 if (!exts)
4333 return -ENOMEM;
4334 }
4335
4336 path = btrfs_alloc_path();
4337 BUG_ON(!path);
4338
4339 cur_pos = extent_key->objectid - offset;
4340 last_byte = extent_key->objectid + extent_key->offset;
4341 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4342 cur_pos, 0);
4343 if (ret < 0)
4344 goto out;
4345 if (ret > 0) {
4346 ret = -ENOENT;
4347 goto out;
4348 }
4349
4350 while (1) {
4351 leaf = path->nodes[0];
4352 nritems = btrfs_header_nritems(leaf);
4353 if (path->slots[0] >= nritems) {
4354 ret = btrfs_next_leaf(root, path);
4355 if (ret < 0)
4356 goto out;
4357 if (ret > 0)
4358 break;
4359 leaf = path->nodes[0];
4360 }
4361
4362 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4363 if (found_key.offset != cur_pos ||
4364 found_key.type != BTRFS_EXTENT_DATA_KEY ||
4365 found_key.objectid != reloc_inode->i_ino)
4366 break;
4367
4368 fi = btrfs_item_ptr(leaf, path->slots[0],
4369 struct btrfs_file_extent_item);
4370 if (btrfs_file_extent_type(leaf, fi) !=
4371 BTRFS_FILE_EXTENT_REG ||
4372 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4373 break;
4374
4375 if (nr == max) {
4376 struct disk_extent *old = exts;
4377 max *= 2;
4378 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
 if (!exts) {
 	/* restore old so the out: path can free it */
 	exts = old;
 	ret = -ENOMEM;
 	goto out;
 }
4379 memcpy(exts, old, sizeof(*exts) * nr);
4380 if (old != *extents)
4381 kfree(old);
4382 }
4383
4384 exts[nr].disk_bytenr =
4385 btrfs_file_extent_disk_bytenr(leaf, fi);
4386 exts[nr].disk_num_bytes =
4387 btrfs_file_extent_disk_num_bytes(leaf, fi);
4388 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4389 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4390 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4391 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4392 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4393 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4394 fi);
4395 BUG_ON(exts[nr].offset > 0);
4396 BUG_ON(exts[nr].compression || exts[nr].encryption);
4397 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4398
4399 cur_pos += exts[nr].num_bytes;
4400 nr++;
4401
4402 if (cur_pos + offset >= last_byte)
4403 break;
4404
4405 if (no_fragment) {
4406 ret = 1;
4407 goto out;
4408 }
4409 path->slots[0]++;
4410 }
4411
4412 BUG_ON(cur_pos + offset > last_byte);
4413 if (cur_pos + offset < last_byte) {
4414 ret = -ENOENT;
4415 goto out;
4416 }
4417 ret = 0;
4418out:
4419 btrfs_free_path(path);
4420 if (ret) {
4421 if (exts != *extents)
4422 kfree(exts);
4423 } else {
4424 *extents = exts;
4425 *nr_extents = nr;
4426 }
4427 return ret;
4428}
4429
4430static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4431 struct btrfs_root *root,
4432 struct btrfs_path *path,
4433 struct btrfs_key *extent_key,
4434 struct btrfs_key *leaf_key,
4435 struct btrfs_ref_path *ref_path,
4436 struct disk_extent *new_extents,
4437 int nr_extents)
4438{
4439 struct extent_buffer *leaf;
4440 struct btrfs_file_extent_item *fi;
4441 struct inode *inode = NULL;
4442 struct btrfs_key key;
4443 u64 lock_start = 0;
4444 u64 lock_end = 0;
4445 u64 num_bytes;
4446 u64 ext_offset;
4447 u64 first_pos;
4448 u32 nritems;
4449 int nr_scaned = 0;
4450 int extent_locked = 0;
4451 int extent_type;
4452 int ret;
4453
4454 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid &&
4459 key.type < BTRFS_EXTENT_DATA_KEY)) {
4460 key.objectid = ref_path->owner_objectid;
4461 key.type = BTRFS_EXTENT_DATA_KEY;
4462 key.offset = 0;
4463 }
4464 }
4465
4466 while (1) {
4467 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4468 if (ret < 0)
4469 goto out;
4470
4471 leaf = path->nodes[0];
4472 nritems = btrfs_header_nritems(leaf);
4473next:
4474 if (extent_locked && ret > 0) {
4475 /*
4476 * the file extent item was modified by someone
4477 * before the extent got locked.
4478 */
4479 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4480 lock_end, GFP_NOFS);
4481 extent_locked = 0;
4482 }
4483
4484 if (path->slots[0] >= nritems) {
4485 if (++nr_scaned > 2)
4486 break;
4487
4488 BUG_ON(extent_locked);
4489 ret = btrfs_next_leaf(root, path);
4490 if (ret < 0)
4491 goto out;
4492 if (ret > 0)
4493 break;
4494 leaf = path->nodes[0];
4495 nritems = btrfs_header_nritems(leaf);
4496 }
4497
4498 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4499
4500 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4501 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset))
4505 break;
4506 }
4507
4508 if (inode && key.objectid != inode->i_ino) {
4509 BUG_ON(extent_locked);
4510 btrfs_release_path(root, path);
4511 mutex_unlock(&inode->i_mutex);
4512 iput(inode);
4513 inode = NULL;
4514 continue;
4515 }
4516
4517 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4518 path->slots[0]++;
4519 ret = 1;
4520 goto next;
4521 }
4522 fi = btrfs_item_ptr(leaf, path->slots[0],
4523 struct btrfs_file_extent_item);
4524 extent_type = btrfs_file_extent_type(leaf, fi);
4525 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4526 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4527 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4528 extent_key->objectid)) {
4529 path->slots[0]++;
4530 ret = 1;
4531 goto next;
4532 }
4533
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536
4537 if (first_pos > key.offset - ext_offset)
4538 first_pos = key.offset - ext_offset;
4539
4540 if (!extent_locked) {
4541 lock_start = key.offset;
4542 lock_end = lock_start + num_bytes - 1;
4543 } else {
4544 if (lock_start > key.offset ||
4545 lock_end + 1 < key.offset + num_bytes) {
4546 unlock_extent(&BTRFS_I(inode)->io_tree,
4547 lock_start, lock_end, GFP_NOFS);
4548 extent_locked = 0;
4549 }
4550 }
4551
4552 if (!inode) {
4553 btrfs_release_path(root, path);
4554
4555 inode = btrfs_iget_locked(root->fs_info->sb,
4556 key.objectid, root);
4557 if (inode->i_state & I_NEW) {
4558 BTRFS_I(inode)->root = root;
4559 BTRFS_I(inode)->location.objectid =
4560 key.objectid;
4561 BTRFS_I(inode)->location.type =
4562 BTRFS_INODE_ITEM_KEY;
4563 BTRFS_I(inode)->location.offset = 0;
4564 btrfs_read_locked_inode(inode);
4565 unlock_new_inode(inode);
4566 }
4567 /*
4568			 * some code calls btrfs_commit_transaction while
4569 * holding the i_mutex, so we can't use mutex_lock
4570 * here.
4571 */
4572 if (is_bad_inode(inode) ||
4573 !mutex_trylock(&inode->i_mutex)) {
4574 iput(inode);
4575 inode = NULL;
4576 key.offset = (u64)-1;
4577 goto skip;
4578 }
4579 }
4580
4581 if (!extent_locked) {
4582 struct btrfs_ordered_extent *ordered;
4583
4584 btrfs_release_path(root, path);
4585
4586 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4587 lock_end, GFP_NOFS);
4588 ordered = btrfs_lookup_first_ordered_extent(inode,
4589 lock_end);
4590 if (ordered &&
4591 ordered->file_offset <= lock_end &&
4592 ordered->file_offset + ordered->len > lock_start) {
4593 unlock_extent(&BTRFS_I(inode)->io_tree,
4594 lock_start, lock_end, GFP_NOFS);
4595 btrfs_start_ordered_extent(inode, ordered, 1);
4596 btrfs_put_ordered_extent(ordered);
4597 key.offset += num_bytes;
4598 goto skip;
4599 }
4600 if (ordered)
4601 btrfs_put_ordered_extent(ordered);
4602
4603 extent_locked = 1;
4604 continue;
4605 }
4606
4607 if (nr_extents == 1) {
4608 /* update extent pointer in place */
4609 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4610 new_extents[0].disk_bytenr);
4611 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4612 new_extents[0].disk_num_bytes);
4613 btrfs_mark_buffer_dirty(leaf);
4614
4615 btrfs_drop_extent_cache(inode, key.offset,
4616 key.offset + num_bytes - 1, 0);
4617
4618 ret = btrfs_inc_extent_ref(trans, root,
4619 new_extents[0].disk_bytenr,
4620 new_extents[0].disk_num_bytes,
4621 leaf->start,
4622 root->root_key.objectid,
4623 trans->transid,
4624 key.objectid);
4625 BUG_ON(ret);
4626
4627 ret = btrfs_free_extent(trans, root,
4628 extent_key->objectid,
4629 extent_key->offset,
4630 leaf->start,
4631 btrfs_header_owner(leaf),
4632 btrfs_header_generation(leaf),
4633 key.objectid, 0);
4634 BUG_ON(ret);
4635
4636 btrfs_release_path(root, path);
4637 key.offset += num_bytes;
4638 } else {
4639 BUG_ON(1);
4640#if 0
4641 u64 alloc_hint;
4642 u64 extent_len;
4643 int i;
4644 /*
4645			 * drop the old extent pointer first, then insert the
4646			 * new pointers one by one
4647 */
4648 btrfs_release_path(root, path);
4649 ret = btrfs_drop_extents(trans, root, inode, key.offset,
4650 key.offset + num_bytes,
4651 key.offset, &alloc_hint);
4652 BUG_ON(ret);
4653
4654 for (i = 0; i < nr_extents; i++) {
4655 if (ext_offset >= new_extents[i].num_bytes) {
4656 ext_offset -= new_extents[i].num_bytes;
4657 continue;
4658 }
4659 extent_len = min(new_extents[i].num_bytes -
4660 ext_offset, num_bytes);
4661
4662 ret = btrfs_insert_empty_item(trans, root,
4663 path, &key,
4664 sizeof(*fi));
4665 BUG_ON(ret);
4666
4667 leaf = path->nodes[0];
4668 fi = btrfs_item_ptr(leaf, path->slots[0],
4669 struct btrfs_file_extent_item);
4670 btrfs_set_file_extent_generation(leaf, fi,
4671 trans->transid);
4672 btrfs_set_file_extent_type(leaf, fi,
4673 BTRFS_FILE_EXTENT_REG);
4674 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4675 new_extents[i].disk_bytenr);
4676 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4677 new_extents[i].disk_num_bytes);
4678 btrfs_set_file_extent_ram_bytes(leaf, fi,
4679 new_extents[i].ram_bytes);
4680
4681 btrfs_set_file_extent_compression(leaf, fi,
4682 new_extents[i].compression);
4683 btrfs_set_file_extent_encryption(leaf, fi,
4684 new_extents[i].encryption);
4685 btrfs_set_file_extent_other_encoding(leaf, fi,
4686 new_extents[i].other_encoding);
4687
4688 btrfs_set_file_extent_num_bytes(leaf, fi,
4689 extent_len);
4690 ext_offset += new_extents[i].offset;
4691 btrfs_set_file_extent_offset(leaf, fi,
4692 ext_offset);
4693 btrfs_mark_buffer_dirty(leaf);
4694
4695 btrfs_drop_extent_cache(inode, key.offset,
4696 key.offset + extent_len - 1, 0);
4697
4698 ret = btrfs_inc_extent_ref(trans, root,
4699 new_extents[i].disk_bytenr,
4700 new_extents[i].disk_num_bytes,
4701 leaf->start,
4702 root->root_key.objectid,
4703 trans->transid, key.objectid);
4704 BUG_ON(ret);
4705 btrfs_release_path(root, path);
4706
4707 inode_add_bytes(inode, extent_len);
4708
4709 ext_offset = 0;
4710 num_bytes -= extent_len;
4711 key.offset += extent_len;
4712
4713 if (num_bytes == 0)
4714 break;
4715 }
4716 BUG_ON(i >= nr_extents);
4717#endif
4718 }
4719
4720 if (extent_locked) {
4721 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4722 lock_end, GFP_NOFS);
4723 extent_locked = 0;
4724 }
4725skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset)
4728 break;
4729
4730 cond_resched();
4731 }
4732 ret = 0;
4733out:
4734 btrfs_release_path(root, path);
4735 if (inode) {
4736 mutex_unlock(&inode->i_mutex);
4737 if (extent_locked) {
4738 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4739 lock_end, GFP_NOFS);
4740 }
4741 iput(inode);
4742 }
4743 return ret;
4744}
4745
4746int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4747 struct btrfs_root *root,
4748 struct extent_buffer *buf, u64 orig_start)
4749{
4750 int level;
4751 int ret;
4752
4753 BUG_ON(btrfs_header_generation(buf) != trans->transid);
4754 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4755
4756 level = btrfs_header_level(buf);
4757 if (level == 0) {
4758 struct btrfs_leaf_ref *ref;
4759 struct btrfs_leaf_ref *orig_ref;
4760
4761 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4762 if (!orig_ref)
4763 return -ENOENT;
4764
4765 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4766 if (!ref) {
4767 btrfs_free_leaf_ref(root, orig_ref);
4768 return -ENOMEM;
4769 }
4770
4771 ref->nritems = orig_ref->nritems;
4772 memcpy(ref->extents, orig_ref->extents,
4773 sizeof(ref->extents[0]) * ref->nritems);
4774
4775 btrfs_free_leaf_ref(root, orig_ref);
4776
4777 ref->root_gen = trans->transid;
4778 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf);
4781 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref);
4784 }
4785 return 0;
4786}
4787
4788static noinline int invalidate_extent_cache(struct btrfs_root *root,
4789 struct extent_buffer *leaf,
4790 struct btrfs_block_group_cache *group,
4791 struct btrfs_root *target_root)
4792{
4793 struct btrfs_key key;
4794 struct inode *inode = NULL;
4795 struct btrfs_file_extent_item *fi;
4796 u64 num_bytes;
4797 u64 skip_objectid = 0;
4798 u32 nritems;
4799 u32 i;
4800
4801 nritems = btrfs_header_nritems(leaf);
4802 for (i = 0; i < nritems; i++) {
4803 btrfs_item_key_to_cpu(leaf, &key, i);
4804 if (key.objectid == skip_objectid ||
4805 key.type != BTRFS_EXTENT_DATA_KEY)
4806 continue;
4807 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4808 if (btrfs_file_extent_type(leaf, fi) ==
4809 BTRFS_FILE_EXTENT_INLINE)
4810 continue;
4811 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4812 continue;
4813 if (!inode || inode->i_ino != key.objectid) {
4814 iput(inode);
4815 inode = btrfs_ilookup(target_root->fs_info->sb,
4816 key.objectid, target_root, 1);
4817 }
4818 if (!inode) {
4819 skip_objectid = key.objectid;
4820 continue;
4821 }
4822 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4823
4824 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4825 key.offset + num_bytes - 1, GFP_NOFS);
4826 btrfs_drop_extent_cache(inode, key.offset,
4827 key.offset + num_bytes - 1, 1);
4828 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4829 key.offset + num_bytes - 1, GFP_NOFS);
4830 cond_resched();
4831 }
4832 iput(inode);
4833 return 0;
4834}
4835
4836static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4837 struct btrfs_root *root,
4838 struct extent_buffer *leaf,
4839 struct btrfs_block_group_cache *group,
4840 struct inode *reloc_inode)
4841{
4842 struct btrfs_key key;
4843 struct btrfs_key extent_key;
4844 struct btrfs_file_extent_item *fi;
4845 struct btrfs_leaf_ref *ref;
4846 struct disk_extent *new_extent;
4847 u64 bytenr;
4848 u64 num_bytes;
4849 u32 nritems;
4850 u32 i;
4851 int ext_index;
4852 int nr_extent;
4853 int ret;
4854
4855 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4856 BUG_ON(!new_extent);
4857
4858 ref = btrfs_lookup_leaf_ref(root, leaf->start);
4859 BUG_ON(!ref);
4860
4861 ext_index = -1;
4862 nritems = btrfs_header_nritems(leaf);
4863 for (i = 0; i < nritems; i++) {
4864 btrfs_item_key_to_cpu(leaf, &key, i);
4865 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4866 continue;
4867 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4868 if (btrfs_file_extent_type(leaf, fi) ==
4869 BTRFS_FILE_EXTENT_INLINE)
4870 continue;
4871 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4872 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4873 if (bytenr == 0)
4874 continue;
4875
4876 ext_index++;
4877 if (bytenr >= group->key.objectid + group->key.offset ||
4878 bytenr + num_bytes <= group->key.objectid)
4879 continue;
4880
4881 extent_key.objectid = bytenr;
4882 extent_key.offset = num_bytes;
4883 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4884 nr_extent = 1;
4885 ret = get_new_locations(reloc_inode, &extent_key,
4886 group->key.objectid, 1,
4887 &new_extent, &nr_extent);
4888 if (ret > 0)
4889 continue;
4890 BUG_ON(ret < 0);
4891
4892 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4893 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4894 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4895 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4896
4897 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4898 new_extent->disk_bytenr);
4899 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4900 new_extent->disk_num_bytes);
4901 btrfs_mark_buffer_dirty(leaf);
4902
4903 ret = btrfs_inc_extent_ref(trans, root,
4904 new_extent->disk_bytenr,
4905 new_extent->disk_num_bytes,
4906 leaf->start,
4907 root->root_key.objectid,
4908 trans->transid, key.objectid);
4909 BUG_ON(ret);
4910 ret = btrfs_free_extent(trans, root,
4911 bytenr, num_bytes, leaf->start,
4912 btrfs_header_owner(leaf),
4913 btrfs_header_generation(leaf),
4914 key.objectid, 0);
4915 BUG_ON(ret);
4916 cond_resched();
4917 }
4918 kfree(new_extent);
4919 BUG_ON(ext_index + 1 != ref->nritems);
4920 btrfs_free_leaf_ref(root, ref);
4921 return 0;
4922}
4923
4924int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4925 struct btrfs_root *root)
4926{
4927 struct btrfs_root *reloc_root;
4928 int ret;
4929
4930 if (root->reloc_root) {
4931 reloc_root = root->reloc_root;
4932 root->reloc_root = NULL;
4933 list_add(&reloc_root->dead_list,
4934 &root->fs_info->dead_reloc_roots);
4935
4936 btrfs_set_root_bytenr(&reloc_root->root_item,
4937 reloc_root->node->start);
4938		btrfs_set_root_level(&reloc_root->root_item,
4939 btrfs_header_level(reloc_root->node));
4940 memset(&reloc_root->root_item.drop_progress, 0,
4941 sizeof(struct btrfs_disk_key));
4942 reloc_root->root_item.drop_level = 0;
4943
4944 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4945 &reloc_root->root_key,
4946 &reloc_root->root_item);
4947 BUG_ON(ret);
4948 }
4949 return 0;
4950}
4951
4952int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4953{
4954 struct btrfs_trans_handle *trans;
4955 struct btrfs_root *reloc_root;
4956 struct btrfs_root *prev_root = NULL;
4957 struct list_head dead_roots;
4958 int ret;
4959 unsigned long nr;
4960
4961 INIT_LIST_HEAD(&dead_roots);
4962 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4963
4964 while (!list_empty(&dead_roots)) {
4965 reloc_root = list_entry(dead_roots.prev,
4966 struct btrfs_root, dead_list);
4967 list_del_init(&reloc_root->dead_list);
4968
4969 BUG_ON(reloc_root->commit_root != NULL);
4970 while (1) {
4971 trans = btrfs_join_transaction(root, 1);
4972 BUG_ON(!trans);
4973
4974 mutex_lock(&root->fs_info->drop_mutex);
4975 ret = btrfs_drop_snapshot(trans, reloc_root);
4976 if (ret != -EAGAIN)
4977 break;
4978 mutex_unlock(&root->fs_info->drop_mutex);
4979
4980 nr = trans->blocks_used;
4981 ret = btrfs_end_transaction(trans, root);
4982 BUG_ON(ret);
4983 btrfs_btree_balance_dirty(root, nr);
4984 }
4985
4986 free_extent_buffer(reloc_root->node);
4987
4988 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4989 &reloc_root->root_key);
4990 BUG_ON(ret);
4991 mutex_unlock(&root->fs_info->drop_mutex);
4992
4993 nr = trans->blocks_used;
4994 ret = btrfs_end_transaction(trans, root);
4995 BUG_ON(ret);
4996 btrfs_btree_balance_dirty(root, nr);
4997
4998 kfree(prev_root);
4999 prev_root = reloc_root;
5000 }
5001 if (prev_root) {
5002 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
5003 kfree(prev_root);
5004 }
5005 return 0;
5006}
5007
5008int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5009{
5010 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5011 return 0;
5012}
5013
5014int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5015{
5016 struct btrfs_root *reloc_root;
5017 struct btrfs_trans_handle *trans;
5018 struct btrfs_key location;
5019 int found;
5020 int ret;
5021
5022 mutex_lock(&root->fs_info->tree_reloc_mutex);
5023 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5024 BUG_ON(ret);
5025 found = !list_empty(&root->fs_info->dead_reloc_roots);
5026 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5027
5028 if (found) {
5029 trans = btrfs_start_transaction(root, 1);
5030 BUG_ON(!trans);
5031 ret = btrfs_commit_transaction(trans, root);
5032 BUG_ON(ret);
5033 }
5034
5035 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5036 location.offset = (u64)-1;
5037 location.type = BTRFS_ROOT_ITEM_KEY;
5038
5039 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5040 BUG_ON(!reloc_root);
5041 btrfs_orphan_cleanup(reloc_root);
5042 return 0;
5043}
5044
5045static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5046 struct btrfs_root *root)
5047{
5048 struct btrfs_root *reloc_root;
5049 struct extent_buffer *eb;
5050 struct btrfs_root_item *root_item;
5051 struct btrfs_key root_key;
5052 int ret;
5053
5054 BUG_ON(!root->ref_cows);
5055 if (root->reloc_root)
5056 return 0;
5057
5058 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5059 BUG_ON(!root_item);
5060
5061 ret = btrfs_copy_root(trans, root, root->commit_root,
5062 &eb, BTRFS_TREE_RELOC_OBJECTID);
5063 BUG_ON(ret);
5064
5065 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5066 root_key.offset = root->root_key.objectid;
5067 root_key.type = BTRFS_ROOT_ITEM_KEY;
5068
5069	memcpy(root_item, &root->root_item, sizeof(*root_item));
5070 btrfs_set_root_refs(root_item, 0);
5071 btrfs_set_root_bytenr(root_item, eb->start);
5072 btrfs_set_root_level(root_item, btrfs_header_level(eb));
5073 btrfs_set_root_generation(root_item, trans->transid);
5074
5075 btrfs_tree_unlock(eb);
5076 free_extent_buffer(eb);
5077
5078 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5079 &root_key, root_item);
5080 BUG_ON(ret);
5081 kfree(root_item);
5082
5083 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5084 &root_key);
5085 BUG_ON(!reloc_root);
5086 reloc_root->last_trans = trans->transid;
5087 reloc_root->commit_root = NULL;
5088 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5089
5090 root->reloc_root = reloc_root;
5091 return 0;
5092}
5093
5094/*
5095 * Core function of space balance.
5096 *
5097 * The idea is to use reloc trees to relocate tree blocks in reference
5098 * counted roots. There is one reloc tree for each subvol, and all
5099 * reloc trees share the same root key objectid. Reloc trees are
5100 * snapshots of the latest committed roots of subvols (root->commit_root).
5101 *
5102 * Relocating a tree block referenced by a subvol takes two steps:
5103 * COW the block through the subvol's reloc tree, then update the block
5104 * pointer in the subvol to point to the new block. Since all reloc trees
5105 * share the same root key objectid, special handling for tree blocks
5106 * owned by them is easy. Once a tree block has been COWed in one reloc
5107 * tree, we can use the resulting new block directly when the same block
5108 * must be COWed again through another reloc tree. In this way, relocated
5109 * tree blocks are shared between reloc trees, and so are also shared
5110 * between subvols.
5111 */
5112static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5113 struct btrfs_root *root,
5114 struct btrfs_path *path,
5115 struct btrfs_key *first_key,
5116 struct btrfs_ref_path *ref_path,
5117 struct btrfs_block_group_cache *group,
5118 struct inode *reloc_inode)
5119{
5120 struct btrfs_root *reloc_root;
5121 struct extent_buffer *eb = NULL;
5122 struct btrfs_key *keys;
5123 u64 *nodes;
5124 int level;
5125 int shared_level;
5126 int lowest_level = 0;
5127 int ret;
5128
5129 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5130 lowest_level = ref_path->owner_objectid;
5131
5132 if (!root->ref_cows) {
5133 path->lowest_level = lowest_level;
5134 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5135 BUG_ON(ret < 0);
5136 path->lowest_level = 0;
5137 btrfs_release_path(root, path);
5138 return 0;
5139 }
5140
5141 mutex_lock(&root->fs_info->tree_reloc_mutex);
5142 ret = init_reloc_tree(trans, root);
5143 BUG_ON(ret);
5144 reloc_root = root->reloc_root;
5145
5146 shared_level = ref_path->shared_level;
5147 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5148
5149 keys = ref_path->node_keys;
5150 nodes = ref_path->new_nodes;
5151 memset(&keys[shared_level + 1], 0,
5152 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5153 memset(&nodes[shared_level + 1], 0,
5154 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5155
5156 if (nodes[lowest_level] == 0) {
5157 path->lowest_level = lowest_level;
5158 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5159 0, 1);
5160 BUG_ON(ret);
5161 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5162 eb = path->nodes[level];
5163 if (!eb || eb == reloc_root->node)
5164 break;
5165 nodes[level] = eb->start;
5166 if (level == 0)
5167 btrfs_item_key_to_cpu(eb, &keys[level], 0);
5168 else
5169 btrfs_node_key_to_cpu(eb, &keys[level], 0);
5170 }
5171 if (nodes[0] &&
5172 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5173 eb = path->nodes[0];
5174 ret = replace_extents_in_leaf(trans, reloc_root, eb,
5175 group, reloc_inode);
5176 BUG_ON(ret);
5177 }
5178 btrfs_release_path(reloc_root, path);
5179 } else {
5180 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5181 lowest_level);
5182 BUG_ON(ret);
5183 }
5184
5185 /*
5186 * replace tree blocks in the fs tree with tree blocks in
5187 * the reloc tree.
5188 */
5189 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5190 BUG_ON(ret < 0);
5191
5192 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5193 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5194 0, 0);
5195 BUG_ON(ret);
5196 extent_buffer_get(path->nodes[0]);
5197 eb = path->nodes[0];
5198 btrfs_release_path(reloc_root, path);
5199 ret = invalidate_extent_cache(reloc_root, eb, group, root);
5200 BUG_ON(ret);
5201 free_extent_buffer(eb);
5202 }
5203
5204 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5205 path->lowest_level = 0;
5206 return 0;
5207}
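
/*
 * A condensed sketch of the flow relocate_one_path() implements, assuming
 * a caller that already holds 'trans', a reference counted 'root', and
 * node_keys/new_nodes arrays describing the shared path (locking, the
 * leaf-replacement pass and error handling are omitted):
 */
static int sketch_relocate_path(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_key *keys, u64 *nodes,
				int lowest_level)
{
	int ret;

	/* make sure the subvol has a reloc tree snapshot */
	ret = init_reloc_tree(trans, root);
	if (ret)
		return ret;

	/* step 1: COW the blocks through the subvol's reloc tree */
	ret = btrfs_merge_path(trans, root->reloc_root, keys, nodes,
			       lowest_level);
	if (ret)
		return ret;

	/* step 2: point the subvol itself at the relocated blocks */
	return btrfs_merge_path(trans, root, keys, nodes, lowest_level);
}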
5208
5209static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5210 struct btrfs_root *root,
5211 struct btrfs_path *path,
5212 struct btrfs_key *first_key,
5213 struct btrfs_ref_path *ref_path)
5214{
5215 int ret;
5216
5217 ret = relocate_one_path(trans, root, path, first_key,
5218 ref_path, NULL, NULL);
5219 BUG_ON(ret);
5220
5221 if (root == root->fs_info->extent_root)
5222 btrfs_extent_post_op(trans, root);
5223
5224 return 0;
5225}
5226
5227static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5228 struct btrfs_root *extent_root,
5229 struct btrfs_path *path,
5230 struct btrfs_key *extent_key)
5231{
5232 int ret;
5233
5234 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5235 if (ret)
5236 goto out;
5237 ret = btrfs_del_item(trans, extent_root, path);
5238out:
5239 btrfs_release_path(extent_root, path);
5240 return ret;
5241}
5242
5243static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5244 struct btrfs_ref_path *ref_path)
5245{
5246 struct btrfs_key root_key;
5247
5248 root_key.objectid = ref_path->root_objectid;
5249 root_key.type = BTRFS_ROOT_ITEM_KEY;
5250 if (is_cowonly_root(ref_path->root_objectid))
5251 root_key.offset = 0;
5252 else
5253 root_key.offset = (u64)-1;
5254
5255 return btrfs_read_fs_root_no_name(fs_info, &root_key);
5256}
5257
5258static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5259 struct btrfs_path *path,
5260 struct btrfs_key *extent_key,
5261 struct btrfs_block_group_cache *group,
5262 struct inode *reloc_inode, int pass)
5263{
5264 struct btrfs_trans_handle *trans;
5265 struct btrfs_root *found_root;
5266 struct btrfs_ref_path *ref_path = NULL;
5267 struct disk_extent *new_extents = NULL;
5268 int nr_extents = 0;
5269 int loops;
5270 int ret;
5271 int level;
5272 struct btrfs_key first_key;
5273 u64 prev_block = 0;
5274
5275
5276 trans = btrfs_start_transaction(extent_root, 1);
5277 BUG_ON(!trans);
5278
5279 if (extent_key->objectid == 0) {
5280 ret = del_extent_zero(trans, extent_root, path, extent_key);
5281 goto out;
5282 }
5283
5284 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5285 if (!ref_path) {
5286 ret = -ENOMEM;
5287 goto out;
5288 }
5289
5290 for (loops = 0; ; loops++) {
5291 if (loops == 0) {
5292 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5293 extent_key->objectid);
5294 } else {
5295 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5296 }
5297 if (ret < 0)
5298 goto out;
5299 if (ret > 0)
5300 break;
5301
5302 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5303 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5304 continue;
5305
5306 found_root = read_ref_root(extent_root->fs_info, ref_path);
5307 BUG_ON(!found_root);
5308 /*
5309		 * for reference counted trees, only process reference paths
5310 * rooted at the latest committed root.
5311 */
5312 if (found_root->ref_cows &&
5313 ref_path->root_generation != found_root->root_key.offset)
5314 continue;
5315
5316 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5317 if (pass == 0) {
5318 /*
5319 * copy data extents to new locations
5320 */
5321 u64 group_start = group->key.objectid;
5322 ret = relocate_data_extent(reloc_inode,
5323 extent_key,
5324 group_start);
5325 if (ret < 0)
5326 goto out;
5327 break;
5328 }
5329 level = 0;
5330 } else {
5331 level = ref_path->owner_objectid;
5332 }
5333
5334 if (prev_block != ref_path->nodes[level]) {
5335 struct extent_buffer *eb;
5336 u64 block_start = ref_path->nodes[level];
5337 u64 block_size = btrfs_level_size(found_root, level);
5338
5339 eb = read_tree_block(found_root, block_start,
5340 block_size, 0);
5341 btrfs_tree_lock(eb);
5342 BUG_ON(level != btrfs_header_level(eb));
5343
5344 if (level == 0)
5345 btrfs_item_key_to_cpu(eb, &first_key, 0);
5346 else
5347 btrfs_node_key_to_cpu(eb, &first_key, 0);
5348
5349 btrfs_tree_unlock(eb);
5350 free_extent_buffer(eb);
5351 prev_block = block_start;
5352 }
5353
5354 btrfs_record_root_in_trans(found_root);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /*
5357 * try to update data extent references while
5358 * keeping metadata shared between snapshots.
5359 */
5360 if (pass == 1) {
5361 ret = relocate_one_path(trans, found_root,
5362 path, &first_key, ref_path,
5363 group, reloc_inode);
5364 if (ret < 0)
5365 goto out;
5366 continue;
5367 }
5368 /*
5369			 * use the fallback method to process the remaining
5370 * references.
5371 */
5372 if (!new_extents) {
5373 u64 group_start = group->key.objectid;
5374 new_extents = kmalloc(sizeof(*new_extents),
5375 GFP_NOFS);
5376 nr_extents = 1;
5377 ret = get_new_locations(reloc_inode,
5378 extent_key,
5379 group_start, 1,
5380 &new_extents,
5381 &nr_extents);
5382 if (ret)
5383 goto out;
5384 }
5385 ret = replace_one_extent(trans, found_root,
5386 path, extent_key,
5387 &first_key, ref_path,
5388 new_extents, nr_extents);
5389 } else {
5390 ret = relocate_tree_block(trans, found_root, path,
5391 &first_key, ref_path);
5392 }
5393 if (ret < 0)
5394 goto out;
5395 }
5396 ret = 0;
5397out:
5398 btrfs_end_transaction(trans, extent_root);
5399 kfree(new_extents);
5400 kfree(ref_path);
5401 return ret;
5402}
5403
5404static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5405{
5406 u64 num_devices;
5407 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5408 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5409
5410 num_devices = root->fs_info->fs_devices->rw_devices;
5411 if (num_devices == 1) {
5412 stripped |= BTRFS_BLOCK_GROUP_DUP;
5413 stripped = flags & ~stripped;
5414
5415 /* turn raid0 into single device chunks */
5416 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5417 return stripped;
5418
5419 /* turn mirroring into duplication */
5420 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5421 BTRFS_BLOCK_GROUP_RAID10))
5422 return stripped | BTRFS_BLOCK_GROUP_DUP;
5423 return flags;
5424 } else {
5425 /* they already had raid on here, just return */
5426 if (flags & stripped)
5427 return flags;
5428
5429 stripped |= BTRFS_BLOCK_GROUP_DUP;
5430 stripped = flags & ~stripped;
5431
5432 /* switch duplicated blocks with raid1 */
5433 if (flags & BTRFS_BLOCK_GROUP_DUP)
5434 return stripped | BTRFS_BLOCK_GROUP_RAID1;
5435
5436 /* turn single device chunks into raid0 */
5437 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5438 }
5439 return flags;
5440}
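
/*
 * A short usage sketch of the conversions above (hypothetical caller; the
 * result depends on fs_devices->rw_devices at call time):
 */
static void sketch_flag_conversion(struct btrfs_root *root)
{
	u64 f1, f2;

	/* with a single rw device, mirroring degrades to duplication:
	 * f1 == BTRFS_BLOCK_GROUP_DUP */
	f1 = update_block_group_flags(root, BTRFS_BLOCK_GROUP_RAID1);

	/* with several rw devices, duplication is promoted to raid1:
	 * f2 == BTRFS_BLOCK_GROUP_RAID1 */
	f2 = update_block_group_flags(root, BTRFS_BLOCK_GROUP_DUP);
}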
5441
5442static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5443 struct btrfs_block_group_cache *shrink_block_group,
5444 int force)
5445{
5446 struct btrfs_trans_handle *trans;
5447 u64 new_alloc_flags;
5448 u64 calc;
5449
5450 spin_lock(&shrink_block_group->lock);
5451 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5452 spin_unlock(&shrink_block_group->lock);
5453
5454 trans = btrfs_start_transaction(root, 1);
5455 spin_lock(&shrink_block_group->lock);
5456
5457 new_alloc_flags = update_block_group_flags(root,
5458 shrink_block_group->flags);
5459 if (new_alloc_flags != shrink_block_group->flags) {
5460 calc =
5461 btrfs_block_group_used(&shrink_block_group->item);
5462 } else {
5463 calc = shrink_block_group->key.offset;
5464 }
5465 spin_unlock(&shrink_block_group->lock);
5466
5467 do_chunk_alloc(trans, root->fs_info->extent_root,
5468 calc + 2 * 1024 * 1024, new_alloc_flags, force);
5469
5470 btrfs_end_transaction(trans, root);
5471 } else
5472 spin_unlock(&shrink_block_group->lock);
5473 return 0;
5474}
5475
5476static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5477 struct btrfs_root *root,
5478 u64 objectid, u64 size)
5479{
5480 struct btrfs_path *path;
5481 struct btrfs_inode_item *item;
5482 struct extent_buffer *leaf;
5483 int ret;
5484
5485 path = btrfs_alloc_path();
5486 if (!path)
5487 return -ENOMEM;
5488
5489 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5490 if (ret)
5491 goto out;
5492
5493 leaf = path->nodes[0];
5494 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5495 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5496 btrfs_set_inode_generation(leaf, item, 1);
5497 btrfs_set_inode_size(leaf, item, size);
5498 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5499 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5500 btrfs_mark_buffer_dirty(leaf);
5501 btrfs_release_path(root, path);
5502out:
5503 btrfs_free_path(path);
5504 return ret;
5505}
5506
5507static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5508 struct btrfs_block_group_cache *group)
5509{
5510 struct inode *inode = NULL;
5511 struct btrfs_trans_handle *trans;
5512 struct btrfs_root *root;
5513 struct btrfs_key root_key;
5514 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5515 int err = 0;
5516
5517 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5518 root_key.type = BTRFS_ROOT_ITEM_KEY;
5519 root_key.offset = (u64)-1;
5520 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5521 if (IS_ERR(root))
5522 return ERR_CAST(root);
5523
5524 trans = btrfs_start_transaction(root, 1);
5525 BUG_ON(!trans);
5526
5527 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5528 if (err)
5529 goto out;
5530
5531 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5532 BUG_ON(err);
5533
5534 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5535 group->key.offset, 0, group->key.offset,
5536 0, 0, 0);
5537 BUG_ON(err);
5538
5539 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5540 if (inode->i_state & I_NEW) {
5541 BTRFS_I(inode)->root = root;
5542 BTRFS_I(inode)->location.objectid = objectid;
5543 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5544 BTRFS_I(inode)->location.offset = 0;
5545 btrfs_read_locked_inode(inode);
5546 unlock_new_inode(inode);
5547 BUG_ON(is_bad_inode(inode));
5548 } else {
5549 BUG_ON(1);
5550 }
5551 BTRFS_I(inode)->index_cnt = group->key.objectid;
5552
5553 err = btrfs_orphan_add(trans, inode);
5554out:
5555 btrfs_end_transaction(trans, root);
5556 if (err) {
5557 if (inode)
5558 iput(inode);
5559 inode = ERR_PTR(err);
5560 }
5561 return inode;
5562}
5563
5564int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5565{
5566
5567 struct btrfs_ordered_sum *sums;
5568 struct btrfs_sector_sum *sector_sum;
5569 struct btrfs_ordered_extent *ordered;
5570 struct btrfs_root *root = BTRFS_I(inode)->root;
5571 struct list_head list;
5572 size_t offset;
5573 int ret;
5574 u64 disk_bytenr;
5575
5576 INIT_LIST_HEAD(&list);
5577
5578 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5579 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5580
5581 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5582 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
5583 disk_bytenr + len - 1, &list);
5584
5585 while (!list_empty(&list)) {
5586 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5587 list_del_init(&sums->list);
5588
5589 sector_sum = sums->sums;
5590 sums->bytenr = ordered->start;
5591
5592 offset = 0;
5593 while (offset < sums->len) {
5594 sector_sum->bytenr += ordered->start - disk_bytenr;
5595 sector_sum++;
5596 offset += root->sectorsize;
5597 }
5598
5599 btrfs_add_ordered_sum(inode, ordered, sums);
5600 }
5601 btrfs_put_ordered_extent(ordered);
5602 return 0;
5603}
5604
5605int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5606{
5607 struct btrfs_trans_handle *trans;
5608 struct btrfs_path *path;
5609 struct btrfs_fs_info *info = root->fs_info;
5610 struct extent_buffer *leaf;
5611 struct inode *reloc_inode;
5612 struct btrfs_block_group_cache *block_group;
5613 struct btrfs_key key;
5614 u64 skipped;
5615 u64 cur_byte;
5616 u64 total_found;
5617 u32 nritems;
5618 int ret;
5619 int progress;
5620 int pass = 0;
5621
5622 root = root->fs_info->extent_root;
5623
5624 block_group = btrfs_lookup_block_group(info, group_start);
5625 BUG_ON(!block_group);
5626
5627 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
5628 (unsigned long long)block_group->key.objectid,
5629 (unsigned long long)block_group->flags);
5630
5631 path = btrfs_alloc_path();
5632 BUG_ON(!path);
5633
5634 reloc_inode = create_reloc_inode(info, block_group);
5635 BUG_ON(IS_ERR(reloc_inode));
5636
5637 __alloc_chunk_for_shrink(root, block_group, 1);
5638 set_block_group_readonly(block_group);
5639
5640 btrfs_start_delalloc_inodes(info->tree_root);
5641 btrfs_wait_ordered_extents(info->tree_root, 0);
5642again:
5643 skipped = 0;
5644 total_found = 0;
5645 progress = 0;
5646 key.objectid = block_group->key.objectid;
5647 key.offset = 0;
5648 key.type = 0;
5649 cur_byte = key.objectid;
5650
5651 trans = btrfs_start_transaction(info->tree_root, 1);
5652 btrfs_commit_transaction(trans, info->tree_root);
5653
5654 mutex_lock(&root->fs_info->cleaner_mutex);
5655 btrfs_clean_old_snapshots(info->tree_root);
5656 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5657 mutex_unlock(&root->fs_info->cleaner_mutex);
5658
5659 while (1) {
5660 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5661 if (ret < 0)
5662 goto out;
5663next:
5664 leaf = path->nodes[0];
5665 nritems = btrfs_header_nritems(leaf);
5666 if (path->slots[0] >= nritems) {
5667 ret = btrfs_next_leaf(root, path);
5668 if (ret < 0)
5669 goto out;
5670 if (ret == 1) {
5671 ret = 0;
5672 break;
5673 }
5674 leaf = path->nodes[0];
5675 nritems = btrfs_header_nritems(leaf);
5676 }
5677
5678 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5679
5680 if (key.objectid >= block_group->key.objectid +
5681 block_group->key.offset)
5682 break;
5683
5684 if (progress && need_resched()) {
5685 btrfs_release_path(root, path);
5686 cond_resched();
5687 progress = 0;
5688 continue;
5689 }
5690 progress = 1;
5691
5692 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5693 key.objectid + key.offset <= cur_byte) {
5694 path->slots[0]++;
5695 goto next;
5696 }
5697
5698 total_found++;
5699 cur_byte = key.objectid + key.offset;
5700 btrfs_release_path(root, path);
5701
5702 __alloc_chunk_for_shrink(root, block_group, 0);
5703 ret = relocate_one_extent(root, path, &key, block_group,
5704 reloc_inode, pass);
5705 BUG_ON(ret < 0);
5706 if (ret > 0)
5707 skipped++;
5708
5709 key.objectid = cur_byte;
5710 key.type = 0;
5711 key.offset = 0;
5712 }
5713
5714 btrfs_release_path(root, path);
5715
5716 if (pass == 0) {
5717 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5718 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5719 }
5720
5721 if (total_found > 0) {
5722 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5723 (unsigned long long)total_found, pass);
5724 pass++;
5725 if (total_found == skipped && pass > 2) {
5726 iput(reloc_inode);
5727 reloc_inode = create_reloc_inode(info, block_group);
5728 pass = 0;
5729 }
5730 goto again;
5731 }
5732
5733 /* delete reloc_inode */
5734 iput(reloc_inode);
5735
5736 /* unpin extents in this range */
5737 trans = btrfs_start_transaction(info->tree_root, 1);
5738 btrfs_commit_transaction(trans, info->tree_root);
5739
5740 spin_lock(&block_group->lock);
5741 WARN_ON(block_group->pinned > 0);
5742 WARN_ON(block_group->reserved > 0);
5743 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5744 spin_unlock(&block_group->lock);
5745 put_block_group(block_group);
5746 ret = 0;
5747out:
5748 btrfs_free_path(path);
5749 return ret;
5750}
5751
5752static int find_first_block_group(struct btrfs_root *root,
5753 struct btrfs_path *path, struct btrfs_key *key)
5754{
5755 int ret = 0;
5756 struct btrfs_key found_key;
5757 struct extent_buffer *leaf;
5758 int slot;
5759
5760 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5761 if (ret < 0)
5762 goto out;
5763
5764 while (1) {
5765 slot = path->slots[0];
5766 leaf = path->nodes[0];
5767 if (slot >= btrfs_header_nritems(leaf)) {
5768 ret = btrfs_next_leaf(root, path);
5769 if (ret == 0)
5770 continue;
5771 if (ret < 0)
5772 goto out;
5773 break;
5774 }
5775 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5776
5777 if (found_key.objectid >= key->objectid &&
5778 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5779 ret = 0;
5780 goto out;
5781 }
5782 path->slots[0]++;
5783 }
5784 ret = -ENOENT;
5785out:
5786 return ret;
5787}
5788
5789int btrfs_free_block_groups(struct btrfs_fs_info *info)
5790{
5791 struct btrfs_block_group_cache *block_group;
5792 struct rb_node *n;
5793
5794 spin_lock(&info->block_group_cache_lock);
5795 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5796 block_group = rb_entry(n, struct btrfs_block_group_cache,
5797 cache_node);
5798 rb_erase(&block_group->cache_node,
5799 &info->block_group_cache_tree);
5800 spin_unlock(&info->block_group_cache_lock);
5801
5802 btrfs_remove_free_space_cache(block_group);
5803 down_write(&block_group->space_info->groups_sem);
5804 list_del(&block_group->list);
5805 up_write(&block_group->space_info->groups_sem);
5806
5807 WARN_ON(atomic_read(&block_group->count) != 1);
5808 kfree(block_group);
5809
5810 spin_lock(&info->block_group_cache_lock);
5811 }
5812 spin_unlock(&info->block_group_cache_lock);
5813 return 0;
5814}
5815
5816int btrfs_read_block_groups(struct btrfs_root *root)
5817{
5818 struct btrfs_path *path;
5819 int ret;
5820 struct btrfs_block_group_cache *cache;
5821 struct btrfs_fs_info *info = root->fs_info;
5822 struct btrfs_space_info *space_info;
5823 struct btrfs_key key;
5824 struct btrfs_key found_key;
5825 struct extent_buffer *leaf;
5826
5827 root = info->extent_root;
5828 key.objectid = 0;
5829 key.offset = 0;
5830 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5831 path = btrfs_alloc_path();
5832 if (!path)
5833 return -ENOMEM;
5834
5835 while (1) {
5836 ret = find_first_block_group(root, path, &key);
5837 if (ret > 0) {
5838 ret = 0;
5839 goto error;
5840 }
5841 if (ret != 0)
5842 goto error;
5843
5844 leaf = path->nodes[0];
5845 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5846 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5847 if (!cache) {
5848 ret = -ENOMEM;
5849 break;
5850 }
5851
5852 atomic_set(&cache->count, 1);
5853 spin_lock_init(&cache->lock);
5854 mutex_init(&cache->alloc_mutex);
5855 mutex_init(&cache->cache_mutex);
5856 INIT_LIST_HEAD(&cache->list);
5857 read_extent_buffer(leaf, &cache->item,
5858 btrfs_item_ptr_offset(leaf, path->slots[0]),
5859 sizeof(cache->item));
5860 memcpy(&cache->key, &found_key, sizeof(found_key));
5861
5862 key.objectid = found_key.objectid + found_key.offset;
5863 btrfs_release_path(root, path);
5864 cache->flags = btrfs_block_group_flags(&cache->item);
5865
5866 ret = update_space_info(info, cache->flags, found_key.offset,
5867 btrfs_block_group_used(&cache->item),
5868 &space_info);
5869 BUG_ON(ret);
5870 cache->space_info = space_info;
5871 down_write(&space_info->groups_sem);
5872 list_add_tail(&cache->list, &space_info->block_groups);
5873 up_write(&space_info->groups_sem);
5874
5875 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5876 BUG_ON(ret);
5877
5878 set_avail_alloc_bits(root->fs_info, cache->flags);
5879 if (btrfs_chunk_readonly(root, cache->key.objectid))
5880 set_block_group_readonly(cache);
5881 }
5882 ret = 0;
5883error:
5884 btrfs_free_path(path);
5885 return ret;
5886}
5887
5888int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5889 struct btrfs_root *root, u64 bytes_used,
5890 u64 type, u64 chunk_objectid, u64 chunk_offset,
5891 u64 size)
5892{
5893 int ret;
5894 struct btrfs_root *extent_root;
5895 struct btrfs_block_group_cache *cache;
5896
5897 extent_root = root->fs_info->extent_root;
5898
5899 root->fs_info->last_trans_new_blockgroup = trans->transid;
5900
5901 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5902 if (!cache)
5903 return -ENOMEM;
5904
5905 cache->key.objectid = chunk_offset;
5906 cache->key.offset = size;
5907 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5908 atomic_set(&cache->count, 1);
5909 spin_lock_init(&cache->lock);
5910 mutex_init(&cache->alloc_mutex);
5911 mutex_init(&cache->cache_mutex);
5912 INIT_LIST_HEAD(&cache->list);
5913
5914 btrfs_set_block_group_used(&cache->item, bytes_used);
5915 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5916 cache->flags = type;
5917 btrfs_set_block_group_flags(&cache->item, type);
5918
5919 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5920 &cache->space_info);
5921 BUG_ON(ret);
5922 down_write(&cache->space_info->groups_sem);
5923 list_add_tail(&cache->list, &cache->space_info->block_groups);
5924 up_write(&cache->space_info->groups_sem);
5925
5926 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5927 BUG_ON(ret);
5928
5929 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5930 sizeof(cache->item));
5931 BUG_ON(ret);
5932
5933 finish_current_insert(trans, extent_root, 0);
5934 ret = del_pending_extents(trans, extent_root, 0);
5935 BUG_ON(ret);
5936 set_avail_alloc_bits(extent_root->fs_info, type);
5937
5938 return 0;
5939}
5940
5941int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5942 struct btrfs_root *root, u64 group_start)
5943{
5944 struct btrfs_path *path;
5945 struct btrfs_block_group_cache *block_group;
5946 struct btrfs_key key;
5947 int ret;
5948
5949 root = root->fs_info->extent_root;
5950
5951 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5952 BUG_ON(!block_group);
5953 BUG_ON(!block_group->ro);
5954
5955 memcpy(&key, &block_group->key, sizeof(key));
5956
5957 path = btrfs_alloc_path();
5958 BUG_ON(!path);
5959
5960 btrfs_remove_free_space_cache(block_group);
5961 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree);
5963 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem);
5966
5967 spin_lock(&block_group->space_info->lock);
5968 block_group->space_info->total_bytes -= block_group->key.offset;
5969 block_group->space_info->bytes_readonly -= block_group->key.offset;
5970 spin_unlock(&block_group->space_info->lock);
5971 block_group->space_info->full = 0;
5972
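	/*
	 * two puts: one drops the reference taken by
	 * btrfs_lookup_block_group() above, the other drops the reference
	 * the block group cache tree was holding (presumably why
	 * put_block_group() appears back to back).
	 */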
5973 put_block_group(block_group);
5974 put_block_group(block_group);
5975
5976 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5977 if (ret > 0)
5978 ret = -EIO;
5979 if (ret < 0)
5980 goto out;
5981
5982 ret = btrfs_del_item(trans, root, path);
5983out:
5984 btrfs_free_path(path);
5985 return ret;
5986}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..e086d407f1fa
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3717 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
33#define LEAK_DEBUG 0
34#if LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock);
36#endif
37
38#define BUFFER_LRU_MAX 64
39
40struct tree_entry {
41 u64 start;
42 u64 end;
43 struct rb_node rb_node;
44};
45
46struct extent_page_data {
47 struct bio *bio;
48 struct extent_io_tree *tree;
49 get_extent_t *get_extent;
50
51	/* tells writepage not to lock the state bits for this range;
52	 * it still does the unlocking
53 */
54 int extent_locked;
55};
56
57int __init extent_io_init(void)
58{
59 extent_state_cache = btrfs_cache_create("extent_state",
60 sizeof(struct extent_state), 0,
61 NULL);
62 if (!extent_state_cache)
63 return -ENOMEM;
64
65 extent_buffer_cache = btrfs_cache_create("extent_buffers",
66 sizeof(struct extent_buffer), 0,
67 NULL);
68 if (!extent_buffer_cache)
69 goto free_state_cache;
70 return 0;
71
72free_state_cache:
73 kmem_cache_destroy(extent_state_cache);
74 return -ENOMEM;
75}
76
77void extent_io_exit(void)
78{
79 struct extent_state *state;
80 struct extent_buffer *eb;
81
82 while (!list_empty(&states)) {
83 state = list_entry(states.next, struct extent_state, leak_list);
84 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
85 "state %lu in tree %p refs %d\n",
86 (unsigned long long)state->start,
87 (unsigned long long)state->end,
88 state->state, state->tree, atomic_read(&state->refs));
89 list_del(&state->leak_list);
90 kmem_cache_free(extent_state_cache, state);
91
92 }
93
94 while (!list_empty(&buffers)) {
95 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
96 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
97 "refs %d\n", (unsigned long long)eb->start,
98 eb->len, atomic_read(&eb->refs));
99 list_del(&eb->leak_list);
100 kmem_cache_free(extent_buffer_cache, eb);
101 }
102 if (extent_state_cache)
103 kmem_cache_destroy(extent_state_cache);
104 if (extent_buffer_cache)
105 kmem_cache_destroy(extent_buffer_cache);
106}
107
108void extent_io_tree_init(struct extent_io_tree *tree,
109 struct address_space *mapping, gfp_t mask)
110{
111 tree->state.rb_node = NULL;
112 tree->buffer.rb_node = NULL;
113 tree->ops = NULL;
114 tree->dirty_bytes = 0;
115 spin_lock_init(&tree->lock);
116 spin_lock_init(&tree->buffer_lock);
117 tree->mapping = mapping;
118}
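
/*
 * A minimal setup sketch, assuming a caller with an inode in hand (the
 * 'inode' variable here is hypothetical):
 */
static void sketch_io_tree_setup(struct inode *inode,
				 struct extent_io_tree *io_tree)
{
	/* ties the tree to the inode's page cache mapping */
	extent_io_tree_init(io_tree, inode->i_mapping, GFP_NOFS);
}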
119
120static struct extent_state *alloc_extent_state(gfp_t mask)
121{
122 struct extent_state *state;
123#if LEAK_DEBUG
124 unsigned long flags;
125#endif
126
127 state = kmem_cache_alloc(extent_state_cache, mask);
128 if (!state)
129 return state;
130 state->state = 0;
131 state->private = 0;
132 state->tree = NULL;
133#if LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags);
137#endif
138 atomic_set(&state->refs, 1);
139 init_waitqueue_head(&state->wq);
140 return state;
141}
142
143static void free_extent_state(struct extent_state *state)
144{
145 if (!state)
146 return;
147 if (atomic_dec_and_test(&state->refs)) {
148#if LEAK_DEBUG
149 unsigned long flags;
150#endif
151 WARN_ON(state->tree);
152#if LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags);
156#endif
157 kmem_cache_free(extent_state_cache, state);
158 }
159}
160
161static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
162 struct rb_node *node)
163{
164 struct rb_node **p = &root->rb_node;
165 struct rb_node *parent = NULL;
166 struct tree_entry *entry;
167
168 while (*p) {
169 parent = *p;
170 entry = rb_entry(parent, struct tree_entry, rb_node);
171
172 if (offset < entry->start)
173 p = &(*p)->rb_left;
174 else if (offset > entry->end)
175 p = &(*p)->rb_right;
176 else
177 return parent;
178 }
179
180 entry = rb_entry(node, struct tree_entry, rb_node);
181 rb_link_node(node, parent, p);
182 rb_insert_color(node, root);
183 return NULL;
184}
185
186static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
187 struct rb_node **prev_ret,
188 struct rb_node **next_ret)
189{
190 struct rb_root *root = &tree->state;
191 struct rb_node *n = root->rb_node;
192 struct rb_node *prev = NULL;
193 struct rb_node *orig_prev = NULL;
194 struct tree_entry *entry;
195 struct tree_entry *prev_entry = NULL;
196
197 while (n) {
198 entry = rb_entry(n, struct tree_entry, rb_node);
199 prev = n;
200 prev_entry = entry;
201
202 if (offset < entry->start)
203 n = n->rb_left;
204 else if (offset > entry->end)
205 n = n->rb_right;
206 else
207 return n;
208 }
209
210 if (prev_ret) {
211 orig_prev = prev;
212 while (prev && offset > prev_entry->end) {
213 prev = rb_next(prev);
214 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
215 }
216 *prev_ret = prev;
217 prev = orig_prev;
218 }
219
220 if (next_ret) {
221 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222 while (prev && offset < prev_entry->start) {
223 prev = rb_prev(prev);
224 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
225 }
226 *next_ret = prev;
227 }
228 return NULL;
229}
230
231static inline struct rb_node *tree_search(struct extent_io_tree *tree,
232 u64 offset)
233{
234 struct rb_node *prev = NULL;
235 struct rb_node *ret;
236
237 ret = __etree_search(tree, offset, &prev, NULL);
238 if (!ret)
239 return prev;
240 return ret;
241}
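
/*
 * A worked example of the lookup semantics above, with hypothetical
 * offsets: given entries [0,4095] and [8192,12287], tree_search(tree, 5000)
 * finds no entry containing 5000, so __etree_search() advances 'prev' past
 * every entry ending before 5000 and tree_search() returns the node for
 * [8192,12287], the first extent that ends after the search offset.
 */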
242
243static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
244 u64 offset, struct rb_node *node)
245{
246 struct rb_root *root = &tree->buffer;
247 struct rb_node **p = &root->rb_node;
248 struct rb_node *parent = NULL;
249 struct extent_buffer *eb;
250
251 while (*p) {
252 parent = *p;
253 eb = rb_entry(parent, struct extent_buffer, rb_node);
254
255 if (offset < eb->start)
256 p = &(*p)->rb_left;
257 else if (offset > eb->start)
258 p = &(*p)->rb_right;
259 else
260 return eb;
261 }
262
263 rb_link_node(node, parent, p);
264 rb_insert_color(node, root);
265 return NULL;
266}
267
268static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
269 u64 offset)
270{
271 struct rb_root *root = &tree->buffer;
272 struct rb_node *n = root->rb_node;
273 struct extent_buffer *eb;
274
275 while (n) {
276 eb = rb_entry(n, struct extent_buffer, rb_node);
277 if (offset < eb->start)
278 n = n->rb_left;
279 else if (offset > eb->start)
280 n = n->rb_right;
281 else
282 return eb;
283 }
284 return NULL;
285}
286
287/*
288 * utility function to look for merge candidates inside a given range.
289 * Any extents with matching state are merged together into a single
290 * extent in the tree. Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set
291 * in their state field are not merged, since end_io handlers must do
292 * operations on them without sleeping (or doing allocations/splits).
293 *
294 * This should be called with the tree lock held.
295 */
296static int merge_state(struct extent_io_tree *tree,
297 struct extent_state *state)
298{
299 struct extent_state *other;
300 struct rb_node *other_node;
301
302 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
303 return 0;
304
305 other_node = rb_prev(&state->rb_node);
306 if (other_node) {
307 other = rb_entry(other_node, struct extent_state, rb_node);
308 if (other->end == state->start - 1 &&
309 other->state == state->state) {
310 state->start = other->start;
311 other->tree = NULL;
312 rb_erase(&other->rb_node, &tree->state);
313 free_extent_state(other);
314 }
315 }
316 other_node = rb_next(&state->rb_node);
317 if (other_node) {
318 other = rb_entry(other_node, struct extent_state, rb_node);
319 if (other->start == state->end + 1 &&
320 other->state == state->state) {
321 other->start = state->start;
322 state->tree = NULL;
323 rb_erase(&state->rb_node, &tree->state);
324 free_extent_state(state);
325 }
326 }
327 return 0;
328}
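
/*
 * A toy illustration of the merge rule above; the toy_* types are
 * illustrative only and much simpler than struct extent_state:
 */
struct toy_state_range {
	unsigned long long start;
	unsigned long long end;
	unsigned long bits;
};

static int toy_can_merge(const struct toy_state_range *a,
			 const struct toy_state_range *b)
{
	/* e.g. [0,4095] and [4096,8191] with equal bits collapse to [0,8191] */
	return a->end + 1 == b->start && a->bits == b->bits;
}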
329
330static void set_state_cb(struct extent_io_tree *tree,
331 struct extent_state *state,
332 unsigned long bits)
333{
334 if (tree->ops && tree->ops->set_bit_hook) {
335 tree->ops->set_bit_hook(tree->mapping->host, state->start,
336 state->end, state->state, bits);
337 }
338}
339
340static void clear_state_cb(struct extent_io_tree *tree,
341 struct extent_state *state,
342 unsigned long bits)
343{
344 if (tree->ops && tree->ops->clear_bit_hook) {
345 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
346 state->end, state->state, bits);
347 }
348}
349
350/*
351 * insert an extent_state struct into the tree. 'bits' are set on the
352 * struct before it is inserted.
353 *
354 * This may return -EEXIST if the extent is already there, in which case the
355 * state struct is freed.
356 *
357 * The tree lock is not taken internally. This is a utility function and
358 * probably isn't what you want to call (see set/clear_extent_bit).
359 */
360static int insert_state(struct extent_io_tree *tree,
361 struct extent_state *state, u64 start, u64 end,
362 int bits)
363{
364 struct rb_node *node;
365
366 if (end < start) {
367 printk(KERN_ERR "btrfs end < start %llu %llu\n",
368 (unsigned long long)end,
369 (unsigned long long)start);
370 WARN_ON(1);
371 }
372 if (bits & EXTENT_DIRTY)
373 tree->dirty_bytes += end - start + 1;
374 set_state_cb(tree, state, bits);
375 state->state |= bits;
376 state->start = start;
377 state->end = end;
378 node = tree_insert(&tree->state, end, &state->rb_node);
379 if (node) {
380 struct extent_state *found;
381 found = rb_entry(node, struct extent_state, rb_node);
382 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
383 "%llu %llu\n", (unsigned long long)found->start,
384 (unsigned long long)found->end,
385 (unsigned long long)start, (unsigned long long)end);
386 free_extent_state(state);
387 return -EEXIST;
388 }
389 state->tree = tree;
390 merge_state(tree, state);
391 return 0;
392}
393
394/*
395 * split a given extent state struct in two, inserting the preallocated
396 * struct 'prealloc' as the newly created second half. 'split' indicates an
397 * offset inside 'orig' where it should be split.
398 *
399 * Before calling,
400 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
401 * are two extent state structs in the tree:
402 * prealloc: [orig->start, split - 1]
403 * orig: [ split, orig->end ]
404 *
405 * The tree locks are not taken by this function. They need to be held
406 * by the caller.
407 */
408static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
409 struct extent_state *prealloc, u64 split)
410{
411 struct rb_node *node;
412 prealloc->start = orig->start;
413 prealloc->end = split - 1;
414 prealloc->state = orig->state;
415 orig->start = split;
416
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc);
422 return -EEXIST;
423 }
424 prealloc->tree = tree;
425 return 0;
426}
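
/*
 * Worked example with hypothetical values: splitting orig = [100, 199]
 * at split = 150 leaves
 *	prealloc: [100, 149]	(inherits orig->state)
 *	orig:     [150, 199]
 * matching the diagram in the comment above split_state().
 */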
427
428/*
429 * utility function to clear some bits in an extent state struct.
430 * it will optionally wake up any one waiting on this state (wake == 1), or
431 * forcibly remove the state from the tree (delete == 1).
432 *
433 * If no bits are set on the state struct after clearing things, the
434 * struct is freed and removed from the tree
435 */
436static int clear_state_bit(struct extent_io_tree *tree,
437 struct extent_state *state, int bits, int wake,
438 int delete)
439{
440 int ret = state->state & bits;
441
442 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
443 u64 range = state->end - state->start + 1;
444 WARN_ON(range > tree->dirty_bytes);
445 tree->dirty_bytes -= range;
446 }
447 clear_state_cb(tree, state, bits);
448 state->state &= ~bits;
449 if (wake)
450 wake_up(&state->wq);
451 if (delete || state->state == 0) {
452 if (state->tree) {
453 clear_state_cb(tree, state, state->state);
454 rb_erase(&state->rb_node, &tree->state);
455 state->tree = NULL;
456 free_extent_state(state);
457 } else {
458 WARN_ON(1);
459 }
460 } else {
461 merge_state(tree, state);
462 }
463 return ret;
464}
465
466/*
467 * clear some bits on a range in the tree. This may require splitting
468 * or inserting elements in the tree, so the gfp mask is used to
469 * indicate which allocations or sleeping are allowed.
470 *
471 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
472 * the given range from the tree regardless of state (i.e. for truncate).
473 *
474 * the range [start, end] is inclusive.
475 *
476 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
477 * bits were already set, or zero if none of the bits were already set.
478 */
479int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
480 int bits, int wake, int delete, gfp_t mask)
481{
482 struct extent_state *state;
483 struct extent_state *prealloc = NULL;
484 struct rb_node *node;
485 int err;
486 int set = 0;
487
488again:
489 if (!prealloc && (mask & __GFP_WAIT)) {
490 prealloc = alloc_extent_state(mask);
491 if (!prealloc)
492 return -ENOMEM;
493 }
494
495 spin_lock(&tree->lock);
496 /*
497 * this search will find the extents that end after
498 * our range starts
499 */
500 node = tree_search(tree, start);
501 if (!node)
502 goto out;
503 state = rb_entry(node, struct extent_state, rb_node);
504 if (state->start > end)
505 goto out;
506 WARN_ON(state->end < start);
507
508 /*
509 * | ---- desired range ---- |
510 * | state | or
511 * | ------------- state -------------- |
512 *
513 * We need to split the extent we found, and may flip
514 * bits on second half.
515 *
516 * If the extent we found extends past our range, we
517 * just split and search again. It'll get split again
518 * the next time though.
519 *
520 * If the extent we found is inside our range, we clear
521 * the desired bit on it.
522 */
523
524 if (state->start < start) {
525 if (!prealloc)
526 prealloc = alloc_extent_state(GFP_ATOMIC);
527 err = split_state(tree, state, prealloc, start);
528 BUG_ON(err == -EEXIST);
529 prealloc = NULL;
530 if (err)
531 goto out;
532 if (state->end <= end) {
533 start = state->end + 1;
534 set |= clear_state_bit(tree, state, bits,
535 wake, delete);
536 } else {
537 start = state->start;
538 }
539 goto search_again;
540 }
541 /*
542 * | ---- desired range ---- |
543 * | state |
544 * We need to split the extent, and clear the bit
545 * on the first half
546 */
547 if (state->start <= end && state->end > end) {
548 if (!prealloc)
549 prealloc = alloc_extent_state(GFP_ATOMIC);
550 err = split_state(tree, state, prealloc, end + 1);
551 BUG_ON(err == -EEXIST);
552
553 if (wake)
554 wake_up(&state->wq);
555 set |= clear_state_bit(tree, prealloc, bits,
556 wake, delete);
557 prealloc = NULL;
558 goto out;
559 }
560
561 start = state->end + 1;
562 set |= clear_state_bit(tree, state, bits, wake, delete);
563 goto search_again;
564
565out:
566 spin_unlock(&tree->lock);
567 if (prealloc)
568 free_extent_state(prealloc);
569
570 return set;
571
572search_again:
573 if (start > end)
574 goto out;
575 spin_unlock(&tree->lock);
576 if (mask & __GFP_WAIT)
577 cond_resched();
578 goto again;
579}
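
/*
 * Illustrative call (hypothetical range, assuming 4k pages): to drop the
 * dirty bit on the second page of a file and wake any waiters:
 *
 *	clear_extent_bit(tree, 4096, 8191, EXTENT_DIRTY, 1, 0, GFP_NOFS);
 *
 * If [4096, 8191] sits inside a larger state record, the record is split
 * so the untouched head and tail keep their bits.
 */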
580
581static int wait_on_state(struct extent_io_tree *tree,
582 struct extent_state *state)
583 __releases(tree->lock)
584 __acquires(tree->lock)
585{
586 DEFINE_WAIT(wait);
587 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
588 spin_unlock(&tree->lock);
589 schedule();
590 spin_lock(&tree->lock);
591 finish_wait(&state->wq, &wait);
592 return 0;
593}
594
595/*
596 * waits for one or more bits to clear on a range in the state tree.
597 * The range [start, end] is inclusive.
598 * The tree lock is taken by this function
599 */
600int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
601{
602 struct extent_state *state;
603 struct rb_node *node;
604
605 spin_lock(&tree->lock);
606again:
607 while (1) {
608 /*
609 * this search will find all the extents that end after
610 * our range starts
611 */
612 node = tree_search(tree, start);
613 if (!node)
614 break;
615
616 state = rb_entry(node, struct extent_state, rb_node);
617
618 if (state->start > end)
619 goto out;
620
621 if (state->state & bits) {
622 start = state->start;
623 atomic_inc(&state->refs);
624 wait_on_state(tree, state);
625 free_extent_state(state);
626 goto again;
627 }
628 start = state->end + 1;
629
630 if (start > end)
631 break;
632
633 if (need_resched()) {
634 spin_unlock(&tree->lock);
635 cond_resched();
636 spin_lock(&tree->lock);
637 }
638 }
639out:
640 spin_unlock(&tree->lock);
641 return 0;
642}
643
644static void set_state_bits(struct extent_io_tree *tree,
645 struct extent_state *state,
646 int bits)
647{
648 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
649 u64 range = state->end - state->start + 1;
650 tree->dirty_bytes += range;
651 }
652 set_state_cb(tree, state, bits);
653 state->state |= bits;
654}
655
656/*
657 * set some bits on a range in the tree. This may require allocations
658 * or sleeping, so the gfp mask is used to indicate what is allowed.
659 *
660 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
661 * range already has the desired bits set. The start of the existing
662 * range is returned in failed_start in this case.
663 *
664 * [start, end] is inclusive
665 * This takes the tree lock.
666 */
667static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
668 int bits, int exclusive, u64 *failed_start,
669 gfp_t mask)
670{
671 struct extent_state *state;
672 struct extent_state *prealloc = NULL;
673 struct rb_node *node;
674 int err = 0;
675 int set;
676 u64 last_start;
677 u64 last_end;
678again:
679 if (!prealloc && (mask & __GFP_WAIT)) {
680 prealloc = alloc_extent_state(mask);
681 if (!prealloc)
682 return -ENOMEM;
683 }
684
685 spin_lock(&tree->lock);
686 /*
687 * this search will find all the extents that end after
688 * our range starts.
689 */
690 node = tree_search(tree, start);
691 if (!node) {
692 err = insert_state(tree, prealloc, start, end, bits);
693 prealloc = NULL;
694 BUG_ON(err == -EEXIST);
695 goto out;
696 }
697
698 state = rb_entry(node, struct extent_state, rb_node);
699 last_start = state->start;
700 last_end = state->end;
701
702 /*
703 * | ---- desired range ---- |
704 * | state |
705 *
706 * Just lock what we found and keep going
707 */
708 if (state->start == start && state->end <= end) {
709 set = state->state & bits;
710 if (set && exclusive) {
711 *failed_start = state->start;
712 err = -EEXIST;
713 goto out;
714 }
715 set_state_bits(tree, state, bits);
716 start = state->end + 1;
717 merge_state(tree, state);
718 goto search_again;
719 }
720
721 /*
722 * | ---- desired range ---- |
723 * | state |
724 * or
725 * | ------------- state -------------- |
726 *
727 * We need to split the extent we found, and may flip bits on
728 * second half.
729 *
730 * If the extent we found extends past our
731 * range, we just split and search again. It'll get split
732 * again the next time though.
733 *
734 * If the extent we found is inside our range, we set the
735 * desired bit on it.
736 */
737 if (state->start < start) {
738 set = state->state & bits;
739 if (exclusive && set) {
740 *failed_start = start;
741 err = -EEXIST;
742 goto out;
743 }
744 err = split_state(tree, state, prealloc, start);
745 BUG_ON(err == -EEXIST);
746 prealloc = NULL;
747 if (err)
748 goto out;
749 if (state->end <= end) {
750 set_state_bits(tree, state, bits);
751 start = state->end + 1;
752 merge_state(tree, state);
753 } else {
754 start = state->start;
755 }
756 goto search_again;
757 }
758 /*
759 * | ---- desired range ---- |
760 * | state | or | state |
761 *
762 * There's a hole, we need to insert something in it and
763 * ignore the extent we found.
764 */
765 if (state->start > start) {
766 u64 this_end;
767 if (end < last_start)
768 this_end = end;
769 else
770 this_end = last_start - 1;
771 err = insert_state(tree, prealloc, start, this_end,
772 bits);
773 prealloc = NULL;
774 BUG_ON(err == -EEXIST);
775 if (err)
776 goto out;
777 start = this_end + 1;
778 goto search_again;
779 }
780 /*
781 * | ---- desired range ---- |
782 * | state |
783 * We need to split the extent, and set the bit
784 * on the first half
785 */
786 if (state->start <= end && state->end > end) {
787 set = state->state & bits;
788 if (exclusive && set) {
789 *failed_start = start;
790 err = -EEXIST;
791 goto out;
792 }
793 err = split_state(tree, state, prealloc, end + 1);
794 BUG_ON(err == -EEXIST);
795
796 set_state_bits(tree, prealloc, bits);
797 merge_state(tree, prealloc);
798 prealloc = NULL;
799 goto out;
800 }
801
802 goto search_again;
803
804out:
805 spin_unlock(&tree->lock);
806 if (prealloc)
807 free_extent_state(prealloc);
808
809 return err;
810
811search_again:
812 if (start > end)
813 goto out;
814 spin_unlock(&tree->lock);
815 if (mask & __GFP_WAIT)
816 cond_resched();
817 goto again;
818}
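
/*
 * Sketch of the exclusive case (hypothetical values): with exclusive == 1,
 * a call such as
 *
 *	err = set_extent_bit(tree, 0, 4095, EXTENT_LOCKED, 1,
 *			     &failed_start, GFP_NOFS);
 *
 * fails with -EEXIST if any byte in [0, 4095] already has EXTENT_LOCKED
 * set, and failed_start reports where the already-set range begins so
 * the caller can wait on it and retry.
 */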
819
820/* wrappers around set/clear extent bit */
821int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
822 gfp_t mask)
823{
824 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
825 mask);
826}
827
828int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
829 gfp_t mask)
830{
831 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
832}
833
834int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
835 int bits, gfp_t mask)
836{
837 return set_extent_bit(tree, start, end, bits, 0, NULL,
838 mask);
839}
840
841int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
842 int bits, gfp_t mask)
843{
844 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
845}
846
847int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
848 gfp_t mask)
849{
850 return set_extent_bit(tree, start, end,
851 EXTENT_DELALLOC | EXTENT_DIRTY,
852 0, NULL, mask);
853}
854
855int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
856 gfp_t mask)
857{
858 return clear_extent_bit(tree, start, end,
859 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
860}
861
862int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
863 gfp_t mask)
864{
865 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
866}
867
868int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
869 gfp_t mask)
870{
871 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
872 mask);
873}
874
875static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
876 gfp_t mask)
877{
878 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
879}
880
881int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882 gfp_t mask)
883{
884 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885 mask);
886}
887
888static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
889 u64 end, gfp_t mask)
890{
891 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
892}
893
894static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
895 gfp_t mask)
896{
897 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
898 0, NULL, mask);
899}
900
901static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
902 u64 end, gfp_t mask)
903{
904 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
905}
906
907int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
908{
909 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
910}
911
912/*
913 * either insert or lock the state struct between start and end; use mask
914 * to tell us if waiting is desired.
915 */
916int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
917{
918 int err;
919 u64 failed_start;
920 while (1) {
921 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
922 &failed_start, mask);
923 if (err == -EEXIST && (mask & __GFP_WAIT)) {
924 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
925 start = failed_start;
926 } else {
927 break;
928 }
929 WARN_ON(start > end);
930 }
931 return err;
932}
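
/*
 * Typical pairing (illustrative): lock_extent() and unlock_extent()
 * bracket IO against a byte range:
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... do the IO against [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 *
 * With a blocking mask, lock_extent() loops on -EEXIST, waiting for
 * EXTENT_LOCKED to clear on the contended part before retrying.
 */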
933
934int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
935 gfp_t mask)
936{
937 int err;
938 u64 failed_start;
939
940 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
941 &failed_start, mask);
942 if (err == -EEXIST) {
943 if (failed_start > start)
944 clear_extent_bit(tree, start, failed_start - 1,
945 EXTENT_LOCKED, 1, 0, mask);
946 return 0;
947 }
948 return 1;
949}
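
/*
 * Note the inverted return convention above: try_lock_extent() returns 1
 * when the whole range was locked and 0 when some part was already
 * locked, undoing any partial lock it managed to take before returning.
 */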
950
951int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
952 gfp_t mask)
953{
954 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
955}
956
957/*
958 * helper function to set pages and extents in the tree dirty
959 */
960int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
961{
962 unsigned long index = start >> PAGE_CACHE_SHIFT;
963 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
964 struct page *page;
965
966 while (index <= end_index) {
967 page = find_get_page(tree->mapping, index);
968 BUG_ON(!page);
969 __set_page_dirty_nobuffers(page);
970 page_cache_release(page);
971 index++;
972 }
973 set_extent_dirty(tree, start, end, GFP_NOFS);
974 return 0;
975}
976
977/*
978 * helper function to set both pages and extents in the tree writeback
979 */
980static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
981{
982 unsigned long index = start >> PAGE_CACHE_SHIFT;
983 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
984 struct page *page;
985
986 while (index <= end_index) {
987 page = find_get_page(tree->mapping, index);
988 BUG_ON(!page);
989 set_page_writeback(page);
990 page_cache_release(page);
991 index++;
992 }
993 set_extent_writeback(tree, start, end, GFP_NOFS);
994 return 0;
995}
996
997/*
998 * find the first offset in the io tree with 'bits' set. zero is
999 * returned if we find something, and *start_ret and *end_ret are
1000 * set to reflect the state struct that was found.
1001 *
1002 * If nothing was found, 1 is returned; < 0 is returned on error.
1003 */
1004int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1005 u64 *start_ret, u64 *end_ret, int bits)
1006{
1007 struct rb_node *node;
1008 struct extent_state *state;
1009 int ret = 1;
1010
1011 spin_lock(&tree->lock);
1012 /*
1013 * this search will find all the extents that end after
1014 * our range starts.
1015 */
1016 node = tree_search(tree, start);
1017 if (!node)
1018 goto out;
1019
1020 while (1) {
1021 state = rb_entry(node, struct extent_state, rb_node);
1022 if (state->end >= start && (state->state & bits)) {
1023 *start_ret = state->start;
1024 *end_ret = state->end;
1025 ret = 0;
1026 break;
1027 }
1028 node = rb_next(node);
1029 if (!node)
1030 break;
1031 }
1032out:
1033 spin_unlock(&tree->lock);
1034 return ret;
1035}
1036
1037/* find the first state struct with 'bits' set after 'start', and
1038 * return it. tree->lock must be held. NULL will be returned if
1039 * nothing was found after 'start'.
1040 */
1041struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1042 u64 start, int bits)
1043{
1044 struct rb_node *node;
1045 struct extent_state *state;
1046
1047 /*
1048 * this search will find all the extents that end after
1049 * our range starts.
1050 */
1051 node = tree_search(tree, start);
1052 if (!node)
1053 goto out;
1054
1055 while (1) {
1056 state = rb_entry(node, struct extent_state, rb_node);
1057 if (state->end >= start && (state->state & bits))
1058 return state;
1059
1060 node = rb_next(node);
1061 if (!node)
1062 break;
1063 }
1064out:
1065 return NULL;
1066}
1067
1068/*
1069 * find a contiguous range of bytes in the file marked as delalloc, not
1070 * more than 'max_bytes'. 'start' and 'end' are used to return the range.
1071 *
1072 * a nonzero count is returned if we find something, 0 if nothing was in the tree
1073 */
1074static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1075 u64 *start, u64 *end, u64 max_bytes)
1076{
1077 struct rb_node *node;
1078 struct extent_state *state;
1079 u64 cur_start = *start;
1080 u64 found = 0;
1081 u64 total_bytes = 0;
1082
1083 spin_lock(&tree->lock);
1084
1085 /*
1086 * this search will find all the extents that end after
1087 * our range starts.
1088 */
1089 node = tree_search(tree, cur_start);
1090 if (!node) {
1091 if (!found)
1092 *end = (u64)-1;
1093 goto out;
1094 }
1095
1096 while (1) {
1097 state = rb_entry(node, struct extent_state, rb_node);
1098 if (found && (state->start != cur_start ||
1099 (state->state & EXTENT_BOUNDARY))) {
1100 goto out;
1101 }
1102 if (!(state->state & EXTENT_DELALLOC)) {
1103 if (!found)
1104 *end = state->end;
1105 goto out;
1106 }
1107 if (!found)
1108 *start = state->start;
1109 found++;
1110 *end = state->end;
1111 cur_start = state->end + 1;
1112 node = rb_next(node);
1113 if (!node)
1114 break;
1115 total_bytes += state->end - state->start + 1;
1116 if (total_bytes >= max_bytes)
1117 break;
1118 }
1119out:
1120 spin_unlock(&tree->lock);
1121 return found;
1122}
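
/*
 * Example of the search above (hypothetical layout): with delalloc state
 * records at [0, 4095] and [4096, 12287] and *start == 0, this returns a
 * nonzero count with *start = 0 and *end = 12287. The walk stops early
 * once the accumulated bytes reach max_bytes, or when it hits a gap or
 * an EXTENT_BOUNDARY record.
 */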
1123
1124static noinline int __unlock_for_delalloc(struct inode *inode,
1125 struct page *locked_page,
1126 u64 start, u64 end)
1127{
1128 int ret;
1129 struct page *pages[16];
1130 unsigned long index = start >> PAGE_CACHE_SHIFT;
1131 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1132 unsigned long nr_pages = end_index - index + 1;
1133 int i;
1134
1135 if (index == locked_page->index && end_index == index)
1136 return 0;
1137
1138 while (nr_pages > 0) {
1139 ret = find_get_pages_contig(inode->i_mapping, index,
1140 min_t(unsigned long, nr_pages,
1141 ARRAY_SIZE(pages)), pages);
1142 for (i = 0; i < ret; i++) {
1143 if (pages[i] != locked_page)
1144 unlock_page(pages[i]);
1145 page_cache_release(pages[i]);
1146 }
1147 nr_pages -= ret;
1148 index += ret;
1149 cond_resched();
1150 }
1151 return 0;
1152}
1153
1154static noinline int lock_delalloc_pages(struct inode *inode,
1155 struct page *locked_page,
1156 u64 delalloc_start,
1157 u64 delalloc_end)
1158{
1159 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1160 unsigned long start_index = index;
1161 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1162 unsigned long pages_locked = 0;
1163 struct page *pages[16];
1164 unsigned long nrpages;
1165 int ret;
1166 int i;
1167
1168 /* the caller is responsible for locking the start index */
1169 if (index == locked_page->index && index == end_index)
1170 return 0;
1171
1172 /* skip the page at the start index */
1173 nrpages = end_index - index + 1;
1174 while (nrpages > 0) {
1175 ret = find_get_pages_contig(inode->i_mapping, index,
1176 min_t(unsigned long,
1177 nrpages, ARRAY_SIZE(pages)), pages);
1178 if (ret == 0) {
1179 ret = -EAGAIN;
1180 goto done;
1181 }
1182 /* now we have an array of pages, lock them all */
1183 for (i = 0; i < ret; i++) {
1184 /*
1185 * the caller is taking responsibility for
1186 * locked_page
1187 */
1188 if (pages[i] != locked_page) {
1189 lock_page(pages[i]);
1190 if (!PageDirty(pages[i]) ||
1191 pages[i]->mapping != inode->i_mapping) {
1192 ret = -EAGAIN;
1193 unlock_page(pages[i]);
1194 page_cache_release(pages[i]);
1195 goto done;
1196 }
1197 }
1198 page_cache_release(pages[i]);
1199 pages_locked++;
1200 }
1201 nrpages -= ret;
1202 index += ret;
1203 cond_resched();
1204 }
1205 ret = 0;
1206done:
1207 if (ret && pages_locked) {
1208 __unlock_for_delalloc(inode, locked_page,
1209 delalloc_start,
1210 ((u64)(start_index + pages_locked - 1)) <<
1211 PAGE_CACHE_SHIFT);
1212 }
1213 return ret;
1214}
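
/*
 * Illustrative failure path: if any page in the range has gone away or
 * been cleaned (mapping changed, or no longer dirty) by the time we get
 * to it, lock_delalloc_pages() unlocks everything it had locked and
 * returns -EAGAIN so the caller can shrink the range and retry.
 */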
1215
1216/*
1217 * find a contiguous range of bytes in the file marked as delalloc, not
1218 * more than 'max_bytes'. 'start' and 'end' are used to return the range.
1219 *
1220 * a nonzero count is returned if we find something, 0 if nothing was in the tree
1221 */
1222static noinline u64 find_lock_delalloc_range(struct inode *inode,
1223 struct extent_io_tree *tree,
1224 struct page *locked_page,
1225 u64 *start, u64 *end,
1226 u64 max_bytes)
1227{
1228 u64 delalloc_start;
1229 u64 delalloc_end;
1230 u64 found;
1231 int ret;
1232 int loops = 0;
1233
1234again:
1235 /* step one, find a bunch of delalloc bytes starting at start */
1236 delalloc_start = *start;
1237 delalloc_end = 0;
1238 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1239 max_bytes);
1240 if (!found || delalloc_end <= *start) {
1241 *start = delalloc_start;
1242 *end = delalloc_end;
1243 return found;
1244 }
1245
1246 /*
1247 * start comes from the offset of locked_page. We have to lock
1248 * pages in order, so we can't process delalloc bytes before
1249 * locked_page
1250 */
1251 if (delalloc_start < *start)
1252 delalloc_start = *start;
1253
1254 /*
1255 * make sure to limit the number of pages we try to lock down
1256 * if we're looping.
1257 */
1258 if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1259 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1260
1261 /* step two, lock all the pages after the page that has start */
1262 ret = lock_delalloc_pages(inode, locked_page,
1263 delalloc_start, delalloc_end);
1264 if (ret == -EAGAIN) {
1265 /* some of the pages are gone, let's avoid looping by
1266 * shortening the size of the delalloc range we're searching
1267 */
1268 if (!loops) {
1269 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1270 max_bytes = PAGE_CACHE_SIZE - offset;
1271 loops = 1;
1272 goto again;
1273 } else {
1274 found = 0;
1275 goto out_failed;
1276 }
1277 }
1278 BUG_ON(ret);
1279
1280 /* step three, lock the state bits for the whole range */
1281 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1282
1283 /* then test to make sure it is all still delalloc */
1284 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1285 EXTENT_DELALLOC, 1);
1286 if (!ret) {
1287 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1288 __unlock_for_delalloc(inode, locked_page,
1289 delalloc_start, delalloc_end);
1290 cond_resched();
1291 goto again;
1292 }
1293 *start = delalloc_start;
1294 *end = delalloc_end;
1295out_failed:
1296 return found;
1297}
1298
1299int extent_clear_unlock_delalloc(struct inode *inode,
1300 struct extent_io_tree *tree,
1301 u64 start, u64 end, struct page *locked_page,
1302 int unlock_pages,
1303 int clear_unlock,
1304 int clear_delalloc, int clear_dirty,
1305 int set_writeback,
1306 int end_writeback)
1307{
1308 int ret;
1309 struct page *pages[16];
1310 unsigned long index = start >> PAGE_CACHE_SHIFT;
1311 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1312 unsigned long nr_pages = end_index - index + 1;
1313 int i;
1314 int clear_bits = 0;
1315
1316 if (clear_unlock)
1317 clear_bits |= EXTENT_LOCKED;
1318 if (clear_dirty)
1319 clear_bits |= EXTENT_DIRTY;
1320
1321 if (clear_delalloc)
1322 clear_bits |= EXTENT_DELALLOC;
1323
1324 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1325 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1326 return 0;
1327
1328 while (nr_pages > 0) {
1329 ret = find_get_pages_contig(inode->i_mapping, index,
1330 min_t(unsigned long,
1331 nr_pages, ARRAY_SIZE(pages)), pages);
1332 for (i = 0; i < ret; i++) {
1333 if (pages[i] == locked_page) {
1334 page_cache_release(pages[i]);
1335 continue;
1336 }
1337 if (clear_dirty)
1338 clear_page_dirty_for_io(pages[i]);
1339 if (set_writeback)
1340 set_page_writeback(pages[i]);
1341 if (end_writeback)
1342 end_page_writeback(pages[i]);
1343 if (unlock_pages)
1344 unlock_page(pages[i]);
1345 page_cache_release(pages[i]);
1346 }
1347 nr_pages -= ret;
1348 index += ret;
1349 cond_resched();
1350 }
1351 return 0;
1352}
1353
1354/*
1355 * count the number of bytes in the tree that have the given bit(s)
1356 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1357 * cached. The total number of bytes found is returned.
1358 */
1359u64 count_range_bits(struct extent_io_tree *tree,
1360 u64 *start, u64 search_end, u64 max_bytes,
1361 unsigned long bits)
1362{
1363 struct rb_node *node;
1364 struct extent_state *state;
1365 u64 cur_start = *start;
1366 u64 total_bytes = 0;
1367 int found = 0;
1368
1369 if (search_end <= cur_start) {
1370 WARN_ON(1);
1371 return 0;
1372 }
1373
1374 spin_lock(&tree->lock);
1375 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1376 total_bytes = tree->dirty_bytes;
1377 goto out;
1378 }
1379 /*
1380 * this search will find all the extents that end after
1381 * our range starts.
1382 */
1383 node = tree_search(tree, cur_start);
1384 if (!node)
1385 goto out;
1386
1387 while (1) {
1388 state = rb_entry(node, struct extent_state, rb_node);
1389 if (state->start > search_end)
1390 break;
1391 if (state->end >= cur_start && (state->state & bits)) {
1392 total_bytes += min(search_end, state->end) + 1 -
1393 max(cur_start, state->start);
1394 if (total_bytes >= max_bytes)
1395 break;
1396 if (!found) {
1397 *start = state->start;
1398 found = 1;
1399 }
1400 }
1401 node = rb_next(node);
1402 if (!node)
1403 break;
1404 }
1405out:
1406 spin_unlock(&tree->lock);
1407 return total_bytes;
1408}
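
/*
 * Fast path illustration (hypothetical call): counting dirty bytes over
 * the whole file,
 *
 *	u64 start = 0;
 *	u64 n = count_range_bits(tree, &start, (u64)-1, (u64)-1,
 *				 EXTENT_DIRTY);
 *
 * short-circuits to the cached tree->dirty_bytes instead of walking the
 * tree.
 */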
1409
1410#if 0
1411/*
1412 * helper function to lock both pages and extents in the tree.
1413 * pages must be locked first.
1414 */
1415static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1416{
1417 unsigned long index = start >> PAGE_CACHE_SHIFT;
1418 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1419 struct page *page;
1420 int err;
1421
1422 while (index <= end_index) {
1423 page = grab_cache_page(tree->mapping, index);
1424 if (!page) {
1425 err = -ENOMEM;
1426 goto failed;
1427 }
1428 if (IS_ERR(page)) {
1429 err = PTR_ERR(page);
1430 goto failed;
1431 }
1432 index++;
1433 }
1434 lock_extent(tree, start, end, GFP_NOFS);
1435 return 0;
1436
1437failed:
1438 /*
1439 * we failed above in getting the page at 'index', so we undo here
1440 * up to but not including the page at 'index'
1441 */
1442 end_index = index;
1443 index = start >> PAGE_CACHE_SHIFT;
1444 while (index < end_index) {
1445 page = find_get_page(tree->mapping, index);
1446 unlock_page(page);
1447 page_cache_release(page);
1448 index++;
1449 }
1450 return err;
1451}
1452
1453/*
1454 * helper function to unlock both pages and extents in the tree.
1455 */
1456static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1457{
1458 unsigned long index = start >> PAGE_CACHE_SHIFT;
1459 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1460 struct page *page;
1461
1462 while (index <= end_index) {
1463 page = find_get_page(tree->mapping, index);
1464 unlock_page(page);
1465 page_cache_release(page);
1466 index++;
1467 }
1468 unlock_extent(tree, start, end, GFP_NOFS);
1469 return 0;
1470}
1471#endif
1472
1473/*
1474 * set the private field for a given byte offset in the tree. If there isn't
1475 * an extent_state starting at that exact offset, -ENOENT is returned.
1476 */
1477int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1478{
1479 struct rb_node *node;
1480 struct extent_state *state;
1481 int ret = 0;
1482
1483 spin_lock(&tree->lock);
1484 /*
1485 * this search will find all the extents that end after
1486 * our range starts.
1487 */
1488 node = tree_search(tree, start);
1489 if (!node) {
1490 ret = -ENOENT;
1491 goto out;
1492 }
1493 state = rb_entry(node, struct extent_state, rb_node);
1494 if (state->start != start) {
1495 ret = -ENOENT;
1496 goto out;
1497 }
1498 state->private = private;
1499out:
1500 spin_unlock(&tree->lock);
1501 return ret;
1502}
1503
1504int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1505{
1506 struct rb_node *node;
1507 struct extent_state *state;
1508 int ret = 0;
1509
1510 spin_lock(&tree->lock);
1511 /*
1512 * this search will find all the extents that end after
1513 * our range starts.
1514 */
1515 node = tree_search(tree, start);
1516 if (!node) {
1517 ret = -ENOENT;
1518 goto out;
1519 }
1520 state = rb_entry(node, struct extent_state, rb_node);
1521 if (state->start != start) {
1522 ret = -ENOENT;
1523 goto out;
1524 }
1525 *private = state->private;
1526out:
1527 spin_unlock(&tree->lock);
1528 return ret;
1529}
1530
1531/*
1532 * searches a range in the state tree for a given mask.
1533 * If 'filled' == 1, this returns 1 only if every extent in the range
1534 * has the bits set. Otherwise, 1 is returned if any bit in the
1535 * range is found set.
1536 */
1537int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1538 int bits, int filled)
1539{
1540 struct extent_state *state = NULL;
1541 struct rb_node *node;
1542 int bitset = 0;
1543
1544 spin_lock(&tree->lock);
1545 node = tree_search(tree, start);
1546 while (node && start <= end) {
1547 state = rb_entry(node, struct extent_state, rb_node);
1548
1549 if (filled && state->start > start) {
1550 bitset = 0;
1551 break;
1552 }
1553
1554 if (state->start > end)
1555 break;
1556
1557 if (state->state & bits) {
1558 bitset = 1;
1559 if (!filled)
1560 break;
1561 } else if (filled) {
1562 bitset = 0;
1563 break;
1564 }
1565 start = state->end + 1;
1566 if (start > end)
1567 break;
1568 node = rb_next(node);
1569 if (!node) {
1570 if (filled)
1571 bitset = 0;
1572 break;
1573 }
1574 }
1575 spin_unlock(&tree->lock);
1576 return bitset;
1577}
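
/*
 * Example of the 'filled' semantics (hypothetical states): with
 * EXTENT_DIRTY set on [0, 4095] but not on [4096, 8191], testing
 * [0, 8191] returns 1 with filled == 0 (some byte is dirty) but 0 with
 * filled == 1 (a gap or clean state breaks full coverage).
 */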
1578
1579/*
1580 * helper function to set a given page up to date if all the
1581 * extents in the tree for that page are up to date
1582 */
1583static int check_page_uptodate(struct extent_io_tree *tree,
1584 struct page *page)
1585{
1586 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1587 u64 end = start + PAGE_CACHE_SIZE - 1;
1588 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1589 SetPageUptodate(page);
1590 return 0;
1591}
1592
1593/*
1594 * helper function to unlock a page if all the extents in the tree
1595 * for that page are unlocked
1596 */
1597static int check_page_locked(struct extent_io_tree *tree,
1598 struct page *page)
1599{
1600 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1601 u64 end = start + PAGE_CACHE_SIZE - 1;
1602 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1603 unlock_page(page);
1604 return 0;
1605}
1606
1607/*
1608 * helper function to end page writeback if all the extents
1609 * in the tree for that page are done with writeback
1610 */
1611static int check_page_writeback(struct extent_io_tree *tree,
1612 struct page *page)
1613{
1614 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1615 u64 end = start + PAGE_CACHE_SIZE - 1;
1616 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1617 end_page_writeback(page);
1618 return 0;
1619}
1620
1621/* lots and lots of room for performance fixes in the end_bio funcs */
1622
1623/*
1624 * after a writepage IO is done, we need to:
1625 * clear the uptodate bits on error
1626 * clear the writeback bits in the extent tree for this IO
1627 * end_page_writeback if the page has no more pending IO
1628 *
1629 * Scheduling is not allowed, so the extent state tree is expected
1630 * to have one and only one object corresponding to this IO.
1631 */
1632static void end_bio_extent_writepage(struct bio *bio, int err)
1633{
1634 int uptodate = err == 0;
1635 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1636 struct extent_io_tree *tree;
1637 u64 start;
1638 u64 end;
1639 int whole_page;
1640 int ret;
1641
1642 do {
1643 struct page *page = bvec->bv_page;
1644 tree = &BTRFS_I(page->mapping->host)->io_tree;
1645
1646 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1647 bvec->bv_offset;
1648 end = start + bvec->bv_len - 1;
1649
1650 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1651 whole_page = 1;
1652 else
1653 whole_page = 0;
1654
1655 if (--bvec >= bio->bi_io_vec)
1656 prefetchw(&bvec->bv_page->flags);
1657 if (tree->ops && tree->ops->writepage_end_io_hook) {
1658 ret = tree->ops->writepage_end_io_hook(page, start,
1659 end, NULL, uptodate);
1660 if (ret)
1661 uptodate = 0;
1662 }
1663
1664 if (!uptodate && tree->ops &&
1665 tree->ops->writepage_io_failed_hook) {
1666 ret = tree->ops->writepage_io_failed_hook(bio, page,
1667 start, end, NULL);
1668 if (ret == 0) {
1669 uptodate = (err == 0);
1670 continue;
1671 }
1672 }
1673
1674 if (!uptodate) {
1675 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1676 ClearPageUptodate(page);
1677 SetPageError(page);
1678 }
1679
1680 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1681
1682 if (whole_page)
1683 end_page_writeback(page);
1684 else
1685 check_page_writeback(tree, page);
1686 } while (bvec >= bio->bi_io_vec);
1687
1688 bio_put(bio);
1689}
1690
1691/*
1692 * after a readpage IO is done, we need to:
1693 * clear the uptodate bits on error
1694 * set the uptodate bits if things worked
1695 * set the page up to date if all extents in the tree are uptodate
1696 * clear the lock bit in the extent tree
1697 * unlock the page if there are no other extents locked for it
1698 *
1699 * Scheduling is not allowed, so the extent state tree is expected
1700 * to have one and only one object corresponding to this IO.
1701 */
1702static void end_bio_extent_readpage(struct bio *bio, int err)
1703{
1704 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1705 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1706 struct extent_io_tree *tree;
1707 u64 start;
1708 u64 end;
1709 int whole_page;
1710 int ret;
1711
1712 if (err)
1713 uptodate = 0;
1714
1715 do {
1716 struct page *page = bvec->bv_page;
1717 tree = &BTRFS_I(page->mapping->host)->io_tree;
1718
1719 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1720 bvec->bv_offset;
1721 end = start + bvec->bv_len - 1;
1722
1723 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1724 whole_page = 1;
1725 else
1726 whole_page = 0;
1727
1728 if (--bvec >= bio->bi_io_vec)
1729 prefetchw(&bvec->bv_page->flags);
1730
1731 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1732 ret = tree->ops->readpage_end_io_hook(page, start, end,
1733 NULL);
1734 if (ret)
1735 uptodate = 0;
1736 }
1737 if (!uptodate && tree->ops &&
1738 tree->ops->readpage_io_failed_hook) {
1739 ret = tree->ops->readpage_io_failed_hook(bio, page,
1740 start, end, NULL);
1741 if (ret == 0) {
1742 uptodate =
1743 test_bit(BIO_UPTODATE, &bio->bi_flags);
1744 if (err)
1745 uptodate = 0;
1746 continue;
1747 }
1748 }
1749
1750 if (uptodate) {
1751 set_extent_uptodate(tree, start, end,
1752 GFP_ATOMIC);
1753 }
1754 unlock_extent(tree, start, end, GFP_ATOMIC);
1755
1756 if (whole_page) {
1757 if (uptodate) {
1758 SetPageUptodate(page);
1759 } else {
1760 ClearPageUptodate(page);
1761 SetPageError(page);
1762 }
1763 unlock_page(page);
1764 } else {
1765 if (uptodate) {
1766 check_page_uptodate(tree, page);
1767 } else {
1768 ClearPageUptodate(page);
1769 SetPageError(page);
1770 }
1771 check_page_locked(tree, page);
1772 }
1773 } while (bvec >= bio->bi_io_vec);
1774
1775 bio_put(bio);
1776}
1777
1778/*
1779 * IO done from prepare_write is pretty simple: we just unlock
1780 * the structs in the extent tree when done, and set the uptodate bits
1781 * as appropriate.
1782 */
1783static void end_bio_extent_preparewrite(struct bio *bio, int err)
1784{
1785 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1786 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1787 struct extent_io_tree *tree;
1788 u64 start;
1789 u64 end;
1790
1791 do {
1792 struct page *page = bvec->bv_page;
1793 tree = &BTRFS_I(page->mapping->host)->io_tree;
1794
1795 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1796 bvec->bv_offset;
1797 end = start + bvec->bv_len - 1;
1798
1799 if (--bvec >= bio->bi_io_vec)
1800 prefetchw(&bvec->bv_page->flags);
1801
1802 if (uptodate) {
1803 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1804 } else {
1805 ClearPageUptodate(page);
1806 SetPageError(page);
1807 }
1808
1809 unlock_extent(tree, start, end, GFP_ATOMIC);
1810
1811 } while (bvec >= bio->bi_io_vec);
1812
1813 bio_put(bio);
1814}
1815
1816static struct bio *
1817extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1818 gfp_t gfp_flags)
1819{
1820 struct bio *bio;
1821
1822 bio = bio_alloc(gfp_flags, nr_vecs);
1823
1824 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1825 while (!bio && (nr_vecs /= 2))
1826 bio = bio_alloc(gfp_flags, nr_vecs);
1827 }
1828
1829 if (bio) {
1830 bio->bi_size = 0;
1831 bio->bi_bdev = bdev;
1832 bio->bi_sector = first_sector;
1833 }
1834 return bio;
1835}
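
/*
 * The halving loop above is a best-effort fallback (behavior sketch):
 * when the allocation fails for a task in page reclaim (PF_MEMALLOC),
 * the request shrinks from nr_vecs to nr_vecs/2, /4, ... until a bio is
 * found or the vector count reaches zero, so writeback can still make
 * forward progress with smaller IOs.
 */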
1836
1837static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1838 unsigned long bio_flags)
1839{
1840 int ret = 0;
1841 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1842 struct page *page = bvec->bv_page;
1843 struct extent_io_tree *tree = bio->bi_private;
1844 u64 start;
1845 u64 end;
1846
1847 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1848 end = start + bvec->bv_len - 1;
1849
1850 bio->bi_private = NULL;
1851
1852 bio_get(bio);
1853
1854 if (tree->ops && tree->ops->submit_bio_hook)
1855 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856 mirror_num, bio_flags);
1857 else
1858 submit_bio(rw, bio);
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP;
1861 bio_put(bio);
1862 return ret;
1863}
1864
1865static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866 struct page *page, sector_t sector,
1867 size_t size, unsigned long offset,
1868 struct block_device *bdev,
1869 struct bio **bio_ret,
1870 unsigned long max_pages,
1871 bio_end_io_t end_io_func,
1872 int mirror_num,
1873 unsigned long prev_bio_flags,
1874 unsigned long bio_flags)
1875{
1876 int ret = 0;
1877 struct bio *bio;
1878 int nr;
1879 int contig = 0;
1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883
1884 if (bio_ret && *bio_ret) {
1885 bio = *bio_ret;
1886 if (old_compressed)
1887 contig = bio->bi_sector == sector;
1888 else
1889 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890 sector;
1891
1892 if (prev_bio_flags != bio_flags || !contig ||
1893 (tree->ops && tree->ops->merge_bio_hook &&
1894 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895 bio_flags)) ||
1896 bio_add_page(bio, page, page_size, offset) < page_size) {
1897 ret = submit_one_bio(rw, bio, mirror_num,
1898 prev_bio_flags);
1899 bio = NULL;
1900 } else {
1901 return 0;
1902 }
1903 }
1904 if (this_compressed)
1905 nr = BIO_MAX_PAGES;
1906 else
1907 nr = bio_get_nr_vecs(bdev);
1908
1909 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910
1911 bio_add_page(bio, page, page_size, offset);
1912 bio->bi_end_io = end_io_func;
1913 bio->bi_private = tree;
1914
1915 if (bio_ret)
1916 *bio_ret = bio;
1917 else
1918 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1919
1920 return ret;
1921}
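
/*
 * Merging sketch: consecutive calls that pass the same *bio_ret keep
 * appending pages to one bio as long as the sector is contiguous, the
 * bio flags match, the merge hook (if any) allows it and bio_add_page()
 * accepts the page; otherwise the accumulated bio is submitted and a
 * fresh one is started for the new page.
 */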
1922
1923void set_page_extent_mapped(struct page *page)
1924{
1925 if (!PagePrivate(page)) {
1926 SetPagePrivate(page);
1927 page_cache_get(page);
1928 set_page_private(page, EXTENT_PAGE_PRIVATE);
1929 }
1930}
1931
1932static void set_page_extent_head(struct page *page, unsigned long len)
1933{
1934 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1935}
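
/*
 * Layout note: page->private packs the EXTENT_PAGE_PRIVATE_FIRST_PAGE
 * marker into the low bits together with the buffer length shifted left
 * by 2, so a single word records both "this is the first page of an
 * extent buffer" and its size.
 */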
1936
1937/*
1938 * basic readpage implementation. Locked extent state structs are inserted
1939 * into the tree; they are removed when the IO is done (by the end_io
1940 * handlers)
1941 */
1942static int __extent_read_full_page(struct extent_io_tree *tree,
1943 struct page *page,
1944 get_extent_t *get_extent,
1945 struct bio **bio, int mirror_num,
1946 unsigned long *bio_flags)
1947{
1948 struct inode *inode = page->mapping->host;
1949 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1950 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1951 u64 end;
1952 u64 cur = start;
1953 u64 extent_offset;
1954 u64 last_byte = i_size_read(inode);
1955 u64 block_start;
1956 u64 cur_end;
1957 sector_t sector;
1958 struct extent_map *em;
1959 struct block_device *bdev;
1960 int ret;
1961 int nr = 0;
1962 size_t page_offset = 0;
1963 size_t iosize;
1964 size_t disk_io_size;
1965 size_t blocksize = inode->i_sb->s_blocksize;
1966 unsigned long this_bio_flag = 0;
1967
1968 set_page_extent_mapped(page);
1969
1970 end = page_end;
1971 lock_extent(tree, start, end, GFP_NOFS);
1972
1973 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1974 char *userpage;
1975 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1976
1977 if (zero_offset) {
1978 iosize = PAGE_CACHE_SIZE - zero_offset;
1979 userpage = kmap_atomic(page, KM_USER0);
1980 memset(userpage + zero_offset, 0, iosize);
1981 flush_dcache_page(page);
1982 kunmap_atomic(userpage, KM_USER0);
1983 }
1984 }
1985 while (cur <= end) {
1986 if (cur >= last_byte) {
1987 char *userpage;
1988 iosize = PAGE_CACHE_SIZE - page_offset;
1989 userpage = kmap_atomic(page, KM_USER0);
1990 memset(userpage + page_offset, 0, iosize);
1991 flush_dcache_page(page);
1992 kunmap_atomic(userpage, KM_USER0);
1993 set_extent_uptodate(tree, cur, cur + iosize - 1,
1994 GFP_NOFS);
1995 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1996 break;
1997 }
1998 em = get_extent(inode, page, page_offset, cur,
1999 end - cur + 1, 0);
2000 if (IS_ERR(em) || !em) {
2001 SetPageError(page);
2002 unlock_extent(tree, cur, end, GFP_NOFS);
2003 break;
2004 }
2005 extent_offset = cur - em->start;
2006 BUG_ON(extent_map_end(em) <= cur);
2007 BUG_ON(end < cur);
2008
2009 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2010 this_bio_flag = EXTENT_BIO_COMPRESSED;
2011
2012 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2013 cur_end = min(extent_map_end(em) - 1, end);
2014 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2015 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2016 disk_io_size = em->block_len;
2017 sector = em->block_start >> 9;
2018 } else {
2019 sector = (em->block_start + extent_offset) >> 9;
2020 disk_io_size = iosize;
2021 }
2022 bdev = em->bdev;
2023 block_start = em->block_start;
2024 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2025 block_start = EXTENT_MAP_HOLE;
2026 free_extent_map(em);
2027 em = NULL;
2028
2029 /* we've found a hole, just zero and go on */
2030 if (block_start == EXTENT_MAP_HOLE) {
2031 char *userpage;
2032 userpage = kmap_atomic(page, KM_USER0);
2033 memset(userpage + page_offset, 0, iosize);
2034 flush_dcache_page(page);
2035 kunmap_atomic(userpage, KM_USER0);
2036
2037 set_extent_uptodate(tree, cur, cur + iosize - 1,
2038 GFP_NOFS);
2039 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2040 cur = cur + iosize;
2041 page_offset += iosize;
2042 continue;
2043 }
2044 /* the get_extent function already copied into the page */
2045 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2046 check_page_uptodate(tree, page);
2047 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2048 cur = cur + iosize;
2049 page_offset += iosize;
2050 continue;
2051 }
2052 /* we have an inline extent but it didn't get marked up
2053 * to date. Error out
2054 */
2055 if (block_start == EXTENT_MAP_INLINE) {
2056 SetPageError(page);
2057 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2058 cur = cur + iosize;
2059 page_offset += iosize;
2060 continue;
2061 }
2062
2063 ret = 0;
2064 if (tree->ops && tree->ops->readpage_io_hook) {
2065 ret = tree->ops->readpage_io_hook(page, cur,
2066 cur + iosize - 1);
2067 }
2068 if (!ret) {
2069 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2070 pnr -= page->index;
2071 ret = submit_extent_page(READ, tree, page,
2072 sector, disk_io_size, page_offset,
2073 bdev, bio, pnr,
2074 end_bio_extent_readpage, mirror_num,
2075 *bio_flags,
2076 this_bio_flag);
2077 nr++;
2078 *bio_flags = this_bio_flag;
2079 }
2080 if (ret)
2081 SetPageError(page);
2082 cur = cur + iosize;
2083 page_offset += iosize;
2084 }
2085 if (!nr) {
2086 if (!PageError(page))
2087 SetPageUptodate(page);
2088 unlock_page(page);
2089 }
2090 return 0;
2091}
2092
2093int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2094 get_extent_t *get_extent)
2095{
2096 struct bio *bio = NULL;
2097 unsigned long bio_flags = 0;
2098 int ret;
2099
2100 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2101 &bio_flags);
2102 if (bio)
2103 submit_one_bio(READ, bio, 0, bio_flags);
2104 return ret;
2105}
2106
2107/*
2108 * the writepage semantics are similar to regular writepage. extent
2109 * records are inserted to lock ranges in the tree, and as dirty areas
2110 * are found, they are marked writeback. Then the lock bits are removed
2111 * and the end_io handler clears the writeback ranges
2112 */
2113static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2114 void *data)
2115{
2116 struct inode *inode = page->mapping->host;
2117 struct extent_page_data *epd = data;
2118 struct extent_io_tree *tree = epd->tree;
2119 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2120 u64 delalloc_start;
2121 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2122 u64 end;
2123 u64 cur = start;
2124 u64 extent_offset;
2125 u64 last_byte = i_size_read(inode);
2126 u64 block_start;
2127 u64 iosize;
2128 u64 unlock_start;
2129 sector_t sector;
2130 struct extent_map *em;
2131 struct block_device *bdev;
2132 int ret;
2133 int nr = 0;
2134 size_t pg_offset = 0;
2135 size_t blocksize;
2136 loff_t i_size = i_size_read(inode);
2137 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2138 u64 nr_delalloc;
2139 u64 delalloc_end;
2140 int page_started;
2141 int compressed;
2142 unsigned long nr_written = 0;
2143
2144 WARN_ON(!PageLocked(page));
2145 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2146 if (page->index > end_index ||
2147 (page->index == end_index && !pg_offset)) {
2148 page->mapping->a_ops->invalidatepage(page, 0);
2149 unlock_page(page);
2150 return 0;
2151 }
2152
2153 if (page->index == end_index) {
2154 char *userpage;
2155
2156 userpage = kmap_atomic(page, KM_USER0);
2157 memset(userpage + pg_offset, 0,
2158 PAGE_CACHE_SIZE - pg_offset);
2159 kunmap_atomic(userpage, KM_USER0);
2160 flush_dcache_page(page);
2161 }
2162 pg_offset = 0;
2163
2164 set_page_extent_mapped(page);
2165
2166 delalloc_start = start;
2167 delalloc_end = 0;
2168 page_started = 0;
2169 if (!epd->extent_locked) {
2170 while (delalloc_end < page_end) {
2171 nr_delalloc = find_lock_delalloc_range(inode, tree,
2172 page,
2173 &delalloc_start,
2174 &delalloc_end,
2175 128 * 1024 * 1024);
2176 if (nr_delalloc == 0) {
2177 delalloc_start = delalloc_end + 1;
2178 continue;
2179 }
2180 tree->ops->fill_delalloc(inode, page, delalloc_start,
2181 delalloc_end, &page_started,
2182 &nr_written);
2183 delalloc_start = delalloc_end + 1;
2184 }
2185
2186 /* did the fill delalloc function already unlock and start
2187 * the IO?
2188 */
2189 if (page_started) {
2190 ret = 0;
2191 goto update_nr_written;
2192 }
2193 }
2194 lock_extent(tree, start, page_end, GFP_NOFS);
2195
2196 unlock_start = start;
2197
2198 if (tree->ops && tree->ops->writepage_start_hook) {
2199 ret = tree->ops->writepage_start_hook(page, start,
2200 page_end);
2201 if (ret == -EAGAIN) {
2202 unlock_extent(tree, start, page_end, GFP_NOFS);
2203 redirty_page_for_writepage(wbc, page);
2204 unlock_page(page);
2205 ret = 0;
2206 goto update_nr_written;
2207 }
2208 }
2209
2210 nr_written++;
2211
2212 end = page_end;
2213 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2214 printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2215
2216 if (last_byte <= start) {
2217 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2218 unlock_extent(tree, start, page_end, GFP_NOFS);
2219 if (tree->ops && tree->ops->writepage_end_io_hook)
2220 tree->ops->writepage_end_io_hook(page, start,
2221 page_end, NULL, 1);
2222 unlock_start = page_end + 1;
2223 goto done;
2224 }
2225
2226 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2227 blocksize = inode->i_sb->s_blocksize;
2228
2229 while (cur <= end) {
2230 if (cur >= last_byte) {
2231 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2232 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2233 if (tree->ops && tree->ops->writepage_end_io_hook)
2234 tree->ops->writepage_end_io_hook(page, cur,
2235 page_end, NULL, 1);
2236 unlock_start = page_end + 1;
2237 break;
2238 }
2239 em = epd->get_extent(inode, page, pg_offset, cur,
2240 end - cur + 1, 1);
2241 if (IS_ERR(em) || !em) {
2242 SetPageError(page);
2243 break;
2244 }
2245
2246 extent_offset = cur - em->start;
2247 BUG_ON(extent_map_end(em) <= cur);
2248 BUG_ON(end < cur);
2249 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2250 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2251 sector = (em->block_start + extent_offset) >> 9;
2252 bdev = em->bdev;
2253 block_start = em->block_start;
2254 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2255 free_extent_map(em);
2256 em = NULL;
2257
2258 /*
2259 * compressed and inline extents are written through other
2260 * paths in the FS
2261 */
2262 if (compressed || block_start == EXTENT_MAP_HOLE ||
2263 block_start == EXTENT_MAP_INLINE) {
2264 clear_extent_dirty(tree, cur,
2265 cur + iosize - 1, GFP_NOFS);
2266
2267 unlock_extent(tree, unlock_start, cur + iosize - 1,
2268 GFP_NOFS);
2269
2270 /*
2271 * end_io notification does not happen here for
2272 * compressed extents
2273 */
2274 if (!compressed && tree->ops &&
2275 tree->ops->writepage_end_io_hook)
2276 tree->ops->writepage_end_io_hook(page, cur,
2277 cur + iosize - 1,
2278 NULL, 1);
2279 else if (compressed) {
2280 /* we don't want to end_page_writeback on
2281 * a compressed extent. this happens
2282 * elsewhere
2283 */
2284 nr++;
2285 }
2286
2287 cur += iosize;
2288 pg_offset += iosize;
2289 unlock_start = cur;
2290 continue;
2291 }
2292 /* leave this out until we have a page_mkwrite call */
2293 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2294 EXTENT_DIRTY, 0)) {
2295 cur = cur + iosize;
2296 pg_offset += iosize;
2297 continue;
2298 }
2299
2300 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2301 if (tree->ops && tree->ops->writepage_io_hook) {
2302 ret = tree->ops->writepage_io_hook(page, cur,
2303 cur + iosize - 1);
2304 } else {
2305 ret = 0;
2306 }
2307 if (ret) {
2308 SetPageError(page);
2309 } else {
2310 unsigned long max_nr = end_index + 1;
2311
2312 set_range_writeback(tree, cur, cur + iosize - 1);
2313 if (!PageWriteback(page)) {
2314 printk(KERN_ERR "btrfs warning page %lu not "
2315 "writeback, cur %llu end %llu\n",
2316 page->index, (unsigned long long)cur,
2317 (unsigned long long)end);
2318 }
2319
2320 ret = submit_extent_page(WRITE, tree, page, sector,
2321 iosize, pg_offset, bdev,
2322 &epd->bio, max_nr,
2323 end_bio_extent_writepage,
2324 0, 0, 0);
2325 if (ret)
2326 SetPageError(page);
2327 }
2328 cur = cur + iosize;
2329 pg_offset += iosize;
2330 nr++;
2331 }
2332done:
2333 if (nr == 0) {
2334 /* make sure the mapping tag for page dirty gets cleared */
2335 set_page_writeback(page);
2336 end_page_writeback(page);
2337 }
2338 if (unlock_start <= page_end)
2339 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2340 unlock_page(page);
2341
2342update_nr_written:
2343 wbc->nr_to_write -= nr_written;
2344 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2345 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2346 page->mapping->writeback_index = page->index + nr_written;
2347 return 0;
2348}
2349
2350/**
2351 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2352 * @mapping: address space structure to write
2353 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2354 * @writepage: function called for each page
2355 * @data: data passed to writepage function
2356 *
2357 * If a page is already under I/O, write_cache_pages() skips it, even
2358 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2359 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2360 * and msync() need to guarantee that all the data which was dirty at the time
2361 * the call was made get new I/O started against them. If wbc->sync_mode is
2362 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2363 * existing IO to complete.
2364 */
2365static int extent_write_cache_pages(struct extent_io_tree *tree,
2366 struct address_space *mapping,
2367 struct writeback_control *wbc,
2368 writepage_t writepage, void *data,
2369 void (*flush_fn)(void *))
2370{
2371 struct backing_dev_info *bdi = mapping->backing_dev_info;
2372 int ret = 0;
2373 int done = 0;
2374 struct pagevec pvec;
2375 int nr_pages;
2376 pgoff_t index;
2377 pgoff_t end; /* Inclusive */
2378 int scanned = 0;
2379 int range_whole = 0;
2380
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */
2389 end = -1;
2390 } else {
2391 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2392 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2393 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2394 range_whole = 1;
2395 scanned = 1;
2396 }
2397retry:
2398 while (!done && (index <= end) &&
2399 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2400 PAGECACHE_TAG_DIRTY, min(end - index,
2401 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2402 unsigned i;
2403
2404 scanned = 1;
2405 for (i = 0; i < nr_pages; i++) {
2406 struct page *page = pvec.pages[i];
2407
2408 /*
2409 * At this point we hold neither mapping->tree_lock nor
2410 * lock on the page itself: the page may be truncated or
2411 * invalidated (changing page->mapping to NULL), or even
2412 * swizzled back from swapper_space to tmpfs file
2413 * mapping
2414 */
2415 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2416 tree->ops->write_cache_pages_lock_hook(page);
2417 else
2418 lock_page(page);
2419
2420 if (unlikely(page->mapping != mapping)) {
2421 unlock_page(page);
2422 continue;
2423 }
2424
2425 if (!wbc->range_cyclic && page->index > end) {
2426 done = 1;
2427 unlock_page(page);
2428 continue;
2429 }
2430
2431 if (wbc->sync_mode != WB_SYNC_NONE) {
2432 if (PageWriteback(page))
2433 flush_fn(data);
2434 wait_on_page_writeback(page);
2435 }
2436
2437 if (PageWriteback(page) ||
2438 !clear_page_dirty_for_io(page)) {
2439 unlock_page(page);
2440 continue;
2441 }
2442
2443 ret = (*writepage)(page, wbc, data);
2444
2445 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2446 unlock_page(page);
2447 ret = 0;
2448 }
2449 if (ret || wbc->nr_to_write <= 0)
2450 done = 1;
2451 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2452 wbc->encountered_congestion = 1;
2453 done = 1;
2454 }
2455 }
2456 pagevec_release(&pvec);
2457 cond_resched();
2458 }
2459 if (!scanned && !done) {
2460 /*
2461 * We hit the last page and there is more work to be done: wrap
2462 * back to the start of the file
2463 */
2464 scanned = 1;
2465 index = 0;
2466 goto retry;
2467 }
2468 return ret;
2469}
2470
2471static noinline void flush_write_bio(void *data)
2472{
2473 struct extent_page_data *epd = data;
2474 if (epd->bio) {
2475 submit_one_bio(WRITE, epd->bio, 0, 0);
2476 epd->bio = NULL;
2477 }
2478}
2479
2480int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2481 get_extent_t *get_extent,
2482 struct writeback_control *wbc)
2483{
2484 int ret;
2485 struct address_space *mapping = page->mapping;
2486 struct extent_page_data epd = {
2487 .bio = NULL,
2488 .tree = tree,
2489 .get_extent = get_extent,
2490 .extent_locked = 0,
2491 };
2492 struct writeback_control wbc_writepages = {
2493 .bdi = wbc->bdi,
2494 .sync_mode = WB_SYNC_NONE,
2495 .older_than_this = NULL,
2496 .nr_to_write = 64,
2497 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2498 .range_end = (loff_t)-1,
2499 };
2500
2501
2502 ret = __extent_writepage(page, wbc, &epd);
2503
2504 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2505 __extent_writepage, &epd, flush_write_bio);
2506 if (epd.bio)
2507 submit_one_bio(WRITE, epd.bio, 0, 0);
2508 return ret;
2509}
2510
2511int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2512 u64 start, u64 end, get_extent_t *get_extent,
2513 int mode)
2514{
2515 int ret = 0;
2516 struct address_space *mapping = inode->i_mapping;
2517 struct page *page;
2518 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2519 PAGE_CACHE_SHIFT;
2520
2521 struct extent_page_data epd = {
2522 .bio = NULL,
2523 .tree = tree,
2524 .get_extent = get_extent,
2525 .extent_locked = 1,
2526 };
2527 struct writeback_control wbc_writepages = {
2528 .bdi = inode->i_mapping->backing_dev_info,
2529 .sync_mode = mode,
2530 .older_than_this = NULL,
2531 .nr_to_write = nr_pages * 2,
2532 .range_start = start,
2533 .range_end = end + 1,
2534 };
2535
2536 while (start <= end) {
2537 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2538 if (clear_page_dirty_for_io(page))
2539 ret = __extent_writepage(page, &wbc_writepages, &epd);
2540 else {
2541 if (tree->ops && tree->ops->writepage_end_io_hook)
2542 tree->ops->writepage_end_io_hook(page, start,
2543 start + PAGE_CACHE_SIZE - 1,
2544 NULL, 1);
2545 unlock_page(page);
2546 }
2547 page_cache_release(page);
2548 start += PAGE_CACHE_SIZE;
2549 }
2550
2551 if (epd.bio)
2552 submit_one_bio(WRITE, epd.bio, 0, 0);
2553 return ret;
2554}
2555
2556int extent_writepages(struct extent_io_tree *tree,
2557 struct address_space *mapping,
2558 get_extent_t *get_extent,
2559 struct writeback_control *wbc)
2560{
2561 int ret = 0;
2562 struct extent_page_data epd = {
2563 .bio = NULL,
2564 .tree = tree,
2565 .get_extent = get_extent,
2566 .extent_locked = 0,
2567 };
2568
2569 ret = extent_write_cache_pages(tree, mapping, wbc,
2570 __extent_writepage, &epd,
2571 flush_write_bio);
2572 if (epd.bio)
2573 submit_one_bio(WRITE, epd.bio, 0, 0);
2574 return ret;
2575}
2576
2577int extent_readpages(struct extent_io_tree *tree,
2578 struct address_space *mapping,
2579 struct list_head *pages, unsigned nr_pages,
2580 get_extent_t get_extent)
2581{
2582 struct bio *bio = NULL;
2583 unsigned page_idx;
2584 struct pagevec pvec;
2585 unsigned long bio_flags = 0;
2586
2587 pagevec_init(&pvec, 0);
2588 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2589 struct page *page = list_entry(pages->prev, struct page, lru);
2590
2591 prefetchw(&page->flags);
2592 list_del(&page->lru);
2593 /*
2594 * what we want to do here is call add_to_page_cache_lru,
2595 * but that isn't exported, so we reproduce it here
2596 */
2597 if (!add_to_page_cache(page, mapping,
2598 page->index, GFP_KERNEL)) {
2599
2600 /* open coding of lru_cache_add, also not exported */
2601 page_cache_get(page);
2602 if (!pagevec_add(&pvec, page))
2603 __pagevec_lru_add_file(&pvec);
2604 __extent_read_full_page(tree, page, get_extent,
2605 &bio, 0, &bio_flags);
2606 }
2607 page_cache_release(page);
2608 }
2609 if (pagevec_count(&pvec))
2610 __pagevec_lru_add_file(&pvec);
2611 BUG_ON(!list_empty(pages));
2612 if (bio)
2613 submit_one_bio(READ, bio, 0, bio_flags);
2614 return 0;
2615}
2616
2617/*
2618 * basic invalidatepage code, this waits on any locked or writeback
2619 * ranges corresponding to the page, and then deletes any extent state
2620 * records from the tree
2621 */
2622int extent_invalidatepage(struct extent_io_tree *tree,
2623 struct page *page, unsigned long offset)
2624{
2625 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2626 u64 end = start + PAGE_CACHE_SIZE - 1;
2627 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2628
2629 start += (offset + blocksize - 1) & ~(blocksize - 1);
2630 if (start > end)
2631 return 0;
2632
2633 lock_extent(tree, start, end, GFP_NOFS);
2634 wait_on_extent_writeback(tree, start, end);
2635 clear_extent_bit(tree, start, end,
2636 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2637 1, 1, GFP_NOFS);
2638 return 0;
2639}
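/*
 * A worked example of the rounding above, assuming 4K pages and a
 * 1K blocksize: invalidating from offset 100 on page index 0 gives
 * start += (100 + 1023) & ~1023 == 1024, so state is cleared for
 * bytes 1024..4095 only and the partially valid first block keeps
 * its extent state records.
 */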
2640
2641/*
2642 * simple commit_write call, set_range_dirty is used to mark both
2643 * the pages and the extent records as dirty
2644 */
2645int extent_commit_write(struct extent_io_tree *tree,
2646 struct inode *inode, struct page *page,
2647 unsigned from, unsigned to)
2648{
2649 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2650
2651 set_page_extent_mapped(page);
2652 set_page_dirty(page);
2653
2654 if (pos > inode->i_size) {
2655 i_size_write(inode, pos);
2656 mark_inode_dirty(inode);
2657 }
2658 return 0;
2659}
2660
2661int extent_prepare_write(struct extent_io_tree *tree,
2662 struct inode *inode, struct page *page,
2663 unsigned from, unsigned to, get_extent_t *get_extent)
2664{
2665 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2666 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2667 u64 block_start;
2668 u64 orig_block_start;
2669 u64 block_end;
2670 u64 cur_end;
2671 struct extent_map *em;
2672 unsigned blocksize = 1 << inode->i_blkbits;
2673 size_t page_offset = 0;
2674 size_t block_off_start;
2675 size_t block_off_end;
2676 int err = 0;
2677 int iocount = 0;
2678 int ret = 0;
2679 int isnew;
2680
2681 set_page_extent_mapped(page);
2682
2683 block_start = (page_start + from) & ~((u64)blocksize - 1);
2684 block_end = (page_start + to - 1) | (blocksize - 1);
2685 orig_block_start = block_start;
2686
2687 lock_extent(tree, page_start, page_end, GFP_NOFS);
2688 while (block_start <= block_end) {
2689 em = get_extent(inode, page, page_offset, block_start,
2690 block_end - block_start + 1, 1);
2691 if (IS_ERR(em) || !em) {
		err = IS_ERR(em) ? PTR_ERR(em) : -EIO;
2692 goto err;
	}
2693
2694 cur_end = min(block_end, extent_map_end(em) - 1);
2695 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2696 block_off_end = block_off_start + blocksize;
2697 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2698
2699 if (!PageUptodate(page) && isnew &&
2700 (block_off_end > to || block_off_start < from)) {
2701 void *kaddr;
2702
2703 kaddr = kmap_atomic(page, KM_USER0);
2704 if (block_off_end > to)
2705 memset(kaddr + to, 0, block_off_end - to);
2706 if (block_off_start < from)
2707 memset(kaddr + block_off_start, 0,
2708 from - block_off_start);
2709 flush_dcache_page(page);
2710 kunmap_atomic(kaddr, KM_USER0);
2711 }
2712 if ((em->block_start != EXTENT_MAP_HOLE &&
2713 em->block_start != EXTENT_MAP_INLINE) &&
2714 !isnew && !PageUptodate(page) &&
2715 (block_off_end > to || block_off_start < from) &&
2716 !test_range_bit(tree, block_start, cur_end,
2717 EXTENT_UPTODATE, 1)) {
2718 u64 sector;
2719 u64 extent_offset = block_start - em->start;
2720 size_t iosize;
2721 sector = (em->block_start + extent_offset) >> 9;
2722 iosize = (cur_end - block_start + blocksize) &
2723 ~((u64)blocksize - 1);
2724 /*
2725 * we've already got the extent locked, but we
2726 * need to split the state such that our end_bio
2727 * handler can clear the lock.
2728 */
2729 set_extent_bit(tree, block_start,
2730 block_start + iosize - 1,
2731 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2732 ret = submit_extent_page(READ, tree, page,
2733 sector, iosize, page_offset, em->bdev,
2734 NULL, 1,
2735 end_bio_extent_preparewrite, 0,
2736 0, 0);
2737 iocount++;
2738 block_start = block_start + iosize;
2739 } else {
2740 set_extent_uptodate(tree, block_start, cur_end,
2741 GFP_NOFS);
2742 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2743 block_start = cur_end + 1;
2744 }
2745 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2746 free_extent_map(em);
2747 }
2748 if (iocount) {
2749 wait_extent_bit(tree, orig_block_start,
2750 block_end, EXTENT_LOCKED);
2751 }
2752 check_page_uptodate(tree, page);
2753err:
2754 /* FIXME, zero out newly allocated blocks on error */
2755 return err;
2756}
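/*
 * A worked example of the partial-block zeroing above, assuming 4K
 * pages with a 4K blocksize: preparing bytes 1000..2999 of a fresh
 * block (from == 1000, to == 3000) gives block_off_start == 0 and
 * block_off_end == 4096, so [3000, 4096) and [0, 1000) are zeroed
 * while the caller's range is left untouched for the copy-in.
 */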
2757
2758/*
2759 * a helper for releasepage, this tests for areas of the page that
2760 * are locked or under IO and drops the related state bits if it is safe
2761 * to drop the page.
2762 */
2763int try_release_extent_state(struct extent_map_tree *map,
2764 struct extent_io_tree *tree, struct page *page,
2765 gfp_t mask)
2766{
2767 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2768 u64 end = start + PAGE_CACHE_SIZE - 1;
2769 int ret = 1;
2770
2771 if (test_range_bit(tree, start, end,
2772 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2773 ret = 0;
2774 else {
2775 if ((mask & GFP_NOFS) == GFP_NOFS)
2776 mask = GFP_NOFS;
2777 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2778 1, 1, mask);
2779 }
2780 return ret;
2781}
2782
2783/*
2784 * a helper for releasepage. As long as there are no locked extents
2785 * in the range corresponding to the page, both state records and extent
2786 * map records are removed
2787 */
2788int try_release_extent_mapping(struct extent_map_tree *map,
2789 struct extent_io_tree *tree, struct page *page,
2790 gfp_t mask)
2791{
2792 struct extent_map *em;
2793 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2794 u64 end = start + PAGE_CACHE_SIZE - 1;
2795
2796 if ((mask & __GFP_WAIT) &&
2797 page->mapping->host->i_size > 16 * 1024 * 1024) {
2798 u64 len;
2799 while (start <= end) {
2800 len = end - start + 1;
2801 spin_lock(&map->lock);
2802 em = lookup_extent_mapping(map, start, len);
2803 if (!em || IS_ERR(em)) {
2804 spin_unlock(&map->lock);
2805 break;
2806 }
2807 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2808 em->start != start) {
2809 spin_unlock(&map->lock);
2810 free_extent_map(em);
2811 break;
2812 }
2813 if (!test_range_bit(tree, em->start,
2814 extent_map_end(em) - 1,
2815 EXTENT_LOCKED | EXTENT_WRITEBACK |
2816 EXTENT_ORDERED,
2817 0)) {
2818 remove_extent_mapping(map, em);
2819 /* once for the rb tree */
2820 free_extent_map(em);
2821 }
2822 start = extent_map_end(em);
2823 spin_unlock(&map->lock);
2824
2825 /* once for us */
2826 free_extent_map(em);
2827 }
2828 }
2829 return try_release_extent_state(map, tree, page, mask);
2830}
2831
2832sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2833 get_extent_t *get_extent)
2834{
2835 struct inode *inode = mapping->host;
2836 u64 start = iblock << inode->i_blkbits;
2837 sector_t sector = 0;
2838 size_t blksize = (1 << inode->i_blkbits);
2839 struct extent_map *em;
2840
2841 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2842 GFP_NOFS);
2843 em = get_extent(inode, NULL, 0, start, blksize, 0);
2844 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2845 GFP_NOFS);
2846 if (!em || IS_ERR(em))
2847 return 0;
2848
2849 if (em->block_start > EXTENT_MAP_LAST_BYTE)
2850 goto out;
2851
2852 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2853out:
2854 free_extent_map(em);
2855 return sector;
2856}
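/*
 * Sector math sketch for the bmap above, assuming 4K blocks
 * (i_blkbits == 12): with em->start == 8192 and
 * em->block_start == 1048576, iblock 3 (start == 12288) yields
 * (1048576 + 12288 - 8192) >> 12 == 257 -- a block number in
 * filesystem blocksize units, not 512-byte sectors.
 */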
2857
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i)
2860{
2861 struct page *p;
2862 struct address_space *mapping;
2863
2864 if (i == 0)
2865 return eb->first_page;
2866 i += eb->start >> PAGE_CACHE_SHIFT;
2867 mapping = eb->first_page->mapping;
2868 if (!mapping)
2869 return NULL;
2870
2871 /*
2872 * extent_buffer_page is only called after pinning the page
2873 * by increasing the reference count. So we know the page must
2874 * be in the radix tree.
2875 */
2876 rcu_read_lock();
2877 p = radix_tree_lookup(&mapping->page_tree, i);
2878 rcu_read_unlock();
2879
2880 return p;
2881}
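/*
 * Index math sketch, assuming 4K pages: for eb->start == 0x5000,
 * extent_buffer_page(eb, 1) looks up radix slot
 * 1 + (0x5000 >> 12) == 6, i.e. the page covering bytes
 * 0x6000..0x6fff of the btree inode's mapping.
 */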
2882
2883static inline unsigned long num_extent_pages(u64 start, u64 len)
2884{
2885 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2886 (start >> PAGE_CACHE_SHIFT);
2887}
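/*
 * e.g. with 4K pages, num_extent_pages(12288, 8192) is
 * ((12288 + 8192 + 4095) >> 12) - (12288 >> 12) == 5 - 3 == 2,
 * while the unaligned num_extent_pages(12288, 8193) spills into a
 * third page and returns 3.
 */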
2888
2889static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2890 u64 start,
2891 unsigned long len,
2892 gfp_t mask)
2893{
2894 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG
2896 unsigned long flags;
2897#endif
2898
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (!eb)
		return NULL;
2900 eb->start = start;
2901 eb->len = len;
2902 mutex_init(&eb->mutex);
2903#ifdef LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags);
2907#endif
2908 atomic_set(&eb->refs, 1);
2909
2910 return eb;
2911}
2912
2913static void __free_extent_buffer(struct extent_buffer *eb)
2914{
2915#ifdef LEAK_DEBUG
2916 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list);
2919 spin_unlock_irqrestore(&leak_lock, flags);
2920#endif
2921 kmem_cache_free(extent_buffer_cache, eb);
2922}
2923
2924struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2925 u64 start, unsigned long len,
2926 struct page *page0,
2927 gfp_t mask)
2928{
2929 unsigned long num_pages = num_extent_pages(start, len);
2930 unsigned long i;
2931 unsigned long index = start >> PAGE_CACHE_SHIFT;
2932 struct extent_buffer *eb;
2933 struct extent_buffer *exists = NULL;
2934 struct page *p;
2935 struct address_space *mapping = tree->mapping;
2936 int uptodate = 1;
2937
2938 spin_lock(&tree->buffer_lock);
2939 eb = buffer_search(tree, start);
2940 if (eb) {
2941 atomic_inc(&eb->refs);
2942 spin_unlock(&tree->buffer_lock);
2943 mark_page_accessed(eb->first_page);
2944 return eb;
2945 }
2946 spin_unlock(&tree->buffer_lock);
2947
2948 eb = __alloc_extent_buffer(tree, start, len, mask);
2949 if (!eb)
2950 return NULL;
2951
2952 if (page0) {
2953 eb->first_page = page0;
2954 i = 1;
2955 index++;
2956 page_cache_get(page0);
2957 mark_page_accessed(page0);
2958 set_page_extent_mapped(page0);
2959 set_page_extent_head(page0, len);
2960 uptodate = PageUptodate(page0);
2961 } else {
2962 i = 0;
2963 }
2964 for (; i < num_pages; i++, index++) {
2965 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2966 if (!p) {
2967 WARN_ON(1);
2968 goto free_eb;
2969 }
2970 set_page_extent_mapped(p);
2971 mark_page_accessed(p);
2972 if (i == 0) {
2973 eb->first_page = p;
2974 set_page_extent_head(p, len);
2975 } else {
2976 set_page_private(p, EXTENT_PAGE_PRIVATE);
2977 }
2978 if (!PageUptodate(p))
2979 uptodate = 0;
2980 unlock_page(p);
2981 }
2982 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE;
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985
2986 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2988 if (exists) {
2989 /* add one reference for the caller */
2990 atomic_inc(&exists->refs);
2991 spin_unlock(&tree->buffer_lock);
2992 goto free_eb;
2993 }
2994 spin_unlock(&tree->buffer_lock);
2995
2996 /* add one reference for the tree */
2997 atomic_inc(&eb->refs);
2998 return eb;
2999
3000free_eb:
3001 if (!atomic_dec_and_test(&eb->refs))
3002 return exists;
3003 for (index = 1; index < i; index++)
3004 page_cache_release(extent_buffer_page(eb, index));
3005 page_cache_release(extent_buffer_page(eb, 0));
3006 __free_extent_buffer(eb);
3007 return exists;
3008}
3009
3010struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3011 u64 start, unsigned long len,
3012 gfp_t mask)
3013{
3014 struct extent_buffer *eb;
3015
3016 spin_lock(&tree->buffer_lock);
3017 eb = buffer_search(tree, start);
3018 if (eb)
3019 atomic_inc(&eb->refs);
3020 spin_unlock(&tree->buffer_lock);
3021
3022 if (eb)
3023 mark_page_accessed(eb->first_page);
3024
3025 return eb;
3026}
3027
3028void free_extent_buffer(struct extent_buffer *eb)
3029{
3030 if (!eb)
3031 return;
3032
3033 if (!atomic_dec_and_test(&eb->refs))
3034 return;
3035
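	/*
	 * the tree itself holds a reference, so the count reaching zero
	 * here (instead of in try_release_extent_buffer) means the ref
	 * accounting is broken -- warn loudly.
	 */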
3036 WARN_ON(1);
3037}
3038
3039int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3040 struct extent_buffer *eb)
3041{
3042 int set;
3043 unsigned long i;
3044 unsigned long num_pages;
3045 struct page *page;
3046
3047 u64 start = eb->start;
3048 u64 end = start + eb->len - 1;
3049
3050 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3051 num_pages = num_extent_pages(eb->start, eb->len);
3052
3053 for (i = 0; i < num_pages; i++) {
3054 page = extent_buffer_page(eb, i);
3055 if (!set && !PageDirty(page))
3056 continue;
3057
3058 lock_page(page);
3059 if (i == 0)
3060 set_page_extent_head(page, eb->len);
3061 else
3062 set_page_private(page, EXTENT_PAGE_PRIVATE);
3063
3064 /*
3065 * if we're on the last page or the first page and the
3066 * block isn't aligned on a page boundary, do extra checks
3067 * to make sure we don't clean page that is partially dirty
3068 */
3069 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3070 ((i == num_pages - 1) &&
3071 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3072 start = (u64)page->index << PAGE_CACHE_SHIFT;
3073 end = start + PAGE_CACHE_SIZE - 1;
3074 if (test_range_bit(tree, start, end,
3075 EXTENT_DIRTY, 0)) {
3076 unlock_page(page);
3077 continue;
3078 }
3079 }
3080 clear_page_dirty_for_io(page);
3081 spin_lock_irq(&page->mapping->tree_lock);
3082 if (!PageDirty(page)) {
3083 radix_tree_tag_clear(&page->mapping->page_tree,
3084 page_index(page),
3085 PAGECACHE_TAG_DIRTY);
3086 }
3087 spin_unlock_irq(&page->mapping->tree_lock);
3088 unlock_page(page);
3089 }
3090 return 0;
3091}
3092
3093int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3094 struct extent_buffer *eb)
3095{
3096 return wait_on_extent_writeback(tree, eb->start,
3097 eb->start + eb->len - 1);
3098}
3099
3100int set_extent_buffer_dirty(struct extent_io_tree *tree,
3101 struct extent_buffer *eb)
3102{
3103 unsigned long i;
3104 unsigned long num_pages;
3105
3106 num_pages = num_extent_pages(eb->start, eb->len);
3107 for (i = 0; i < num_pages; i++) {
3108 struct page *page = extent_buffer_page(eb, i);
3109 /* writepage may need to do something special for the
3110 * first page, we have to make sure page->private is
3111 * properly set. releasepage may drop page->private
3112 * on us if the page isn't already dirty.
3113 */
3114 lock_page(page);
3115 if (i == 0) {
3116 set_page_extent_head(page, eb->len);
3117 } else if (PagePrivate(page) &&
3118 page->private != EXTENT_PAGE_PRIVATE) {
3119 set_page_extent_mapped(page);
3120 }
3121 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3122 set_extent_dirty(tree, page_offset(page),
3123 page_offset(page) + PAGE_CACHE_SIZE - 1,
3124 GFP_NOFS);
3125 unlock_page(page);
3126 }
3127 return 0;
3128}
3129
3130int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3131 struct extent_buffer *eb)
3132{
3133 unsigned long i;
3134 struct page *page;
3135 unsigned long num_pages;
3136
3137 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE;
3139
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS);
3142 for (i = 0; i < num_pages; i++) {
3143 page = extent_buffer_page(eb, i);
3144 if (page)
3145 ClearPageUptodate(page);
3146 }
3147 return 0;
3148}
3149
3150int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3151 struct extent_buffer *eb)
3152{
3153 unsigned long i;
3154 struct page *page;
3155 unsigned long num_pages;
3156
3157 num_pages = num_extent_pages(eb->start, eb->len);
3158
3159 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3160 GFP_NOFS);
3161 for (i = 0; i < num_pages; i++) {
3162 page = extent_buffer_page(eb, i);
3163 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3164 ((i == num_pages - 1) &&
3165 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3166 check_page_uptodate(tree, page);
3167 continue;
3168 }
3169 SetPageUptodate(page);
3170 }
3171 return 0;
3172}
3173
3174int extent_range_uptodate(struct extent_io_tree *tree,
3175 u64 start, u64 end)
3176{
3177 struct page *page;
3178 int ret;
3179 int pg_uptodate = 1;
3180 int uptodate;
3181 unsigned long index;
3182
3183 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3184 if (ret)
3185 return 1;
3186 while (start <= end) {
3187 index = start >> PAGE_CACHE_SHIFT;
3188 page = find_get_page(tree->mapping, index);
		if (!page) {
			pg_uptodate = 0;
			break;
		}
3189 uptodate = PageUptodate(page);
3190 page_cache_release(page);
3191 if (!uptodate) {
3192 pg_uptodate = 0;
3193 break;
3194 }
3195 start += PAGE_CACHE_SIZE;
3196 }
3197 return pg_uptodate;
3198}
3199
3200int extent_buffer_uptodate(struct extent_io_tree *tree,
3201 struct extent_buffer *eb)
3202{
3203 int ret = 0;
3204 unsigned long num_pages;
3205 unsigned long i;
3206 struct page *page;
3207 int pg_uptodate = 1;
3208
3209 if (eb->flags & EXTENT_UPTODATE)
3210 return 1;
3211
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3213 EXTENT_UPTODATE, 1);
3214 if (ret)
3215 return ret;
3216
3217 num_pages = num_extent_pages(eb->start, eb->len);
3218 for (i = 0; i < num_pages; i++) {
3219 page = extent_buffer_page(eb, i);
3220 if (!PageUptodate(page)) {
3221 pg_uptodate = 0;
3222 break;
3223 }
3224 }
3225 return pg_uptodate;
3226}
3227
3228int read_extent_buffer_pages(struct extent_io_tree *tree,
3229 struct extent_buffer *eb,
3230 u64 start, int wait,
3231 get_extent_t *get_extent, int mirror_num)
3232{
3233 unsigned long i;
3234 unsigned long start_i;
3235 struct page *page;
3236 int err;
3237 int ret = 0;
3238 int locked_pages = 0;
3239 int all_uptodate = 1;
3240 int inc_all_pages = 0;
3241 unsigned long num_pages;
3242 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0;
3244
3245 if (eb->flags & EXTENT_UPTODATE)
3246 return 0;
3247
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3249 EXTENT_UPTODATE, 1)) {
3250 return 0;
3251 }
3252
3253 if (start) {
3254 WARN_ON(start < eb->start);
3255 start_i = (start >> PAGE_CACHE_SHIFT) -
3256 (eb->start >> PAGE_CACHE_SHIFT);
3257 } else {
3258 start_i = 0;
3259 }
3260
3261 num_pages = num_extent_pages(eb->start, eb->len);
3262 for (i = start_i; i < num_pages; i++) {
3263 page = extent_buffer_page(eb, i);
3264 if (!wait) {
3265 if (!trylock_page(page))
3266 goto unlock_exit;
3267 } else {
3268 lock_page(page);
3269 }
3270 locked_pages++;
3271 if (!PageUptodate(page))
3272 all_uptodate = 0;
3273 }
3274 if (all_uptodate) {
3275 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE;
3277 goto unlock_exit;
3278 }
3279
3280 for (i = start_i; i < num_pages; i++) {
3281 page = extent_buffer_page(eb, i);
3282 if (inc_all_pages)
3283 page_cache_get(page);
3284 if (!PageUptodate(page)) {
3285 if (start_i == 0)
3286 inc_all_pages = 1;
3287 ClearPageError(page);
3288 err = __extent_read_full_page(tree, page,
3289 get_extent, &bio,
3290 mirror_num, &bio_flags);
3291 if (err)
3292 ret = err;
3293 } else {
3294 unlock_page(page);
3295 }
3296 }
3297
3298 if (bio)
3299 submit_one_bio(READ, bio, mirror_num, bio_flags);
3300
3301 if (ret || !wait)
3302 return ret;
3303
3304 for (i = start_i; i < num_pages; i++) {
3305 page = extent_buffer_page(eb, i);
3306 wait_on_page_locked(page);
3307 if (!PageUptodate(page))
3308 ret = -EIO;
3309 }
3310
3311 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE;
3313 return ret;
3314
3315unlock_exit:
3316 i = start_i;
3317 while (locked_pages > 0) {
3318 page = extent_buffer_page(eb, i);
3319 i++;
3320 unlock_page(page);
3321 locked_pages--;
3322 }
3323 return ret;
3324}
3325
3326void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3327 unsigned long start,
3328 unsigned long len)
3329{
3330 size_t cur;
3331 size_t offset;
3332 struct page *page;
3333 char *kaddr;
3334 char *dst = (char *)dstv;
3335 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3336 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3337
3338 WARN_ON(start > eb->len);
3339 WARN_ON(start + len > eb->start + eb->len);
3340
3341 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3342
3343 while (len > 0) {
3344 page = extent_buffer_page(eb, i);
3345
3346 cur = min(len, (PAGE_CACHE_SIZE - offset));
3347 kaddr = kmap_atomic(page, KM_USER1);
3348 memcpy(dst, kaddr + offset, cur);
3349 kunmap_atomic(kaddr, KM_USER1);
3350
3351 dst += cur;
3352 len -= cur;
3353 offset = 0;
3354 i++;
3355 }
3356}
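/*
 * Usage sketch with a hypothetical caller: copying a 64-byte header
 * out of a tree block crosses page boundaries transparently,
 *
 *	char hdr[64];
 *	read_extent_buffer(eb, hdr, 0, sizeof(hdr));
 *
 * each loop iteration kmaps one page and copies at most
 * PAGE_CACHE_SIZE - offset bytes from it.
 */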
3357
3358int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3359 unsigned long min_len, char **token, char **map,
3360 unsigned long *map_start,
3361 unsigned long *map_len, int km)
3362{
3363 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3364 char *kaddr;
3365 struct page *p;
3366 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3367 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3368 unsigned long end_i = (start_offset + start + min_len - 1) >>
3369 PAGE_CACHE_SHIFT;
3370
3371 if (i != end_i)
3372 return -EINVAL;
3373
3374 if (i == 0) {
3375 offset = start_offset;
3376 *map_start = 0;
3377 } else {
3378 offset = 0;
3379 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3380 }
3381
3382 if (start + min_len > eb->len) {
3383 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3384 "wanted %lu %lu\n", (unsigned long long)eb->start,
3385 eb->len, start, min_len);
3386 WARN_ON(1);
3387 }
3388
3389 p = extent_buffer_page(eb, i);
3390 kaddr = kmap_atomic(p, km);
3391 *token = kaddr;
3392 *map = kaddr + offset;
3393 *map_len = PAGE_CACHE_SIZE - offset;
3394 return 0;
3395}
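/*
 * Note the single-page limit enforced above: with 4K pages a request
 * such as map_private_extent_buffer(eb, 4090, 16, ...) straddles two
 * pages (i == 0, end_i == 1) and returns -EINVAL, in which case
 * callers are expected to fall back to read/write_extent_buffer.
 */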
3396
3397int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3398 unsigned long min_len,
3399 char **token, char **map,
3400 unsigned long *map_start,
3401 unsigned long *map_len, int km)
3402{
3403 int err;
3404 int save = 0;
3405 if (eb->map_token) {
3406 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL;
3408 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km);
3413 if (!err && save) {
3414 eb->map_token = *token;
3415 eb->kaddr = *map;
3416 eb->map_start = *map_start;
3417 eb->map_len = *map_len;
3418 }
3419 return err;
3420}
3421
3422void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3423{
3424 kunmap_atomic(token, km);
3425}
3426
3427int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3428 unsigned long start,
3429 unsigned long len)
3430{
3431 size_t cur;
3432 size_t offset;
3433 struct page *page;
3434 char *kaddr;
3435 char *ptr = (char *)ptrv;
3436 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3437 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3438 int ret = 0;
3439
3440 WARN_ON(start > eb->len);
3441 WARN_ON(start + len > eb->start + eb->len);
3442
3443 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3444
3445 while (len > 0) {
3446 page = extent_buffer_page(eb, i);
3447
3448 cur = min(len, (PAGE_CACHE_SIZE - offset));
3449
3450 kaddr = kmap_atomic(page, KM_USER0);
3451 ret = memcmp(ptr, kaddr + offset, cur);
3452 kunmap_atomic(kaddr, KM_USER0);
3453 if (ret)
3454 break;
3455
3456 ptr += cur;
3457 len -= cur;
3458 offset = 0;
3459 i++;
3460 }
3461 return ret;
3462}
3463
3464void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3465 unsigned long start, unsigned long len)
3466{
3467 size_t cur;
3468 size_t offset;
3469 struct page *page;
3470 char *kaddr;
3471 char *src = (char *)srcv;
3472 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3473 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3474
3475 WARN_ON(start > eb->len);
3476 WARN_ON(start + len > eb->start + eb->len);
3477
3478 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3479
3480 while (len > 0) {
3481 page = extent_buffer_page(eb, i);
3482 WARN_ON(!PageUptodate(page));
3483
3484 cur = min(len, PAGE_CACHE_SIZE - offset);
3485 kaddr = kmap_atomic(page, KM_USER1);
3486 memcpy(kaddr + offset, src, cur);
3487 kunmap_atomic(kaddr, KM_USER1);
3488
3489 src += cur;
3490 len -= cur;
3491 offset = 0;
3492 i++;
3493 }
3494}
3495
3496void memset_extent_buffer(struct extent_buffer *eb, char c,
3497 unsigned long start, unsigned long len)
3498{
3499 size_t cur;
3500 size_t offset;
3501 struct page *page;
3502 char *kaddr;
3503 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3504 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3505
3506 WARN_ON(start > eb->len);
3507 WARN_ON(start + len > eb->start + eb->len);
3508
3509 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3510
3511 while (len > 0) {
3512 page = extent_buffer_page(eb, i);
3513 WARN_ON(!PageUptodate(page));
3514
3515 cur = min(len, PAGE_CACHE_SIZE - offset);
3516 kaddr = kmap_atomic(page, KM_USER0);
3517 memset(kaddr + offset, c, cur);
3518 kunmap_atomic(kaddr, KM_USER0);
3519
3520 len -= cur;
3521 offset = 0;
3522 i++;
3523 }
3524}
3525
3526void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3527 unsigned long dst_offset, unsigned long src_offset,
3528 unsigned long len)
3529{
3530 u64 dst_len = dst->len;
3531 size_t cur;
3532 size_t offset;
3533 struct page *page;
3534 char *kaddr;
3535 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3536 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3537
3538 WARN_ON(src->len != dst_len);
3539
3540 offset = (start_offset + dst_offset) &
3541 ((unsigned long)PAGE_CACHE_SIZE - 1);
3542
3543 while (len > 0) {
3544 page = extent_buffer_page(dst, i);
3545 WARN_ON(!PageUptodate(page));
3546
3547 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3548
3549 kaddr = kmap_atomic(page, KM_USER0);
3550 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3551 kunmap_atomic(kaddr, KM_USER0);
3552
3553 src_offset += cur;
3554 len -= cur;
3555 offset = 0;
3556 i++;
3557 }
3558}
3559
3560static void move_pages(struct page *dst_page, struct page *src_page,
3561 unsigned long dst_off, unsigned long src_off,
3562 unsigned long len)
3563{
3564 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3565 if (dst_page == src_page) {
3566 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3567 } else {
3568 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3569 char *p = dst_kaddr + dst_off + len;
3570 char *s = src_kaddr + src_off + len;
3571
3572 while (len--)
3573 *--p = *--s;
3574
3575 kunmap_atomic(src_kaddr, KM_USER1);
3576 }
3577 kunmap_atomic(dst_kaddr, KM_USER0);
3578}
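/*
 * The explicit high-to-low byte loop above mirrors memmove: the
 * same-page case defers to memmove directly, and the cross-page case
 * copies backwards so the descending walk in memmove_extent_buffer
 * below stays safe for overlapping source and destination ranges.
 */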
3579
3580static void copy_pages(struct page *dst_page, struct page *src_page,
3581 unsigned long dst_off, unsigned long src_off,
3582 unsigned long len)
3583{
3584 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3585 char *src_kaddr;
3586
3587 if (dst_page != src_page)
3588 src_kaddr = kmap_atomic(src_page, KM_USER1);
3589 else
3590 src_kaddr = dst_kaddr;
3591
3592 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3593 kunmap_atomic(dst_kaddr, KM_USER0);
3594 if (dst_page != src_page)
3595 kunmap_atomic(src_kaddr, KM_USER1);
3596}
3597
3598void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3599 unsigned long src_offset, unsigned long len)
3600{
3601 size_t cur;
3602 size_t dst_off_in_page;
3603 size_t src_off_in_page;
3604 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3605 unsigned long dst_i;
3606 unsigned long src_i;
3607
3608 if (src_offset + len > dst->len) {
3609 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3610 "len %lu dst len %lu\n", src_offset, len, dst->len);
3611 BUG_ON(1);
3612 }
3613 if (dst_offset + len > dst->len) {
3614 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3615 "len %lu dst len %lu\n", dst_offset, len, dst->len);
3616 BUG_ON(1);
3617 }
3618
3619 while (len > 0) {
3620 dst_off_in_page = (start_offset + dst_offset) &
3621 ((unsigned long)PAGE_CACHE_SIZE - 1);
3622 src_off_in_page = (start_offset + src_offset) &
3623 ((unsigned long)PAGE_CACHE_SIZE - 1);
3624
3625 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3626 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3627
3628 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3629 src_off_in_page));
3630 cur = min_t(unsigned long, cur,
3631 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3632
3633 copy_pages(extent_buffer_page(dst, dst_i),
3634 extent_buffer_page(dst, src_i),
3635 dst_off_in_page, src_off_in_page, cur);
3636
3637 src_offset += cur;
3638 dst_offset += cur;
3639 len -= cur;
3640 }
3641}
3642
3643void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3644 unsigned long src_offset, unsigned long len)
3645{
3646 size_t cur;
3647 size_t dst_off_in_page;
3648 size_t src_off_in_page;
3649 unsigned long dst_end = dst_offset + len - 1;
3650 unsigned long src_end = src_offset + len - 1;
3651 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 unsigned long dst_i;
3653 unsigned long src_i;
3654
3655 if (src_offset + len > dst->len) {
3656 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3657 "len %lu len %lu\n", src_offset, len, dst->len);
3658 BUG_ON(1);
3659 }
3660 if (dst_offset + len > dst->len) {
3661 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3662 "len %lu len %lu\n", dst_offset, len, dst->len);
3663 BUG_ON(1);
3664 }
3665 if (dst_offset < src_offset) {
3666 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3667 return;
3668 }
3669 while (len > 0) {
3670 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3671 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3672
3673 dst_off_in_page = (start_offset + dst_end) &
3674 ((unsigned long)PAGE_CACHE_SIZE - 1);
3675 src_off_in_page = (start_offset + src_end) &
3676 ((unsigned long)PAGE_CACHE_SIZE - 1);
3677
3678 cur = min_t(unsigned long, len, src_off_in_page + 1);
3679 cur = min(cur, dst_off_in_page + 1);
3680 move_pages(extent_buffer_page(dst, dst_i),
3681 extent_buffer_page(dst, src_i),
3682 dst_off_in_page - cur + 1,
3683 src_off_in_page - cur + 1, cur);
3684
3685 dst_end -= cur;
3686 src_end -= cur;
3687 len -= cur;
3688 }
3689}
3690
3691int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3692{
3693 u64 start = page_offset(page);
3694 struct extent_buffer *eb;
3695 int ret = 1;
3696 unsigned long i;
3697 unsigned long num_pages;
3698
3699 spin_lock(&tree->buffer_lock);
3700 eb = buffer_search(tree, start);
3701 if (!eb)
3702 goto out;
3703
3704 if (atomic_read(&eb->refs) > 1) {
3705 ret = 0;
3706 goto out;
3707 }
3708 /* at this point we can safely release the extent buffer */
3709 num_pages = num_extent_pages(eb->start, eb->len);
3710 for (i = 0; i < num_pages; i++)
3711 page_cache_release(extent_buffer_page(eb, i));
3712 rb_erase(&eb->rb_node, &tree->buffer);
3713 __free_extent_buffer(eb);
3714out:
3715 spin_unlock(&tree->buffer_lock);
3716 return ret;
3717}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..c5b483a79137
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,269 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
6/* bits for the extent state */
7#define EXTENT_DIRTY 1
8#define EXTENT_WRITEBACK (1 << 1)
9#define EXTENT_UPTODATE (1 << 2)
10#define EXTENT_LOCKED (1 << 3)
11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10)
18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_NODATASUM (1 << 12)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21
22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1
24
25/*
26 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one.
28 */
29#define EXTENT_PAGE_PRIVATE 1
30#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
31
32struct extent_state;
33
34typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
35 struct bio *bio, int mirror_num,
36 unsigned long bio_flags);
37struct extent_io_ops {
38 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
39 u64 start, u64 end, int *page_started,
40 unsigned long *nr_written);
41 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
42 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
43 extent_submit_bio_hook_t *submit_bio_hook;
44 int (*merge_bio_hook)(struct page *page, unsigned long offset,
45 size_t size, struct bio *bio,
46 unsigned long bio_flags);
47 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
48 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
49 u64 start, u64 end,
50 struct extent_state *state);
51 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
52 u64 start, u64 end,
53 struct extent_state *state);
54 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
55 struct extent_state *state);
56 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
57 struct extent_state *state, int uptodate);
58 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
59 unsigned long old, unsigned long bits);
60 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
61 unsigned long old, unsigned long bits);
62 int (*write_cache_pages_lock_hook)(struct page *page);
63};
64
65struct extent_io_tree {
66 struct rb_root state;
67 struct rb_root buffer;
68 struct address_space *mapping;
69 u64 dirty_bytes;
70 spinlock_t lock;
71 spinlock_t buffer_lock;
72 struct extent_io_ops *ops;
73};
74
75struct extent_state {
76 u64 start;
77 u64 end; /* inclusive */
78 struct rb_node rb_node;
79 struct extent_io_tree *tree;
80 wait_queue_head_t wq;
81 atomic_t refs;
82 unsigned long state;
83
84 /* for use by the FS */
85 u64 private;
86
87 struct list_head leak_list;
88};
89
90struct extent_buffer {
91 u64 start;
92 unsigned long len;
93 char *map_token;
94 char *kaddr;
95 unsigned long map_start;
96 unsigned long map_len;
97 struct page *first_page;
98 atomic_t refs;
99 int flags;
100 struct list_head leak_list;
101 struct rb_node rb_node;
102 struct mutex mutex;
103};
104
105struct extent_map_tree;
106
107static inline struct extent_state *extent_state_next(struct extent_state *state)
108{
109 struct rb_node *node;
110 node = rb_next(&state->rb_node);
111 if (!node)
112 return NULL;
113 return rb_entry(node, struct extent_state, rb_node);
114}
115
116typedef struct extent_map *(get_extent_t)(struct inode *inode,
117 struct page *page,
118 size_t page_offset,
119 u64 start, u64 len,
120 int create);
121
122void extent_io_tree_init(struct extent_io_tree *tree,
123 struct address_space *mapping, gfp_t mask);
124int try_release_extent_mapping(struct extent_map_tree *map,
125 struct extent_io_tree *tree, struct page *page,
126 gfp_t mask);
127int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
128int try_release_extent_state(struct extent_map_tree *map,
129 struct extent_io_tree *tree, struct page *page,
130 gfp_t mask);
131int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
132int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
133int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
134 gfp_t mask);
135int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
136 get_extent_t *get_extent);
137int __init extent_io_init(void);
138void extent_io_exit(void);
139
140u64 count_range_bits(struct extent_io_tree *tree,
141 u64 *start, u64 search_end,
142 u64 max_bytes, unsigned long bits);
143
144int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
145 int bits, int filled);
146int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
147 int bits, gfp_t mask);
148int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
149 int bits, int wake, int delete, gfp_t mask);
150int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
151 int bits, gfp_t mask);
152int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
153 gfp_t mask);
154int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
155 gfp_t mask);
156int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
157 gfp_t mask);
158int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
159 gfp_t mask);
160int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
161 gfp_t mask);
162int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
163 u64 end, gfp_t mask);
164int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
165 gfp_t mask);
166int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask);
168int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
169 u64 *start_ret, u64 *end_ret, int bits);
170struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
171 u64 start, int bits);
172int extent_invalidatepage(struct extent_io_tree *tree,
173 struct page *page, unsigned long offset);
174int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
175 get_extent_t *get_extent,
176 struct writeback_control *wbc);
177int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
178 u64 start, u64 end, get_extent_t *get_extent,
179 int mode);
180int extent_writepages(struct extent_io_tree *tree,
181 struct address_space *mapping,
182 get_extent_t *get_extent,
183 struct writeback_control *wbc);
184int extent_readpages(struct extent_io_tree *tree,
185 struct address_space *mapping,
186 struct list_head *pages, unsigned nr_pages,
187 get_extent_t get_extent);
188int extent_prepare_write(struct extent_io_tree *tree,
189 struct inode *inode, struct page *page,
190 unsigned from, unsigned to, get_extent_t *get_extent);
191int extent_commit_write(struct extent_io_tree *tree,
192 struct inode *inode, struct page *page,
193 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
199void set_page_extent_mapped(struct page *page);
200
201struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
202 u64 start, unsigned long len,
203 struct page *page0,
204 gfp_t mask);
205struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
206 u64 start, unsigned long len,
207 gfp_t mask);
208void free_extent_buffer(struct extent_buffer *eb);
209int read_extent_buffer_pages(struct extent_io_tree *tree,
210 struct extent_buffer *eb, u64 start, int wait,
211 get_extent_t *get_extent, int mirror_num);
212
213static inline void extent_buffer_get(struct extent_buffer *eb)
214{
215 atomic_inc(&eb->refs);
216}
217
218int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
219 unsigned long start,
220 unsigned long len);
221void read_extent_buffer(struct extent_buffer *eb, void *dst,
222 unsigned long start,
223 unsigned long len);
224void write_extent_buffer(struct extent_buffer *eb, const void *src,
225 unsigned long start, unsigned long len);
226void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
227 unsigned long dst_offset, unsigned long src_offset,
228 unsigned long len);
229void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
230 unsigned long src_offset, unsigned long len);
231void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
232 unsigned long src_offset, unsigned long len);
233void memset_extent_buffer(struct extent_buffer *eb, char c,
234 unsigned long start, unsigned long len);
235int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
236 struct extent_buffer *eb);
237int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
238int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
239int clear_extent_buffer_dirty(struct extent_io_tree *tree,
240 struct extent_buffer *eb);
241int set_extent_buffer_dirty(struct extent_io_tree *tree,
242 struct extent_buffer *eb);
243int set_extent_buffer_uptodate(struct extent_io_tree *tree,
244 struct extent_buffer *eb);
245int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
246 struct extent_buffer *eb);
247int extent_buffer_uptodate(struct extent_io_tree *tree,
248 struct extent_buffer *eb);
249int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
250 unsigned long min_len, char **token, char **map,
251 unsigned long *map_start,
252 unsigned long *map_len, int km);
253int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
254 unsigned long min_len, char **token, char **map,
255 unsigned long *map_start,
256 unsigned long *map_len, int km);
257void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
258int release_extent_buffer_tail_pages(struct extent_buffer *eb);
259int extent_range_uptodate(struct extent_io_tree *tree,
260 u64 start, u64 end);
261int extent_clear_unlock_delalloc(struct inode *inode,
262 struct extent_io_tree *tree,
263 u64 start, u64 end, struct page *locked_page,
264 int unlock_page,
265 int clear_unlock,
266 int clear_delalloc, int clear_dirty,
267 int set_writeback,
268 int end_writeback);
269#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..4a83e33ada32
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
16static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent map tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map being released
73 *
74 * Drops the reference count on @em by one and frees the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node **p = &root->rb_node;
93 struct rb_node *parent = NULL;
94 struct extent_map *entry;
95
96 while (*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
122 struct rb_node **prev_ret,
123 struct rb_node **next_ret)
124{
125 struct rb_node *n = root->rb_node;
126 struct rb_node *prev = NULL;
127 struct rb_node *orig_prev = NULL;
128 struct extent_map *entry;
129 struct extent_map *prev_entry = NULL;
130
131 while (n) {
132 entry = rb_entry(n, struct extent_map, rb_node);
133 prev = n;
134 prev_entry = entry;
135
136 WARN_ON(!entry->in_tree);
137
138 if (offset < entry->start)
139 n = n->rb_left;
140 else if (offset >= extent_map_end(entry))
141 n = n->rb_right;
142 else
143 return n;
144 }
145
146 if (prev_ret) {
147 orig_prev = prev;
148 while (prev && offset >= extent_map_end(prev_entry)) {
149 prev = rb_next(prev);
150 prev_entry = rb_entry(prev, struct extent_map, rb_node);
151 }
152 *prev_ret = prev;
153 prev = orig_prev;
154 }
155
156 if (next_ret) {
157 prev_entry = rb_entry(prev, struct extent_map, rb_node);
158 while (prev && offset < prev_entry->start) {
159 prev = rb_prev(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162 *next_ret = prev;
163 }
164 return NULL;
165}
166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first offset we can find smaller than 'offset'.
170 */
171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
172{
173 struct rb_node *prev;
174 struct rb_node *ret;
175 ret = __tree_search(root, offset, &prev, NULL);
176 if (!ret)
177 return prev;
178 return ret;
179}
180
181/* check to see if two extent_map structs are adjacent and safe to merge */
182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183{
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0;
186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
194 if (extent_map_end(prev) == next->start &&
195 prev->flags == next->flags &&
196 prev->bdev == next->bdev &&
197 ((next->block_start == EXTENT_MAP_HOLE &&
198 prev->block_start == EXTENT_MAP_HOLE) ||
199 (next->block_start == EXTENT_MAP_INLINE &&
200 prev->block_start == EXTENT_MAP_INLINE) ||
201 (next->block_start == EXTENT_MAP_DELALLOC &&
202 prev->block_start == EXTENT_MAP_DELALLOC) ||
203 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
204 next->block_start == extent_map_block_end(prev)))) {
205 return 1;
206 }
207 return 0;
208}
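/*
 * e.g. (assuming equal flags and bdev) two mappings [0, 4096) at
 * block_start 8192 and [4096, 8192) at block_start 12288 are
 * mergable: extent_map_end(prev) meets next->start and
 * next->block_start equals extent_map_block_end(prev), so the pair
 * collapses into a single [0, 8192) mapping.
 */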
209
210/**
211 * add_extent_mapping - add new extent map to the extent tree
212 * @tree: tree to insert new map in
213 * @em: map to insert
214 *
215 * Insert @em into @tree or perform a simple forward/backward merge with
216 * existing mappings. The extent_map struct passed in will be inserted
217 * into the tree directly, with an additional reference taken, or a
218 * reference dropped if the merge attempt was successful.
219 */
220int add_extent_mapping(struct extent_map_tree *tree,
221 struct extent_map *em)
222{
223 int ret = 0;
224 struct extent_map *merge = NULL;
225 struct rb_node *rb;
226 struct extent_map *exist;
227
228 exist = lookup_extent_mapping(tree, em->start, em->len);
229 if (exist) {
230 free_extent_map(exist);
231 ret = -EEXIST;
232 goto out;
233 }
234 assert_spin_locked(&tree->lock);
235 rb = tree_insert(&tree->map, em->start, &em->rb_node);
236 if (rb) {
237 ret = -EEXIST;
238 free_extent_map(merge);
239 goto out;
240 }
241 atomic_inc(&em->refs);
242 if (em->start != 0) {
243 rb = rb_prev(&em->rb_node);
244 if (rb)
245 merge = rb_entry(rb, struct extent_map, rb_node);
246 if (rb && mergable_maps(merge, em)) {
247 em->start = merge->start;
248 em->len += merge->len;
249 em->block_len += merge->block_len;
250 em->block_start = merge->block_start;
251 merge->in_tree = 0;
252 rb_erase(&merge->rb_node, &tree->map);
253 free_extent_map(merge);
254 }
255 }
256 rb = rb_next(&em->rb_node);
257 if (rb)
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && mergable_maps(em, merge)) {
260 em->len += merge->len;
261 em->block_len += merge->len;
262 rb_erase(&merge->rb_node, &tree->map);
263 merge->in_tree = 0;
264 free_extent_map(merge);
265 }
266out:
267 return ret;
268}
269EXPORT_SYMBOL(add_extent_mapping);
270
271/* simple helper to do math around the end of an extent, handling wrap */
272static u64 range_end(u64 start, u64 len)
273{
274 if (start + len < start)
275 return (u64)-1;
276 return start + len;
277}
278
279/**
280 * lookup_extent_mapping - lookup extent_map
281 * @tree: tree to lookup in
282 * @start: byte offset to start the search
283 * @len: length of the lookup range
284 *
285 * Find and return the first extent_map struct in @tree that intersects the
286 * [start, start + len) range. There may be additional objects in the tree that
287 * intersect, so check the object returned carefully to make sure that no
288 * additional lookups are needed.
289 */
290struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
291 u64 start, u64 len)
292{
293 struct extent_map *em;
294 struct rb_node *rb_node;
295 struct rb_node *prev = NULL;
296 struct rb_node *next = NULL;
297 u64 end = range_end(start, len);
298
299 assert_spin_locked(&tree->lock);
300 rb_node = __tree_search(&tree->map, start, &prev, &next);
301 if (!rb_node && prev) {
302 em = rb_entry(prev, struct extent_map, rb_node);
303 if (end > em->start && start < extent_map_end(em))
304 goto found;
305 }
306 if (!rb_node && next) {
307 em = rb_entry(next, struct extent_map, rb_node);
308 if (end > em->start && start < extent_map_end(em))
309 goto found;
310 }
311 if (!rb_node) {
312 em = NULL;
313 goto out;
314 }
315 if (IS_ERR(rb_node)) {
316 em = ERR_PTR(PTR_ERR(rb_node));
317 goto out;
318 }
319 em = rb_entry(rb_node, struct extent_map, rb_node);
320 if (end > em->start && start < extent_map_end(em))
321 goto found;
322
323 em = NULL;
324 goto out;
325
326found:
327 atomic_inc(&em->refs);
328out:
329 return em;
330}
331EXPORT_SYMBOL(lookup_extent_mapping);
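/*
 * Usage sketch for a hypothetical caller -- note the locking contract
 * checked by assert_spin_locked above:
 *
 *	spin_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	spin_unlock(&tree->lock);
 *	if (em && !IS_ERR(em)) {
 *		use(em);
 *		free_extent_map(em);	-- drop the ref taken for us
 *	}
 */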
332
333/**
334 * remove_extent_mapping - removes an extent_map from the extent tree
335 * @tree: extent tree to remove from
336 * @em: extent map being removed
337 *
338 * Removes @em from @tree. No reference counts are dropped, and no checks
339 * are done to see if the range is in use
340 */
341int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
342{
343 int ret = 0;
344
345 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
346 assert_spin_locked(&tree->lock);
347 rb_erase(&em->rb_node, &tree->map);
348 em->in_tree = 0;
349 return ret;
350}
351EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..fb6eeef06bb0
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
6#define EXTENT_MAP_LAST_BYTE (u64)-4
7#define EXTENT_MAP_HOLE (u64)-3
8#define EXTENT_MAP_INLINE (u64)-2
9#define EXTENT_MAP_DELALLOC (u64)-1
10
11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16
17struct extent_map {
18 struct rb_node rb_node;
19
20 /* all of these are in bytes */
21 u64 start;
22 u64 len;
23 u64 orig_start;
24 u64 block_start;
25 u64 block_len;
26 unsigned long flags;
27 struct block_device *bdev;
28 atomic_t refs;
29 int in_tree;
30};
31
32struct extent_map_tree {
33 struct rb_root map;
34 spinlock_t lock;
35};
36
37static inline u64 extent_map_end(struct extent_map *em)
38{
39 if (em->start + em->len < em->start)
40 return (u64)-1;
41 return em->start + em->len;
42}
43
44static inline u64 extent_map_block_end(struct extent_map *em)
45{
46 if (em->block_start + em->block_len < em->block_start)
47 return (u64)-1;
48 return em->block_start + em->block_len;
49}
50
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57
58struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void);
61void extent_map_exit(void);
62#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..964652435fd1
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,831 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
27#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
28 sizeof(struct btrfs_item) * 2) / \
29 size) - 1))
30
31#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
32 sizeof(struct btrfs_ordered_sum)) / \
33 sizeof(struct btrfs_sector_sum) * \
34 (r)->sectorsize - (r)->sectorsize)
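/*
 * Rough sizing, assuming 4K leaves and 4-byte crc32c sums:
 * MAX_CSUM_ITEMS reserves space for two item headers and then packs
 * about (BTRFS_LEAF_DATA_SIZE - 2 * sizeof(struct btrfs_item)) / 4 - 1
 * checksums per item, i.e. on the order of a thousand 4K blocks
 * covered by a single csum item.
 */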
35
36int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root,
38 u64 objectid, u64 pos,
39 u64 disk_offset, u64 disk_num_bytes,
40 u64 num_bytes, u64 offset, u64 ram_bytes,
41 u8 compression, u8 encryption, u16 other_encoding)
42{
43 int ret = 0;
44 struct btrfs_file_extent_item *item;
45 struct btrfs_key file_key;
46 struct btrfs_path *path;
47 struct extent_buffer *leaf;
48
49 path = btrfs_alloc_path();
50 BUG_ON(!path);
51 file_key.objectid = objectid;
52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item));
57 if (ret < 0)
58 goto out;
59 BUG_ON(ret);
60 leaf = path->nodes[0];
61 item = btrfs_item_ptr(leaf, path->slots[0],
62 struct btrfs_file_extent_item);
63 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
64 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
65 btrfs_set_file_extent_offset(leaf, item, offset);
66 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
67 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
68 btrfs_set_file_extent_generation(leaf, item, trans->transid);
69 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
70 btrfs_set_file_extent_compression(leaf, item, compression);
71 btrfs_set_file_extent_encryption(leaf, item, encryption);
72 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
73
74 btrfs_mark_buffer_dirty(leaf);
75out:
76 btrfs_free_path(path);
77 return ret;
78}
79
80struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root,
82 struct btrfs_path *path,
83 u64 bytenr, int cow)
84{
85 int ret;
86 struct btrfs_key file_key;
87 struct btrfs_key found_key;
88 struct btrfs_csum_item *item;
89 struct extent_buffer *leaf;
90 u64 csum_offset = 0;
91 u16 csum_size =
92 btrfs_super_csum_size(&root->fs_info->super_copy);
93 int csums_in_item;
94
95 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
96 file_key.offset = bytenr;
97 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
98 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
99 if (ret < 0)
100 goto fail;
101 leaf = path->nodes[0];
102 if (ret > 0) {
103 ret = 1;
104 if (path->slots[0] == 0)
105 goto fail;
106 path->slots[0]--;
107 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
108 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
109 goto fail;
110
111 csum_offset = (bytenr - found_key.offset) >>
112 root->fs_info->sb->s_blocksize_bits;
113 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
114 csums_in_item /= csum_size;
115
116 if (csum_offset >= csums_in_item) {
117 ret = -EFBIG;
118 goto fail;
119 }
120 }
121 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
122 item = (struct btrfs_csum_item *)((unsigned char *)item +
123 csum_offset * csum_size);
124 return item;
125fail:
126 if (ret > 0)
127 ret = -ENOENT;
128 return ERR_PTR(ret);
129}
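/*
 * Editor's sketch, not part of the original patch: the lookup above turns
 * a disk bytenr into a byte offset inside a csum item with block-index
 * arithmetic. A minimal stand-alone version, assuming 4KiB blocks
 * (blocksize_bits == 12) and 4-byte crc32c checksums:
 */
#include <stdint.h>

static uint64_t csum_offset_in_item(uint64_t bytenr, uint64_t item_start,
				    int blocksize_bits, uint16_t csum_size)
{
	/* one checksum per block between the item's first block and bytenr */
	return ((bytenr - item_start) >> blocksize_bits) * csum_size;
}
/* e.g. csum_offset_in_item(0x21000, 0x20000, 12, 4) == 4 */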
130
131
132int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
133 struct btrfs_root *root,
134 struct btrfs_path *path, u64 objectid,
135 u64 offset, int mod)
136{
137 int ret;
138 struct btrfs_key file_key;
139 int ins_len = mod < 0 ? -1 : 0;
140 int cow = mod != 0;
141
142 file_key.objectid = objectid;
143 file_key.offset = offset;
144 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
145 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
146 return ret;
147}
148
149
150int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
151 struct bio *bio, u32 *dst)
152{
153 u32 sum;
154 struct bio_vec *bvec = bio->bi_io_vec;
155 int bio_index = 0;
156 u64 offset;
157 u64 item_start_offset = 0;
158 u64 item_last_offset = 0;
159 u64 disk_bytenr;
160 u32 diff;
161 u16 csum_size =
162 btrfs_super_csum_size(&root->fs_info->super_copy);
163 int ret;
164 struct btrfs_path *path;
165 struct btrfs_csum_item *item = NULL;
166 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
167
 168 path = btrfs_alloc_path();
 BUG_ON(!path);
169 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
170 path->reada = 2;
171
172 WARN_ON(bio->bi_vcnt <= 0);
173
174 disk_bytenr = (u64)bio->bi_sector << 9;
175 while (bio_index < bio->bi_vcnt) {
176 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
177 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
178 if (ret == 0)
179 goto found;
180
181 if (!item || disk_bytenr < item_start_offset ||
182 disk_bytenr >= item_last_offset) {
183 struct btrfs_key found_key;
184 u32 item_size;
185
186 if (item)
187 btrfs_release_path(root, path);
188 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
189 path, disk_bytenr, 0);
190 if (IS_ERR(item)) {
191 ret = PTR_ERR(item);
192 if (ret == -ENOENT || ret == -EFBIG)
193 ret = 0;
194 sum = 0;
195 if (BTRFS_I(inode)->root->root_key.objectid ==
196 BTRFS_DATA_RELOC_TREE_OBJECTID) {
197 set_extent_bits(io_tree, offset,
198 offset + bvec->bv_len - 1,
199 EXTENT_NODATASUM, GFP_NOFS);
200 } else {
201 printk(KERN_INFO "btrfs no csum found "
202 "for inode %lu start %llu\n",
203 inode->i_ino,
204 (unsigned long long)offset);
205 }
206 item = NULL;
207 btrfs_release_path(root, path);
208 goto found;
209 }
210 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
211 path->slots[0]);
212
213 item_start_offset = found_key.offset;
214 item_size = btrfs_item_size_nr(path->nodes[0],
215 path->slots[0]);
216 item_last_offset = item_start_offset +
217 (item_size / csum_size) *
218 root->sectorsize;
219 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
220 struct btrfs_csum_item);
221 }
222 /*
223 * this byte range must be able to fit inside
224 * a single leaf so it will also fit inside a u32
225 */
226 diff = disk_bytenr - item_start_offset;
227 diff = diff / root->sectorsize;
228 diff = diff * csum_size;
229
230 read_extent_buffer(path->nodes[0], &sum,
231 ((unsigned long)item) + diff,
232 csum_size);
233found:
234 if (dst)
235 *dst++ = sum;
236 else
237 set_state_private(io_tree, offset, sum);
238 disk_bytenr += bvec->bv_len;
239 bio_index++;
240 bvec++;
241 }
242 btrfs_free_path(path);
243 return 0;
244}
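/*
 * Editor's sketch, not part of the original patch: the loop above walks
 * the bio one bio_vec at a time, deriving the on-disk byte position from
 * the 512-byte starting sector and advancing it by each vector's length.
 * A stand-alone model of that walk, with hypothetical names:
 */
#include <stdint.h>
#include <stddef.h>

static void walk_bio_bytenrs(uint64_t bi_sector, const uint32_t *bv_lens,
			     size_t bi_vcnt, uint64_t *bytenrs)
{
	uint64_t disk_bytenr = bi_sector << 9;	/* sectors are 512 bytes */
	size_t i;

	for (i = 0; i < bi_vcnt; i++) {
		bytenrs[i] = disk_bytenr;	/* csum is looked up here */
		disk_bytenr += bv_lens[i];
	}
}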
245
246int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
247 struct list_head *list)
248{
249 struct btrfs_key key;
250 struct btrfs_path *path;
251 struct extent_buffer *leaf;
252 struct btrfs_ordered_sum *sums;
253 struct btrfs_sector_sum *sector_sum;
254 struct btrfs_csum_item *item;
255 unsigned long offset;
256 int ret;
257 size_t size;
258 u64 csum_end;
259 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
260
261 path = btrfs_alloc_path();
262 BUG_ON(!path);
263
264 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
265 key.offset = start;
266 key.type = BTRFS_EXTENT_CSUM_KEY;
267
268 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
269 if (ret < 0)
270 goto fail;
271 if (ret > 0 && path->slots[0] > 0) {
272 leaf = path->nodes[0];
273 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
274 if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
275 key.type == BTRFS_EXTENT_CSUM_KEY) {
276 offset = (start - key.offset) >>
277 root->fs_info->sb->s_blocksize_bits;
278 if (offset * csum_size <
279 btrfs_item_size_nr(leaf, path->slots[0] - 1))
280 path->slots[0]--;
281 }
282 }
283
284 while (start <= end) {
285 leaf = path->nodes[0];
286 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
287 ret = btrfs_next_leaf(root, path);
288 if (ret < 0)
289 goto fail;
290 if (ret > 0)
291 break;
292 leaf = path->nodes[0];
293 }
294
295 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
296 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
297 key.type != BTRFS_EXTENT_CSUM_KEY)
298 break;
299
300 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
301 if (key.offset > end)
302 break;
303
304 if (key.offset > start)
305 start = key.offset;
306
307 size = btrfs_item_size_nr(leaf, path->slots[0]);
308 csum_end = key.offset + (size / csum_size) * root->sectorsize;
309 if (csum_end <= start) {
310 path->slots[0]++;
311 continue;
312 }
313
314 csum_end = min(csum_end, end + 1);
315 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
316 struct btrfs_csum_item);
317 while (start < csum_end) {
318 size = min_t(size_t, csum_end - start,
319 MAX_ORDERED_SUM_BYTES(root));
320 sums = kzalloc(btrfs_ordered_sum_size(root, size),
321 GFP_NOFS);
322 BUG_ON(!sums);
323
324 sector_sum = sums->sums;
325 sums->bytenr = start;
326 sums->len = size;
327
328 offset = (start - key.offset) >>
329 root->fs_info->sb->s_blocksize_bits;
330 offset *= csum_size;
331
332 while (size > 0) {
333 read_extent_buffer(path->nodes[0],
334 &sector_sum->sum,
335 ((unsigned long)item) +
336 offset, csum_size);
337 sector_sum->bytenr = start;
338
339 size -= root->sectorsize;
340 start += root->sectorsize;
341 offset += csum_size;
342 sector_sum++;
343 }
344 list_add_tail(&sums->list, list);
345 }
346 path->slots[0]++;
347 }
348 ret = 0;
349fail:
350 btrfs_free_path(path);
351 return ret;
352}
353
354int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
355 struct bio *bio, u64 file_start, int contig)
356{
357 struct btrfs_ordered_sum *sums;
358 struct btrfs_sector_sum *sector_sum;
359 struct btrfs_ordered_extent *ordered;
360 char *data;
361 struct bio_vec *bvec = bio->bi_io_vec;
362 int bio_index = 0;
363 unsigned long total_bytes = 0;
364 unsigned long this_sum_bytes = 0;
365 u64 offset;
366 u64 disk_bytenr;
367
368 WARN_ON(bio->bi_vcnt <= 0);
369 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
370 if (!sums)
371 return -ENOMEM;
372
373 sector_sum = sums->sums;
374 disk_bytenr = (u64)bio->bi_sector << 9;
375 sums->len = bio->bi_size;
376 INIT_LIST_HEAD(&sums->list);
377
378 if (contig)
379 offset = file_start;
380 else
381 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
382
383 ordered = btrfs_lookup_ordered_extent(inode, offset);
384 BUG_ON(!ordered);
385 sums->bytenr = ordered->start;
386
387 while (bio_index < bio->bi_vcnt) {
388 if (!contig)
389 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
390
391 if (!contig && (offset >= ordered->file_offset + ordered->len ||
392 offset < ordered->file_offset)) {
393 unsigned long bytes_left;
394 sums->len = this_sum_bytes;
395 this_sum_bytes = 0;
396 btrfs_add_ordered_sum(inode, ordered, sums);
397 btrfs_put_ordered_extent(ordered);
398
399 bytes_left = bio->bi_size - total_bytes;
400
401 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
402 GFP_NOFS);
403 BUG_ON(!sums);
404 sector_sum = sums->sums;
405 sums->len = bytes_left;
406 ordered = btrfs_lookup_ordered_extent(inode, offset);
407 BUG_ON(!ordered);
408 sums->bytenr = ordered->start;
409 }
410
411 data = kmap_atomic(bvec->bv_page, KM_USER0);
412 sector_sum->sum = ~(u32)0;
413 sector_sum->sum = btrfs_csum_data(root,
414 data + bvec->bv_offset,
415 sector_sum->sum,
416 bvec->bv_len);
417 kunmap_atomic(data, KM_USER0);
418 btrfs_csum_final(sector_sum->sum,
419 (char *)&sector_sum->sum);
420 sector_sum->bytenr = disk_bytenr;
421
422 sector_sum++;
423 bio_index++;
424 total_bytes += bvec->bv_len;
425 this_sum_bytes += bvec->bv_len;
426 disk_bytenr += bvec->bv_len;
427 offset += bvec->bv_len;
428 bvec++;
429 }
430 this_sum_bytes = 0;
431 btrfs_add_ordered_sum(inode, ordered, sums);
432 btrfs_put_ordered_extent(ordered);
433 return 0;
434}
435
436/*
 437 * helper function for csum removal; this expects the
 438 * key to describe the csum pointed to by the path, and it expects
 439 * the csum to overlap the range [bytenr, bytenr + len)
440 *
441 * The csum should not be entirely contained in the range and the
442 * range should not be entirely contained in the csum.
443 *
444 * This calls btrfs_truncate_item with the correct args based on the
445 * overlap, and fixes up the key as required.
446 */
447static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
448 struct btrfs_root *root,
449 struct btrfs_path *path,
450 struct btrfs_key *key,
451 u64 bytenr, u64 len)
452{
453 struct extent_buffer *leaf;
454 u16 csum_size =
455 btrfs_super_csum_size(&root->fs_info->super_copy);
456 u64 csum_end;
457 u64 end_byte = bytenr + len;
458 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
459 int ret;
460
461 leaf = path->nodes[0];
462 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
463 csum_end <<= root->fs_info->sb->s_blocksize_bits;
464 csum_end += key->offset;
465
466 if (key->offset < bytenr && csum_end <= end_byte) {
467 /*
468 * [ bytenr - len ]
469 * [ ]
470 * [csum ]
471 * A simple truncate off the end of the item
472 */
473 u32 new_size = (bytenr - key->offset) >> blocksize_bits;
474 new_size *= csum_size;
475 ret = btrfs_truncate_item(trans, root, path, new_size, 1);
476 BUG_ON(ret);
477 } else if (key->offset >= bytenr && csum_end > end_byte &&
478 end_byte > key->offset) {
479 /*
480 * [ bytenr - len ]
481 * [ ]
482 * [csum ]
483 * we need to truncate from the beginning of the csum
484 */
485 u32 new_size = (csum_end - end_byte) >> blocksize_bits;
486 new_size *= csum_size;
487
488 ret = btrfs_truncate_item(trans, root, path, new_size, 0);
489 BUG_ON(ret);
490
491 key->offset = end_byte;
492 ret = btrfs_set_item_key_safe(trans, root, path, key);
493 BUG_ON(ret);
494 } else {
495 BUG();
496 }
497 return 0;
498}
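/*
 * Editor's worked example, not part of the original patch, using
 * hypothetical numbers: assume 4KiB blocks, 4-byte csums and an item
 * covering [0x10000, 0x20000), i.e. 16 csums and 64 bytes of item data.
 *
 *   tail overlap, bytenr == 0x1c000:
 *     new_size = ((0x1c000 - 0x10000) >> 12) * 4 = 12 csums = 48 bytes,
 *     truncated from the end; the key is unchanged.
 *
 *   head overlap, end_byte == 0x14000:
 *     new_size = ((0x20000 - 0x14000) >> 12) * 4 = 12 csums = 48 bytes,
 *     truncated from the front; key->offset moves up to 0x14000.
 */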
499
500/*
501 * deletes the csum items from the csum tree for a given
502 * range of bytes.
503 */
504int btrfs_del_csums(struct btrfs_trans_handle *trans,
505 struct btrfs_root *root, u64 bytenr, u64 len)
506{
507 struct btrfs_path *path;
508 struct btrfs_key key;
509 u64 end_byte = bytenr + len;
510 u64 csum_end;
511 struct extent_buffer *leaf;
512 int ret;
513 u16 csum_size =
514 btrfs_super_csum_size(&root->fs_info->super_copy);
515 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
516
517 root = root->fs_info->csum_root;
518
 519 path = btrfs_alloc_path();
 BUG_ON(!path);
520
521 while (1) {
522 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
523 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY;
525
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) {
528 if (path->slots[0] == 0)
529 goto out;
530 path->slots[0]--;
531 }
532 leaf = path->nodes[0];
533 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
534
535 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
536 key.type != BTRFS_EXTENT_CSUM_KEY) {
537 break;
538 }
539
540 if (key.offset >= end_byte)
541 break;
542
543 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
544 csum_end <<= blocksize_bits;
545 csum_end += key.offset;
546
547 /* this csum ends before we start, we're done */
548 if (csum_end <= bytenr)
549 break;
550
551 /* delete the entire item, it is inside our range */
552 if (key.offset >= bytenr && csum_end <= end_byte) {
553 ret = btrfs_del_item(trans, root, path);
554 BUG_ON(ret);
555 if (key.offset == bytenr)
556 break;
557 } else if (key.offset < bytenr && csum_end > end_byte) {
558 unsigned long offset;
559 unsigned long shift_len;
560 unsigned long item_offset;
561 /*
562 * [ bytenr - len ]
563 * [csum ]
564 *
565 * Our bytes are in the middle of the csum,
566 * we need to split this item and insert a new one.
567 *
568 * But we can't drop the path because the
569 * csum could change, get removed, extended etc.
570 *
571 * The trick here is the max size of a csum item leaves
572 * enough room in the tree block for a single
573 * item header. So, we split the item in place,
574 * adding a new header pointing to the existing
575 * bytes. Then we loop around again and we have
576 * a nicely formed csum item that we can neatly
577 * truncate.
578 */
579 offset = (bytenr - key.offset) >> blocksize_bits;
580 offset *= csum_size;
581
582 shift_len = (len >> blocksize_bits) * csum_size;
583
584 item_offset = btrfs_item_ptr_offset(leaf,
585 path->slots[0]);
586
587 memset_extent_buffer(leaf, 0, item_offset + offset,
588 shift_len);
589 key.offset = bytenr;
590
591 /*
592 * btrfs_split_item returns -EAGAIN when the
593 * item changed size or key
594 */
595 ret = btrfs_split_item(trans, root, path, &key, offset);
596 BUG_ON(ret && ret != -EAGAIN);
597
598 key.offset = end_byte - 1;
599 } else {
600 ret = truncate_one_csum(trans, root, path,
601 &key, bytenr, len);
602 BUG_ON(ret);
603 if (key.offset < bytenr)
604 break;
605 }
606 btrfs_release_path(root, path);
607 }
608out:
609 btrfs_free_path(path);
610 return 0;
611}
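/*
 * Editor's sketch, not part of the original patch: the loop above reduces
 * each csum item to one of four overlap cases against [bytenr, end_byte).
 * A stand-alone classifier mirroring those branches; the names are
 * hypothetical:
 */
#include <stdint.h>

enum csum_overlap { CSUM_DISJOINT, CSUM_COVERED, CSUM_SPLIT, CSUM_TRUNCATE };

static enum csum_overlap classify_csum(uint64_t item_start, uint64_t item_end,
				       uint64_t bytenr, uint64_t end_byte)
{
	if (item_end <= bytenr || item_start >= end_byte)
		return CSUM_DISJOINT;	/* nothing to delete */
	if (item_start >= bytenr && item_end <= end_byte)
		return CSUM_COVERED;	/* delete the whole item */
	if (item_start < bytenr && item_end > end_byte)
		return CSUM_SPLIT;	/* split in place, then truncate */
	return CSUM_TRUNCATE;		/* truncate one end of the item */
}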
612
613int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
614 struct btrfs_root *root,
615 struct btrfs_ordered_sum *sums)
616{
617 u64 bytenr;
618 int ret;
619 struct btrfs_key file_key;
620 struct btrfs_key found_key;
621 u64 next_offset;
622 u64 total_bytes = 0;
623 int found_next;
624 struct btrfs_path *path;
625 struct btrfs_csum_item *item;
626 struct btrfs_csum_item *item_end;
627 struct extent_buffer *leaf = NULL;
628 u64 csum_offset;
629 struct btrfs_sector_sum *sector_sum;
630 u32 nritems;
631 u32 ins_size;
632 char *eb_map;
633 char *eb_token;
634 unsigned long map_len;
635 unsigned long map_start;
636 u16 csum_size =
637 btrfs_super_csum_size(&root->fs_info->super_copy);
638
639 path = btrfs_alloc_path();
640 BUG_ON(!path);
641 sector_sum = sums->sums;
642again:
643 next_offset = (u64)-1;
644 found_next = 0;
645 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
646 file_key.offset = sector_sum->bytenr;
647 bytenr = sector_sum->bytenr;
648 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
649
650 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
651 if (!IS_ERR(item)) {
652 leaf = path->nodes[0];
653 ret = 0;
654 goto found;
655 }
656 ret = PTR_ERR(item);
657 if (ret == -EFBIG) {
658 u32 item_size;
659 /* we found one, but it isn't big enough yet */
660 leaf = path->nodes[0];
661 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
662 if ((item_size / csum_size) >=
663 MAX_CSUM_ITEMS(root, csum_size)) {
664 /* already at max size, make a new one */
665 goto insert;
666 }
667 } else {
668 int slot = path->slots[0] + 1;
669 /* we didn't find a csum item, insert one */
670 nritems = btrfs_header_nritems(path->nodes[0]);
671 if (path->slots[0] >= nritems - 1) {
672 ret = btrfs_next_leaf(root, path);
673 if (ret == 1)
674 found_next = 1;
675 if (ret != 0)
676 goto insert;
677 slot = 0;
678 }
679 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
680 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
681 found_key.type != BTRFS_EXTENT_CSUM_KEY) {
682 found_next = 1;
683 goto insert;
684 }
685 next_offset = found_key.offset;
686 found_next = 1;
687 goto insert;
688 }
689
690 /*
691 * at this point, we know the tree has an item, but it isn't big
692 * enough yet to put our csum in. Grow it
693 */
694 btrfs_release_path(root, path);
695 ret = btrfs_search_slot(trans, root, &file_key, path,
696 csum_size, 1);
697 if (ret < 0)
698 goto fail_unlock;
699
700 if (ret > 0) {
701 if (path->slots[0] == 0)
702 goto insert;
703 path->slots[0]--;
704 }
705
706 leaf = path->nodes[0];
707 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
708 csum_offset = (bytenr - found_key.offset) >>
709 root->fs_info->sb->s_blocksize_bits;
710
711 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
712 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
713 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
714 goto insert;
715 }
716
717 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
718 csum_size) {
719 u32 diff = (csum_offset + 1) * csum_size;
720
721 /*
722 * is the item big enough already? we dropped our lock
723 * before and need to recheck
724 */
725 if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
726 goto csum;
727
728 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
729 if (diff != csum_size)
730 goto insert;
731
732 ret = btrfs_extend_item(trans, root, path, diff);
733 BUG_ON(ret);
734 goto csum;
735 }
736
737insert:
738 btrfs_release_path(root, path);
739 csum_offset = 0;
740 if (found_next) {
741 u64 tmp = total_bytes + root->sectorsize;
742 u64 next_sector = sector_sum->bytenr;
743 struct btrfs_sector_sum *next = sector_sum + 1;
744
745 while (tmp < sums->len) {
746 if (next_sector + root->sectorsize != next->bytenr)
747 break;
748 tmp += root->sectorsize;
749 next_sector = next->bytenr;
750 next++;
751 }
752 tmp = min(tmp, next_offset - file_key.offset);
753 tmp >>= root->fs_info->sb->s_blocksize_bits;
754 tmp = max((u64)1, tmp);
755 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
756 ins_size = csum_size * tmp;
757 } else {
758 ins_size = csum_size;
759 }
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size);
762 if (ret < 0)
763 goto fail_unlock;
764 if (ret != 0) {
765 WARN_ON(1);
766 goto fail_unlock;
767 }
768csum:
769 leaf = path->nodes[0];
770 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
771 ret = 0;
772 item = (struct btrfs_csum_item *)((unsigned char *)item +
773 csum_offset * csum_size);
774found:
775 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL;
779 cond_resched();
780next_sector:
781
782 if (!eb_token ||
783 (unsigned long)item + csum_size >= map_start + map_len) {
784 int err;
785
786 if (eb_token)
787 unmap_extent_buffer(leaf, eb_token, KM_USER1);
788 eb_token = NULL;
789 err = map_private_extent_buffer(leaf, (unsigned long)item,
790 csum_size,
791 &eb_token, &eb_map,
792 &map_start, &map_len, KM_USER1);
793 if (err)
794 eb_token = NULL;
795 }
796 if (eb_token) {
797 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
798 &sector_sum->sum, csum_size);
799 } else {
800 write_extent_buffer(leaf, &sector_sum->sum,
801 (unsigned long)item, csum_size);
802 }
803
804 total_bytes += root->sectorsize;
805 sector_sum++;
806 if (total_bytes < sums->len) {
807 item = (struct btrfs_csum_item *)((char *)item +
808 csum_size);
809 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
810 sector_sum->bytenr) {
811 bytenr = sector_sum->bytenr;
812 goto next_sector;
813 }
814 }
815 if (eb_token) {
816 unmap_extent_buffer(leaf, eb_token, KM_USER1);
817 eb_token = NULL;
818 }
819 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path);
823 goto again;
824 }
825out:
826 btrfs_free_path(path);
827 return ret;
828
829fail_unlock:
830 goto out;
831}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..90268334145e
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1288 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes,
49 struct page **prepared_pages,
50 const char __user *buf)
51{
52 long page_fault = 0;
53 int i;
54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57 size_t count = min_t(size_t,
58 PAGE_CACHE_SIZE - offset, write_bytes);
59 struct page *page = prepared_pages[i];
60 fault_in_pages_readable(buf, count);
61
62 /* Copy data from userspace to the current page */
63 kmap(page);
64 page_fault = __copy_from_user(page_address(page) + offset,
65 buf, count);
66 /* Flush processor's dcache for this page */
67 flush_dcache_page(page);
68 kunmap(page);
69 buf += count;
70 write_bytes -= count;
71
72 if (page_fault)
73 break;
74 }
75 return page_fault ? -EFAULT : 0;
76}
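/*
 * Editor's sketch, not part of the original patch: the helper above splits
 * a write at page boundaries; only the first page uses a non-zero offset.
 * A stand-alone model of the per-page byte counts, assuming 4KiB pages:
 */
#include <stdint.h>
#include <stddef.h>

#define PAGE_SZ 4096u

static size_t bytes_for_page(size_t page_index, uint64_t pos, size_t remaining)
{
	size_t offset = page_index == 0 ? (size_t)(pos & (PAGE_SZ - 1)) : 0;
	size_t space = PAGE_SZ - offset;

	return remaining < space ? remaining : space;
}
/* e.g. pos = 100, 5000 bytes total: page 0 takes 3996, page 1 takes 1004 */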
77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
81static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
82{
83 size_t i;
84 for (i = 0; i < num_pages; i++) {
85 if (!pages[i])
86 break;
 87 /* PageChecked is some magic around finding pages that
 88 * have been modified without going through btrfs_set_page_dirty;
 89 * clear it here.
90 */
91 ClearPageChecked(pages[i]);
92 unlock_page(pages[i]);
93 mark_page_accessed(pages[i]);
94 page_cache_release(pages[i]);
95 }
96}
97
98/*
99 * after copy_from_user, pages need to be dirtied and we need to make
100 * sure holes are created between the current EOF and the start of
101 * any next extents (if required).
102 *
103 * this also makes the decision about creating an inline extent vs
104 * doing real data extents, marking pages dirty and delalloc as required.
105 */
106static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root,
108 struct file *file,
109 struct page **pages,
110 size_t num_pages,
111 loff_t pos,
112 size_t write_bytes)
113{
114 int err = 0;
115 int i;
116 struct inode *inode = fdentry(file)->d_inode;
117 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118 u64 hint_byte;
119 u64 num_bytes;
120 u64 start_pos;
121 u64 end_of_last_block;
122 u64 end_pos = pos + write_bytes;
123 loff_t isize = i_size_read(inode);
124
125 start_pos = pos & ~((u64)root->sectorsize - 1);
126 num_bytes = (write_bytes + pos - start_pos +
127 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
128
129 end_of_last_block = start_pos + num_bytes - 1;
130
131 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
132 trans = btrfs_join_transaction(root, 1);
133 if (!trans) {
134 err = -ENOMEM;
135 goto out_unlock;
136 }
137 btrfs_set_trans_block_group(trans, inode);
138 hint_byte = 0;
139
140 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
141
142 /* check for reserved extents on each page, we don't want
143 * to reset the delalloc bit on things that already have
144 * extents reserved.
145 */
146 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
147 for (i = 0; i < num_pages; i++) {
148 struct page *p = pages[i];
149 SetPageUptodate(p);
150 ClearPageChecked(p);
151 set_page_dirty(p);
152 }
153 if (end_pos > isize) {
154 i_size_write(inode, end_pos);
155 btrfs_update_inode(trans, root, inode);
156 }
157 err = btrfs_end_transaction(trans, root);
158out_unlock:
159 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
160 return err;
161}
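/*
 * Editor's sketch, not part of the original patch: the masking above
 * rounds the written range out to sector boundaries. With a power-of-two
 * sectorsize this is the usual round-down / round-up pair:
 */
#include <stdint.h>

static uint64_t round_down_sector(uint64_t pos, uint64_t sectorsize)
{
	return pos & ~(sectorsize - 1);
}

static uint64_t round_up_sector(uint64_t end, uint64_t sectorsize)
{
	return (end + sectorsize - 1) & ~(sectorsize - 1);
}
/* e.g. with 4096: pos 5000 rounds down to 4096, end 9000 rounds up to 12288 */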
162
163/*
164 * this drops all the extents in the cache that intersect the range
165 * [start, end]. Existing extents are split as required.
166 */
167int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
168 int skip_pinned)
169{
170 struct extent_map *em;
171 struct extent_map *split = NULL;
172 struct extent_map *split2 = NULL;
173 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
174 u64 len = end - start + 1;
175 int ret;
176 int testend = 1;
177 unsigned long flags;
178 int compressed = 0;
179
180 WARN_ON(end < start);
181 if (end == (u64)-1) {
182 len = (u64)-1;
183 testend = 0;
184 }
185 while (1) {
186 if (!split)
187 split = alloc_extent_map(GFP_NOFS);
188 if (!split2)
189 split2 = alloc_extent_map(GFP_NOFS);
190
191 spin_lock(&em_tree->lock);
192 em = lookup_extent_mapping(em_tree, start, len);
193 if (!em) {
194 spin_unlock(&em_tree->lock);
195 break;
196 }
197 flags = em->flags;
198 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
199 spin_unlock(&em_tree->lock);
200 if (em->start <= start &&
201 (!testend || em->start + em->len >= start + len)) {
202 free_extent_map(em);
203 break;
204 }
205 if (start < em->start) {
206 len = em->start - start;
207 } else {
208 len = start + len - (em->start + em->len);
209 start = em->start + em->len;
210 }
211 free_extent_map(em);
212 continue;
213 }
214 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
215 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
216 remove_extent_mapping(em_tree, em);
217
218 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
219 em->start < start) {
220 split->start = em->start;
221 split->len = start - em->start;
222 split->orig_start = em->orig_start;
223 split->block_start = em->block_start;
224
225 if (compressed)
226 split->block_len = em->block_len;
227 else
228 split->block_len = split->len;
229
230 split->bdev = em->bdev;
231 split->flags = flags;
232 ret = add_extent_mapping(em_tree, split);
233 BUG_ON(ret);
234 free_extent_map(split);
235 split = split2;
236 split2 = NULL;
237 }
238 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
239 testend && em->start + em->len > start + len) {
240 u64 diff = start + len - em->start;
241
242 split->start = start + len;
243 split->len = em->start + em->len - (start + len);
244 split->bdev = em->bdev;
245 split->flags = flags;
246
247 if (compressed) {
248 split->block_len = em->block_len;
249 split->block_start = em->block_start;
250 split->orig_start = em->orig_start;
251 } else {
252 split->block_len = split->len;
253 split->block_start = em->block_start + diff;
254 split->orig_start = split->start;
255 }
256
257 ret = add_extent_mapping(em_tree, split);
258 BUG_ON(ret);
259 free_extent_map(split);
260 split = NULL;
261 }
262 spin_unlock(&em_tree->lock);
263
264 /* once for us */
265 free_extent_map(em);
 266 /* once for the tree */
267 free_extent_map(em);
268 }
269 if (split)
270 free_extent_map(split);
271 if (split2)
272 free_extent_map(split2);
273 return 0;
274}
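/*
 * Editor's sketch, not part of the original patch: dropping
 * [start, start + len) from a cached mapping can leave a front piece, a
 * tail piece, or both. A stand-alone computation of the surviving pieces
 * (uncompressed case, where block_len simply tracks len):
 */
#include <stdint.h>

struct piece { uint64_t start, len; };

static int split_pieces(uint64_t em_start, uint64_t em_len,
			uint64_t start, uint64_t len, struct piece out[2])
{
	uint64_t em_end = em_start + em_len, end = start + len;
	int n = 0;

	if (em_start < start) {			/* front survives */
		out[n].start = em_start;
		out[n++].len = start - em_start;
	}
	if (em_end > end) {			/* tail survives */
		out[n].start = end;
		out[n++].len = em_end - end;
	}
	return n;
}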
275
276int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
277{
278 return 0;
279#if 0
280 struct btrfs_path *path;
281 struct btrfs_key found_key;
282 struct extent_buffer *leaf;
283 struct btrfs_file_extent_item *extent;
284 u64 last_offset = 0;
285 int nritems;
286 int slot;
287 int found_type;
288 int ret;
289 int err = 0;
290 u64 extent_end = 0;
291
292 path = btrfs_alloc_path();
293 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
294 last_offset, 0);
295 while (1) {
296 nritems = btrfs_header_nritems(path->nodes[0]);
297 if (path->slots[0] >= nritems) {
298 ret = btrfs_next_leaf(root, path);
299 if (ret)
300 goto out;
301 nritems = btrfs_header_nritems(path->nodes[0]);
302 }
303 slot = path->slots[0];
304 leaf = path->nodes[0];
305 btrfs_item_key_to_cpu(leaf, &found_key, slot);
306 if (found_key.objectid != inode->i_ino)
307 break;
308 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
309 goto out;
310
311 if (found_key.offset < last_offset) {
312 WARN_ON(1);
313 btrfs_print_leaf(root, leaf);
314 printk(KERN_ERR "inode %lu found offset %llu "
315 "expected %llu\n", inode->i_ino,
316 (unsigned long long)found_key.offset,
317 (unsigned long long)last_offset);
318 err = 1;
319 goto out;
320 }
321 extent = btrfs_item_ptr(leaf, slot,
322 struct btrfs_file_extent_item);
323 found_type = btrfs_file_extent_type(leaf, extent);
324 if (found_type == BTRFS_FILE_EXTENT_REG) {
325 extent_end = found_key.offset +
326 btrfs_file_extent_num_bytes(leaf, extent);
327 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
328 struct btrfs_item *item;
329 item = btrfs_item_nr(leaf, slot);
330 extent_end = found_key.offset +
331 btrfs_file_extent_inline_len(leaf, extent);
332 extent_end = (extent_end + root->sectorsize - 1) &
333 ~((u64)root->sectorsize - 1);
334 }
335 last_offset = extent_end;
336 path->slots[0]++;
337 }
338 if (0 && last_offset < inode->i_size) {
339 WARN_ON(1);
340 btrfs_print_leaf(root, leaf);
341 printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
342 inode->i_ino, (unsigned long long)last_offset,
343 (unsigned long long)inode->i_size);
344 err = 1;
345
346 }
347out:
348 btrfs_free_path(path);
349 return err;
350#endif
351}
352
353/*
354 * this is very complex, but the basic idea is to drop all extents
 355 * in the range start - end. hint_byte is filled in with a byte number
356 * that would be a good hint to the block allocator for this file.
357 *
358 * If an extent intersects the range but is not entirely inside the range
359 * it is either truncated or split. Anything entirely inside the range
360 * is deleted from the tree.
361 *
362 * inline_limit is used to tell this code which offsets in the file to keep
363 * if they contain inline extents.
364 */
365noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
366 struct btrfs_root *root, struct inode *inode,
367 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
368{
369 u64 extent_end = 0;
370 u64 locked_end = end;
371 u64 search_start = start;
372 u64 leaf_start;
373 u64 ram_bytes = 0;
374 u64 orig_parent = 0;
375 u64 disk_bytenr = 0;
376 u8 compression;
377 u8 encryption;
378 u16 other_encoding = 0;
379 u64 root_gen;
380 u64 root_owner;
381 struct extent_buffer *leaf;
382 struct btrfs_file_extent_item *extent;
383 struct btrfs_path *path;
384 struct btrfs_key key;
385 struct btrfs_file_extent_item old;
386 int keep;
387 int slot;
388 int bookend;
389 int found_type = 0;
390 int found_extent;
391 int found_inline;
392 int recow;
393 int ret;
394
395 inline_limit = 0;
396 btrfs_drop_extent_cache(inode, start, end - 1, 0);
397
398 path = btrfs_alloc_path();
399 if (!path)
400 return -ENOMEM;
401 while (1) {
402 recow = 0;
403 btrfs_release_path(root, path);
404 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
405 search_start, -1);
406 if (ret < 0)
407 goto out;
408 if (ret > 0) {
409 if (path->slots[0] == 0) {
410 ret = 0;
411 goto out;
412 }
413 path->slots[0]--;
414 }
415next_slot:
416 keep = 0;
417 bookend = 0;
418 found_extent = 0;
419 found_inline = 0;
420 leaf_start = 0;
421 root_gen = 0;
422 root_owner = 0;
423 compression = 0;
424 encryption = 0;
425 extent = NULL;
426 leaf = path->nodes[0];
427 slot = path->slots[0];
428 ret = 0;
429 btrfs_item_key_to_cpu(leaf, &key, slot);
430 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
431 key.offset >= end) {
432 goto out;
433 }
434 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
435 key.objectid != inode->i_ino) {
436 goto out;
437 }
438 if (recow) {
439 search_start = max(key.offset, start);
440 continue;
441 }
442 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
443 extent = btrfs_item_ptr(leaf, slot,
444 struct btrfs_file_extent_item);
445 found_type = btrfs_file_extent_type(leaf, extent);
446 compression = btrfs_file_extent_compression(leaf,
447 extent);
448 encryption = btrfs_file_extent_encryption(leaf,
449 extent);
450 other_encoding = btrfs_file_extent_other_encoding(leaf,
451 extent);
452 if (found_type == BTRFS_FILE_EXTENT_REG ||
453 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
454 extent_end =
455 btrfs_file_extent_disk_bytenr(leaf,
456 extent);
457 if (extent_end)
458 *hint_byte = extent_end;
459
460 extent_end = key.offset +
461 btrfs_file_extent_num_bytes(leaf, extent);
462 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
463 extent);
464 found_extent = 1;
465 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
466 found_inline = 1;
467 extent_end = key.offset +
468 btrfs_file_extent_inline_len(leaf, extent);
469 }
470 } else {
471 extent_end = search_start;
472 }
473
474 /* we found nothing we can drop */
475 if ((!found_extent && !found_inline) ||
476 search_start >= extent_end) {
477 int nextret;
478 u32 nritems;
479 nritems = btrfs_header_nritems(leaf);
480 if (slot >= nritems - 1) {
481 nextret = btrfs_next_leaf(root, path);
482 if (nextret)
483 goto out;
484 recow = 1;
485 } else {
486 path->slots[0]++;
487 }
488 goto next_slot;
489 }
490
491 if (end <= extent_end && start >= key.offset && found_inline)
492 *hint_byte = EXTENT_MAP_INLINE;
493
494 if (found_extent) {
495 read_extent_buffer(leaf, &old, (unsigned long)extent,
496 sizeof(old));
497 root_gen = btrfs_header_generation(leaf);
498 root_owner = btrfs_header_owner(leaf);
499 leaf_start = leaf->start;
500 }
501
502 if (end < extent_end && end >= key.offset) {
503 bookend = 1;
504 if (found_inline && start <= key.offset)
505 keep = 1;
506 }
507
508 if (bookend && found_extent) {
509 if (locked_end < extent_end) {
510 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
511 locked_end, extent_end - 1,
512 GFP_NOFS);
513 if (!ret) {
514 btrfs_release_path(root, path);
515 lock_extent(&BTRFS_I(inode)->io_tree,
516 locked_end, extent_end - 1,
517 GFP_NOFS);
518 locked_end = extent_end;
519 continue;
520 }
521 locked_end = extent_end;
522 }
523 orig_parent = path->nodes[0]->start;
524 disk_bytenr = le64_to_cpu(old.disk_bytenr);
525 if (disk_bytenr != 0) {
526 ret = btrfs_inc_extent_ref(trans, root,
527 disk_bytenr,
528 le64_to_cpu(old.disk_num_bytes),
529 orig_parent, root->root_key.objectid,
530 trans->transid, inode->i_ino);
531 BUG_ON(ret);
532 }
533 }
534
535 if (found_inline) {
536 u64 mask = root->sectorsize - 1;
537 search_start = (extent_end + mask) & ~mask;
538 } else
539 search_start = extent_end;
540
541 /* truncate existing extent */
542 if (start > key.offset) {
543 u64 new_num;
544 u64 old_num;
545 keep = 1;
546 WARN_ON(start & (root->sectorsize - 1));
547 if (found_extent) {
548 new_num = start - key.offset;
549 old_num = btrfs_file_extent_num_bytes(leaf,
550 extent);
551 *hint_byte =
552 btrfs_file_extent_disk_bytenr(leaf,
553 extent);
554 if (btrfs_file_extent_disk_bytenr(leaf,
555 extent)) {
556 inode_sub_bytes(inode, old_num -
557 new_num);
558 }
559 btrfs_set_file_extent_num_bytes(leaf,
560 extent, new_num);
561 btrfs_mark_buffer_dirty(leaf);
562 } else if (key.offset < inline_limit &&
563 (end > extent_end) &&
564 (inline_limit < extent_end)) {
565 u32 new_size;
566 new_size = btrfs_file_extent_calc_inline_size(
567 inline_limit - key.offset);
568 inode_sub_bytes(inode, extent_end -
569 inline_limit);
570 btrfs_set_file_extent_ram_bytes(leaf, extent,
571 new_size);
572 if (!compression && !encryption) {
573 btrfs_truncate_item(trans, root, path,
574 new_size, 1);
575 }
576 }
577 }
578 /* delete the entire extent */
579 if (!keep) {
580 if (found_inline)
581 inode_sub_bytes(inode, extent_end -
582 key.offset);
583 ret = btrfs_del_item(trans, root, path);
584 /* TODO update progress marker and return */
585 BUG_ON(ret);
586 extent = NULL;
587 btrfs_release_path(root, path);
588 /* the extent will be freed later */
589 }
590 if (bookend && found_inline && start <= key.offset) {
591 u32 new_size;
592 new_size = btrfs_file_extent_calc_inline_size(
593 extent_end - end);
594 inode_sub_bytes(inode, end - key.offset);
595 btrfs_set_file_extent_ram_bytes(leaf, extent,
596 new_size);
597 if (!compression && !encryption)
598 ret = btrfs_truncate_item(trans, root, path,
599 new_size, 0);
600 BUG_ON(ret);
601 }
602 /* create bookend, splitting the extent in two */
603 if (bookend && found_extent) {
604 struct btrfs_key ins;
605 ins.objectid = inode->i_ino;
606 ins.offset = end;
607 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
608
609 btrfs_release_path(root, path);
610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
611 sizeof(*extent));
612 BUG_ON(ret);
613
614 leaf = path->nodes[0];
615 extent = btrfs_item_ptr(leaf, path->slots[0],
616 struct btrfs_file_extent_item);
617 write_extent_buffer(leaf, &old,
618 (unsigned long)extent, sizeof(old));
619
620 btrfs_set_file_extent_compression(leaf, extent,
621 compression);
622 btrfs_set_file_extent_encryption(leaf, extent,
623 encryption);
624 btrfs_set_file_extent_other_encoding(leaf, extent,
625 other_encoding);
626 btrfs_set_file_extent_offset(leaf, extent,
627 le64_to_cpu(old.offset) + end - key.offset);
628 WARN_ON(le64_to_cpu(old.num_bytes) <
629 (extent_end - end));
630 btrfs_set_file_extent_num_bytes(leaf, extent,
631 extent_end - end);
632
633 /*
634 * set the ram bytes to the size of the full extent
635 * before splitting. This is a worst case flag,
 636 * but it's the best we can do because we don't know
637 * how splitting affects compression
638 */
639 btrfs_set_file_extent_ram_bytes(leaf, extent,
640 ram_bytes);
641 btrfs_set_file_extent_type(leaf, extent, found_type);
642
643 btrfs_mark_buffer_dirty(path->nodes[0]);
644
645 if (disk_bytenr != 0) {
646 ret = btrfs_update_extent_ref(trans, root,
647 disk_bytenr, orig_parent,
648 leaf->start,
649 root->root_key.objectid,
650 trans->transid, ins.objectid);
651
652 BUG_ON(ret);
653 }
654 btrfs_release_path(root, path);
655 if (disk_bytenr != 0)
656 inode_add_bytes(inode, extent_end - end);
657 }
658
659 if (found_extent && !keep) {
660 u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
661
662 if (old_disk_bytenr != 0) {
663 inode_sub_bytes(inode,
664 le64_to_cpu(old.num_bytes));
665 ret = btrfs_free_extent(trans, root,
666 old_disk_bytenr,
667 le64_to_cpu(old.disk_num_bytes),
668 leaf_start, root_owner,
669 root_gen, key.objectid, 0);
670 BUG_ON(ret);
671 *hint_byte = old_disk_bytenr;
672 }
673 }
674
675 if (search_start >= end) {
676 ret = 0;
677 goto out;
678 }
679 }
680out:
681 btrfs_free_path(path);
682 if (locked_end > end) {
683 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
684 GFP_NOFS);
685 }
686 btrfs_check_file(root, inode);
687 return ret;
688}
689
690static int extent_mergeable(struct extent_buffer *leaf, int slot,
691 u64 objectid, u64 bytenr, u64 *start, u64 *end)
692{
693 struct btrfs_file_extent_item *fi;
694 struct btrfs_key key;
695 u64 extent_end;
696
697 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
698 return 0;
699
700 btrfs_item_key_to_cpu(leaf, &key, slot);
701 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
702 return 0;
703
704 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
705 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
706 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
707 btrfs_file_extent_compression(leaf, fi) ||
708 btrfs_file_extent_encryption(leaf, fi) ||
709 btrfs_file_extent_other_encoding(leaf, fi))
710 return 0;
711
712 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
713 if ((*start && *start != key.offset) || (*end && *end != extent_end))
714 return 0;
715
716 *start = key.offset;
717 *end = extent_end;
718 return 1;
719}
720
721/*
722 * Mark extent in the range start - end as written.
723 *
724 * This changes extent type from 'pre-allocated' to 'regular'. If only
725 * part of extent is marked as written, the extent will be split into
726 * two or three.
727 */
728int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
729 struct btrfs_root *root,
730 struct inode *inode, u64 start, u64 end)
731{
732 struct extent_buffer *leaf;
733 struct btrfs_path *path;
734 struct btrfs_file_extent_item *fi;
735 struct btrfs_key key;
736 u64 bytenr;
737 u64 num_bytes;
738 u64 extent_end;
739 u64 extent_offset;
740 u64 other_start;
741 u64 other_end;
742 u64 split = start;
743 u64 locked_end = end;
744 u64 orig_parent;
745 int extent_type;
746 int split_end = 1;
747 int ret;
748
749 btrfs_drop_extent_cache(inode, start, end - 1, 0);
750
751 path = btrfs_alloc_path();
752 BUG_ON(!path);
753again:
754 key.objectid = inode->i_ino;
755 key.type = BTRFS_EXTENT_DATA_KEY;
756 if (split == start)
757 key.offset = split;
758 else
759 key.offset = split - 1;
760
761 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
762 if (ret > 0 && path->slots[0] > 0)
763 path->slots[0]--;
764
765 leaf = path->nodes[0];
766 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
767 BUG_ON(key.objectid != inode->i_ino ||
768 key.type != BTRFS_EXTENT_DATA_KEY);
769 fi = btrfs_item_ptr(leaf, path->slots[0],
770 struct btrfs_file_extent_item);
771 extent_type = btrfs_file_extent_type(leaf, fi);
772 BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
773 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
774 BUG_ON(key.offset > start || extent_end < end);
775
776 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
777 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
778 extent_offset = btrfs_file_extent_offset(leaf, fi);
779
780 if (key.offset == start)
781 split = end;
782
783 if (key.offset == start && extent_end == end) {
784 int del_nr = 0;
785 int del_slot = 0;
786 u64 leaf_owner = btrfs_header_owner(leaf);
787 u64 leaf_gen = btrfs_header_generation(leaf);
788 other_start = end;
789 other_end = 0;
790 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
791 bytenr, &other_start, &other_end)) {
792 extent_end = other_end;
793 del_slot = path->slots[0] + 1;
794 del_nr++;
795 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
796 leaf->start, leaf_owner,
797 leaf_gen, inode->i_ino, 0);
798 BUG_ON(ret);
799 }
800 other_start = 0;
801 other_end = start;
802 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
803 bytenr, &other_start, &other_end)) {
804 key.offset = other_start;
805 del_slot = path->slots[0];
806 del_nr++;
807 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
808 leaf->start, leaf_owner,
809 leaf_gen, inode->i_ino, 0);
810 BUG_ON(ret);
811 }
812 split_end = 0;
813 if (del_nr == 0) {
814 btrfs_set_file_extent_type(leaf, fi,
815 BTRFS_FILE_EXTENT_REG);
816 goto done;
817 }
818
819 fi = btrfs_item_ptr(leaf, del_slot - 1,
820 struct btrfs_file_extent_item);
821 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
822 btrfs_set_file_extent_num_bytes(leaf, fi,
823 extent_end - key.offset);
824 btrfs_mark_buffer_dirty(leaf);
825
826 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
827 BUG_ON(ret);
828 goto done;
829 } else if (split == start) {
830 if (locked_end < extent_end) {
831 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
832 locked_end, extent_end - 1, GFP_NOFS);
833 if (!ret) {
834 btrfs_release_path(root, path);
835 lock_extent(&BTRFS_I(inode)->io_tree,
836 locked_end, extent_end - 1, GFP_NOFS);
837 locked_end = extent_end;
838 goto again;
839 }
840 locked_end = extent_end;
841 }
842 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
843 extent_offset += split - key.offset;
844 } else {
845 BUG_ON(key.offset != start);
846 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
847 split - key.offset);
848 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
849 key.offset = split;
850 btrfs_set_item_key_safe(trans, root, path, &key);
851 extent_end = split;
852 }
853
854 if (extent_end == end) {
855 split_end = 0;
856 extent_type = BTRFS_FILE_EXTENT_REG;
857 }
858 if (extent_end == end && split == start) {
859 other_start = end;
860 other_end = 0;
861 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
862 bytenr, &other_start, &other_end)) {
863 path->slots[0]++;
864 fi = btrfs_item_ptr(leaf, path->slots[0],
865 struct btrfs_file_extent_item);
866 key.offset = split;
867 btrfs_set_item_key_safe(trans, root, path, &key);
868 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
869 btrfs_set_file_extent_num_bytes(leaf, fi,
870 other_end - split);
871 goto done;
872 }
873 }
874 if (extent_end == end && split == end) {
875 other_start = 0;
876 other_end = start;
877 if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
878 bytenr, &other_start, &other_end)) {
879 path->slots[0]--;
880 fi = btrfs_item_ptr(leaf, path->slots[0],
881 struct btrfs_file_extent_item);
882 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
883 other_start);
884 goto done;
885 }
886 }
887
888 btrfs_mark_buffer_dirty(leaf);
889
890 orig_parent = leaf->start;
891 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
892 orig_parent, root->root_key.objectid,
893 trans->transid, inode->i_ino);
894 BUG_ON(ret);
895 btrfs_release_path(root, path);
896
897 key.offset = start;
898 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
899 BUG_ON(ret);
900
901 leaf = path->nodes[0];
902 fi = btrfs_item_ptr(leaf, path->slots[0],
903 struct btrfs_file_extent_item);
904 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
905 btrfs_set_file_extent_type(leaf, fi, extent_type);
906 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
907 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
908 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
909 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
910 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
911 btrfs_set_file_extent_compression(leaf, fi, 0);
912 btrfs_set_file_extent_encryption(leaf, fi, 0);
913 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
914
915 if (orig_parent != leaf->start) {
916 ret = btrfs_update_extent_ref(trans, root, bytenr,
917 orig_parent, leaf->start,
918 root->root_key.objectid,
919 trans->transid, inode->i_ino);
920 BUG_ON(ret);
921 }
922done:
923 btrfs_mark_buffer_dirty(leaf);
924 btrfs_release_path(root, path);
925 if (split_end && split == start) {
926 split = end;
927 goto again;
928 }
929 if (locked_end > end) {
930 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
931 GFP_NOFS);
932 }
933 btrfs_free_path(path);
934 return 0;
935}
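/*
 * Editor's sketch, not part of the original patch: marking [start, end)
 * written inside a preallocated extent [key_offset, extent_end) needs
 * zero, one or two split points, depending on how the ranges line up
 * ("the extent will be split into two or three" above):
 */
#include <stdint.h>

static int num_split_points(uint64_t key_offset, uint64_t extent_end,
			    uint64_t start, uint64_t end)
{
	int n = 0;

	if (start > key_offset)
		n++;		/* keep a prealloc piece in front */
	if (end < extent_end)
		n++;		/* keep a prealloc piece behind */
	return n;		/* 0: flip the type in place; 1 or 2: split */
}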
936
937/*
 938 * this gets pages into the page cache and locks them down; it also properly
939 * waits for data=ordered extents to finish before allowing the pages to be
940 * modified.
941 */
942static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
943 struct page **pages, size_t num_pages,
944 loff_t pos, unsigned long first_index,
945 unsigned long last_index, size_t write_bytes)
946{
947 int i;
948 unsigned long index = pos >> PAGE_CACHE_SHIFT;
949 struct inode *inode = fdentry(file)->d_inode;
950 int err = 0;
951 u64 start_pos;
952 u64 last_pos;
953
954 start_pos = pos & ~((u64)root->sectorsize - 1);
955 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
956
957 if (start_pos > inode->i_size) {
958 err = btrfs_cont_expand(inode, start_pos);
959 if (err)
960 return err;
961 }
962
963 memset(pages, 0, num_pages * sizeof(struct page *));
964again:
965 for (i = 0; i < num_pages; i++) {
966 pages[i] = grab_cache_page(inode->i_mapping, index + i);
967 if (!pages[i]) {
968 err = -ENOMEM;
969 BUG_ON(1);
970 }
971 wait_on_page_writeback(pages[i]);
972 }
973 if (start_pos < inode->i_size) {
974 struct btrfs_ordered_extent *ordered;
975 lock_extent(&BTRFS_I(inode)->io_tree,
976 start_pos, last_pos - 1, GFP_NOFS);
977 ordered = btrfs_lookup_first_ordered_extent(inode,
978 last_pos - 1);
979 if (ordered &&
980 ordered->file_offset + ordered->len > start_pos &&
981 ordered->file_offset < last_pos) {
982 btrfs_put_ordered_extent(ordered);
983 unlock_extent(&BTRFS_I(inode)->io_tree,
984 start_pos, last_pos - 1, GFP_NOFS);
985 for (i = 0; i < num_pages; i++) {
986 unlock_page(pages[i]);
987 page_cache_release(pages[i]);
988 }
989 btrfs_wait_ordered_range(inode, start_pos,
990 last_pos - start_pos);
991 goto again;
992 }
993 if (ordered)
994 btrfs_put_ordered_extent(ordered);
995
996 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
997 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
998 GFP_NOFS);
999 unlock_extent(&BTRFS_I(inode)->io_tree,
1000 start_pos, last_pos - 1, GFP_NOFS);
1001 }
1002 for (i = 0; i < num_pages; i++) {
1003 clear_page_dirty_for_io(pages[i]);
1004 set_page_extent_mapped(pages[i]);
1005 WARN_ON(!PageLocked(pages[i]));
1006 }
1007 return 0;
1008}
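/*
 * Editor's sketch, not part of the original patch: the retry above
 * triggers whenever an ordered extent intersects the locked page range.
 * The intersection test, stand-alone, over half-open ranges
 * [file_offset, file_offset + len) vs [start_pos, last_pos):
 */
#include <stdint.h>

static int ordered_overlaps(uint64_t file_offset, uint64_t len,
			    uint64_t start_pos, uint64_t last_pos)
{
	return file_offset + len > start_pos && file_offset < last_pos;
}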
1009
1010static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1011 size_t count, loff_t *ppos)
1012{
1013 loff_t pos;
1014 loff_t start_pos;
1015 ssize_t num_written = 0;
1016 ssize_t err = 0;
1017 int ret = 0;
1018 struct inode *inode = fdentry(file)->d_inode;
1019 struct btrfs_root *root = BTRFS_I(inode)->root;
1020 struct page **pages = NULL;
1021 int nrptrs;
1022 struct page *pinned[2];
1023 unsigned long first_index;
1024 unsigned long last_index;
1025 int will_write;
1026
1027 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
1028 (file->f_flags & O_DIRECT));
1029
1030 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
1031 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1032 pinned[0] = NULL;
1033 pinned[1] = NULL;
1034
1035 pos = *ppos;
1036 start_pos = pos;
1037
1038 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1039 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1040 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1041 if (err)
1042 goto out_nolock;
1043 if (count == 0)
1044 goto out_nolock;
1045
1046 err = file_remove_suid(file);
1047 if (err)
1048 goto out_nolock;
1049 file_update_time(file);
1050
 1051 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 if (!pages) {
 err = -ENOMEM;
 goto out_nolock;
 }
1052
1053 mutex_lock(&inode->i_mutex);
1054 BTRFS_I(inode)->sequence++;
1055 first_index = pos >> PAGE_CACHE_SHIFT;
1056 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1057
1058 /*
1059 * there are lots of better ways to do this, but this code
1060 * makes sure the first and last page in the file range are
1061 * up to date and ready for cow
1062 */
1063 if ((pos & (PAGE_CACHE_SIZE - 1))) {
1064 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1065 if (!PageUptodate(pinned[0])) {
1066 ret = btrfs_readpage(NULL, pinned[0]);
1067 BUG_ON(ret);
1068 wait_on_page_locked(pinned[0]);
1069 } else {
1070 unlock_page(pinned[0]);
1071 }
1072 }
1073 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
1074 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1075 if (!PageUptodate(pinned[1])) {
1076 ret = btrfs_readpage(NULL, pinned[1]);
1077 BUG_ON(ret);
1078 wait_on_page_locked(pinned[1]);
1079 } else {
1080 unlock_page(pinned[1]);
1081 }
1082 }
1083
1084 while (count > 0) {
1085 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1086 size_t write_bytes = min(count, nrptrs *
1087 (size_t)PAGE_CACHE_SIZE -
1088 offset);
1089 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1090 PAGE_CACHE_SHIFT;
1091
1092 WARN_ON(num_pages > nrptrs);
1093 memset(pages, 0, sizeof(struct page *) * nrptrs);
1094
1095 ret = btrfs_check_free_space(root, write_bytes, 0);
1096 if (ret)
1097 goto out;
1098
1099 ret = prepare_pages(root, file, pages, num_pages,
1100 pos, first_index, last_index,
1101 write_bytes);
1102 if (ret)
1103 goto out;
1104
1105 ret = btrfs_copy_from_user(pos, num_pages,
1106 write_bytes, pages, buf);
1107 if (ret) {
1108 btrfs_drop_pages(pages, num_pages);
1109 goto out;
1110 }
1111
1112 ret = dirty_and_release_pages(NULL, root, file, pages,
1113 num_pages, pos, write_bytes);
1114 btrfs_drop_pages(pages, num_pages);
1115 if (ret)
1116 goto out;
1117
1118 if (will_write) {
1119 btrfs_fdatawrite_range(inode->i_mapping, pos,
1120 pos + write_bytes - 1,
1121 WB_SYNC_NONE);
1122 } else {
1123 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1124 num_pages);
1125 if (num_pages <
1126 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1127 btrfs_btree_balance_dirty(root, 1);
1128 btrfs_throttle(root);
1129 }
1130
1131 buf += write_bytes;
1132 count -= write_bytes;
1133 pos += write_bytes;
1134 num_written += write_bytes;
1135
1136 cond_resched();
1137 }
1138out:
1139 mutex_unlock(&inode->i_mutex);
1140
1141out_nolock:
1142 kfree(pages);
1143 if (pinned[0])
1144 page_cache_release(pinned[0]);
1145 if (pinned[1])
1146 page_cache_release(pinned[1]);
1147 *ppos = pos;
1148
1149 if (num_written > 0 && will_write) {
1150 struct btrfs_trans_handle *trans;
1151
1152 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1153 if (err)
1154 num_written = err;
1155
1156 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1157 trans = btrfs_start_transaction(root, 1);
1158 ret = btrfs_log_dentry_safe(trans, root,
1159 file->f_dentry);
1160 if (ret == 0) {
1161 btrfs_sync_log(trans, root);
1162 btrfs_end_transaction(trans, root);
1163 } else {
1164 btrfs_commit_transaction(trans, root);
1165 }
1166 }
1167 if (file->f_flags & O_DIRECT) {
1168 invalidate_mapping_pages(inode->i_mapping,
1169 start_pos >> PAGE_CACHE_SHIFT,
1170 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1171 }
1172 }
1173 current->backing_dev_info = NULL;
1174 return num_written ? num_written : err;
1175}
1176
1177int btrfs_release_file(struct inode *inode, struct file *filp)
1178{
1179 if (filp->private_data)
1180 btrfs_ioctl_trans_end(filp);
1181 return 0;
1182}
1183
1184/*
 1185 * fsync call for both files and directories. Whenever possible this
 1186 * logs the inode into the tree log instead of forcing a full commit.
1187 *
 1188 * It needs to call filemap_fdatawait so that all ordered extent updates
 1189 * in the metadata btree are up to date for copying to the log.
1190 *
1191 * It drops the inode mutex before doing the tree log commit. This is an
1192 * important optimization for directories because holding the mutex prevents
1193 * new operations on the dir while we write to disk.
1194 */
1195int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1196{
1197 struct inode *inode = dentry->d_inode;
1198 struct btrfs_root *root = BTRFS_I(inode)->root;
1199 int ret = 0;
1200 struct btrfs_trans_handle *trans;
1201
1202 /*
1203 * check the transaction that last modified this inode
1204 * and see if its already been committed
1205 */
1206 if (!BTRFS_I(inode)->last_trans)
1207 goto out;
1208
1209 mutex_lock(&root->fs_info->trans_mutex);
1210 if (BTRFS_I(inode)->last_trans <=
1211 root->fs_info->last_trans_committed) {
1212 BTRFS_I(inode)->last_trans = 0;
1213 mutex_unlock(&root->fs_info->trans_mutex);
1214 goto out;
1215 }
1216 mutex_unlock(&root->fs_info->trans_mutex);
1217
1218 root->fs_info->tree_log_batch++;
1219 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++;
1222
1223 /*
1224	 * ok we haven't committed the transaction yet, let's do a commit
1225 */
1226 if (file->private_data)
1227 btrfs_ioctl_trans_end(file);
1228
1229 trans = btrfs_start_transaction(root, 1);
1230 if (!trans) {
1231 ret = -ENOMEM;
1232 goto out;
1233 }
1234
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1236 if (ret < 0)
1237 goto out;
1238
1239 /* we've logged all the items and now have a consistent
1240 * version of the file in the log. It is possible that
1241 * someone will come in and modify the file, but that's
1242 * fine because the log is consistent on disk, and we
1243 * have references to all of the file's extents
1244 *
1245 * It is possible that someone will come in and log the
1246 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe.
1248 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1250
1251 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root);
1253 } else {
1254 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root);
1256 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1258out:
1259	return ret > 0 ? -EIO : ret;
1260}
1261
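[editor's note] The early exit at the top of btrfs_sync_file() is purely a generation comparison: the inode remembers the id of the transaction that last touched it, and if that id is at or below the last committed transaction there is nothing left to make durable. A minimal standalone model of that check (user-space sketch with made-up numbers, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* models the early-exit in btrfs_sync_file(): if the transaction that
 * last touched the inode has already been committed, nothing new needs
 * to reach disk and fsync can return immediately */
struct fs_state { uint64_t last_trans_committed; };
struct inode_state { uint64_t last_trans; };

static int fsync_needs_work(const struct fs_state *fs, struct inode_state *ino)
{
	if (!ino->last_trans)
		return 0;			/* never modified */
	if (ino->last_trans <= fs->last_trans_committed) {
		ino->last_trans = 0;		/* cache the answer */
		return 0;			/* already durable */
	}
	return 1;				/* must log or commit */
}

int main(void)
{
	struct fs_state fs = { .last_trans_committed = 41 };
	struct inode_state clean = { .last_trans = 40 };
	struct inode_state dirty = { .last_trans = 42 };

	printf("clean needs work: %d\n", fsync_needs_work(&fs, &clean));
	printf("dirty needs work: %d\n", fsync_needs_work(&fs, &dirty));
	return 0;
}
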
1262static struct vm_operations_struct btrfs_file_vm_ops = {
1263 .fault = filemap_fault,
1264 .page_mkwrite = btrfs_page_mkwrite,
1265};
1266
1267static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1268{
1269 vma->vm_ops = &btrfs_file_vm_ops;
1270 file_accessed(filp);
1271 return 0;
1272}
1273
1274struct file_operations btrfs_file_operations = {
1275 .llseek = generic_file_llseek,
1276 .read = do_sync_read,
1277 .aio_read = generic_file_aio_read,
1278 .splice_read = generic_file_splice_read,
1279 .write = btrfs_file_write,
1280 .mmap = btrfs_file_mmap,
1281 .open = generic_file_open,
1282 .release = btrfs_release_file,
1283 .fsync = btrfs_sync_file,
1284 .unlocked_ioctl = btrfs_ioctl,
1285#ifdef CONFIG_COMPAT
1286 .compat_ioctl = btrfs_ioctl,
1287#endif
1288};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..d1e5f0e84c58
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,495 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
71 * searches the tree for the given offset. If contains is set we will return
72 * the free space that contains the given offset. If contains is not set we
73 * will return the free space that starts at or after the given offset and is
74 * at least bytes long.
75 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
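
[editor's note] To make the two lookup modes concrete, here is a standalone user-space sketch (illustrative only, not kernel code) that applies the same selection rules to a plain sorted array instead of an rbtree: with contains set, return the entry whose range covers the offset; otherwise return the first entry at or after the offset with at least bytes of room.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* simplified stand-in for struct btrfs_free_space */
struct free_space { uint64_t offset; uint64_t bytes; };

/* linear-scan equivalent of tree_search_offset(); the kernel walks an
 * rbtree keyed by offset, but the selection rules are the same */
static struct free_space *search_offset(struct free_space *e, size_t n,
					uint64_t offset, uint64_t bytes,
					int contains)
{
	for (size_t i = 0; i < n; i++) {
		if (contains) {
			if (e[i].offset <= offset &&
			    offset < e[i].offset + e[i].bytes)
				return &e[i];
		} else {
			if (e[i].offset >= offset && e[i].bytes >= bytes)
				return &e[i];
		}
	}
	return NULL;
}

int main(void)
{
	struct free_space cache[] = { {0, 4096}, {8192, 8192}, {32768, 4096} };

	/* contains=1: which hole covers byte 12000? -> the {8192,8192} entry */
	struct free_space *hit = search_offset(cache, 3, 12000, 0, 1);
	printf("covering hole at %llu\n", (unsigned long long)hit->offset);

	/* contains=0: first hole at/after 4096 with >= 8192 free -> {8192,8192} */
	hit = search_offset(cache, 3, 4096, 8192, 0);
	printf("first fit at %llu\n", (unsigned long long)hit->offset);
	return 0;
}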
111
112/*
113 * return a chunk of at least bytes in size, as close to offset as we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126			 * We prefer to get a hole as close to the size we
127			 * are asking for as we can, so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145			 * Ok, we may have multiple chunks of the wanted size,
146			 * so we don't want to take the first one we find; we
147			 * want to take the one closest to our given offset, so
148			 * keep searching just in case there's a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201 * first we want to see if there is free space adjacent to the range we
202	 * are adding; if there is, remove that struct and add a new one to
203 * cover the entire range
204 */
205 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1);
207 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1);
209
210 if (right_info && right_info->offset == offset+bytes) {
211 unlink_free_space(block_group, right_info);
212 info = right_info;
213 info->offset = offset;
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 }
225
226 if (left_info) {
227 unlink_free_space(block_group, left_info);
228
229 if (unlikely((left_info->offset + left_info->bytes) !=
230 offset)) {
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 }
251
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info);
265 if (ret)
266 kfree(info);
267out:
268 if (ret) {
269		printk(KERN_ERR "btrfs: unable to add free space: %d\n", ret);
270 if (ret == -EEXIST)
271 BUG();
272 }
273
274 kfree(alloc_info);
275
276 return ret;
277}
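
[editor's note] The coalescing above can be pictured with concrete numbers: freeing [8192, 12288) when holes [4096, 8192) and [12288, 16384) already exist should leave a single [4096, 16384) entry. A toy user-space model of just the merge rules (not kernel code):

#include <stdio.h>
#include <stdint.h>

/* toy model of the coalescing in __btrfs_add_free_space(): a new free
 * range absorbs an adjacent hole on its right (one starting exactly at
 * offset+bytes) and on its left (one ending exactly at offset) */
struct hole { uint64_t offset; uint64_t bytes; };

static struct hole merge(struct hole left, struct hole added, struct hole right)
{
	/* right neighbour starts where the new range ends */
	if (right.offset == added.offset + added.bytes)
		added.bytes += right.bytes;
	/* left neighbour ends where the new range starts */
	if (left.offset + left.bytes == added.offset) {
		added.offset = left.offset;
		added.bytes += left.bytes;
	}
	return added;
}

int main(void)
{
	struct hole left = {4096, 4096}, right = {12288, 4096};
	struct hole freed = {8192, 4096};
	struct hole h = merge(left, freed, right);

	/* prints: merged hole: offset=4096 bytes=12288 */
	printf("merged hole: offset=%llu bytes=%llu\n",
	       (unsigned long long)h.offset, (unsigned long long)h.bytes);
	return 0;
}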
278
279static int
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
281 u64 offset, u64 bytes)
282{
283 struct btrfs_free_space *info;
284 int ret = 0;
285
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1);
288
289 if (info && info->offset == offset) {
290 if (info->bytes < bytes) {
291			printk(KERN_ERR "Found free space at %llu, size %llu, "
292 "trying to use %llu\n",
293 (unsigned long long)info->offset,
294 (unsigned long long)info->bytes,
295 (unsigned long long)bytes);
296 WARN_ON(1);
297 ret = -EINVAL;
298 goto out;
299 }
300 unlink_free_space(block_group, info);
301
302 if (info->bytes == bytes) {
303 kfree(info);
304 goto out;
305 }
306
307 info->offset += bytes;
308 info->bytes -= bytes;
309
310 ret = link_free_space(block_group, info);
311 BUG_ON(ret);
312 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) {
314 u64 old_start = info->offset;
315 /*
316 * we're freeing space in the middle of the info,
317 * this can happen during tree log replay
318 *
319 * first unlink the old info and then
320 * insert it again after the hole we're creating
321 */
322 unlink_free_space(block_group, info);
323 if (offset + bytes < info->offset + info->bytes) {
324 u64 old_end = info->offset + info->bytes;
325
326 info->offset = offset + bytes;
327 info->bytes = old_end - info->offset;
328 ret = link_free_space(block_group, info);
329 BUG_ON(ret);
330 } else {
331 /* the hole we're creating ends at the end
332 * of the info struct, just free the info
333 */
334 kfree(info);
335 }
336
337 /* step two, insert a new info struct to cover anything
338 * before the hole
339 */
340 ret = __btrfs_add_free_space(block_group, old_start,
341 offset - old_start);
342 BUG_ON(ret);
343 } else {
344 WARN_ON(1);
345 }
346out:
347 return ret;
348}
349
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes)
402{
403 struct btrfs_free_space *info;
404 struct rb_node *n;
405 int count = 0;
406
407 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
408 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes)
410 count++;
411 }
412	printk(KERN_INFO "%d blocks of free space at or bigger than %llu bytes\n",
413	       count, (unsigned long long)bytes);
414}
415
416u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
417{
418 struct btrfs_free_space *info;
419 struct rb_node *n;
420 u64 ret = 0;
421
422 for (n = rb_first(&block_group->free_space_offset); n;
423 n = rb_next(n)) {
424 info = rb_entry(n, struct btrfs_free_space, offset_index);
425 ret += info->bytes;
426 }
427
428 return ret;
429}
430
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{
433 struct btrfs_free_space *info;
434 struct rb_node *node;
435
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info);
440 kfree(info);
441 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex);
443 cond_resched();
444 mutex_lock(&block_group->alloc_mutex);
445 }
446 }
447 mutex_unlock(&block_group->alloc_mutex);
448}
449
450#if 0
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{
456 struct btrfs_free_space *ret;
457
458 mutex_lock(&block_group->alloc_mutex);
459 ret = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0);
461 mutex_unlock(&block_group->alloc_mutex);
462
463 return ret;
464}
465
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
467 btrfs_block_group_cache
468 *block_group, u64 offset,
469 u64 bytes)
470{
471 struct btrfs_free_space *ret;
472
473 mutex_lock(&block_group->alloc_mutex);
474
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
476 mutex_unlock(&block_group->alloc_mutex);
477
478 return ret;
479}
480#endif
481
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
483 *block_group, u64 offset,
484 u64 bytes)
485{
486 struct btrfs_free_space *ret = NULL;
487
488 ret = tree_search_offset(&block_group->free_space_offset, offset,
489 bytes, 0);
490 if (!ret)
491 ret = tree_search_bytes(&block_group->free_space_bytes,
492 offset, bytes);
493
494 return ret;
495}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
27#endif
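
[editor's note] btrfs_name_hash() is plain CRC32C over the name, seeded with (u32)~1; directory items are keyed by this value. The sketch below reproduces it in user space with a bitwise CRC32C (Castagnoli) routine; the kernel's btrfs_crc32c() wraps the shared crc32c library (possibly hardware accelerated) but computes the same function.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bitwise CRC32C (Castagnoli, reflected polynomial 0x82F63B78) */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* mirrors btrfs_name_hash(): CRC32C of the name, seeded with (u32)~1 */
static uint64_t name_hash(const char *name, int len)
{
	return crc32c((uint32_t)~1, name, len);
}

int main(void)
{
	const char *name = "hello.txt";

	printf("dir item hash for %s: %llu\n", name,
	       (unsigned long long)name_hash(name, strlen(name)));
	return 0;
}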
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..3d46fa1f29a4
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23static int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..2aa79873eb46
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,144 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
57 * walks the btree of allocated inodes and finds a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 } else if (key.objectid > search_start) {
121 *objectid = search_start;
122 goto found;
123 }
124 }
125 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
126 break;
127
128 start_found = 1;
129 last_ino = key.objectid + 1;
130 path->slots[0]++;
131 }
132 BUG_ON(1);
133found:
134 btrfs_release_path(root, path);
135 btrfs_free_path(path);
136 BUG_ON(*objectid < search_start);
137 mutex_unlock(&root->objectid_mutex);
138 return 0;
139error:
140 btrfs_release_path(root, path);
141 btrfs_free_path(path);
142 mutex_unlock(&root->objectid_mutex);
143 return ret;
144}
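
[editor's note] Stripped of the btree mechanics, the hole search above reduces to scanning the allocated objectids in ascending order and returning the first gap at or after the starting point. A standalone sketch (user-space, illustrative values):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* toy version of the hole search in btrfs_find_free_objectid(): given
 * the allocated objectids in sorted order (as a btree walk would yield
 * them), return the first unused id >= search_start */
static uint64_t find_free_id(const uint64_t *used, size_t n,
			     uint64_t search_start)
{
	uint64_t next = search_start;

	for (size_t i = 0; i < n; i++) {
		if (used[i] < next)
			continue;	/* before the window, skip */
		if (used[i] > next)
			return next;	/* found a hole */
		next = used[i] + 1;	/* id taken, try the following one */
	}
	return next;			/* no hole inside, append at the end */
}

int main(void)
{
	uint64_t used[] = {256, 257, 258, 260, 261};

	/* 256..258 taken, 259 free -> prints 259 */
	printf("free objectid: %llu\n",
	       (unsigned long long)find_free_id(used, 5, 256));
	return 0;
}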
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..8adfe059ab41
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5035 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <linux/falloc.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h"
50#include "xattr.h"
51#include "tree-log.h"
52#include "ref-cache.h"
53#include "compression.h"
54
55struct btrfs_iget_args {
56 u64 ino;
57 struct btrfs_root *root;
58};
59
60static struct inode_operations btrfs_dir_inode_operations;
61static struct inode_operations btrfs_symlink_inode_operations;
62static struct inode_operations btrfs_dir_ro_inode_operations;
63static struct inode_operations btrfs_special_inode_operations;
64static struct inode_operations btrfs_file_inode_operations;
65static struct address_space_operations btrfs_aops;
66static struct address_space_operations btrfs_symlink_aops;
67static struct file_operations btrfs_dir_file_operations;
68static struct extent_io_ops btrfs_extent_io_ops;
69
70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep;
75
76#define S_SHIFT 12
77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
78 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
79 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
80 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
81 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
82 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
83 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
85};
86
87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
93
94/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{
101 u64 total;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
117 ret = -ENOSPC;
118 spin_unlock(&root->fs_info->delalloc_lock);
119 return ret;
120}
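
[editor's note] With concrete numbers: on a 100 GiB filesystem with 84 GiB used and 2 GiB of delalloc in flight, a 1 GiB write is refused (87 GiB exceeds the 85% threshold), while the same reservation for a delete still passes the 90% threshold. A user-space restatement of the same arithmetic (illustrative only, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* the same threshold arithmetic as btrfs_check_free_space(): writes are
 * refused once used + in-flight delalloc would cross 85% of the
 * filesystem (90% for deletes, which must be allowed to make space) */
static int check_free_space(uint64_t total, uint64_t used,
			    uint64_t delalloc, uint64_t required, int for_del)
{
	uint64_t thresh = total * (for_del ? 90 : 85) / 100;

	return (used + delalloc + required > thresh) ? -1 /* -ENOSPC */ : 0;
}

int main(void)
{
	uint64_t gib = 1024ULL * 1024 * 1024;

	/* 100 GiB fs, 84 GiB used, 2 GiB queued delalloc: a 1 GiB write
	 * would push usage to 87 GiB > the 85 GiB threshold -> refused */
	printf("write: %d\n",
	       check_free_space(100 * gib, 84 * gib, 2 * gib, 1 * gib, 0));
	/* the same state still passes the 90% delete threshold */
	printf("delete: %d\n",
	       check_free_space(100 * gib, 84 * gib, 2 * gib, 1 * gib, 1));
	return 0;
}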
121
122/*
123 * this does all the hard work for inserting an inline extent into
124 * the btree. The caller should have done a btrfs_drop_extents so that
125 * no overlapping inline items exist in the btree
126 */
127static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
128 struct btrfs_root *root, struct inode *inode,
129 u64 start, size_t size, size_t compressed_size,
130 struct page **compressed_pages)
131{
132 struct btrfs_key key;
133 struct btrfs_path *path;
134 struct extent_buffer *leaf;
135 struct page *page = NULL;
136 char *kaddr;
137 unsigned long ptr;
138 struct btrfs_file_extent_item *ei;
139 int err = 0;
140 int ret;
141 size_t cur_size = size;
142 size_t datasize;
143 unsigned long offset;
144 int use_compress = 0;
145
146 if (compressed_size && compressed_pages) {
147 use_compress = 1;
148 cur_size = compressed_size;
149 }
150
151 path = btrfs_alloc_path();
152 if (!path)
153 return -ENOMEM;
154
155 btrfs_set_trans_block_group(trans, inode);
156
157 key.objectid = inode->i_ino;
158 key.offset = start;
159 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
160 datasize = btrfs_file_extent_calc_inline_size(cur_size);
161
162 inode_add_bytes(inode, size);
163 ret = btrfs_insert_empty_item(trans, root, path, &key,
164 datasize);
165	if (ret) {
166		err = ret;
167		goto fail;
168	}
169
170 leaf = path->nodes[0];
171 ei = btrfs_item_ptr(leaf, path->slots[0],
172 struct btrfs_file_extent_item);
173 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
174 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
175 btrfs_set_file_extent_encryption(leaf, ei, 0);
176 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
177 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
178 ptr = btrfs_file_extent_inline_start(ei);
179
180 if (use_compress) {
181 struct page *cpage;
182 int i = 0;
183 while (compressed_size > 0) {
184 cpage = compressed_pages[i];
185 cur_size = min_t(unsigned long, compressed_size,
186 PAGE_CACHE_SIZE);
187
188 kaddr = kmap(cpage);
189 write_extent_buffer(leaf, kaddr, ptr, cur_size);
190 kunmap(cpage);
191
192 i++;
193 ptr += cur_size;
194 compressed_size -= cur_size;
195 }
196 btrfs_set_file_extent_compression(leaf, ei,
197 BTRFS_COMPRESS_ZLIB);
198 } else {
199 page = find_get_page(inode->i_mapping,
200 start >> PAGE_CACHE_SHIFT);
201 btrfs_set_file_extent_compression(leaf, ei, 0);
202 kaddr = kmap_atomic(page, KM_USER0);
203 offset = start & (PAGE_CACHE_SIZE - 1);
204 write_extent_buffer(leaf, kaddr + offset, ptr, size);
205 kunmap_atomic(kaddr, KM_USER0);
206 page_cache_release(page);
207 }
208 btrfs_mark_buffer_dirty(leaf);
209 btrfs_free_path(path);
210
211 BTRFS_I(inode)->disk_i_size = inode->i_size;
212 btrfs_update_inode(trans, root, inode);
213 return 0;
214fail:
215 btrfs_free_path(path);
216 return err;
217}
218
219
220/*
221 * conditionally insert an inline extent into the file. This
222 * does the checks required to make sure the data is small enough
223 * to fit as an inline extent.
224 */
225static int cow_file_range_inline(struct btrfs_trans_handle *trans,
226 struct btrfs_root *root,
227 struct inode *inode, u64 start, u64 end,
228 size_t compressed_size,
229 struct page **compressed_pages)
230{
231 u64 isize = i_size_read(inode);
232 u64 actual_end = min(end + 1, isize);
233 u64 inline_len = actual_end - start;
234 u64 aligned_end = (end + root->sectorsize - 1) &
235 ~((u64)root->sectorsize - 1);
236 u64 hint_byte;
237 u64 data_len = inline_len;
238 int ret;
239
240 if (compressed_size)
241 data_len = compressed_size;
242
243 if (start > 0 ||
244 actual_end >= PAGE_CACHE_SIZE ||
245 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
246 (!compressed_size &&
247 (actual_end & (root->sectorsize - 1)) == 0) ||
248 end + 1 < isize ||
249 data_len > root->fs_info->max_inline) {
250 return 1;
251 }
252
253 ret = btrfs_drop_extents(trans, root, inode, start,
254 aligned_end, start, &hint_byte);
255 BUG_ON(ret);
256
257 if (isize > actual_end)
258 inline_len = min_t(u64, isize, actual_end);
259 ret = insert_inline_extent(trans, root, inode, start,
260 inline_len, compressed_size,
261 compressed_pages);
262 BUG_ON(ret);
263 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
264 return 0;
265}
266
267struct async_extent {
268 u64 start;
269 u64 ram_size;
270 u64 compressed_size;
271 struct page **pages;
272 unsigned long nr_pages;
273 struct list_head list;
274};
275
276struct async_cow {
277 struct inode *inode;
278 struct btrfs_root *root;
279 struct page *locked_page;
280 u64 start;
281 u64 end;
282 struct list_head extents;
283 struct btrfs_work work;
284};
285
286static noinline int add_async_extent(struct async_cow *cow,
287 u64 start, u64 ram_size,
288 u64 compressed_size,
289 struct page **pages,
290 unsigned long nr_pages)
291{
292 struct async_extent *async_extent;
293
294 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
295 async_extent->start = start;
296 async_extent->ram_size = ram_size;
297 async_extent->compressed_size = compressed_size;
298 async_extent->pages = pages;
299 async_extent->nr_pages = nr_pages;
300 list_add_tail(&async_extent->list, &cow->extents);
301 return 0;
302}
303
304/*
305 * we create compressed extents in two phases. The first
306 * phase compresses a range of pages that have already been
307 * locked (both pages and state bits are locked).
308 *
309 * This is done inside an ordered work queue, and the compression
310 * is spread across many cpus. The actual IO submission is step
311 * two, and the ordered work queue takes care of making sure that
312 * happens in the same order things were put onto the queue by
313 * writepages and friends.
314 *
315 * If this code finds it can't get good compression, it puts an
316 * entry onto the work queue to write the uncompressed bytes. This
317 * makes sure that both compressed inodes and uncompressed inodes
318 * are written in the same order that pdflush sent them down.
319 */
320static noinline int compress_file_range(struct inode *inode,
321 struct page *locked_page,
322 u64 start, u64 end,
323 struct async_cow *async_cow,
324 int *num_added)
325{
326 struct btrfs_root *root = BTRFS_I(inode)->root;
327 struct btrfs_trans_handle *trans;
328 u64 num_bytes;
329 u64 orig_start;
330 u64 disk_num_bytes;
331 u64 blocksize = root->sectorsize;
332 u64 actual_end;
333 u64 isize = i_size_read(inode);
334 int ret = 0;
335 struct page **pages = NULL;
336 unsigned long nr_pages;
337 unsigned long nr_pages_ret = 0;
338 unsigned long total_compressed = 0;
339 unsigned long total_in = 0;
340 unsigned long max_compressed = 128 * 1024;
341 unsigned long max_uncompressed = 128 * 1024;
342 int i;
343 int will_compress;
344
345 orig_start = start;
346
347 actual_end = min_t(u64, isize, end + 1);
348again:
349 will_compress = 0;
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352
353 total_compressed = actual_end - start;
354
355 /* we want to make sure that amount of ram required to uncompress
356 * an extent is reasonable, so we limit the total size in ram
357 * of a compressed extent to 128k. This is a crucial number
358 * because it also controls how easily we can spread reads across
359 * cpus for decompression.
360 *
361 * We also want to make sure the amount of IO required to do
362 * a random read is reasonably small, so we limit the size of
363 * a compressed extent to 128k.
364 */
365 total_compressed = min(total_compressed, max_uncompressed);
366 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
367 num_bytes = max(blocksize, num_bytes);
368 disk_num_bytes = num_bytes;
369 total_in = 0;
370 ret = 0;
371
372 /*
373 * we do compression for mount -o compress and when the
374 * inode has not been flagged as nocompress. This flag can
375 * change at any time if we discover bad compression ratios.
376 */
377 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
378 btrfs_test_opt(root, COMPRESS)) {
379 WARN_ON(pages);
380 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
381
382 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
383 total_compressed, pages,
384 nr_pages, &nr_pages_ret,
385 &total_in,
386 &total_compressed,
387 max_compressed);
388
389 if (!ret) {
390 unsigned long offset = total_compressed &
391 (PAGE_CACHE_SIZE - 1);
392 struct page *page = pages[nr_pages_ret - 1];
393 char *kaddr;
394
395 /* zero the tail end of the last page, we might be
396 * sending it down to disk
397 */
398 if (offset) {
399 kaddr = kmap_atomic(page, KM_USER0);
400 memset(kaddr + offset, 0,
401 PAGE_CACHE_SIZE - offset);
402 kunmap_atomic(kaddr, KM_USER0);
403 }
404 will_compress = 1;
405 }
406 }
407 if (start == 0) {
408 trans = btrfs_join_transaction(root, 1);
409 BUG_ON(!trans);
410 btrfs_set_trans_block_group(trans, inode);
411
412		/* let's try to make an inline extent */
413 if (ret || total_in < (actual_end - start)) {
414 /* we didn't compress the entire range, try
415 * to make an uncompressed inline extent.
416 */
417 ret = cow_file_range_inline(trans, root, inode,
418 start, end, 0, NULL);
419 } else {
420 /* try making a compressed inline extent */
421 ret = cow_file_range_inline(trans, root, inode,
422 start, end,
423 total_compressed, pages);
424 }
425 btrfs_end_transaction(trans, root);
426 if (ret == 0) {
427 /*
428 * inline extent creation worked, we don't need
429 * to create any more async work items. Unlock
430 * and free up our temp pages.
431 */
432 extent_clear_unlock_delalloc(inode,
433 &BTRFS_I(inode)->io_tree,
434 start, end, NULL, 1, 0,
435 0, 1, 1, 1);
436 ret = 0;
437 goto free_pages_out;
438 }
439 }
440
441 if (will_compress) {
442 /*
443		 * we aren't doing an inline extent, so round the compressed size
444 * up to a block size boundary so the allocator does sane
445 * things
446 */
447 total_compressed = (total_compressed + blocksize - 1) &
448 ~(blocksize - 1);
449
450 /*
451 * one last check to make sure the compression is really a
452 * win, compare the page count read with the blocks on disk
453 */
454 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
455 ~(PAGE_CACHE_SIZE - 1);
456 if (total_compressed >= total_in) {
457 will_compress = 0;
458 } else {
459 disk_num_bytes = total_compressed;
460 num_bytes = total_in;
461 }
462 }
463 if (!will_compress && pages) {
464 /*
465 * the compression code ran but failed to make things smaller,
466 * free any pages it allocated and our page pointer array
467 */
468 for (i = 0; i < nr_pages_ret; i++) {
469 WARN_ON(pages[i]->mapping);
470 page_cache_release(pages[i]);
471 }
472 kfree(pages);
473 pages = NULL;
474 total_compressed = 0;
475 nr_pages_ret = 0;
476
477 /* flag the file so we don't compress in the future */
478 btrfs_set_flag(inode, NOCOMPRESS);
479 }
480 if (will_compress) {
481 *num_added += 1;
482
483 /* the async work queues will take care of doing actual
484 * allocation on disk for these compressed pages,
485 * and will submit them to the elevator.
486 */
487 add_async_extent(async_cow, start, num_bytes,
488 total_compressed, pages, nr_pages_ret);
489
490 if (start + num_bytes < end && start + num_bytes < actual_end) {
491 start += num_bytes;
492 pages = NULL;
493 cond_resched();
494 goto again;
495 }
496 } else {
497 /*
498 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked
500 * page if it corresponds to our extent and set things up
501 * for the async work queue to run cow_file_range to do
502 * the normal delalloc dance
503 */
504 if (page_offset(locked_page) >= start &&
505 page_offset(locked_page) <= end) {
506 __set_page_dirty_nobuffers(locked_page);
507 /* unlocked later on in the async handlers */
508 }
509 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
510 *num_added += 1;
511 }
512
513out:
514 return 0;
515
516free_pages_out:
517 for (i = 0; i < nr_pages_ret; i++) {
518 WARN_ON(pages[i]->mapping);
519 page_cache_release(pages[i]);
520 }
521 kfree(pages);
522
523 goto out;
524}
525
526/*
527 * phase two of compressed writeback. This is the ordered portion
528 * of the code, which only gets called in the order the work was
529 * queued. We walk all the async extents created by compress_file_range
530 * and send them down to the disk.
531 */
532static noinline int submit_compressed_extents(struct inode *inode,
533 struct async_cow *async_cow)
534{
535 struct async_extent *async_extent;
536 u64 alloc_hint = 0;
537 struct btrfs_trans_handle *trans;
538 struct btrfs_key ins;
539 struct extent_map *em;
540 struct btrfs_root *root = BTRFS_I(inode)->root;
541 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
542 struct extent_io_tree *io_tree;
543 int ret;
544
545 if (list_empty(&async_cow->extents))
546 return 0;
547
548 trans = btrfs_join_transaction(root, 1);
549
550 while (!list_empty(&async_cow->extents)) {
551 async_extent = list_entry(async_cow->extents.next,
552 struct async_extent, list);
553 list_del(&async_extent->list);
554
555 io_tree = &BTRFS_I(inode)->io_tree;
556
557 /* did the compression code fall back to uncompressed IO? */
558 if (!async_extent->pages) {
559 int page_started = 0;
560 unsigned long nr_written = 0;
561
562 lock_extent(io_tree, async_extent->start,
563 async_extent->start +
564 async_extent->ram_size - 1, GFP_NOFS);
565
566 /* allocate blocks */
567 cow_file_range(inode, async_cow->locked_page,
568 async_extent->start,
569 async_extent->start +
570 async_extent->ram_size - 1,
571 &page_started, &nr_written, 0);
572
573 /*
574 * if page_started, cow_file_range inserted an
575 * inline extent and took care of all the unlocking
576 * and IO for us. Otherwise, we need to submit
577 * all those pages down to the drive.
578 */
579 if (!page_started)
580 extent_write_locked_range(io_tree,
581 inode, async_extent->start,
582 async_extent->start +
583 async_extent->ram_size - 1,
584 btrfs_get_extent,
585 WB_SYNC_ALL);
586 kfree(async_extent);
587 cond_resched();
588 continue;
589 }
590
591 lock_extent(io_tree, async_extent->start,
592 async_extent->start + async_extent->ram_size - 1,
593 GFP_NOFS);
594 /*
595 * here we're doing allocation and writeback of the
596 * compressed pages
597 */
598 btrfs_drop_extent_cache(inode, async_extent->start,
599 async_extent->start +
600 async_extent->ram_size - 1, 0);
601
602 ret = btrfs_reserve_extent(trans, root,
603 async_extent->compressed_size,
604 async_extent->compressed_size,
605 0, alloc_hint,
606 (u64)-1, &ins, 1);
607 BUG_ON(ret);
608 em = alloc_extent_map(GFP_NOFS);
609 em->start = async_extent->start;
610 em->len = async_extent->ram_size;
611 em->orig_start = em->start;
612
613 em->block_start = ins.objectid;
614 em->block_len = ins.offset;
615 em->bdev = root->fs_info->fs_devices->latest_bdev;
616 set_bit(EXTENT_FLAG_PINNED, &em->flags);
617 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
618
619 while (1) {
620 spin_lock(&em_tree->lock);
621 ret = add_extent_mapping(em_tree, em);
622 spin_unlock(&em_tree->lock);
623 if (ret != -EEXIST) {
624 free_extent_map(em);
625 break;
626 }
627 btrfs_drop_extent_cache(inode, async_extent->start,
628 async_extent->start +
629 async_extent->ram_size - 1, 0);
630 }
631
632 ret = btrfs_add_ordered_extent(inode, async_extent->start,
633 ins.objectid,
634 async_extent->ram_size,
635 ins.offset,
636 BTRFS_ORDERED_COMPRESSED);
637 BUG_ON(ret);
638
639 btrfs_end_transaction(trans, root);
640
641 /*
642 * clear dirty, set writeback and unlock the pages.
643 */
644 extent_clear_unlock_delalloc(inode,
645 &BTRFS_I(inode)->io_tree,
646 async_extent->start,
647 async_extent->start +
648 async_extent->ram_size - 1,
649 NULL, 1, 1, 0, 1, 1, 0);
650
651 ret = btrfs_submit_compressed_write(inode,
652 async_extent->start,
653 async_extent->ram_size,
654 ins.objectid,
655 ins.offset, async_extent->pages,
656 async_extent->nr_pages);
657
658 BUG_ON(ret);
659 trans = btrfs_join_transaction(root, 1);
660 alloc_hint = ins.objectid + ins.offset;
661 kfree(async_extent);
662 cond_resched();
663 }
664
665 btrfs_end_transaction(trans, root);
666 return 0;
667}
668
669/*
670 * when extent_io.c finds a delayed allocation range in the file,
671 * the callbacks end up in this code. The basic idea is to
672 * allocate extents on disk for the range, and create ordered data structs
673 * in ram to track those extents.
674 *
675 * locked_page is the page that writepage had locked already. We use
676 * it to make sure we don't do extra locks or unlocks.
677 *
678 * *page_started is set to one if we unlock locked_page and do everything
679 * required to start IO on it. It may be clean and already done with
680 * IO when we return.
681 */
682static noinline int cow_file_range(struct inode *inode,
683 struct page *locked_page,
684 u64 start, u64 end, int *page_started,
685 unsigned long *nr_written,
686 int unlock)
687{
688 struct btrfs_root *root = BTRFS_I(inode)->root;
689 struct btrfs_trans_handle *trans;
690 u64 alloc_hint = 0;
691 u64 num_bytes;
692 unsigned long ram_size;
693 u64 disk_num_bytes;
694 u64 cur_alloc_size;
695 u64 blocksize = root->sectorsize;
696 u64 actual_end;
697 u64 isize = i_size_read(inode);
698 struct btrfs_key ins;
699 struct extent_map *em;
700 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
701 int ret = 0;
702
703 trans = btrfs_join_transaction(root, 1);
704 BUG_ON(!trans);
705 btrfs_set_trans_block_group(trans, inode);
706
707 actual_end = min_t(u64, isize, end + 1);
708
709 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
710 num_bytes = max(blocksize, num_bytes);
711 disk_num_bytes = num_bytes;
712 ret = 0;
713
714 if (start == 0) {
715		/* let's try to make an inline extent */
716 ret = cow_file_range_inline(trans, root, inode,
717 start, end, 0, NULL);
718 if (ret == 0) {
719 extent_clear_unlock_delalloc(inode,
720 &BTRFS_I(inode)->io_tree,
721 start, end, NULL, 1, 1,
722 1, 1, 1, 1);
723 *nr_written = *nr_written +
724 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
725 *page_started = 1;
726 ret = 0;
727 goto out;
728 }
729 }
730
731 BUG_ON(disk_num_bytes >
732 btrfs_super_total_bytes(&root->fs_info->super_copy));
733
734 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
735
736 while (disk_num_bytes > 0) {
737 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
738 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
739 root->sectorsize, 0, alloc_hint,
740 (u64)-1, &ins, 1);
741 BUG_ON(ret);
742
743 em = alloc_extent_map(GFP_NOFS);
744 em->start = start;
745 em->orig_start = em->start;
746
747 ram_size = ins.offset;
748 em->len = ins.offset;
749
750 em->block_start = ins.objectid;
751 em->block_len = ins.offset;
752 em->bdev = root->fs_info->fs_devices->latest_bdev;
753 set_bit(EXTENT_FLAG_PINNED, &em->flags);
754
755 while (1) {
756 spin_lock(&em_tree->lock);
757 ret = add_extent_mapping(em_tree, em);
758 spin_unlock(&em_tree->lock);
759 if (ret != -EEXIST) {
760 free_extent_map(em);
761 break;
762 }
763 btrfs_drop_extent_cache(inode, start,
764 start + ram_size - 1, 0);
765 }
766
767 cur_alloc_size = ins.offset;
768 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
769 ram_size, cur_alloc_size, 0);
770 BUG_ON(ret);
771
772 if (root->root_key.objectid ==
773 BTRFS_DATA_RELOC_TREE_OBJECTID) {
774 ret = btrfs_reloc_clone_csums(inode, start,
775 cur_alloc_size);
776 BUG_ON(ret);
777 }
778
779 if (disk_num_bytes < cur_alloc_size)
780 break;
781
782 /* we're not doing compressed IO, don't unlock the first
783 * page (which the caller expects to stay locked), don't
784 * clear any dirty bits and don't set any writeback bits
785 */
786 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
787 start, start + ram_size - 1,
788 locked_page, unlock, 1,
789 1, 0, 0, 0);
790 disk_num_bytes -= cur_alloc_size;
791 num_bytes -= cur_alloc_size;
792 alloc_hint = ins.objectid + ins.offset;
793 start += cur_alloc_size;
794 }
795out:
796 ret = 0;
797 btrfs_end_transaction(trans, root);
798
799 return ret;
800}
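
[editor's note] The size rounding used in this function, num_bytes = (end - start + blocksize) & ~(blocksize - 1), is the usual power-of-two round-up of an inclusive byte range to whole blocks; a quick standalone check (illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocksize = 4096;	/* power of two, like sectorsize */
	uint64_t start = 0, end = 5000;	/* inclusive byte range [0, 5000] */

	/* end - start + blocksize = 9096, masked down to the 4096
	 * boundary below it -> 8192, i.e. the two blocks covering
	 * the 5001-byte range */
	uint64_t num_bytes = (end - start + blocksize) & ~(blocksize - 1);

	printf("num_bytes = %llu\n", (unsigned long long)num_bytes);
	return 0;
}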
801
802/*
803 * work queue callback to start compression on a file and its pages
804 */
805static noinline void async_cow_start(struct btrfs_work *work)
806{
807 struct async_cow *async_cow;
808 int num_added = 0;
809 async_cow = container_of(work, struct async_cow, work);
810
811 compress_file_range(async_cow->inode, async_cow->locked_page,
812 async_cow->start, async_cow->end, async_cow,
813 &num_added);
814 if (num_added == 0)
815 async_cow->inode = NULL;
816}
817
818/*
819 * work queue callback to submit previously compressed pages
820 */
821static noinline void async_cow_submit(struct btrfs_work *work)
822{
823 struct async_cow *async_cow;
824 struct btrfs_root *root;
825 unsigned long nr_pages;
826
827 async_cow = container_of(work, struct async_cow, work);
828
829 root = async_cow->root;
830 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
831 PAGE_CACHE_SHIFT;
832
833 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
834
835 if (atomic_read(&root->fs_info->async_delalloc_pages) <
836	    5 * 1024 * 1024 &&
837 waitqueue_active(&root->fs_info->async_submit_wait))
838 wake_up(&root->fs_info->async_submit_wait);
839
840 if (async_cow->inode)
841 submit_compressed_extents(async_cow->inode, async_cow);
842}
843
844static noinline void async_cow_free(struct btrfs_work *work)
845{
846 struct async_cow *async_cow;
847 async_cow = container_of(work, struct async_cow, work);
848 kfree(async_cow);
849}
850
851static int cow_file_range_async(struct inode *inode, struct page *locked_page,
852 u64 start, u64 end, int *page_started,
853 unsigned long *nr_written)
854{
855 struct async_cow *async_cow;
856 struct btrfs_root *root = BTRFS_I(inode)->root;
857 unsigned long nr_pages;
858 u64 cur_end;
859	int limit = 10 * 1024 * 1024;
860
861 if (!btrfs_test_opt(root, COMPRESS)) {
862 return cow_file_range(inode, locked_page, start, end,
863 page_started, nr_written, 1);
864 }
865
866 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
867 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
868 while (start < end) {
869 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
870 async_cow->inode = inode;
871 async_cow->root = root;
872 async_cow->locked_page = locked_page;
873 async_cow->start = start;
874
875 if (btrfs_test_flag(inode, NOCOMPRESS))
876 cur_end = end;
877 else
878 cur_end = min(end, start + 512 * 1024 - 1);
879
880 async_cow->end = cur_end;
881 INIT_LIST_HEAD(&async_cow->extents);
882
883 async_cow->work.func = async_cow_start;
884 async_cow->work.ordered_func = async_cow_submit;
885 async_cow->work.ordered_free = async_cow_free;
886 async_cow->work.flags = 0;
887
888 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
889 PAGE_CACHE_SHIFT;
890 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
891
892 btrfs_queue_worker(&root->fs_info->delalloc_workers,
893 &async_cow->work);
894
895 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
896 wait_event(root->fs_info->async_submit_wait,
897 (atomic_read(&root->fs_info->async_delalloc_pages) <
898 limit));
899 }
900
901 while (atomic_read(&root->fs_info->async_submit_draining) &&
902 atomic_read(&root->fs_info->async_delalloc_pages)) {
903 wait_event(root->fs_info->async_submit_wait,
904 (atomic_read(&root->fs_info->async_delalloc_pages) ==
905 0));
906 }
907
908 *nr_written += nr_pages;
909 start = cur_end + 1;
910 }
911 *page_started = 1;
912 return 0;
913}
914
915static noinline int csum_exist_in_range(struct btrfs_root *root,
916 u64 bytenr, u64 num_bytes)
917{
918 int ret;
919 struct btrfs_ordered_sum *sums;
920 LIST_HEAD(list);
921
922 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
923 bytenr + num_bytes - 1, &list);
924 if (ret == 0 && list_empty(&list))
925 return 0;
926
927 while (!list_empty(&list)) {
928 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
929 list_del(&sums->list);
930 kfree(sums);
931 }
932 return 1;
933}
934
935/*
936 * the nocow writeback callback. This checks for snapshots or COW copies
937 * of the extents that exist in the file, and COWs the file as required.
938 *
939 * If no cow copies or snapshots exist, we write directly to the existing
940 * blocks on disk
941 */
942static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
943 u64 start, u64 end, int *page_started, int force,
944 unsigned long *nr_written)
945{
946 struct btrfs_root *root = BTRFS_I(inode)->root;
947 struct btrfs_trans_handle *trans;
948 struct extent_buffer *leaf;
949 struct btrfs_path *path;
950 struct btrfs_file_extent_item *fi;
951 struct btrfs_key found_key;
952 u64 cow_start;
953 u64 cur_offset;
954 u64 extent_end;
955 u64 disk_bytenr;
956 u64 num_bytes;
957 int extent_type;
958 int ret;
959 int type;
960 int nocow;
961 int check_prev = 1;
962
963 path = btrfs_alloc_path();
964 BUG_ON(!path);
965 trans = btrfs_join_transaction(root, 1);
966 BUG_ON(!trans);
967
968 cow_start = (u64)-1;
969 cur_offset = start;
970 while (1) {
971 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
972 cur_offset, 0);
973 BUG_ON(ret < 0);
974 if (ret > 0 && path->slots[0] > 0 && check_prev) {
975 leaf = path->nodes[0];
976 btrfs_item_key_to_cpu(leaf, &found_key,
977 path->slots[0] - 1);
978 if (found_key.objectid == inode->i_ino &&
979 found_key.type == BTRFS_EXTENT_DATA_KEY)
980 path->slots[0]--;
981 }
982 check_prev = 0;
983next_slot:
984 leaf = path->nodes[0];
985 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
986 ret = btrfs_next_leaf(root, path);
987 if (ret < 0)
988 BUG_ON(1);
989 if (ret > 0)
990 break;
991 leaf = path->nodes[0];
992 }
993
994 nocow = 0;
995 disk_bytenr = 0;
996 num_bytes = 0;
997 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
998
999 if (found_key.objectid > inode->i_ino ||
1000 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1001 found_key.offset > end)
1002 break;
1003
1004 if (found_key.offset > cur_offset) {
1005 extent_end = found_key.offset;
1006 goto out_check;
1007 }
1008
1009 fi = btrfs_item_ptr(leaf, path->slots[0],
1010 struct btrfs_file_extent_item);
1011 extent_type = btrfs_file_extent_type(leaf, fi);
1012
1013 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1014 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1015 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1016 extent_end = found_key.offset +
1017 btrfs_file_extent_num_bytes(leaf, fi);
1018 if (extent_end <= start) {
1019 path->slots[0]++;
1020 goto next_slot;
1021 }
1022 if (disk_bytenr == 0)
1023 goto out_check;
1024 if (btrfs_file_extent_compression(leaf, fi) ||
1025 btrfs_file_extent_encryption(leaf, fi) ||
1026 btrfs_file_extent_other_encoding(leaf, fi))
1027 goto out_check;
1028 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1029 goto out_check;
1030 if (btrfs_extent_readonly(root, disk_bytenr))
1031 goto out_check;
1032 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1033 disk_bytenr))
1034 goto out_check;
1035 disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1036 disk_bytenr += cur_offset - found_key.offset;
1037 num_bytes = min(end + 1, extent_end) - cur_offset;
1038 /*
1039 * force COW if csums exist in the range.
1040 * this ensures that the csums for a given extent
1041 * are either all valid or do not exist.
1042 */
1043 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1044 goto out_check;
1045 nocow = 1;
1046 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1047 extent_end = found_key.offset +
1048 btrfs_file_extent_inline_len(leaf, fi);
1049 extent_end = ALIGN(extent_end, root->sectorsize);
1050 } else {
1051 BUG_ON(1);
1052 }
1053out_check:
1054 if (extent_end <= start) {
1055 path->slots[0]++;
1056 goto next_slot;
1057 }
1058 if (!nocow) {
1059 if (cow_start == (u64)-1)
1060 cow_start = cur_offset;
1061 cur_offset = extent_end;
1062 if (cur_offset > end)
1063 break;
1064 path->slots[0]++;
1065 goto next_slot;
1066 }
1067
1068 btrfs_release_path(root, path);
1069 if (cow_start != (u64)-1) {
1070 ret = cow_file_range(inode, locked_page, cow_start,
1071 found_key.offset - 1, page_started,
1072 nr_written, 1);
1073 BUG_ON(ret);
1074 cow_start = (u64)-1;
1075 }
1076
1077 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1078 struct extent_map *em;
1079 struct extent_map_tree *em_tree;
1080 em_tree = &BTRFS_I(inode)->extent_tree;
1081 em = alloc_extent_map(GFP_NOFS);
1082 em->start = cur_offset;
1083 em->orig_start = em->start;
1084 em->len = num_bytes;
1085 em->block_len = num_bytes;
1086 em->block_start = disk_bytenr;
1087 em->bdev = root->fs_info->fs_devices->latest_bdev;
1088 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1089 while (1) {
1090 spin_lock(&em_tree->lock);
1091 ret = add_extent_mapping(em_tree, em);
1092 spin_unlock(&em_tree->lock);
1093 if (ret != -EEXIST) {
1094 free_extent_map(em);
1095 break;
1096 }
1097 btrfs_drop_extent_cache(inode, em->start,
1098 em->start + em->len - 1, 0);
1099 }
1100 type = BTRFS_ORDERED_PREALLOC;
1101 } else {
1102 type = BTRFS_ORDERED_NOCOW;
1103 }
1104
1105 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1106 num_bytes, num_bytes, type);
1107 BUG_ON(ret);
1108
1109 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1110 cur_offset, cur_offset + num_bytes - 1,
1111 locked_page, 1, 1, 1, 0, 0, 0);
1112 cur_offset = extent_end;
1113 if (cur_offset > end)
1114 break;
1115 }
1116 btrfs_release_path(root, path);
1117
1118 if (cur_offset <= end && cow_start == (u64)-1)
1119 cow_start = cur_offset;
1120 if (cow_start != (u64)-1) {
1121 ret = cow_file_range(inode, locked_page, cow_start, end,
1122 page_started, nr_written, 1);
1123 BUG_ON(ret);
1124 }
1125
1126 ret = btrfs_end_transaction(trans, root);
1127 BUG_ON(ret);
1128 btrfs_free_path(path);
1129 return 0;
1130}
1131
1132/*
1133 * extent_io.c call back to do delayed allocation processing
1134 */
1135static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1136 u64 start, u64 end, int *page_started,
1137 unsigned long *nr_written)
1138{
1139 int ret;
1140
1141 if (btrfs_test_flag(inode, NODATACOW))
1142 ret = run_delalloc_nocow(inode, locked_page, start, end,
1143 page_started, 1, nr_written);
1144 else if (btrfs_test_flag(inode, PREALLOC))
1145 ret = run_delalloc_nocow(inode, locked_page, start, end,
1146 page_started, 0, nr_written);
1147 else
1148 ret = cow_file_range_async(inode, locked_page, start, end,
1149 page_started, nr_written);
1150
1151 return ret;
1152}
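
/*
 * Editorial summary of the dispatch above: NODATACOW inodes force the
 * nocow walk for every extent (force == 1); PREALLOC inodes only allow
 * nocow writes into preallocated extents (force == 0, so regular
 * extents still COW); everything else takes the async COW path, which
 * is also where compression happens.
 */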
1153
1154/*
1155 * extent_io.c set_bit_hook, used to track delayed allocation
1156 * bytes in this file, and to maintain the list of inodes that
1157 * have pending delalloc work to be done.
1158 */
1159static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1160 unsigned long old, unsigned long bits)
1161{
1162 /*
1163 * set_bit and clear_bit hooks normally require _irqsave/restore
1164 * but in this case, we are only testing for the DELALLOC
1165 * bit, which is only set or cleared with irqs on
1166 */
1167 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1168 struct btrfs_root *root = BTRFS_I(inode)->root;
1169 spin_lock(&root->fs_info->delalloc_lock);
1170 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1171 root->fs_info->delalloc_bytes += end - start + 1;
1172 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1173 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1174 &root->fs_info->delalloc_inodes);
1175 }
1176 spin_unlock(&root->fs_info->delalloc_lock);
1177 }
1178 return 0;
1179}
1180
1181/*
1182 * extent_io.c clear_bit_hook, see set_bit_hook for why
1183 */
1184static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1185 unsigned long old, unsigned long bits)
1186{
1187 /*
1188 * set_bit and clear_bit hooks normally require _irqsave/restore
1189 * but in this case, we are only testing for the DELALLOC
1190 * bit, which is only set or cleared with irqs on
1191 */
1192 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1193 struct btrfs_root *root = BTRFS_I(inode)->root;
1194
1195 spin_lock(&root->fs_info->delalloc_lock);
1196 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1197 printk(KERN_INFO "btrfs warning: delalloc account "
1198 "%llu %llu\n",
1199 (unsigned long long)(end - start + 1),
1200 (unsigned long long)
1201 root->fs_info->delalloc_bytes);
1202 root->fs_info->delalloc_bytes = 0;
1203 BTRFS_I(inode)->delalloc_bytes = 0;
1204 } else {
1205 root->fs_info->delalloc_bytes -= end - start + 1;
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1207 }
1208 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1209 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1210 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1211 }
1212 spin_unlock(&root->fs_info->delalloc_lock);
1213 }
1214 return 0;
1215}
1216
1217/*
1218 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1219 * we don't create bios that span stripes or chunks
1220 */
1221int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1222 size_t size, struct bio *bio,
1223 unsigned long bio_flags)
1224{
1225 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1226 struct btrfs_mapping_tree *map_tree;
1227 u64 logical = (u64)bio->bi_sector << 9;
1228 u64 length = 0;
1229 u64 map_length;
1230 int ret;
1231
1232 if (bio_flags & EXTENT_BIO_COMPRESSED)
1233 return 0;
1234
1235 length = bio->bi_size;
1236 map_tree = &root->fs_info->mapping_tree;
1237 map_length = length;
1238 ret = btrfs_map_block(map_tree, READ, logical,
1239 &map_length, NULL, 0);
1240
1241 if (map_length < length + size)
1242 return 1;
1243 return 0;
1244}
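
/*
 * Editorial note: btrfs_map_block() trims map_length down to the
 * number of bytes that stay within one stripe/chunk starting at
 * 'logical' (its return value is ignored here).  If the existing bio
 * plus the page being merged (length + size) would cross that
 * boundary, returning 1 tells extent_io.c to start a new bio instead
 * of merging.
 */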
1245
1246/*
1247 * in order to insert checksums into the metadata in large chunks,
1248 * we wait until bio submission time. All the pages in the bio are
1249 * checksummed and sums are attached onto the ordered extent record.
1250 *
1251 * At IO completion time the csums attached to the ordered extent record
1252 * are inserted into the btree
1253 */
1254static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1255 struct bio *bio, int mirror_num,
1256 unsigned long bio_flags)
1257{
1258 struct btrfs_root *root = BTRFS_I(inode)->root;
1259 int ret = 0;
1260
1261 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1262 BUG_ON(ret);
1263 return 0;
1264}
1265
1266/*
1267 * in order to insert checksums into the metadata in large chunks,
1268 * we wait until bio submission time. __btrfs_submit_bio_start above
1269 * attaches the checksums, so by the time this done hook runs the bio
1270 * only needs to be handed to the device mapping layer.
1271 *
1272 * At IO completion time those csums are inserted into the btree.
1273 */
1274static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1275 int mirror_num, unsigned long bio_flags)
1276{
1277 struct btrfs_root *root = BTRFS_I(inode)->root;
1278 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1279}
1280
1281/*
1282 * extent_io.c submission hook. This does the right thing for csum calculation
1283 * on write, or reading the csums from the tree before a read
1284 */
1285static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1286 int mirror_num, unsigned long bio_flags)
1287{
1288 struct btrfs_root *root = BTRFS_I(inode)->root;
1289 int ret = 0;
1290 int skip_sum;
1291
1292 skip_sum = btrfs_test_flag(inode, NODATASUM);
1293
1294 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1295 BUG_ON(ret);
1296
1297 if (!(rw & (1 << BIO_RW))) {
1298 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1299 return btrfs_submit_compressed_read(inode, bio,
1300 mirror_num, bio_flags);
1301 } else if (!skip_sum)
1302 btrfs_lookup_bio_sums(root, inode, bio, NULL);
1303 goto mapit;
1304 } else if (!skip_sum) {
1305 /* csum items have already been cloned */
1306 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1307 goto mapit;
1308 /* we're doing a write, do the async checksumming */
1309 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1310 inode, rw, bio, mirror_num,
1311 bio_flags, __btrfs_submit_bio_start,
1312 __btrfs_submit_bio_done);
1313 }
1314
1315mapit:
1316 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1317}
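
/*
 * Editorial sketch of the decision above, reading rw and the flags:
 *
 *	read,  compressed -> btrfs_submit_compressed_read()
 *	read,  csummed    -> btrfs_lookup_bio_sums(), then map the bio
 *	write, csummed    -> async: csum in __btrfs_submit_bio_start,
 *			     map in __btrfs_submit_bio_done
 *	NODATASUM (or the data reloc tree on writes) -> map directly
 */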
1318
1319/*
1320 * given a list of ordered sums, record them in the inode. This happens
1321 * at IO completion time based on sums calculated at bio submission time.
1322 */
1323static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset,
1325 struct list_head *list)
1326{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum;
1329
1330 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) {
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1333 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 }
1336 return 0;
1337}
1338
1339int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1340{
1341 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1342 WARN_ON(1);
1343 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1344 GFP_NOFS);
1345}
1346
1347/* see btrfs_writepage_start_hook for details on why this is required */
1348struct btrfs_writepage_fixup {
1349 struct page *page;
1350 struct btrfs_work work;
1351};
1352
1353static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1354{
1355 struct btrfs_writepage_fixup *fixup;
1356 struct btrfs_ordered_extent *ordered;
1357 struct page *page;
1358 struct inode *inode;
1359 u64 page_start;
1360 u64 page_end;
1361
1362 fixup = container_of(work, struct btrfs_writepage_fixup, work);
1363 page = fixup->page;
1364again:
1365 lock_page(page);
1366 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1367 ClearPageChecked(page);
1368 goto out_page;
1369 }
1370
1371 inode = page->mapping->host;
1372 page_start = page_offset(page);
1373 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1374
1375 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1376
1377 /* already ordered? We're done */
1378 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1379 EXTENT_ORDERED, 0)) {
1380 goto out;
1381 }
1382
1383 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1384 if (ordered) {
1385 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1386 page_end, GFP_NOFS);
1387 unlock_page(page);
1388 btrfs_start_ordered_extent(inode, ordered, 1);
1389 goto again;
1390 }
1391
1392 btrfs_set_extent_delalloc(inode, page_start, page_end);
1393 ClearPageChecked(page);
1394out:
1395 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1396out_page:
1397 unlock_page(page);
1398 page_cache_release(page);
1399}
1400
1401/*
1402 * There are a few paths in the higher layers of the kernel that directly
1403 * set the page dirty bit without asking the filesystem if it is a
1404 * good idea. This causes problems because we want to make sure COW
1405 * properly happens and the data=ordered rules are followed.
1406 *
1407 * In our case any range that doesn't have the ORDERED bit set
1408 * hasn't been properly set up for IO. We kick off an async process
1409 * to fix it up. The async helper will wait for ordered extents, set
1410 * the delalloc bit and make it safe to write the page.
1411 */
1412static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1413{
1414 struct inode *inode = page->mapping->host;
1415 struct btrfs_writepage_fixup *fixup;
1416 struct btrfs_root *root = BTRFS_I(inode)->root;
1417 int ret;
1418
1419 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1420 EXTENT_ORDERED, 0);
1421 if (ret)
1422 return 0;
1423
1424 if (PageChecked(page))
1425 return -EAGAIN;
1426
1427 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1428 if (!fixup)
1429 return -EAGAIN;
1430
1431 SetPageChecked(page);
1432 page_cache_get(page);
1433 fixup->work.func = btrfs_writepage_fixup_worker;
1434 fixup->page = page;
1435 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1436 return -EAGAIN;
1437}
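
/*
 * Editorial note: returning -EAGAIN makes the extent_io writepage code
 * redirty the page and skip it for this pass; the queued fixup worker
 * then waits out any ordered extent, marks the range delalloc again
 * and makes the page safe to write later.
 */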
1438
1439static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1440 struct inode *inode, u64 file_pos,
1441 u64 disk_bytenr, u64 disk_num_bytes,
1442 u64 num_bytes, u64 ram_bytes,
1443 u8 compression, u8 encryption,
1444 u16 other_encoding, int extent_type)
1445{
1446 struct btrfs_root *root = BTRFS_I(inode)->root;
1447 struct btrfs_file_extent_item *fi;
1448 struct btrfs_path *path;
1449 struct extent_buffer *leaf;
1450 struct btrfs_key ins;
1451 u64 hint;
1452 int ret;
1453
1454 path = btrfs_alloc_path();
1455 BUG_ON(!path);
1456
1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1458 file_pos + num_bytes, file_pos, &hint);
1459 BUG_ON(ret);
1460
1461 ins.objectid = inode->i_ino;
1462 ins.offset = file_pos;
1463 ins.type = BTRFS_EXTENT_DATA_KEY;
1464 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1465 BUG_ON(ret);
1466 leaf = path->nodes[0];
1467 fi = btrfs_item_ptr(leaf, path->slots[0],
1468 struct btrfs_file_extent_item);
1469 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1470 btrfs_set_file_extent_type(leaf, fi, extent_type);
1471 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1472 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1473 btrfs_set_file_extent_offset(leaf, fi, 0);
1474 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1475 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479 btrfs_mark_buffer_dirty(leaf);
1480
1481 inode_add_bytes(inode, num_bytes);
1482 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1483
1484 ins.objectid = disk_bytenr;
1485 ins.offset = disk_num_bytes;
1486 ins.type = BTRFS_EXTENT_ITEM_KEY;
1487 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1488 root->root_key.objectid,
1489 trans->transid, inode->i_ino, &ins);
1490 BUG_ON(ret);
1491
1492 btrfs_free_path(path);
1493 return 0;
1494}
1495
1496/* as ordered data IO finishes, this gets called so we can finish
1497 * an ordered extent if the range of bytes in the file it covers is
1498 * fully written.
1499 */
1500static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1501{
1502 struct btrfs_root *root = BTRFS_I(inode)->root;
1503 struct btrfs_trans_handle *trans;
1504 struct btrfs_ordered_extent *ordered_extent;
1505 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1506 int compressed = 0;
1507 int ret;
1508
1509 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1510 if (!ret)
1511 return 0;
1512
1513 trans = btrfs_join_transaction(root, 1);
1514
1515 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1516 BUG_ON(!ordered_extent);
1517 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1518 goto nocow;
1519
1520 lock_extent(io_tree, ordered_extent->file_offset,
1521 ordered_extent->file_offset + ordered_extent->len - 1,
1522 GFP_NOFS);
1523
1524 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1525 compressed = 1;
1526 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1527 BUG_ON(compressed);
1528 ret = btrfs_mark_extent_written(trans, root, inode,
1529 ordered_extent->file_offset,
1530 ordered_extent->file_offset +
1531 ordered_extent->len);
1532 BUG_ON(ret);
1533 } else {
1534 ret = insert_reserved_file_extent(trans, inode,
1535 ordered_extent->file_offset,
1536 ordered_extent->start,
1537 ordered_extent->disk_len,
1538 ordered_extent->len,
1539 ordered_extent->len,
1540 compressed, 0, 0,
1541 BTRFS_FILE_EXTENT_REG);
1542 BUG_ON(ret);
1543 }
1544 unlock_extent(io_tree, ordered_extent->file_offset,
1545 ordered_extent->file_offset + ordered_extent->len - 1,
1546 GFP_NOFS);
1547nocow:
1548 add_pending_csums(trans, inode, ordered_extent->file_offset,
1549 &ordered_extent->list);
1550
1551 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1552 btrfs_ordered_update_i_size(inode, ordered_extent);
1553 btrfs_update_inode(trans, root, inode);
1554 btrfs_remove_ordered_extent(inode, ordered_extent);
1555 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1556
1557 /* once for us */
1558 btrfs_put_ordered_extent(ordered_extent);
1559 /* once for the tree */
1560 btrfs_put_ordered_extent(ordered_extent);
1561
1562 btrfs_end_transaction(trans, root);
1563 return 0;
1564}
1565
1566static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1567 struct extent_state *state, int uptodate)
1568{
1569 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1570}
1571
1572/*
1573 * When IO fails, either with EIO or because csum verification fails, we
1574 * try other mirrors that might have a good copy of the data. This
1575 * io_failure_record is used to record state as we go through all the
1576 * mirrors. If another mirror has good data, the page is set up to date
1577 * and things continue. If a good mirror can't be found, the original
1578 * bio end_io callback is called to indicate things have failed.
1579 */
1580struct io_failure_record {
1581 struct page *page;
1582 u64 start;
1583 u64 len;
1584 u64 logical;
1585 unsigned long bio_flags;
1586 int last_mirror;
1587};
1588
1589static int btrfs_io_failed_hook(struct bio *failed_bio,
1590 struct page *page, u64 start, u64 end,
1591 struct extent_state *state)
1592{
1593 struct io_failure_record *failrec = NULL;
1594 u64 private;
1595 struct extent_map *em;
1596 struct inode *inode = page->mapping->host;
1597 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1598 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1599 struct bio *bio;
1600 int num_copies;
1601 int ret;
1602 int rw;
1603 u64 logical;
1604
1605 ret = get_state_private(failure_tree, start, &private);
1606 if (ret) {
1607 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1608 if (!failrec)
1609 return -ENOMEM;
1610 failrec->start = start;
1611 failrec->len = end - start + 1;
1612 failrec->last_mirror = 0;
1613 failrec->bio_flags = 0;
1614
1615 spin_lock(&em_tree->lock);
1616 em = lookup_extent_mapping(em_tree, start, failrec->len);
1617 if (em->start > start || em->start + em->len < start) {
1618 free_extent_map(em);
1619 em = NULL;
1620 }
1621 spin_unlock(&em_tree->lock);
1622
1623 if (!em || IS_ERR(em)) {
1624 kfree(failrec);
1625 return -EIO;
1626 }
1627 logical = start - em->start;
1628 logical = em->block_start + logical;
1629 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1630 logical = em->block_start;
1631 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1632 }
1633 failrec->logical = logical;
1634 free_extent_map(em);
1635 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1636 EXTENT_DIRTY, GFP_NOFS);
1637 set_state_private(failure_tree, start,
1638 (u64)(unsigned long)failrec);
1639 } else {
1640 failrec = (struct io_failure_record *)(unsigned long)private;
1641 }
1642 num_copies = btrfs_num_copies(
1643 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1644 failrec->logical, failrec->len);
1645 failrec->last_mirror++;
1646 if (!state) {
1647 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1648 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1649 failrec->start,
1650 EXTENT_LOCKED);
1651 if (state && state->start != failrec->start)
1652 state = NULL;
1653 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1654 }
1655 if (!state || failrec->last_mirror > num_copies) {
1656 set_state_private(failure_tree, failrec->start, 0);
1657 clear_extent_bits(failure_tree, failrec->start,
1658 failrec->start + failrec->len - 1,
1659 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1660 kfree(failrec);
1661 return -EIO;
1662 }
1663 bio = bio_alloc(GFP_NOFS, 1);
1664 bio->bi_private = state;
1665 bio->bi_end_io = failed_bio->bi_end_io;
1666 bio->bi_sector = failrec->logical >> 9;
1667 bio->bi_bdev = failed_bio->bi_bdev;
1668 bio->bi_size = 0;
1669
1670 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1671 if (failed_bio->bi_rw & (1 << BIO_RW))
1672 rw = WRITE;
1673 else
1674 rw = READ;
1675
1676 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1677 failrec->last_mirror,
1678 failrec->bio_flags);
1679 return 0;
1680}
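
/*
 * Editorial note on the retry loop: the io_failure_record persists in
 * the io_failure_tree across attempts while last_mirror walks from 1
 * up to num_copies.  On RAID1 (num_copies == 2), for example, a bad
 * read from mirror 1 is resubmitted against mirror 2; only when every
 * copy has been tried (or no matching extent state is found) does the
 * -EIO propagate to the original bio.
 */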
1681
1682/*
1683 * each time an IO finishes, we do a fast check in the IO failure tree
1684 * to see if we need to process or clean up an io_failure_record
1685 */
1686static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1687{
1688 u64 private;
1689 u64 private_failure;
1690 struct io_failure_record *failure;
1691 int ret;
1692
1693 private = 0;
1694 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1695 (u64)-1, 1, EXTENT_DIRTY)) {
1696 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1697 start, &private_failure);
1698 if (ret == 0) {
1699 failure = (struct io_failure_record *)(unsigned long)
1700 private_failure;
1701 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1702 failure->start, 0);
1703 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1704 failure->start,
1705 failure->start + failure->len - 1,
1706 EXTENT_DIRTY | EXTENT_LOCKED,
1707 GFP_NOFS);
1708 kfree(failure);
1709 }
1710 }
1711 return 0;
1712}
1713
1714/*
1715 * when reads are done, we need to check csums to verify the data is correct
1716 * if there's a match, we allow the bio to finish. If not, we go through
1717 * the io_failure_record routines to find good copies
1718 */
1719static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1720 struct extent_state *state)
1721{
1722 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1723 struct inode *inode = page->mapping->host;
1724 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1725 char *kaddr;
1726 u64 private = ~(u32)0;
1727 int ret;
1728 struct btrfs_root *root = BTRFS_I(inode)->root;
1729 u32 csum = ~(u32)0;
1730
1731 if (PageChecked(page)) {
1732 ClearPageChecked(page);
1733 goto good;
1734 }
1735 if (btrfs_test_flag(inode, NODATASUM))
1736 return 0;
1737
1738 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1739 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
1740 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1741 GFP_NOFS);
1742 return 0;
1743 }
1744
1745 if (state && state->start == start) {
1746 private = state->private;
1747 ret = 0;
1748 } else {
1749 ret = get_state_private(io_tree, start, &private);
1750 }
1751 kaddr = kmap_atomic(page, KM_USER0);
1752 if (ret)
1753 goto zeroit;
1754
1755 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
1756 btrfs_csum_final(csum, (char *)&csum);
1757 if (csum != private)
1758 goto zeroit;
1759
1760 kunmap_atomic(kaddr, KM_USER0);
1761good:
1762 /* if the io failure tree for this inode is non-empty,
1763 * check to see if we've recovered from a failed IO
1764 */
1765 btrfs_clean_io_failures(inode, start);
1766 return 0;
1767
1768zeroit:
1769 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1770 "private %llu\n", page->mapping->host->i_ino,
1771 (unsigned long long)start, csum,
1772 (unsigned long long)private);
1773 memset(kaddr + offset, 1, end - start + 1);
1774 flush_dcache_page(page);
1775 kunmap_atomic(kaddr, KM_USER0);
1776 if (private == 0)
1777 return 0;
1778 return -EIO;
1779}
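
/*
 * Editorial note on the zeroit path: the failed range is deliberately
 * filled with 0x01 bytes so a csum mismatch can never leak stale or
 * wrong data to userspace, and the -EIO return lets
 * btrfs_io_failed_hook() above try any remaining mirrors.
 */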
1780
1781/*
1782 * This creates an orphan entry for the given inode in case something goes
1783 * wrong in the middle of an unlink/truncate.
1784 */
1785int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1786{
1787 struct btrfs_root *root = BTRFS_I(inode)->root;
1788 int ret = 0;
1789
1790 spin_lock(&root->list_lock);
1791
1792 /* already on the orphan list, we're good */
1793 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1794 spin_unlock(&root->list_lock);
1795 return 0;
1796 }
1797
1798 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1799
1800 spin_unlock(&root->list_lock);
1801
1802 /*
1803 * insert an orphan item to track this unlinked/truncated file
1804 */
1805 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1806
1807 return ret;
1808}
1809
1810/*
1811 * We have done the truncate/delete so we can go ahead and remove the orphan
1812 * item for this particular inode.
1813 */
1814int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1815{
1816 struct btrfs_root *root = BTRFS_I(inode)->root;
1817 int ret = 0;
1818
1819 spin_lock(&root->list_lock);
1820
1821 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1822 spin_unlock(&root->list_lock);
1823 return 0;
1824 }
1825
1826 list_del_init(&BTRFS_I(inode)->i_orphan);
1827 if (!trans) {
1828 spin_unlock(&root->list_lock);
1829 return 0;
1830 }
1831
1832 spin_unlock(&root->list_lock);
1833
1834 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1835
1836 return ret;
1837}
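
/*
 * Editorial sketch (not a quote from this file): the two helpers above
 * are meant to bracket a risky unlink/truncate, roughly
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	btrfs_orphan_add(trans, inode);       (crash-safe from here)
 *	... unlink or truncate the items ...
 *	btrfs_orphan_del(trans, inode);       (done, drop the marker)
 *	btrfs_end_transaction(trans, root);
 *
 * so btrfs_orphan_cleanup() below can finish the job after a crash.
 * In this file, btrfs_unlink() adds the orphan once the last link is
 * gone and btrfs_delete_inode() drops it when the truncate completes.
 */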
1838
1839/*
1840 * this cleans up any orphans that may be left on the list from the last use
1841 * of this root.
1842 */
1843void btrfs_orphan_cleanup(struct btrfs_root *root)
1844{
1845 struct btrfs_path *path;
1846 struct extent_buffer *leaf;
1847 struct btrfs_item *item;
1848 struct btrfs_key key, found_key;
1849 struct btrfs_trans_handle *trans;
1850 struct inode *inode;
1851 int ret = 0, nr_unlink = 0, nr_truncate = 0;
1852
1853 path = btrfs_alloc_path();
1854 if (!path)
1855 return;
1856 path->reada = -1;
1857
1858 key.objectid = BTRFS_ORPHAN_OBJECTID;
1859 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1860 key.offset = (u64)-1;
1861
1862
1863 while (1) {
1864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1865 if (ret < 0) {
1866 printk(KERN_ERR "Error searching slot for orphan: %d"
1867 "\n", ret);
1868 break;
1869 }
1870
1871 /*
1872 * ret == 0 means we found exactly what we were searching for,
1873 * which is weird, but possible; only adjust the path if we didn't
1874 * find the key, and see if we have stuff that matches
1875 */
1876 if (ret > 0) {
1877 if (path->slots[0] == 0)
1878 break;
1879 path->slots[0]--;
1880 }
1881
1882 /* pull out the item */
1883 leaf = path->nodes[0];
1884 item = btrfs_item_nr(leaf, path->slots[0]);
1885 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1886
1887 /* make sure the item matches what we want */
1888 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1889 break;
1890 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1891 break;
1892
1893 /* release the path since we're done with it */
1894 btrfs_release_path(root, path);
1895
1896 /*
1897 * this is basically btrfs_lookup, without crossing into a
1898 * different root. we store the inode number in the
1899 * offset of the orphan item.
1900 */
1901 inode = btrfs_iget_locked(root->fs_info->sb,
1902 found_key.offset, root);
1903 if (!inode)
1904 break;
1905
1906 if (inode->i_state & I_NEW) {
1907 BTRFS_I(inode)->root = root;
1908
1909 /* have to set the location manually */
1910 BTRFS_I(inode)->location.objectid = inode->i_ino;
1911 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1912 BTRFS_I(inode)->location.offset = 0;
1913
1914 btrfs_read_locked_inode(inode);
1915 unlock_new_inode(inode);
1916 }
1917
1918 /*
1919 * add this inode to the orphan list so btrfs_orphan_del does
1920 * the proper thing when we hit it
1921 */
1922 spin_lock(&root->list_lock);
1923 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1924 spin_unlock(&root->list_lock);
1925
1926 /*
1927 * if this is a bad inode, it means we actually succeeded in
1928 * removing the inode, but not the orphan record, which means
1929 * we need to manually delete the orphan since iput will just
1930 * do a destroy_inode
1931 */
1932 if (is_bad_inode(inode)) {
1933 trans = btrfs_start_transaction(root, 1);
1934 btrfs_orphan_del(trans, inode);
1935 btrfs_end_transaction(trans, root);
1936 iput(inode);
1937 continue;
1938 }
1939
1940 /* if we have links, this was a truncate, let's do that */
1941 if (inode->i_nlink) {
1942 nr_truncate++;
1943 btrfs_truncate(inode);
1944 } else {
1945 nr_unlink++;
1946 }
1947
1948 /* this will do delete_inode and everything for us */
1949 iput(inode);
1950 }
1951
1952 if (nr_unlink)
1953 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1954 if (nr_truncate)
1955 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1956
1957 btrfs_free_path(path);
1958}
1959
1960/*
1961 * read an inode from the btree into the in-memory inode
1962 */
1963void btrfs_read_locked_inode(struct inode *inode)
1964{
1965 struct btrfs_path *path;
1966 struct extent_buffer *leaf;
1967 struct btrfs_inode_item *inode_item;
1968 struct btrfs_timespec *tspec;
1969 struct btrfs_root *root = BTRFS_I(inode)->root;
1970 struct btrfs_key location;
1971 u64 alloc_group_block;
1972 u32 rdev;
1973 int ret;
1974
1975 path = btrfs_alloc_path();
1976 BUG_ON(!path);
1977 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1978
1979 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1980 if (ret)
1981 goto make_bad;
1982
1983 leaf = path->nodes[0];
1984 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1985 struct btrfs_inode_item);
1986
1987 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1988 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1989 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1990 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1991 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1992
1993 tspec = btrfs_inode_atime(inode_item);
1994 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1995 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1996
1997 tspec = btrfs_inode_mtime(inode_item);
1998 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1999 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2000
2001 tspec = btrfs_inode_ctime(inode_item);
2002 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2003 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2004
2005 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2006 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2007 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2008 inode->i_generation = BTRFS_I(inode)->generation;
2009 inode->i_rdev = 0;
2010 rdev = btrfs_inode_rdev(leaf, inode_item);
2011
2012 BTRFS_I(inode)->index_cnt = (u64)-1;
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0);
2018 btrfs_free_path(path);
2019 inode_item = NULL;
2020
2021 switch (inode->i_mode & S_IFMT) {
2022 case S_IFREG:
2023 inode->i_mapping->a_ops = &btrfs_aops;
2024 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2025 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2026 inode->i_fop = &btrfs_file_operations;
2027 inode->i_op = &btrfs_file_inode_operations;
2028 break;
2029 case S_IFDIR:
2030 inode->i_fop = &btrfs_dir_file_operations;
2031 if (root == root->fs_info->tree_root)
2032 inode->i_op = &btrfs_dir_ro_inode_operations;
2033 else
2034 inode->i_op = &btrfs_dir_inode_operations;
2035 break;
2036 case S_IFLNK:
2037 inode->i_op = &btrfs_symlink_inode_operations;
2038 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break;
2041 default:
2042 init_special_inode(inode, inode->i_mode, rdev);
2043 break;
2044 }
2045 return;
2046
2047make_bad:
2048 btrfs_free_path(path);
2049 make_bad_inode(inode);
2050}
2051
2052/*
2053 * given a leaf and an inode, copy the inode fields into the leaf
2054 */
2055static void fill_inode_item(struct btrfs_trans_handle *trans,
2056 struct extent_buffer *leaf,
2057 struct btrfs_inode_item *item,
2058 struct inode *inode)
2059{
2060 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2061 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2062 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2063 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2064 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2065
2066 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2067 inode->i_atime.tv_sec);
2068 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2069 inode->i_atime.tv_nsec);
2070
2071 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2072 inode->i_mtime.tv_sec);
2073 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2074 inode->i_mtime.tv_nsec);
2075
2076 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2077 inode->i_ctime.tv_sec);
2078 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2079 inode->i_ctime.tv_nsec);
2080
2081 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2082 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2083 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2084 btrfs_set_inode_transid(leaf, item, trans->transid);
2085 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2086 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2087 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2088}
2089
2090/*
2091 * copy everything in the in-memory inode into the btree.
2092 */
2093noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2094 struct btrfs_root *root, struct inode *inode)
2095{
2096 struct btrfs_inode_item *inode_item;
2097 struct btrfs_path *path;
2098 struct extent_buffer *leaf;
2099 int ret;
2100
2101 path = btrfs_alloc_path();
2102 BUG_ON(!path);
2103 ret = btrfs_lookup_inode(trans, root, path,
2104 &BTRFS_I(inode)->location, 1);
2105 if (ret) {
2106 if (ret > 0)
2107 ret = -ENOENT;
2108 goto failed;
2109 }
2110
2111 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item);
2114
2115 fill_inode_item(trans, leaf, inode_item, inode);
2116 btrfs_mark_buffer_dirty(leaf);
2117 btrfs_set_inode_last_trans(trans, inode);
2118 ret = 0;
2119failed:
2120 btrfs_free_path(path);
2121 return ret;
2122}
2123
2124
2125/*
2126 * unlink helper that gets used here in inode.c and in the tree logging
2127 * recovery code. It removes a link in a directory with a given name, and
2128 * also drops the back refs in the inode to the directory
2129 */
2130int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2131 struct btrfs_root *root,
2132 struct inode *dir, struct inode *inode,
2133 const char *name, int name_len)
2134{
2135 struct btrfs_path *path;
2136 int ret = 0;
2137 struct extent_buffer *leaf;
2138 struct btrfs_dir_item *di;
2139 struct btrfs_key key;
2140 u64 index;
2141
2142 path = btrfs_alloc_path();
2143 if (!path) {
2144 ret = -ENOMEM;
2145 goto err;
2146 }
2147
2148 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2149 name, name_len, -1);
2150 if (IS_ERR(di)) {
2151 ret = PTR_ERR(di);
2152 goto err;
2153 }
2154 if (!di) {
2155 ret = -ENOENT;
2156 goto err;
2157 }
2158 leaf = path->nodes[0];
2159 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2160 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2161 if (ret)
2162 goto err;
2163 btrfs_release_path(root, path);
2164
2165 ret = btrfs_del_inode_ref(trans, root, name, name_len,
2166 inode->i_ino,
2167 dir->i_ino, &index);
2168 if (ret) {
2169 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2170 "inode %lu parent %lu\n", name_len, name,
2171 inode->i_ino, dir->i_ino);
2172 goto err;
2173 }
2174
2175 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2176 index, name, name_len, -1);
2177 if (IS_ERR(di)) {
2178 ret = PTR_ERR(di);
2179 goto err;
2180 }
2181 if (!di) {
2182 ret = -ENOENT;
2183 goto err;
2184 }
2185 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2186 btrfs_release_path(root, path);
2187
2188 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2189 inode, dir->i_ino);
2190 BUG_ON(ret != 0 && ret != -ENOENT);
2191 if (ret != -ENOENT)
2192 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2193
2194 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2195 dir, index);
2196 BUG_ON(ret);
2197err:
2198 btrfs_free_path(path);
2199 if (ret)
2200 goto out;
2201
2202 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2203 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2204 btrfs_update_inode(trans, root, dir);
2205 btrfs_drop_nlink(inode);
2206 ret = btrfs_update_inode(trans, root, inode);
2207 dir->i_sb->s_dirt = 1;
2208out:
2209 return ret;
2210}
2211
2212static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2213{
2214 struct btrfs_root *root;
2215 struct btrfs_trans_handle *trans;
2216 struct inode *inode = dentry->d_inode;
2217 int ret;
2218 unsigned long nr = 0;
2219
2220 root = BTRFS_I(dir)->root;
2221
2222 ret = btrfs_check_free_space(root, 1, 1);
2223 if (ret)
2224 goto fail;
2225
2226 trans = btrfs_start_transaction(root, 1);
2227
2228 btrfs_set_trans_block_group(trans, dir);
2229 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2230 dentry->d_name.name, dentry->d_name.len);
2231
2232 if (inode->i_nlink == 0)
2233 ret = btrfs_orphan_add(trans, inode);
2234
2235 nr = trans->blocks_used;
2236
2237 btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr);
2240 return ret;
2241}
2242
2243static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2244{
2245 struct inode *inode = dentry->d_inode;
2246 int err = 0;
2247 int ret;
2248 struct btrfs_root *root = BTRFS_I(dir)->root;
2249 struct btrfs_trans_handle *trans;
2250 unsigned long nr = 0;
2251
2252 /*
2253 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2254 * the root of a subvolume or snapshot
2255 */
2256 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2257 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2258 return -ENOTEMPTY;
2259 }
2260
2261 ret = btrfs_check_free_space(root, 1, 1);
2262 if (ret)
2263 goto fail;
2264
2265 trans = btrfs_start_transaction(root, 1);
2266 btrfs_set_trans_block_group(trans, dir);
2267
2268 err = btrfs_orphan_add(trans, inode);
2269 if (err)
2270 goto fail_trans;
2271
2272 /* now the directory is empty */
2273 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2274 dentry->d_name.name, dentry->d_name.len);
2275 if (!err)
2276 btrfs_i_size_write(inode, 0);
2277
2278fail_trans:
2279 nr = trans->blocks_used;
2280 ret = btrfs_end_transaction_throttle(trans, root);
2281fail:
2282 btrfs_btree_balance_dirty(root, nr);
2283
2284 if (ret && !err)
2285 err = ret;
2286 return err;
2287}
2288
2289#if 0
2290/*
2291 * when truncating bytes in a file, it is possible to avoid reading
2292 * the leaves that contain only checksum items. This can be the
2293 * majority of the IO required to delete a large file, but it must
2294 * be done carefully.
2295 *
2296 * The keys in the level just above the leaves are checked to make sure
2297 * the lowest key in a given leaf is a csum key, and starts at an offset
2298 * after the new size.
2299 *
2300 * Then the key for the next leaf is checked to make sure it also has
2301 * a checksum item for the same file. If it does, we know our target leaf
2302 * contains only checksum items, and it can be safely freed without reading
2303 * it.
2304 *
2305 * This is just an optimization targeted at large files. It may do
2306 * nothing. It will return 0 unless things went badly.
2307 */
2308static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
2309 struct btrfs_root *root,
2310 struct btrfs_path *path,
2311 struct inode *inode, u64 new_size)
2312{
2313 struct btrfs_key key;
2314 int ret;
2315 int nritems;
2316 struct btrfs_key found_key;
2317 struct btrfs_key other_key;
2318 struct btrfs_leaf_ref *ref;
2319 u64 leaf_gen;
2320 u64 leaf_start;
2321
2322 path->lowest_level = 1;
2323 key.objectid = inode->i_ino;
2324 key.type = BTRFS_CSUM_ITEM_KEY;
2325 key.offset = new_size;
2326again:
2327 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2328 if (ret < 0)
2329 goto out;
2330
2331 if (path->nodes[1] == NULL) {
2332 ret = 0;
2333 goto out;
2334 }
2335 ret = 0;
2336 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
2337 nritems = btrfs_header_nritems(path->nodes[1]);
2338
2339 if (!nritems)
2340 goto out;
2341
2342 if (path->slots[1] >= nritems)
2343 goto next_node;
2344
2345 /* did we find a key greater than anything we want to delete? */
2346 if (found_key.objectid > inode->i_ino ||
2347 (found_key.objectid == inode->i_ino && found_key.type > key.type))
2348 goto out;
2349
2350 /* we check the next key in the node to make sure the leaf contains
2351 * only checksum items. This comparison doesn't work if our
2352 * leaf is the last one in the node
2353 */
2354 if (path->slots[1] + 1 >= nritems) {
2355next_node:
2356 /* search forward from the last key in the node, this
2357 * will bring us into the next node in the tree
2358 */
2359 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
2360
2361 /* unlikely, but we inc below, so check to be safe */
2362 if (found_key.offset == (u64)-1)
2363 goto out;
2364
2365 /* search_forward needs a path with locks held, do the
2366 * search again for the original key. It is possible
2367 * this will race with a balance and return a path that
2368 * we could modify, but this drop is just an optimization
2369 * and is allowed to miss some leaves.
2370 */
2371 btrfs_release_path(root, path);
2372 found_key.offset++;
2373
2374 /* setup a max key for search_forward */
2375 other_key.offset = (u64)-1;
2376 other_key.type = key.type;
2377 other_key.objectid = key.objectid;
2378
2379 path->keep_locks = 1;
2380 ret = btrfs_search_forward(root, &found_key, &other_key,
2381 path, 0, 0);
2382 path->keep_locks = 0;
2383 if (ret || found_key.objectid != key.objectid ||
2384 found_key.type != key.type) {
2385 ret = 0;
2386 goto out;
2387 }
2388
2389 key.offset = found_key.offset;
2390 btrfs_release_path(root, path);
2391 cond_resched();
2392 goto again;
2393 }
2394
2395 /* we know there's one more slot after us in the tree,
2396 * read that key so we can verify it is also a checksum item
2397 */
2398 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
2399
2400 if (found_key.objectid < inode->i_ino)
2401 goto next_key;
2402
2403 if (found_key.type != key.type || found_key.offset < new_size)
2404 goto next_key;
2405
2406 /*
2407 * if the key for the next leaf isn't a csum key from this objectid,
2408 * we can't be sure there aren't good items inside this leaf.
2409 * Bail out
2410 */
2411 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
2412 goto out;
2413
2414 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
2415 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
2416 /*
2417 * it is safe to delete this leaf, it contains only
2418 * csum items from this inode at an offset >= new_size
2419 */
2420 ret = btrfs_del_leaf(trans, root, path, leaf_start);
2421 BUG_ON(ret);
2422
2423 if (root->ref_cows && leaf_gen < trans->transid) {
2424 ref = btrfs_alloc_leaf_ref(root, 0);
2425 if (ref) {
2426 ref->root_gen = root->root_key.offset;
2427 ref->bytenr = leaf_start;
2428 ref->owner = 0;
2429 ref->generation = leaf_gen;
2430 ref->nritems = 0;
2431
2432 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref);
2435 } else {
2436 WARN_ON(1);
2437 }
2438 }
2439next_key:
2440 btrfs_release_path(root, path);
2441
2442 if (other_key.objectid == inode->i_ino &&
2443 other_key.type == key.type && other_key.offset > key.offset) {
2444 key.offset = other_key.offset;
2445 cond_resched();
2446 goto again;
2447 }
2448 ret = 0;
2449out:
2450 /* fixup any changes we've made to the path */
2451 path->lowest_level = 0;
2452 path->keep_locks = 0;
2453 btrfs_release_path(root, path);
2454 return ret;
2455}
2456
2457#endif
2458
2459/*
2460 * this can truncate away extent items, csum items and directory items.
2461 * It starts at a high offset and removes keys until it can't find
2462 * any higher than new_size
2463 *
2464 * csum items that cross the new i_size are truncated to the new size
2465 * as well.
2466 *
2467 * min_type is the minimum key type to truncate down to. If set to 0, this
2468 * will kill all the items on this inode, including the INODE_ITEM_KEY.
2469 */
2470noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2471 struct btrfs_root *root,
2472 struct inode *inode,
2473 u64 new_size, u32 min_type)
2474{
2475 int ret;
2476 struct btrfs_path *path;
2477 struct btrfs_key key;
2478 struct btrfs_key found_key;
2479 u32 found_type;
2480 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0;
2483 u64 extent_num_bytes = 0;
2484 u64 item_end = 0;
2485 u64 root_gen = 0;
2486 u64 root_owner = 0;
2487 int found_extent;
2488 int del_item;
2489 int pending_del_nr = 0;
2490 int pending_del_slot = 0;
2491 int extent_type = -1;
2492 int encoding;
2493 u64 mask = root->sectorsize - 1;
2494
2495 if (root->ref_cows)
2496 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2497 path = btrfs_alloc_path();
2498 path->reada = -1;
2499 BUG_ON(!path);
2500
2501 /* FIXME, add redo link to tree so we don't leak on crash */
2502 key.objectid = inode->i_ino;
2503 key.offset = (u64)-1;
2504 key.type = (u8)-1;
2505
2506 btrfs_init_path(path);
2507
2508search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0)
2511 goto error;
2512
2513 if (ret > 0) {
2514 /* there are no items in the tree for us to truncate, we're
2515 * done
2516 */
2517 if (path->slots[0] == 0) {
2518 ret = 0;
2519 goto error;
2520 }
2521 path->slots[0]--;
2522 }
2523
2524 while (1) {
2525 fi = NULL;
2526 leaf = path->nodes[0];
2527 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2528 found_type = btrfs_key_type(&found_key);
2529 encoding = 0;
2530
2531 if (found_key.objectid != inode->i_ino)
2532 break;
2533
2534 if (found_type < min_type)
2535 break;
2536
2537 item_end = found_key.offset;
2538 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2539 fi = btrfs_item_ptr(leaf, path->slots[0],
2540 struct btrfs_file_extent_item);
2541 extent_type = btrfs_file_extent_type(leaf, fi);
2542 encoding = btrfs_file_extent_compression(leaf, fi);
2543 encoding |= btrfs_file_extent_encryption(leaf, fi);
2544 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2545
2546 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2547 item_end +=
2548 btrfs_file_extent_num_bytes(leaf, fi);
2549 } else {
2550 item_end += btrfs_file_extent_inline_len(leaf,
2551 fi);
2552 }
2553 item_end--;
2554 }
2555 if (item_end < new_size) {
2556 if (found_type == BTRFS_DIR_ITEM_KEY)
2557 found_type = BTRFS_INODE_ITEM_KEY;
2558 else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2559 found_type = BTRFS_EXTENT_DATA_KEY;
2560 else if (found_type == BTRFS_EXTENT_DATA_KEY)
2561 found_type = BTRFS_XATTR_ITEM_KEY;
2562 else if (found_type == BTRFS_XATTR_ITEM_KEY)
2563 found_type = BTRFS_INODE_REF_KEY;
2564 else if (found_type)
2565 found_type--;
2566 else
2567 break;
2568 btrfs_set_key_type(&key, found_type);
2569 goto next;
2570 }
2571 if (found_key.offset >= new_size)
2572 del_item = 1;
2573 else
2574 del_item = 0;
2575 found_extent = 0;
2576
2577 /* FIXME, shrink the extent if the ref count is only 1 */
2578 if (found_type != BTRFS_EXTENT_DATA_KEY)
2579 goto delete;
2580
2581 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2582 u64 num_dec;
2583 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2584 if (!del_item && !encoding) {
2585 u64 orig_num_bytes =
2586 btrfs_file_extent_num_bytes(leaf, fi);
2587 extent_num_bytes = new_size -
2588 found_key.offset + root->sectorsize - 1;
2589 extent_num_bytes = extent_num_bytes &
2590 ~((u64)root->sectorsize - 1);
2591 btrfs_set_file_extent_num_bytes(leaf, fi,
2592 extent_num_bytes);
2593 num_dec = (orig_num_bytes -
2594 extent_num_bytes);
2595 if (root->ref_cows && extent_start != 0)
2596 inode_sub_bytes(inode, num_dec);
2597 btrfs_mark_buffer_dirty(leaf);
2598 } else {
2599 extent_num_bytes =
2600 btrfs_file_extent_disk_num_bytes(leaf,
2601 fi);
2602 /* FIXME blocksize != 4096 */
2603 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2604 if (extent_start != 0) {
2605 found_extent = 1;
2606 if (root->ref_cows)
2607 inode_sub_bytes(inode, num_dec);
2608 }
2609 root_gen = btrfs_header_generation(leaf);
2610 root_owner = btrfs_header_owner(leaf);
2611 }
2612 } else {
2613 /*
2614 * we can't truncate inline items that have had
2615 * special encodings
2616 */
2617 if (!del_item &&
2618 btrfs_file_extent_compression(leaf, fi) == 0 &&
2619 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2620 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2621 u32 size = new_size - found_key.offset;
2622
2623 if (root->ref_cows) {
2624 inode_sub_bytes(inode, item_end + 1 -
2625 new_size);
2626 }
2627 size =
2628 btrfs_file_extent_calc_inline_size(size);
2629 ret = btrfs_truncate_item(trans, root, path,
2630 size, 1);
2631 BUG_ON(ret);
2632 } else if (root->ref_cows) {
2633 inode_sub_bytes(inode, item_end + 1 -
2634 found_key.offset);
2635 }
2636 }
2637delete:
2638 if (del_item) {
2639 if (!pending_del_nr) {
2640 /* no pending yet, add ourselves */
2641 pending_del_slot = path->slots[0];
2642 pending_del_nr = 1;
2643 } else if (pending_del_nr &&
2644 path->slots[0] + 1 == pending_del_slot) {
2645 /* hop on the pending chunk */
2646 pending_del_nr++;
2647 pending_del_slot = path->slots[0];
2648 } else {
2649 BUG();
2650 }
2651 } else {
2652 break;
2653 }
2654 if (found_extent) {
2655 ret = btrfs_free_extent(trans, root, extent_start,
2656 extent_num_bytes,
2657 leaf->start, root_owner,
2658 root_gen, inode->i_ino, 0);
2659 BUG_ON(ret);
2660 }
2661next:
2662 if (path->slots[0] == 0) {
2663 if (pending_del_nr)
2664 goto del_pending;
2665 btrfs_release_path(root, path);
2666 goto search_again;
2667 }
2668
2669 path->slots[0]--;
2670 if (pending_del_nr &&
2671 path->slots[0] + 1 != pending_del_slot) {
2672 struct btrfs_key debug;
2673del_pending:
2674 btrfs_item_key_to_cpu(path->nodes[0], &debug,
2675 pending_del_slot);
2676 ret = btrfs_del_items(trans, root, path,
2677 pending_del_slot,
2678 pending_del_nr);
2679 BUG_ON(ret);
2680 pending_del_nr = 0;
2681 btrfs_release_path(root, path);
2682 goto search_again;
2683 }
2684 }
2685 ret = 0;
2686error:
2687 if (pending_del_nr) {
2688 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2689 pending_del_nr);
2690 }
2691 btrfs_free_path(path);
2692 inode->i_sb->s_dirt = 1;
2693 return ret;
2694}
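
/*
 * Editorial note on the pending_del batching above: the walk moves
 * backwards through each leaf, so doomed items end up in adjacent
 * slots.  They are accumulated only while path->slots[0] + 1 stays
 * equal to pending_del_slot and are then removed with a single
 * btrfs_del_items() call rather than one tree operation per item.
 */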
2695
2696/*
2697 * taken from block_truncate_page, but does COW as it zeros out
2698 * any bytes left in the last page in the file.
2699 */
2700static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2701{
2702 struct inode *inode = mapping->host;
2703 struct btrfs_root *root = BTRFS_I(inode)->root;
2704 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2705 struct btrfs_ordered_extent *ordered;
2706 char *kaddr;
2707 u32 blocksize = root->sectorsize;
2708 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2709 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2710 struct page *page;
2711 int ret = 0;
2712 u64 page_start;
2713 u64 page_end;
2714
2715 if ((offset & (blocksize - 1)) == 0)
2716 goto out;
2717
2718 ret = -ENOMEM;
2719again:
2720 page = grab_cache_page(mapping, index);
2721 if (!page)
2722 goto out;
2723
2724 page_start = page_offset(page);
2725 page_end = page_start + PAGE_CACHE_SIZE - 1;
2726
2727 if (!PageUptodate(page)) {
2728 ret = btrfs_readpage(NULL, page);
2729 lock_page(page);
2730 if (page->mapping != mapping) {
2731 unlock_page(page);
2732 page_cache_release(page);
2733 goto again;
2734 }
2735 if (!PageUptodate(page)) {
2736 ret = -EIO;
2737 goto out_unlock;
2738 }
2739 }
2740 wait_on_page_writeback(page);
2741
2742 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2743 set_page_extent_mapped(page);
2744
2745 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2746 if (ordered) {
2747 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2748 unlock_page(page);
2749 page_cache_release(page);
2750 btrfs_start_ordered_extent(inode, ordered, 1);
2751 btrfs_put_ordered_extent(ordered);
2752 goto again;
2753 }
2754
2755 btrfs_set_extent_delalloc(inode, page_start, page_end);
2756 ret = 0;
2757 if (offset != PAGE_CACHE_SIZE) {
2758 kaddr = kmap(page);
2759 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2760 flush_dcache_page(page);
2761 kunmap(page);
2762 }
2763 ClearPageChecked(page);
2764 set_page_dirty(page);
2765 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2766
2767out_unlock:
2768 unlock_page(page);
2769 page_cache_release(page);
2770out:
2771 return ret;
2772}
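
/*
 * Editorial note: unlike block_truncate_page(), the zeroing above goes
 * through delalloc (btrfs_set_extent_delalloc() plus set_page_dirty())
 * so the partial block is COWed at writeback time.  For example, with
 * 4096-byte pages a truncate to 6000 bytes zeroes file bytes
 * 6000..8191 in the second page.
 */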
2773
2774int btrfs_cont_expand(struct inode *inode, loff_t size)
2775{
2776 struct btrfs_trans_handle *trans;
2777 struct btrfs_root *root = BTRFS_I(inode)->root;
2778 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2779 struct extent_map *em;
2780 u64 mask = root->sectorsize - 1;
2781 u64 hole_start = (inode->i_size + mask) & ~mask;
2782 u64 block_end = (size + mask) & ~mask;
2783 u64 last_byte;
2784 u64 cur_offset;
2785 u64 hole_size;
2786 int err;
2787
2788 if (size <= hole_start)
2789 return 0;
2790
2791 err = btrfs_check_free_space(root, 1, 0);
2792 if (err)
2793 return err;
2794
2795 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2796
2797 while (1) {
2798 struct btrfs_ordered_extent *ordered;
2799 btrfs_wait_ordered_range(inode, hole_start,
2800 block_end - hole_start);
2801 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2802 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2803 if (!ordered)
2804 break;
2805 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2806 btrfs_put_ordered_extent(ordered);
2807 }
2808
2809 trans = btrfs_start_transaction(root, 1);
2810 btrfs_set_trans_block_group(trans, inode);
2811
2812 cur_offset = hole_start;
2813 while (1) {
2814 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2815 block_end - cur_offset, 0);
2816 BUG_ON(IS_ERR(em) || !em);
2817 last_byte = min(extent_map_end(em), block_end);
2818 last_byte = (last_byte + mask) & ~mask;
2819 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2820 u64 hint_byte = 0;
2821 hole_size = last_byte - cur_offset;
2822 err = btrfs_drop_extents(trans, root, inode,
2823 cur_offset,
2824 cur_offset + hole_size,
2825 cur_offset, &hint_byte);
2826 if (err)
2827 break;
2828 err = btrfs_insert_file_extent(trans, root,
2829 inode->i_ino, cur_offset, 0,
2830 0, hole_size, 0, hole_size,
2831 0, 0, 0);
2832 btrfs_drop_extent_cache(inode, hole_start,
2833 last_byte - 1, 0);
2834 }
2835 free_extent_map(em);
2836 cur_offset = last_byte;
2837 if (err || cur_offset >= block_end)
2838 break;
2839 }
2840
2841 btrfs_end_transaction(trans, root);
2842 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2843 return err;
2844}
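/*
 * Worked example of the alignment math above (a sketch, with an assumed
 * 4K sectorsize, so mask == 0xfff): expanding a file whose i_size is
 * 5000 bytes out to size 20000 gives
 *
 *	hole_start = (5000 + 4095) & ~4095 = 8192
 *	block_end  = (20000 + 4095) & ~4095 = 20480
 *
 * so btrfs_truncate_page() zeroes the tail of the block holding byte
 * 4999, and hole file extents are inserted to cover [8192, 20480)
 * wherever the extent map reports a vacancy.
 */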
2845
2846static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2847{
2848 struct inode *inode = dentry->d_inode;
2849 int err;
2850
2851 err = inode_change_ok(inode, attr);
2852 if (err)
2853 return err;
2854
2855 if (S_ISREG(inode->i_mode) &&
2856 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2857 err = btrfs_cont_expand(inode, attr->ia_size);
2858 if (err)
2859 return err;
2860 }
2861
2862 err = inode_setattr(inode, attr);
2863
2864 if (!err && ((attr->ia_valid & ATTR_MODE)))
2865 err = btrfs_acl_chmod(inode);
2866 return err;
2867}
2868
2869void btrfs_delete_inode(struct inode *inode)
2870{
2871 struct btrfs_trans_handle *trans;
2872 struct btrfs_root *root = BTRFS_I(inode)->root;
2873 unsigned long nr;
2874 int ret;
2875
2876 truncate_inode_pages(&inode->i_data, 0);
2877 if (is_bad_inode(inode)) {
2878 btrfs_orphan_del(NULL, inode);
2879 goto no_delete;
2880 }
2881 btrfs_wait_ordered_range(inode, 0, (u64)-1);
2882
2883 btrfs_i_size_write(inode, 0);
2884 trans = btrfs_join_transaction(root, 1);
2885
2886 btrfs_set_trans_block_group(trans, inode);
2887 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2888 if (ret) {
2889 btrfs_orphan_del(NULL, inode);
2890 goto no_delete_lock;
2891 }
2892
2893 btrfs_orphan_del(trans, inode);
2894
2895 nr = trans->blocks_used;
2896 clear_inode(inode);
2897
2898 btrfs_end_transaction(trans, root);
2899 btrfs_btree_balance_dirty(root, nr);
2900 return;
2901
2902no_delete_lock:
2903 nr = trans->blocks_used;
2904 btrfs_end_transaction(trans, root);
2905 btrfs_btree_balance_dirty(root, nr);
2906no_delete:
2907 clear_inode(inode);
2908}
2909
2910/*
2911 * this copies the key found in the dir entry into the location pointer.
2912 * If no dir entries were found, location->objectid is 0.
2913 */
2914static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2915 struct btrfs_key *location)
2916{
2917 const char *name = dentry->d_name.name;
2918 int namelen = dentry->d_name.len;
2919 struct btrfs_dir_item *di;
2920 struct btrfs_path *path;
2921 struct btrfs_root *root = BTRFS_I(dir)->root;
2922 int ret = 0;
2923
2924 path = btrfs_alloc_path();
2925 BUG_ON(!path);
2926
2927 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2928 namelen, 0);
2929 if (IS_ERR(di))
2930 ret = PTR_ERR(di);
2931
2932 if (!di || IS_ERR(di))
2933 goto out_err;
2934
2935 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2936out:
2937 btrfs_free_path(path);
2938 return ret;
2939out_err:
2940 location->objectid = 0;
2941 goto out;
2942}
2943
2944/*
2945 * when we hit a tree root in a directory, the btrfs part of the inode
2946 * needs to be changed to reflect the root directory of the tree root. This
2947 * is kind of like crossing a mount point.
2948 */
2949static int fixup_tree_root_location(struct btrfs_root *root,
2950 struct btrfs_key *location,
2951 struct btrfs_root **sub_root,
2952 struct dentry *dentry)
2953{
2954 struct btrfs_root_item *ri;
2955
2956 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2957 return 0;
2958 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2959 return 0;
2960
2961 *sub_root = btrfs_read_fs_root(root->fs_info, location,
2962 dentry->d_name.name,
2963 dentry->d_name.len);
2964 if (IS_ERR(*sub_root))
2965 return PTR_ERR(*sub_root);
2966
2967 ri = &(*sub_root)->root_item;
2968 location->objectid = btrfs_root_dirid(ri);
2969 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2970 location->offset = 0;
2971
2972 return 0;
2973}
2974
2975static noinline void init_btrfs_i(struct inode *inode)
2976{
2977 struct btrfs_inode *bi = BTRFS_I(inode);
2978
2979 bi->i_acl = NULL;
2980 bi->i_default_acl = NULL;
2981
2982 bi->generation = 0;
2983 bi->sequence = 0;
2984 bi->last_trans = 0;
2985 bi->logged_trans = 0;
2986 bi->delalloc_bytes = 0;
2987 bi->disk_i_size = 0;
2988 bi->flags = 0;
2989 bi->index_cnt = (u64)-1;
2990 bi->log_dirty_trans = 0;
2991 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2992 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2993 inode->i_mapping, GFP_NOFS);
2994 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2995 inode->i_mapping, GFP_NOFS);
2996 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
2997 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2998 mutex_init(&BTRFS_I(inode)->extent_mutex);
2999 mutex_init(&BTRFS_I(inode)->log_mutex);
3000}
3001
3002static int btrfs_init_locked_inode(struct inode *inode, void *p)
3003{
3004 struct btrfs_iget_args *args = p;
3005 inode->i_ino = args->ino;
3006 init_btrfs_i(inode);
3007 BTRFS_I(inode)->root = args->root;
3008 return 0;
3009}
3010
3011static int btrfs_find_actor(struct inode *inode, void *opaque)
3012{
3013 struct btrfs_iget_args *args = opaque;
3014 return args->ino == inode->i_ino &&
3015 args->root == BTRFS_I(inode)->root;
3016}
3017
3018struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
3019 struct btrfs_root *root, int wait)
3020{
3021 struct inode *inode;
3022 struct btrfs_iget_args args;
3023 args.ino = objectid;
3024 args.root = root;
3025
3026 if (wait) {
3027 inode = ilookup5(s, objectid, btrfs_find_actor,
3028 (void *)&args);
3029 } else {
3030 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3031 (void *)&args);
3032 }
3033 return inode;
3034}
3035
3036struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3037 struct btrfs_root *root)
3038{
3039 struct inode *inode;
3040 struct btrfs_iget_args args;
3041 args.ino = objectid;
3042 args.root = root;
3043
3044 inode = iget5_locked(s, objectid, btrfs_find_actor,
3045 btrfs_init_locked_inode,
3046 (void *)&args);
3047 return inode;
3048}
3049
3050/* Get an inode object given its location and corresponding root.
3051 * Sets *is_new to 1 if the inode was newly read from disk.
3052 */
3053struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3054 struct btrfs_root *root, int *is_new)
3055{
3056 struct inode *inode;
3057
3058 inode = btrfs_iget_locked(s, location->objectid, root);
3059 if (!inode)
3060 return ERR_PTR(-EACCES);
3061
3062 if (inode->i_state & I_NEW) {
3063 BTRFS_I(inode)->root = root;
3064 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3065 btrfs_read_locked_inode(inode);
3066 unlock_new_inode(inode);
3067 if (is_new)
3068 *is_new = 1;
3069 } else {
3070 if (is_new)
3071 *is_new = 0;
3072 }
3073
3074 return inode;
3075}
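/*
 * Usage sketch for btrfs_iget() (hypothetical caller, not from this
 * file): look up an inode by objectid in a given root.
 *
 *	struct btrfs_key loc;
 *	int new = 0;
 *
 *	loc.objectid = ino;
 *	loc.offset = 0;
 *	btrfs_set_key_type(&loc, BTRFS_INODE_ITEM_KEY);
 *	inode = btrfs_iget(sb, &loc, root, &new);
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);
 *
 * On return, new == 1 means the inode was just read from disk; the
 * reference must eventually be dropped with iput().
 */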
3076
3077struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3078{
3079 struct inode *inode;
3080 struct btrfs_inode *bi = BTRFS_I(dir);
3081 struct btrfs_root *root = bi->root;
3082 struct btrfs_root *sub_root = root;
3083 struct btrfs_key location;
3084 int ret, new;
3085
3086 if (dentry->d_name.len > BTRFS_NAME_LEN)
3087 return ERR_PTR(-ENAMETOOLONG);
3088
3089 ret = btrfs_inode_by_name(dir, dentry, &location);
3090
3091 if (ret < 0)
3092 return ERR_PTR(ret);
3093
3094 inode = NULL;
3095 if (location.objectid) {
3096 ret = fixup_tree_root_location(root, &location, &sub_root,
3097 dentry);
3098 if (ret < 0)
3099 return ERR_PTR(ret);
3100 if (ret > 0)
3101 return ERR_PTR(-ENOENT);
3102 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3103 if (IS_ERR(inode))
3104 return ERR_CAST(inode);
3105 }
3106 return inode;
3107}
3108
3109static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3110 struct nameidata *nd)
3111{
3112 struct inode *inode;
3113
3114 if (dentry->d_name.len > BTRFS_NAME_LEN)
3115 return ERR_PTR(-ENAMETOOLONG);
3116
3117 inode = btrfs_lookup_dentry(dir, dentry);
3118 if (IS_ERR(inode))
3119 return ERR_CAST(inode);
3120
3121 return d_splice_alias(inode, dentry);
3122}
3123
3124static unsigned char btrfs_filetype_table[] = {
3125 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3126};
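/*
 * The table above is indexed by the on-disk BTRFS_FT_* value stored in
 * each dir item, e.g. BTRFS_FT_REG_FILE (1) maps to DT_REG and
 * BTRFS_FT_DIR (2) maps to DT_DIR, so readdir can hand the VFS a d_type
 * without reading the target inode.
 */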
3127
3128static int btrfs_real_readdir(struct file *filp, void *dirent,
3129 filldir_t filldir)
3130{
3131 struct inode *inode = filp->f_dentry->d_inode;
3132 struct btrfs_root *root = BTRFS_I(inode)->root;
3133 struct btrfs_item *item;
3134 struct btrfs_dir_item *di;
3135 struct btrfs_key key;
3136 struct btrfs_key found_key;
3137 struct btrfs_path *path;
3138 int ret;
3139 u32 nritems;
3140 struct extent_buffer *leaf;
3141 int slot;
3142 int advance;
3143 unsigned char d_type;
3144 int over = 0;
3145 u32 di_cur;
3146 u32 di_total;
3147 u32 di_len;
3148 int key_type = BTRFS_DIR_INDEX_KEY;
3149 char tmp_name[32];
3150 char *name_ptr;
3151 int name_len;
3152
3153 /* FIXME, use a real flag for deciding about the key type */
3154 if (root->fs_info->tree_root == root)
3155 key_type = BTRFS_DIR_ITEM_KEY;
3156
3157 /* special case for "." */
3158 if (filp->f_pos == 0) {
3159 over = filldir(dirent, ".", 1,
3160 1, inode->i_ino,
3161 DT_DIR);
3162 if (over)
3163 return 0;
3164 filp->f_pos = 1;
3165 }
3166 /* special case for .., just use the back ref */
3167 if (filp->f_pos == 1) {
3168 u64 pino = parent_ino(filp->f_path.dentry);
3169 over = filldir(dirent, "..", 2,
3170 2, pino, DT_DIR);
3171 if (over)
3172 return 0;
3173 filp->f_pos = 2;
3174 }
3175	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
3176	path->reada = 2;
3177
3178 btrfs_set_key_type(&key, key_type);
3179 key.offset = filp->f_pos;
3180 key.objectid = inode->i_ino;
3181
3182 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3183 if (ret < 0)
3184 goto err;
3185 advance = 0;
3186
3187 while (1) {
3188 leaf = path->nodes[0];
3189 nritems = btrfs_header_nritems(leaf);
3190 slot = path->slots[0];
3191 if (advance || slot >= nritems) {
3192 if (slot >= nritems - 1) {
3193 ret = btrfs_next_leaf(root, path);
3194 if (ret)
3195 break;
3196 leaf = path->nodes[0];
3197 nritems = btrfs_header_nritems(leaf);
3198 slot = path->slots[0];
3199 } else {
3200 slot++;
3201 path->slots[0]++;
3202 }
3203 }
3204
3205 advance = 1;
3206 item = btrfs_item_nr(leaf, slot);
3207 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3208
3209 if (found_key.objectid != key.objectid)
3210 break;
3211 if (btrfs_key_type(&found_key) != key_type)
3212 break;
3213 if (found_key.offset < filp->f_pos)
3214 continue;
3215
3216 filp->f_pos = found_key.offset;
3217
3218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3219 di_cur = 0;
3220 di_total = btrfs_item_size(leaf, item);
3221
3222 while (di_cur < di_total) {
3223 struct btrfs_key location;
3224
3225 name_len = btrfs_dir_name_len(leaf, di);
3226 if (name_len <= sizeof(tmp_name)) {
3227 name_ptr = tmp_name;
3228 } else {
3229 name_ptr = kmalloc(name_len, GFP_NOFS);
3230 if (!name_ptr) {
3231 ret = -ENOMEM;
3232 goto err;
3233 }
3234 }
3235 read_extent_buffer(leaf, name_ptr,
3236 (unsigned long)(di + 1), name_len);
3237
3238 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3239 btrfs_dir_item_key_to_cpu(leaf, di, &location);
3240
3241 /* is this a reference to our own snapshot? If so
3242 * skip it
3243 */
3244 if (location.type == BTRFS_ROOT_ITEM_KEY &&
3245 location.objectid == root->root_key.objectid) {
3246 over = 0;
3247 goto skip;
3248 }
3249 over = filldir(dirent, name_ptr, name_len,
3250 found_key.offset, location.objectid,
3251 d_type);
3252
3253skip:
3254 if (name_ptr != tmp_name)
3255 kfree(name_ptr);
3256
3257 if (over)
3258 goto nopos;
3259 di_len = btrfs_dir_name_len(leaf, di) +
3260 btrfs_dir_data_len(leaf, di) + sizeof(*di);
3261 di_cur += di_len;
3262 di = (struct btrfs_dir_item *)((char *)di + di_len);
3263 }
3264 }
3265
3266 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
3269 else
3270 filp->f_pos++;
3271nopos:
3272 ret = 0;
3273err:
3274 btrfs_free_path(path);
3275 return ret;
3276}
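/*
 * Note on the f_pos convention used above: "." is synthesized at f_pos
 * 0 and ".." at f_pos 1, while real entries are keyed by their
 * DIR_INDEX offset, which always starts at 2 (see
 * btrfs_set_inode_index_count below).  A directory with three entries
 * therefore returns them at positions 2, 3 and 4, and once the index
 * keys run out f_pos is parked at INT_LIMIT so a rescan stops
 * immediately.
 */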
3277
3278int btrfs_write_inode(struct inode *inode, int wait)
3279{
3280 struct btrfs_root *root = BTRFS_I(inode)->root;
3281 struct btrfs_trans_handle *trans;
3282 int ret = 0;
3283
3284 if (root->fs_info->btree_inode == inode)
3285 return 0;
3286
3287 if (wait) {
3288 trans = btrfs_join_transaction(root, 1);
3289 btrfs_set_trans_block_group(trans, inode);
3290 ret = btrfs_commit_transaction(trans, root);
3291 }
3292 return ret;
3293}
3294
3295/*
3296 * This is somewhat expensive, updating the tree every time the
3297 * inode changes. But, it is most likely to find the inode in cache.
3298 * FIXME, needs more benchmarking...there are no reasons other than performance
3299 * to keep or drop this code.
3300 */
3301void btrfs_dirty_inode(struct inode *inode)
3302{
3303 struct btrfs_root *root = BTRFS_I(inode)->root;
3304 struct btrfs_trans_handle *trans;
3305
3306 trans = btrfs_join_transaction(root, 1);
3307 btrfs_set_trans_block_group(trans, inode);
3308 btrfs_update_inode(trans, root, inode);
3309 btrfs_end_transaction(trans, root);
3310}
3311
3312/*
3313 * find the highest existing sequence number in a directory
3314 * and then set the in-memory index_cnt variable to reflect
3315 * free sequence numbers
3316 */
3317static int btrfs_set_inode_index_count(struct inode *inode)
3318{
3319 struct btrfs_root *root = BTRFS_I(inode)->root;
3320 struct btrfs_key key, found_key;
3321 struct btrfs_path *path;
3322 struct extent_buffer *leaf;
3323 int ret;
3324
3325 key.objectid = inode->i_ino;
3326 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3327 key.offset = (u64)-1;
3328
3329 path = btrfs_alloc_path();
3330 if (!path)
3331 return -ENOMEM;
3332
3333 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3334 if (ret < 0)
3335 goto out;
3336 /* FIXME: we should be able to handle this */
3337 if (ret == 0)
3338 goto out;
3339 ret = 0;
3340
3341 /*
3342 * MAGIC NUMBER EXPLANATION:
3343 * since we search a directory based on f_pos we have to start at 2
3344 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3345 * else has to start at 2
3346 */
3347 if (path->slots[0] == 0) {
3348 BTRFS_I(inode)->index_cnt = 2;
3349 goto out;
3350 }
3351
3352 path->slots[0]--;
3353
3354 leaf = path->nodes[0];
3355 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3356
3357 if (found_key.objectid != inode->i_ino ||
3358 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3359 BTRFS_I(inode)->index_cnt = 2;
3360 goto out;
3361 }
3362
3363 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3364out:
3365 btrfs_free_path(path);
3366 return ret;
3367}
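/*
 * Example (a sketch): in a directory whose highest existing DIR_INDEX
 * key has offset 41, the search above lands on that key and index_cnt
 * becomes 42, so the next entry created in the directory gets sequence
 * number 42.  A directory with no index items at all starts at 2,
 * leaving 0 and 1 free for "." and "..".
 */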
3368
3369/*
3370 * helper to find a free sequence number in a given directory.  The current
3371 * code is very simple; later versions will do smarter things in the btree.
3372 */
3373int btrfs_set_inode_index(struct inode *dir, u64 *index)
3374{
3375 int ret = 0;
3376
3377 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3378 ret = btrfs_set_inode_index_count(dir);
3379 if (ret)
3380 return ret;
3381 }
3382
3383 *index = BTRFS_I(dir)->index_cnt;
3384 BTRFS_I(dir)->index_cnt++;
3385
3386 return ret;
3387}
3388
3389static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3390 struct btrfs_root *root,
3391 struct inode *dir,
3392 const char *name, int name_len,
3393 u64 ref_objectid, u64 objectid,
3394 u64 alloc_hint, int mode, u64 *index)
3395{
3396 struct inode *inode;
3397 struct btrfs_inode_item *inode_item;
3398 struct btrfs_key *location;
3399 struct btrfs_path *path;
3400 struct btrfs_inode_ref *ref;
3401 struct btrfs_key key[2];
3402 u32 sizes[2];
3403 unsigned long ptr;
3404 int ret;
3405 int owner;
3406
3407 path = btrfs_alloc_path();
3408 BUG_ON(!path);
3409
3410 inode = new_inode(root->fs_info->sb);
3411 if (!inode)
3412 return ERR_PTR(-ENOMEM);
3413
3414 if (dir) {
3415 ret = btrfs_set_inode_index(dir, index);
3416 if (ret)
3417 return ERR_PTR(ret);
3418 }
3419 /*
3420 * index_cnt is ignored for everything but a dir,
3421	 * btrfs_set_inode_index_count has an explanation for the magic
3422 * number
3423 */
3424 init_btrfs_i(inode);
3425 BTRFS_I(inode)->index_cnt = 2;
3426 BTRFS_I(inode)->root = root;
3427 BTRFS_I(inode)->generation = trans->transid;
3428
3429	if (S_ISDIR(mode))
3430 owner = 0;
3431 else
3432 owner = 1;
3433 BTRFS_I(inode)->block_group =
3434 btrfs_find_block_group(root, 0, alloc_hint, owner);
3435	if (S_ISREG(mode)) {
3436 if (btrfs_test_opt(root, NODATASUM))
3437 btrfs_set_flag(inode, NODATASUM);
3438 if (btrfs_test_opt(root, NODATACOW))
3439 btrfs_set_flag(inode, NODATACOW);
3440 }
3441
3442 key[0].objectid = objectid;
3443 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3444 key[0].offset = 0;
3445
3446 key[1].objectid = objectid;
3447 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3448 key[1].offset = ref_objectid;
3449
3450 sizes[0] = sizeof(struct btrfs_inode_item);
3451 sizes[1] = name_len + sizeof(*ref);
3452
3453 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3454 if (ret != 0)
3455 goto fail;
3456
3457 if (objectid > root->highest_inode)
3458 root->highest_inode = objectid;
3459
3460 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid();
3462 inode->i_mode = mode;
3463 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0);
3465 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3466 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3467 struct btrfs_inode_item);
3468 fill_inode_item(trans, path->nodes[0], inode_item, inode);
3469
3470 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3471 struct btrfs_inode_ref);
3472 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3473 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3474 ptr = (unsigned long)(ref + 1);
3475 write_extent_buffer(path->nodes[0], name, ptr, name_len);
3476
3477 btrfs_mark_buffer_dirty(path->nodes[0]);
3478 btrfs_free_path(path);
3479
3480 location = &BTRFS_I(inode)->location;
3481 location->objectid = objectid;
3482 location->offset = 0;
3483 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3484
3485 insert_inode_hash(inode);
3486 return inode;
3487fail:
3488 if (dir)
3489 BTRFS_I(dir)->index_cnt--;
3490 btrfs_free_path(path);
3491 return ERR_PTR(ret);
3492}
3493
3494static inline u8 btrfs_inode_type(struct inode *inode)
3495{
3496 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3497}
3498
3499/*
3500 * utility function to add 'inode' into 'parent_inode' with
3501 * a given name and a given sequence number.
3502 * if 'add_backref' is true, also insert a backref from the
3503 * inode to the parent directory.
3504 */
3505int btrfs_add_link(struct btrfs_trans_handle *trans,
3506 struct inode *parent_inode, struct inode *inode,
3507 const char *name, int name_len, int add_backref, u64 index)
3508{
3509 int ret;
3510 struct btrfs_key key;
3511 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3512
3513 key.objectid = inode->i_ino;
3514 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3515 key.offset = 0;
3516
3517 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3518 parent_inode->i_ino,
3519 &key, btrfs_inode_type(inode),
3520 index);
3521 if (ret == 0) {
3522 if (add_backref) {
3523 ret = btrfs_insert_inode_ref(trans, root,
3524 name, name_len,
3525 inode->i_ino,
3526 parent_inode->i_ino,
3527 index);
3528 }
3529 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3530 name_len * 2);
3531 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3532 ret = btrfs_update_inode(trans, root, parent_inode);
3533 }
3534 return ret;
3535}
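/*
 * Example of the directory size accounting above (a sketch): every link
 * costs both a dir item and a dir index item, each carrying one copy of
 * the name, which is why i_size grows by name_len * 2.  Linking "foo"
 * into a directory therefore grows the parent's i_size by 6 bytes.
 */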
3536
3537static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3538 struct dentry *dentry, struct inode *inode,
3539 int backref, u64 index)
3540{
3541 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3542 inode, dentry->d_name.name,
3543 dentry->d_name.len, backref, index);
3544 if (!err) {
3545 d_instantiate(dentry, inode);
3546 return 0;
3547 }
3548 if (err > 0)
3549 err = -EEXIST;
3550 return err;
3551}
3552
3553static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3554 int mode, dev_t rdev)
3555{
3556 struct btrfs_trans_handle *trans;
3557 struct btrfs_root *root = BTRFS_I(dir)->root;
3558 struct inode *inode = NULL;
3559 int err;
3560 int drop_inode = 0;
3561 u64 objectid;
3562 unsigned long nr = 0;
3563 u64 index = 0;
3564
3565 if (!new_valid_dev(rdev))
3566 return -EINVAL;
3567
3568 err = btrfs_check_free_space(root, 1, 0);
3569 if (err)
3570 goto fail;
3571
3572 trans = btrfs_start_transaction(root, 1);
3573 btrfs_set_trans_block_group(trans, dir);
3574
3575 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3576 if (err) {
3577 err = -ENOSPC;
3578 goto out_unlock;
3579 }
3580
3581 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3582 dentry->d_name.len,
3583 dentry->d_parent->d_inode->i_ino, objectid,
3584 BTRFS_I(dir)->block_group, mode, &index);
3585 err = PTR_ERR(inode);
3586 if (IS_ERR(inode))
3587 goto out_unlock;
3588
3589 err = btrfs_init_acl(inode, dir);
3590 if (err) {
3591 drop_inode = 1;
3592 goto out_unlock;
3593 }
3594
3595 btrfs_set_trans_block_group(trans, inode);
3596 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3597 if (err)
3598 drop_inode = 1;
3599 else {
3600 inode->i_op = &btrfs_special_inode_operations;
3601 init_special_inode(inode, inode->i_mode, rdev);
3602 btrfs_update_inode(trans, root, inode);
3603 }
3604 dir->i_sb->s_dirt = 1;
3605 btrfs_update_inode_block_group(trans, inode);
3606 btrfs_update_inode_block_group(trans, dir);
3607out_unlock:
3608 nr = trans->blocks_used;
3609 btrfs_end_transaction_throttle(trans, root);
3610fail:
3611 if (drop_inode) {
3612 inode_dec_link_count(inode);
3613 iput(inode);
3614 }
3615 btrfs_btree_balance_dirty(root, nr);
3616 return err;
3617}
3618
3619static int btrfs_create(struct inode *dir, struct dentry *dentry,
3620 int mode, struct nameidata *nd)
3621{
3622 struct btrfs_trans_handle *trans;
3623 struct btrfs_root *root = BTRFS_I(dir)->root;
3624 struct inode *inode = NULL;
3625 int err;
3626 int drop_inode = 0;
3627 unsigned long nr = 0;
3628 u64 objectid;
3629 u64 index = 0;
3630
3631 err = btrfs_check_free_space(root, 1, 0);
3632 if (err)
3633 goto fail;
3634 trans = btrfs_start_transaction(root, 1);
3635 btrfs_set_trans_block_group(trans, dir);
3636
3637 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3638 if (err) {
3639 err = -ENOSPC;
3640 goto out_unlock;
3641 }
3642
3643 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3644 dentry->d_name.len,
3645 dentry->d_parent->d_inode->i_ino,
3646 objectid, BTRFS_I(dir)->block_group, mode,
3647 &index);
3648 err = PTR_ERR(inode);
3649 if (IS_ERR(inode))
3650 goto out_unlock;
3651
3652 err = btrfs_init_acl(inode, dir);
3653 if (err) {
3654 drop_inode = 1;
3655 goto out_unlock;
3656 }
3657
3658 btrfs_set_trans_block_group(trans, inode);
3659 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3660 if (err)
3661 drop_inode = 1;
3662 else {
3663 inode->i_mapping->a_ops = &btrfs_aops;
3664 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3665 inode->i_fop = &btrfs_file_operations;
3666 inode->i_op = &btrfs_file_inode_operations;
3667 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3668 }
3669 dir->i_sb->s_dirt = 1;
3670 btrfs_update_inode_block_group(trans, inode);
3671 btrfs_update_inode_block_group(trans, dir);
3672out_unlock:
3673 nr = trans->blocks_used;
3674 btrfs_end_transaction_throttle(trans, root);
3675fail:
3676 if (drop_inode) {
3677 inode_dec_link_count(inode);
3678 iput(inode);
3679 }
3680 btrfs_btree_balance_dirty(root, nr);
3681 return err;
3682}
3683
3684static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3685 struct dentry *dentry)
3686{
3687 struct btrfs_trans_handle *trans;
3688 struct btrfs_root *root = BTRFS_I(dir)->root;
3689 struct inode *inode = old_dentry->d_inode;
3690 u64 index;
3691 unsigned long nr = 0;
3692 int err;
3693 int drop_inode = 0;
3694
3695 if (inode->i_nlink == 0)
3696 return -ENOENT;
3697
3698 btrfs_inc_nlink(inode);
3699 err = btrfs_check_free_space(root, 1, 0);
3700 if (err)
3701 goto fail;
3702 err = btrfs_set_inode_index(dir, &index);
3703 if (err)
3704 goto fail;
3705
3706 trans = btrfs_start_transaction(root, 1);
3707
3708 btrfs_set_trans_block_group(trans, dir);
3709 atomic_inc(&inode->i_count);
3710
3711 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3712
3713 if (err)
3714 drop_inode = 1;
3715
3716 dir->i_sb->s_dirt = 1;
3717 btrfs_update_inode_block_group(trans, dir);
3718 err = btrfs_update_inode(trans, root, inode);
3719
3720 if (err)
3721 drop_inode = 1;
3722
3723 nr = trans->blocks_used;
3724 btrfs_end_transaction_throttle(trans, root);
3725fail:
3726 if (drop_inode) {
3727 inode_dec_link_count(inode);
3728 iput(inode);
3729 }
3730 btrfs_btree_balance_dirty(root, nr);
3731 return err;
3732}
3733
3734static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3735{
3736 struct inode *inode = NULL;
3737 struct btrfs_trans_handle *trans;
3738 struct btrfs_root *root = BTRFS_I(dir)->root;
3739 int err = 0;
3740 int drop_on_err = 0;
3741 u64 objectid = 0;
3742 u64 index = 0;
3743 unsigned long nr = 1;
3744
3745 err = btrfs_check_free_space(root, 1, 0);
3746 if (err)
3747 goto out_unlock;
3748
3749	trans = btrfs_start_transaction(root, 1);
3750	if (IS_ERR(trans)) {
3751		err = PTR_ERR(trans);
3752		goto out_unlock;
3753	}
3754
3755	btrfs_set_trans_block_group(trans, dir);
3756
3757 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3758 if (err) {
3759 err = -ENOSPC;
3760 goto out_unlock;
3761 }
3762
3763 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3764 dentry->d_name.len,
3765 dentry->d_parent->d_inode->i_ino, objectid,
3766 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3767 &index);
3768 if (IS_ERR(inode)) {
3769 err = PTR_ERR(inode);
3770 goto out_fail;
3771 }
3772
3773 drop_on_err = 1;
3774
3775 err = btrfs_init_acl(inode, dir);
3776 if (err)
3777 goto out_fail;
3778
3779 inode->i_op = &btrfs_dir_inode_operations;
3780 inode->i_fop = &btrfs_dir_file_operations;
3781 btrfs_set_trans_block_group(trans, inode);
3782
3783 btrfs_i_size_write(inode, 0);
3784 err = btrfs_update_inode(trans, root, inode);
3785 if (err)
3786 goto out_fail;
3787
3788 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3789 inode, dentry->d_name.name,
3790 dentry->d_name.len, 0, index);
3791 if (err)
3792 goto out_fail;
3793
3794 d_instantiate(dentry, inode);
3795 drop_on_err = 0;
3796 dir->i_sb->s_dirt = 1;
3797 btrfs_update_inode_block_group(trans, inode);
3798 btrfs_update_inode_block_group(trans, dir);
3799
3800out_fail:
3801 nr = trans->blocks_used;
3802 btrfs_end_transaction_throttle(trans, root);
3803
3804out_unlock:
3805 if (drop_on_err)
3806 iput(inode);
3807 btrfs_btree_balance_dirty(root, nr);
3808 return err;
3809}
3810
3811/* helper for btrfs_get_extent.  Given an existing extent in the tree,
3812 * and an extent that you want to insert, deal with overlap and insert
3813 * the new extent into the tree.
3814 */
3815static int merge_extent_mapping(struct extent_map_tree *em_tree,
3816 struct extent_map *existing,
3817 struct extent_map *em,
3818 u64 map_start, u64 map_len)
3819{
3820 u64 start_diff;
3821
3822 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3823 start_diff = map_start - em->start;
3824 em->start = map_start;
3825 em->len = map_len;
3826 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3827 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3828 em->block_start += start_diff;
3829 em->block_len -= start_diff;
3830 }
3831 return add_extent_mapping(em_tree, em);
3832}
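/*
 * Worked example for merge_extent_mapping() (a sketch with assumed
 * numbers): if the caller asks for map_start 12288 with map_len 4096
 * against an em covering [8192, 24576) whose disk block_start is B,
 * then
 *
 *	start_diff = 12288 - 8192 = 4096
 *
 * and em is clipped to [12288, 16384) with block_start B + 4096, so the
 * logical and physical offsets stay in sync.  Compressed extents keep
 * their original block_start/block_len because the on-disk bytes cover
 * the whole logical range.
 */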
3833
3834static noinline int uncompress_inline(struct btrfs_path *path,
3835 struct inode *inode, struct page *page,
3836 size_t pg_offset, u64 extent_offset,
3837 struct btrfs_file_extent_item *item)
3838{
3839 int ret;
3840 struct extent_buffer *leaf = path->nodes[0];
3841 char *tmp;
3842 size_t max_size;
3843 unsigned long inline_size;
3844 unsigned long ptr;
3845
3846 WARN_ON(pg_offset != 0);
3847 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3848 inline_size = btrfs_file_extent_inline_item_len(leaf,
3849 btrfs_item_nr(leaf, path->slots[0]));
3850	tmp = kmalloc(inline_size, GFP_NOFS);
	if (!tmp)
		return -ENOMEM;
3851	ptr = btrfs_file_extent_inline_start(item);
3852
3853 read_extent_buffer(leaf, tmp, ptr, inline_size);
3854
3855 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3856 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3857 inline_size, max_size);
3858 if (ret) {
3859 char *kaddr = kmap_atomic(page, KM_USER0);
3860 unsigned long copy_size = min_t(u64,
3861 PAGE_CACHE_SIZE - pg_offset,
3862 max_size - extent_offset);
3863 memset(kaddr + pg_offset, 0, copy_size);
3864 kunmap_atomic(kaddr, KM_USER0);
3865 }
3866 kfree(tmp);
3867 return 0;
3868}
3869
3870/*
3871 * a bit scary, this does extent mapping from logical file offset to the disk.
3872 * the ugly parts come from merging extents from the disk with the in-ram
3873 * representation. This gets more complex because of the data=ordered code,
3874 * where the in-ram extents might be locked pending data=ordered completion.
3875 *
3876 * This also copies inline extents directly into the page.
3877 */
3878
3879struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3880 size_t pg_offset, u64 start, u64 len,
3881 int create)
3882{
3883 int ret;
3884 int err = 0;
3885 u64 bytenr;
3886 u64 extent_start = 0;
3887 u64 extent_end = 0;
3888 u64 objectid = inode->i_ino;
3889 u32 found_type;
3890 struct btrfs_path *path = NULL;
3891 struct btrfs_root *root = BTRFS_I(inode)->root;
3892 struct btrfs_file_extent_item *item;
3893 struct extent_buffer *leaf;
3894 struct btrfs_key found_key;
3895 struct extent_map *em = NULL;
3896 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3897 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3898 struct btrfs_trans_handle *trans = NULL;
3899 int compressed;
3900
3901again:
3902 spin_lock(&em_tree->lock);
3903 em = lookup_extent_mapping(em_tree, start, len);
3904 if (em)
3905 em->bdev = root->fs_info->fs_devices->latest_bdev;
3906 spin_unlock(&em_tree->lock);
3907
3908 if (em) {
3909 if (em->start > start || em->start + em->len <= start)
3910 free_extent_map(em);
3911 else if (em->block_start == EXTENT_MAP_INLINE && page)
3912 free_extent_map(em);
3913 else
3914 goto out;
3915 }
3916 em = alloc_extent_map(GFP_NOFS);
3917 if (!em) {
3918 err = -ENOMEM;
3919 goto out;
3920 }
3921 em->bdev = root->fs_info->fs_devices->latest_bdev;
3922 em->start = EXTENT_MAP_HOLE;
3923 em->orig_start = EXTENT_MAP_HOLE;
3924 em->len = (u64)-1;
3925 em->block_len = (u64)-1;
3926
3927 if (!path) {
3928 path = btrfs_alloc_path();
3929 BUG_ON(!path);
3930 }
3931
3932 ret = btrfs_lookup_file_extent(trans, root, path,
3933 objectid, start, trans != NULL);
3934 if (ret < 0) {
3935 err = ret;
3936 goto out;
3937 }
3938
3939 if (ret != 0) {
3940 if (path->slots[0] == 0)
3941 goto not_found;
3942 path->slots[0]--;
3943 }
3944
3945 leaf = path->nodes[0];
3946 item = btrfs_item_ptr(leaf, path->slots[0],
3947 struct btrfs_file_extent_item);
3948 /* are we inside the extent that was found? */
3949 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3950 found_type = btrfs_key_type(&found_key);
3951 if (found_key.objectid != objectid ||
3952 found_type != BTRFS_EXTENT_DATA_KEY) {
3953 goto not_found;
3954 }
3955
3956 found_type = btrfs_file_extent_type(leaf, item);
3957 extent_start = found_key.offset;
3958 compressed = btrfs_file_extent_compression(leaf, item);
3959 if (found_type == BTRFS_FILE_EXTENT_REG ||
3960 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3961 extent_end = extent_start +
3962 btrfs_file_extent_num_bytes(leaf, item);
3963 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3964 size_t size;
3965 size = btrfs_file_extent_inline_len(leaf, item);
3966 extent_end = (extent_start + size + root->sectorsize - 1) &
3967 ~((u64)root->sectorsize - 1);
3968 }
3969
3970 if (start >= extent_end) {
3971 path->slots[0]++;
3972 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3973 ret = btrfs_next_leaf(root, path);
3974 if (ret < 0) {
3975 err = ret;
3976 goto out;
3977 }
3978 if (ret > 0)
3979 goto not_found;
3980 leaf = path->nodes[0];
3981 }
3982 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3983 if (found_key.objectid != objectid ||
3984 found_key.type != BTRFS_EXTENT_DATA_KEY)
3985 goto not_found;
3986 if (start + len <= found_key.offset)
3987 goto not_found;
3988 em->start = start;
3989 em->len = found_key.offset - start;
3990 goto not_found_em;
3991 }
3992
3993 if (found_type == BTRFS_FILE_EXTENT_REG ||
3994 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3995 em->start = extent_start;
3996 em->len = extent_end - extent_start;
3997 em->orig_start = extent_start -
3998 btrfs_file_extent_offset(leaf, item);
3999 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4000 if (bytenr == 0) {
4001 em->block_start = EXTENT_MAP_HOLE;
4002 goto insert;
4003 }
4004 if (compressed) {
4005 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4006 em->block_start = bytenr;
4007 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4008 item);
4009 } else {
4010 bytenr += btrfs_file_extent_offset(leaf, item);
4011 em->block_start = bytenr;
4012 em->block_len = em->len;
4013 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4014 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4015 }
4016 goto insert;
4017 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4018 unsigned long ptr;
4019 char *map;
4020 size_t size;
4021 size_t extent_offset;
4022 size_t copy_size;
4023
4024 em->block_start = EXTENT_MAP_INLINE;
4025 if (!page || create) {
4026 em->start = extent_start;
4027 em->len = extent_end - extent_start;
4028 goto out;
4029 }
4030
4031 size = btrfs_file_extent_inline_len(leaf, item);
4032 extent_offset = page_offset(page) + pg_offset - extent_start;
4033 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4034 size - extent_offset);
4035 em->start = extent_start + extent_offset;
4036 em->len = (copy_size + root->sectorsize - 1) &
4037 ~((u64)root->sectorsize - 1);
4038 em->orig_start = EXTENT_MAP_INLINE;
4039 if (compressed)
4040 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4041 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4042 if (create == 0 && !PageUptodate(page)) {
4043 if (btrfs_file_extent_compression(leaf, item) ==
4044 BTRFS_COMPRESS_ZLIB) {
4045 ret = uncompress_inline(path, inode, page,
4046 pg_offset,
4047 extent_offset, item);
4048 BUG_ON(ret);
4049 } else {
4050 map = kmap(page);
4051 read_extent_buffer(leaf, map + pg_offset, ptr,
4052 copy_size);
4053 kunmap(page);
4054 }
4055 flush_dcache_page(page);
4056 } else if (create && PageUptodate(page)) {
4057			if (!trans) {
4058				/* the page is not kmapped on this path */
4059 free_extent_map(em);
4060 em = NULL;
4061 btrfs_release_path(root, path);
4062 trans = btrfs_join_transaction(root, 1);
4063 goto again;
4064 }
4065 map = kmap(page);
4066 write_extent_buffer(leaf, map + pg_offset, ptr,
4067 copy_size);
4068 kunmap(page);
4069 btrfs_mark_buffer_dirty(leaf);
4070 }
4071 set_extent_uptodate(io_tree, em->start,
4072 extent_map_end(em) - 1, GFP_NOFS);
4073 goto insert;
4074 } else {
4075 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4076 WARN_ON(1);
4077 }
4078not_found:
4079 em->start = start;
4080 em->len = len;
4081not_found_em:
4082 em->block_start = EXTENT_MAP_HOLE;
4083 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4084insert:
4085 btrfs_release_path(root, path);
4086 if (em->start > start || extent_map_end(em) <= start) {
4087 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4088 "[%llu %llu]\n", (unsigned long long)em->start,
4089 (unsigned long long)em->len,
4090 (unsigned long long)start,
4091 (unsigned long long)len);
4092 err = -EIO;
4093 goto out;
4094 }
4095
4096 err = 0;
4097 spin_lock(&em_tree->lock);
4098 ret = add_extent_mapping(em_tree, em);
4099 /* it is possible that someone inserted the extent into the tree
4100 * while we had the lock dropped. It is also possible that
4101 * an overlapping map exists in the tree
4102 */
4103 if (ret == -EEXIST) {
4104 struct extent_map *existing;
4105
4106 ret = 0;
4107
4108 existing = lookup_extent_mapping(em_tree, start, len);
4109 if (existing && (existing->start > start ||
4110 existing->start + existing->len <= start)) {
4111 free_extent_map(existing);
4112 existing = NULL;
4113 }
4114 if (!existing) {
4115 existing = lookup_extent_mapping(em_tree, em->start,
4116 em->len);
4117 if (existing) {
4118 err = merge_extent_mapping(em_tree, existing,
4119 em, start,
4120 root->sectorsize);
4121 free_extent_map(existing);
4122 if (err) {
4123 free_extent_map(em);
4124 em = NULL;
4125 }
4126 } else {
4127 err = -EIO;
4128 free_extent_map(em);
4129 em = NULL;
4130 }
4131 } else {
4132 free_extent_map(em);
4133 em = existing;
4134 err = 0;
4135 }
4136 }
4137 spin_unlock(&em_tree->lock);
4138out:
4139 if (path)
4140 btrfs_free_path(path);
4141 if (trans) {
4142 ret = btrfs_end_transaction(trans, root);
4143 if (!err)
4144 err = ret;
4145 }
4146 if (err) {
4147 free_extent_map(em);
4148 WARN_ON(1);
4149 return ERR_PTR(err);
4150 }
4151 return em;
4152}
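/*
 * Usage sketch for btrfs_get_extent() (hypothetical caller): map one
 * sector of a file for reading, with no page and no create intent.
 *
 *	em = btrfs_get_extent(inode, NULL, 0, start, sectorsize, 0);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	if (em->block_start == EXTENT_MAP_HOLE)
 *		... the range is unallocated ...
 *	free_extent_map(em);
 *
 * The returned em always holds a reference, so every successful call
 * must be paired with free_extent_map().
 */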
4153
4154static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4155 const struct iovec *iov, loff_t offset,
4156 unsigned long nr_segs)
4157{
4158 return -EINVAL;
4159}
4160
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
4162{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent);
4164}
4165
4166int btrfs_readpage(struct file *file, struct page *page)
4167{
4168 struct extent_io_tree *tree;
4169 tree = &BTRFS_I(page->mapping->host)->io_tree;
4170 return extent_read_full_page(tree, page, btrfs_get_extent);
4171}
4172
4173static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4174{
4175 struct extent_io_tree *tree;
4176
4178 if (current->flags & PF_MEMALLOC) {
4179 redirty_page_for_writepage(wbc, page);
4180 unlock_page(page);
4181 return 0;
4182 }
4183 tree = &BTRFS_I(page->mapping->host)->io_tree;
4184 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4185}
4186
4187int btrfs_writepages(struct address_space *mapping,
4188 struct writeback_control *wbc)
4189{
4190 struct extent_io_tree *tree;
4191
4192 tree = &BTRFS_I(mapping->host)->io_tree;
4193 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4194}
4195
4196static int
4197btrfs_readpages(struct file *file, struct address_space *mapping,
4198 struct list_head *pages, unsigned nr_pages)
4199{
4200 struct extent_io_tree *tree;
4201 tree = &BTRFS_I(mapping->host)->io_tree;
4202 return extent_readpages(tree, mapping, pages, nr_pages,
4203 btrfs_get_extent);
4204}
4205static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4206{
4207 struct extent_io_tree *tree;
4208 struct extent_map_tree *map;
4209 int ret;
4210
4211 tree = &BTRFS_I(page->mapping->host)->io_tree;
4212 map = &BTRFS_I(page->mapping->host)->extent_tree;
4213 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4214 if (ret == 1) {
4215 ClearPagePrivate(page);
4216 set_page_private(page, 0);
4217 page_cache_release(page);
4218 }
4219 return ret;
4220}
4221
4222static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{
4224 if (PageWriteback(page) || PageDirty(page))
4225 return 0;
4226 return __btrfs_releasepage(page, gfp_flags);
4227}
4228
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4230{
4231 struct extent_io_tree *tree;
4232 struct btrfs_ordered_extent *ordered;
4233 u64 page_start = page_offset(page);
4234 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4235
4236 wait_on_page_writeback(page);
4237 tree = &BTRFS_I(page->mapping->host)->io_tree;
4238 if (offset) {
4239 btrfs_releasepage(page, GFP_NOFS);
4240 return;
4241 }
4242
4243 lock_extent(tree, page_start, page_end, GFP_NOFS);
4244 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4245 page_offset(page));
4246 if (ordered) {
4247 /*
4248 * IO on this page will never be started, so we need
4249 * to account for any ordered extents now
4250 */
4251 clear_extent_bit(tree, page_start, page_end,
4252 EXTENT_DIRTY | EXTENT_DELALLOC |
4253 EXTENT_LOCKED, 1, 0, GFP_NOFS);
4254 btrfs_finish_ordered_io(page->mapping->host,
4255 page_start, page_end);
4256 btrfs_put_ordered_extent(ordered);
4257 lock_extent(tree, page_start, page_end, GFP_NOFS);
4258 }
4259 clear_extent_bit(tree, page_start, page_end,
4260 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4261 EXTENT_ORDERED,
4262 1, 1, GFP_NOFS);
4263 __btrfs_releasepage(page, GFP_NOFS);
4264
4265 ClearPageChecked(page);
4266 if (PagePrivate(page)) {
4267 ClearPagePrivate(page);
4268 set_page_private(page, 0);
4269 page_cache_release(page);
4270 }
4271}
4272
4273/*
4274 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4275 * called from a page fault handler when a page is first dirtied. Hence we must
4276 * be careful to check for EOF conditions here. We set the page up correctly
4277 * for a written page which means we get ENOSPC checking when writing into
4278 * holes and correct delalloc and unwritten extent mapping on filesystems that
4279 * support these features.
4280 *
4281 * We are not allowed to take the i_mutex here so we have to play games to
4282 * protect against truncate races as the page could now be beyond EOF. Because
4283 * vmtruncate() writes the inode size before removing pages, once we have the
4284 * page lock we can determine safely if the page is beyond EOF. If it is not
4285 * beyond EOF, then the page is guaranteed safe against truncation until we
4286 * unlock the page.
4287 */
4288int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4289{
4290 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4291 struct btrfs_root *root = BTRFS_I(inode)->root;
4292 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4293 struct btrfs_ordered_extent *ordered;
4294 char *kaddr;
4295 unsigned long zero_start;
4296 loff_t size;
4297 int ret;
4298 u64 page_start;
4299 u64 page_end;
4300
4301 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4302 if (ret)
4303 goto out;
4304
4305 ret = -EINVAL;
4306again:
4307 lock_page(page);
4308 size = i_size_read(inode);
4309 page_start = page_offset(page);
4310 page_end = page_start + PAGE_CACHE_SIZE - 1;
4311
4312 if ((page->mapping != inode->i_mapping) ||
4313 (page_start >= size)) {
4314 /* page got truncated out from underneath us */
4315 goto out_unlock;
4316 }
4317 wait_on_page_writeback(page);
4318
4319 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4320 set_page_extent_mapped(page);
4321
4322 /*
4323 * we can't set the delalloc bits if there are pending ordered
4324 * extents. Drop our locks and wait for them to finish
4325 */
4326 ordered = btrfs_lookup_ordered_extent(inode, page_start);
4327 if (ordered) {
4328 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4329 unlock_page(page);
4330 btrfs_start_ordered_extent(inode, ordered, 1);
4331 btrfs_put_ordered_extent(ordered);
4332 goto again;
4333 }
4334
4335 btrfs_set_extent_delalloc(inode, page_start, page_end);
4336 ret = 0;
4337
4338 /* page is wholly or partially inside EOF */
4339 if (page_start + PAGE_CACHE_SIZE > size)
4340 zero_start = size & ~PAGE_CACHE_MASK;
4341 else
4342 zero_start = PAGE_CACHE_SIZE;
4343
4344 if (zero_start != PAGE_CACHE_SIZE) {
4345 kaddr = kmap(page);
4346 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4347 flush_dcache_page(page);
4348 kunmap(page);
4349 }
4350 ClearPageChecked(page);
4351 set_page_dirty(page);
4352 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4353
4354out_unlock:
4355 unlock_page(page);
4356out:
4357 return ret;
4358}
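/*
 * Worked example of the EOF zeroing above (assuming 4K pages): with
 * i_size 5000, faulting in the page that covers [4096, 8192) gives
 * zero_start = 5000 & ~PAGE_CACHE_MASK = 904, so bytes 904..4095 of the
 * page are cleared and stale data past EOF is never exposed through the
 * mapping.
 */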
4359
4360static void btrfs_truncate(struct inode *inode)
4361{
4362 struct btrfs_root *root = BTRFS_I(inode)->root;
4363 int ret;
4364 struct btrfs_trans_handle *trans;
4365 unsigned long nr;
4366 u64 mask = root->sectorsize - 1;
4367
4368 if (!S_ISREG(inode->i_mode))
4369 return;
4370 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4371 return;
4372
4373 btrfs_truncate_page(inode->i_mapping, inode->i_size);
4374 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4375
4376 trans = btrfs_start_transaction(root, 1);
4377 btrfs_set_trans_block_group(trans, inode);
4378 btrfs_i_size_write(inode, inode->i_size);
4379
4380 ret = btrfs_orphan_add(trans, inode);
4381 if (ret)
4382 goto out;
4383 /* FIXME, add redo link to tree so we don't leak on crash */
4384 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4385 BTRFS_EXTENT_DATA_KEY);
4386 btrfs_update_inode(trans, root, inode);
4387
4388 ret = btrfs_orphan_del(trans, inode);
4389 BUG_ON(ret);
4390
4391out:
4392 nr = trans->blocks_used;
4393 ret = btrfs_end_transaction_throttle(trans, root);
4394 BUG_ON(ret);
4395 btrfs_btree_balance_dirty(root, nr);
4396}
4397
4398/*
4399 * create a new subvolume directory/inode (helper for the ioctl).
4400 */
4401int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4402 struct btrfs_root *new_root, struct dentry *dentry,
4403 u64 new_dirid, u64 alloc_hint)
4404{
4405 struct inode *inode;
4406 int error;
4407 u64 index = 0;
4408
4409 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4410 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
4411 if (IS_ERR(inode))
4412 return PTR_ERR(inode);
4413 inode->i_op = &btrfs_dir_inode_operations;
4414 inode->i_fop = &btrfs_dir_file_operations;
4415
4416 inode->i_nlink = 1;
4417 btrfs_i_size_write(inode, 0);
4418
4419 error = btrfs_update_inode(trans, new_root, inode);
4420 if (error)
4421 return error;
4422
4423 d_instantiate(dentry, inode);
4424 return 0;
4425}
4426
4427/* helper function for file defrag and space balancing. This
4428 * forces readahead on a given range of bytes in an inode
4429 */
4430unsigned long btrfs_force_ra(struct address_space *mapping,
4431 struct file_ra_state *ra, struct file *file,
4432 pgoff_t offset, pgoff_t last_index)
4433{
4434 pgoff_t req_size = last_index - offset + 1;
4435
4436 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4437 return offset + req_size;
4438}
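/*
 * Usage sketch for btrfs_force_ra() (hypothetical caller, e.g. a defrag
 * loop): push readahead over pages [i, last_index] and remember where
 * it ended so the next pass continues from there.
 *
 *	i = btrfs_force_ra(inode->i_mapping, &file->f_ra, file,
 *			   i, last_index);
 */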
4439
4440struct inode *btrfs_alloc_inode(struct super_block *sb)
4441{
4442 struct btrfs_inode *ei;
4443
4444 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4445 if (!ei)
4446 return NULL;
4447 ei->last_trans = 0;
4448 ei->logged_trans = 0;
4449 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4450 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4451 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4452 INIT_LIST_HEAD(&ei->i_orphan);
4453 return &ei->vfs_inode;
4454}
4455
4456void btrfs_destroy_inode(struct inode *inode)
4457{
4458 struct btrfs_ordered_extent *ordered;
4459 WARN_ON(!list_empty(&inode->i_dentry));
4460 WARN_ON(inode->i_data.nrpages);
4461
4462 if (BTRFS_I(inode)->i_acl &&
4463 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4464 posix_acl_release(BTRFS_I(inode)->i_acl);
4465 if (BTRFS_I(inode)->i_default_acl &&
4466 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4467 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4468
4469 spin_lock(&BTRFS_I(inode)->root->list_lock);
4470 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4471		printk(KERN_ERR "BTRFS: inode %lu still on the orphan"
4472		       " list\n", inode->i_ino);
4473 dump_stack();
4474 }
4475 spin_unlock(&BTRFS_I(inode)->root->list_lock);
4476
4477 while (1) {
4478 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4479 if (!ordered)
4480 break;
4481 else {
4482 printk(KERN_ERR "btrfs found ordered "
4483 "extent %llu %llu on inode cleanup\n",
4484 (unsigned long long)ordered->file_offset,
4485 (unsigned long long)ordered->len);
4486			btrfs_remove_ordered_extent(inode, ordered);
			/* one put for our lookup, one for the tree's ref */
4487			btrfs_put_ordered_extent(ordered);
4488			btrfs_put_ordered_extent(ordered);
4489 }
4490 }
4491 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4492 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4493}
4494
4495static void init_once(void *foo)
4496{
4497 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4498
4499 inode_init_once(&ei->vfs_inode);
4500}
4501
4502void btrfs_destroy_cachep(void)
4503{
4504 if (btrfs_inode_cachep)
4505 kmem_cache_destroy(btrfs_inode_cachep);
4506 if (btrfs_trans_handle_cachep)
4507 kmem_cache_destroy(btrfs_trans_handle_cachep);
4508 if (btrfs_transaction_cachep)
4509 kmem_cache_destroy(btrfs_transaction_cachep);
4510 if (btrfs_bit_radix_cachep)
4511 kmem_cache_destroy(btrfs_bit_radix_cachep);
4512 if (btrfs_path_cachep)
4513 kmem_cache_destroy(btrfs_path_cachep);
4514}
4515
4516struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4517 unsigned long extra_flags,
4518 void (*ctor)(void *))
4519{
4520 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4521 SLAB_MEM_SPREAD | extra_flags), ctor);
4522}
4523
4524int btrfs_init_cachep(void)
4525{
4526 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4527 sizeof(struct btrfs_inode),
4528 0, init_once);
4529 if (!btrfs_inode_cachep)
4530 goto fail;
4531 btrfs_trans_handle_cachep =
4532 btrfs_cache_create("btrfs_trans_handle_cache",
4533 sizeof(struct btrfs_trans_handle),
4534 0, NULL);
4535 if (!btrfs_trans_handle_cachep)
4536 goto fail;
4537 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4538 sizeof(struct btrfs_transaction),
4539 0, NULL);
4540 if (!btrfs_transaction_cachep)
4541 goto fail;
4542 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4543 sizeof(struct btrfs_path),
4544 0, NULL);
4545 if (!btrfs_path_cachep)
4546 goto fail;
4547 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4548 SLAB_DESTROY_BY_RCU, NULL);
4549 if (!btrfs_bit_radix_cachep)
4550 goto fail;
4551 return 0;
4552fail:
4553 btrfs_destroy_cachep();
4554 return -ENOMEM;
4555}
4556
4557static int btrfs_getattr(struct vfsmount *mnt,
4558 struct dentry *dentry, struct kstat *stat)
4559{
4560 struct inode *inode = dentry->d_inode;
4561 generic_fillattr(inode, stat);
4562 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4563 stat->blksize = PAGE_CACHE_SIZE;
4564 stat->blocks = (inode_get_bytes(inode) +
4565 BTRFS_I(inode)->delalloc_bytes) >> 9;
4566 return 0;
4567}
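/*
 * Example of the stat->blocks math above: st_blocks is counted in
 * 512-byte units, hence the >> 9.  An inode with 8192 bytes already on
 * disk plus 4096 bytes of outstanding delalloc reports 12288 >> 9 = 24
 * blocks, so tools like du stay accurate before the dirty data hits
 * disk.
 */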
4568
4569static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4570 struct inode *new_dir, struct dentry *new_dentry)
4571{
4572 struct btrfs_trans_handle *trans;
4573 struct btrfs_root *root = BTRFS_I(old_dir)->root;
4574 struct inode *new_inode = new_dentry->d_inode;
4575 struct inode *old_inode = old_dentry->d_inode;
4576 struct timespec ctime = CURRENT_TIME;
4577 u64 index = 0;
4578 int ret;
4579
4580 /* we're not allowed to rename between subvolumes */
4581 if (BTRFS_I(old_inode)->root->root_key.objectid !=
4582 BTRFS_I(new_dir)->root->root_key.objectid)
4583 return -EXDEV;
4584
4585 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4586 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4587 return -ENOTEMPTY;
4588 }
4589
4590 /* to rename a snapshot or subvolume, we need to juggle the
4591 * backrefs. This isn't coded yet
4592 */
4593 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4594 return -EXDEV;
4595
4596 ret = btrfs_check_free_space(root, 1, 0);
4597 if (ret)
4598 goto out_unlock;
4599
4600 trans = btrfs_start_transaction(root, 1);
4601
4602 btrfs_set_trans_block_group(trans, new_dir);
4603
4604 btrfs_inc_nlink(old_dentry->d_inode);
4605 old_dir->i_ctime = old_dir->i_mtime = ctime;
4606 new_dir->i_ctime = new_dir->i_mtime = ctime;
4607 old_inode->i_ctime = ctime;
4608
4609 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4610 old_dentry->d_name.name,
4611 old_dentry->d_name.len);
4612 if (ret)
4613 goto out_fail;
4614
4615 if (new_inode) {
4616 new_inode->i_ctime = CURRENT_TIME;
4617 ret = btrfs_unlink_inode(trans, root, new_dir,
4618 new_dentry->d_inode,
4619 new_dentry->d_name.name,
4620 new_dentry->d_name.len);
4621 if (ret)
4622 goto out_fail;
4623 if (new_inode->i_nlink == 0) {
4624 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4625 if (ret)
4626 goto out_fail;
4627 }
4628
4629 }
4630 ret = btrfs_set_inode_index(new_dir, &index);
4631 if (ret)
4632 goto out_fail;
4633
4634 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4635 old_inode, new_dentry->d_name.name,
4636 new_dentry->d_name.len, 1, index);
4637 if (ret)
4638 goto out_fail;
4639
4640out_fail:
4641 btrfs_end_transaction_throttle(trans, root);
4642out_unlock:
4643 return ret;
4644}
4645
4646/*
4647 * some fairly slow code that needs optimization. This walks the list
4648 * of all the inodes with pending delalloc and forces them to disk.
4649 */
4650int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4651{
4652 struct list_head *head = &root->fs_info->delalloc_inodes;
4653 struct btrfs_inode *binode;
4654 struct inode *inode;
4655
4656 if (root->fs_info->sb->s_flags & MS_RDONLY)
4657 return -EROFS;
4658
4659 spin_lock(&root->fs_info->delalloc_lock);
4660 while (!list_empty(head)) {
4661 binode = list_entry(head->next, struct btrfs_inode,
4662 delalloc_inodes);
4663 inode = igrab(&binode->vfs_inode);
4664 if (!inode)
4665 list_del_init(&binode->delalloc_inodes);
4666 spin_unlock(&root->fs_info->delalloc_lock);
4667 if (inode) {
4668 filemap_flush(inode->i_mapping);
4669 iput(inode);
4670 }
4671 cond_resched();
4672 spin_lock(&root->fs_info->delalloc_lock);
4673 }
4674 spin_unlock(&root->fs_info->delalloc_lock);
4675
4676 /* the filemap_flush will queue IO into the worker threads, but
4677 * we have to make sure the IO is actually started and that
4678 * ordered extents get created before we return
4679 */
4680 atomic_inc(&root->fs_info->async_submit_draining);
4681 while (atomic_read(&root->fs_info->nr_async_submits) ||
4682 atomic_read(&root->fs_info->async_delalloc_pages)) {
4683 wait_event(root->fs_info->async_submit_wait,
4684 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4685 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4686 }
4687 atomic_dec(&root->fs_info->async_submit_draining);
4688 return 0;
4689}
4690
4691static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4692 const char *symname)
4693{
4694 struct btrfs_trans_handle *trans;
4695 struct btrfs_root *root = BTRFS_I(dir)->root;
4696 struct btrfs_path *path;
4697 struct btrfs_key key;
4698 struct inode *inode = NULL;
4699 int err;
4700 int drop_inode = 0;
4701 u64 objectid;
4702	u64 index = 0;
4703 int name_len;
4704 int datasize;
4705 unsigned long ptr;
4706 struct btrfs_file_extent_item *ei;
4707 struct extent_buffer *leaf;
4708 unsigned long nr = 0;
4709
4710 name_len = strlen(symname) + 1;
4711 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4712 return -ENAMETOOLONG;
4713
4714 err = btrfs_check_free_space(root, 1, 0);
4715 if (err)
4716 goto out_fail;
4717
4718 trans = btrfs_start_transaction(root, 1);
4719 btrfs_set_trans_block_group(trans, dir);
4720
4721 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4722 if (err) {
4723 err = -ENOSPC;
4724 goto out_unlock;
4725 }
4726
4727 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4728 dentry->d_name.len,
4729 dentry->d_parent->d_inode->i_ino, objectid,
4730 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4731 &index);
4732 err = PTR_ERR(inode);
4733 if (IS_ERR(inode))
4734 goto out_unlock;
4735
4736 err = btrfs_init_acl(inode, dir);
4737 if (err) {
4738 drop_inode = 1;
4739 goto out_unlock;
4740 }
4741
4742 btrfs_set_trans_block_group(trans, inode);
4743 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4744 if (err)
4745 drop_inode = 1;
4746 else {
4747 inode->i_mapping->a_ops = &btrfs_aops;
4748 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4749 inode->i_fop = &btrfs_file_operations;
4750 inode->i_op = &btrfs_file_inode_operations;
4751 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4752 }
4753 dir->i_sb->s_dirt = 1;
4754 btrfs_update_inode_block_group(trans, inode);
4755 btrfs_update_inode_block_group(trans, dir);
4756 if (drop_inode)
4757 goto out_unlock;
4758
4759 path = btrfs_alloc_path();
4760 BUG_ON(!path);
4761 key.objectid = inode->i_ino;
4762 key.offset = 0;
4763 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4764 datasize = btrfs_file_extent_calc_inline_size(name_len);
4765 err = btrfs_insert_empty_item(trans, root, path, &key,
4766 datasize);
4767 if (err) {
4768 drop_inode = 1;
4769 goto out_unlock;
4770 }
4771 leaf = path->nodes[0];
4772 ei = btrfs_item_ptr(leaf, path->slots[0],
4773 struct btrfs_file_extent_item);
4774 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4775 btrfs_set_file_extent_type(leaf, ei,
4776 BTRFS_FILE_EXTENT_INLINE);
4777 btrfs_set_file_extent_encryption(leaf, ei, 0);
4778 btrfs_set_file_extent_compression(leaf, ei, 0);
4779 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4780 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4781
4782 ptr = btrfs_file_extent_inline_start(ei);
4783 write_extent_buffer(leaf, symname, ptr, name_len);
4784 btrfs_mark_buffer_dirty(leaf);
4785 btrfs_free_path(path);
4786
4787 inode->i_op = &btrfs_symlink_inode_operations;
4788 inode->i_mapping->a_ops = &btrfs_symlink_aops;
4789 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4790 inode_set_bytes(inode, name_len);
4791 btrfs_i_size_write(inode, name_len - 1);
4792 err = btrfs_update_inode(trans, root, inode);
4793 if (err)
4794 drop_inode = 1;
4795
4796out_unlock:
4797 nr = trans->blocks_used;
4798 btrfs_end_transaction_throttle(trans, root);
4799out_fail:
4800 if (drop_inode) {
4801 inode_dec_link_count(inode);
4802 iput(inode);
4803 }
4804 btrfs_btree_balance_dirty(root, nr);
4805 return err;
4806}
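btrfs_symlink() above stores the target string as a single inline extent in the btree, which is why targets longer than BTRFS_MAX_INLINE_DATA_SIZE(root) fail with -ENAMETOOLONG, and why i_size ends up as the target length without the trailing NUL. A minimal userspace sketch exercising this path; the mount point and target are hypothetical:

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[PATH_MAX];
	ssize_t n;

	if (symlink("/some/target", "/mnt/btrfs/link") != 0) {
		perror("symlink");	/* ENAMETOOLONG if target too long */
		return 1;
	}
	n = readlink("/mnt/btrfs/link", buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("readlink");
		return 1;
	}
	buf[n] = '\0';
	/* n matches lstat()'s st_size, i.e. strlen("/some/target") */
	printf("link -> %s (%zd bytes)\n", buf, n);
	return 0;
}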
4807
4808static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4809 u64 alloc_hint, int mode)
4810{
4811 struct btrfs_trans_handle *trans;
4812 struct btrfs_root *root = BTRFS_I(inode)->root;
4813 struct btrfs_key ins;
4814 u64 alloc_size;
4815 u64 cur_offset = start;
4816 u64 num_bytes = end - start;
4817 int ret = 0;
4818
4819 trans = btrfs_join_transaction(root, 1);
4820 BUG_ON(!trans);
4821 btrfs_set_trans_block_group(trans, inode);
4822
4823 while (num_bytes > 0) {
4824 alloc_size = min(num_bytes, root->fs_info->max_extent);
4825 ret = btrfs_reserve_extent(trans, root, alloc_size,
4826 root->sectorsize, 0, alloc_hint,
4827 (u64)-1, &ins, 1);
4828 if (ret) {
4829 WARN_ON(1);
4830 goto out;
4831 }
4832 ret = insert_reserved_file_extent(trans, inode,
4833 cur_offset, ins.objectid,
4834 ins.offset, ins.offset,
4835 ins.offset, 0, 0, 0,
4836 BTRFS_FILE_EXTENT_PREALLOC);
4837 BUG_ON(ret);
4838 num_bytes -= ins.offset;
4839 cur_offset += ins.offset;
4840 alloc_hint = ins.objectid + ins.offset;
4841 }
4842out:
4843 if (cur_offset > start) {
4844 inode->i_ctime = CURRENT_TIME;
4845 btrfs_set_flag(inode, PREALLOC);
4846 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4847 cur_offset > i_size_read(inode))
4848 btrfs_i_size_write(inode, cur_offset);
4849 ret = btrfs_update_inode(trans, root, inode);
4850 BUG_ON(ret);
4851 }
4852
4853 btrfs_end_transaction(trans, root);
4854 return ret;
4855}
4856
4857static long btrfs_fallocate(struct inode *inode, int mode,
4858 loff_t offset, loff_t len)
4859{
4860 u64 cur_offset;
4861 u64 last_byte;
4862 u64 alloc_start;
4863 u64 alloc_end;
4864 u64 alloc_hint = 0;
4865 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4866 struct extent_map *em;
4867 int ret;
4868
4869 alloc_start = offset & ~mask;
4870 alloc_end = (offset + len + mask) & ~mask;
4871
4872 mutex_lock(&inode->i_mutex);
4873 if (alloc_start > inode->i_size) {
4874 ret = btrfs_cont_expand(inode, alloc_start);
4875 if (ret)
4876 goto out;
4877 }
4878
4879 while (1) {
4880 struct btrfs_ordered_extent *ordered;
4881 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4882 alloc_end - 1, GFP_NOFS);
4883 ordered = btrfs_lookup_first_ordered_extent(inode,
4884 alloc_end - 1);
4885 if (ordered &&
4886 ordered->file_offset + ordered->len > alloc_start &&
4887 ordered->file_offset < alloc_end) {
4888 btrfs_put_ordered_extent(ordered);
4889 unlock_extent(&BTRFS_I(inode)->io_tree,
4890 alloc_start, alloc_end - 1, GFP_NOFS);
4891 btrfs_wait_ordered_range(inode, alloc_start,
4892 alloc_end - alloc_start);
4893 } else {
4894 if (ordered)
4895 btrfs_put_ordered_extent(ordered);
4896 break;
4897 }
4898 }
4899
4900 cur_offset = alloc_start;
4901 while (1) {
4902 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4903 alloc_end - cur_offset, 0);
4904 BUG_ON(IS_ERR(em) || !em);
4905 last_byte = min(extent_map_end(em), alloc_end);
4906 last_byte = (last_byte + mask) & ~mask;
4907 if (em->block_start == EXTENT_MAP_HOLE) {
4908 ret = prealloc_file_range(inode, cur_offset,
4909 last_byte, alloc_hint, mode);
4910 if (ret < 0) {
4911 free_extent_map(em);
4912 break;
4913 }
4914 }
4915 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4916 alloc_hint = em->block_start;
4917 free_extent_map(em);
4918
4919 cur_offset = last_byte;
4920 if (cur_offset >= alloc_end) {
4921 ret = 0;
4922 break;
4923 }
4924 }
4925 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4926 GFP_NOFS);
4927out:
4928 mutex_unlock(&inode->i_mutex);
4929 return ret;
4930}
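btrfs_fallocate() rounds the request out to sector boundaries, waits out any ordered extents overlapping the range, and preallocates only the holes via prealloc_file_range(); i_size grows only when FALLOC_FL_KEEP_SIZE is clear. A minimal caller-side sketch, assuming a btrfs file at a hypothetical path:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/data.bin", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* reserve 16 MiB; st_size stays unchanged because of
	 * FALLOC_FL_KEEP_SIZE, matching the mode check above */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
		perror("fallocate");
	close(fd);
	return 0;
}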
4931
4932static int btrfs_set_page_dirty(struct page *page)
4933{
4934 return __set_page_dirty_nobuffers(page);
4935}
4936
4937static int btrfs_permission(struct inode *inode, int mask)
4938{
4939 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4940 return -EACCES;
4941 return generic_permission(inode, mask, btrfs_check_acl);
4942}
4943
4944static struct inode_operations btrfs_dir_inode_operations = {
4945 .getattr = btrfs_getattr,
4946 .lookup = btrfs_lookup,
4947 .create = btrfs_create,
4948 .unlink = btrfs_unlink,
4949 .link = btrfs_link,
4950 .mkdir = btrfs_mkdir,
4951 .rmdir = btrfs_rmdir,
4952 .rename = btrfs_rename,
4953 .symlink = btrfs_symlink,
4954 .setattr = btrfs_setattr,
4955 .mknod = btrfs_mknod,
4956 .setxattr = btrfs_setxattr,
4957 .getxattr = btrfs_getxattr,
4958 .listxattr = btrfs_listxattr,
4959 .removexattr = btrfs_removexattr,
4960 .permission = btrfs_permission,
4961};
4962static struct inode_operations btrfs_dir_ro_inode_operations = {
4963 .lookup = btrfs_lookup,
4964 .permission = btrfs_permission,
4965};
4966static struct file_operations btrfs_dir_file_operations = {
4967 .llseek = generic_file_llseek,
4968 .read = generic_read_dir,
4969 .readdir = btrfs_real_readdir,
4970 .unlocked_ioctl = btrfs_ioctl,
4971#ifdef CONFIG_COMPAT
4972 .compat_ioctl = btrfs_ioctl,
4973#endif
4974 .release = btrfs_release_file,
4975 .fsync = btrfs_sync_file,
4976};
4977
4978static struct extent_io_ops btrfs_extent_io_ops = {
4979 .fill_delalloc = run_delalloc_range,
4980 .submit_bio_hook = btrfs_submit_bio_hook,
4981 .merge_bio_hook = btrfs_merge_bio_hook,
4982 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4983 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4984 .writepage_start_hook = btrfs_writepage_start_hook,
4985 .readpage_io_failed_hook = btrfs_io_failed_hook,
4986 .set_bit_hook = btrfs_set_bit_hook,
4987 .clear_bit_hook = btrfs_clear_bit_hook,
4988};
4989
4990static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage,
5000 .set_page_dirty = btrfs_set_page_dirty,
5001};
5002
5003static struct address_space_operations btrfs_symlink_aops = {
5004 .readpage = btrfs_readpage,
5005 .writepage = btrfs_writepage,
5006 .invalidatepage = btrfs_invalidatepage,
5007 .releasepage = btrfs_releasepage,
5008};
5009
5010static struct inode_operations btrfs_file_inode_operations = {
5011 .truncate = btrfs_truncate,
5012 .getattr = btrfs_getattr,
5013 .setattr = btrfs_setattr,
5014 .setxattr = btrfs_setxattr,
5015 .getxattr = btrfs_getxattr,
5016 .listxattr = btrfs_listxattr,
5017 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate,
5020};
5021static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr,
5023 .setattr = btrfs_setattr,
5024 .permission = btrfs_permission,
5025 .setxattr = btrfs_setxattr,
5026 .getxattr = btrfs_getxattr,
5027 .listxattr = btrfs_listxattr,
5028 .removexattr = btrfs_removexattr,
5029};
5030static struct inode_operations btrfs_symlink_inode_operations = {
5031 .readlink = generic_readlink,
5032 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link,
5034 .permission = btrfs_permission,
5035};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..c2aa33e3feb5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1132 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/fsnotify.h>
25#include <linux/pagemap.h>
26#include <linux/highmem.h>
27#include <linux/time.h>
28#include <linux/init.h>
29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h>
32#include <linux/mount.h>
33#include <linux/mpage.h>
34#include <linux/namei.h>
35#include <linux/swap.h>
36#include <linux/writeback.h>
37#include <linux/statfs.h>
38#include <linux/compat.h>
39#include <linux/bit_spinlock.h>
40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h>
43#include <linux/vmalloc.h>
44#include "compat.h"
45#include "ctree.h"
46#include "disk-io.h"
47#include "transaction.h"
48#include "btrfs_inode.h"
49#include "ioctl.h"
50#include "print-tree.h"
51#include "volumes.h"
52#include "locking.h"
53
54
55
56static noinline int create_subvol(struct btrfs_root *root,
57 struct dentry *dentry,
58 char *name, int namelen)
59{
60 struct btrfs_trans_handle *trans;
61 struct btrfs_key key;
62 struct btrfs_root_item root_item;
63 struct btrfs_inode_item *inode_item;
64 struct extent_buffer *leaf;
65 struct btrfs_root *new_root = root;
66 struct inode *dir;
67 int ret;
68 int err;
69 u64 objectid;
70 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
71 u64 index = 0;
72 unsigned long nr = 1;
73
74 ret = btrfs_check_free_space(root, 1, 0);
75 if (ret)
76 goto fail_commit;
77
78 trans = btrfs_start_transaction(root, 1);
79 BUG_ON(!trans);
80
81 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
82 0, &objectid);
83 if (ret)
84 goto fail;
85
86 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
87 objectid, trans->transid, 0, 0, 0);
88 if (IS_ERR(leaf)) {
89 ret = PTR_ERR(leaf);
90 goto fail;
91 }
92
93 btrfs_set_header_nritems(leaf, 0);
94 btrfs_set_header_level(leaf, 0);
95 btrfs_set_header_bytenr(leaf, leaf->start);
96 btrfs_set_header_generation(leaf, trans->transid);
97 btrfs_set_header_owner(leaf, objectid);
98
99 write_extent_buffer(leaf, root->fs_info->fsid,
100 (unsigned long)btrfs_header_fsid(leaf),
101 BTRFS_FSID_SIZE);
102 btrfs_mark_buffer_dirty(leaf);
103
104 inode_item = &root_item.inode;
105 memset(inode_item, 0, sizeof(*inode_item));
106 inode_item->generation = cpu_to_le64(1);
107 inode_item->size = cpu_to_le64(3);
108 inode_item->nlink = cpu_to_le32(1);
109 inode_item->nbytes = cpu_to_le64(root->leafsize);
110 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
111
112 btrfs_set_root_bytenr(&root_item, leaf->start);
113 btrfs_set_root_generation(&root_item, trans->transid);
114 btrfs_set_root_level(&root_item, 0);
115 btrfs_set_root_refs(&root_item, 1);
116 btrfs_set_root_used(&root_item, 0);
117 btrfs_set_root_last_snapshot(&root_item, 0);
118
119 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
120 root_item.drop_level = 0;
121
122 btrfs_tree_unlock(leaf);
123 free_extent_buffer(leaf);
124 leaf = NULL;
125
126 btrfs_set_root_dirid(&root_item, new_dirid);
127
128 key.objectid = objectid;
129 key.offset = 1;
130 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
131 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
132 &root_item);
133 if (ret)
134 goto fail;
135
136 /*
137 * insert the directory item
138 */
139 key.offset = (u64)-1;
140 dir = dentry->d_parent->d_inode;
141 ret = btrfs_set_inode_index(dir, &index);
142 BUG_ON(ret);
143
144 ret = btrfs_insert_dir_item(trans, root,
145 name, namelen, dir->i_ino, &key,
146 BTRFS_FT_DIR, index);
147 if (ret)
148 goto fail;
149
150 btrfs_i_size_write(dir, dir->i_size + namelen * 2);
151 ret = btrfs_update_inode(trans, root, dir);
152 BUG_ON(ret);
153
154 /* add the backref first */
155 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
156 objectid, BTRFS_ROOT_BACKREF_KEY,
157 root->root_key.objectid,
158 dir->i_ino, index, name, namelen);
159
160 BUG_ON(ret);
161
162 /* now add the forward ref */
163 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
164 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
165 objectid,
166 dir->i_ino, index, name, namelen);
167
168 BUG_ON(ret);
169
170 ret = btrfs_commit_transaction(trans, root);
171 if (ret)
172 goto fail_commit;
173
174 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
175 BUG_ON(!new_root);
176
177 trans = btrfs_start_transaction(new_root, 1);
178 BUG_ON(!trans);
179
180 ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
181 BTRFS_I(dir)->block_group);
182 if (ret)
183 goto fail;
184
185fail:
186 nr = trans->blocks_used;
187 err = btrfs_commit_transaction(trans, new_root);
188 if (err && !ret)
189 ret = err;
190fail_commit:
191 btrfs_btree_balance_dirty(root, nr);
192 return ret;
193}
194
195static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
196 char *name, int namelen)
197{
198 struct btrfs_pending_snapshot *pending_snapshot;
199 struct btrfs_trans_handle *trans;
200 int ret = 0;
201 int err;
202 unsigned long nr = 0;
203
204 if (!root->ref_cows)
205 return -EINVAL;
206
207 ret = btrfs_check_free_space(root, 1, 0);
208 if (ret)
209 goto fail_unlock;
210
211 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
212 if (!pending_snapshot) {
213 ret = -ENOMEM;
214 goto fail_unlock;
215 }
216 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
217 if (!pending_snapshot->name) {
218 ret = -ENOMEM;
219 kfree(pending_snapshot);
220 goto fail_unlock;
221 }
222 memcpy(pending_snapshot->name, name, namelen);
223 pending_snapshot->name[namelen] = '\0';
224 pending_snapshot->dentry = dentry;
225 trans = btrfs_start_transaction(root, 1);
226 BUG_ON(!trans);
227 pending_snapshot->root = root;
228 list_add(&pending_snapshot->list,
229 &trans->transaction->pending_snapshots);
230 err = btrfs_commit_transaction(trans, root);
231
232fail_unlock:
233 btrfs_btree_balance_dirty(root, nr);
234 return ret;
235}
236
237/* copy of may_create() in fs/namei.c */
238static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
239{
240 if (child->d_inode)
241 return -EEXIST;
242 if (IS_DEADDIR(dir))
243 return -ENOENT;
244 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
245}
246
247/*
248 * Create a new subvolume below @parent. This is largely modeled after
249 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
250 * inside this filesystem so it's quite a bit simpler.
251 */
252static noinline int btrfs_mksubvol(struct path *parent, char *name,
253 int mode, int namelen,
254 struct btrfs_root *snap_src)
255{
256 struct dentry *dentry;
257 int error;
258
259 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
260
261 dentry = lookup_one_len(name, parent->dentry, namelen);
262 error = PTR_ERR(dentry);
263 if (IS_ERR(dentry))
264 goto out_unlock;
265
266 error = -EEXIST;
267 if (dentry->d_inode)
268 goto out_dput;
269
270 if (!IS_POSIXACL(parent->dentry->d_inode))
271 mode &= ~current->fs->umask;
272
273 error = mnt_want_write(parent->mnt);
274 if (error)
275 goto out_dput;
276
277 error = btrfs_may_create(parent->dentry->d_inode, dentry);
278 if (error)
279 goto out_drop_write;
280
281 /*
282 * Actually perform the low-level subvolume creation after all
283	 * this VFS fuss.
284 *
285 * Eventually we want to pass in an inode under which we create this
286 * subvolume, but for now all are under the filesystem root.
287 *
288 * Also we should pass on the mode eventually to allow creating new
289 * subvolume with specific mode bits.
290 */
291 if (snap_src) {
292 struct dentry *dir = dentry->d_parent;
293 struct dentry *test = dir->d_parent;
294 struct btrfs_path *path = btrfs_alloc_path();
295 int ret;
296 u64 test_oid;
297 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
298
299 test_oid = snap_src->root_key.objectid;
300
301 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
302 path, parent_oid, test_oid);
303 if (ret == 0)
304 goto create;
305 btrfs_release_path(snap_src->fs_info->tree_root, path);
306
307 /* we need to make sure we aren't creating a directory loop
308 * by taking a snapshot of something that has our current
309 * subvol in its directory tree. So, this loops through
310 * the dentries and checks the forward refs for each subvolume
311	 * to see if it references the subvolume where we are
312 * placing this new snapshot.
313 */
314 while (1) {
315 if (!test ||
316 dir == snap_src->fs_info->sb->s_root ||
317 test == snap_src->fs_info->sb->s_root ||
318 test->d_inode->i_sb != snap_src->fs_info->sb) {
319 break;
320 }
321 if (S_ISLNK(test->d_inode->i_mode)) {
322 printk(KERN_INFO "Btrfs symlink in snapshot "
323 "path, failed\n");
324 error = -EMLINK;
325 btrfs_free_path(path);
326 goto out_drop_write;
327 }
328 test_oid =
329 BTRFS_I(test->d_inode)->root->root_key.objectid;
330 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
331 path, test_oid, parent_oid);
332 if (ret == 0) {
333 printk(KERN_INFO "Btrfs snapshot creation "
334 "failed, looping\n");
335 error = -EMLINK;
336 btrfs_free_path(path);
337 goto out_drop_write;
338 }
339 btrfs_release_path(snap_src->fs_info->tree_root, path);
340 test = test->d_parent;
341 }
342create:
343 btrfs_free_path(path);
344 error = create_snapshot(snap_src, dentry, name, namelen);
345 } else {
346 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
347 dentry, name, namelen);
348 }
349 if (error)
350 goto out_drop_write;
351
352 fsnotify_mkdir(parent->dentry->d_inode, dentry);
353out_drop_write:
354 mnt_drop_write(parent->mnt);
355out_dput:
356 dput(dentry);
357out_unlock:
358 mutex_unlock(&parent->dentry->d_inode->i_mutex);
359 return error;
360}
361
362
363static int btrfs_defrag_file(struct file *file)
364{
365 struct inode *inode = fdentry(file)->d_inode;
366 struct btrfs_root *root = BTRFS_I(inode)->root;
367 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
368 struct btrfs_ordered_extent *ordered;
369 struct page *page;
370 unsigned long last_index;
371 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
372 unsigned long total_read = 0;
373 u64 page_start;
374 u64 page_end;
375 unsigned long i;
376 int ret;
377
378 ret = btrfs_check_free_space(root, inode->i_size, 0);
379 if (ret)
380 return -ENOSPC;
381
382 mutex_lock(&inode->i_mutex);
383 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
384 for (i = 0; i <= last_index; i++) {
385 if (total_read % ra_pages == 0) {
386 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
387 min(last_index, i + ra_pages - 1));
388 }
389 total_read++;
390again:
391 page = grab_cache_page(inode->i_mapping, i);
392 if (!page)
393 goto out_unlock;
394 if (!PageUptodate(page)) {
395 btrfs_readpage(NULL, page);
396 lock_page(page);
397 if (!PageUptodate(page)) {
398 unlock_page(page);
399 page_cache_release(page);
400 goto out_unlock;
401 }
402 }
403
404 wait_on_page_writeback(page);
405
406 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
407 page_end = page_start + PAGE_CACHE_SIZE - 1;
408 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
409
410 ordered = btrfs_lookup_ordered_extent(inode, page_start);
411 if (ordered) {
412 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
413 unlock_page(page);
414 page_cache_release(page);
415 btrfs_start_ordered_extent(inode, ordered, 1);
416 btrfs_put_ordered_extent(ordered);
417 goto again;
418 }
419 set_page_extent_mapped(page);
420
421 /*
422 * this makes sure page_mkwrite is called on the
423 * page if it is dirtied again later
424 */
425 clear_page_dirty_for_io(page);
426
427 btrfs_set_extent_delalloc(inode, page_start, page_end);
428
429 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
430 set_page_dirty(page);
431 unlock_page(page);
432 page_cache_release(page);
433 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
434 }
435
436out_unlock:
437 mutex_unlock(&inode->i_mutex);
438 return 0;
439}
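Note that btrfs_defrag_file() never moves data directly: it reads every page in, clears the dirty-for-writeback state, flags the range as delalloc and redirties the page, so the next writeback pass rewrites the file through the normal allocator and the extents come out contiguous. A sketch of triggering this from userspace with the BTRFS_IOC_DEFRAG ioctl handled later in this file; the header path and file name are assumptions, and in this version the regular-file case ignores the argument, hence NULL:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* assumed copy of fs/btrfs/ioctl.h */

int main(void)
{
	int fd = open("/mnt/btrfs/fragmented.file", O_RDWR);

	if (fd < 0 || ioctl(fd, BTRFS_IOC_DEFRAG, NULL) != 0)
		perror("defrag");
	if (fd >= 0)
		close(fd);
	return 0;
}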
440
441/*
442 * Called inside transaction, so use GFP_NOFS
443 */
444
445static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
446{
447 u64 new_size;
448 u64 old_size;
449 u64 devid = 1;
450 struct btrfs_ioctl_vol_args *vol_args;
451 struct btrfs_trans_handle *trans;
452 struct btrfs_device *device = NULL;
453 char *sizestr;
454 char *devstr = NULL;
455 int ret = 0;
456 int namelen;
457 int mod = 0;
458
459 if (root->fs_info->sb->s_flags & MS_RDONLY)
460 return -EROFS;
461
462 if (!capable(CAP_SYS_ADMIN))
463 return -EPERM;
464
465 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
466
467 if (!vol_args)
468 return -ENOMEM;
469
470 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
476 namelen = strlen(vol_args->name);
477
478 mutex_lock(&root->fs_info->volume_mutex);
479 sizestr = vol_args->name;
480 devstr = strchr(sizestr, ':');
481 if (devstr) {
482 char *end;
483 sizestr = devstr + 1;
484 *devstr = '\0';
485 devstr = vol_args->name;
486 devid = simple_strtoull(devstr, &end, 10);
487 printk(KERN_INFO "resizing devid %llu\n", devid);
488 }
489 device = btrfs_find_device(root, devid, NULL, NULL);
490 if (!device) {
491 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
492 ret = -EINVAL;
493 goto out_unlock;
494 }
495 if (!strcmp(sizestr, "max"))
496 new_size = device->bdev->bd_inode->i_size;
497 else {
498 if (sizestr[0] == '-') {
499 mod = -1;
500 sizestr++;
501 } else if (sizestr[0] == '+') {
502 mod = 1;
503 sizestr++;
504 }
505 new_size = btrfs_parse_size(sizestr);
506 if (new_size == 0) {
507 ret = -EINVAL;
508 goto out_unlock;
509 }
510 }
511
512 old_size = device->total_bytes;
513
514 if (mod < 0) {
515 if (new_size > old_size) {
516 ret = -EINVAL;
517 goto out_unlock;
518 }
519 new_size = old_size - new_size;
520 } else if (mod > 0) {
521 new_size = old_size + new_size;
522 }
523
524 if (new_size < 256 * 1024 * 1024) {
525 ret = -EINVAL;
526 goto out_unlock;
527 }
528 if (new_size > device->bdev->bd_inode->i_size) {
529 ret = -EFBIG;
530 goto out_unlock;
531 }
532
533 do_div(new_size, root->sectorsize);
534 new_size *= root->sectorsize;
535
536 printk(KERN_INFO "new size for %s is %llu\n",
537 device->name, (unsigned long long)new_size);
538
539 if (new_size > old_size) {
540 trans = btrfs_start_transaction(root, 1);
541 ret = btrfs_grow_device(trans, device, new_size);
542 btrfs_commit_transaction(trans, root);
543 } else {
544 ret = btrfs_shrink_device(device, new_size);
545 }
546
547out_unlock:
548 mutex_unlock(&root->fs_info->volume_mutex);
549out:
550 kfree(vol_args);
551 return ret;
552}
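The resize handler parses vol_args->name as "[devid:]size", where size may carry a '+' or '-' prefix for a relative change or be the literal "max"; the result is clamped between 256 MiB and the device size and rounded down to a sector multiple. A userspace sketch growing device 1 (the default devid) to its maximum; the mount point and header path are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* assumed copy of fs/btrfs/ioctl.h */

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	strcpy(args.name, "max");	/* grow devid 1 to the whole device */
	if (ioctl(fd, BTRFS_IOC_RESIZE, &args) != 0)
		perror("resize");
	close(fd);
	return 0;
}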
553
554static noinline int btrfs_ioctl_snap_create(struct file *file,
555 void __user *arg, int subvol)
556{
557 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
558 struct btrfs_ioctl_vol_args *vol_args;
559 struct btrfs_dir_item *di;
560 struct btrfs_path *path;
561 struct file *src_file;
562 u64 root_dirid;
563 int namelen;
564 int ret = 0;
565
566 if (root->fs_info->sb->s_flags & MS_RDONLY)
567 return -EROFS;
568
569 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
570
571 if (!vol_args)
572 return -ENOMEM;
573
574 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
575 ret = -EFAULT;
576 goto out;
577 }
578
579 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
580 namelen = strlen(vol_args->name);
581 if (strchr(vol_args->name, '/')) {
582 ret = -EINVAL;
583 goto out;
584 }
585
586 path = btrfs_alloc_path();
587 if (!path) {
588 ret = -ENOMEM;
589 goto out;
590 }
591
592	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino;
593 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
594 path, root_dirid,
595 vol_args->name, namelen, 0);
596 btrfs_free_path(path);
597
598 if (di && !IS_ERR(di)) {
599 ret = -EEXIST;
600 goto out;
601 }
602
603 if (IS_ERR(di)) {
604 ret = PTR_ERR(di);
605 goto out;
606 }
607
608 if (subvol) {
609 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
610 file->f_path.dentry->d_inode->i_mode,
611 namelen, NULL);
612 } else {
613 struct inode *src_inode;
614 src_file = fget(vol_args->fd);
615 if (!src_file) {
616 ret = -EINVAL;
617 goto out;
618 }
619
620 src_inode = src_file->f_path.dentry->d_inode;
621 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
622 printk(KERN_INFO "btrfs: Snapshot src from "
623 "another FS\n");
624 ret = -EINVAL;
625 fput(src_file);
626 goto out;
627 }
628 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
629 file->f_path.dentry->d_inode->i_mode,
630 namelen, BTRFS_I(src_inode)->root);
631 fput(src_file);
632 }
633
634out:
635 kfree(vol_args);
636 return ret;
637}
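The snapshot/subvolume ioctl is issued on a directory inside the filesystem: vol_args->name (which may not contain '/') names the new entry there, and for snapshots vol_args->fd must refer to a file on the same filesystem whose root becomes the snapshot source. A userspace sketch; paths and the header location are hypothetical:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* assumed copy of fs/btrfs/ioctl.h */

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	int src_fd = open("/mnt/btrfs/subvol", O_RDONLY);
	int dst_fd = open("/mnt/btrfs", O_RDONLY);	/* parent directory */

	if (src_fd < 0 || dst_fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.fd = src_fd;		/* subvolume to snapshot */
	strcpy(args.name, "snap1");	/* no '/' allowed, see check above */
	if (ioctl(dst_fd, BTRFS_IOC_SNAP_CREATE, &args) != 0)
		perror("snapshot");
	close(src_fd);
	close(dst_fd);
	return 0;
}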
638
639static int btrfs_ioctl_defrag(struct file *file)
640{
641 struct inode *inode = fdentry(file)->d_inode;
642 struct btrfs_root *root = BTRFS_I(inode)->root;
643 int ret;
644
645 ret = mnt_want_write(file->f_path.mnt);
646 if (ret)
647 return ret;
648
649 switch (inode->i_mode & S_IFMT) {
650 case S_IFDIR:
651 if (!capable(CAP_SYS_ADMIN)) {
652 ret = -EPERM;
653 goto out;
654 }
655 btrfs_defrag_root(root, 0);
656 btrfs_defrag_root(root->fs_info->extent_root, 0);
657 break;
658 case S_IFREG:
659 if (!(file->f_mode & FMODE_WRITE)) {
660 ret = -EINVAL;
661 goto out;
662 }
663 btrfs_defrag_file(file);
664 break;
665 }
666out:
667 mnt_drop_write(file->f_path.mnt);
668 return ret;
669}
670
671static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
672{
673 struct btrfs_ioctl_vol_args *vol_args;
674 int ret;
675
676 if (!capable(CAP_SYS_ADMIN))
677 return -EPERM;
678
679 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
680
681 if (!vol_args)
682 return -ENOMEM;
683
684 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
685 ret = -EFAULT;
686 goto out;
687 }
688 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
689 ret = btrfs_init_new_device(root, vol_args->name);
690
691out:
692 kfree(vol_args);
693 return ret;
694}
695
696static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
697{
698 struct btrfs_ioctl_vol_args *vol_args;
699 int ret;
700
701 if (!capable(CAP_SYS_ADMIN))
702 return -EPERM;
703
704 if (root->fs_info->sb->s_flags & MS_RDONLY)
705 return -EROFS;
706
707 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
708
709 if (!vol_args)
710 return -ENOMEM;
711
712 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
713 ret = -EFAULT;
714 goto out;
715 }
716 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
717 ret = btrfs_rm_device(root, vol_args->name);
718
719out:
720 kfree(vol_args);
721 return ret;
722}
723
724static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
725 u64 off, u64 olen, u64 destoff)
726{
727 struct inode *inode = fdentry(file)->d_inode;
728 struct btrfs_root *root = BTRFS_I(inode)->root;
729 struct file *src_file;
730 struct inode *src;
731 struct btrfs_trans_handle *trans;
732 struct btrfs_path *path;
733 struct extent_buffer *leaf;
734 char *buf;
735 struct btrfs_key key;
736 u32 nritems;
737 int slot;
738 int ret;
739 u64 len = olen;
740 u64 bs = root->fs_info->sb->s_blocksize;
741 u64 hint_byte;
742
743 /*
744 * TODO:
745 * - split compressed inline extents. annoying: we need to
746 * decompress into destination's address_space (the file offset
747 * may change, so source mapping won't do), then recompress (or
748 * otherwise reinsert) a subrange.
749 * - allow ranges within the same file to be cloned (provided
750 * they don't overlap)?
751 */
752
753 /* the destination must be opened for writing */
754 if (!(file->f_mode & FMODE_WRITE))
755 return -EINVAL;
756
757 ret = mnt_want_write(file->f_path.mnt);
758 if (ret)
759 return ret;
760
761 src_file = fget(srcfd);
762 if (!src_file) {
763 ret = -EBADF;
764 goto out_drop_write;
765 }
766 src = src_file->f_dentry->d_inode;
767
768 ret = -EINVAL;
769 if (src == inode)
770 goto out_fput;
771
772 ret = -EISDIR;
773 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
774 goto out_fput;
775
776 ret = -EXDEV;
777 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
778 goto out_fput;
779
780 ret = -ENOMEM;
781 buf = vmalloc(btrfs_level_size(root, 0));
782 if (!buf)
783 goto out_fput;
784
785 path = btrfs_alloc_path();
786 if (!path) {
787 vfree(buf);
788 goto out_fput;
789 }
790 path->reada = 2;
791
792 if (inode < src) {
793 mutex_lock(&inode->i_mutex);
794 mutex_lock(&src->i_mutex);
795 } else {
796 mutex_lock(&src->i_mutex);
797 mutex_lock(&inode->i_mutex);
798 }
799
800 /* determine range to clone */
801 ret = -EINVAL;
802 if (off >= src->i_size || off + len > src->i_size)
803 goto out_unlock;
804 if (len == 0)
805 olen = len = src->i_size - off;
806 /* if we extend to eof, continue to block boundary */
807 if (off + len == src->i_size)
808 len = ((src->i_size + bs-1) & ~(bs-1))
809 - off;
810
811 /* verify the end result is block aligned */
812 if ((off & (bs-1)) ||
813 ((off + len) & (bs-1)))
814 goto out_unlock;
815
816 /* do any pending delalloc/csum calc on src, one way or
817 another, and lock file content */
818 while (1) {
819 struct btrfs_ordered_extent *ordered;
820 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
821 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
822 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
823 break;
824 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
825 if (ordered)
826 btrfs_put_ordered_extent(ordered);
827 btrfs_wait_ordered_range(src, off, off+len);
828 }
829
830 trans = btrfs_start_transaction(root, 1);
831 BUG_ON(!trans);
832
833 /* punch hole in destination first */
834 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
835
836 /* clone data */
837 key.objectid = src->i_ino;
838 key.type = BTRFS_EXTENT_DATA_KEY;
839 key.offset = 0;
840
841 while (1) {
842 /*
843 * note the key will change type as we walk through the
844 * tree.
845 */
846 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
847 if (ret < 0)
848 goto out;
849
850 nritems = btrfs_header_nritems(path->nodes[0]);
851 if (path->slots[0] >= nritems) {
852 ret = btrfs_next_leaf(root, path);
853 if (ret < 0)
854 goto out;
855 if (ret > 0)
856 break;
857 nritems = btrfs_header_nritems(path->nodes[0]);
858 }
859 leaf = path->nodes[0];
860 slot = path->slots[0];
861
862 btrfs_item_key_to_cpu(leaf, &key, slot);
863 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
864 key.objectid != src->i_ino)
865 break;
866
867 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
868 struct btrfs_file_extent_item *extent;
869 int type;
870 u32 size;
871 struct btrfs_key new_key;
872 u64 disko = 0, diskl = 0;
873 u64 datao = 0, datal = 0;
874 u8 comp;
875
876 size = btrfs_item_size_nr(leaf, slot);
877 read_extent_buffer(leaf, buf,
878 btrfs_item_ptr_offset(leaf, slot),
879 size);
880
881 extent = btrfs_item_ptr(leaf, slot,
882 struct btrfs_file_extent_item);
883 comp = btrfs_file_extent_compression(leaf, extent);
884 type = btrfs_file_extent_type(leaf, extent);
885 if (type == BTRFS_FILE_EXTENT_REG) {
886 disko = btrfs_file_extent_disk_bytenr(leaf,
887 extent);
888 diskl = btrfs_file_extent_disk_num_bytes(leaf,
889 extent);
890 datao = btrfs_file_extent_offset(leaf, extent);
891 datal = btrfs_file_extent_num_bytes(leaf,
892 extent);
893 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
894 /* take upper bound, may be compressed */
895 datal = btrfs_file_extent_ram_bytes(leaf,
896 extent);
897 }
898 btrfs_release_path(root, path);
899
900 if (key.offset + datal < off ||
901 key.offset >= off+len)
902 goto next;
903
904 memcpy(&new_key, &key, sizeof(new_key));
905 new_key.objectid = inode->i_ino;
906 new_key.offset = key.offset + destoff - off;
907
908 if (type == BTRFS_FILE_EXTENT_REG) {
909 ret = btrfs_insert_empty_item(trans, root, path,
910 &new_key, size);
911 if (ret)
912 goto out;
913
914 leaf = path->nodes[0];
915 slot = path->slots[0];
916 write_extent_buffer(leaf, buf,
917 btrfs_item_ptr_offset(leaf, slot),
918 size);
919
920 extent = btrfs_item_ptr(leaf, slot,
921 struct btrfs_file_extent_item);
922
923 if (off > key.offset) {
924 datao += off - key.offset;
925 datal -= off - key.offset;
926 }
927 if (key.offset + datao + datal + key.offset >
928 off + len)
929 datal = off + len - key.offset - datao;
930 /* disko == 0 means it's a hole */
931 if (!disko)
932 datao = 0;
933
934 btrfs_set_file_extent_offset(leaf, extent,
935 datao);
936 btrfs_set_file_extent_num_bytes(leaf, extent,
937 datal);
938 if (disko) {
939 inode_add_bytes(inode, datal);
940 ret = btrfs_inc_extent_ref(trans, root,
941 disko, diskl, leaf->start,
942 root->root_key.objectid,
943 trans->transid,
944 inode->i_ino);
945 BUG_ON(ret);
946 }
947 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
948 u64 skip = 0;
949 u64 trim = 0;
950 if (off > key.offset) {
951 skip = off - key.offset;
952 new_key.offset += skip;
953 }
954
955 if (key.offset + datal > off+len)
956 trim = key.offset + datal - (off+len);
957
958 if (comp && (skip || trim)) {
959 ret = -EINVAL;
960 goto out;
961 }
962 size -= skip + trim;
963 datal -= skip + trim;
964 ret = btrfs_insert_empty_item(trans, root, path,
965 &new_key, size);
966 if (ret)
967 goto out;
968
969 if (skip) {
970 u32 start =
971 btrfs_file_extent_calc_inline_size(0);
972 memmove(buf+start, buf+start+skip,
973 datal);
974 }
975
976 leaf = path->nodes[0];
977 slot = path->slots[0];
978 write_extent_buffer(leaf, buf,
979 btrfs_item_ptr_offset(leaf, slot),
980 size);
981 inode_add_bytes(inode, datal);
982 }
983
984 btrfs_mark_buffer_dirty(leaf);
985 }
986
987next:
988 btrfs_release_path(root, path);
989 key.offset++;
990 }
991 ret = 0;
992out:
993 btrfs_release_path(root, path);
994 if (ret == 0) {
995 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
996 if (destoff + olen > inode->i_size)
997 btrfs_i_size_write(inode, destoff + olen);
998 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
999 ret = btrfs_update_inode(trans, root, inode);
1000 }
1001 btrfs_end_transaction(trans, root);
1002 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1003 if (ret)
1004 vmtruncate(inode, 0);
1005out_unlock:
1006 mutex_unlock(&src->i_mutex);
1007 mutex_unlock(&inode->i_mutex);
1008 vfree(buf);
1009 btrfs_free_path(path);
1010out_fput:
1011 fput(src_file);
1012out_drop_write:
1013 mnt_drop_write(file->f_path.mnt);
1014 return ret;
1015}
1016
1017static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
1018{
1019 struct btrfs_ioctl_clone_range_args args;
1020
1021 if (copy_from_user(&args, argp, sizeof(args)))
1022 return -EFAULT;
1023 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1024 args.src_length, args.dest_offset);
1025}
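BTRFS_IOC_CLONE_RANGE reflinks extents instead of copying bytes: the destination must be open for writing, both files must live in the same root, and offsets and lengths must be block aligned unless the range runs to EOF, per the checks in btrfs_ioctl_clone() above. A caller-side sketch; the file names and header path are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* assumed copy of fs/btrfs/ioctl.h */

int main(void)
{
	struct btrfs_ioctl_clone_range_args args;
	int src = open("/mnt/btrfs/src", O_RDONLY);
	int dst = open("/mnt/btrfs/dst", O_RDWR | O_CREAT, 0644);

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.src_fd = src;
	args.src_offset = 0;
	args.src_length = 1 << 20;	/* must be block aligned */
	args.dest_offset = 0;
	if (ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) != 0)
		perror("clone");
	close(src);
	close(dst);
	return 0;
}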
1026
1027/*
1028 * there are many ways the trans_start and trans_end ioctls can lead
1029 * to deadlocks. They should only be used by applications that
1030 * basically own the machine, and have a very in-depth understanding
1031 * of all the possible deadlocks and enospc problems.
1032 */
1033static long btrfs_ioctl_trans_start(struct file *file)
1034{
1035 struct inode *inode = fdentry(file)->d_inode;
1036 struct btrfs_root *root = BTRFS_I(inode)->root;
1037 struct btrfs_trans_handle *trans;
1038 int ret = 0;
1039
1040 if (!capable(CAP_SYS_ADMIN))
1041 return -EPERM;
1042
1043 if (file->private_data) {
1044 ret = -EINPROGRESS;
1045 goto out;
1046 }
1047
1048 ret = mnt_want_write(file->f_path.mnt);
1049 if (ret)
1050 goto out;
1051
1052 mutex_lock(&root->fs_info->trans_mutex);
1053 root->fs_info->open_ioctl_trans++;
1054 mutex_unlock(&root->fs_info->trans_mutex);
1055
1056 trans = btrfs_start_ioctl_transaction(root, 0);
1057 if (trans)
1058 file->private_data = trans;
1059 else
1060 ret = -ENOMEM;
1061 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
1062out:
1063 return ret;
1064}
1065
1066/*
1067 * there are many ways the trans_start and trans_end ioctls can lead
1068 * to deadlocks. They should only be used by applications that
1069 * basically own the machine, and have a very in-depth understanding
1070 * of all the possible deadlocks and enospc problems.
1071 */
1072long btrfs_ioctl_trans_end(struct file *file)
1073{
1074 struct inode *inode = fdentry(file)->d_inode;
1075 struct btrfs_root *root = BTRFS_I(inode)->root;
1076 struct btrfs_trans_handle *trans;
1077 int ret = 0;
1078
1079 trans = file->private_data;
1080 if (!trans) {
1081 ret = -EINVAL;
1082 goto out;
1083 }
1084 btrfs_end_transaction(trans, root);
1085 file->private_data = NULL;
1086
1087 mutex_lock(&root->fs_info->trans_mutex);
1088 root->fs_info->open_ioctl_trans--;
1089 mutex_unlock(&root->fs_info->trans_mutex);
1090
1091 mnt_drop_write(file->f_path.mnt);
1092
1093out:
1094 return ret;
1095}
1096
1097long btrfs_ioctl(struct file *file, unsigned int
1098 cmd, unsigned long arg)
1099{
1100 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1101 void __user *argp = (void __user *)arg;
1102
1103 switch (cmd) {
1104 case BTRFS_IOC_SNAP_CREATE:
1105 return btrfs_ioctl_snap_create(file, argp, 0);
1106 case BTRFS_IOC_SUBVOL_CREATE:
1107 return btrfs_ioctl_snap_create(file, argp, 1);
1108 case BTRFS_IOC_DEFRAG:
1109 return btrfs_ioctl_defrag(file);
1110 case BTRFS_IOC_RESIZE:
1111 return btrfs_ioctl_resize(root, argp);
1112 case BTRFS_IOC_ADD_DEV:
1113 return btrfs_ioctl_add_dev(root, argp);
1114 case BTRFS_IOC_RM_DEV:
1115 return btrfs_ioctl_rm_dev(root, argp);
1116 case BTRFS_IOC_BALANCE:
1117 return btrfs_balance(root->fs_info->dev_root);
1118 case BTRFS_IOC_CLONE:
1119 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1120 case BTRFS_IOC_CLONE_RANGE:
1121 return btrfs_ioctl_clone_range(file, argp);
1122 case BTRFS_IOC_TRANS_START:
1123 return btrfs_ioctl_trans_start(file);
1124 case BTRFS_IOC_TRANS_END:
1125 return btrfs_ioctl_trans_end(file);
1126 case BTRFS_IOC_SYNC:
1127 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1128 return 0;
1129 }
1130
1131 return -ENOTTY;
1132}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..78049ea208db
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 3072
26
27struct btrfs_ioctl_vol_args {
28 __s64 fd;
29 char name[BTRFS_PATH_NAME_MAX + 1];
30};
31
32#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
33 struct btrfs_ioctl_vol_args)
34#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
35 struct btrfs_ioctl_vol_args)
36#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
37 struct btrfs_ioctl_vol_args)
38#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
39 struct btrfs_ioctl_vol_args)
40/* trans start and trans end are dangerous, and only for
41 * use by applications that know how to avoid the
42 * resulting deadlocks
43 */
44#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
45#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
46#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
47
48#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
49#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
50 struct btrfs_ioctl_vol_args)
51#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
52 struct btrfs_ioctl_vol_args)
53#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
54 struct btrfs_ioctl_vol_args)
55struct btrfs_ioctl_clone_range_args {
56 __s64 src_fd;
57 __u64 src_offset, src_length;
58 __u64 dest_offset;
59};
60
61#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
62 struct btrfs_ioctl_clone_range_args)
63
64#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
65 struct btrfs_ioctl_vol_args)
66
67#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..39bae7761db6
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks
30 * and the spin is not tuned very extensively. The spinning does make a big
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */
37
38int btrfs_tree_lock(struct extent_buffer *eb)
39{
40 int i;
41
42 if (mutex_trylock(&eb->mutex))
43 return 0;
44 for (i = 0; i < 512; i++) {
45 cpu_relax();
46 if (mutex_trylock(&eb->mutex))
47 return 0;
48 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0;
52}
53
54int btrfs_try_tree_lock(struct extent_buffer *eb)
55{
56 return mutex_trylock(&eb->mutex);
57}
58
59int btrfs_tree_unlock(struct extent_buffer *eb)
60{
61 mutex_unlock(&eb->mutex);
62 return 0;
63}
64
65int btrfs_tree_locked(struct extent_buffer *eb)
66{
67 return mutex_is_locked(&eb->mutex);
68}
69
70/*
71 * btrfs_search_slot uses this to decide if it should drop its locks
72 * before doing something expensive like allocating free blocks for cow.
73 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
75{
76 int i;
77 struct extent_buffer *eb;
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
79 eb = path->nodes[i];
80 if (!eb)
81 break;
82 smp_mb();
83 if (!list_empty(&eb->mutex.wait_list))
84 return 1;
85 }
86 return 0;
87}
88
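btrfs_tree_lock() above is adaptive: a bounded trylock spin (512 iterations separated by cpu_relax()) catches holders doing short btree work, then it gives up and takes the mutex the blocking way. A userspace analogue of the same strategy with POSIX threads; the pause instruction stands in for cpu_relax() and assumes x86:

#include <pthread.h>

static void adaptive_lock(pthread_mutex_t *m)
{
	int i;

	if (pthread_mutex_trylock(m) == 0)
		return;
	for (i = 0; i < 512; i++) {
		__asm__ __volatile__("pause");	/* x86-only cpu_relax() */
		if (pthread_mutex_trylock(m) == 0)
			return;
	}
	pthread_mutex_lock(m);	/* stop spinning, sleep until free */
}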
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_
21
22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb);
25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
27#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..a20940170274
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,730 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29static u64 entry_end(struct btrfs_ordered_extent *entry)
30{
31 if (entry->file_offset + entry->len < entry->file_offset)
32 return (u64)-1;
33 return entry->file_offset + entry->len;
34}
35
36/* returns NULL if the insertion worked, or the existing node that was
37 * found in the tree
38 */
39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
40 struct rb_node *node)
41{
42 struct rb_node **p = &root->rb_node;
43 struct rb_node *parent = NULL;
44 struct btrfs_ordered_extent *entry;
45
46 while (*p) {
47 parent = *p;
48 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
49
50 if (file_offset < entry->file_offset)
51 p = &(*p)->rb_left;
52 else if (file_offset >= entry_end(entry))
53 p = &(*p)->rb_right;
54 else
55 return parent;
56 }
57
58 rb_link_node(node, parent, p);
59 rb_insert_color(node, root);
60 return NULL;
61}
62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
68 struct rb_node **prev_ret)
69{
70 struct rb_node *n = root->rb_node;
71 struct rb_node *prev = NULL;
72 struct rb_node *test;
73 struct btrfs_ordered_extent *entry;
74 struct btrfs_ordered_extent *prev_entry = NULL;
75
76 while (n) {
77 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
78 prev = n;
79 prev_entry = entry;
80
81 if (file_offset < entry->file_offset)
82 n = n->rb_left;
83 else if (file_offset >= entry_end(entry))
84 n = n->rb_right;
85 else
86 return n;
87 }
88 if (!prev_ret)
89 return NULL;
90
91 while (prev && file_offset >= entry_end(prev_entry)) {
92 test = rb_next(prev);
93 if (!test)
94 break;
95 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
96 rb_node);
97 if (file_offset < entry_end(prev_entry))
98 break;
99
100 prev = test;
101 }
102 if (prev)
103 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
104 rb_node);
105 while (prev && file_offset < entry_end(prev_entry)) {
106 test = rb_prev(prev);
107 if (!test)
108 break;
109 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
110 rb_node);
111 prev = test;
112 }
113 *prev_ret = prev;
114 return NULL;
115}
116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
121{
122 if (file_offset < entry->file_offset ||
123 entry->file_offset + entry->len <= file_offset)
124 return 0;
125 return 1;
126}
127
128/*
129 * look for the first ordered struct that has this offset, otherwise
130 * the first one less than this offset
131 */
132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
133 u64 file_offset)
134{
135 struct rb_root *root = &tree->tree;
136 struct rb_node *prev;
137 struct rb_node *ret;
138 struct btrfs_ordered_extent *entry;
139
140 if (tree->last) {
141 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
142 rb_node);
143 if (offset_in_entry(entry, file_offset))
144 return tree->last;
145 }
146 ret = __tree_search(root, file_offset, &prev);
147 if (!ret)
148 ret = prev;
149 if (ret)
150 tree->last = ret;
151 return ret;
152}
153
154/* allocate and add a new ordered_extent into the per-inode tree.
155 * file_offset is the logical offset in the file
156 *
157 * start is the disk block number of an extent already reserved in the
158 * extent allocation tree
159 *
160 * len is the length of the extent
161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was
165 * inserted.
166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, u64 disk_len, int type)
169{
170 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node;
172 struct btrfs_ordered_extent *entry;
173
174 tree = &BTRFS_I(inode)->ordered_tree;
175 entry = kzalloc(sizeof(*entry), GFP_NOFS);
176 if (!entry)
177 return -ENOMEM;
178
179 mutex_lock(&tree->mutex);
180 entry->file_offset = file_offset;
181 entry->start = start;
182 entry->len = len;
183 entry->disk_len = disk_len;
184 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags);
187
188 /* one ref for the tree */
189 atomic_set(&entry->refs, 1);
190 init_waitqueue_head(&entry->wait);
191 INIT_LIST_HEAD(&entry->list);
192 INIT_LIST_HEAD(&entry->root_extent_list);
193
194 node = tree_insert(&tree->tree, file_offset,
195 &entry->rb_node);
196 BUG_ON(node);
197
198 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199 entry_end(entry) - 1, GFP_NOFS);
200
201 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202 list_add_tail(&entry->root_extent_list,
203 &BTRFS_I(inode)->root->fs_info->ordered_extents);
204 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
205
206 mutex_unlock(&tree->mutex);
207 BUG_ON(node);
208 return 0;
209}
210
211/*
212 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
213 * when an ordered extent is finished. If the list covers more than one
214 * ordered extent, it is split across multiples.
215 */
216int btrfs_add_ordered_sum(struct inode *inode,
217 struct btrfs_ordered_extent *entry,
218 struct btrfs_ordered_sum *sum)
219{
220 struct btrfs_ordered_inode_tree *tree;
221
222 tree = &BTRFS_I(inode)->ordered_tree;
223 mutex_lock(&tree->mutex);
224 list_add_tail(&sum->list, &entry->list);
225 mutex_unlock(&tree->mutex);
226 return 0;
227}
228
229/*
230 * this is used to account for finished IO across a given range
231 * of the file. The IO should not span ordered extents. If
232 * a given ordered_extent is completely done, 1 is returned, otherwise
233 * 0.
234 *
235 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
236 * to make sure this function only returns 1 once for a given ordered extent.
237 */
238int btrfs_dec_test_ordered_pending(struct inode *inode,
239 u64 file_offset, u64 io_size)
240{
241 struct btrfs_ordered_inode_tree *tree;
242 struct rb_node *node;
243 struct btrfs_ordered_extent *entry;
244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245 int ret;
246
247 tree = &BTRFS_I(inode)->ordered_tree;
248 mutex_lock(&tree->mutex);
249 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250 GFP_NOFS);
251 node = tree_search(tree, file_offset);
252 if (!node) {
253 ret = 1;
254 goto out;
255 }
256
257 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
258 if (!offset_in_entry(entry, file_offset)) {
259 ret = 1;
260 goto out;
261 }
262
263 ret = test_range_bit(io_tree, entry->file_offset,
264 entry->file_offset + entry->len - 1,
265 EXTENT_ORDERED, 0);
266 if (ret == 0)
267 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
268out:
269 mutex_unlock(&tree->mutex);
270 return ret == 0;
271}
272
273/*
274 * used to drop a reference on an ordered extent. This will free
275 * the extent if the last reference is dropped
276 */
277int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
278{
279 struct list_head *cur;
280 struct btrfs_ordered_sum *sum;
281
282 if (atomic_dec_and_test(&entry->refs)) {
283 while (!list_empty(&entry->list)) {
284 cur = entry->list.next;
285 sum = list_entry(cur, struct btrfs_ordered_sum, list);
286 list_del(&sum->list);
287 kfree(sum);
288 }
289 kfree(entry);
290 }
291 return 0;
292}
293
294/*
295 * remove an ordered extent from the tree. No references are dropped
296 * but anyone waiting on this extent is woken up.
297 */
298int btrfs_remove_ordered_extent(struct inode *inode,
299 struct btrfs_ordered_extent *entry)
300{
301 struct btrfs_ordered_inode_tree *tree;
302 struct rb_node *node;
303
304 tree = &BTRFS_I(inode)->ordered_tree;
305 mutex_lock(&tree->mutex);
306 node = &entry->rb_node;
307 rb_erase(node, &tree->tree);
308 tree->last = NULL;
309 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list);
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314
315 mutex_unlock(&tree->mutex);
316 wake_up(&entry->wait);
317 return 0;
318}
319
320/*
321 * wait for all the ordered extents in a root. This is done when balancing
322 * space between drives.
323 */
324int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
325{
326 struct list_head splice;
327 struct list_head *cur;
328 struct btrfs_ordered_extent *ordered;
329 struct inode *inode;
330
331 INIT_LIST_HEAD(&splice);
332
333 spin_lock(&root->fs_info->ordered_extent_lock);
334 list_splice_init(&root->fs_info->ordered_extents, &splice);
335 while (!list_empty(&splice)) {
336 cur = splice.next;
337 ordered = list_entry(cur, struct btrfs_ordered_extent,
338 root_extent_list);
339 if (nocow_only &&
340 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
341 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
342 list_move(&ordered->root_extent_list,
343 &root->fs_info->ordered_extents);
344 cond_resched_lock(&root->fs_info->ordered_extent_lock);
345 continue;
346 }
347
348 list_del_init(&ordered->root_extent_list);
349 atomic_inc(&ordered->refs);
350
351 /*
352 * the inode may be getting freed (in sys_unlink path).
353 */
354 inode = igrab(ordered->inode);
355
356 spin_unlock(&root->fs_info->ordered_extent_lock);
357
358 if (inode) {
359 btrfs_start_ordered_extent(inode, ordered, 1);
360 btrfs_put_ordered_extent(ordered);
361 iput(inode);
362 } else {
363 btrfs_put_ordered_extent(ordered);
364 }
365
366 spin_lock(&root->fs_info->ordered_extent_lock);
367 }
368 spin_unlock(&root->fs_info->ordered_extent_lock);
369 return 0;
370}
371
372/*
373 * Used to start IO or wait for a given ordered extent to finish.
374 *
375 * If wait is one, this effectively waits on page writeback for all the pages
376 * in the extent, and it waits on the io completion code to insert
377 * metadata into the btree corresponding to the extent
378 */
379void btrfs_start_ordered_extent(struct inode *inode,
380 struct btrfs_ordered_extent *entry,
381 int wait)
382{
383 u64 start = entry->file_offset;
384 u64 end = start + entry->len - 1;
385
386 /*
387 * pages in the range can be dirty, clean or writeback. We
388 * start IO on any dirty ones so the wait doesn't stall waiting
389 * for pdflush to find them
390 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
392 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags));
395 }
396}
397
398/*
399 * Used to wait on ordered extents across a large range of bytes.
400 */
401int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
402{
403 u64 end;
404 u64 orig_end;
405 u64 wait_end;
406 struct btrfs_ordered_extent *ordered;
407
408 if (start + len < start) {
409 orig_end = INT_LIMIT(loff_t);
410 } else {
411 orig_end = start + len - 1;
412 if (orig_end > INT_LIMIT(loff_t))
413 orig_end = INT_LIMIT(loff_t);
414 }
415 wait_end = orig_end;
416again:
417 /* start IO across the range first to instantiate any delalloc
418 * extents
419 */
420 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
421
422 /* The compression code will leave pages locked but return from
423 * writepage without setting the page writeback. Starting again
424 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
425 */
426 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
427
428 btrfs_wait_on_page_writeback_range(inode->i_mapping,
429 start >> PAGE_CACHE_SHIFT,
430 orig_end >> PAGE_CACHE_SHIFT);
431
432 end = orig_end;
433 while (1) {
434 ordered = btrfs_lookup_first_ordered_extent(inode, end);
435 if (!ordered)
436 break;
437 if (ordered->file_offset > orig_end) {
438 btrfs_put_ordered_extent(ordered);
439 break;
440 }
441 if (ordered->file_offset + ordered->len < start) {
442 btrfs_put_ordered_extent(ordered);
443 break;
444 }
445 btrfs_start_ordered_extent(inode, ordered, 1);
446 end = ordered->file_offset;
447 btrfs_put_ordered_extent(ordered);
448 if (end == 0 || end == start)
449 break;
450 end--;
451 }
452 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
453 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
454 schedule_timeout(1);
455 goto again;
456 }
457 return 0;
458}
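/*
 * Example of the wraparound clamp above: a caller that wants "the whole
 * file" can pass len == (u64)-1; start + len then wraps past zero, so
 * orig_end is pinned to INT_LIMIT(loff_t) and the wait covers everything
 * from 'start' to the largest possible offset.
 */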
459
460/*
461 * find an ordered extent corresponding to file_offset. return NULL if
462 * nothing is found, otherwise take a reference on the extent and return it
463 */
464struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
465 u64 file_offset)
466{
467 struct btrfs_ordered_inode_tree *tree;
468 struct rb_node *node;
469 struct btrfs_ordered_extent *entry = NULL;
470
471 tree = &BTRFS_I(inode)->ordered_tree;
472 mutex_lock(&tree->mutex);
473 node = tree_search(tree, file_offset);
474 if (!node)
475 goto out;
476
477 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
478 if (!offset_in_entry(entry, file_offset))
479 entry = NULL;
480 if (entry)
481 atomic_inc(&entry->refs);
482out:
483 mutex_unlock(&tree->mutex);
484 return entry;
485}
486
487/*
488 * lookup and return any extent before 'file_offset'. NULL is returned
489 * if none is found
490 */
491struct btrfs_ordered_extent *
492btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
493{
494 struct btrfs_ordered_inode_tree *tree;
495 struct rb_node *node;
496 struct btrfs_ordered_extent *entry = NULL;
497
498 tree = &BTRFS_I(inode)->ordered_tree;
499 mutex_lock(&tree->mutex);
500 node = tree_search(tree, file_offset);
501 if (!node)
502 goto out;
503
504 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
505 atomic_inc(&entry->refs);
506out:
507 mutex_unlock(&tree->mutex);
508 return entry;
509}
510
511/*
512 * After an extent is done, call this to conditionally update the on disk
513 * i_size. i_size is updated to cover any fully written part of the file.
514 */
515int btrfs_ordered_update_i_size(struct inode *inode,
516 struct btrfs_ordered_extent *ordered)
517{
518 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
519 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
520 u64 disk_i_size;
521 u64 new_i_size;
522 u64 i_size_test;
523 struct rb_node *node;
524 struct btrfs_ordered_extent *test;
525
526 mutex_lock(&tree->mutex);
527 disk_i_size = BTRFS_I(inode)->disk_i_size;
528
529 /*
530 * if the disk i_size is already at the inode->i_size, or
531 * this ordered extent is inside the disk i_size, we're done
532 */
533 if (disk_i_size >= inode->i_size ||
534 ordered->file_offset + ordered->len <= disk_i_size) {
535 goto out;
536 }
537
538 /*
539 * we can't update the disk_i_size if there are delalloc bytes
540 * between disk_i_size and this ordered extent
541 */
542 if (test_range_bit(io_tree, disk_i_size,
543 ordered->file_offset + ordered->len - 1,
544 EXTENT_DELALLOC, 0)) {
545 goto out;
546 }
547 /*
548 * walk backward from this ordered extent to disk_i_size.
549 * if we find an ordered extent then we can't update disk i_size
550 * yet
551 */
552 node = &ordered->rb_node;
553 while (1) {
554 node = rb_prev(node);
555 if (!node)
556 break;
557 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
558 if (test->file_offset + test->len <= disk_i_size)
559 break;
560 if (test->file_offset >= inode->i_size)
561 break;
562 if (test->file_offset >= disk_i_size)
563 goto out;
564 }
565 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
566
567 /*
568 * at this point, we know we can safely update i_size to at least
569 * the offset from this ordered extent. But, we need to
570 * walk forward and see if ios from higher up in the file have
571 * finished.
572 */
573 node = rb_next(&ordered->rb_node);
574 i_size_test = 0;
575 if (node) {
576 /*
577 * do we have an area where IO might have finished
578 * between our ordered extent and the next one?
579 */
580 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
581 if (test->file_offset > entry_end(ordered))
582 i_size_test = test->file_offset;
583 } else {
584 i_size_test = i_size_read(inode);
585 }
586
587 /*
588 * i_size_test is the end of a region after this ordered
589 * extent where there are no ordered extents. As long as there
590 * are no delalloc bytes in this area, it is safe to update
591 * disk_i_size to the end of the region.
592 */
593 if (i_size_test > entry_end(ordered) &&
594 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
595 EXTENT_DELALLOC, 0)) {
596 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
597 }
598 BTRFS_I(inode)->disk_i_size = new_i_size;
599out:
600 mutex_unlock(&tree->mutex);
601 return 0;
602}
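/*
 * Worked example for the update above, assuming 4K units: with
 * disk_i_size == 0, i_size == 12K and this ordered extent covering
 * [0, 4K), new_i_size starts as min(4K, 12K) == 4K. If the next ordered
 * extent begins at 8K and [4K, 8K) carries no delalloc bits, i_size_test
 * becomes 8K and disk_i_size jumps straight to 8K.
 */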
603
604/*
605 * search the ordered extents for one corresponding to 'offset' and
606 * try to find a checksum. This is used because we allow pages to
607 * be reclaimed before their checksum is actually put into the btree
608 */
609int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
610 u32 *sum)
611{
612 struct btrfs_ordered_sum *ordered_sum;
613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors;
618 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
620 int ret = 1;
621
622 ordered = btrfs_lookup_ordered_extent(inode, offset);
623 if (!ordered)
624 return 1;
625
626 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums;
632 for (i = 0; i < num_sectors; i++) {
633 if (sector_sums[i].bytenr == disk_bytenr) {
634 *sum = sector_sums[i].sum;
635 ret = 0;
636 goto out;
637 }
638 }
639 }
640 }
641out:
642 mutex_unlock(&tree->mutex);
643 btrfs_put_ordered_extent(ordered);
644 return ret;
645}
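/*
 * A hypothetical read-side sketch (names here are illustrative): when a
 * checksum is not yet in the csum tree, it may still be pending on an
 * in-flight ordered extent:
 *
 *	u32 csum;
 *	if (btrfs_find_ordered_sum(inode, file_offset, bytenr, &csum) == 0)
 *		csum now holds the sector sum for 'bytenr'
 *
 * A return of 1 means no matching sum is queued here.
 */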
646
647
648/**
649 * taken from mm/filemap.c because it isn't exported
650 *
651 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
652 * @mapping: address space structure to write
653 * @start: offset in bytes where the range starts
654 * @end: offset in bytes where the range ends (inclusive)
655 * @sync_mode: enable synchronous operation
656 *
657 * Start writeback against all of a mapping's dirty pages that lie
658 * within the byte offsets <start, end> inclusive.
659 *
660 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
661 * opposed to a regular memory cleansing writeback. The difference between
662 * these two operations is that if a dirty page/buffer is encountered, it must
663 * be waited upon, and not just skipped over.
664 */
665int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
666 loff_t end, int sync_mode)
667{
668 struct writeback_control wbc = {
669 .sync_mode = sync_mode,
670 .nr_to_write = mapping->nrpages * 2,
671 .range_start = start,
672 .range_end = end,
673 .for_writepages = 1,
674 };
675 return btrfs_writepages(mapping, &wbc);
676}
677
678/**
679 * taken from mm/filemap.c because it isn't exported
680 *
681 * wait_on_page_writeback_range - wait for writeback to complete
682 * @mapping: target address_space
683 * @start: beginning page index
684 * @end: ending page index
685 *
686 * Wait for writeback to complete against pages indexed by start->end
687 * inclusive
688 */
689int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
690 pgoff_t start, pgoff_t end)
691{
692 struct pagevec pvec;
693 int nr_pages;
694 int ret = 0;
695 pgoff_t index;
696
697 if (end < start)
698 return 0;
699
700 pagevec_init(&pvec, 0);
701 index = start;
702 while ((index <= end) &&
703 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
704 PAGECACHE_TAG_WRITEBACK,
705 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
706 unsigned i;
707
708 for (i = 0; i < nr_pages; i++) {
709 struct page *page = pvec.pages[i];
710
711 /* until radix tree lookup accepts end_index */
712 if (page->index > end)
713 continue;
714
715 wait_on_page_writeback(page);
716 if (PageError(page))
717 ret = -EIO;
718 }
719 pagevec_release(&pvec);
720 cond_resched();
721 }
722
723 /* Check for outstanding write errors */
724 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
725 ret = -ENOSPC;
726 if (test_and_clear_bit(AS_EIO, &mapping->flags))
727 ret = -EIO;
728
729 return ret;
730}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..ab66d5e8d6d6
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
22/* one of these per inode */
23struct btrfs_ordered_inode_tree {
24 struct mutex mutex;
25 struct rb_root tree;
26 struct rb_node *last;
27};
28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list to the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */
43 u64 bytenr;
44
45 /*
46 * this is the length in bytes covered by the sums array below.
47 */
48 unsigned long len;
49 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */
51 struct btrfs_sector_sum sums[];
52};
53
54/*
55 * bits for the flags field:
56 *
57 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
58 * It is used to make sure metadata is inserted into the tree only once
59 * per extent.
60 *
61 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
62 * rbtree, just before waking any waiters. It is used to indicate the
63 * IO is done and any metadata is inserted into the tree.
64 */
65#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
66
67#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
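/*
 * All of the above are bit numbers for the atomic bitops on
 * btrfs_ordered_extent->flags; for example, waiters sleep until
 * test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags) turns true, and
 * btrfs_remove_ordered_extent sets that bit just before waking them.
 */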
74
75struct btrfs_ordered_extent {
76 /* logical offset in the file */
77 u64 file_offset;
78
79 /* disk byte number */
80 u64 start;
81
82 /* ram length of the extent in bytes */
83 u64 len;
84
85 /* extent length on disk */
86 u64 disk_len;
87
88 /* flags (described above) */
89 unsigned long flags;
90
91 /* reference count */
92 atomic_t refs;
93
94 /* the inode we belong to */
95 struct inode *inode;
96
97 /* list of checksums for insertion when the extent io is done */
98 struct list_head list;
99
100 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
101 wait_queue_head_t wait;
102
103 /* our friendly rbtree entry */
104 struct rb_node rb_node;
105
106 /* a per root list of all the pending ordered extents */
107 struct list_head root_extent_list;
108};
109
110
111/*
112 * calculates the total size you need to allocate for an ordered sum
113 * structure spanning 'bytes' in the file
114 */
115static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
116 unsigned long bytes)
117{
118 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
119 root->sectorsize;
120 num_sectors++;
121 return sizeof(struct btrfs_ordered_sum) +
122 num_sectors * sizeof(struct btrfs_sector_sum);
123}
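/*
 * Example with a 4K sectorsize: bytes == 16K rounds up to 4 sectors, and
 * the helper reserves one extra slot, so the allocation is
 * sizeof(struct btrfs_ordered_sum) + 5 * sizeof(struct btrfs_sector_sum).
 */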
124
125static inline void
126btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
127{
128 mutex_init(&t->mutex);
129 t->tree.rb_node = NULL;
130 t->last = NULL;
131}
132
133int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
134int btrfs_remove_ordered_extent(struct inode *inode,
135 struct btrfs_ordered_extent *entry);
136int btrfs_dec_test_ordered_pending(struct inode *inode,
137 u64 file_offset, u64 io_size);
138int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
139 u64 start, u64 len, u64 disk_len, int type);
140int btrfs_add_ordered_sum(struct inode *inode,
141 struct btrfs_ordered_extent *entry,
142 struct btrfs_ordered_sum *sum);
143struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
144 u64 file_offset);
145void btrfs_start_ordered_extent(struct inode *inode,
146 struct btrfs_ordered_extent *entry, int wait);
147int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
148struct btrfs_ordered_extent *
149btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset);
150int btrfs_ordered_update_i_size(struct inode *inode,
151 struct btrfs_ordered_extent *ordered);
152int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
153int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
154 pgoff_t start, pgoff_t end);
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
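/*
 * Orphan items record objects (the object id goes in key.offset) that may
 * need cleanup after a crash, e.g. inodes whose unlink was still in
 * flight. They all share BTRFS_ORPHAN_OBJECTID as the key objectid so
 * recovery can scan them as one contiguous key range.
 */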
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..5f8f218c1005
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,216 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
23static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
24{
25 int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
26 int i;
27 printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
28 "num_stripes %d\n",
29 (unsigned long long)btrfs_chunk_length(eb, chunk),
30 (unsigned long long)btrfs_chunk_owner(eb, chunk),
31 (unsigned long long)btrfs_chunk_type(eb, chunk),
32 num_stripes);
33 for (i = 0 ; i < num_stripes ; i++) {
34 printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
35 (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
36 (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
37 }
38}
39static void print_dev_item(struct extent_buffer *eb,
40 struct btrfs_dev_item *dev_item)
41{
42 printk(KERN_INFO "\t\tdev item devid %llu "
43 "total_bytes %llu bytes used %llu\n",
44 (unsigned long long)btrfs_device_id(eb, dev_item),
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47}
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{
50 int i;
51 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi;
59 struct btrfs_key key;
60 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr,
67 btrfs_leaf_free_space(root, l));
68 for (i = 0 ; i < nr ; i++) {
69 item = btrfs_item_nr(l, i);
70 btrfs_item_key_to_cpu(l, &key, i);
71 type = btrfs_key_type(&key);
72 printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
73 "itemsize %d\n",
74 i,
75 (unsigned long long)key.objectid, type,
76 (unsigned long long)key.offset,
77 btrfs_item_offset(l, item), btrfs_item_size(l, item));
78 switch (type) {
79 case BTRFS_INODE_ITEM_KEY:
80 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
81 printk(KERN_INFO "\t\tinode generation %llu size %llu "
82 "mode %o\n",
83 (unsigned long long)
84 btrfs_inode_generation(l, ii),
85 (unsigned long long)btrfs_inode_size(l, ii),
86 btrfs_inode_mode(l, ii));
87 break;
88 case BTRFS_DIR_ITEM_KEY:
89 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
90 btrfs_dir_item_key_to_cpu(l, di, &found_key);
91 printk(KERN_INFO "\t\tdir oid %llu type %u\n",
92 (unsigned long long)found_key.objectid,
93 btrfs_dir_type(l, di));
94 break;
95 case BTRFS_ROOT_ITEM_KEY:
96 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
97 printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
98 (unsigned long long)
99 btrfs_disk_root_bytenr(l, ri),
100 btrfs_disk_root_refs(l, ri));
101 break;
102 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
104 printk(KERN_INFO "\t\textent data refs %u\n",
105 btrfs_extent_refs(l, ei));
106 break;
107 case BTRFS_EXTENT_REF_KEY:
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
109 printk(KERN_INFO "\t\textent back ref root %llu "
110 "gen %llu owner %llu num_refs %lu\n",
111 (unsigned long long)btrfs_ref_root(l, ref),
112 (unsigned long long)btrfs_ref_generation(l, ref),
113 (unsigned long long)btrfs_ref_objectid(l, ref),
114 (unsigned long)btrfs_ref_num_refs(l, ref));
115 break;
116
117 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item);
120 if (btrfs_file_extent_type(l, fi) ==
121 BTRFS_FILE_EXTENT_INLINE) {
122 printk(KERN_INFO "\t\tinline extent data "
123 "size %u\n",
124 btrfs_file_extent_inline_len(l, fi));
125 break;
126 }
127 printk(KERN_INFO "\t\textent data disk bytenr %llu "
128 "nr %llu\n",
129 (unsigned long long)
130 btrfs_file_extent_disk_bytenr(l, fi),
131 (unsigned long long)
132 btrfs_file_extent_disk_num_bytes(l, fi));
133 printk(KERN_INFO "\t\textent data offset %llu "
134 "nr %llu ram %llu\n",
135 (unsigned long long)
136 btrfs_file_extent_offset(l, fi),
137 (unsigned long long)
138 btrfs_file_extent_num_bytes(l, fi),
139 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi));
141 break;
142 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item);
145 printk(KERN_INFO "\t\tblock group used %llu\n",
146 (unsigned long long)
147 btrfs_disk_block_group_used(l, bi));
148 break;
149 case BTRFS_CHUNK_ITEM_KEY:
150 print_chunk(l, btrfs_item_ptr(l, i,
151 struct btrfs_chunk));
152 break;
153 case BTRFS_DEV_ITEM_KEY:
154 print_dev_item(l, btrfs_item_ptr(l, i,
155 struct btrfs_dev_item));
156 break;
157 case BTRFS_DEV_EXTENT_KEY:
158 dev_extent = btrfs_item_ptr(l, i,
159 struct btrfs_dev_extent);
160 printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
161 "\t\tchunk objectid %llu chunk offset %llu "
162 "length %llu\n",
163 (unsigned long long)
164 btrfs_dev_extent_chunk_tree(l, dev_extent),
165 (unsigned long long)
166 btrfs_dev_extent_chunk_objectid(l, dev_extent),
167 (unsigned long long)
168 btrfs_dev_extent_chunk_offset(l, dev_extent),
169 (unsigned long long)
170 btrfs_dev_extent_length(l, dev_extent));
171 }
172 }
173}
174
175void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
176{
177 int i; u32 nr;
178 struct btrfs_key key;
179 int level;
180
181 if (!c)
182 return;
183 nr = btrfs_header_nritems(c);
184 level = btrfs_header_level(c);
185 if (level == 0) {
186 btrfs_print_leaf(root, c);
187 return;
188 }
189 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
190 (unsigned long long)btrfs_header_bytenr(c),
191 btrfs_header_level(c), nr,
192 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
193 for (i = 0; i < nr; i++) {
194 btrfs_node_key_to_cpu(c, &key, i);
195 printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
196 i,
197 (unsigned long long)key.objectid,
198 key.type,
199 (unsigned long long)key.offset,
200 (unsigned long long)btrfs_node_blockptr(c, i));
201 }
202 for (i = 0; i < nr; i++) {
203 struct extent_buffer *next = read_tree_block(root,
204 btrfs_node_blockptr(c, i),
205 btrfs_level_size(root, level - 1),
206 btrfs_node_ptr_generation(c, i));
207 if (btrfs_is_leaf(next) &&
208 btrfs_header_level(c) != 1)
209 BUG();
210 if (btrfs_header_level(next) !=
211 btrfs_header_level(c) - 1)
212 BUG();
213 btrfs_print_tree(root, next);
214 free_extent_buffer(next);
215 }
216}
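/*
 * Debugging sketch: dumping an entire tree starts at its root buffer,
 * e.g. btrfs_print_tree(root, root->node), which prints one line per node
 * pointer and recurses until btrfs_print_leaf decodes the items.
 */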
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
21void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
22void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..6f0acc4c9eab
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
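/*
 * Lifetime sketch: a ref is created with one usage count and freed when
 * the last count drops, e.g.
 *
 *	ref = btrfs_alloc_leaf_ref(root, nr_extents);
 *	... fill ref->bytenr, ref->nritems and ref->extents[] ...
 *	btrfs_add_leaf_ref(root, ref, 0);   takes the tree's own reference
 *	btrfs_free_leaf_ref(root, ref);     drops the caller's reference
 */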
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
35 int nr_extents)
36{
37 struct btrfs_leaf_ref *ref;
38 size_t size = btrfs_leaf_ref_size(nr_extents);
39
40 ref = kmalloc(size, GFP_NOFS);
41 if (ref) {
42 spin_lock(&root->fs_info->ref_cache_lock);
43 root->fs_info->total_ref_cache_size += size;
44 spin_unlock(&root->fs_info->ref_cache_lock);
45
46 memset(ref, 0, sizeof(*ref));
47 atomic_set(&ref->usage, 1);
48 INIT_LIST_HEAD(&ref->list);
49 }
50 return ref;
51}
52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
58{
59 if (!ref)
60 return;
61 WARN_ON(atomic_read(&ref->usage) == 0);
62 if (atomic_dec_and_test(&ref->usage)) {
63 size_t size = btrfs_leaf_ref_size(ref->nritems);
64
65 BUG_ON(ref->in_tree);
66 kfree(ref);
67
68 spin_lock(&root->fs_info->ref_cache_lock);
69 root->fs_info->total_ref_cache_size -= size;
70 spin_unlock(&root->fs_info->ref_cache_lock);
71 }
72}
73
74static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
75 struct rb_node *node)
76{
77 struct rb_node **p = &root->rb_node;
78 struct rb_node *parent = NULL;
79 struct btrfs_leaf_ref *entry;
80
81 while (*p) {
82 parent = *p;
83 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
84
85 if (bytenr < entry->bytenr)
86 p = &(*p)->rb_left;
87 else if (bytenr > entry->bytenr)
88 p = &(*p)->rb_right;
89 else
90 return parent;
91 }
92
93 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
94 rb_link_node(node, parent, p);
95 rb_insert_color(node, root);
96 return NULL;
97}
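/*
 * tree_insert returns NULL on success and the colliding node when a ref
 * with the same bytenr already exists; btrfs_add_leaf_ref below turns
 * that collision into -EEXIST:
 *
 *	if (tree_insert(&tree->root, ref->bytenr, &ref->rb_node))
 *		ret = -EEXIST;
 */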
98
99static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
100{
101 struct rb_node *n = root->rb_node;
102 struct btrfs_leaf_ref *entry;
103
104 while (n) {
105 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
106 WARN_ON(!entry->in_tree);
107
108 if (bytenr < entry->bytenr)
109 n = n->rb_left;
110 else if (bytenr > entry->bytenr)
111 n = n->rb_right;
112 else
113 return n;
114 }
115 return NULL;
116}
117
118int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
119 int shared)
120{
121 struct btrfs_leaf_ref *ref = NULL;
122 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
123
124 if (shared)
125 tree = &root->fs_info->shared_ref_tree;
126 if (!tree)
127 return 0;
128
129 spin_lock(&tree->lock);
130 while (!list_empty(&tree->list)) {
131 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
132 BUG_ON(ref->tree != tree);
133 if (ref->root_gen > max_root_gen)
134 break;
135 if (!xchg(&ref->in_tree, 0)) {
136 cond_resched_lock(&tree->lock);
137 continue;
138 }
139
140 rb_erase(&ref->rb_node, &tree->root);
141 list_del_init(&ref->list);
142
143 spin_unlock(&tree->lock);
144 btrfs_free_leaf_ref(root, ref);
145 cond_resched();
146 spin_lock(&tree->lock);
147 }
148 spin_unlock(&tree->lock);
149 return 0;
150}
151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
157 u64 bytenr)
158{
159 struct rb_node *rb;
160 struct btrfs_leaf_ref *ref = NULL;
161 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
162again:
163 if (tree) {
164 spin_lock(&tree->lock);
165 rb = tree_search(&tree->root, bytenr);
166 if (rb)
167 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
168 if (ref)
169 atomic_inc(&ref->usage);
170 spin_unlock(&tree->lock);
171 if (ref)
172 return ref;
173 }
174 if (tree != &root->fs_info->shared_ref_tree) {
175 tree = &root->fs_info->shared_ref_tree;
176 goto again;
177 }
178 return NULL;
179}
180
181/*
182 * add a fully filled in leaf ref struct
183 * remove all the refs older than a given root generation
184 */
185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
186 int shared)
187{
188 int ret = 0;
189 struct rb_node *rb;
190 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
191
192 if (shared)
193 tree = &root->fs_info->shared_ref_tree;
194
195 spin_lock(&tree->lock);
196 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
197 if (rb) {
198 ret = -EEXIST;
199 } else {
200 atomic_inc(&ref->usage);
201 ref->tree = tree;
202 ref->in_tree = 1;
203 list_add_tail(&ref->list, &tree->list);
204 }
205 spin_unlock(&tree->lock);
206 return ret;
207}
208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
214{
215 struct btrfs_leaf_ref_tree *tree;
216
217 if (!xchg(&ref->in_tree, 0))
218 return 0;
219
220 tree = ref->tree;
221 spin_lock(&tree->lock);
222
223 rb_erase(&ref->rb_node, &tree->root);
224 list_del_init(&ref->list);
225
226 spin_unlock(&tree->lock);
227
228 btrfs_free_leaf_ref(root, ref);
229 return 0;
230}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
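/*
 * e.g. btrfs_leaf_ref_size(8) is the fixed header plus room for eight
 * btrfs_extent_info records in the trailing flexible array, which is what
 * btrfs_alloc_leaf_ref kmallocs in one shot.
 */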
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root.rb_node = NULL;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
66struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
67 int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..b48650de4472
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,366 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * search forward for a root, starting with objectid 'search_start'.
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise return
74 * 1; < 0 is returned on error.
75 */
76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
77 struct btrfs_root_item *item, struct btrfs_key *key)
78{
79 struct btrfs_path *path;
80 struct btrfs_key search_key;
81 struct btrfs_key found_key;
82 struct extent_buffer *l;
83 int ret;
84 int slot;
85
86 search_key.objectid = objectid;
87 search_key.type = BTRFS_ROOT_ITEM_KEY;
88 search_key.offset = (u64)-1;
89
90 path = btrfs_alloc_path();
91 BUG_ON(!path);
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0)
94 goto out;
95
96 BUG_ON(ret == 0);
97 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) {
102 ret = 1;
103 goto out;
104 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
106 sizeof(*item));
107 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0;
109out:
110 btrfs_free_path(path);
111 return ret;
112}
113
114/*
115 * copy the data in 'item' into the btree
116 */
117int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
118 *root, struct btrfs_key *key, struct btrfs_root_item
119 *item)
120{
121 struct btrfs_path *path;
122 struct extent_buffer *l;
123 int ret;
124 int slot;
125 unsigned long ptr;
126
127 path = btrfs_alloc_path();
128 BUG_ON(!path);
129 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
130 if (ret < 0)
131 goto out;
132
133 if (ret != 0) {
134 btrfs_print_leaf(root, path->nodes[0]);
135 printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
136 (unsigned long long)key->objectid, key->type,
137 (unsigned long long)key->offset);
138 BUG_ON(1);
139 }
140
141 l = path->nodes[0];
142 slot = path->slots[0];
143 ptr = btrfs_item_ptr_offset(l, slot);
144 write_extent_buffer(l, item, ptr, sizeof(*item));
145 btrfs_mark_buffer_dirty(path->nodes[0]);
146out:
147 btrfs_release_path(root, path);
148 btrfs_free_path(path);
149 return ret;
150}
151
152int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
153 *root, struct btrfs_key *key, struct btrfs_root_item
154 *item)
155{
156 int ret;
157 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
158 return ret;
159}
160
161/*
162 * at mount time we want to find all the old transaction snapshots that were in
163 * the process of being deleted if we crashed. This is any root item with an
164 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed.
166 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
168 struct btrfs_root *latest)
169{
170 struct btrfs_root *dead_root;
171 struct btrfs_item *item;
172 struct btrfs_root_item *ri;
173 struct btrfs_key key;
174 struct btrfs_key found_key;
175 struct btrfs_path *path;
176 int ret;
177 u32 nritems;
178 struct extent_buffer *leaf;
179 int slot;
180
181 key.objectid = objectid;
182 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
183 key.offset = 0;
184 path = btrfs_alloc_path();
185 if (!path)
186 return -ENOMEM;
187
188again:
189 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
190 if (ret < 0)
191 goto err;
192 while (1) {
193 leaf = path->nodes[0];
194 nritems = btrfs_header_nritems(leaf);
195 slot = path->slots[0];
196 if (slot >= nritems) {
197 ret = btrfs_next_leaf(root, path);
198 if (ret)
199 break;
200 leaf = path->nodes[0];
201 nritems = btrfs_header_nritems(leaf);
202 slot = path->slots[0];
203 }
204 item = btrfs_item_nr(leaf, slot);
205 btrfs_item_key_to_cpu(leaf, &key, slot);
206 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
207 goto next;
208
209 if (key.objectid < objectid)
210 goto next;
211
212 if (key.objectid > objectid)
213 break;
214
215 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
216 if (btrfs_disk_root_refs(leaf, ri) != 0)
217 goto next;
218
219 memcpy(&found_key, &key, sizeof(key));
220 key.offset++;
221 btrfs_release_path(root, path);
222 dead_root =
223 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
224 &found_key);
225 if (IS_ERR(dead_root)) {
226 ret = PTR_ERR(dead_root);
227 goto err;
228 }
229
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID)
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret)
235 goto err;
236 goto again;
237next:
238 slot++;
239 path->slots[0]++;
240 }
241 ret = 0;
242err:
243 btrfs_free_path(path);
244 return ret;
245}
246
247/* drop the root item for 'key' from 'root' */
248int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
249 struct btrfs_key *key)
250{
251 struct btrfs_path *path;
252 int ret;
253 u32 refs;
254 struct btrfs_root_item *ri;
255 struct extent_buffer *leaf;
256
257 path = btrfs_alloc_path();
258 BUG_ON(!path);
259 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
260 if (ret < 0)
261 goto out;
262
263 BUG_ON(ret != 0);
264 leaf = path->nodes[0];
265 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
266
267 refs = btrfs_disk_root_refs(leaf, ri);
268 BUG_ON(refs != 0);
269 ret = btrfs_del_item(trans, root, path);
270out:
271 btrfs_release_path(root, path);
272 btrfs_free_path(path);
273 return ret;
274}
275
276#if 0 /* this will get used when snapshot deletion is implemented */
277int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
278 struct btrfs_root *tree_root,
279 u64 root_id, u8 type, u64 ref_id)
280{
281 struct btrfs_key key;
282 int ret;
283 struct btrfs_path *path;
284
285 path = btrfs_alloc_path();
286
287 key.objectid = root_id;
288 key.type = type;
289 key.offset = ref_id;
290
291 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
292 BUG_ON(ret);
293
294 ret = btrfs_del_item(trans, tree_root, path);
295 BUG_ON(ret);
296
297 btrfs_free_path(path);
298 return ret;
299}
300#endif
301
302int btrfs_find_root_ref(struct btrfs_root *tree_root,
303 struct btrfs_path *path,
304 u64 root_id, u64 ref_id)
305{
306 struct btrfs_key key;
307 int ret;
308
309 key.objectid = root_id;
310 key.type = BTRFS_ROOT_REF_KEY;
311 key.offset = ref_id;
312
313 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
314 return ret;
315}
316
317
318/*
319 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
320 * or BTRFS_ROOT_BACKREF_KEY.
321 *
322 * The dirid, sequence, name and name_len refer to the directory entry
323 * that is referencing the root.
324 *
325 * For a forward ref, the root_id is the id of the tree referencing
326 * the root and ref_id is the id of the subvol or snapshot.
327 *
328 * For a back ref the root_id is the id of the subvol or snapshot and
329 * ref_id is the id of the tree referencing it.
330 */
331int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
332 struct btrfs_root *tree_root,
333 u64 root_id, u8 type, u64 ref_id,
334 u64 dirid, u64 sequence,
335 const char *name, int name_len)
336{
337 struct btrfs_key key;
338 int ret;
339 struct btrfs_path *path;
340 struct btrfs_root_ref *ref;
341 struct extent_buffer *leaf;
342 unsigned long ptr;
343
344
345 path = btrfs_alloc_path();
346
347 key.objectid = root_id;
348 key.type = type;
349 key.offset = ref_id;
350
351 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
352 sizeof(*ref) + name_len);
353 BUG_ON(ret);
354
355 leaf = path->nodes[0];
356 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
357 btrfs_set_root_ref_dirid(leaf, ref, dirid);
358 btrfs_set_root_ref_sequence(leaf, ref, sequence);
359 btrfs_set_root_ref_name_len(leaf, ref, name_len);
360 ptr = (unsigned long)(ref + 1);
361 write_extent_buffer(leaf, name, ptr, name_len);
362 btrfs_mark_buffer_dirty(leaf);
363
364 btrfs_free_path(path);
365 return ret;
366}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..c0f7ecaf1e79
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/highmem.h>
20
21/* this is some deeply nasty code. ctree.h has a different
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
23 *
24 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions
26 *
27 * This file declares the macros and then #includes ctree.h, which results
28 * in cpp creating the function here based on the template below.
29 *
30 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent
32 * buffers. Every pointer to metadata items in btrfs is really just
33 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking.
35 *
36 * The extent buffer api is used to do all the kmapping and page
37 * spanning work required to get extent buffers in highmem and have
38 * a metadata blocksize different from the page size.
39 *
40 * The macro starts with a simple function prototype declaration so that
41 * sparse won't complain about it being static.
42 */
43
44#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
45u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
46void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
47u##bits btrfs_##name(struct extent_buffer *eb, \
48 type *s) \
49{ \
50 unsigned long part_offset = (unsigned long)s; \
51 unsigned long offset = part_offset + offsetof(type, member); \
52 type *p; \
53 /* ugly, but we want the fast path here */ \
54 if (eb->map_token && offset >= eb->map_start && \
55 offset + sizeof(((type *)0)->member) <= eb->map_start + \
56 eb->map_len) { \
57 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
58 return le##bits##_to_cpu(p->member); \
59 } \
60 { \
61 int err; \
62 char *map_token; \
63 char *kaddr; \
64 int unmap_on_exit = (eb->map_token == NULL); \
65 unsigned long map_start; \
66 unsigned long map_len; \
67 u##bits res; \
68 err = map_extent_buffer(eb, offset, \
69 sizeof(((type *)0)->member), \
70 &map_token, &kaddr, \
71 &map_start, &map_len, KM_USER1); \
72 if (err) { \
73 __le##bits leres; \
74 read_eb_member(eb, s, type, member, &leres); \
75 return le##bits##_to_cpu(leres); \
76 } \
77 p = (type *)(kaddr + part_offset - map_start); \
78 res = le##bits##_to_cpu(p->member); \
79 if (unmap_on_exit) \
80 unmap_extent_buffer(eb, map_token, KM_USER1); \
81 return res; \
82 } \
83} \
84void btrfs_set_##name(struct extent_buffer *eb, \
85 type *s, u##bits val) \
86{ \
87 unsigned long part_offset = (unsigned long)s; \
88 unsigned long offset = part_offset + offsetof(type, member); \
89 type *p; \
90 /* ugly, but we want the fast path here */ \
91 if (eb->map_token && offset >= eb->map_start && \
92 offset + sizeof(((type *)0)->member) <= eb->map_start + \
93 eb->map_len) { \
94 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
95 p->member = cpu_to_le##bits(val); \
96 return; \
97 } \
98 { \
99 int err; \
100 char *map_token; \
101 char *kaddr; \
102 int unmap_on_exit = (eb->map_token == NULL); \
103 unsigned long map_start; \
104 unsigned long map_len; \
105 err = map_extent_buffer(eb, offset, \
106 sizeof(((type *)0)->member), \
107 &map_token, &kaddr, \
108 &map_start, &map_len, KM_USER1); \
109 if (err) { \
110 __le##bits val2; \
111 val2 = cpu_to_le##bits(val); \
112 write_eb_member(eb, s, type, member, &val2); \
113 return; \
114 } \
115 p = (type *)(kaddr + part_offset - map_start); \
116 p->member = cpu_to_le##bits(val); \
117 if (unmap_on_exit) \
118 unmap_extent_buffer(eb, map_token, KM_USER1); \
119 } \
120}
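/*
 * For example, ctree.h declares
 *
 *	BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 *
 * which the include below expands into btrfs_device_id() and
 * btrfs_set_device_id(), the le64 accessors print-tree.c uses.
 */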
121
122#include "ctree.h"
123
124void btrfs_node_key(struct extent_buffer *eb,
125 struct btrfs_disk_key *disk_key, int nr)
126{
127 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
128 if (eb->map_token && ptr >= eb->map_start &&
129 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
130 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
131 sizeof(*disk_key));
132 return;
133 } else if (eb->map_token) {
134 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
135 eb->map_token = NULL;
136 }
137 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
138 struct btrfs_key_ptr, key, disk_key);
139}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..0a14b495532f
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,722 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "xattr.h"
49#include "volumes.h"
50#include "version.h"
51#include "export.h"
52#include "compression.h"
53
54#define BTRFS_SUPER_MAGIC 0x9123683E
55
56static struct super_operations btrfs_super_ops;
57
58static void btrfs_put_super(struct super_block *sb)
59{
60 struct btrfs_root *root = btrfs_sb(sb);
61 int ret;
62
63 ret = close_ctree(root);
64 sb->s_fs_info = NULL;
65}
66
67enum {
68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
71};
72
73static match_table_t tokens = {
74 {Opt_degraded, "degraded"},
75 {Opt_subvol, "subvol=%s"},
76 {Opt_device, "device=%s"},
77 {Opt_nodatasum, "nodatasum"},
78 {Opt_nodatacow, "nodatacow"},
79 {Opt_nobarrier, "nobarrier"},
80 {Opt_max_extent, "max_extent=%s"},
81 {Opt_max_inline, "max_inline=%s"},
82 {Opt_alloc_start, "alloc_start=%s"},
83 {Opt_thread_pool, "thread_pool=%d"},
84 {Opt_compress, "compress"},
85 {Opt_ssd, "ssd"},
86 {Opt_noacl, "noacl"},
87 {Opt_err, NULL},
88};
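/*
 * e.g. "mount -t btrfs -o degraded,compress,max_inline=8k /dev/sda1 /mnt"
 * walks this table once per comma-separated token below.
 */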
89
90u64 btrfs_parse_size(char *str)
91{
92 u64 res;
93 int mult = 1;
94 char *end;
95 char last;
96
97 res = simple_strtoull(str, &end, 10);
98
99 last = end[0];
100 if (isalpha(last)) {
101 last = tolower(last);
102 switch (last) {
103 case 'g':
104 mult *= 1024;
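 /* fall through */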
105 case 'm':
106 mult *= 1024;
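 /* fall through */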
107 case 'k':
108 mult *= 1024;
109 }
110 res = res * mult;
111 }
112 return res;
113}
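/*
 * The switch above relies on fallthrough, so each suffix multiplies by
 * 1024 once per level: "64k" parses to 64 * 1024, "64m" to 64 * 1024 * 1024
 * and "2g" to 2 * 1024 * 1024 * 1024. A bare number is taken as bytes.
 */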
114
115/*
116 * Regular mount options parser. Everything that is needed only when
117 * reading in a new superblock is parsed here.
118 */
119int btrfs_parse_options(struct btrfs_root *root, char *options)
120{
121 struct btrfs_fs_info *info = root->fs_info;
122 substring_t args[MAX_OPT_ARGS];
123 char *p, *num;
124 int intarg;
125
126 if (!options)
127 return 0;
128
129 /*
130 * strsep changes the string, duplicate it because parse_options
131 * gets called twice
132 */
133 options = kstrdup(options, GFP_NOFS);
134 if (!options)
135 return -ENOMEM;
136
137
138 while ((p = strsep(&options, ",")) != NULL) {
139 int token;
140 if (!*p)
141 continue;
142
143 token = match_token(p, tokens, args);
144 switch (token) {
145 case Opt_degraded:
146 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
147 btrfs_set_opt(info->mount_opt, DEGRADED);
148 break;
149 case Opt_subvol:
150 case Opt_device:
151 /*
152 * These are parsed by btrfs_parse_early_options
153 * and can be happily ignored here.
154 */
155 break;
156 case Opt_nodatasum:
157 printk(KERN_INFO "btrfs: setting nodatasum\n");
158 btrfs_set_opt(info->mount_opt, NODATASUM);
159 break;
160 case Opt_nodatacow:
161 printk(KERN_INFO "btrfs: setting nodatacow\n");
162 btrfs_set_opt(info->mount_opt, NODATACOW);
163 btrfs_set_opt(info->mount_opt, NODATASUM);
164 break;
165 case Opt_compress:
166 printk(KERN_INFO "btrfs: use compression\n");
167 btrfs_set_opt(info->mount_opt, COMPRESS);
168 break;
169 case Opt_ssd:
170 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
171 btrfs_set_opt(info->mount_opt, SSD);
172 break;
173 case Opt_nobarrier:
174 printk(KERN_INFO "btrfs: turning off barriers\n");
175 btrfs_set_opt(info->mount_opt, NOBARRIER);
176 break;
177 case Opt_thread_pool:
178 intarg = 0;
179 match_int(&args[0], &intarg);
180 if (intarg) {
181 info->thread_pool_size = intarg;
182 printk(KERN_INFO "btrfs: thread pool %d\n",
183 info->thread_pool_size);
184 }
185 break;
186 case Opt_max_extent:
187 num = match_strdup(&args[0]);
188 if (num) {
189 info->max_extent = btrfs_parse_size(num);
190 kfree(num);
191
192 info->max_extent = max_t(u64,
193 info->max_extent, root->sectorsize);
194 printk(KERN_INFO "btrfs: max_extent at %llu\n",
195 info->max_extent);
196 }
197 break;
198 case Opt_max_inline:
199 num = match_strdup(&args[0]);
200 if (num) {
201 info->max_inline = btrfs_parse_size(num);
202 kfree(num);
203
204 if (info->max_inline) {
205 info->max_inline = max_t(u64,
206 info->max_inline,
207 root->sectorsize);
208 }
209 printk(KERN_INFO "btrfs: max_inline at %llu\n",
210 info->max_inline);
211 }
212 break;
213 case Opt_alloc_start:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->alloc_start = btrfs_parse_size(num);
217 kfree(num);
218 printk(KERN_INFO
219 "btrfs: allocations start at %llu\n",
220 info->alloc_start);
221 }
222 break;
223 case Opt_noacl:
224 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
225 break;
226 default:
227 break;
228 }
229 }
230 kfree(options);
231 return 0;
232}
233
234/*
235 * Parse mount options that are required early in the mount process.
236 *
237 * All other options will be parsed much later in the mount process and
238 * only when we need to allocate a new super block.
239 */
240static int btrfs_parse_early_options(const char *options, fmode_t flags,
241 void *holder, char **subvol_name,
242 struct btrfs_fs_devices **fs_devices)
243{
244 substring_t args[MAX_OPT_ARGS];
245 char *opts, *p;
246 int error = 0;
247
248 if (!options)
249 goto out;
250
251 /*
252 * strsep changes the string, duplicate it because parse_options
253 * gets called twice
254 */
255 opts = kstrdup(options, GFP_KERNEL);
256 if (!opts)
257 return -ENOMEM;
258
259 while ((p = strsep(&opts, ",")) != NULL) {
260 int token;
261 if (!*p)
262 continue;
263
264 token = match_token(p, tokens, args);
265 switch (token) {
266 case Opt_subvol:
267 *subvol_name = match_strdup(&args[0]);
268 break;
269 case Opt_device:
270 error = btrfs_scan_one_device(match_strdup(&args[0]),
271 flags, holder, fs_devices);
272 if (error)
273 goto out_free_opts;
274 break;
275 default:
276 break;
277 }
278 }
279
280 out_free_opts:
281 kfree(opts);
282 out:
283 /*
284 * If no subvolume name is specified we use the default one. Allocate
285 * a copy of the string "." here so that code later in the
286 * mount path doesn't care if it's the default volume or another one.
287 */
288 if (!*subvol_name) {
289 *subvol_name = kstrdup(".", GFP_KERNEL);
290 if (!*subvol_name)
291 return -ENOMEM;
292 }
293 return error;
294}
295
296static int btrfs_fill_super(struct super_block *sb,
297 struct btrfs_fs_devices *fs_devices,
298 void *data, int silent)
299{
300 struct inode *inode;
301 struct dentry *root_dentry;
302 struct btrfs_super_block *disk_super;
303 struct btrfs_root *tree_root;
304 struct btrfs_inode *bi;
305 int err;
306
307 sb->s_maxbytes = MAX_LFS_FILESIZE;
308 sb->s_magic = BTRFS_SUPER_MAGIC;
309 sb->s_op = &btrfs_super_ops;
310 sb->s_export_op = &btrfs_export_ops;
311 sb->s_xattr = btrfs_xattr_handlers;
312 sb->s_time_gran = 1;
313 sb->s_flags |= MS_POSIXACL;
314
315 tree_root = open_ctree(sb, fs_devices, (char *)data);
316
317 if (IS_ERR(tree_root)) {
 318		printk(KERN_ERR "btrfs: open_ctree failed\n");
319 return PTR_ERR(tree_root);
320 }
321 sb->s_fs_info = tree_root;
322 disk_super = &tree_root->fs_info->super_copy;
 323	inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
 324				  tree_root->fs_info->fs_root);
 325	if (!inode) {
 326		err = -ENOMEM;
 327		goto fail_close;
 328	}
 329	bi = BTRFS_I(inode);
 330	bi->location.objectid = inode->i_ino;
 331	bi->location.offset = 0;
 332	bi->root = tree_root->fs_info->fs_root;
 333
 334	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 335
336 if (inode->i_state & I_NEW) {
337 btrfs_read_locked_inode(inode);
338 unlock_new_inode(inode);
339 }
340
341 root_dentry = d_alloc_root(inode);
342 if (!root_dentry) {
343 iput(inode);
344 err = -ENOMEM;
345 goto fail_close;
346 }
347#if 0
348 /* this does the super kobj at the same time */
349 err = btrfs_sysfs_add_super(tree_root->fs_info);
350 if (err)
351 goto fail_close;
352#endif
353
354 sb->s_root = root_dentry;
355
356 save_mount_options(sb, data);
357 return 0;
358
359fail_close:
360 close_ctree(tree_root);
361 return err;
362}
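
/*
 * Putting the two parsing passes together, a multi-device mount looks
 * roughly like this (device names are only examples):
 *
 *	# mount -t btrfs -o device=/dev/sdc,subvol=snap1 /dev/sdb /mnt
 *
 * btrfs_parse_early_options() handles device= and subvol= before any
 * super block exists; the remaining options are handled by the full
 * parser above, called from inside open_ctree().
 */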
363
364int btrfs_sync_fs(struct super_block *sb, int wait)
365{
366 struct btrfs_trans_handle *trans;
367 struct btrfs_root *root;
368 int ret;
369 root = btrfs_sb(sb);
370
371 if (sb->s_flags & MS_RDONLY)
372 return 0;
373
374 sb->s_dirt = 0;
375 if (!wait) {
376 filemap_flush(root->fs_info->btree_inode->i_mapping);
377 return 0;
378 }
379
380 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0);
382
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0;
387 return ret;
388}
389
390static void btrfs_write_super(struct super_block *sb)
391{
392 sb->s_dirt = 0;
393}
394
395static int btrfs_test_super(struct super_block *s, void *data)
396{
397 struct btrfs_fs_devices *test_fs_devices = data;
398 struct btrfs_root *root = btrfs_sb(s);
399
400 return root->fs_info->fs_devices == test_fs_devices;
401}
402
403/*
404 * Find a superblock for the given device / mount point.
405 *
406 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
407 * for multiple device setup. Make sure to keep it in sync.
408 */
409static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
410 const char *dev_name, void *data, struct vfsmount *mnt)
411{
412 char *subvol_name = NULL;
413 struct block_device *bdev = NULL;
414 struct super_block *s;
415 struct dentry *root;
416 struct btrfs_fs_devices *fs_devices = NULL;
417 fmode_t mode = FMODE_READ;
418 int error = 0;
419
420 if (!(flags & MS_RDONLY))
421 mode |= FMODE_WRITE;
422
423 error = btrfs_parse_early_options(data, mode, fs_type,
424 &subvol_name, &fs_devices);
425 if (error)
426 return error;
427
428 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
429 if (error)
430 goto error_free_subvol_name;
431
432 error = btrfs_open_devices(fs_devices, mode, fs_type);
433 if (error)
434 goto error_free_subvol_name;
435
436 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
437 error = -EACCES;
438 goto error_close_devices;
439 }
440
441 bdev = fs_devices->latest_bdev;
442 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
443 if (IS_ERR(s))
444 goto error_s;
445
446 if (s->s_root) {
447 if ((flags ^ s->s_flags) & MS_RDONLY) {
448 up_write(&s->s_umount);
449 deactivate_super(s);
450 error = -EBUSY;
451 goto error_close_devices;
452 }
453
454 btrfs_close_devices(fs_devices);
455 } else {
456 char b[BDEVNAME_SIZE];
457
458 s->s_flags = flags;
459 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
460 error = btrfs_fill_super(s, fs_devices, data,
461 flags & MS_SILENT ? 1 : 0);
462 if (error) {
463 up_write(&s->s_umount);
464 deactivate_super(s);
465 goto error_free_subvol_name;
466 }
467
468 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
469 s->s_flags |= MS_ACTIVE;
470 }
471
472 if (!strcmp(subvol_name, "."))
473 root = dget(s->s_root);
474 else {
475 mutex_lock(&s->s_root->d_inode->i_mutex);
476 root = lookup_one_len(subvol_name, s->s_root,
477 strlen(subvol_name));
478 mutex_unlock(&s->s_root->d_inode->i_mutex);
479
480 if (IS_ERR(root)) {
481 up_write(&s->s_umount);
482 deactivate_super(s);
483 error = PTR_ERR(root);
484 goto error_free_subvol_name;
485 }
486 if (!root->d_inode) {
487 dput(root);
488 up_write(&s->s_umount);
489 deactivate_super(s);
490 error = -ENXIO;
491 goto error_free_subvol_name;
492 }
493 }
494
495 mnt->mnt_sb = s;
496 mnt->mnt_root = root;
497
498 kfree(subvol_name);
499 return 0;
500
501error_s:
502 error = PTR_ERR(s);
503error_close_devices:
504 btrfs_close_devices(fs_devices);
505error_free_subvol_name:
506 kfree(subvol_name);
507 return error;
508}
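
/*
 * Note that btrfs_test_super() compares fs_devices pointers, so
 * mounting a second time while the filesystem is live finds the
 * existing super block in sget() above; in that case the freshly
 * opened devices are closed again and only the subvolume dentry
 * lookup differs between the two mounts.
 */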
509
510static int btrfs_remount(struct super_block *sb, int *flags, char *data)
511{
512 struct btrfs_root *root = btrfs_sb(sb);
513 int ret;
514
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0;
517
518 if (*flags & MS_RDONLY) {
519 sb->s_flags |= MS_RDONLY;
520
521 ret = btrfs_commit_super(root);
522 WARN_ON(ret);
523 } else {
524 if (root->fs_info->fs_devices->rw_devices == 0)
525 return -EACCES;
526
527 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
528 return -EINVAL;
529
530 ret = btrfs_cleanup_reloc_trees(root);
531 WARN_ON(ret);
532
533 ret = btrfs_cleanup_fs_roots(root->fs_info);
534 WARN_ON(ret);
535
536 sb->s_flags &= ~MS_RDONLY;
537 }
538
539 return 0;
540}
541
542static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
543{
544 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
545 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
546 int bits = dentry->d_sb->s_blocksize_bits;
547 __be32 *fsid = (__be32 *)root->fs_info->fsid;
548
549 buf->f_namelen = BTRFS_NAME_LEN;
550 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
551 buf->f_bfree = buf->f_blocks -
552 (btrfs_super_bytes_used(disk_super) >> bits);
553 buf->f_bavail = buf->f_bfree;
554 buf->f_bsize = dentry->d_sb->s_blocksize;
555 buf->f_type = BTRFS_SUPER_MAGIC;
556
557 /* We treat it as constant endianness (it doesn't matter _which_)
558 because we want the fsid to come out the same whether mounted
559 on a big-endian or little-endian host */
560 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
561 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
562 /* Mask in the root object ID too, to disambiguate subvols */
563 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
564 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
565
566 return 0;
567}
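
/*
 * A worked example of the fsid fold above: for a (hypothetical) fsid
 * of sixteen 0x11 bytes and a subvolume root objectid of 5,
 *
 *	val[0] = 0x11111111 ^ 0x11111111 ^ (5 >> 32) = 0
 *	val[1] = 0x11111111 ^ 0x11111111 ^ 5         = 5
 *
 * so two subvolumes of the same filesystem report f_fsid values that
 * differ only in the objectid bits, independent of host endianness.
 */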
568
569static struct file_system_type btrfs_fs_type = {
570 .owner = THIS_MODULE,
571 .name = "btrfs",
572 .get_sb = btrfs_get_sb,
573 .kill_sb = kill_anon_super,
574 .fs_flags = FS_REQUIRES_DEV,
575};
576
577/*
578 * used by btrfsctl to scan devices when no FS is mounted
579 */
580static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
581 unsigned long arg)
582{
583 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices;
585 int ret = 0;
586 int len;
587
588 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM;
590
 591	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
 592	if (!vol)
 593		return -ENOMEM;
 594	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT;
594 goto out;
595 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 switch (cmd) {
598 case BTRFS_IOC_SCAN_DEV:
599 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
600 &btrfs_fs_type, &fs_devices);
601 break;
602 }
603out:
604 kfree(vol);
605 return ret;
606}
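
/*
 * A minimal userspace sketch of driving this ioctl, roughly what the
 * btrfsctl tool does when registering a device (the device path is
 * only an example, error handling omitted):
 *
 *	int fd = open("/dev/btrfs-control", O_RDWR);
 *	struct btrfs_ioctl_vol_args args;
 *
 *	memset(&args, 0, sizeof(args));
 *	strncpy(args.name, "/dev/sdb", BTRFS_PATH_NAME_MAX - 1);
 *	ioctl(fd, BTRFS_IOC_SCAN_DEV, &args);
 *
 * the scan adds the device to the in-kernel fs_devices list so a
 * later mount can assemble the full filesystem.
 */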
607
608static int btrfs_freeze(struct super_block *sb)
609{
610 struct btrfs_root *root = btrfs_sb(sb);
611 mutex_lock(&root->fs_info->transaction_kthread_mutex);
612 mutex_lock(&root->fs_info->cleaner_mutex);
613 return 0;
614}
615
616static int btrfs_unfreeze(struct super_block *sb)
617{
618 struct btrfs_root *root = btrfs_sb(sb);
619 mutex_unlock(&root->fs_info->cleaner_mutex);
620 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
621 return 0;
622}
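
/*
 * btrfs_freeze()/btrfs_unfreeze() quiesce the filesystem simply by
 * holding the two mutexes that the transaction kthread and the
 * cleaner take before doing work; note the unlock order above is the
 * reverse of the lock order in btrfs_freeze().
 */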
623
624static struct super_operations btrfs_super_ops = {
625 .delete_inode = btrfs_delete_inode,
626 .put_super = btrfs_put_super,
627 .write_super = btrfs_write_super,
628 .sync_fs = btrfs_sync_fs,
629 .show_options = generic_show_options,
630 .write_inode = btrfs_write_inode,
631 .dirty_inode = btrfs_dirty_inode,
632 .alloc_inode = btrfs_alloc_inode,
633 .destroy_inode = btrfs_destroy_inode,
634 .statfs = btrfs_statfs,
635 .remount_fs = btrfs_remount,
636 .freeze_fs = btrfs_freeze,
637 .unfreeze_fs = btrfs_unfreeze,
638};
639
640static const struct file_operations btrfs_ctl_fops = {
641 .unlocked_ioctl = btrfs_control_ioctl,
642 .compat_ioctl = btrfs_control_ioctl,
643 .owner = THIS_MODULE,
644};
645
646static struct miscdevice btrfs_misc = {
647 .minor = MISC_DYNAMIC_MINOR,
648 .name = "btrfs-control",
649 .fops = &btrfs_ctl_fops
650};
651
652static int btrfs_interface_init(void)
653{
654 return misc_register(&btrfs_misc);
655}
656
657static void btrfs_interface_exit(void)
658{
659 if (misc_deregister(&btrfs_misc) < 0)
 660		printk(KERN_INFO "misc_deregister failed for control device\n");
661}
662
663static int __init init_btrfs_fs(void)
664{
665 int err;
666
667 err = btrfs_init_sysfs();
668 if (err)
669 return err;
670
671 err = btrfs_init_cachep();
672 if (err)
673 goto free_sysfs;
674
675 err = extent_io_init();
676 if (err)
677 goto free_cachep;
678
679 err = extent_map_init();
680 if (err)
681 goto free_extent_io;
682
683 err = btrfs_interface_init();
684 if (err)
685 goto free_extent_map;
686
687 err = register_filesystem(&btrfs_fs_type);
688 if (err)
689 goto unregister_ioctl;
690
691 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
692 return 0;
693
694unregister_ioctl:
695 btrfs_interface_exit();
696free_extent_map:
697 extent_map_exit();
698free_extent_io:
699 extent_io_exit();
700free_cachep:
701 btrfs_destroy_cachep();
702free_sysfs:
703 btrfs_exit_sysfs();
704 return err;
705}
706
707static void __exit exit_btrfs_fs(void)
708{
709 btrfs_destroy_cachep();
710 extent_map_exit();
711 extent_io_exit();
712 btrfs_interface_exit();
713 unregister_filesystem(&btrfs_fs_type);
714 btrfs_exit_sysfs();
715 btrfs_cleanup_fs_uuids();
716 btrfs_zlib_exit();
717}
718
719module_init(init_btrfs_fs)
720module_exit(exit_btrfs_fs)
721
722MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..a240b6fa81df
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
71 show, store)
72
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
91 show, store)
92
93SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
94SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
95SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
96
97static struct attribute *btrfs_super_attrs[] = {
98 &btrfs_super_attr_blocks_used.attr,
99 &btrfs_super_attr_total_blocks.attr,
100 &btrfs_super_attr_blocksize.attr,
101 NULL,
102};
103
104static ssize_t btrfs_super_attr_show(struct kobject *kobj,
105 struct attribute *attr, char *buf)
106{
107 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
108 super_kobj);
109 struct btrfs_super_attr *a = container_of(attr,
110 struct btrfs_super_attr,
111 attr);
112
113 return a->show ? a->show(fs, buf) : 0;
114}
115
116static ssize_t btrfs_super_attr_store(struct kobject *kobj,
117 struct attribute *attr,
118 const char *buf, size_t len)
119{
120 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
121 super_kobj);
122 struct btrfs_super_attr *a = container_of(attr,
123 struct btrfs_super_attr,
124 attr);
125
126 return a->store ? a->store(fs, buf, len) : 0;
127}
128
129static ssize_t btrfs_root_attr_show(struct kobject *kobj,
130 struct attribute *attr, char *buf)
131{
132 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
133 root_kobj);
134 struct btrfs_root_attr *a = container_of(attr,
135 struct btrfs_root_attr,
136 attr);
137
138 return a->show ? a->show(root, buf) : 0;
139}
140
141static ssize_t btrfs_root_attr_store(struct kobject *kobj,
142 struct attribute *attr,
143 const char *buf, size_t len)
144{
145 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
146 root_kobj);
147 struct btrfs_root_attr *a = container_of(attr,
148 struct btrfs_root_attr,
149 attr);
150 return a->store ? a->store(root, buf, len) : 0;
151}
152
153static void btrfs_super_release(struct kobject *kobj)
154{
155 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
156 super_kobj);
157 complete(&fs->kobj_unregister);
158}
159
160static void btrfs_root_release(struct kobject *kobj)
161{
162 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
163 root_kobj);
164 complete(&root->kobj_unregister);
165}
166
167static struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store,
170};
171
172static struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store,
175};
176
177static struct kobj_type btrfs_root_ktype = {
178 .default_attrs = btrfs_root_attrs,
179 .sysfs_ops = &btrfs_root_attr_ops,
180 .release = btrfs_root_release,
181};
182
183static struct kobj_type btrfs_super_ktype = {
184 .default_attrs = btrfs_super_attrs,
185 .sysfs_ops = &btrfs_super_attr_ops,
186 .release = btrfs_super_release,
187};
188
189/* /sys/fs/btrfs/ entry */
190static struct kset *btrfs_kset;
191
192int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
193{
194 int error;
195 char *name;
196 char c;
197 int len = strlen(fs->sb->s_id) + 1;
198 int i;
199
200 name = kmalloc(len, GFP_NOFS);
201 if (!name) {
202 error = -ENOMEM;
203 goto fail;
204 }
205
 206	for (i = 0; i < len - 1; i++) {
 207		c = fs->sb->s_id[i];
 208		if (c == '/' || c == '\\')
 209			c = '!';
 210		name[i] = c;
 211	}
 212	name[len - 1] = '\0';
213
214 fs->super_kobj.kset = btrfs_kset;
215 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
216 NULL, "%s", name);
217 kfree(name);
218 if (error)
219 goto fail;
220
221 return 0;
222
223fail:
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
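
/*
 * With the kset created in btrfs_init_sysfs() below, a successful
 * add_super for a filesystem whose s_id is "sdb" would surface as,
 * for example:
 *
 *	/sys/fs/btrfs/sdb/blocks_used
 *	/sys/fs/btrfs/sdb/total_blocks
 *	/sys/fs/btrfs/sdb/blocksize
 *
 * ('/' and '\\' are rewritten to '!' above so the name is a valid
 * kobject name).
 */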
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void)
258{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
260 if (!btrfs_kset)
261 return -ENOMEM;
262 return 0;
263}
264
265void btrfs_exit_sysfs(void)
266{
267 kset_unregister(btrfs_kset);
268}
269
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..8a08f9443340
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1097 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include <linux/blkdev.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h"
30
31#define BTRFS_ROOT_TRANS_TAG 0
32
33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{
35 WARN_ON(transaction->use_count == 0);
36 transaction->use_count--;
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 }
42}
43
44/*
45 * either allocate a new transaction or hop into the existing one
46 */
47static noinline int join_transaction(struct btrfs_root *root)
48{
49 struct btrfs_transaction *cur_trans;
50 cur_trans = root->fs_info->running_transaction;
51 if (!cur_trans) {
52 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
53 GFP_NOFS);
54 BUG_ON(!cur_trans);
55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation;
61 init_waitqueue_head(&cur_trans->writer_wait);
62 init_waitqueue_head(&cur_trans->commit_wait);
63 cur_trans->in_commit = 0;
64 cur_trans->blocked = 0;
65 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds();
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages,
71 root->fs_info->btree_inode->i_mapping,
72 GFP_NOFS);
73 spin_lock(&root->fs_info->new_trans_lock);
74 root->fs_info->running_transaction = cur_trans;
75 spin_unlock(&root->fs_info->new_trans_lock);
76 } else {
77 cur_trans->num_writers++;
78 cur_trans->num_joined++;
79 }
80
81 return 0;
82}
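
/*
 * both paths in join_transaction() run under fs_info->trans_mutex,
 * taken by the callers below; that is what makes the bare check of
 * fs_info->running_transaction safe here.
 */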
83
84/*
85 * this does all the record keeping required to make sure that a reference
86 * counted root is properly recorded in a given transaction. This is required
87 * to make sure the old root from before we joined the transaction is deleted
88 * when the transaction commits
89 */
90noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
91{
92 struct btrfs_dirty_root *dirty;
93 u64 running_trans_id = root->fs_info->running_transaction->transid;
94 if (root->ref_cows && root->last_trans < running_trans_id) {
95 WARN_ON(root == root->fs_info->extent_root);
96 if (root->root_item.refs != 0) {
97 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
98 (unsigned long)root->root_key.objectid,
99 BTRFS_ROOT_TRANS_TAG);
100
101 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
102 BUG_ON(!dirty);
103 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
104 BUG_ON(!dirty->root);
105 dirty->latest_root = root;
106 INIT_LIST_HEAD(&dirty->list);
107
108 root->commit_root = btrfs_root_node(root);
109
110 memcpy(dirty->root, root, sizeof(*root));
111 spin_lock_init(&dirty->root->node_lock);
112 spin_lock_init(&dirty->root->list_lock);
113 mutex_init(&dirty->root->objectid_mutex);
114 mutex_init(&dirty->root->log_mutex);
115 INIT_LIST_HEAD(&dirty->root->dead_list);
116 dirty->root->node = root->commit_root;
117 dirty->root->commit_root = NULL;
118
119 spin_lock(&root->list_lock);
120 list_add(&dirty->root->dead_list, &root->dead_list);
121 spin_unlock(&root->list_lock);
122
123 root->dirty_root = dirty;
124 } else {
125 WARN_ON(1);
126 }
127 root->last_trans = running_trans_id;
128 }
129 return 0;
130}
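
/*
 * note that dirty->root above is a byte copy of the live root taken
 * the first time it joins the transaction: it keeps pointing at the
 * old commit_root, so the pre-transaction version of the tree can
 * still be found and dropped when the transaction commits.
 */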
131
132/* wait for the commit against the current transaction to become unblocked;
133 * when this is done, it is safe to start a new transaction, but the current
134 * transaction might not be fully on disk.
135 */
136static void wait_current_trans(struct btrfs_root *root)
137{
138 struct btrfs_transaction *cur_trans;
139
140 cur_trans = root->fs_info->running_transaction;
141 if (cur_trans && cur_trans->blocked) {
142 DEFINE_WAIT(wait);
143 cur_trans->use_count++;
144 while (1) {
145 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
146 TASK_UNINTERRUPTIBLE);
147 if (cur_trans->blocked) {
148 mutex_unlock(&root->fs_info->trans_mutex);
149 schedule();
150 mutex_lock(&root->fs_info->trans_mutex);
151 finish_wait(&root->fs_info->transaction_wait,
152 &wait);
153 } else {
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 break;
157 }
158 }
159 put_transaction(cur_trans);
160 }
161}
162
163static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
164 int num_blocks, int wait)
165{
166 struct btrfs_trans_handle *h =
167 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
168 int ret;
169
170 mutex_lock(&root->fs_info->trans_mutex);
171 if (!root->fs_info->log_root_recovering &&
172 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
173 wait_current_trans(root);
174 ret = join_transaction(root);
175 BUG_ON(ret);
176
177 btrfs_record_root_in_trans(root);
178 h->transid = root->fs_info->running_transaction->transid;
179 h->transaction = root->fs_info->running_transaction;
180 h->blocks_reserved = num_blocks;
181 h->blocks_used = 0;
182 h->block_group = 0;
183 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0;
185 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex);
187 return h;
188}
189
190struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
191 int num_blocks)
192{
193 return start_transaction(root, num_blocks, 1);
194}
195struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
196 int num_blocks)
197{
198 return start_transaction(root, num_blocks, 0);
199}
200
201struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
202 int num_blocks)
203{
204 return start_transaction(r, num_blocks, 2);
205}
206
207/* wait for a transaction commit to be fully complete */
208static noinline int wait_for_commit(struct btrfs_root *root,
209 struct btrfs_transaction *commit)
210{
211 DEFINE_WAIT(wait);
212 mutex_lock(&root->fs_info->trans_mutex);
213 while (!commit->commit_done) {
214 prepare_to_wait(&commit->commit_wait, &wait,
215 TASK_UNINTERRUPTIBLE);
216 if (commit->commit_done)
217 break;
218 mutex_unlock(&root->fs_info->trans_mutex);
219 schedule();
220 mutex_lock(&root->fs_info->trans_mutex);
221 }
222 mutex_unlock(&root->fs_info->trans_mutex);
223 finish_wait(&commit->commit_wait, &wait);
224 return 0;
225}
226
227/*
228 * rate limit against the drop_snapshot code. This helps to slow down new
229 * operations if the drop_snapshot code isn't able to keep up.
230 */
231static void throttle_on_drops(struct btrfs_root *root)
232{
233 struct btrfs_fs_info *info = root->fs_info;
234 int harder_count = 0;
235
236harder:
237 if (atomic_read(&info->throttles)) {
238 DEFINE_WAIT(wait);
239 int thr;
240 thr = atomic_read(&info->throttle_gen);
241
242 do {
243 prepare_to_wait(&info->transaction_throttle,
244 &wait, TASK_UNINTERRUPTIBLE);
245 if (!atomic_read(&info->throttles)) {
246 finish_wait(&info->transaction_throttle, &wait);
247 break;
248 }
249 schedule();
250 finish_wait(&info->transaction_throttle, &wait);
251 } while (thr == atomic_read(&info->throttle_gen));
252 harder_count++;
253
254 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
255 harder_count < 2)
256 goto harder;
257
258 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
259 harder_count < 10)
260 goto harder;
261
262 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
263 harder_count < 20)
264 goto harder;
265 }
266}
267
268void btrfs_throttle(struct btrfs_root *root)
269{
270 mutex_lock(&root->fs_info->trans_mutex);
271 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root);
276}
277
278static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
279 struct btrfs_root *root, int throttle)
280{
281 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info;
283
284 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction;
286 WARN_ON(cur_trans != trans->transaction);
287 WARN_ON(cur_trans->num_writers < 1);
288 cur_trans->num_writers--;
289
290 if (waitqueue_active(&cur_trans->writer_wait))
291 wake_up(&cur_trans->writer_wait);
292 put_transaction(cur_trans);
293 mutex_unlock(&info->trans_mutex);
294 memset(trans, 0, sizeof(*trans));
295 kmem_cache_free(btrfs_trans_handle_cachep, trans);
296
297 if (throttle)
298 throttle_on_drops(root);
299
300 return 0;
301}
302
303int btrfs_end_transaction(struct btrfs_trans_handle *trans,
304 struct btrfs_root *root)
305{
306 return __btrfs_end_transaction(trans, root, 0);
307}
308
309int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
310 struct btrfs_root *root)
311{
312 return __btrfs_end_transaction(trans, root, 1);
313}
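
/*
 * The usual caller pattern for the handles above, sketched (the
 * num_blocks argument is the number of btree blocks the caller
 * expects to dirty):
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	... modify btree items ...
 *	btrfs_end_transaction(trans, root);
 *
 * btrfs_join_transaction() is the same except it never waits for a
 * blocked commit, so it is safe to use from paths the committer may
 * itself be waiting on.
 */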
314
315/*
316 * when btree blocks are allocated, they have some corresponding bits set for
317 * them in one of two extent_io trees. This is used to make sure all of
318 * those extents are on disk for transaction or log commit
319 */
320int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
321 struct extent_io_tree *dirty_pages)
322{
323 int ret;
324 int err = 0;
325 int werr = 0;
326 struct page *page;
327 struct inode *btree_inode = root->fs_info->btree_inode;
328 u64 start = 0;
329 u64 end;
330 unsigned long index;
331
332 while (1) {
333 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
334 EXTENT_DIRTY);
335 if (ret)
336 break;
337 while (start <= end) {
338 cond_resched();
339
340 index = start >> PAGE_CACHE_SHIFT;
341 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
342 page = find_get_page(btree_inode->i_mapping, index);
343 if (!page)
344 continue;
345
346 btree_lock_page_hook(page);
347 if (!page->mapping) {
348 unlock_page(page);
349 page_cache_release(page);
350 continue;
351 }
352
353 if (PageWriteback(page)) {
354 if (PageDirty(page))
355 wait_on_page_writeback(page);
356 else {
357 unlock_page(page);
358 page_cache_release(page);
359 continue;
360 }
361 }
362 err = write_one_page(page, 0);
363 if (err)
364 werr = err;
365 page_cache_release(page);
366 }
367 }
368 while (1) {
369 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
370 EXTENT_DIRTY);
371 if (ret)
372 break;
373
374 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
375 while (start <= end) {
376 index = start >> PAGE_CACHE_SHIFT;
377 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
378 page = find_get_page(btree_inode->i_mapping, index);
379 if (!page)
380 continue;
381 if (PageDirty(page)) {
382 btree_lock_page_hook(page);
383 wait_on_page_writeback(page);
384 err = write_one_page(page, 0);
385 if (err)
386 werr = err;
387 }
388 wait_on_page_writeback(page);
389 page_cache_release(page);
390 cond_resched();
391 }
392 }
393 if (err)
394 werr = err;
395 return werr;
396}
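
/*
 * The index arithmetic above steps one page at a time: with 4k pages
 * (PAGE_CACHE_SHIFT == 12) a dirty range of [0, 16383] visits page
 * indexes 0..3, and start is bumped to (index + 1) << 12 each pass so
 * the inner loop ends once start moves past end.
 */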
397
398int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
399 struct btrfs_root *root)
400{
401 if (!trans || !trans->transaction) {
402 struct inode *btree_inode;
403 btree_inode = root->fs_info->btree_inode;
404 return filemap_write_and_wait(btree_inode->i_mapping);
405 }
406 return btrfs_write_and_wait_marked_extents(root,
407 &trans->transaction->dirty_pages);
408}
409
410/*
411 * this is used to update the root pointer in the tree of tree roots.
412 *
413 * But, in the case of the extent allocation tree, updating the root
414 * pointer may allocate blocks which may change the root of the extent
415 * allocation tree.
416 *
 417 * So, this loops and makes sure the cowonly root didn't
418 * change while the root pointer was being updated in the metadata.
419 */
420static int update_cowonly_root(struct btrfs_trans_handle *trans,
421 struct btrfs_root *root)
422{
423 int ret;
424 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root;
426
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root);
430
431 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
433 if (old_root_bytenr == root->node->start)
434 break;
435 btrfs_set_root_bytenr(&root->root_item,
436 root->node->start);
437 btrfs_set_root_level(&root->root_item,
438 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid);
440
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key,
445 &root->root_item);
446 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root);
449 }
450 return 0;
451}
452
453/*
454 * update all the cowonly tree roots on disk
455 */
456int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
457 struct btrfs_root *root)
458{
459 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next;
461 struct extent_buffer *eb;
462
463 btrfs_extent_post_op(trans, fs_info->tree_root);
464
465 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
467 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb);
469
470 btrfs_extent_post_op(trans, fs_info->tree_root);
471
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next;
474 list_del_init(next);
475 root = list_entry(next, struct btrfs_root, dirty_list);
476
477 update_cowonly_root(trans, root);
478 }
479 return 0;
480}
481
482/*
483 * dead roots are old snapshots that need to be deleted. This allocates
484 * a dirty root struct and adds it into the list of dead roots that need to
485 * be deleted
486 */
487int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
488{
489 struct btrfs_dirty_root *dirty;
490
491 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
492 if (!dirty)
493 return -ENOMEM;
494 dirty->root = root;
495 dirty->latest_root = latest;
496
497 mutex_lock(&root->fs_info->trans_mutex);
498 list_add(&dirty->list, &latest->fs_info->dead_roots);
499 mutex_unlock(&root->fs_info->trans_mutex);
500 return 0;
501}
502
503/*
504 * at transaction commit time we need to schedule the old roots for
505 * deletion via btrfs_drop_snapshot. This runs through all the
506 * reference counted roots that were modified in the current
507 * transaction and puts them into the drop list
508 */
509static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
510 struct radix_tree_root *radix,
511 struct list_head *list)
512{
513 struct btrfs_dirty_root *dirty;
514 struct btrfs_root *gang[8];
515 struct btrfs_root *root;
516 int i;
517 int ret;
518 int err = 0;
519 u32 refs;
520
521 while (1) {
522 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
523 ARRAY_SIZE(gang),
524 BTRFS_ROOT_TRANS_TAG);
525 if (ret == 0)
526 break;
527 for (i = 0; i < ret; i++) {
528 root = gang[i];
529 radix_tree_tag_clear(radix,
530 (unsigned long)root->root_key.objectid,
531 BTRFS_ROOT_TRANS_TAG);
532
533 BUG_ON(!root->ref_tree);
534 dirty = root->dirty_root;
535
536 btrfs_free_log(trans, root);
537 btrfs_free_reloc_root(trans, root);
538
539 if (root->commit_root == root->node) {
540 WARN_ON(root->node->start !=
541 btrfs_root_bytenr(&root->root_item));
542
543 free_extent_buffer(root->commit_root);
544 root->commit_root = NULL;
545 root->dirty_root = NULL;
546
547 spin_lock(&root->list_lock);
548 list_del_init(&dirty->root->dead_list);
549 spin_unlock(&root->list_lock);
550
551 kfree(dirty->root);
552 kfree(dirty);
553
554 /* make sure to update the root on disk
555 * so we get any updates to the block used
556 * counts
557 */
558 err = btrfs_update_root(trans,
559 root->fs_info->tree_root,
560 &root->root_key,
561 &root->root_item);
562 continue;
563 }
564
565 memset(&root->root_item.drop_progress, 0,
566 sizeof(struct btrfs_disk_key));
567 root->root_item.drop_level = 0;
568 root->commit_root = NULL;
569 root->dirty_root = NULL;
570 root->root_key.offset = root->fs_info->generation;
571 btrfs_set_root_bytenr(&root->root_item,
572 root->node->start);
573 btrfs_set_root_level(&root->root_item,
574 btrfs_header_level(root->node));
575 btrfs_set_root_generation(&root->root_item,
576 root->root_key.offset);
577
578 err = btrfs_insert_root(trans, root->fs_info->tree_root,
579 &root->root_key,
580 &root->root_item);
581 if (err)
582 break;
583
584 refs = btrfs_root_refs(&dirty->root->root_item);
585 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
586 err = btrfs_update_root(trans, root->fs_info->tree_root,
587 &dirty->root->root_key,
588 &dirty->root->root_item);
589
590 BUG_ON(err);
591 if (refs == 1) {
592 list_add(&dirty->list, list);
593 } else {
594 WARN_ON(1);
595 free_extent_buffer(dirty->root->node);
596 kfree(dirty->root);
597 kfree(dirty);
598 }
599 }
600 }
601 return err;
602}
603
604/*
605 * defrag a given btree. If cacheonly == 1, this won't read from the disk,
606 * otherwise every leaf in the btree is read and defragged.
607 */
608int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
609{
610 struct btrfs_fs_info *info = root->fs_info;
611 int ret;
612 struct btrfs_trans_handle *trans;
613 unsigned long nr;
614
615 smp_mb();
616 if (root->defrag_running)
617 return 0;
618 trans = btrfs_start_transaction(root, 1);
619 while (1) {
620 root->defrag_running = 1;
621 ret = btrfs_defrag_leaves(trans, root, cacheonly);
622 nr = trans->blocks_used;
623 btrfs_end_transaction(trans, root);
624 btrfs_btree_balance_dirty(info->tree_root, nr);
625 cond_resched();
626
627 trans = btrfs_start_transaction(root, 1);
628 if (root->fs_info->closing || ret != -EAGAIN)
629 break;
630 }
631 root->defrag_running = 0;
632 smp_mb();
633 btrfs_end_transaction(trans, root);
634 return 0;
635}
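
/*
 * the -EAGAIN protocol above pairs with btrfs_defrag_leaves() in
 * tree-defrag.c: each pass records how far it got in
 * root->defrag_progress, ends its transaction to let other writers
 * in, and the next pass resumes from that key.
 */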
636
637/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them
640 */
641static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
642 struct list_head *list)
643{
644 struct btrfs_dirty_root *dirty;
645 struct btrfs_trans_handle *trans;
646 unsigned long nr;
647 u64 num_bytes;
648 u64 bytes_used;
649 u64 max_useless;
650 int ret = 0;
651 int err;
652
653 while (!list_empty(list)) {
654 struct btrfs_root *root;
655
656 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
657 list_del_init(&dirty->list);
658
659 num_bytes = btrfs_root_used(&dirty->root->root_item);
660 root = dirty->latest_root;
661 atomic_inc(&root->fs_info->throttles);
662
663 while (1) {
664 trans = btrfs_start_transaction(tree_root, 1);
665 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN)
668 break;
669 mutex_unlock(&root->fs_info->drop_mutex);
670
671 err = btrfs_update_root(trans,
672 tree_root,
673 &dirty->root->root_key,
674 &dirty->root->root_item);
675 if (err)
676 ret = err;
677 nr = trans->blocks_used;
678 ret = btrfs_end_transaction(trans, tree_root);
679 BUG_ON(ret);
680
681 btrfs_btree_balance_dirty(tree_root, nr);
682 cond_resched();
683 }
684 BUG_ON(ret);
685 atomic_dec(&root->fs_info->throttles);
686 wake_up(&root->fs_info->transaction_throttle);
687
688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) {
691 btrfs_record_root_in_trans(root);
692 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes);
694 }
695
696 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
697 if (ret) {
698 BUG();
699 break;
700 }
701 mutex_unlock(&root->fs_info->drop_mutex);
702
703 spin_lock(&root->list_lock);
704 list_del_init(&dirty->root->dead_list);
705 if (!list_empty(&root->dead_list)) {
706 struct btrfs_root *oldest;
707 oldest = list_entry(root->dead_list.prev,
708 struct btrfs_root, dead_list);
709 max_useless = oldest->root_key.offset - 1;
710 } else {
711 max_useless = root->root_key.offset - 1;
712 }
713 spin_unlock(&root->list_lock);
714
715 nr = trans->blocks_used;
716 ret = btrfs_end_transaction(trans, tree_root);
717 BUG_ON(ret);
718
719 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
720 BUG_ON(ret);
721
722 free_extent_buffer(dirty->root->node);
723 kfree(dirty->root);
724 kfree(dirty);
725
726 btrfs_btree_balance_dirty(tree_root, nr);
727 cond_resched();
728 }
729 return ret;
730}
731
732/*
733 * new snapshots need to be created at a very specific time in the
734 * transaction commit. This does the actual creation
735 */
736static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
737 struct btrfs_fs_info *fs_info,
738 struct btrfs_pending_snapshot *pending)
739{
740 struct btrfs_key key;
741 struct btrfs_root_item *new_root_item;
742 struct btrfs_root *tree_root = fs_info->tree_root;
743 struct btrfs_root *root = pending->root;
744 struct extent_buffer *tmp;
745 struct extent_buffer *old;
746 int ret;
747 u64 objectid;
748
749 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
750 if (!new_root_item) {
751 ret = -ENOMEM;
752 goto fail;
753 }
754 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
755 if (ret)
756 goto fail;
757
758 btrfs_record_root_in_trans(root);
759 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
760 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
761
762 key.objectid = objectid;
763 key.offset = trans->transid;
764 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
765
766 old = btrfs_lock_root_node(root);
767 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
768
769 btrfs_copy_root(trans, root, old, &tmp, objectid);
770 btrfs_tree_unlock(old);
771 free_extent_buffer(old);
772
773 btrfs_set_root_bytenr(new_root_item, tmp->start);
774 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
775 btrfs_set_root_generation(new_root_item, trans->transid);
776 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
777 new_root_item);
778 btrfs_tree_unlock(tmp);
779 free_extent_buffer(tmp);
780 if (ret)
781 goto fail;
782
783 key.offset = (u64)-1;
784 memcpy(&pending->root_key, &key, sizeof(key));
785fail:
786 kfree(new_root_item);
787 return ret;
788}
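
/*
 * the root item was inserted with key.offset == transid above; the
 * (u64)-1 offset saved in pending->root_key makes the later lookup
 * find the most recent version of the new root when the snapshot is
 * linked into the directory tree.
 */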
789
790static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
791 struct btrfs_pending_snapshot *pending)
792{
793 int ret;
794 int namelen;
795 u64 index = 0;
796 struct btrfs_trans_handle *trans;
797 struct inode *parent_inode;
798 struct inode *inode;
799 struct btrfs_root *parent_root;
800
801 parent_inode = pending->dentry->d_parent->d_inode;
802 parent_root = BTRFS_I(parent_inode)->root;
803 trans = btrfs_join_transaction(parent_root, 1);
804
805 /*
806 * insert the directory item
807 */
808 namelen = strlen(pending->name);
 809	ret = btrfs_set_inode_index(parent_inode, &index);
 810	if (ret)
 811		goto fail;
810 ret = btrfs_insert_dir_item(trans, parent_root,
811 pending->name, namelen,
812 parent_inode->i_ino,
813 &pending->root_key, BTRFS_FT_DIR, index);
814
815 if (ret)
816 goto fail;
817
818 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
819 ret = btrfs_update_inode(trans, parent_root, parent_inode);
820 BUG_ON(ret);
821
822 /* add the backref first */
823 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
824 pending->root_key.objectid,
825 BTRFS_ROOT_BACKREF_KEY,
826 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name,
828 namelen);
829
830 BUG_ON(ret);
831
832 /* now add the forward ref */
833 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
834 parent_root->root_key.objectid,
835 BTRFS_ROOT_REF_KEY,
836 pending->root_key.objectid,
837 parent_inode->i_ino, index, pending->name,
838 namelen);
839
840 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
841 d_instantiate(pending->dentry, inode);
842fail:
843 btrfs_end_transaction(trans, fs_info->fs_root);
844 return ret;
845}
846
847/*
848 * create all the snapshots we've scheduled for creation
849 */
850static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
851 struct btrfs_fs_info *fs_info)
852{
853 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret;
857
858 list_for_each(cur, head) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret);
862 }
863 return 0;
864}
865
866static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
867 struct btrfs_fs_info *fs_info)
868{
869 struct btrfs_pending_snapshot *pending;
870 struct list_head *head = &trans->transaction->pending_snapshots;
871 int ret;
872
873 while (!list_empty(head)) {
874 pending = list_entry(head->next,
875 struct btrfs_pending_snapshot, list);
876 ret = finish_pending_snapshot(fs_info, pending);
877 BUG_ON(ret);
878 list_del(&pending->list);
879 kfree(pending->name);
880 kfree(pending);
881 }
882 return 0;
883}
884
885int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
886 struct btrfs_root *root)
887{
888 unsigned long joined = 0;
889 unsigned long timeout = 1;
890 struct btrfs_transaction *cur_trans;
891 struct btrfs_transaction *prev_trans = NULL;
892 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
893 struct list_head dirty_fs_roots;
894 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait);
896 int ret;
897
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) {
901 cur_trans = trans->transaction;
902 trans->transaction->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root);
905
906 ret = wait_for_commit(root, cur_trans);
907 BUG_ON(ret);
908
909 mutex_lock(&root->fs_info->trans_mutex);
910 put_transaction(cur_trans);
911 mutex_unlock(&root->fs_info->trans_mutex);
912
913 return 0;
914 }
915
916 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
917 if (!pinned_copy)
918 return -ENOMEM;
919
920 extent_io_tree_init(pinned_copy,
921 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
922
923 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list);
929 if (!prev_trans->commit_done) {
930 prev_trans->use_count++;
931 mutex_unlock(&root->fs_info->trans_mutex);
932
933 wait_for_commit(root, prev_trans);
934
935 mutex_lock(&root->fs_info->trans_mutex);
936 put_transaction(prev_trans);
937 }
938 }
939
940 do {
941 int snap_pending = 0;
942 joined = cur_trans->num_joined;
943 if (!list_empty(&trans->transaction->pending_snapshots))
944 snap_pending = 1;
945
946 WARN_ON(cur_trans != trans->transaction);
947 prepare_to_wait(&cur_trans->writer_wait, &wait,
948 TASK_UNINTERRUPTIBLE);
949
950 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT;
952 else
953 timeout = 1;
954
955 mutex_unlock(&root->fs_info->trans_mutex);
956
957 if (snap_pending) {
958 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret);
960 }
961
962 schedule_timeout(timeout);
963
964 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined));
968
969 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret);
971
972 WARN_ON(cur_trans != trans->transaction);
973
974 /* btrfs_commit_tree_roots is responsible for getting the
975 * various roots consistent with each other. Every pointer
 976	 * in the tree of tree roots has to point to the most up-to-date
977 * root for every subvolume and other tree. So, we have to keep
978 * the tree logging code from jumping in and changing any
979 * of the trees.
980 *
981 * At this point in the commit, there can't be any tree-log
982 * writers, but a little lower down we drop the trans mutex
983 * and let new people in. By holding the tree_log_mutex
984 * from now until after the super is written, we avoid races
985 * with the tree-log code.
986 */
987 mutex_lock(&root->fs_info->tree_log_mutex);
988 /*
989 * keep tree reloc code from adding new reloc trees
990 */
991 mutex_lock(&root->fs_info->tree_reloc_mutex);
992
993
994 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
995 &dirty_fs_roots);
996 BUG_ON(ret);
997
 998	/* add_dirty_roots gets rid of all the tree log roots; it is now
 999	 * safe to free the root of the tree of log roots
1000 */
1001 btrfs_free_log_root_tree(trans, root->fs_info);
1002
1003 ret = btrfs_commit_tree_roots(trans, root);
1004 BUG_ON(ret);
1005
1006 cur_trans = root->fs_info->running_transaction;
1007 spin_lock(&root->fs_info->new_trans_lock);
1008 root->fs_info->running_transaction = NULL;
1009 spin_unlock(&root->fs_info->new_trans_lock);
1010 btrfs_set_super_generation(&root->fs_info->super_copy,
1011 cur_trans->transid);
1012 btrfs_set_super_root(&root->fs_info->super_copy,
1013 root->fs_info->tree_root->node->start);
1014 btrfs_set_super_root_level(&root->fs_info->super_copy,
1015 btrfs_header_level(root->fs_info->tree_root->node));
1016
1017 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
1018 chunk_root->node->start);
1019 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
1020 btrfs_header_level(chunk_root->node));
1021 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1022 btrfs_header_generation(chunk_root->node));
1023
1024 if (!root->fs_info->log_root_recovering) {
1025 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1026 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1027 }
1028
1029 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1030 sizeof(root->fs_info->super_copy));
1031
1032 btrfs_copy_pinned(root, pinned_copy);
1033
1034 trans->transaction->blocked = 0;
1035 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait);
1037
1038 mutex_unlock(&root->fs_info->trans_mutex);
1039 ret = btrfs_write_and_wait_transaction(trans, root);
1040 BUG_ON(ret);
1041 write_ctree_super(trans, root, 0);
1042
1043 /*
1044 * the super is written, we can safely allow the tree-loggers
1045 * to go about their business
1046 */
1047 mutex_unlock(&root->fs_info->tree_log_mutex);
1048
1049 btrfs_finish_extent_commit(trans, root, pinned_copy);
1050 kfree(pinned_copy);
1051
1052 btrfs_drop_dead_reloc_roots(root);
1053 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1054
1055 /* do the directory inserts of any pending snapshot creations */
1056 finish_pending_snapshots(trans, root->fs_info);
1057
1058 mutex_lock(&root->fs_info->trans_mutex);
1059
1060 cur_trans->commit_done = 1;
1061 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait);
1063
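	/* drop two references: one for this handle's join and one that
	 * was held by fs_info->running_transaction, cleared above */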
1064 put_transaction(cur_trans);
1065 put_transaction(cur_trans);
1066
1067 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1068 if (root->fs_info->closing)
1069 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1070
1071 mutex_unlock(&root->fs_info->trans_mutex);
1072
1073 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1074
1075 if (root->fs_info->closing)
1076 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1077 return ret;
1078}
1079
1080/*
1081 * interface function to delete all the snapshots scheduled for deletion
1082 */
1083int btrfs_clean_old_snapshots(struct btrfs_root *root)
1084{
1085 struct list_head dirty_roots;
1086 INIT_LIST_HEAD(&dirty_roots);
1087again:
1088 mutex_lock(&root->fs_info->trans_mutex);
1089 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1090 mutex_unlock(&root->fs_info->trans_mutex);
1091
1092 if (!list_empty(&dirty_roots)) {
1093 drop_dirty_roots(root, &dirty_roots);
1094 goto again;
1095 }
1096 return 0;
1097}
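
/*
 * the splice-and-retry loop above is there because dropping a batch
 * of dead roots commits transactions of its own, and those commits
 * can move more dead roots onto fs_info->dead_roots.
 */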
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..ea292117f882
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
23struct btrfs_transaction {
24 u64 transid;
25 unsigned long num_writers;
26 unsigned long num_joined;
27 int in_commit;
28 int use_count;
29 int commit_done;
30 int blocked;
31 struct list_head list;
32 struct extent_io_tree dirty_pages;
33 unsigned long start_time;
34 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots;
37};
38
39struct btrfs_trans_handle {
40 u64 transid;
41 unsigned long blocks_reserved;
42 unsigned long blocks_used;
43 struct btrfs_transaction *transaction;
44 u64 block_group;
45 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr;
47};
48
49struct btrfs_pending_snapshot {
50 struct dentry *dentry;
51 struct btrfs_root *root;
52 char *name;
53 struct btrfs_key root_key;
54 struct list_head list;
55};
56
57struct btrfs_dirty_root {
58 struct list_head list;
59 struct btrfs_root *root;
60 struct btrfs_root *latest_root;
61};
62
63static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
64 struct inode *inode)
65{
66 trans->block_group = BTRFS_I(inode)->block_group;
67}
68
69static inline void btrfs_update_inode_block_group(
70 struct btrfs_trans_handle *trans,
71 struct inode *inode)
72{
73 BTRFS_I(inode)->block_group = trans->block_group;
74}
75
76static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
77 struct inode *inode)
78{
79 BTRFS_I(inode)->last_trans = trans->transaction->transid;
80}
81
82int btrfs_end_transaction(struct btrfs_trans_handle *trans,
83 struct btrfs_root *root);
84struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
85 int num_blocks);
86struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
87 int num_blocks);
88struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
89 int num_blocks);
90int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
92int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
93 struct btrfs_root *root);
94
95int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
96int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
97int btrfs_clean_old_snapshots(struct btrfs_root *root);
98int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root);
102void btrfs_throttle(struct btrfs_root *root);
103int btrfs_record_root_in_trans(struct btrfs_root *root);
104int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
105 struct extent_io_tree *dirty_pages);
106#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..3e8358c36165
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read
27 * things from disk, otherwise read all the leaves and try to get key order to
28 * better reflect disk order
29 */
30
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only)
33{
34 struct btrfs_path *path = NULL;
35 struct btrfs_key key;
36 int ret = 0;
37 int wret;
38 int level;
39 int orig_level;
40 int is_extent = 0;
41 int next_key_ret = 0;
42 u64 last_ret = 0;
43 u64 min_trans = 0;
44
45 if (cache_only)
46 goto out;
47
48 if (root->fs_info->extent_root == root) {
49 /*
50 * there's recursion here right now in the tree locking,
51 * we can't defrag the extent root without deadlock
52 */
53 goto out;
54 }
55
56 if (root->ref_cows == 0 && !is_extent)
57 goto out;
58
59 if (btrfs_test_opt(root, SSD))
60 goto out;
61
62 path = btrfs_alloc_path();
63 if (!path)
64 return -ENOMEM;
65
66 level = btrfs_header_level(root->node);
67 orig_level = level;
68
69 if (level == 0)
70 goto out;
71
72 if (root->defrag_progress.objectid == 0) {
73 struct extent_buffer *root_node;
74 u32 nritems;
75
76 root_node = btrfs_lock_root_node(root);
77 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */
80 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
81 nritems - 1);
82 btrfs_tree_unlock(root_node);
83 free_extent_buffer(root_node);
84 memset(&key, 0, sizeof(key));
85 } else {
86 memcpy(&key, &root->defrag_progress, sizeof(key));
87 }
88
89 path->keep_locks = 1;
90 if (cache_only)
91 min_trans = root->defrag_trans_start;
92
93 ret = btrfs_search_forward(root, &key, NULL, path,
94 cache_only, min_trans);
95 if (ret < 0)
96 goto out;
97 if (ret > 0) {
98 ret = 0;
99 goto out;
100 }
101 btrfs_release_path(root, path);
102 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
103
104 if (wret < 0) {
105 ret = wret;
106 goto out;
107 }
108 if (!path->nodes[1]) {
109 ret = 0;
110 goto out;
111 }
112 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
113 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
114 min_trans);
115 ret = btrfs_realloc_node(trans, root,
116 path->nodes[1], 0,
117 cache_only, &last_ret,
118 &root->defrag_progress);
119 WARN_ON(ret && ret != -EAGAIN);
120 if (next_key_ret == 0) {
121 memcpy(&root->defrag_progress, &key, sizeof(key));
122 ret = -EAGAIN;
123 }
124
125 btrfs_release_path(root, path);
126 if (is_extent)
127 btrfs_extent_post_op(trans, root);
128out:
129 if (path)
130 btrfs_free_path(path);
131 if (ret == -EAGAIN) {
132 if (root->defrag_max.objectid > root->defrag_progress.objectid)
133 goto done;
134 if (root->defrag_max.type > root->defrag_progress.type)
135 goto done;
136 if (root->defrag_max.offset > root->defrag_progress.offset)
137 goto done;
138 ret = 0;
139 }
140done:
141 if (ret != -EAGAIN) {
142 memset(&root->defrag_progress, 0,
143 sizeof(root->defrag_progress));
144 root->defrag_trans_start = trans->transid;
145 }
146 return ret;
147}
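/*
 * Illustrative sketch only, not part of this patch: btrfs_defrag_leaves()
 * returns -EAGAIN when it has made progress but is not done, with
 * root->defrag_progress recording where to resume.  A driver such as
 * btrfs_defrag_root() is expected to restart it in a fresh transaction
 * until it returns something other than -EAGAIN, roughly like this
 * (throttling and error handling elided):
 */
static int example_defrag_driver(struct btrfs_root *root, int cache_only)
{
	struct btrfs_trans_handle *trans;
	int ret = -EAGAIN;

	while (ret == -EAGAIN) {
		trans = btrfs_start_transaction(root, 1);
		ret = btrfs_defrag_leaves(trans, root, cache_only);
		btrfs_end_transaction(trans, root);
	}
	return ret;
}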
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..d81cda2e077c
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2898 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26#include "tree-log.h"
27
28/* magic values for the inode_only field in btrfs_log_inode:
29 *
30 * LOG_INODE_ALL means to log everything
31 * LOG_INODE_EXISTS means to log just enough to recreate the inode
32 * during log replay
33 */
34#define LOG_INODE_ALL 0
35#define LOG_INODE_EXISTS 1
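/*
 * Example (illustrative, not from this patch): an fsync of a regular
 * file logs that inode with LOG_INODE_ALL, while the parent
 * directories naming it only need LOG_INODE_EXISTS, which is enough
 * for replay to recreate the path without copying directory contents.
 */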
36
37/*
38 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes
41 * we find in the log are created in the subvolume.
42 *
43 * The last stage is to deal with directories and links and extents
44 * and all the other fun semantics
45 */
46#define LOG_WALK_PIN_ONLY 0
47#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2
49
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode,
52 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid);
56
57/*
58 * tree logging is a special write ahead log used to make sure that
59 * fsyncs and O_SYNCs can happen without doing full tree commits.
60 *
61 * Full tree commits are expensive because they require commonly
62 * modified blocks to be recowed, creating many dirty pages in the
63 * extent tree and a 4x-6x higher write load than ext3.
64 *
65 * Instead of doing a tree commit on every fsync, we use the
66 * key ranges and transaction ids to find items for a given file or directory
67 * that have changed in this transaction. Those items are copied into
68 * a special tree (one per subvolume root), that tree is written to disk
69 * and then the fsync is considered complete.
70 *
71 * After a crash, items are copied out of the log-tree back into the
72 * subvolume tree. Any file data extents found are recorded in the extent
73 * allocation tree, and the log-tree freed.
74 *
75 * The log tree is read three times: once to pin down all the extents it is
76 * using in ram, once to create all the inodes logged in the tree
77 * and once to do all the other items.
78 */
79
80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish
182 */
183static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root)
185{
186 int ret;
187 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info);
190 BUG_ON(ret);
191 }
192 if (!root->log_root) {
193 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret);
195 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex);
199 return 0;
200}
201
202/*
203 * returns 0 if there was a log transaction running and we were able
204 * to join, or returns -ENOENT if there was no transaction
205 * in progress
206 */
207static int join_running_log_trans(struct btrfs_root *root)
208{
209 int ret = -ENOENT;
210
211 smp_mb();
212 if (!root->log_root)
213 return -ENOENT;
214
215 mutex_lock(&root->fs_info->tree_log_mutex);
216 if (root->log_root) {
217 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers);
219 root->fs_info->tree_log_batch++;
220 }
221 mutex_unlock(&root->fs_info->tree_log_mutex);
222 return ret;
223}
224
225/*
226 * indicate we're done making changes to the log tree
227 * and wake up anyone waiting to do a sync
228 */
229static int end_log_trans(struct btrfs_root *root)
230{
231 atomic_dec(&root->fs_info->tree_log_writers);
232 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait))
234 wake_up(&root->fs_info->tree_log_wait);
235 return 0;
236}
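/*
 * Illustrative sketch only, not part of this patch: a log writer is
 * expected to bracket its updates between start_log_trans() and
 * end_log_trans(), the way the fsync path does:
 */
static int example_log_one_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode)
{
	int ret;

	start_log_trans(trans, root);
	ret = __btrfs_log_inode(trans, root, inode, LOG_INODE_ALL);
	end_log_trans(root);
	return ret;
}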
237
238
239/*
240 * the walk control struct is used to pass state down the chain when
241 * processing the log tree. The stage field tells us which part
242 * of the log tree processing we are currently doing. The others
243 * are state fields used for that specific part
244 */
245struct walk_control {
246 /* should we free the extent on disk when done? This is used
247 * at transaction commit time while freeing a log tree
248 */
249 int free;
250
251 /* should we write out the extent buffer? This is used
252 * while flushing the log tree to disk during a sync
253 */
254 int write;
255
256 /* should we wait for the extent buffer io to finish? Also used
257 * while flushing the log tree to disk for a sync
258 */
259 int wait;
260
261 /* pin only walk, we record which extents on disk belong to the
262 * log trees
263 */
264 int pin;
265
266 /* what stage of the replay code we're currently in */
267 int stage;
268
269 /* the root we are currently replaying */
270 struct btrfs_root *replay_dest;
271
272 /* the trans handle for the current replay */
273 struct btrfs_trans_handle *trans;
274
275 /* the function that gets used to process blocks we find in the
276 * tree. Note the extent_buffer might not be up to date when it is
277 * passed in, and it must be checked or read if you need the data
278 * inside it
279 */
280 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
281 struct walk_control *wc, u64 gen);
282};
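/*
 * Example (illustrative): freeing a log tree at commit time uses
 * { .free = 1, .process_func = process_one_buffer }, while mount-time
 * replay starts with { .pin = 1 } and then runs the LOG_WALK_REPLAY_*
 * stages with replay_one_buffer() (see the sketch after
 * walk_log_tree() below).
 */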
283
284/*
285 * process_func used to pin down extents, write them or wait on them
286 */
287static int process_one_buffer(struct btrfs_root *log,
288 struct extent_buffer *eb,
289 struct walk_control *wc, u64 gen)
290{
291 if (wc->pin) {
292 mutex_lock(&log->fs_info->pinned_mutex);
293 btrfs_update_pinned_extents(log->fs_info->extent_root,
294 eb->start, eb->len, 1);
295 mutex_unlock(&log->fs_info->pinned_mutex);
296 }
297
298 if (btrfs_buffer_uptodate(eb, gen)) {
299 if (wc->write)
300 btrfs_write_tree_block(eb);
301 if (wc->wait)
302 btrfs_wait_tree_block_writeback(eb);
303 }
304 return 0;
305}
306
307/*
308 * Item overwrite used by replay and tree logging. eb, slot and key all refer
309 * to the src data we are copying out.
310 *
311 * root is the tree we are copying into, and path is a scratch
312 * path for use in this function (it should be released on entry and
313 * will be released on exit).
314 *
315 * If the key is already in the destination tree the existing item is
316 * overwritten. If the existing item isn't big enough, it is extended.
317 * If it is too large, it is truncated.
318 *
319 * If the key isn't in the destination yet, a new item is inserted.
320 */
321static noinline int overwrite_item(struct btrfs_trans_handle *trans,
322 struct btrfs_root *root,
323 struct btrfs_path *path,
324 struct extent_buffer *eb, int slot,
325 struct btrfs_key *key)
326{
327 int ret;
328 u32 item_size;
329 u64 saved_i_size = 0;
330 int save_old_i_size = 0;
331 unsigned long src_ptr;
332 unsigned long dst_ptr;
333 int overwrite_root = 0;
334
335 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
336 overwrite_root = 1;
337
338 item_size = btrfs_item_size_nr(eb, slot);
339 src_ptr = btrfs_item_ptr_offset(eb, slot);
340
341 /* look for the key in the destination tree */
342 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
343 if (ret == 0) {
344 char *src_copy;
345 char *dst_copy;
346 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
347 path->slots[0]);
348 if (dst_size != item_size)
349 goto insert;
350
351 if (item_size == 0) {
352 btrfs_release_path(root, path);
353 return 0;
354 }
355 dst_copy = kmalloc(item_size, GFP_NOFS);
356 src_copy = kmalloc(item_size, GFP_NOFS);
357
358 read_extent_buffer(eb, src_copy, src_ptr, item_size);
359
360 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
361 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
362 item_size);
363 ret = memcmp(dst_copy, src_copy, item_size);
364
365 kfree(dst_copy);
366 kfree(src_copy);
367 /*
368 * they have the same contents, just return, this saves
369 * us from cowing blocks in the destination tree and doing
370 * extra writes that may not have been done by a previous
371 * sync
372 */
373 if (ret == 0) {
374 btrfs_release_path(root, path);
375 return 0;
376 }
377
378 }
379insert:
380 btrfs_release_path(root, path);
381 /* try to insert the key into the destination tree */
382 ret = btrfs_insert_empty_item(trans, root, path,
383 key, item_size);
384
385 /* make sure any existing item is the correct size */
386 if (ret == -EEXIST) {
387 u32 found_size;
388 found_size = btrfs_item_size_nr(path->nodes[0],
389 path->slots[0]);
390 if (found_size > item_size) {
391 btrfs_truncate_item(trans, root, path, item_size, 1);
392 } else if (found_size < item_size) {
393 ret = btrfs_extend_item(trans, root, path,
394 item_size - found_size);
395 BUG_ON(ret);
396 }
397 } else if (ret) {
398 BUG();
399 }
400 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
401 path->slots[0]);
402
403 /* don't overwrite an existing inode if the generation number
404 * was logged as zero. This is done when the tree logging code
405 * is just logging an inode to make sure it exists after recovery.
406 *
407 * Also, don't overwrite i_size on directories during replay.
408 * log replay inserts and removes directory items based on the
409 * state of the tree found in the subvolume, and i_size is modified
410 * as it goes
411 */
412 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
413 struct btrfs_inode_item *src_item;
414 struct btrfs_inode_item *dst_item;
415
416 src_item = (struct btrfs_inode_item *)src_ptr;
417 dst_item = (struct btrfs_inode_item *)dst_ptr;
418
419 if (btrfs_inode_generation(eb, src_item) == 0)
420 goto no_copy;
421
422 if (overwrite_root &&
423 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
424 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
425 save_old_i_size = 1;
426 saved_i_size = btrfs_inode_size(path->nodes[0],
427 dst_item);
428 }
429 }
430
431 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
432 src_ptr, item_size);
433
434 if (save_old_i_size) {
435 struct btrfs_inode_item *dst_item;
436 dst_item = (struct btrfs_inode_item *)dst_ptr;
437 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
438 }
439
440 /* make sure the generation is filled in */
441 if (key->type == BTRFS_INODE_ITEM_KEY) {
442 struct btrfs_inode_item *dst_item;
443 dst_item = (struct btrfs_inode_item *)dst_ptr;
444 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
445 btrfs_set_inode_generation(path->nodes[0], dst_item,
446 trans->transid);
447 }
448 }
449no_copy:
450 btrfs_mark_buffer_dirty(path->nodes[0]);
451 btrfs_release_path(root, path);
452 return 0;
453}
454
455/*
456 * simple helper to read an inode off the disk from a given root
457 * This can only be called for subvolume roots and not for the log
458 */
459static noinline struct inode *read_one_inode(struct btrfs_root *root,
460 u64 objectid)
461{
462 struct inode *inode;
463 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
464 if (inode->i_state & I_NEW) {
465 BTRFS_I(inode)->root = root;
466 BTRFS_I(inode)->location.objectid = objectid;
467 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
468 BTRFS_I(inode)->location.offset = 0;
469 btrfs_read_locked_inode(inode);
470 unlock_new_inode(inode);
471
472 }
473 if (is_bad_inode(inode)) {
474 iput(inode);
475 inode = NULL;
476 }
477 return inode;
478}
479
480/* replays a single extent in 'eb' at 'slot' with 'key' into the
481 * subvolume 'root'. path is released on entry and should be released
482 * on exit.
483 *
484 * extents in the log tree have not been allocated out of the extent
485 * tree yet. So, this completes the allocation, taking a reference
486 * as required if the extent already exists or creating a new extent
487 * if it isn't in the extent allocation tree yet.
488 *
489 * The extent is inserted into the file, dropping any existing extents
490 * from the file that overlap the new one.
491 */
492static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
493 struct btrfs_root *root,
494 struct btrfs_path *path,
495 struct extent_buffer *eb, int slot,
496 struct btrfs_key *key)
497{
498 int found_type;
499 u64 mask = root->sectorsize - 1;
500 u64 extent_end;
501 u64 alloc_hint;
502 u64 start = key->offset;
503 u64 saved_nbytes;
504 struct btrfs_file_extent_item *item;
505 struct inode *inode = NULL;
506 unsigned long size;
507 int ret = 0;
508
509 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
510 found_type = btrfs_file_extent_type(eb, item);
511
512 if (found_type == BTRFS_FILE_EXTENT_REG ||
513 found_type == BTRFS_FILE_EXTENT_PREALLOC)
514 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
515 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
516 size = btrfs_file_extent_inline_len(eb, item);
517 extent_end = (start + size + mask) & ~mask;
518 } else {
519 ret = 0;
520 goto out;
521 }
522
523 inode = read_one_inode(root, key->objectid);
524 if (!inode) {
525 ret = -EIO;
526 goto out;
527 }
528
529 /*
530 * first check to see if we already have this extent in the
531 * file. This must be done before the btrfs_drop_extents run
532 * so we don't try to drop this extent.
533 */
534 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
535 start, 0);
536
537 if (ret == 0 &&
538 (found_type == BTRFS_FILE_EXTENT_REG ||
539 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
540 struct btrfs_file_extent_item cmp1;
541 struct btrfs_file_extent_item cmp2;
542 struct btrfs_file_extent_item *existing;
543 struct extent_buffer *leaf;
544
545 leaf = path->nodes[0];
546 existing = btrfs_item_ptr(leaf, path->slots[0],
547 struct btrfs_file_extent_item);
548
549 read_extent_buffer(eb, &cmp1, (unsigned long)item,
550 sizeof(cmp1));
551 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
552 sizeof(cmp2));
553
554 /*
555 * we already have a pointer to this exact extent,
556 * we don't have to do anything
557 */
558 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
559 btrfs_release_path(root, path);
560 goto out;
561 }
562 }
563 btrfs_release_path(root, path);
564
565 saved_nbytes = inode_get_bytes(inode);
566 /* drop any overlapping extents */
567 ret = btrfs_drop_extents(trans, root, inode,
568 start, extent_end, start, &alloc_hint);
569 BUG_ON(ret);
570
571 if (found_type == BTRFS_FILE_EXTENT_REG ||
572 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
573 unsigned long dest_offset;
574 struct btrfs_key ins;
575
576 ret = btrfs_insert_empty_item(trans, root, path, key,
577 sizeof(*item));
578 BUG_ON(ret);
579 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
580 path->slots[0]);
581 copy_extent_buffer(path->nodes[0], eb, dest_offset,
582 (unsigned long)item, sizeof(*item));
583
584 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
585 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
586 ins.type = BTRFS_EXTENT_ITEM_KEY;
587
588 if (ins.objectid > 0) {
589 u64 csum_start;
590 u64 csum_end;
591 LIST_HEAD(ordered_sums);
592 /*
593 * is this extent already allocated in the extent
594 * allocation tree? If so, just add a reference
595 */
596 ret = btrfs_lookup_extent(root, ins.objectid,
597 ins.offset);
598 if (ret == 0) {
599 ret = btrfs_inc_extent_ref(trans, root,
600 ins.objectid, ins.offset,
601 path->nodes[0]->start,
602 root->root_key.objectid,
603 trans->transid, key->objectid);
604 } else {
605 /*
606 * insert the extent pointer in the extent
607 * allocation tree
608 */
609 ret = btrfs_alloc_logged_extent(trans, root,
610 path->nodes[0]->start,
611 root->root_key.objectid,
612 trans->transid, key->objectid,
613 &ins);
614 BUG_ON(ret);
615 }
616 btrfs_release_path(root, path);
617
618 if (btrfs_file_extent_compression(eb, item)) {
619 csum_start = ins.objectid;
620 csum_end = csum_start + ins.offset;
621 } else {
622 csum_start = ins.objectid +
623 btrfs_file_extent_offset(eb, item);
624 csum_end = csum_start +
625 btrfs_file_extent_num_bytes(eb, item);
626 }
627
628 ret = btrfs_lookup_csums_range(root->log_root,
629 csum_start, csum_end - 1,
630 &ordered_sums);
631 BUG_ON(ret);
632 while (!list_empty(&ordered_sums)) {
633 struct btrfs_ordered_sum *sums;
634 sums = list_entry(ordered_sums.next,
635 struct btrfs_ordered_sum,
636 list);
637 ret = btrfs_csum_file_blocks(trans,
638 root->fs_info->csum_root,
639 sums);
640 BUG_ON(ret);
641 list_del(&sums->list);
642 kfree(sums);
643 }
644 } else {
645 btrfs_release_path(root, path);
646 }
647 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
648 /* inline extents are easy, we just overwrite them */
649 ret = overwrite_item(trans, root, path, eb, slot, key);
650 BUG_ON(ret);
651 }
652
653 inode_set_bytes(inode, saved_nbytes);
654 btrfs_update_inode(trans, root, inode);
655out:
656 if (inode)
657 iput(inode);
658 return ret;
659}
660
661/*
662 * when cleaning up conflicts between the directory names in the
663 * subvolume, directory names in the log and directory names in the
664 * inode back references, we may have to unlink inodes from directories.
665 *
666 * This is a helper function to do the unlink of a specific directory
667 * item
668 */
669static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
670 struct btrfs_root *root,
671 struct btrfs_path *path,
672 struct inode *dir,
673 struct btrfs_dir_item *di)
674{
675 struct inode *inode;
676 char *name;
677 int name_len;
678 struct extent_buffer *leaf;
679 struct btrfs_key location;
680 int ret;
681
682 leaf = path->nodes[0];
683
684 btrfs_dir_item_key_to_cpu(leaf, di, &location);
685 name_len = btrfs_dir_name_len(leaf, di);
686 name = kmalloc(name_len, GFP_NOFS);
687 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
688 btrfs_release_path(root, path);
689
690 inode = read_one_inode(root, location.objectid);
691 BUG_ON(!inode);
692
693 ret = link_to_fixup_dir(trans, root, path, location.objectid);
694 BUG_ON(ret);
695 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
696 BUG_ON(ret);
697 kfree(name);
698
699 iput(inode);
700 return ret;
701}
702
703/*
704 * helper function to see if a given name and sequence number found
705 * in an inode back reference are already in a directory and correctly
706 * point to this inode
707 */
708static noinline int inode_in_dir(struct btrfs_root *root,
709 struct btrfs_path *path,
710 u64 dirid, u64 objectid, u64 index,
711 const char *name, int name_len)
712{
713 struct btrfs_dir_item *di;
714 struct btrfs_key location;
715 int match = 0;
716
717 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
718 index, name, name_len, 0);
719 if (di && !IS_ERR(di)) {
720 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
721 if (location.objectid != objectid)
722 goto out;
723 } else
724 goto out;
725 btrfs_release_path(root, path);
726
727 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
728 if (di && !IS_ERR(di)) {
729 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
730 if (location.objectid != objectid)
731 goto out;
732 } else
733 goto out;
734 match = 1;
735out:
736 btrfs_release_path(root, path);
737 return match;
738}
739
740/*
741 * helper function to check a log tree for a named back reference in
742 * an inode. This is used to decide if a back reference that is
743 * found in the subvolume conflicts with what we find in the log.
744 *
745 * inode backreferences may have multiple refs in a single item,
746 * during replay we process one reference at a time, and we don't
747 * want to delete valid links to a file from the subvolume if that
748 * link is also in the log.
749 */
750static noinline int backref_in_log(struct btrfs_root *log,
751 struct btrfs_key *key,
752 char *name, int namelen)
753{
754 struct btrfs_path *path;
755 struct btrfs_inode_ref *ref;
756 unsigned long ptr;
757 unsigned long ptr_end;
758 unsigned long name_ptr;
759 int found_name_len;
760 int item_size;
761 int ret;
762 int match = 0;
763
764 path = btrfs_alloc_path();
765 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
766 if (ret != 0)
767 goto out;
768
769 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
770 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
771 ptr_end = ptr + item_size;
772 while (ptr < ptr_end) {
773 ref = (struct btrfs_inode_ref *)ptr;
774 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
775 if (found_name_len == namelen) {
776 name_ptr = (unsigned long)(ref + 1);
777 ret = memcmp_extent_buffer(path->nodes[0], name,
778 name_ptr, namelen);
779 if (ret == 0) {
780 match = 1;
781 goto out;
782 }
783 }
784 ptr = (unsigned long)(ref + 1) + found_name_len;
785 }
786out:
787 btrfs_free_path(path);
788 return match;
789}
790
791
792/*
793 * replay one inode back reference item found in the log tree.
794 * eb, slot and key refer to the buffer and key found in the log tree.
795 * root is the destination we are replaying into, and path is for temp
796 * use by this function. (it should be released on return).
797 */
798static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799 struct btrfs_root *root,
800 struct btrfs_root *log,
801 struct btrfs_path *path,
802 struct extent_buffer *eb, int slot,
803 struct btrfs_key *key)
804{
805 struct inode *dir;
806 int ret;
807 struct btrfs_key location;
808 struct btrfs_inode_ref *ref;
809 struct btrfs_dir_item *di;
810 struct inode *inode;
811 char *name;
812 int namelen;
813 unsigned long ref_ptr;
814 unsigned long ref_end;
815
816 location.objectid = key->objectid;
817 location.type = BTRFS_INODE_ITEM_KEY;
818 location.offset = 0;
819
820 /*
821 * it is possible that we didn't log all the parent directories
822 * for a given inode. If we don't find the dir, just don't
823 * copy the back ref in. The link count fixup code will take
824 * care of the rest
825 */
826 dir = read_one_inode(root, key->offset);
827 if (!dir)
828 return -ENOENT;
829
830 inode = read_one_inode(root, key->objectid);
831 BUG_ON(!inode);
832
833 ref_ptr = btrfs_item_ptr_offset(eb, slot);
834 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
835
836again:
837 ref = (struct btrfs_inode_ref *)ref_ptr;
838
839 namelen = btrfs_inode_ref_name_len(eb, ref);
840 name = kmalloc(namelen, GFP_NOFS);
841 BUG_ON(!name);
842
843 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
844
845 /* if we already have a perfect match, we're done */
846 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
847 btrfs_inode_ref_index(eb, ref),
848 name, namelen)) {
849 goto out;
850 }
851
852 /*
853 * look for a conflicting back reference in the metadata.
854 * if we find one we have to unlink that name of the file
855 * before we add our new link. Later on, we overwrite any
856 * existing back reference, and we don't want to create
857 * dangling pointers in the directory.
858 */
859conflict_again:
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr;
866 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0];
868
869 /* are we trying to overwrite a back ref for the root directory?
870 * If so, just jump out, we're done
871 */
872 if (key->objectid == key->offset)
873 goto out_nowrite;
874
875 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay
877 * otherwise they must be unlinked as a conflict
878 */
879 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
880 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
881 while (ptr < ptr_end) {
882 victim_ref = (struct btrfs_inode_ref *)ptr;
883 victim_name_len = btrfs_inode_ref_name_len(leaf,
884 victim_ref);
885 victim_name = kmalloc(victim_name_len, GFP_NOFS);
886 BUG_ON(!victim_name);
887
888 read_extent_buffer(leaf, victim_name,
889 (unsigned long)(victim_ref + 1),
890 victim_name_len);
891
892 if (!backref_in_log(log, key, victim_name,
893 victim_name_len)) {
894 btrfs_inc_nlink(inode);
895 btrfs_release_path(root, path);
896 ret = btrfs_unlink_inode(trans, root, dir,
897 inode, victim_name,
898 victim_name_len);
899 kfree(victim_name);
900 btrfs_release_path(root, path);
901 goto conflict_again;
902 }
903 kfree(victim_name);
904 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
905 }
906 BUG_ON(ret);
907 }
908 btrfs_release_path(root, path);
909
910 /* look for a conflicting sequence number */
911 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
912 btrfs_inode_ref_index(eb, ref),
913 name, namelen, 0);
914 if (di && !IS_ERR(di)) {
915 ret = drop_one_dir_item(trans, root, path, dir, di);
916 BUG_ON(ret);
917 }
918 btrfs_release_path(root, path);
919
920
921 /* look for a conflicting name */
922 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
923 name, namelen, 0);
924 if (di && !IS_ERR(di)) {
925 ret = drop_one_dir_item(trans, root, path, dir, di);
926 BUG_ON(ret);
927 }
928 btrfs_release_path(root, path);
929
930 /* insert our name */
931 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
932 btrfs_inode_ref_index(eb, ref));
933 BUG_ON(ret);
934
935 btrfs_update_inode(trans, root, inode);
936
937out:
938 ref_ptr = (unsigned long)(ref + 1) + namelen;
939 kfree(name);
940 if (ref_ptr < ref_end)
941 goto again;
942
943 /* finally write the back reference in the inode */
944 ret = overwrite_item(trans, root, path, eb, slot, key);
945 BUG_ON(ret);
946
947out_nowrite:
948 btrfs_release_path(root, path);
949 iput(dir);
950 iput(inode);
951 return 0;
952}
953
954/*
955 * There are a few corners where the link count of the file can't
956 * be properly maintained during replay. So, instead of adding
957 * lots of complexity to the log code, we just scan the backrefs
958 * for any file that has been through replay.
959 *
960 * The scan will update the link count on the inode to reflect the
961 * number of back refs found. If it goes down to zero, the iput
962 * will free the inode.
963 */
964static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
965 struct btrfs_root *root,
966 struct inode *inode)
967{
968 struct btrfs_path *path;
969 int ret;
970 struct btrfs_key key;
971 u64 nlink = 0;
972 unsigned long ptr;
973 unsigned long ptr_end;
974 int name_len;
975
976 key.objectid = inode->i_ino;
977 key.type = BTRFS_INODE_REF_KEY;
978 key.offset = (u64)-1;
979
980 path = btrfs_alloc_path();
981
982 while (1) {
983 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
984 if (ret < 0)
985 break;
986 if (ret > 0) {
987 if (path->slots[0] == 0)
988 break;
989 path->slots[0]--;
990 }
991 btrfs_item_key_to_cpu(path->nodes[0], &key,
992 path->slots[0]);
993 if (key.objectid != inode->i_ino ||
994 key.type != BTRFS_INODE_REF_KEY)
995 break;
996 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
997 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
998 path->slots[0]);
999 while (ptr < ptr_end) {
1000 struct btrfs_inode_ref *ref;
1001
1002 ref = (struct btrfs_inode_ref *)ptr;
1003 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1004 ref);
1005 ptr = (unsigned long)(ref + 1) + name_len;
1006 nlink++;
1007 }
1008
1009 if (key.offset == 0)
1010 break;
1011 key.offset--;
1012 btrfs_release_path(root, path);
1013 }
1014 btrfs_free_path(path);
1015 if (nlink != inode->i_nlink) {
1016 inode->i_nlink = nlink;
1017 btrfs_update_inode(trans, root, inode);
1018 }
1019 BTRFS_I(inode)->index_cnt = (u64)-1;
1020
1021 return 0;
1022}
1023
1024static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1025 struct btrfs_root *root,
1026 struct btrfs_path *path)
1027{
1028 int ret;
1029 struct btrfs_key key;
1030 struct inode *inode;
1031
1032 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1033 key.type = BTRFS_ORPHAN_ITEM_KEY;
1034 key.offset = (u64)-1;
1035 while (1) {
1036 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1037 if (ret < 0)
1038 break;
1039
1040 if (ret == 1) {
1041 if (path->slots[0] == 0)
1042 break;
1043 path->slots[0]--;
1044 }
1045
1046 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1047 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1048 key.type != BTRFS_ORPHAN_ITEM_KEY)
1049 break;
1050
1051 ret = btrfs_del_item(trans, root, path);
1052 BUG_ON(ret);
1053
1054 btrfs_release_path(root, path);
1055 inode = read_one_inode(root, key.offset);
1056 BUG_ON(!inode);
1057
1058 ret = fixup_inode_link_count(trans, root, inode);
1059 BUG_ON(ret);
1060
1061 iput(inode);
1062
1063 if (key.offset == 0)
1064 break;
1065 key.offset--;
1066 }
1067 btrfs_release_path(root, path);
1068 return 0;
1069}
1070
1071
1072/*
1073 * record a given inode in the fixup dir so we can check its link
1074 * count when replay is done. The link count is incremented here
1075 * so the inode won't go away until we check it
1076 */
1077static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path,
1080 u64 objectid)
1081{
1082 struct btrfs_key key;
1083 int ret = 0;
1084 struct inode *inode;
1085
1086 inode = read_one_inode(root, objectid);
1087 BUG_ON(!inode);
1088
1089 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1090 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1091 key.offset = objectid;
1092
1093 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1094
1095 btrfs_release_path(root, path);
1096 if (ret == 0) {
1097 btrfs_inc_nlink(inode);
1098 btrfs_update_inode(trans, root, inode);
1099 } else if (ret == -EEXIST) {
1100 ret = 0;
1101 } else {
1102 BUG();
1103 }
1104 iput(inode);
1105
1106 return ret;
1107}
1108
1109/*
1110 * when replaying the log for a directory, we only insert names
1111 * for inodes that actually exist. This means an fsync on a directory
1112 * does not implicitly fsync all the new files in it
1113 */
1114static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1115 struct btrfs_root *root,
1116 struct btrfs_path *path,
1117 u64 dirid, u64 index,
1118 char *name, int name_len, u8 type,
1119 struct btrfs_key *location)
1120{
1121 struct inode *inode;
1122 struct inode *dir;
1123 int ret;
1124
1125 inode = read_one_inode(root, location->objectid);
1126 if (!inode)
1127 return -ENOENT;
1128
1129 dir = read_one_inode(root, dirid);
1130 if (!dir) {
1131 iput(inode);
1132 return -EIO;
1133 }
1134 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1135
1136 /* FIXME, put inode into FIXUP list */
1137
1138 iput(inode);
1139 iput(dir);
1140 return ret;
1141}
1142
1143/*
1144 * take a single entry in a log directory item and replay it into
1145 * the subvolume.
1146 *
1147 * if a conflicting item exists in the subdirectory already,
1148 * the inode it points to is unlinked and put into the link count
1149 * fix up tree.
1150 *
1151 * If a name from the log points to a file or directory that does
1152 * not exist in the FS, it is skipped. fsyncs on directories
1153 * do not force down inodes inside that directory, just changes to the
1154 * names or unlinks in a directory.
1155 */
1156static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1157 struct btrfs_root *root,
1158 struct btrfs_path *path,
1159 struct extent_buffer *eb,
1160 struct btrfs_dir_item *di,
1161 struct btrfs_key *key)
1162{
1163 char *name;
1164 int name_len;
1165 struct btrfs_dir_item *dst_di;
1166 struct btrfs_key found_key;
1167 struct btrfs_key log_key;
1168 struct inode *dir;
1169 u8 log_type;
1170 int exists;
1171 int ret;
1172
1173 dir = read_one_inode(root, key->objectid);
1174 BUG_ON(!dir);
1175
1176 name_len = btrfs_dir_name_len(eb, di);
1177 name = kmalloc(name_len, GFP_NOFS);
1178 log_type = btrfs_dir_type(eb, di);
1179 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1180 name_len);
1181
1182 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1183 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1184 if (exists == 0)
1185 exists = 1;
1186 else
1187 exists = 0;
1188 btrfs_release_path(root, path);
1189
1190 if (key->type == BTRFS_DIR_ITEM_KEY) {
1191 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1192 name, name_len, 1);
1193 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1194 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1195 key->objectid,
1196 key->offset, name,
1197 name_len, 1);
1198 } else {
1199 BUG();
1200 }
1201 if (!dst_di || IS_ERR(dst_di)) {
1202 /* we need a sequence number to insert, so we only
1203 * do inserts for the BTRFS_DIR_INDEX_KEY types
1204 */
1205 if (key->type != BTRFS_DIR_INDEX_KEY)
1206 goto out;
1207 goto insert;
1208 }
1209
1210 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1211 /* the existing item matches the logged item */
1212 if (found_key.objectid == log_key.objectid &&
1213 found_key.type == log_key.type &&
1214 found_key.offset == log_key.offset &&
1215 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1216 goto out;
1217 }
1218
1219 /*
1220 * don't drop the conflicting directory entry if the inode
1221 * for the new entry doesn't exist
1222 */
1223 if (!exists)
1224 goto out;
1225
1226 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1227 BUG_ON(ret);
1228
1229 if (key->type == BTRFS_DIR_INDEX_KEY)
1230 goto insert;
1231out:
1232 btrfs_release_path(root, path);
1233 kfree(name);
1234 iput(dir);
1235 return 0;
1236
1237insert:
1238 btrfs_release_path(root, path);
1239 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1240 name, name_len, log_type, &log_key);
1241
1242 if (ret && ret != -ENOENT)
1243 BUG();
1244 goto out;
1245}
1246
1247/*
1248 * find all the names in a directory item and reconcile them into
1249 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1250 * one name in a directory item, but the same code gets used for
1251 * both directory key types
1252 */
1253static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root,
1255 struct btrfs_path *path,
1256 struct extent_buffer *eb, int slot,
1257 struct btrfs_key *key)
1258{
1259 int ret;
1260 u32 item_size = btrfs_item_size_nr(eb, slot);
1261 struct btrfs_dir_item *di;
1262 int name_len;
1263 unsigned long ptr;
1264 unsigned long ptr_end;
1265
1266 ptr = btrfs_item_ptr_offset(eb, slot);
1267 ptr_end = ptr + item_size;
1268 while (ptr < ptr_end) {
1269 di = (struct btrfs_dir_item *)ptr;
1270 name_len = btrfs_dir_name_len(eb, di);
1271 ret = replay_one_name(trans, root, path, eb, di, key);
1272 BUG_ON(ret);
1273 ptr = (unsigned long)(di + 1);
1274 ptr += name_len;
1275 }
1276 return 0;
1277}
1278
1279/*
1280 * directory replay has two parts. There are the standard directory
1281 * items in the log copied from the subvolume, and range items
1282 * created in the log while the subvolume was logged.
1283 *
1284 * The range items tell us which parts of the key space the log
1285 * is authoritative for. During replay, if a key in the subvolume
1286 * directory is in a logged range item, but not actually in the log,
1287 * that means it was deleted from the directory before the fsync
1288 * and should be removed.
1289 */
1290static noinline int find_dir_range(struct btrfs_root *root,
1291 struct btrfs_path *path,
1292 u64 dirid, int key_type,
1293 u64 *start_ret, u64 *end_ret)
1294{
1295 struct btrfs_key key;
1296 u64 found_end;
1297 struct btrfs_dir_log_item *item;
1298 int ret;
1299 int nritems;
1300
1301 if (*start_ret == (u64)-1)
1302 return 1;
1303
1304 key.objectid = dirid;
1305 key.type = key_type;
1306 key.offset = *start_ret;
1307
1308 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1309 if (ret < 0)
1310 goto out;
1311 if (ret > 0) {
1312 if (path->slots[0] == 0)
1313 goto out;
1314 path->slots[0]--;
1315 }
1316 if (ret != 0)
1317 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1318
1319 if (key.type != key_type || key.objectid != dirid) {
1320 ret = 1;
1321 goto next;
1322 }
1323 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1324 struct btrfs_dir_log_item);
1325 found_end = btrfs_dir_log_end(path->nodes[0], item);
1326
1327 if (*start_ret >= key.offset && *start_ret <= found_end) {
1328 ret = 0;
1329 *start_ret = key.offset;
1330 *end_ret = found_end;
1331 goto out;
1332 }
1333 ret = 1;
1334next:
1335 /* check the next slot in the tree to see if it is a valid item */
1336 nritems = btrfs_header_nritems(path->nodes[0]);
1337 if (path->slots[0] >= nritems) {
1338 ret = btrfs_next_leaf(root, path);
1339 if (ret)
1340 goto out;
1341 } else {
1342 path->slots[0]++;
1343 }
1344
1345 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1346
1347 if (key.type != key_type || key.objectid != dirid) {
1348 ret = 1;
1349 goto out;
1350 }
1351 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1352 struct btrfs_dir_log_item);
1353 found_end = btrfs_dir_log_end(path->nodes[0], item);
1354 *start_ret = key.offset;
1355 *end_ret = found_end;
1356 ret = 0;
1357out:
1358 btrfs_release_path(root, path);
1359 return ret;
1360}
1361
1362/*
1363 * this looks for a given directory item in the log. If the directory
1364 * item is not in the log, the item is removed and the inode it points
1365 * to is unlinked
1366 */
1367static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1368 struct btrfs_root *root,
1369 struct btrfs_root *log,
1370 struct btrfs_path *path,
1371 struct btrfs_path *log_path,
1372 struct inode *dir,
1373 struct btrfs_key *dir_key)
1374{
1375 int ret;
1376 struct extent_buffer *eb;
1377 int slot;
1378 u32 item_size;
1379 struct btrfs_dir_item *di;
1380 struct btrfs_dir_item *log_di;
1381 int name_len;
1382 unsigned long ptr;
1383 unsigned long ptr_end;
1384 char *name;
1385 struct inode *inode;
1386 struct btrfs_key location;
1387
1388again:
1389 eb = path->nodes[0];
1390 slot = path->slots[0];
1391 item_size = btrfs_item_size_nr(eb, slot);
1392 ptr = btrfs_item_ptr_offset(eb, slot);
1393 ptr_end = ptr + item_size;
1394 while (ptr < ptr_end) {
1395 di = (struct btrfs_dir_item *)ptr;
1396 name_len = btrfs_dir_name_len(eb, di);
1397 name = kmalloc(name_len, GFP_NOFS);
1398 if (!name) {
1399 ret = -ENOMEM;
1400 goto out;
1401 }
1402 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1403 name_len);
1404 log_di = NULL;
1405 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1406 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1407 dir_key->objectid,
1408 name, name_len, 0);
1409 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1410 log_di = btrfs_lookup_dir_index_item(trans, log,
1411 log_path,
1412 dir_key->objectid,
1413 dir_key->offset,
1414 name, name_len, 0);
1415 }
1416 if (!log_di || IS_ERR(log_di)) {
1417 btrfs_dir_item_key_to_cpu(eb, di, &location);
1418 btrfs_release_path(root, path);
1419 btrfs_release_path(log, log_path);
1420 inode = read_one_inode(root, location.objectid);
1421 BUG_ON(!inode);
1422
1423 ret = link_to_fixup_dir(trans, root,
1424 path, location.objectid);
1425 BUG_ON(ret);
1426 btrfs_inc_nlink(inode);
1427 ret = btrfs_unlink_inode(trans, root, dir, inode,
1428 name, name_len);
1429 BUG_ON(ret);
1430 kfree(name);
1431 iput(inode);
1432
1433 /* there might still be more names under this key
1434 * check and repeat if required
1435 */
1436 ret = btrfs_search_slot(NULL, root, dir_key, path,
1437 0, 0);
1438 if (ret == 0)
1439 goto again;
1440 ret = 0;
1441 goto out;
1442 }
1443 btrfs_release_path(log, log_path);
1444 kfree(name);
1445
1446 ptr = (unsigned long)(di + 1);
1447 ptr += name_len;
1448 }
1449 ret = 0;
1450out:
1451 btrfs_release_path(root, path);
1452 btrfs_release_path(log, log_path);
1453 return ret;
1454}
1455
1456/*
1457 * deletion replay happens before we copy any new directory items
1458 * out of the log or out of backreferences from inodes. It
1459 * scans the log to find ranges of keys that the log is authoritative for,
1460 * and then scans the directory to find items in those ranges that are
1461 * not present in the log.
1462 *
1463 * Anything we don't find in the log is unlinked and removed from the
1464 * directory.
1465 */
1466static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1467 struct btrfs_root *root,
1468 struct btrfs_root *log,
1469 struct btrfs_path *path,
1470 u64 dirid)
1471{
1472 u64 range_start;
1473 u64 range_end;
1474 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1475 int ret = 0;
1476 struct btrfs_key dir_key;
1477 struct btrfs_key found_key;
1478 struct btrfs_path *log_path;
1479 struct inode *dir;
1480
1481 dir_key.objectid = dirid;
1482 dir_key.type = BTRFS_DIR_ITEM_KEY;
1483 log_path = btrfs_alloc_path();
1484 if (!log_path)
1485 return -ENOMEM;
1486
1487 dir = read_one_inode(root, dirid);
1488 /* it isn't an error if the inode isn't there, that can happen
1489 * because we replay the deletes before we copy in the inode item
1490 * from the log
1491 */
1492 if (!dir) {
1493 btrfs_free_path(log_path);
1494 return 0;
1495 }
1496again:
1497 range_start = 0;
1498 range_end = 0;
1499 while (1) {
1500 ret = find_dir_range(log, path, dirid, key_type,
1501 &range_start, &range_end);
1502 if (ret != 0)
1503 break;
1504
1505 dir_key.offset = range_start;
1506 while (1) {
1507 int nritems;
1508 ret = btrfs_search_slot(NULL, root, &dir_key, path,
1509 0, 0);
1510 if (ret < 0)
1511 goto out;
1512
1513 nritems = btrfs_header_nritems(path->nodes[0]);
1514 if (path->slots[0] >= nritems) {
1515 ret = btrfs_next_leaf(root, path);
1516 if (ret)
1517 break;
1518 }
1519 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1520 path->slots[0]);
1521 if (found_key.objectid != dirid ||
1522 found_key.type != dir_key.type)
1523 goto next_type;
1524
1525 if (found_key.offset > range_end)
1526 break;
1527
1528 ret = check_item_in_log(trans, root, log, path,
1529 log_path, dir, &found_key);
1530 BUG_ON(ret);
1531 if (found_key.offset == (u64)-1)
1532 break;
1533 dir_key.offset = found_key.offset + 1;
1534 }
1535 btrfs_release_path(root, path);
1536 if (range_end == (u64)-1)
1537 break;
1538 range_start = range_end + 1;
1539 }
1540
1541next_type:
1542 ret = 0;
1543 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1544 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1545 dir_key.type = BTRFS_DIR_INDEX_KEY;
1546 btrfs_release_path(root, path);
1547 goto again;
1548 }
1549out:
1550 btrfs_release_path(root, path);
1551 btrfs_free_path(log_path);
1552 iput(dir);
1553 return ret;
1554}
1555
1556/*
1557 * the process_func used to replay items from the log tree. This
1558 * gets called in two different stages. The first stage just looks
1559 * for inodes and makes sure they are all copied into the subvolume.
1560 *
1561 * The second stage copies all the other item types from the log into
1562 * the subvolume. The two stage approach is slower, but gets rid of
1563 * lots of complexity around inodes referencing other inodes that exist
1564 * only in the log (references come from either directory items or inode
1565 * back refs).
1566 */
1567static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1568 struct walk_control *wc, u64 gen)
1569{
1570 int nritems;
1571 struct btrfs_path *path;
1572 struct btrfs_root *root = wc->replay_dest;
1573 struct btrfs_key key;
1574 u32 item_size;
1575 int level;
1576 int i;
1577 int ret;
1578
1579 btrfs_read_buffer(eb, gen);
1580
1581 level = btrfs_header_level(eb);
1582
1583 if (level != 0)
1584 return 0;
1585
1586 path = btrfs_alloc_path();
1587 BUG_ON(!path);
1588
1589 nritems = btrfs_header_nritems(eb);
1590 for (i = 0; i < nritems; i++) {
1591 btrfs_item_key_to_cpu(eb, &key, i);
1592 item_size = btrfs_item_size_nr(eb, i);
1593
1594 /* inode keys are done during the first stage */
1595 if (key.type == BTRFS_INODE_ITEM_KEY &&
1596 wc->stage == LOG_WALK_REPLAY_INODES) {
1597 struct inode *inode;
1598 struct btrfs_inode_item *inode_item;
1599 u32 mode;
1600
1601 inode_item = btrfs_item_ptr(eb, i,
1602 struct btrfs_inode_item);
1603 mode = btrfs_inode_mode(eb, inode_item);
1604 if (S_ISDIR(mode)) {
1605 ret = replay_dir_deletes(wc->trans,
1606 root, log, path, key.objectid);
1607 BUG_ON(ret);
1608 }
1609 ret = overwrite_item(wc->trans, root, path,
1610 eb, i, &key);
1611 BUG_ON(ret);
1612
1613 /* for regular files, truncate away
1614 * extents past the new EOF
1615 */
1616 if (S_ISREG(mode)) {
1617 inode = read_one_inode(root,
1618 key.objectid);
1619 BUG_ON(!inode);
1620
1621 ret = btrfs_truncate_inode_items(wc->trans,
1622 root, inode, inode->i_size,
1623 BTRFS_EXTENT_DATA_KEY);
1624 BUG_ON(ret);
1625 iput(inode);
1626 }
1627 ret = link_to_fixup_dir(wc->trans, root,
1628 path, key.objectid);
1629 BUG_ON(ret);
1630 }
1631 if (wc->stage < LOG_WALK_REPLAY_ALL)
1632 continue;
1633
1634 /* these keys are simply copied */
1635 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1636 ret = overwrite_item(wc->trans, root, path,
1637 eb, i, &key);
1638 BUG_ON(ret);
1639 } else if (key.type == BTRFS_INODE_REF_KEY) {
1640 ret = add_inode_ref(wc->trans, root, log, path,
1641 eb, i, &key);
1642 BUG_ON(ret && ret != -ENOENT);
1643 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1644 ret = replay_one_extent(wc->trans, root, path,
1645 eb, i, &key);
1646 BUG_ON(ret);
1647 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1648 key.type == BTRFS_DIR_INDEX_KEY) {
1649 ret = replay_one_dir_item(wc->trans, root, path,
1650 eb, i, &key);
1651 BUG_ON(ret);
1652 }
1653 }
1654 btrfs_free_path(path);
1655 return 0;
1656}
1657
1658static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1659 struct btrfs_root *root,
1660 struct btrfs_path *path, int *level,
1661 struct walk_control *wc)
1662{
1663 u64 root_owner;
1664 u64 root_gen;
1665 u64 bytenr;
1666 u64 ptr_gen;
1667 struct extent_buffer *next;
1668 struct extent_buffer *cur;
1669 struct extent_buffer *parent;
1670 u32 blocksize;
1671 int ret = 0;
1672
1673 WARN_ON(*level < 0);
1674 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1675
1676 while (*level > 0) {
1677 WARN_ON(*level < 0);
1678 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1679 cur = path->nodes[*level];
1680
1681 if (btrfs_header_level(cur) != *level)
1682 WARN_ON(1);
1683
1684 if (path->slots[*level] >=
1685 btrfs_header_nritems(cur))
1686 break;
1687
1688 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1689 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1690 blocksize = btrfs_level_size(root, *level - 1);
1691
1692 parent = path->nodes[*level];
1693 root_owner = btrfs_header_owner(parent);
1694 root_gen = btrfs_header_generation(parent);
1695
1696 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1697
1698 wc->process_func(root, next, wc, ptr_gen);
1699
1700 if (*level == 1) {
1701 path->slots[*level]++;
1702 if (wc->free) {
1703 btrfs_read_buffer(next, ptr_gen);
1704
1705 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next);
1707 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next);
1709
1710 ret = btrfs_drop_leaf_ref(trans, root, next);
1711 BUG_ON(ret);
1712
1713 WARN_ON(root_owner !=
1714 BTRFS_TREE_LOG_OBJECTID);
1715 ret = btrfs_free_reserved_extent(root,
1716 bytenr, blocksize);
1717 BUG_ON(ret);
1718 }
1719 free_extent_buffer(next);
1720 continue;
1721 }
1722 btrfs_read_buffer(next, ptr_gen);
1723
1724 WARN_ON(*level <= 0);
1725 if (path->nodes[*level-1])
1726 free_extent_buffer(path->nodes[*level-1]);
1727 path->nodes[*level-1] = next;
1728 *level = btrfs_header_level(next);
1729 path->slots[*level] = 0;
1730 cond_resched();
1731 }
1732 WARN_ON(*level < 0);
1733 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1734
1735 if (path->nodes[*level] == root->node)
1736 parent = path->nodes[*level];
1737 else
1738 parent = path->nodes[*level + 1];
1739
1740 bytenr = path->nodes[*level]->start;
1741
1742 blocksize = btrfs_level_size(root, *level);
1743 root_owner = btrfs_header_owner(parent);
1744 root_gen = btrfs_header_generation(parent);
1745
1746 wc->process_func(root, path->nodes[*level], wc,
1747 btrfs_header_generation(path->nodes[*level]));
1748
1749 if (wc->free) {
1750 next = path->nodes[*level];
1751 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next);
1753 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next);
1755
1756 if (*level == 0) {
1757 ret = btrfs_drop_leaf_ref(trans, root, next);
1758 BUG_ON(ret);
1759 }
1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762 BUG_ON(ret);
1763 }
1764 free_extent_buffer(path->nodes[*level]);
1765 path->nodes[*level] = NULL;
1766 *level += 1;
1767
1768 cond_resched();
1769 return 0;
1770}
1771
1772static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root,
1774 struct btrfs_path *path, int *level,
1775 struct walk_control *wc)
1776{
1777 u64 root_owner;
1778 u64 root_gen;
1779 int i;
1780 int slot;
1781 int ret;
1782
1783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1784 slot = path->slots[i];
1785 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1786 struct extent_buffer *node;
1787 node = path->nodes[i];
1788 path->slots[i]++;
1789 *level = i;
1790 WARN_ON(*level == 0);
1791 return 0;
1792 } else {
1793 struct extent_buffer *parent;
1794 if (path->nodes[*level] == root->node)
1795 parent = path->nodes[*level];
1796 else
1797 parent = path->nodes[*level + 1];
1798
1799 root_owner = btrfs_header_owner(parent);
1800 root_gen = btrfs_header_generation(parent);
1801 wc->process_func(root, path->nodes[*level], wc,
1802 btrfs_header_generation(path->nodes[*level]));
1803 if (wc->free) {
1804 struct extent_buffer *next;
1805
1806 next = path->nodes[*level];
1807
1808 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next);
1810 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next);
1812
1813 if (*level == 0) {
1814 ret = btrfs_drop_leaf_ref(trans, root,
1815 next);
1816 BUG_ON(ret);
1817 }
1818
1819 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1820 ret = btrfs_free_reserved_extent(root,
1821 path->nodes[*level]->start,
1822 path->nodes[*level]->len);
1823 BUG_ON(ret);
1824 }
1825 free_extent_buffer(path->nodes[*level]);
1826 path->nodes[*level] = NULL;
1827 *level = i + 1;
1828 }
1829 }
1830 return 1;
1831}
1832
1833/*
1834 * drop the reference count on the tree rooted at 'snap'. This traverses
1835 * the tree freeing any blocks that have a ref count of zero after being
1836 * decremented.
1837 */
1838static int walk_log_tree(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *log, struct walk_control *wc)
1840{
1841 int ret = 0;
1842 int wret;
1843 int level;
1844 struct btrfs_path *path;
1845 int i;
1846 int orig_level;
1847
1848 path = btrfs_alloc_path();
1849 BUG_ON(!path);
1850
1851 level = btrfs_header_level(log->node);
1852 orig_level = level;
1853 path->nodes[level] = log->node;
1854 extent_buffer_get(log->node);
1855 path->slots[level] = 0;
1856
1857 while (1) {
1858 wret = walk_down_log_tree(trans, log, path, &level, wc);
1859 if (wret > 0)
1860 break;
1861 if (wret < 0)
1862 ret = wret;
1863
1864 wret = walk_up_log_tree(trans, log, path, &level, wc);
1865 if (wret > 0)
1866 break;
1867 if (wret < 0)
1868 ret = wret;
1869 }
1870
1871 /* was the root node processed? if not, catch it here */
1872 if (path->nodes[orig_level]) {
1873 wc->process_func(log, path->nodes[orig_level], wc,
1874 btrfs_header_generation(path->nodes[orig_level]));
1875 if (wc->free) {
1876 struct extent_buffer *next;
1877
1878 next = path->nodes[orig_level];
1879
1880 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next);
1882 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next);
1884
1885 if (orig_level == 0) {
1886 ret = btrfs_drop_leaf_ref(trans, log,
1887 next);
1888 BUG_ON(ret);
1889 }
1890 WARN_ON(log->root_key.objectid !=
1891 BTRFS_TREE_LOG_OBJECTID);
1892 ret = btrfs_free_reserved_extent(log, next->start,
1893 next->len);
1894 BUG_ON(ret);
1895 }
1896 }
1897
1898 for (i = 0; i <= orig_level; i++) {
1899 if (path->nodes[i]) {
1900 free_extent_buffer(path->nodes[i]);
1901 path->nodes[i] = NULL;
1902 }
1903 }
1904 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret;
1908}
1909
1910static int wait_log_commit(struct btrfs_root *log)
1911{
1912 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid;
1914
1915 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1917 TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit))
1920 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid &&
1924 atomic_read(&log->fs_info->tree_log_commit));
1925 return 0;
1926}
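/*
 * A minimal sketch, not part of the original patch: the loop above is the
 * standard open-coded waitqueue pattern. The stand-alone version below
 * (hypothetical names my_lock/my_wait/my_cond; <linux/wait.h>,
 * <linux/sched.h> and <linux/mutex.h> assumed) shows the same shape:
 * requeue on the wait queue, re-check the condition, drop the lock before
 * sleeping so the committer can make progress, and retake it before
 * re-testing.
 */
static void wait_on_cond_sketch(struct mutex *my_lock,
				wait_queue_head_t *my_wait,
				atomic_t *my_cond)
{
	DEFINE_WAIT(wait);

	while (atomic_read(my_cond)) {
		prepare_to_wait(my_wait, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(my_lock);
		if (atomic_read(my_cond))
			schedule();
		finish_wait(my_wait, &wait);
		mutex_lock(my_lock);
	}
}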
1927
1928/*
1929 * btrfs_sync_log sends a given tree log down to the disk and
1930 * updates the super blocks to record it. When this call is done,
1931 * you know that any inodes previously logged are safely on disk
1932 */
1933int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root)
1935{
1936 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root;
1939
1940 mutex_lock(&log->fs_info->tree_log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) {
1942 wait_log_commit(log);
1943 goto out;
1944 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1);
1946
1947 while (1) {
1948 batch = log->fs_info->tree_log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex);
1950 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex);
1952
1953 while (atomic_read(&log->fs_info->tree_log_writers)) {
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break;
1965 }
1966
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1970 &root->fs_info->log_root_tree->dirty_log_pages);
1971 BUG_ON(ret);
1972
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node));
1977
1978 write_ctree_super(trans, log->fs_info->tree_root, 2);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait))
1984 wake_up(&log->fs_info->tree_log_wait);
1985out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex);
1987 return 0;
1988}
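/*
 * A sketch, not from the original patch, of the writer-side protocol the
 * batch loop above waits on: join_running_log_trans() and end_log_trans()
 * (defined earlier in this file) bump tree_log_writers and tree_log_batch,
 * so btrfs_sync_log() spins until the batch counter stops moving and the
 * writer count drains to zero before writing the log.
 */
static void log_writer_sketch(struct btrfs_root *root)
{
	/* returns 0 only while a log transaction is running */
	if (join_running_log_trans(root) == 0) {
		/* ... insert items into root->log_root here ... */
		end_log_trans(root);
	}
}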
1989
1990/* free all the extents used by the tree log. This should be called
1991 * at commit time of the full transaction
1992 */
1993int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
1994{
1995 int ret;
1996 struct btrfs_root *log;
1998 u64 start;
1999 u64 end;
2000 struct walk_control wc = {
2001 .free = 1,
2002 .process_func = process_one_buffer
2003 };
2004
2005 if (!root->log_root || root->fs_info->log_root_recovering)
2006 return 0;
2007
2008 log = root->log_root;
2009 ret = walk_log_tree(trans, log, &wc);
2010 BUG_ON(ret);
2011
2012 while (1) {
2013 ret = find_first_extent_bit(&log->dirty_log_pages,
2014 0, &start, &end, EXTENT_DIRTY);
2015 if (ret)
2016 break;
2017
2018 clear_extent_dirty(&log->dirty_log_pages,
2019 start, end, GFP_NOFS);
2020 }
2021
2022 log = root->log_root;
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key);
2025 BUG_ON(ret);
2026 root->log_root = NULL;
2027 kfree(log);
2028 return 0;
2029}
2030
2031/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
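/*
 * Note, not in the original patch: the early return above means the root
 * item in the log-root tree is only rewritten when COW has actually moved
 * log->node since the last update, so repeated fsyncs within one
 * transaction avoid redundant root-item updates.
 */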
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners:
2056 *
2057 * create file X in dir Y
2058 * link file X to X.link in dir Y
2059 * fsync file X
2060 * unlink file X but leave X.link
2061 * fsync dir Y
2062 *
2063 * After a crash we would expect only X.link to exist. But file X
2064 * didn't get fsync'd again so the log has back refs for X and X.link.
2065 *
2066 * We solve this by removing directory entries and inode backrefs from the
2067 * log when a file that was logged in the current transaction is
2068 * unlinked. Any later fsync will include the updated log entries, and
2069 * we'll be able to reconstruct the proper directory items from backrefs.
2070 *
2071 * This optimization allows us to avoid relogging the entire inode
2072 * or the entire directory.
2073 */
2074int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2075 struct btrfs_root *root,
2076 const char *name, int name_len,
2077 struct inode *dir, u64 index)
2078{
2079 struct btrfs_root *log;
2080 struct btrfs_dir_item *di;
2081 struct btrfs_path *path;
2082 int ret;
2083 int bytes_del = 0;
2084
2085 if (BTRFS_I(dir)->logged_trans < trans->transid)
2086 return 0;
2087
2088 ret = join_running_log_trans(root);
2089 if (ret)
2090 return 0;
2091
2092 mutex_lock(&BTRFS_I(dir)->log_mutex);
2093
2094 log = root->log_root;
2095 path = btrfs_alloc_path();
2096 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2097 name, name_len, -1);
2098 if (di && !IS_ERR(di)) {
2099 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2100 bytes_del += name_len;
2101 BUG_ON(ret);
2102 }
2103 btrfs_release_path(log, path);
2104 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2105 index, name, name_len, -1);
2106 if (di && !IS_ERR(di)) {
2107 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2108 bytes_del += name_len;
2109 BUG_ON(ret);
2110 }
2111
2112 /* update the directory size in the log to reflect the names
2113 * we have removed
2114 */
2115 if (bytes_del) {
2116 struct btrfs_key key;
2117
2118 key.objectid = dir->i_ino;
2119 key.offset = 0;
2120 key.type = BTRFS_INODE_ITEM_KEY;
2121 btrfs_release_path(log, path);
2122
2123 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2124 if (ret == 0) {
2125 struct btrfs_inode_item *item;
2126 u64 i_size;
2127
2128 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2129 struct btrfs_inode_item);
2130 i_size = btrfs_inode_size(path->nodes[0], item);
2131 if (i_size > bytes_del)
2132 i_size -= bytes_del;
2133 else
2134 i_size = 0;
2135 btrfs_set_inode_size(path->nodes[0], item, i_size);
2136 btrfs_mark_buffer_dirty(path->nodes[0]);
2137 } else
2138 ret = 0;
2139 btrfs_release_path(log, path);
2140 }
2141
2142 btrfs_free_path(path);
2143 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2144 end_log_trans(root);
2145
2146 return 0;
2147}
2148
2149/* see comments for btrfs_del_dir_entries_in_log */
2150int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2151 struct btrfs_root *root,
2152 const char *name, int name_len,
2153 struct inode *inode, u64 dirid)
2154{
2155 struct btrfs_root *log;
2156 u64 index;
2157 int ret;
2158
2159 if (BTRFS_I(inode)->logged_trans < trans->transid)
2160 return 0;
2161
2162 ret = join_running_log_trans(root);
2163 if (ret)
2164 return 0;
2165 log = root->log_root;
2166 mutex_lock(&BTRFS_I(inode)->log_mutex);
2167
2168 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2169 dirid, &index);
2170 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2171 end_log_trans(root);
2172
2173 return ret;
2174}
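/*
 * A usage sketch, not from the original patch: the unlink path (the real
 * caller lives in fs/btrfs/inode.c) is expected to prune both the
 * directory entries and the inode backref from the log after removing the
 * name from the subvolume, so replay cannot resurrect the deleted name.
 */
static void unlink_log_prune_sketch(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct inode *dir, struct inode *inode,
				    const char *name, int name_len, u64 index)
{
	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
				   dir->i_ino);
}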
2175
2176/*
2177 * creates a range item in the log for 'dirid'. first_offset and
2178 * last_offset tell us which parts of the key space the log should
2179 * be considered authoritative for.
2180 */
2181static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2182 struct btrfs_root *log,
2183 struct btrfs_path *path,
2184 int key_type, u64 dirid,
2185 u64 first_offset, u64 last_offset)
2186{
2187 int ret;
2188 struct btrfs_key key;
2189 struct btrfs_dir_log_item *item;
2190
2191 key.objectid = dirid;
2192 key.offset = first_offset;
2193 if (key_type == BTRFS_DIR_ITEM_KEY)
2194 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2195 else
2196 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2197 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2198 BUG_ON(ret);
2199
2200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2201 struct btrfs_dir_log_item);
2202 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2203 btrfs_mark_buffer_dirty(path->nodes[0]);
2204 btrfs_release_path(log, path);
2205 return 0;
2206}
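/*
 * Illustration, not in the original patch: a range item such as
 *   key = (dirid, BTRFS_DIR_LOG_INDEX_KEY, first_offset),
 *   dir_log_end = last_offset
 * tells replay that the log is authoritative for dir index offsets in
 * [first_offset, last_offset]; a key in that range present in the
 * subvolume but absent from the log must have been unlinked during the
 * transaction.
 */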
2207
2208/*
2209 * log all the items included in the current transaction for a given
2210 * directory. This also creates the range items in the log tree required
2211 * to replay anything deleted before the fsync
2212 */
2213static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2214 struct btrfs_root *root, struct inode *inode,
2215 struct btrfs_path *path,
2216 struct btrfs_path *dst_path, int key_type,
2217 u64 min_offset, u64 *last_offset_ret)
2218{
2219 struct btrfs_key min_key;
2220 struct btrfs_key max_key;
2221 struct btrfs_root *log = root->log_root;
2222 struct extent_buffer *src;
2223 int ret;
2224 int i;
2225 int nritems;
2226 u64 first_offset = min_offset;
2227 u64 last_offset = (u64)-1;
2228
2229 log = root->log_root;
2230 max_key.objectid = inode->i_ino;
2231 max_key.offset = (u64)-1;
2232 max_key.type = key_type;
2233
2234 min_key.objectid = inode->i_ino;
2235 min_key.type = key_type;
2236 min_key.offset = min_offset;
2237
2238 path->keep_locks = 1;
2239
2240 ret = btrfs_search_forward(root, &min_key, &max_key,
2241 path, 0, trans->transid);
2242
2243 /*
2244 * we didn't find anything from this transaction, see if there
2245 * is anything at all
2246 */
2247 if (ret != 0 || min_key.objectid != inode->i_ino ||
2248 min_key.type != key_type) {
2249 min_key.objectid = inode->i_ino;
2250 min_key.type = key_type;
2251 min_key.offset = (u64)-1;
2252 btrfs_release_path(root, path);
2253 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2254 if (ret < 0) {
2255 btrfs_release_path(root, path);
2256 return ret;
2257 }
2258 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2259
2260 /* if ret == 0 there are items for this type,
2261 * create a range to tell us the last key of this type.
2262 * otherwise, there are no items in this directory after
2263 * *min_offset, and we create a range to indicate that.
2264 */
2265 if (ret == 0) {
2266 struct btrfs_key tmp;
2267 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2268 path->slots[0]);
2269 if (key_type == tmp.type)
2270 first_offset = max(min_offset, tmp.offset) + 1;
2271 }
2272 goto done;
2273 }
2274
2275 /* go backward to find any previous key */
2276 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2277 if (ret == 0) {
2278 struct btrfs_key tmp;
2279 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2280 if (key_type == tmp.type) {
2281 first_offset = tmp.offset;
2282 ret = overwrite_item(trans, log, dst_path,
2283 path->nodes[0], path->slots[0],
2284 &tmp);
2285 }
2286 }
2287 btrfs_release_path(root, path);
2288
2289 /* find the first key from this transaction again */
2290 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2291 if (ret != 0) {
2292 WARN_ON(1);
2293 goto done;
2294 }
2295
2296 /*
2297 * we have a block from this transaction, log every item in it
2298 * from our directory
2299 */
2300 while (1) {
2301 struct btrfs_key tmp;
2302 src = path->nodes[0];
2303 nritems = btrfs_header_nritems(src);
2304 for (i = path->slots[0]; i < nritems; i++) {
2305 btrfs_item_key_to_cpu(src, &min_key, i);
2306
2307 if (min_key.objectid != inode->i_ino ||
2308 min_key.type != key_type)
2309 goto done;
2310 ret = overwrite_item(trans, log, dst_path, src, i,
2311 &min_key);
2312 BUG_ON(ret);
2313 }
2314 path->slots[0] = nritems;
2315
2316 /*
2317 * look ahead to the next item and see if it is also
2318 * from this directory and from this transaction
2319 */
2320 ret = btrfs_next_leaf(root, path);
2321 if (ret == 1) {
2322 last_offset = (u64)-1;
2323 goto done;
2324 }
2325 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2326 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2327 last_offset = (u64)-1;
2328 goto done;
2329 }
2330 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2331 ret = overwrite_item(trans, log, dst_path,
2332 path->nodes[0], path->slots[0],
2333 &tmp);
2334
2335 BUG_ON(ret);
2336 last_offset = tmp.offset;
2337 goto done;
2338 }
2339 }
2340done:
2341 *last_offset_ret = last_offset;
2342 btrfs_release_path(root, path);
2343 btrfs_release_path(log, dst_path);
2344
2345 /* insert the log range keys to indicate where the log is valid */
2346 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2347 first_offset, last_offset);
2348 BUG_ON(ret);
2349 return 0;
2350}
2351
2352/*
2353 * logging directories is very similar to logging inodes. We find all the items
2354 * from the current transaction and write them to the log.
2355 *
2356 * The recovery code scans the directory in the subvolume, and if it finds a
2357 * key in the range logged that is not present in the log tree, then it means
2358 * that dir entry was unlinked during the transaction.
2359 *
2360 * In order for that scan to work, we must include one key smaller than
2361 * the smallest key logged by this transaction and one key larger than the largest
2362 * key logged by this transaction.
2363 */
2364static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2365 struct btrfs_root *root, struct inode *inode,
2366 struct btrfs_path *path,
2367 struct btrfs_path *dst_path)
2368{
2369 u64 min_key;
2370 u64 max_key;
2371 int ret;
2372 int key_type = BTRFS_DIR_ITEM_KEY;
2373
2374again:
2375 min_key = 0;
2376 max_key = 0;
2377 while (1) {
2378 ret = log_dir_items(trans, root, inode, path,
2379 dst_path, key_type, min_key,
2380 &max_key);
2381 BUG_ON(ret);
2382 if (max_key == (u64)-1)
2383 break;
2384 min_key = max_key + 1;
2385 }
2386
2387 if (key_type == BTRFS_DIR_ITEM_KEY) {
2388 key_type = BTRFS_DIR_INDEX_KEY;
2389 goto again;
2390 }
2391 return 0;
2392}
2393
2394/*
2395 * a helper function to drop items from the log before we relog an
2396 * inode. max_key_type indicates the highest item type to remove.
2397 * This cannot be run for file data extents because it does not
2398 * free the extents they point to.
2399 */
2400static int drop_objectid_items(struct btrfs_trans_handle *trans,
2401 struct btrfs_root *log,
2402 struct btrfs_path *path,
2403 u64 objectid, int max_key_type)
2404{
2405 int ret;
2406 struct btrfs_key key;
2407 struct btrfs_key found_key;
2408
2409 key.objectid = objectid;
2410 key.type = max_key_type;
2411 key.offset = (u64)-1;
2412
2413 while (1) {
2414 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2415
2416 if (ret != 1)
2417 break;
2418
2419 if (path->slots[0] == 0)
2420 break;
2421
2422 path->slots[0]--;
2423 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2424 path->slots[0]);
2425
2426 if (found_key.objectid != objectid)
2427 break;
2428
2429 ret = btrfs_del_item(trans, log, path);
2430 BUG_ON(ret);
2431 btrfs_release_path(log, path);
2432 }
2433 btrfs_release_path(log, path);
2434 return 0;
2435}
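/*
 * Note on the loop above, not in the original patch: searching for
 * (objectid, max_key_type, (u64)-1) never finds an exact match, so the
 * loop steps back one slot to the greatest item sorting before that key
 * and deletes it, repeating until the slot's objectid changes. That clears
 * every logged item for the inode with a type <= max_key_type.
 */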
2436
2437static noinline int copy_items(struct btrfs_trans_handle *trans,
2438 struct btrfs_root *log,
2439 struct btrfs_path *dst_path,
2440 struct extent_buffer *src,
2441 int start_slot, int nr, int inode_only)
2442{
2443 unsigned long src_offset;
2444 unsigned long dst_offset;
2445 struct btrfs_file_extent_item *extent;
2446 struct btrfs_inode_item *inode_item;
2447 int ret;
2448 struct btrfs_key *ins_keys;
2449 u32 *ins_sizes;
2450 char *ins_data;
2451 int i;
2452 struct list_head ordered_sums;
2453
2454 INIT_LIST_HEAD(&ordered_sums);
2455
2456 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2457 nr * sizeof(u32), GFP_NOFS);
2458 ins_sizes = (u32 *)ins_data;
2459 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2460
2461 for (i = 0; i < nr; i++) {
2462 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2463 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2464 }
2465 ret = btrfs_insert_empty_items(trans, log, dst_path,
2466 ins_keys, ins_sizes, nr);
2467 BUG_ON(ret);
2468
2469 for (i = 0; i < nr; i++) {
2470 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2471 dst_path->slots[0]);
2472
2473 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2474
2475 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2476 src_offset, ins_sizes[i]);
2477
2478 if (inode_only == LOG_INODE_EXISTS &&
2479 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2480 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2481 dst_path->slots[0],
2482 struct btrfs_inode_item);
2483 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2484
2485 /* set the generation to zero so the recovery code
2486 * can tell the difference between logging
2487 * just to say 'this inode exists' and logging
2488 * to say 'update this inode with these values'
2489 */
2490 btrfs_set_inode_generation(dst_path->nodes[0],
2491 inode_item, 0);
2492 }
2493 /* take a reference on file data extents so that truncates
2494 * or deletes of this inode don't have to relog the inode
2495 * again
2496 */
2497 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2498 int found_type;
2499 extent = btrfs_item_ptr(src, start_slot + i,
2500 struct btrfs_file_extent_item);
2501
2502 found_type = btrfs_file_extent_type(src, extent);
2503 if (found_type == BTRFS_FILE_EXTENT_REG ||
2504 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2505 u64 ds = btrfs_file_extent_disk_bytenr(src,
2506 extent);
2507 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2508 extent);
2509 u64 cs = btrfs_file_extent_offset(src, extent);
2510 u64 cl = btrfs_file_extent_num_bytes(src,
2511 extent);
2512 if (btrfs_file_extent_compression(src,
2513 extent)) {
2514 cs = 0;
2515 cl = dl;
2516 }
2517 /* ds == 0 is a hole */
2518 if (ds != 0) {
2519 ret = btrfs_inc_extent_ref(trans, log,
2520 ds, dl,
2521 dst_path->nodes[0]->start,
2522 BTRFS_TREE_LOG_OBJECTID,
2523 trans->transid,
2524 ins_keys[i].objectid);
2525 BUG_ON(ret);
2526 ret = btrfs_lookup_csums_range(
2527 log->fs_info->csum_root,
2528 ds + cs, ds + cs + cl - 1,
2529 &ordered_sums);
2530 BUG_ON(ret);
2531 }
2532 }
2533 }
2534 dst_path->slots[0]++;
2535 }
2536
2537 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2538 btrfs_release_path(log, dst_path);
2539 kfree(ins_data);
2540
2541 /*
2542 * we have to do this after the loop above to avoid changing the
2543 * log tree while trying to change the log tree.
2544 */
2545 while (!list_empty(&ordered_sums)) {
2546 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2547 struct btrfs_ordered_sum,
2548 list);
2549 ret = btrfs_csum_file_blocks(trans, log, sums);
2550 BUG_ON(ret);
2551 list_del(&sums->list);
2552 kfree(sums);
2553 }
2554 return 0;
2555}
2556
2557/* log a single inode in the tree log.
2558 * At least one parent directory for this inode must exist in the tree
2559 * or be logged already.
2560 *
2561 * Any items from this inode changed by the current transaction are copied
2562 * to the log tree. An extra reference is taken on any extents in this
2563 * file, allowing us to avoid a whole pile of corner cases around logging
2564 * blocks that have been removed from the tree.
2565 *
2566 * See LOG_INODE_ALL and related defines for a description of what inode_only
2567 * does.
2568 *
2569 * This handles both files and directories.
2570 */
2571static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2572 struct btrfs_root *root, struct inode *inode,
2573 int inode_only)
2574{
2575 struct btrfs_path *path;
2576 struct btrfs_path *dst_path;
2577 struct btrfs_key min_key;
2578 struct btrfs_key max_key;
2579 struct btrfs_root *log = root->log_root;
2580 struct extent_buffer *src = NULL;
2581 u32 size;
2582 int ret;
2583 int nritems;
2584 int ins_start_slot = 0;
2585 int ins_nr;
2586
2587 log = root->log_root;
2588
2589 path = btrfs_alloc_path();
2590 dst_path = btrfs_alloc_path();
2591
2592 min_key.objectid = inode->i_ino;
2593 min_key.type = BTRFS_INODE_ITEM_KEY;
2594 min_key.offset = 0;
2595
2596 max_key.objectid = inode->i_ino;
2597 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2598 max_key.type = BTRFS_XATTR_ITEM_KEY;
2599 else
2600 max_key.type = (u8)-1;
2601 max_key.offset = (u64)-1;
2602
2603 /*
2604 * if this inode has already been logged and we're in inode_only
2605 * mode, we don't want to delete the things that have already
2606 * been written to the log.
2607 *
2608 * But, if the inode has been through an inode_only log,
2609 * the logged_trans field is not set. This allows us to catch
2610 * any new names for this inode in the backrefs by logging it
2611 * again
2612 */
2613 if (inode_only == LOG_INODE_EXISTS &&
2614 BTRFS_I(inode)->logged_trans == trans->transid) {
2615 btrfs_free_path(path);
2616 btrfs_free_path(dst_path);
2617 goto out;
2618 }
2619 mutex_lock(&BTRFS_I(inode)->log_mutex);
2620
2621 /*
2622 * a brute-force approach to making sure we get the most up-to-date
2623 * copies of everything.
2624 */
2625 if (S_ISDIR(inode->i_mode)) {
2626 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2627
2628 if (inode_only == LOG_INODE_EXISTS)
2629 max_key_type = BTRFS_XATTR_ITEM_KEY;
2630 ret = drop_objectid_items(trans, log, path,
2631 inode->i_ino, max_key_type);
2632 } else {
2633 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2634 }
2635 BUG_ON(ret);
2636 path->keep_locks = 1;
2637
2638 while (1) {
2639 ins_nr = 0;
2640 ret = btrfs_search_forward(root, &min_key, &max_key,
2641 path, 0, trans->transid);
2642 if (ret != 0)
2643 break;
2644again:
2645 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2646 if (min_key.objectid != inode->i_ino)
2647 break;
2648 if (min_key.type > max_key.type)
2649 break;
2650
2651 src = path->nodes[0];
2652 size = btrfs_item_size_nr(src, path->slots[0]);
2653 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2654 ins_nr++;
2655 goto next_slot;
2656 } else if (!ins_nr) {
2657 ins_start_slot = path->slots[0];
2658 ins_nr = 1;
2659 goto next_slot;
2660 }
2661
2662 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2663 ins_nr, inode_only);
2664 BUG_ON(ret);
2665 ins_nr = 1;
2666 ins_start_slot = path->slots[0];
2667next_slot:
2668
2669 nritems = btrfs_header_nritems(path->nodes[0]);
2670 path->slots[0]++;
2671 if (path->slots[0] < nritems) {
2672 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2673 path->slots[0]);
2674 goto again;
2675 }
2676 if (ins_nr) {
2677 ret = copy_items(trans, log, dst_path, src,
2678 ins_start_slot,
2679 ins_nr, inode_only);
2680 BUG_ON(ret);
2681 ins_nr = 0;
2682 }
2683 btrfs_release_path(root, path);
2684
2685 if (min_key.offset < (u64)-1)
2686 min_key.offset++;
2687 else if (min_key.type < (u8)-1)
2688 min_key.type++;
2689 else if (min_key.objectid < (u64)-1)
2690 min_key.objectid++;
2691 else
2692 break;
2693 }
2694 if (ins_nr) {
2695 ret = copy_items(trans, log, dst_path, src,
2696 ins_start_slot,
2697 ins_nr, inode_only);
2698 BUG_ON(ret);
2699 ins_nr = 0;
2700 }
2701 WARN_ON(ins_nr);
2702 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2703 btrfs_release_path(root, path);
2704 btrfs_release_path(log, dst_path);
2705 BTRFS_I(inode)->log_dirty_trans = 0;
2706 ret = log_directory_changes(trans, root, inode, path, dst_path);
2707 BUG_ON(ret);
2708 }
2709 BTRFS_I(inode)->logged_trans = trans->transid;
2710 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2711
2712 btrfs_free_path(path);
2713 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out:
2720 return 0;
2721}
2722
2723int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 struct btrfs_root *root, struct inode *inode,
2725 int inode_only)
2726{
2727 int ret;
2728
2729 start_log_trans(trans, root);
2730 ret = __btrfs_log_inode(trans, root, inode, inode_only);
2731 end_log_trans(root);
2732 return ret;
2733}
2734
2735/*
2736 * helper function around btrfs_log_inode to make sure newly created
2737 * parent directories also end up in the log. A minimal inode-and-backref-only
2738 * logging pass is done for any parent directories that are newer than
2739 * the last committed transaction
2740 */
2741int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2742 struct btrfs_root *root, struct dentry *dentry)
2743{
2744 int inode_only = LOG_INODE_ALL;
2745 struct super_block *sb;
2746 int ret;
2747
2748 start_log_trans(trans, root);
2749 sb = dentry->d_inode->i_sb;
2750 while (1) {
2751 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2752 inode_only);
2753 BUG_ON(ret);
2754 inode_only = LOG_INODE_EXISTS;
2755
2756 dentry = dentry->d_parent;
2757 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2758 break;
2759
2760 if (BTRFS_I(dentry->d_inode)->generation <=
2761 root->fs_info->last_trans_committed)
2762 break;
2763 }
2764 end_log_trans(root);
2765 return 0;
2766}
2767
2768/*
2769 * it is not safe to log dentry if the chunk root has added new
2770 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2771 * If this returns 1, you must commit the transaction to safely get your
2772 * data on disk.
2773 */
2774int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2775 struct btrfs_root *root, struct dentry *dentry)
2776{
2777 u64 gen;
2778 gen = root->fs_info->last_trans_new_blockgroup;
2779 if (gen > root->fs_info->last_trans_committed)
2780 return 1;
2781 else
2782 return btrfs_log_dentry(trans, root, dentry);
2783}
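/*
 * A sketch of the expected fsync-path usage, not part of this patch (the
 * real caller is btrfs_sync_file() in fs/btrfs/file.c); error handling is
 * elided. Per the comment above, a return of 1 from
 * btrfs_log_dentry_safe() forces a full transaction commit instead.
 */
static int fsync_usage_sketch(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, struct dentry *dentry)
{
	int ret;

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret > 0)
		return btrfs_commit_transaction(trans, root);

	btrfs_sync_log(trans, root);
	return btrfs_end_transaction(trans, root);
}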
2784
2785/*
2786 * should be called during mount to recover and replay any log trees
2787 * from the FS
2788 */
2789int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2790{
2791 int ret;
2792 struct btrfs_path *path;
2793 struct btrfs_trans_handle *trans;
2794 struct btrfs_key key;
2795 struct btrfs_key found_key;
2796 struct btrfs_key tmp_key;
2797 struct btrfs_root *log;
2798 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2799 u64 highest_inode;
2800 struct walk_control wc = {
2801 .process_func = process_one_buffer,
2802 .stage = 0,
2803 };
2804
2805 fs_info->log_root_recovering = 1;
2806 path = btrfs_alloc_path();
2807 BUG_ON(!path);
2808
2809 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2810
2811 wc.trans = trans;
2812 wc.pin = 1;
2813
2814 walk_log_tree(trans, log_root_tree, &wc);
2815
2816again:
2817 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2818 key.offset = (u64)-1;
2819 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2820
2821 while (1) {
2822 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2823 if (ret < 0)
2824 break;
2825 if (ret > 0) {
2826 if (path->slots[0] == 0)
2827 break;
2828 path->slots[0]--;
2829 }
2830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2831 path->slots[0]);
2832 btrfs_release_path(log_root_tree, path);
2833 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2834 break;
2835
2836 log = btrfs_read_fs_root_no_radix(log_root_tree,
2837 &found_key);
2838 BUG_ON(!log);
2839
2840
2841 tmp_key.objectid = found_key.offset;
2842 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2843 tmp_key.offset = (u64)-1;
2844
2845 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2846 BUG_ON(!wc.replay_dest);
2847
2848 wc.replay_dest->log_root = log;
2849 btrfs_record_root_in_trans(wc.replay_dest);
2850 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret);
2852
2853 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2854 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2855 path);
2856 BUG_ON(ret);
2857 }
2858 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2859 if (ret == 0) {
2860 wc.replay_dest->highest_inode = highest_inode;
2861 wc.replay_dest->last_inode_alloc = highest_inode;
2862 }
2863
2864 key.offset = found_key.offset - 1;
2865 wc.replay_dest->log_root = NULL;
2866 free_extent_buffer(log->node);
2867 kfree(log);
2868
2869 if (found_key.offset == 0)
2870 break;
2871 }
2872 btrfs_release_path(log_root_tree, path);
2873
2874 /* step one is to pin it all, step two is to replay just inodes */
2875 if (wc.pin) {
2876 wc.pin = 0;
2877 wc.process_func = replay_one_buffer;
2878 wc.stage = LOG_WALK_REPLAY_INODES;
2879 goto again;
2880 }
2881 /* step three is to replay everything */
2882 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2883 wc.stage++;
2884 goto again;
2885 }
2886
2887 btrfs_free_path(path);
2888
2889 free_extent_buffer(log_root_tree->node);
2890 log_root_tree->log_root = NULL;
2891 fs_info->log_root_recovering = 0;
2892
2893 /* step 4: commit the transaction, which also unpins the blocks */
2894 btrfs_commit_transaction(trans, fs_info->tree_root);
2895
2896 kfree(log_root_tree);
2897 return 0;
2898}
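/*
 * Recap of the replay pipeline above: the first walk pins every log block
 * so nothing is reallocated mid-replay, the second pass replays inode
 * items, the third pass replays everything else, and the final transaction
 * commit unpins the blocks.
 */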
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..1ca1952fd917
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
1#!/bin/bash
2#
3# determine-version -- report a useful version for releases
4#
5# Copyright 2008, Aron Griffis <agriffis@n01se.net>
6# Copyright 2008, Oracle
7# Released under the GNU GPLv2
8
9v="v0.16"
10
11which git &> /dev/null
12if [ $? == 0 ]; then
13 git branch >& /dev/null
14 if [ $? == 0 ]; then
15 if head=`git rev-parse --verify HEAD 2>/dev/null`; then
16 if tag=`git describe --tags 2>/dev/null`; then
17 v="$tag"
18 fi
19
20 # Are there uncommitted changes?
21 git update-index --refresh --unmerged > /dev/null
22 if git diff-index --name-only HEAD | \
23 grep -v "^scripts/package" \
24 | read dummy; then
25 v="$v"-dirty
26 fi
27 fi
28 fi
29fi
30
31echo "#ifndef __BUILD_VERSION" > .build-version.h
32echo "#define __BUILD_VERSION" >> .build-version.h
33echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
34echo "#endif" >> .build-version.h
35
36diff -q version.h .build-version.h >& /dev/null
37
38if [ $? == 0 ]; then
39 rm .build-version.h
40 exit 0
41fi
42
43mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..b187b537888e
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3218 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h>
25#include "compat.h"
26#include "ctree.h"
27#include "extent_map.h"
28#include "disk-io.h"
29#include "transaction.h"
30#include "print-tree.h"
31#include "volumes.h"
32#include "async-thread.h"
33
34struct map_lookup {
35 u64 type;
36 int io_align;
37 int io_width;
38 int stripe_len;
39 int sector_size;
40 int num_stripes;
41 int sub_stripes;
42 struct btrfs_bio_stripe stripes[];
43};
44
45static int init_first_rw_device(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root,
47 struct btrfs_device *device);
48static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
49
50#define map_lookup_size(n) (sizeof(struct map_lookup) + \
51 (sizeof(struct btrfs_bio_stripe) * (n)))
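/*
 * A minimal sketch, not in the original patch, of how map_lookup_size()
 * pairs with the flexible stripes[] array above: one allocation covers the
 * header plus num_stripes trailing btrfs_bio_stripe entries.
 */
static struct map_lookup *alloc_map_lookup_sketch(int num_stripes)
{
	struct map_lookup *map;

	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (map)
		map->num_stripes = num_stripes;
	return map;
}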
52
53static DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids);
55
56void btrfs_lock_volumes(void)
57{
58 mutex_lock(&uuid_mutex);
59}
60
61void btrfs_unlock_volumes(void)
62{
63 mutex_unlock(&uuid_mutex);
64}
65
66static void lock_chunks(struct btrfs_root *root)
67{
68 mutex_lock(&root->fs_info->chunk_mutex);
69}
70
71static void unlock_chunks(struct btrfs_root *root)
72{
73 mutex_unlock(&root->fs_info->chunk_mutex);
74}
75
76static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
77{
78 struct btrfs_device *device;
79 WARN_ON(fs_devices->opened);
80 while (!list_empty(&fs_devices->devices)) {
81 device = list_entry(fs_devices->devices.next,
82 struct btrfs_device, dev_list);
83 list_del(&device->dev_list);
84 kfree(device->name);
85 kfree(device);
86 }
87 kfree(fs_devices);
88}
89
90int btrfs_cleanup_fs_uuids(void)
91{
92 struct btrfs_fs_devices *fs_devices;
93
94 while (!list_empty(&fs_uuids)) {
95 fs_devices = list_entry(fs_uuids.next,
96 struct btrfs_fs_devices, list);
97 list_del(&fs_devices->list);
98 free_fs_devices(fs_devices);
99 }
100 return 0;
101}
102
103static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid)
105{
106 struct btrfs_device *dev;
107 struct list_head *cur;
108
109 list_for_each(cur, head) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev;
114 }
115 }
116 return NULL;
117}
118
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices;
123
124 list_for_each(cur, &fs_uuids) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices;
128 }
129 return NULL;
130}
131
132/*
133 * we try to collect pending bios for a device so we don't get a large
134 * number of procs sending bios down to the same device. This greatly
135 * improves the scheduler's ability to collect and merge the bios.
136 *
137 * But, it also turns into a long list of bios to process and that is sure
138 * to eventually make the worker thread block. The solution here is to
139 * make some progress and then put this work struct back at the end of
140 * the list if the block device is congested. This way, multiple devices
141 * can make progress from a single worker thread.
142 */
143static noinline int run_scheduled_bios(struct btrfs_device *device)
144{
145 struct bio *pending;
146 struct backing_dev_info *bdi;
147 struct btrfs_fs_info *fs_info;
148 struct bio *tail;
149 struct bio *cur;
150 int again = 0;
151 unsigned long num_run = 0;
152 unsigned long limit;
153
154 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
155 fs_info = device->dev_root->fs_info;
156 limit = btrfs_async_submit_limit(fs_info);
157 limit = limit * 2 / 3;
158
159loop:
160 spin_lock(&device->io_lock);
161
162 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted
165 * into the list if we hit congestion
166 */
167 pending = device->pending_bios;
168 tail = device->pending_bio_tail;
169 WARN_ON(pending && !tail);
170 device->pending_bios = NULL;
171 device->pending_bio_tail = NULL;
172
173 /*
174 * if pending was null this time around, no bios need processing
175 * at all and we can stop. Otherwise it'll loop back up again
176 * and do an additional check so no bios are missed.
177 *
178 * device->running_pending is used to synchronize with the
179 * schedule_bio code.
180 */
181 if (pending) {
182 again = 1;
183 device->running_pending = 1;
184 } else {
185 again = 0;
186 device->running_pending = 0;
187 }
188 spin_unlock(&device->io_lock);
189
190 while (pending) {
191 cur = pending;
192 pending = pending->bi_next;
193 cur->bi_next = NULL;
194 atomic_dec(&fs_info->nr_async_bios);
195
196 if (atomic_read(&fs_info->nr_async_bios) < limit &&
197 waitqueue_active(&fs_info->async_submit_wait))
198 wake_up(&fs_info->async_submit_wait);
199
200 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
201 bio_get(cur);
202 submit_bio(cur->bi_rw, cur);
203 bio_put(cur);
204 num_run++;
205
206 /*
207 * we made progress, there is more work to do and the bdi
208 * is now congested. Back off and let other work structs
209 * run instead
210 */
211 if (pending && bdi_write_congested(bdi) &&
212 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head;
214
215 spin_lock(&device->io_lock);
216
217 old_head = device->pending_bios;
218 device->pending_bios = pending;
219 if (device->pending_bio_tail)
220 tail->bi_next = old_head;
221 else
222 device->pending_bio_tail = tail;
223
224 spin_unlock(&device->io_lock);
225 btrfs_requeue_work(&device->work);
226 goto done;
227 }
228 }
229 if (again)
230 goto loop;
231done:
232 return 0;
233}
234
235static void pending_bios_fn(struct btrfs_work *work)
236{
237 struct btrfs_device *device;
238
239 device = container_of(work, struct btrfs_device, work);
240 run_scheduled_bios(device);
241}
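/*
 * Producer-side sketch, not from the original patch (the real queueing
 * helper lives later in this file): bios are appended to the per-device
 * list under io_lock so that run_scheduled_bios() can drain them from a
 * worker thread. Queueing the device's work struct onto the fs_info
 * worker pool is elided here.
 */
static void schedule_bio_sketch(struct btrfs_device *device, struct bio *bio)
{
	spin_lock(&device->io_lock);
	bio->bi_next = NULL;
	if (device->pending_bio_tail)
		device->pending_bio_tail->bi_next = bio;
	device->pending_bio_tail = bio;
	if (!device->pending_bios)
		device->pending_bios = bio;
	spin_unlock(&device->io_lock);
}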
242
243static noinline int device_list_add(const char *path,
244 struct btrfs_super_block *disk_super,
245 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
246{
247 struct btrfs_device *device;
248 struct btrfs_fs_devices *fs_devices;
249 u64 found_transid = btrfs_super_generation(disk_super);
250
251 fs_devices = find_fsid(disk_super->fsid);
252 if (!fs_devices) {
253 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
254 if (!fs_devices)
255 return -ENOMEM;
256 INIT_LIST_HEAD(&fs_devices->devices);
257 INIT_LIST_HEAD(&fs_devices->alloc_list);
258 list_add(&fs_devices->list, &fs_uuids);
259 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
260 fs_devices->latest_devid = devid;
261 fs_devices->latest_trans = found_transid;
262 device = NULL;
263 } else {
264 device = __find_device(&fs_devices->devices, devid,
265 disk_super->dev_item.uuid);
266 }
267 if (!device) {
268 if (fs_devices->opened)
269 return -EBUSY;
270
271 device = kzalloc(sizeof(*device), GFP_NOFS);
272 if (!device) {
273 /* we can safely leave the fs_devices entry around */
274 return -ENOMEM;
275 }
276 device->devid = devid;
277 device->work.func = pending_bios_fn;
278 memcpy(device->uuid, disk_super->dev_item.uuid,
279 BTRFS_UUID_SIZE);
280 device->barriers = 1;
281 spin_lock_init(&device->io_lock);
282 device->name = kstrdup(path, GFP_NOFS);
283 if (!device->name) {
284 kfree(device);
285 return -ENOMEM;
286 }
287 INIT_LIST_HEAD(&device->dev_alloc_list);
288 list_add(&device->dev_list, &fs_devices->devices);
289 device->fs_devices = fs_devices;
290 fs_devices->num_devices++;
291 }
292
293 if (found_transid > fs_devices->latest_trans) {
294 fs_devices->latest_devid = devid;
295 fs_devices->latest_trans = found_transid;
296 }
297 *fs_devices_ret = fs_devices;
298 return 0;
299}
300
301static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
302{
303 struct btrfs_fs_devices *fs_devices;
304 struct btrfs_device *device;
305 struct btrfs_device *orig_dev;
306
307 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
308 if (!fs_devices)
309 return ERR_PTR(-ENOMEM);
310
311 INIT_LIST_HEAD(&fs_devices->devices);
312 INIT_LIST_HEAD(&fs_devices->alloc_list);
313 INIT_LIST_HEAD(&fs_devices->list);
314 fs_devices->latest_devid = orig->latest_devid;
315 fs_devices->latest_trans = orig->latest_trans;
316 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
317
318 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
319 device = kzalloc(sizeof(*device), GFP_NOFS);
320 if (!device)
321 goto error;
322
323 device->name = kstrdup(orig_dev->name, GFP_NOFS);
324 if (!device->name)
325 goto error;
326
327 device->devid = orig_dev->devid;
328 device->work.func = pending_bios_fn;
329 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
330 device->barriers = 1;
331 spin_lock_init(&device->io_lock);
332 INIT_LIST_HEAD(&device->dev_list);
333 INIT_LIST_HEAD(&device->dev_alloc_list);
334
335 list_add(&device->dev_list, &fs_devices->devices);
336 device->fs_devices = fs_devices;
337 fs_devices->num_devices++;
338 }
339 return fs_devices;
340error:
341 free_fs_devices(fs_devices);
342 return ERR_PTR(-ENOMEM);
343}
344
345int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
346{
347 struct list_head *tmp;
348 struct list_head *cur;
349 struct btrfs_device *device;
350
351 mutex_lock(&uuid_mutex);
352again:
353 list_for_each_safe(cur, tmp, &fs_devices->devices) {
354 device = list_entry(cur, struct btrfs_device, dev_list);
355 if (device->in_fs_metadata)
356 continue;
357
358 if (device->bdev) {
359 close_bdev_exclusive(device->bdev, device->mode);
360 device->bdev = NULL;
361 fs_devices->open_devices--;
362 }
363 if (device->writeable) {
364 list_del_init(&device->dev_alloc_list);
365 device->writeable = 0;
366 fs_devices->rw_devices--;
367 }
368 list_del_init(&device->dev_list);
369 fs_devices->num_devices--;
370 kfree(device->name);
371 kfree(device);
372 }
373
374 if (fs_devices->seed) {
375 fs_devices = fs_devices->seed;
376 goto again;
377 }
378
379 mutex_unlock(&uuid_mutex);
380 return 0;
381}
382
383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
384{
385 struct list_head *cur;
386 struct btrfs_device *device;
387
388 if (--fs_devices->opened > 0)
389 return 0;
390
391 list_for_each(cur, &fs_devices->devices) {
392 device = list_entry(cur, struct btrfs_device, dev_list);
393 if (device->bdev) {
394 close_bdev_exclusive(device->bdev, device->mode);
395 fs_devices->open_devices--;
396 }
397 if (device->writeable) {
398 list_del_init(&device->dev_alloc_list);
399 fs_devices->rw_devices--;
400 }
401
402 device->bdev = NULL;
403 device->writeable = 0;
404 device->in_fs_metadata = 0;
405 }
406 WARN_ON(fs_devices->open_devices);
407 WARN_ON(fs_devices->rw_devices);
408 fs_devices->opened = 0;
409 fs_devices->seeding = 0;
410
411 return 0;
412}
413
414int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
415{
416 struct btrfs_fs_devices *seed_devices = NULL;
417 int ret;
418
419 mutex_lock(&uuid_mutex);
420 ret = __btrfs_close_devices(fs_devices);
421 if (!fs_devices->opened) {
422 seed_devices = fs_devices->seed;
423 fs_devices->seed = NULL;
424 }
425 mutex_unlock(&uuid_mutex);
426
427 while (seed_devices) {
428 fs_devices = seed_devices;
429 seed_devices = fs_devices->seed;
430 __btrfs_close_devices(fs_devices);
431 free_fs_devices(fs_devices);
432 }
433 return ret;
434}
435
436static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
437 fmode_t flags, void *holder)
438{
439 struct block_device *bdev;
440 struct list_head *head = &fs_devices->devices;
441 struct list_head *cur;
442 struct btrfs_device *device;
443 struct block_device *latest_bdev = NULL;
444 struct buffer_head *bh;
445 struct btrfs_super_block *disk_super;
446 u64 latest_devid = 0;
447 u64 latest_transid = 0;
448 u64 devid;
449 int seeding = 1;
450 int ret = 0;
451
452 list_for_each(cur, head) {
453 device = list_entry(cur, struct btrfs_device, dev_list);
454 if (device->bdev)
455 continue;
456 if (!device->name)
457 continue;
458
459 bdev = open_bdev_exclusive(device->name, flags, holder);
460 if (IS_ERR(bdev)) {
461 printk(KERN_INFO "open %s failed\n", device->name);
462 goto error;
463 }
464 set_blocksize(bdev, 4096);
465
466 bh = btrfs_read_dev_super(bdev);
467 if (!bh)
468 goto error_close;
469
470 disk_super = (struct btrfs_super_block *)bh->b_data;
471 devid = le64_to_cpu(disk_super->dev_item.devid);
472 if (devid != device->devid)
473 goto error_brelse;
474
475 if (memcmp(device->uuid, disk_super->dev_item.uuid,
476 BTRFS_UUID_SIZE))
477 goto error_brelse;
478
479 device->generation = btrfs_super_generation(disk_super);
480 if (!latest_transid || device->generation > latest_transid) {
481 latest_devid = devid;
482 latest_transid = device->generation;
483 latest_bdev = bdev;
484 }
485
486 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
487 device->writeable = 0;
488 } else {
489 device->writeable = !bdev_read_only(bdev);
490 seeding = 0;
491 }
492
493 device->bdev = bdev;
494 device->in_fs_metadata = 0;
495 device->mode = flags;
496
497 fs_devices->open_devices++;
498 if (device->writeable) {
499 fs_devices->rw_devices++;
500 list_add(&device->dev_alloc_list,
501 &fs_devices->alloc_list);
502 }
503 continue;
504
505error_brelse:
506 brelse(bh);
507error_close:
508 close_bdev_exclusive(bdev, FMODE_READ);
509error:
510 continue;
511 }
512 if (fs_devices->open_devices == 0) {
513 ret = -EIO;
514 goto out;
515 }
516 fs_devices->seeding = seeding;
517 fs_devices->opened = 1;
518 fs_devices->latest_bdev = latest_bdev;
519 fs_devices->latest_devid = latest_devid;
520 fs_devices->latest_trans = latest_transid;
521 fs_devices->total_rw_bytes = 0;
522out:
523 return ret;
524}
525
526int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
527 fmode_t flags, void *holder)
528{
529 int ret;
530
531 mutex_lock(&uuid_mutex);
532 if (fs_devices->opened) {
533 fs_devices->opened++;
534 ret = 0;
535 } else {
536 ret = __btrfs_open_devices(fs_devices, flags, holder);
537 }
538 mutex_unlock(&uuid_mutex);
539 return ret;
540}
541
542int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
543 struct btrfs_fs_devices **fs_devices_ret)
544{
545 struct btrfs_super_block *disk_super;
546 struct block_device *bdev;
547 struct buffer_head *bh;
548 int ret;
549 u64 devid;
550 u64 transid;
551
552 mutex_lock(&uuid_mutex);
553
554 bdev = open_bdev_exclusive(path, flags, holder);
555
556 if (IS_ERR(bdev)) {
557 ret = PTR_ERR(bdev);
558 goto error;
559 }
560
561 ret = set_blocksize(bdev, 4096);
562 if (ret)
563 goto error_close;
564 bh = btrfs_read_dev_super(bdev);
565 if (!bh) {
566 ret = -EIO;
567 goto error_close;
568 }
569 disk_super = (struct btrfs_super_block *)bh->b_data;
570 devid = le64_to_cpu(disk_super->dev_item.devid);
571 transid = btrfs_super_generation(disk_super);
572 if (disk_super->label[0])
573 printk(KERN_INFO "device label %s ", disk_super->label);
574 else {
575 /* FIXME, make a real uuid parser */
576 printk(KERN_INFO "device fsid %llx-%llx ",
577 *(unsigned long long *)disk_super->fsid,
578 *(unsigned long long *)(disk_super->fsid + 8));
579 }
580 printk(KERN_INFO "devid %llu transid %llu %s\n",
581 (unsigned long long)devid, (unsigned long long)transid, path);
582 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
583
584 brelse(bh);
585error_close:
586 close_bdev_exclusive(bdev, flags);
587error:
588 mutex_unlock(&uuid_mutex);
589 return ret;
590}
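/*
 * Usage sketch, not part of the original patch: the mount and device-scan
 * paths are expected to feed each candidate device through
 * btrfs_scan_one_device() so device_list_add() can group devices by fsid,
 * then open the assembled set. Error handling is reduced to the minimum.
 */
static int scan_then_open_sketch(const char *path, fmode_t flags,
				 void *holder)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	ret = btrfs_scan_one_device(path, flags, holder, &fs_devices);
	if (ret)
		return ret;
	return btrfs_open_devices(fs_devices, flags, holder);
}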
591
592/*
593 * this uses a pretty simple search, the expectation is that it is
594 * called very infrequently and that a given device has a small number
595 * of extents
596 */
597static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
598 struct btrfs_device *device,
599 u64 num_bytes, u64 *start)
600{
601 struct btrfs_key key;
602 struct btrfs_root *root = device->dev_root;
603 struct btrfs_dev_extent *dev_extent = NULL;
604 struct btrfs_path *path;
605 u64 hole_size = 0;
606 u64 last_byte = 0;
607 u64 search_start = 0;
608 u64 search_end = device->total_bytes;
609 int ret;
610 int slot = 0;
611 int start_found;
612 struct extent_buffer *l;
613
614 path = btrfs_alloc_path();
615 if (!path)
616 return -ENOMEM;
617 path->reada = 2;
618 start_found = 0;
619
620 /* FIXME use last free of some kind */
621
622 /* we don't want to overwrite the superblock on the drive,
623 * so we make sure to start at an offset of at least 1MB
624 */
625 search_start = max((u64)1024 * 1024, search_start);
626
627 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
628 search_start = max(root->fs_info->alloc_start, search_start);
629
630 key.objectid = device->devid;
631 key.offset = search_start;
632 key.type = BTRFS_DEV_EXTENT_KEY;
633 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
634 if (ret < 0)
635 goto error;
636 ret = btrfs_previous_item(root, path, 0, key.type);
637 if (ret < 0)
638 goto error;
639 l = path->nodes[0];
640 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
641 while (1) {
642 l = path->nodes[0];
643 slot = path->slots[0];
644 if (slot >= btrfs_header_nritems(l)) {
645 ret = btrfs_next_leaf(root, path);
646 if (ret == 0)
647 continue;
648 if (ret < 0)
649 goto error;
650no_more_items:
651 if (!start_found) {
652 if (search_start >= search_end) {
653 ret = -ENOSPC;
654 goto error;
655 }
656 *start = search_start;
657 start_found = 1;
658 goto check_pending;
659 }
660 *start = last_byte > search_start ?
661 last_byte : search_start;
662 if (search_end <= *start) {
663 ret = -ENOSPC;
664 goto error;
665 }
666 goto check_pending;
667 }
668 btrfs_item_key_to_cpu(l, &key, slot);
669
670 if (key.objectid < device->devid)
671 goto next;
672
673 if (key.objectid > device->devid)
674 goto no_more_items;
675
676 if (key.offset >= search_start && key.offset > last_byte &&
677 start_found) {
678 if (last_byte < search_start)
679 last_byte = search_start;
680 hole_size = key.offset - last_byte;
681 if (key.offset > last_byte &&
682 hole_size >= num_bytes) {
683 *start = last_byte;
684 goto check_pending;
685 }
686 }
687 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
688 goto next;
689
690 start_found = 1;
691 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
692 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
693next:
694 path->slots[0]++;
695 cond_resched();
696 }
697check_pending:
698 /* we have to make sure we didn't find an extent that has already
699 * been allocated by the map tree or the original allocation
700 */
701 BUG_ON(*start < search_start);
702
703 if (*start + num_bytes > search_end) {
704 ret = -ENOSPC;
705 goto error;
706 }
707 /* check for pending inserts here */
708 ret = 0;
709
710error:
711 btrfs_free_path(path);
712 return ret;
713}
714
715static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
716 struct btrfs_device *device,
717 u64 start)
718{
719 int ret;
720 struct btrfs_path *path;
721 struct btrfs_root *root = device->dev_root;
722 struct btrfs_key key;
723 struct btrfs_key found_key;
724 struct extent_buffer *leaf = NULL;
725 struct btrfs_dev_extent *extent = NULL;
726
727 path = btrfs_alloc_path();
728 if (!path)
729 return -ENOMEM;
730
731 key.objectid = device->devid;
732 key.offset = start;
733 key.type = BTRFS_DEV_EXTENT_KEY;
734
735 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
736 if (ret > 0) {
737 ret = btrfs_previous_item(root, path, key.objectid,
738 BTRFS_DEV_EXTENT_KEY);
739 BUG_ON(ret);
740 leaf = path->nodes[0];
741 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
742 extent = btrfs_item_ptr(leaf, path->slots[0],
743 struct btrfs_dev_extent);
744 BUG_ON(found_key.offset > start || found_key.offset +
745 btrfs_dev_extent_length(leaf, extent) < start);
746 ret = 0;
747 } else if (ret == 0) {
748 leaf = path->nodes[0];
749 extent = btrfs_item_ptr(leaf, path->slots[0],
750 struct btrfs_dev_extent);
751 }
752 BUG_ON(ret);
753
754 if (device->bytes_used > 0)
755 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
756 ret = btrfs_del_item(trans, root, path);
757 BUG_ON(ret);
758
759 btrfs_free_path(path);
760 return ret;
761}
762
763int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
764 struct btrfs_device *device,
765 u64 chunk_tree, u64 chunk_objectid,
766 u64 chunk_offset, u64 start, u64 num_bytes)
767{
768 int ret;
769 struct btrfs_path *path;
770 struct btrfs_root *root = device->dev_root;
771 struct btrfs_dev_extent *extent;
772 struct extent_buffer *leaf;
773 struct btrfs_key key;
774
775 WARN_ON(!device->in_fs_metadata);
776 path = btrfs_alloc_path();
777 if (!path)
778 return -ENOMEM;
779
780 key.objectid = device->devid;
781 key.offset = start;
782 key.type = BTRFS_DEV_EXTENT_KEY;
783 ret = btrfs_insert_empty_item(trans, root, path, &key,
784 sizeof(*extent));
785 BUG_ON(ret);
786
787 leaf = path->nodes[0];
788 extent = btrfs_item_ptr(leaf, path->slots[0],
789 struct btrfs_dev_extent);
790 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
791 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
792 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
793
794 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
795 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
796 BTRFS_UUID_SIZE);
797
798 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
799 btrfs_mark_buffer_dirty(leaf);
800 btrfs_free_path(path);
801 return ret;
802}
803
804static noinline int find_next_chunk(struct btrfs_root *root,
805 u64 objectid, u64 *offset)
806{
807 struct btrfs_path *path;
808 int ret;
809 struct btrfs_key key;
810 struct btrfs_chunk *chunk;
811 struct btrfs_key found_key;
812
813 path = btrfs_alloc_path();
814 BUG_ON(!path);
815
816 key.objectid = objectid;
817 key.offset = (u64)-1;
818 key.type = BTRFS_CHUNK_ITEM_KEY;
819
820 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
821 if (ret < 0)
822 goto error;
823
824 BUG_ON(ret == 0);
825
826 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
827 if (ret) {
828 *offset = 0;
829 } else {
830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
831 path->slots[0]);
832 if (found_key.objectid != objectid)
833 *offset = 0;
834 else {
835 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
836 struct btrfs_chunk);
837 *offset = found_key.offset +
838 btrfs_chunk_length(path->nodes[0], chunk);
839 }
840 }
841 ret = 0;
842error:
843 btrfs_free_path(path);
844 return ret;
845}
846
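/*
 * scan the device items in the chunk root and hand back the next unused
 * devid.  Device ids start at 1 when the tree is empty.
 */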
847static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
848{
849 int ret;
850 struct btrfs_key key;
851 struct btrfs_key found_key;
852 struct btrfs_path *path;
853
854 root = root->fs_info->chunk_root;
855
856 path = btrfs_alloc_path();
857 if (!path)
858 return -ENOMEM;
859
860 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
861 key.type = BTRFS_DEV_ITEM_KEY;
862 key.offset = (u64)-1;
863
864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
865 if (ret < 0)
866 goto error;
867
868 BUG_ON(ret == 0);
869
870 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
871 BTRFS_DEV_ITEM_KEY);
872 if (ret) {
873 *objectid = 1;
874 } else {
875 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
876 path->slots[0]);
877 *objectid = found_key.offset + 1;
878 }
879 ret = 0;
880error:
881 btrfs_free_path(path);
882 return ret;
883}
884
885/*
886 * the device information is stored in the chunk root;
887 * the btrfs_device struct should be fully filled in
888 */
889int btrfs_add_device(struct btrfs_trans_handle *trans,
890 struct btrfs_root *root,
891 struct btrfs_device *device)
892{
893 int ret;
894 struct btrfs_path *path;
895 struct btrfs_dev_item *dev_item;
896 struct extent_buffer *leaf;
897 struct btrfs_key key;
898 unsigned long ptr;
899
900 root = root->fs_info->chunk_root;
901
902 path = btrfs_alloc_path();
903 if (!path)
904 return -ENOMEM;
905
906 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
907 key.type = BTRFS_DEV_ITEM_KEY;
908 key.offset = device->devid;
909
910 ret = btrfs_insert_empty_item(trans, root, path, &key,
911 sizeof(*dev_item));
912 if (ret)
913 goto out;
914
915 leaf = path->nodes[0];
916 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
917
918 btrfs_set_device_id(leaf, dev_item, device->devid);
919 btrfs_set_device_generation(leaf, dev_item, 0);
920 btrfs_set_device_type(leaf, dev_item, device->type);
921 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
922 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
923 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
924 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
925 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
926 btrfs_set_device_group(leaf, dev_item, 0);
927 btrfs_set_device_seek_speed(leaf, dev_item, 0);
928 btrfs_set_device_bandwidth(leaf, dev_item, 0);
929 btrfs_set_device_start_offset(leaf, dev_item, 0);
930
931 ptr = (unsigned long)btrfs_device_uuid(dev_item);
932 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
933 ptr = (unsigned long)btrfs_device_fsid(dev_item);
934 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
935 btrfs_mark_buffer_dirty(leaf);
936
937 ret = 0;
938out:
939 btrfs_free_path(path);
940 return ret;
941}
942
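/*
 * delete the device item for @device from the chunk root, in its own
 * transaction and under the chunk mutex.
 */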
943static int btrfs_rm_dev_item(struct btrfs_root *root,
944 struct btrfs_device *device)
945{
946 int ret;
947 struct btrfs_path *path;
948 struct btrfs_key key;
949 struct btrfs_trans_handle *trans;
950
951 root = root->fs_info->chunk_root;
952
953 path = btrfs_alloc_path();
954 if (!path)
955 return -ENOMEM;
956
957 trans = btrfs_start_transaction(root, 1);
958 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
959 key.type = BTRFS_DEV_ITEM_KEY;
960 key.offset = device->devid;
961 lock_chunks(root);
962
963 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
964 if (ret < 0)
965 goto out;
966
967 if (ret > 0) {
968 ret = -ENOENT;
969 goto out;
970 }
971
972 ret = btrfs_del_item(trans, root, path);
973 if (ret)
974 goto out;
975out:
976 btrfs_free_path(path);
977 unlock_chunks(root);
978 btrfs_commit_transaction(trans, root);
979 return ret;
980}
981
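/*
 * remove a device from the filesystem.  @device_path may be the special
 * string "missing", in which case the first device that is recorded in
 * the metadata but has no backing bdev is picked.  The device is shrunk
 * to zero, which relocates all of its chunks, its device item is deleted,
 * and the super block magic on the removed device is cleared so it is no
 * longer detected as part of the FS.
 */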
982int btrfs_rm_device(struct btrfs_root *root, char *device_path)
983{
984 struct btrfs_device *device;
985 struct btrfs_device *next_device;
986 struct block_device *bdev;
987 struct buffer_head *bh = NULL;
988 struct btrfs_super_block *disk_super;
989 u64 all_avail;
990 u64 devid;
991 u64 num_devices;
992 u8 *dev_uuid;
993 int ret = 0;
994
995 mutex_lock(&uuid_mutex);
996 mutex_lock(&root->fs_info->volume_mutex);
997
998 all_avail = root->fs_info->avail_data_alloc_bits |
999 root->fs_info->avail_system_alloc_bits |
1000 root->fs_info->avail_metadata_alloc_bits;
1001
1002 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1003 root->fs_info->fs_devices->rw_devices <= 4) {
1004 printk(KERN_ERR "btrfs: unable to go below four devices "
1005 "on raid10\n");
1006 ret = -EINVAL;
1007 goto out;
1008 }
1009
1010 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1011 root->fs_info->fs_devices->rw_devices <= 2) {
1012 printk(KERN_ERR "btrfs: unable to go below two "
1013 "devices on raid1\n");
1014 ret = -EINVAL;
1015 goto out;
1016 }
1017
1018 if (strcmp(device_path, "missing") == 0) {
1019 struct list_head *cur;
1020 struct list_head *devices;
1021 struct btrfs_device *tmp;
1022
1023 device = NULL;
1024 devices = &root->fs_info->fs_devices->devices;
1025 list_for_each(cur, devices) {
1026 tmp = list_entry(cur, struct btrfs_device, dev_list);
1027 if (tmp->in_fs_metadata && !tmp->bdev) {
1028 device = tmp;
1029 break;
1030 }
1031 }
1032 bdev = NULL;
1033 bh = NULL;
1034 disk_super = NULL;
1035 if (!device) {
1036 printk(KERN_ERR "btrfs: no missing devices found to "
1037 "remove\n");
ret = -ENOENT;
1038 goto out;
1039 }
1040 } else {
1041 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1042 root->fs_info->bdev_holder);
1043 if (IS_ERR(bdev)) {
1044 ret = PTR_ERR(bdev);
1045 goto out;
1046 }
1047
1048 set_blocksize(bdev, 4096);
1049 bh = btrfs_read_dev_super(bdev);
1050 if (!bh) {
1051 ret = -EIO;
1052 goto error_close;
1053 }
1054 disk_super = (struct btrfs_super_block *)bh->b_data;
1055 devid = le64_to_cpu(disk_super->dev_item.devid);
1056 dev_uuid = disk_super->dev_item.uuid;
1057 device = btrfs_find_device(root, devid, dev_uuid,
1058 disk_super->fsid);
1059 if (!device) {
1060 ret = -ENOENT;
1061 goto error_brelse;
1062 }
1063 }
1064
1065 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1066 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1067 "device\n");
1068 ret = -EINVAL;
1069 goto error_brelse;
1070 }
1071
1072 if (device->writeable) {
1073 list_del_init(&device->dev_alloc_list);
1074 root->fs_info->fs_devices->rw_devices--;
1075 }
1076
1077 ret = btrfs_shrink_device(device, 0);
1078 if (ret)
1079 goto error_brelse;
1080
1081 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1082 if (ret)
1083 goto error_brelse;
1084
1085 device->in_fs_metadata = 0;
1086 list_del_init(&device->dev_list);
1087 device->fs_devices->num_devices--;
1088
1089 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1090 struct btrfs_device, dev_list);
1091 if (device->bdev == root->fs_info->sb->s_bdev)
1092 root->fs_info->sb->s_bdev = next_device->bdev;
1093 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1094 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1095
1096 if (device->bdev) {
1097 close_bdev_exclusive(device->bdev, device->mode);
1098 device->bdev = NULL;
1099 device->fs_devices->open_devices--;
1100 }
1101
1102 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1103 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1104
1105 if (device->fs_devices->open_devices == 0) {
1106 struct btrfs_fs_devices *fs_devices;
1107 fs_devices = root->fs_info->fs_devices;
1108 while (fs_devices) {
1109 if (fs_devices->seed == device->fs_devices)
1110 break;
1111 fs_devices = fs_devices->seed;
1112 }
1113 fs_devices->seed = device->fs_devices->seed;
1114 device->fs_devices->seed = NULL;
1115 __btrfs_close_devices(device->fs_devices);
1116 free_fs_devices(device->fs_devices);
1117 }
1118
1119 /*
1120 * at this point, the device is zero sized. We want to
1121 * remove it from the devices list and zero out the old super
1122 */
1123 if (device->writeable) {
1124 /* make sure this device isn't detected as part of
1125 * the FS anymore
1126 */
1127 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1128 set_buffer_dirty(bh);
1129 sync_dirty_buffer(bh);
1130 }
1131
1132 kfree(device->name);
1133 kfree(device);
1134 ret = 0;
1135
1136error_brelse:
1137 brelse(bh);
1138error_close:
1139 if (bdev)
1140 close_bdev_exclusive(bdev, FMODE_READ);
1141out:
1142 mutex_unlock(&root->fs_info->volume_mutex);
1143 mutex_unlock(&uuid_mutex);
1144 return ret;
1145}
1146
1147/*
1148 * does all the dirty work required for changing the file system's UUID.
1149 */
1150static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1151 struct btrfs_root *root)
1152{
1153 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1154 struct btrfs_fs_devices *old_devices;
1155 struct btrfs_fs_devices *seed_devices;
1156 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1157 struct btrfs_device *device;
1158 u64 super_flags;
1159
1160 BUG_ON(!mutex_is_locked(&uuid_mutex));
1161 if (!fs_devices->seeding)
1162 return -EINVAL;
1163
1164 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1165 if (!seed_devices)
1166 return -ENOMEM;
1167
1168 old_devices = clone_fs_devices(fs_devices);
1169 if (IS_ERR(old_devices)) {
1170 kfree(seed_devices);
1171 return PTR_ERR(old_devices);
1172 }
1173
1174 list_add(&old_devices->list, &fs_uuids);
1175
1176 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1177 seed_devices->opened = 1;
1178 INIT_LIST_HEAD(&seed_devices->devices);
1179 INIT_LIST_HEAD(&seed_devices->alloc_list);
1180 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1181 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1182 list_for_each_entry(device, &seed_devices->devices, dev_list) {
1183 device->fs_devices = seed_devices;
1184 }
1185
1186 fs_devices->seeding = 0;
1187 fs_devices->num_devices = 0;
1188 fs_devices->open_devices = 0;
1189 fs_devices->seed = seed_devices;
1190
1191 generate_random_uuid(fs_devices->fsid);
1192 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1193 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1194 super_flags = btrfs_super_flags(disk_super) &
1195 ~BTRFS_SUPER_FLAG_SEEDING;
1196 btrfs_set_super_flags(disk_super, super_flags);
1197
1198 return 0;
1199}
1200
1201/*
1202 * store the expected generation for seed devices in device items.
1203 */
1204static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1205 struct btrfs_root *root)
1206{
1207 struct btrfs_path *path;
1208 struct extent_buffer *leaf;
1209 struct btrfs_dev_item *dev_item;
1210 struct btrfs_device *device;
1211 struct btrfs_key key;
1212 u8 fs_uuid[BTRFS_UUID_SIZE];
1213 u8 dev_uuid[BTRFS_UUID_SIZE];
1214 u64 devid;
1215 int ret;
1216
1217 path = btrfs_alloc_path();
1218 if (!path)
1219 return -ENOMEM;
1220
1221 root = root->fs_info->chunk_root;
1222 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1223 key.offset = 0;
1224 key.type = BTRFS_DEV_ITEM_KEY;
1225
1226 while (1) {
1227 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1228 if (ret < 0)
1229 goto error;
1230
1231 leaf = path->nodes[0];
1232next_slot:
1233 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1234 ret = btrfs_next_leaf(root, path);
1235 if (ret > 0)
1236 break;
1237 if (ret < 0)
1238 goto error;
1239 leaf = path->nodes[0];
1240 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1241 btrfs_release_path(root, path);
1242 continue;
1243 }
1244
1245 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1246 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1247 key.type != BTRFS_DEV_ITEM_KEY)
1248 break;
1249
1250 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1251 struct btrfs_dev_item);
1252 devid = btrfs_device_id(leaf, dev_item);
1253 read_extent_buffer(leaf, dev_uuid,
1254 (unsigned long)btrfs_device_uuid(dev_item),
1255 BTRFS_UUID_SIZE);
1256 read_extent_buffer(leaf, fs_uuid,
1257 (unsigned long)btrfs_device_fsid(dev_item),
1258 BTRFS_UUID_SIZE);
1259 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1260 BUG_ON(!device);
1261
1262 if (device->fs_devices->seeding) {
1263 btrfs_set_device_generation(leaf, dev_item,
1264 device->generation);
1265 btrfs_mark_buffer_dirty(leaf);
1266 }
1267
1268 path->slots[0]++;
1269 goto next_slot;
1270 }
1271 ret = 0;
1272error:
1273 btrfs_free_path(path);
1274 return ret;
1275}
1276
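/*
 * add a new device to the filesystem.  If the existing filesystem is a
 * seeding one, this also sprouts a new writable filesystem on top of it:
 * the old devices are moved onto a seed fs_devices struct, a new fsid is
 * generated and the system chunks are relocated onto the new device.
 */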
1277int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1278{
1279 struct btrfs_trans_handle *trans;
1280 struct btrfs_device *device;
1281 struct block_device *bdev;
1282 struct list_head *cur;
1283 struct list_head *devices;
1284 struct super_block *sb = root->fs_info->sb;
1285 u64 total_bytes;
1286 int seeding_dev = 0;
1287 int ret = 0;
1288
1289 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1290 return -EINVAL;
1291
1292 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1293 if (IS_ERR(bdev))
1294 return PTR_ERR(bdev);
1295
1296 if (root->fs_info->fs_devices->seeding) {
1297 seeding_dev = 1;
1298 down_write(&sb->s_umount);
1299 mutex_lock(&uuid_mutex);
1300 }
1301
1302 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1303 mutex_lock(&root->fs_info->volume_mutex);
1304
1305 devices = &root->fs_info->fs_devices->devices;
1306 list_for_each(cur, devices) {
1307 device = list_entry(cur, struct btrfs_device, dev_list);
1308 if (device->bdev == bdev) {
1309 ret = -EEXIST;
1310 goto error;
1311 }
1312 }
1313
1314 device = kzalloc(sizeof(*device), GFP_NOFS);
1315 if (!device) {
1316 /* we can safely leave the fs_devices entry around */
1317 ret = -ENOMEM;
1318 goto error;
1319 }
1320
1321 device->name = kstrdup(device_path, GFP_NOFS);
1322 if (!device->name) {
1323 kfree(device);
1324 ret = -ENOMEM;
1325 goto error;
1326 }
1327
1328 ret = find_next_devid(root, &device->devid);
1329 if (ret) {
1330 kfree(device);
1331 goto error;
1332 }
1333
1334 trans = btrfs_start_transaction(root, 1);
1335 lock_chunks(root);
1336
1337 device->barriers = 1;
1338 device->writeable = 1;
1339 device->work.func = pending_bios_fn;
1340 generate_random_uuid(device->uuid);
1341 spin_lock_init(&device->io_lock);
1342 device->generation = trans->transid;
1343 device->io_width = root->sectorsize;
1344 device->io_align = root->sectorsize;
1345 device->sector_size = root->sectorsize;
1346 device->total_bytes = i_size_read(bdev->bd_inode);
1347 device->dev_root = root->fs_info->dev_root;
1348 device->bdev = bdev;
1349 device->in_fs_metadata = 1;
1350 device->mode = 0;
1351 set_blocksize(device->bdev, 4096);
1352
1353 if (seeding_dev) {
1354 sb->s_flags &= ~MS_RDONLY;
1355 ret = btrfs_prepare_sprout(trans, root);
1356 BUG_ON(ret);
1357 }
1358
1359 device->fs_devices = root->fs_info->fs_devices;
1360 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1361 list_add(&device->dev_alloc_list,
1362 &root->fs_info->fs_devices->alloc_list);
1363 root->fs_info->fs_devices->num_devices++;
1364 root->fs_info->fs_devices->open_devices++;
1365 root->fs_info->fs_devices->rw_devices++;
1366 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1367
1368 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1369 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1370 total_bytes + device->total_bytes);
1371
1372 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1373 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1374 total_bytes + 1);
1375
1376 if (seeding_dev) {
1377 ret = init_first_rw_device(trans, root, device);
1378 BUG_ON(ret);
1379 ret = btrfs_finish_sprout(trans, root);
1380 BUG_ON(ret);
1381 } else {
1382 ret = btrfs_add_device(trans, root, device);
1383 }
1384
1385 unlock_chunks(root);
1386 btrfs_commit_transaction(trans, root);
1387
1388 if (seeding_dev) {
1389 mutex_unlock(&uuid_mutex);
1390 up_write(&sb->s_umount);
1391
1392 ret = btrfs_relocate_sys_chunks(root);
1393 BUG_ON(ret);
1394 }
1395out:
1396 mutex_unlock(&root->fs_info->volume_mutex);
1397 return ret;
1398error:
1399 close_bdev_exclusive(bdev, 0);
1400 if (seeding_dev) {
1401 mutex_unlock(&uuid_mutex);
1402 up_write(&sb->s_umount);
1403 }
1404 goto out;
1405}
1406
1407static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1408 struct btrfs_device *device)
1409{
1410 int ret;
1411 struct btrfs_path *path;
1412 struct btrfs_root *root;
1413 struct btrfs_dev_item *dev_item;
1414 struct extent_buffer *leaf;
1415 struct btrfs_key key;
1416
1417 root = device->dev_root->fs_info->chunk_root;
1418
1419 path = btrfs_alloc_path();
1420 if (!path)
1421 return -ENOMEM;
1422
1423 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1424 key.type = BTRFS_DEV_ITEM_KEY;
1425 key.offset = device->devid;
1426
1427 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1428 if (ret < 0)
1429 goto out;
1430
1431 if (ret > 0) {
1432 ret = -ENOENT;
1433 goto out;
1434 }
1435
1436 leaf = path->nodes[0];
1437 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1438
1439 btrfs_set_device_id(leaf, dev_item, device->devid);
1440 btrfs_set_device_type(leaf, dev_item, device->type);
1441 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1442 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1443 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1444 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1445 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1446 btrfs_mark_buffer_dirty(leaf);
1447
1448out:
1449 btrfs_free_path(path);
1450 return ret;
1451}
1452
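/*
 * grow @device to @new_size.  This helper assumes the chunk mutex is
 * already held; btrfs_grow_device() below is the locking wrapper.
 */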
1453static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1454 struct btrfs_device *device, u64 new_size)
1455{
1456 struct btrfs_super_block *super_copy =
1457 &device->dev_root->fs_info->super_copy;
1458 u64 old_total = btrfs_super_total_bytes(super_copy);
1459 u64 diff = new_size - device->total_bytes;
1460
1461 if (!device->writeable)
1462 return -EACCES;
1463 if (new_size <= device->total_bytes)
1464 return -EINVAL;
1465
1466 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1467 device->fs_devices->total_rw_bytes += diff;
1468
1469 device->total_bytes = new_size;
1470 return btrfs_update_device(trans, device);
1471}
1472
1473int btrfs_grow_device(struct btrfs_trans_handle *trans,
1474 struct btrfs_device *device, u64 new_size)
1475{
1476 int ret;
1477 lock_chunks(device->dev_root);
1478 ret = __btrfs_grow_device(trans, device, new_size);
1479 unlock_chunks(device->dev_root);
1480 return ret;
1481}
1482
1483static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1484 struct btrfs_root *root,
1485 u64 chunk_tree, u64 chunk_objectid,
1486 u64 chunk_offset)
1487{
1488 int ret;
1489 struct btrfs_path *path;
1490 struct btrfs_key key;
1491
1492 root = root->fs_info->chunk_root;
1493 path = btrfs_alloc_path();
1494 if (!path)
1495 return -ENOMEM;
1496
1497 key.objectid = chunk_objectid;
1498 key.offset = chunk_offset;
1499 key.type = BTRFS_CHUNK_ITEM_KEY;
1500
1501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1502 BUG_ON(ret);
1503
1504 ret = btrfs_del_item(trans, root, path);
1505 BUG_ON(ret);
1506
1507 btrfs_free_path(path);
1508 return 0;
1509}
1510
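/*
 * remove the copy of a system chunk from the sys_chunk_array in the
 * super block copy, shifting the remaining entries down.
 */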
1511static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1512 u64 chunk_offset)
1513{
1514 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1515 struct btrfs_disk_key *disk_key;
1516 struct btrfs_chunk *chunk;
1517 u8 *ptr;
1518 int ret = 0;
1519 u32 num_stripes;
1520 u32 array_size;
1521 u32 len = 0;
1522 u32 cur;
1523 struct btrfs_key key;
1524
1525 array_size = btrfs_super_sys_array_size(super_copy);
1526
1527 ptr = super_copy->sys_chunk_array;
1528 cur = 0;
1529
1530 while (cur < array_size) {
1531 disk_key = (struct btrfs_disk_key *)ptr;
1532 btrfs_disk_key_to_cpu(&key, disk_key);
1533
1534 len = sizeof(*disk_key);
1535
1536 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1537 chunk = (struct btrfs_chunk *)(ptr + len);
1538 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1539 len += btrfs_chunk_item_size(num_stripes);
1540 } else {
1541 ret = -EIO;
1542 break;
1543 }
1544 if (key.objectid == chunk_objectid &&
1545 key.offset == chunk_offset) {
1546 memmove(ptr, ptr + len, array_size - (cur + len));
1547 array_size -= len;
1548 btrfs_set_super_sys_array_size(super_copy, array_size);
1549 } else {
1550 ptr += len;
1551 cur += len;
1552 }
1553 }
1554 return ret;
1555}
1556
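/*
 * relocate a chunk: move all of its extents elsewhere, then delete the
 * device extents, the chunk item (plus its sys_chunk_array copy for
 * system chunks), the block group and the extent mapping.
 */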
1557static int btrfs_relocate_chunk(struct btrfs_root *root,
1558 u64 chunk_tree, u64 chunk_objectid,
1559 u64 chunk_offset)
1560{
1561 struct extent_map_tree *em_tree;
1562 struct btrfs_root *extent_root;
1563 struct btrfs_trans_handle *trans;
1564 struct extent_map *em;
1565 struct map_lookup *map;
1566 int ret;
1567 int i;
1568
1569 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1570 (unsigned long long)chunk_offset);
1571 root = root->fs_info->chunk_root;
1572 extent_root = root->fs_info->extent_root;
1573 em_tree = &root->fs_info->mapping_tree.map_tree;
1574
1575 /* step one, relocate all the extents inside this chunk */
1576 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1577 BUG_ON(ret);
1578
1579 trans = btrfs_start_transaction(root, 1);
1580 BUG_ON(!trans);
1581
1582 lock_chunks(root);
1583
1584 /*
1585 * step two, delete the device extents and the
1586 * chunk tree entries
1587 */
1588 spin_lock(&em_tree->lock);
1589 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1590 spin_unlock(&em_tree->lock);
1591
1592 BUG_ON(em->start > chunk_offset ||
1593 em->start + em->len < chunk_offset);
1594 map = (struct map_lookup *)em->bdev;
1595
1596 for (i = 0; i < map->num_stripes; i++) {
1597 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1598 map->stripes[i].physical);
1599 BUG_ON(ret);
1600
1601 if (map->stripes[i].dev) {
1602 ret = btrfs_update_device(trans, map->stripes[i].dev);
1603 BUG_ON(ret);
1604 }
1605 }
1606 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1607 chunk_offset);
1608
1609 BUG_ON(ret);
1610
1611 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1612 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1613 BUG_ON(ret);
1614 }
1615
1616 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1617 BUG_ON(ret);
1618
1619 spin_lock(&em_tree->lock);
1620 remove_extent_mapping(em_tree, em);
1621 spin_unlock(&em_tree->lock);
1622
1623 kfree(map);
1624 em->bdev = NULL;
1625
1626 /* once for the tree */
1627 free_extent_map(em);
1628 /* once for us */
1629 free_extent_map(em);
1630
1631 unlock_chunks(root);
1632 btrfs_end_transaction(trans, root);
1633 return 0;
1634}
1635
1636static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1637{
1638 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1639 struct btrfs_path *path;
1640 struct extent_buffer *leaf;
1641 struct btrfs_chunk *chunk;
1642 struct btrfs_key key;
1643 struct btrfs_key found_key;
1644 u64 chunk_tree = chunk_root->root_key.objectid;
1645 u64 chunk_type;
1646 int ret;
1647
1648 path = btrfs_alloc_path();
1649 if (!path)
1650 return -ENOMEM;
1651
1652 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1653 key.offset = (u64)-1;
1654 key.type = BTRFS_CHUNK_ITEM_KEY;
1655
1656 while (1) {
1657 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1658 if (ret < 0)
1659 goto error;
1660 BUG_ON(ret == 0);
1661
1662 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1663 key.type);
1664 if (ret < 0)
1665 goto error;
1666 if (ret > 0)
1667 break;
1668
1669 leaf = path->nodes[0];
1670 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1671
1672 chunk = btrfs_item_ptr(leaf, path->slots[0],
1673 struct btrfs_chunk);
1674 chunk_type = btrfs_chunk_type(leaf, chunk);
1675 btrfs_release_path(chunk_root, path);
1676
1677 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1678 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1679 found_key.objectid,
1680 found_key.offset);
1681 BUG_ON(ret);
1682 }
1683
1684 if (found_key.offset == 0)
1685 break;
1686 key.offset = found_key.offset - 1;
1687 }
1688 ret = 0;
1689error:
1690 btrfs_free_path(path);
1691 return ret;
1692}
1693
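/* return @num scaled by @factor tenths, i.e. num * factor / 10 */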
1694static u64 div_factor(u64 num, int factor)
1695{
1696 if (factor == 10)
1697 return num;
1698 num *= factor;
1699 do_div(num, 10);
1700 return num;
1701}
1702
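/*
 * balance chunks across the devices: step one shrinks every writable
 * device slightly (by at most 1MB) and grows it back, forcing the chunks
 * at the tail of each device to relocate; step two walks the chunk tree
 * from the highest offset down and relocates each chunk, letting the
 * allocator spread it over the devices again.
 */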
1703int btrfs_balance(struct btrfs_root *dev_root)
1704{
1705 int ret;
1706 struct list_head *cur;
1707 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1708 struct btrfs_device *device;
1709 u64 old_size;
1710 u64 size_to_free;
1711 struct btrfs_path *path;
1712 struct btrfs_key key;
1713 struct btrfs_chunk *chunk;
1714 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1715 struct btrfs_trans_handle *trans;
1716 struct btrfs_key found_key;
1717
1718 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1719 return -EROFS;
1720
1721 mutex_lock(&dev_root->fs_info->volume_mutex);
1722 dev_root = dev_root->fs_info->dev_root;
1723
1724 /* step one, make some room on all the devices */
1725 list_for_each(cur, devices) {
1726 device = list_entry(cur, struct btrfs_device, dev_list);
1727 old_size = device->total_bytes;
1728 size_to_free = div_factor(old_size, 1);
1729 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1730 if (!device->writeable ||
1731 device->total_bytes - device->bytes_used > size_to_free)
1732 continue;
1733
1734 ret = btrfs_shrink_device(device, old_size - size_to_free);
1735 BUG_ON(ret);
1736
1737 trans = btrfs_start_transaction(dev_root, 1);
1738 BUG_ON(!trans);
1739
1740 ret = btrfs_grow_device(trans, device, old_size);
1741 BUG_ON(ret);
1742
1743 btrfs_end_transaction(trans, dev_root);
1744 }
1745
1746 /* step two, relocate all the chunks */
1747 path = btrfs_alloc_path();
1748 BUG_ON(!path);
1749
1750 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1751 key.offset = (u64)-1;
1752 key.type = BTRFS_CHUNK_ITEM_KEY;
1753
1754 while (1) {
1755 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1756 if (ret < 0)
1757 goto error;
1758
1759 /*
1760 * this shouldn't happen, it means the last relocate
1761 * failed
1762 */
1763 if (ret == 0)
1764 break;
1765
1766 ret = btrfs_previous_item(chunk_root, path, 0,
1767 BTRFS_CHUNK_ITEM_KEY);
1768 if (ret)
1769 break;
1770
1771 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1772 path->slots[0]);
1773 if (found_key.objectid != key.objectid)
1774 break;
1775
1776 chunk = btrfs_item_ptr(path->nodes[0],
1777 path->slots[0],
1778 struct btrfs_chunk);
1779 key.offset = found_key.offset;
1780 /* chunk zero is special */
1781 if (key.offset == 0)
1782 break;
1783
1784 btrfs_release_path(chunk_root, path);
1785 ret = btrfs_relocate_chunk(chunk_root,
1786 chunk_root->root_key.objectid,
1787 found_key.objectid,
1788 found_key.offset);
1789 BUG_ON(ret);
1790 }
1791 ret = 0;
1792error:
1793 btrfs_free_path(path);
1794 mutex_unlock(&dev_root->fs_info->volume_mutex);
1795 return ret;
1796}
1797
1798/*
1799 * shrinking a device means finding all of the device extents past
1800 * the new size, and then following the back refs to the chunks.
1801 * The chunk relocation code actually frees the device extent.
1802 */
1803int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1804{
1805 struct btrfs_trans_handle *trans;
1806 struct btrfs_root *root = device->dev_root;
1807 struct btrfs_dev_extent *dev_extent = NULL;
1808 struct btrfs_path *path;
1809 u64 length;
1810 u64 chunk_tree;
1811 u64 chunk_objectid;
1812 u64 chunk_offset;
1813 int ret;
1814 int slot;
1815 struct extent_buffer *l;
1816 struct btrfs_key key;
1817 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1818 u64 old_total = btrfs_super_total_bytes(super_copy);
1819 u64 diff = device->total_bytes - new_size;
1820
1821 if (new_size >= device->total_bytes)
1822 return -EINVAL;
1823
1824 path = btrfs_alloc_path();
1825 if (!path)
1826 return -ENOMEM;
1827
1828 trans = btrfs_start_transaction(root, 1);
1829 if (!trans) {
1830 ret = -ENOMEM;
1831 goto done;
1832 }
1833
1834 path->reada = 2;
1835
1836 lock_chunks(root);
1837
1838 device->total_bytes = new_size;
1839 if (device->writeable)
1840 device->fs_devices->total_rw_bytes -= diff;
1841 ret = btrfs_update_device(trans, device);
1842 if (ret) {
1843 unlock_chunks(root);
1844 btrfs_end_transaction(trans, root);
1845 goto done;
1846 }
1847 WARN_ON(diff > old_total);
1848 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1849 unlock_chunks(root);
1850 btrfs_end_transaction(trans, root);
1851
1852 key.objectid = device->devid;
1853 key.offset = (u64)-1;
1854 key.type = BTRFS_DEV_EXTENT_KEY;
1855
1856 while (1) {
1857 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1858 if (ret < 0)
1859 goto done;
1860
1861 ret = btrfs_previous_item(root, path, 0, key.type);
1862 if (ret < 0)
1863 goto done;
1864 if (ret) {
1865 ret = 0;
1866 goto done;
1867 }
1868
1869 l = path->nodes[0];
1870 slot = path->slots[0];
1871 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1872
1873 if (key.objectid != device->devid)
1874 goto done;
1875
1876 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1877 length = btrfs_dev_extent_length(l, dev_extent);
1878
1879 if (key.offset + length <= new_size)
1880 goto done;
1881
1882 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1883 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1884 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1885 btrfs_release_path(root, path);
1886
1887 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1888 chunk_offset);
1889 if (ret)
1890 goto done;
1891 }
1892
1893done:
1894 btrfs_free_path(path);
1895 return ret;
1896}
1897
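/*
 * append a copy of a system chunk item to the sys_chunk_array embedded
 * in the super block copy, so the chunk tree itself can be mapped at
 * mount time.
 */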
1898static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1899 struct btrfs_root *root,
1900 struct btrfs_key *key,
1901 struct btrfs_chunk *chunk, int item_size)
1902{
1903 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1904 struct btrfs_disk_key disk_key;
1905 u32 array_size;
1906 u8 *ptr;
1907
1908 array_size = btrfs_super_sys_array_size(super_copy);
1909 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1910 return -EFBIG;
1911
1912 ptr = super_copy->sys_chunk_array + array_size;
1913 btrfs_cpu_key_to_disk(&disk_key, key);
1914 memcpy(ptr, &disk_key, sizeof(disk_key));
1915 ptr += sizeof(disk_key);
1916 memcpy(ptr, chunk, item_size);
1917 item_size += sizeof(disk_key);
1918 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1919 return 0;
1920}
1921
1922static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
1923 int num_stripes, int sub_stripes)
1924{
1925 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1926 return calc_size;
1927 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1928 return calc_size * (num_stripes / sub_stripes);
1929 else
1930 return calc_size * num_stripes;
1931}
1932
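/*
 * worker for chunk allocation: pick a stripe count and stripe size for
 * @type, find free dev extents on enough writable devices and record the
 * resulting stripe map in the in-memory mapping tree.  The on-disk chunk
 * item is written later by __finish_chunk_alloc().
 */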
1933static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *extent_root,
1935 struct map_lookup **map_ret,
1936 u64 *num_bytes, u64 *stripe_size,
1937 u64 start, u64 type)
1938{
1939 struct btrfs_fs_info *info = extent_root->fs_info;
1940 struct btrfs_device *device = NULL;
1941 struct btrfs_fs_devices *fs_devices = info->fs_devices;
1942 struct list_head *cur;
1943 struct map_lookup *map = NULL;
1944 struct extent_map_tree *em_tree;
1945 struct extent_map *em;
1946 struct list_head private_devs;
1947 int min_stripe_size = 1 * 1024 * 1024;
1948 u64 calc_size = 1024 * 1024 * 1024;
1949 u64 max_chunk_size = calc_size;
1950 u64 min_free;
1951 u64 avail;
1952 u64 max_avail = 0;
1953 u64 dev_offset;
1954 int num_stripes = 1;
1955 int min_stripes = 1;
1956 int sub_stripes = 0;
1957 int looped = 0;
1958 int ret;
1959 int index;
1960 int stripe_len = 64 * 1024;
1961
1962 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1963 (type & BTRFS_BLOCK_GROUP_DUP)) {
1964 WARN_ON(1);
1965 type &= ~BTRFS_BLOCK_GROUP_DUP;
1966 }
1967 if (list_empty(&fs_devices->alloc_list))
1968 return -ENOSPC;
1969
1970 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1971 num_stripes = fs_devices->rw_devices;
1972 min_stripes = 2;
1973 }
1974 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1975 num_stripes = 2;
1976 min_stripes = 2;
1977 }
1978 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1979 num_stripes = min_t(u64, 2, fs_devices->rw_devices);
1980 if (num_stripes < 2)
1981 return -ENOSPC;
1982 min_stripes = 2;
1983 }
1984 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1985 num_stripes = fs_devices->rw_devices;
1986 if (num_stripes < 4)
1987 return -ENOSPC;
1988 num_stripes &= ~(u32)1;
1989 sub_stripes = 2;
1990 min_stripes = 4;
1991 }
1992
1993 if (type & BTRFS_BLOCK_GROUP_DATA) {
1994 max_chunk_size = 10 * calc_size;
1995 min_stripe_size = 64 * 1024 * 1024;
1996 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1997 max_chunk_size = 4 * calc_size;
1998 min_stripe_size = 32 * 1024 * 1024;
1999 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2000 calc_size = 8 * 1024 * 1024;
2001 max_chunk_size = calc_size * 2;
2002 min_stripe_size = 1 * 1024 * 1024;
2003 }
2004
2005 /* we don't want a chunk larger than 10% of writeable space */
2006 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2007 max_chunk_size);
2008
2009again:
2010 if (!map || map->num_stripes != num_stripes) {
2011 kfree(map);
2012 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2013 if (!map)
2014 return -ENOMEM;
2015 map->num_stripes = num_stripes;
2016 }
2017
2018 if (calc_size * num_stripes > max_chunk_size) {
2019 calc_size = max_chunk_size;
2020 do_div(calc_size, num_stripes);
2021 do_div(calc_size, stripe_len);
2022 calc_size *= stripe_len;
2023 }
2024 /* we don't want tiny stripes */
2025 calc_size = max_t(u64, min_stripe_size, calc_size);
2026
2027 do_div(calc_size, stripe_len);
2028 calc_size *= stripe_len;
2029
2030 cur = fs_devices->alloc_list.next;
2031 index = 0;
2032
2033 if (type & BTRFS_BLOCK_GROUP_DUP)
2034 min_free = calc_size * 2;
2035 else
2036 min_free = calc_size;
2037
2038 /*
2039 * we add 1MB because we never use the first 1MB of the device, unless
2040 * we've looped, in which case we are likely allocating the maximum
2041 * amount of space left already
2042 */
2043 if (!looped)
2044 min_free += 1024 * 1024;
2045
2046 INIT_LIST_HEAD(&private_devs);
2047 while (index < num_stripes) {
2048 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2049 BUG_ON(!device->writeable);
2050 if (device->total_bytes > device->bytes_used)
2051 avail = device->total_bytes - device->bytes_used;
2052 else
2053 avail = 0;
2054 cur = cur->next;
2055
2056 if (device->in_fs_metadata && avail >= min_free) {
2057 ret = find_free_dev_extent(trans, device,
2058 min_free, &dev_offset);
2059 if (ret == 0) {
2060 list_move_tail(&device->dev_alloc_list,
2061 &private_devs);
2062 map->stripes[index].dev = device;
2063 map->stripes[index].physical = dev_offset;
2064 index++;
2065 if (type & BTRFS_BLOCK_GROUP_DUP) {
2066 map->stripes[index].dev = device;
2067 map->stripes[index].physical =
2068 dev_offset + calc_size;
2069 index++;
2070 }
2071 }
2072 } else if (device->in_fs_metadata && avail > max_avail)
2073 max_avail = avail;
2074 if (cur == &fs_devices->alloc_list)
2075 break;
2076 }
2077 list_splice(&private_devs, &fs_devices->alloc_list);
2078 if (index < num_stripes) {
2079 if (index >= min_stripes) {
2080 num_stripes = index;
2081 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2082 num_stripes /= sub_stripes;
2083 num_stripes *= sub_stripes;
2084 }
2085 looped = 1;
2086 goto again;
2087 }
2088 if (!looped && max_avail > 0) {
2089 looped = 1;
2090 calc_size = max_avail;
2091 goto again;
2092 }
2093 kfree(map);
2094 return -ENOSPC;
2095 }
2096 map->sector_size = extent_root->sectorsize;
2097 map->stripe_len = stripe_len;
2098 map->io_align = stripe_len;
2099 map->io_width = stripe_len;
2100 map->type = type;
2101 map->num_stripes = num_stripes;
2102 map->sub_stripes = sub_stripes;
2103
2104 *map_ret = map;
2105 *stripe_size = calc_size;
2106 *num_bytes = chunk_bytes_by_type(type, calc_size,
2107 num_stripes, sub_stripes);
2108
2109 em = alloc_extent_map(GFP_NOFS);
2110 if (!em) {
2111 kfree(map);
2112 return -ENOMEM;
2113 }
2114 em->bdev = (struct block_device *)map;
2115 em->start = start;
2116 em->len = *num_bytes;
2117 em->block_start = 0;
2118 em->block_len = em->len;
2119
2120 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2121 spin_lock(&em_tree->lock);
2122 ret = add_extent_mapping(em_tree, em);
2123 spin_unlock(&em_tree->lock);
2124 BUG_ON(ret);
2125 free_extent_map(em);
2126
2127 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2128 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2129 start, *num_bytes);
2130 BUG_ON(ret);
2131
2132 index = 0;
2133 while (index < map->num_stripes) {
2134 device = map->stripes[index].dev;
2135 dev_offset = map->stripes[index].physical;
2136
2137 ret = btrfs_alloc_dev_extent(trans, device,
2138 info->chunk_root->root_key.objectid,
2139 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2140 start, dev_offset, calc_size);
2141 BUG_ON(ret);
2142 index++;
2143 }
2144
2145 return 0;
2146}
2147
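/*
 * second phase of chunk allocation: charge the stripe size to each
 * device, build the on-disk chunk item from the map and insert it into
 * the chunk tree (and into the sys_chunk_array for system chunks).
 */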
2148static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2149 struct btrfs_root *extent_root,
2150 struct map_lookup *map, u64 chunk_offset,
2151 u64 chunk_size, u64 stripe_size)
2152{
2153 u64 dev_offset;
2154 struct btrfs_key key;
2155 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2156 struct btrfs_device *device;
2157 struct btrfs_chunk *chunk;
2158 struct btrfs_stripe *stripe;
2159 size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2160 int index = 0;
2161 int ret;
2162
2163 chunk = kzalloc(item_size, GFP_NOFS);
2164 if (!chunk)
2165 return -ENOMEM;
2166
2167 index = 0;
2168 while (index < map->num_stripes) {
2169 device = map->stripes[index].dev;
2170 device->bytes_used += stripe_size;
2171 ret = btrfs_update_device(trans, device);
2172 BUG_ON(ret);
2173 index++;
2174 }
2175
2176 index = 0;
2177 stripe = &chunk->stripe;
2178 while (index < map->num_stripes) {
2179 device = map->stripes[index].dev;
2180 dev_offset = map->stripes[index].physical;
2181
2182 btrfs_set_stack_stripe_devid(stripe, device->devid);
2183 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2184 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2185 stripe++;
2186 index++;
2187 }
2188
2189 btrfs_set_stack_chunk_length(chunk, chunk_size);
2190 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2191 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2192 btrfs_set_stack_chunk_type(chunk, map->type);
2193 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2194 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2195 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2196 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2197 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2198
2199 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2200 key.type = BTRFS_CHUNK_ITEM_KEY;
2201 key.offset = chunk_offset;
2202
2203 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2204 BUG_ON(ret);
2205
2206 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2207 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2208 item_size);
2209 BUG_ON(ret);
2210 }
2211 kfree(chunk);
2212 return 0;
2213}
2214
2215/*
2216 * Chunk allocation falls into two parts. The first part does the work
2217 * that makes the newly allocated chunk usable, but does not perform any
2218 * operation that modifies the chunk tree. The second part does the work
2219 * that requires modifying the chunk tree. This division is important for the
2220 * bootstrap process of adding storage to a seed btrfs.
2221 */
2222int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2223 struct btrfs_root *extent_root, u64 type)
2224{
2225 u64 chunk_offset;
2226 u64 chunk_size;
2227 u64 stripe_size;
2228 struct map_lookup *map;
2229 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2230 int ret;
2231
2232 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2233 &chunk_offset);
2234 if (ret)
2235 return ret;
2236
2237 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2238 &stripe_size, chunk_offset, type);
2239 if (ret)
2240 return ret;
2241
2242 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2243 chunk_size, stripe_size);
2244 BUG_ON(ret);
2245 return 0;
2246}
2247
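/*
 * bootstrap a freshly sprouted filesystem: allocate the first metadata
 * and system chunks on the new writable device, then write out both
 * chunk items once the two block groups exist.
 */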
2248static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root,
2250 struct btrfs_device *device)
2251{
2252 u64 chunk_offset;
2253 u64 sys_chunk_offset;
2254 u64 chunk_size;
2255 u64 sys_chunk_size;
2256 u64 stripe_size;
2257 u64 sys_stripe_size;
2258 u64 alloc_profile;
2259 struct map_lookup *map;
2260 struct map_lookup *sys_map;
2261 struct btrfs_fs_info *fs_info = root->fs_info;
2262 struct btrfs_root *extent_root = fs_info->extent_root;
2263 int ret;
2264
2265 ret = find_next_chunk(fs_info->chunk_root,
2266 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2267 BUG_ON(ret);
2268
2269 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2270 (fs_info->metadata_alloc_profile &
2271 fs_info->avail_metadata_alloc_bits);
2272 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2273
2274 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2275 &stripe_size, chunk_offset, alloc_profile);
2276 BUG_ON(ret);
2277
2278 sys_chunk_offset = chunk_offset + chunk_size;
2279
2280 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2281 (fs_info->system_alloc_profile &
2282 fs_info->avail_system_alloc_bits);
2283 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2284
2285 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2286 &sys_chunk_size, &sys_stripe_size,
2287 sys_chunk_offset, alloc_profile);
2288 BUG_ON(ret);
2289
2290 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2291 BUG_ON(ret);
2292
2293 /*
2294 * Modifying the chunk tree requires allocating new blocks from both
2295 * the system block group and the metadata block group, so we can
2296 * only perform operations that modify the chunk tree after both
2297 * block groups have been created.
2298 */
2299 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2300 chunk_size, stripe_size);
2301 BUG_ON(ret);
2302
2303 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2304 sys_chunk_offset, sys_chunk_size,
2305 sys_stripe_size);
2306 BUG_ON(ret);
2307 return 0;
2308}
2309
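/*
 * a chunk is considered read only if any of the devices backing its
 * stripes is not writable, or if no mapping exists for it at all.
 */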
2310int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2311{
2312 struct extent_map *em;
2313 struct map_lookup *map;
2314 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2315 int readonly = 0;
2316 int i;
2317
2318 spin_lock(&map_tree->map_tree.lock);
2319 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2320 spin_unlock(&map_tree->map_tree.lock);
2321 if (!em)
2322 return 1;
2323
2324 map = (struct map_lookup *)em->bdev;
2325 for (i = 0; i < map->num_stripes; i++) {
2326 if (!map->stripes[i].dev->writeable) {
2327 readonly = 1;
2328 break;
2329 }
2330 }
2331 free_extent_map(em);
2332 return readonly;
2333}
2334
2335void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2336{
2337 extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2338}
2339
2340void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2341{
2342 struct extent_map *em;
2343
2344 while (1) {
2345 spin_lock(&tree->map_tree.lock);
2346 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2347 if (em)
2348 remove_extent_mapping(&tree->map_tree, em);
2349 spin_unlock(&tree->map_tree.lock);
2350 if (!em)
2351 break;
2352 kfree(em->bdev);
2353 /* once for us */
2354 free_extent_map(em);
2355 /* once for the tree */
2356 free_extent_map(em);
2357 }
2358}
2359
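/*
 * return how many copies of the data at @logical exist: num_stripes for
 * raid1/dup, sub_stripes for raid10 and 1 for everything else.
 */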
2360int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2361{
2362 struct extent_map *em;
2363 struct map_lookup *map;
2364 struct extent_map_tree *em_tree = &map_tree->map_tree;
2365 int ret;
2366
2367 spin_lock(&em_tree->lock);
2368 em = lookup_extent_mapping(em_tree, logical, len);
2369 spin_unlock(&em_tree->lock);
2370 BUG_ON(!em);
2371
2372 BUG_ON(em->start > logical || em->start + em->len < logical);
2373 map = (struct map_lookup *)em->bdev;
2374 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2375 ret = map->num_stripes;
2376 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2377 ret = map->sub_stripes;
2378 else
2379 ret = 1;
2380 free_extent_map(em);
2381 return ret;
2382}
2383
2384static int find_live_mirror(struct map_lookup *map, int first, int num,
2385 int optimal)
2386{
2387 int i;
2388 if (map->stripes[optimal].dev->bdev)
2389 return optimal;
2390 for (i = first; i < first + num; i++) {
2391 if (map->stripes[i].dev->bdev)
2392 return i;
2393 }
2394 /* we couldn't find one that doesn't fail. Just return something
2395 * and the io error handling code will clean up eventually
2396 */
2397 return optimal;
2398}
2399
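/*
 * map a logical byte range to the physical stripes it lives on.  For
 * writes every mirror is returned; for reads a single live mirror is
 * chosen, honoring @mirror_num when it is set.  When @unplug_page is
 * given, the backing devices are unplugged instead of building a
 * multi_bio.
 */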
2400static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2401 u64 logical, u64 *length,
2402 struct btrfs_multi_bio **multi_ret,
2403 int mirror_num, struct page *unplug_page)
2404{
2405 struct extent_map *em;
2406 struct map_lookup *map;
2407 struct extent_map_tree *em_tree = &map_tree->map_tree;
2408 u64 offset;
2409 u64 stripe_offset;
2410 u64 stripe_nr;
2411 int stripes_allocated = 8;
2412 int stripes_required = 1;
2413 int stripe_index;
2414 int i;
2415 int num_stripes;
2416 int max_errors = 0;
2417 struct btrfs_multi_bio *multi = NULL;
2418
2419 if (multi_ret && !(rw & (1 << BIO_RW)))
2420 stripes_allocated = 1;
2421again:
2422 if (multi_ret) {
2423 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2424 GFP_NOFS);
2425 if (!multi)
2426 return -ENOMEM;
2427
2428 atomic_set(&multi->error, 0);
2429 }
2430
2431 spin_lock(&em_tree->lock);
2432 em = lookup_extent_mapping(em_tree, logical, *length);
2433 spin_unlock(&em_tree->lock);
2434
2435 if (!em && unplug_page) {
kfree(multi);
2436 return 0;
}
2437
2438 if (!em) {
2439 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2440 (unsigned long long)logical,
2441 (unsigned long long)*length);
2442 BUG();
2443 }
2444
2445 BUG_ON(em->start > logical || em->start + em->len < logical);
2446 map = (struct map_lookup *)em->bdev;
2447 offset = logical - em->start;
2448
2449 if (mirror_num > map->num_stripes)
2450 mirror_num = 0;
2451
2452 /* if our multi bio struct is too small, back off and try again */
2453 if (rw & (1 << BIO_RW)) {
2454 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2455 BTRFS_BLOCK_GROUP_DUP)) {
2456 stripes_required = map->num_stripes;
2457 max_errors = 1;
2458 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2459 stripes_required = map->sub_stripes;
2460 max_errors = 1;
2461 }
2462 }
2463 if (multi_ret && rw == WRITE &&
2464 stripes_allocated < stripes_required) {
2465 stripes_allocated = map->num_stripes;
2466 free_extent_map(em);
2467 kfree(multi);
2468 goto again;
2469 }
2470 stripe_nr = offset;
2471 /*
2472 * stripe_nr counts the total number of stripes we have to stride
2473 * to get to this block
2474 */
2475 do_div(stripe_nr, map->stripe_len);
2476
2477 stripe_offset = stripe_nr * map->stripe_len;
2478 BUG_ON(offset < stripe_offset);
2479
2480 /* stripe_offset is the offset of this block in its stripe */
2481 stripe_offset = offset - stripe_offset;
2482
2483 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2484 BTRFS_BLOCK_GROUP_RAID10 |
2485 BTRFS_BLOCK_GROUP_DUP)) {
2486 /* we limit the length of each bio to what fits in a stripe */
2487 *length = min_t(u64, em->len - offset,
2488 map->stripe_len - stripe_offset);
2489 } else {
2490 *length = em->len - offset;
2491 }
2492
2493 if (!multi_ret && !unplug_page)
2494 goto out;
2495
2496 num_stripes = 1;
2497 stripe_index = 0;
2498 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2499 if (unplug_page || (rw & (1 << BIO_RW)))
2500 num_stripes = map->num_stripes;
2501 else if (mirror_num)
2502 stripe_index = mirror_num - 1;
2503 else {
2504 stripe_index = find_live_mirror(map, 0,
2505 map->num_stripes,
2506 current->pid % map->num_stripes);
2507 }
2508
2509 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2510 if (rw & (1 << BIO_RW))
2511 num_stripes = map->num_stripes;
2512 else if (mirror_num)
2513 stripe_index = mirror_num - 1;
2514
2515 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2516 int factor = map->num_stripes / map->sub_stripes;
2517
2518 stripe_index = do_div(stripe_nr, factor);
2519 stripe_index *= map->sub_stripes;
2520
2521 if (unplug_page || (rw & (1 << BIO_RW)))
2522 num_stripes = map->sub_stripes;
2523 else if (mirror_num)
2524 stripe_index += mirror_num - 1;
2525 else {
2526 stripe_index = find_live_mirror(map, stripe_index,
2527 map->sub_stripes, stripe_index +
2528 current->pid % map->sub_stripes);
2529 }
2530 } else {
2531 /*
2532 * after this do_div call, stripe_nr is the number of stripes
2533 * on this device we have to walk to find the data, and
2534 * stripe_index is the number of our device in the stripe array
2535 */
2536 stripe_index = do_div(stripe_nr, map->num_stripes);
2537 }
2538 BUG_ON(stripe_index >= map->num_stripes);
2539
2540 for (i = 0; i < num_stripes; i++) {
2541 if (unplug_page) {
2542 struct btrfs_device *device;
2543 struct backing_dev_info *bdi;
2544
2545 device = map->stripes[stripe_index].dev;
2546 if (device->bdev) {
2547 bdi = blk_get_backing_dev_info(device->bdev);
2548 if (bdi->unplug_io_fn)
2549 bdi->unplug_io_fn(bdi, unplug_page);
2550 }
2551 } else {
2552 multi->stripes[i].physical =
2553 map->stripes[stripe_index].physical +
2554 stripe_offset + stripe_nr * map->stripe_len;
2555 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2556 }
2557 stripe_index++;
2558 }
2559 if (multi_ret) {
2560 *multi_ret = multi;
2561 multi->num_stripes = num_stripes;
2562 multi->max_errors = max_errors;
2563 }
2564out:
2565 free_extent_map(em);
2566 return 0;
2567}
2568
2569int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2570 u64 logical, u64 *length,
2571 struct btrfs_multi_bio **multi_ret, int mirror_num)
2572{
2573 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2574 mirror_num, NULL);
2575}
2576
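/*
 * reverse map: given a physical offset on a device, collect the logical
 * addresses (one per stripe copy) that map to it within the chunk
 * starting at @chunk_start.
 */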
2577int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2578 u64 chunk_start, u64 physical, u64 devid,
2579 u64 **logical, int *naddrs, int *stripe_len)
2580{
2581 struct extent_map_tree *em_tree = &map_tree->map_tree;
2582 struct extent_map *em;
2583 struct map_lookup *map;
2584 u64 *buf;
2585 u64 bytenr;
2586 u64 length;
2587 u64 stripe_nr;
2588 int i, j, nr = 0;
2589
2590 spin_lock(&em_tree->lock);
2591 em = lookup_extent_mapping(em_tree, chunk_start, 1);
2592 spin_unlock(&em_tree->lock);
2593
2594 BUG_ON(!em || em->start != chunk_start);
2595 map = (struct map_lookup *)em->bdev;
2596
2597 length = em->len;
2598 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2599 do_div(length, map->num_stripes / map->sub_stripes);
2600 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
2601 do_div(length, map->num_stripes);
2602
2603 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
2604 BUG_ON(!buf);
2605
2606 for (i = 0; i < map->num_stripes; i++) {
2607 if (devid && map->stripes[i].dev->devid != devid)
2608 continue;
2609 if (map->stripes[i].physical > physical ||
2610 map->stripes[i].physical + length <= physical)
2611 continue;
2612
2613 stripe_nr = physical - map->stripes[i].physical;
2614 do_div(stripe_nr, map->stripe_len);
2615
2616 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2617 stripe_nr = stripe_nr * map->num_stripes + i;
2618 do_div(stripe_nr, map->sub_stripes);
2619 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2620 stripe_nr = stripe_nr * map->num_stripes + i;
2621 }
2622 bytenr = chunk_start + stripe_nr * map->stripe_len;
2623 WARN_ON(nr >= map->num_stripes);
2624 for (j = 0; j < nr; j++) {
2625 if (buf[j] == bytenr)
2626 break;
2627 }
2628 if (j == nr) {
2629 WARN_ON(nr >= map->num_stripes);
2630 buf[nr++] = bytenr;
2631 }
2632 }
2633
2634 for (i = 0; i < nr; i++) {
2635 struct btrfs_multi_bio *multi;
2636 struct btrfs_bio_stripe *stripe;
2637 int ret;
2638
2639 length = 1;
2640 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2641 &length, &multi, 0);
2642 BUG_ON(ret);
2643
2644 stripe = multi->stripes;
2645 for (j = 0; j < multi->num_stripes; j++) {
2646 if (physical >= stripe->physical &&
2647 physical < stripe->physical + length)
2648 break;
2649 }
2650 BUG_ON(j >= multi->num_stripes);
2651 kfree(multi);
2652 }
2653
2654 *logical = buf;
2655 *naddrs = nr;
2656 *stripe_len = map->stripe_len;
2657
2658 free_extent_map(em);
2659 return 0;
2660}
2661
2662int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2663 u64 logical, struct page *page)
2664{
2665 u64 length = PAGE_CACHE_SIZE;
2666 return __btrfs_map_block(map_tree, READ, logical, &length,
2667 NULL, 0, page);
2668}
2669
2670static void end_bio_multi_stripe(struct bio *bio, int err)
2671{
2672 struct btrfs_multi_bio *multi = bio->bi_private;
2673 int is_orig_bio = 0;
2674
2675 if (err)
2676 atomic_inc(&multi->error);
2677
2678 if (bio == multi->orig_bio)
2679 is_orig_bio = 1;
2680
2681 if (atomic_dec_and_test(&multi->stripes_pending)) {
2682 if (!is_orig_bio) {
2683 bio_put(bio);
2684 bio = multi->orig_bio;
2685 }
2686 bio->bi_private = multi->private;
2687 bio->bi_end_io = multi->end_io;
2688 /* only send an error to the higher layers if it is
2689 * beyond the tolerance of the multi-bio
2690 */
2691 if (atomic_read(&multi->error) > multi->max_errors) {
2692 err = -EIO;
2693 } else if (err) {
2694 /*
2695 * this bio is actually up to date, we didn't
2696 * go over the max number of errors
2697 */
2698 set_bit(BIO_UPTODATE, &bio->bi_flags);
2699 err = 0;
2700 }
2701 kfree(multi);
2702
2703 bio_endio(bio, err);
2704 } else if (!is_orig_bio) {
2705 bio_put(bio);
2706 }
2707}
2708
2709struct async_sched {
2710 struct bio *bio;
2711 int rw;
2712 struct btrfs_fs_info *info;
2713 struct btrfs_work work;
2714};
2715
2716/*
2717 * see run_scheduled_bios for a description of why bios are collected for
2718 * async submit.
2719 *
2720 * This will add one bio to the pending list for a device and make sure
2721 * the work struct is scheduled.
2722 */
2723static noinline int schedule_bio(struct btrfs_root *root,
2724 struct btrfs_device *device,
2725 int rw, struct bio *bio)
2726{
2727 int should_queue = 1;
2728
2729 /* don't bother with additional async steps for reads, right now */
2730 if (!(rw & (1 << BIO_RW))) {
2731 bio_get(bio);
2732 submit_bio(rw, bio);
2733 bio_put(bio);
2734 return 0;
2735 }
2736
2737 /*
2738 * nr_async_bios allows us to reliably return congestion to the
2739 * higher layers. Otherwise, the async bio makes it appear we have
2740 * made progress against dirty pages when we've really just put it
2741 * on a queue for later
2742 */
2743 atomic_inc(&root->fs_info->nr_async_bios);
2744 WARN_ON(bio->bi_next);
2745 bio->bi_next = NULL;
2746 bio->bi_rw |= rw;
2747
2748 spin_lock(&device->io_lock);
2749
2750 if (device->pending_bio_tail)
2751 device->pending_bio_tail->bi_next = bio;
2752
2753 device->pending_bio_tail = bio;
2754 if (!device->pending_bios)
2755 device->pending_bios = bio;
2756 if (device->running_pending)
2757 should_queue = 0;
2758
2759 spin_unlock(&device->io_lock);
2760
2761 if (should_queue)
2762 btrfs_queue_worker(&root->fs_info->submit_workers,
2763 &device->work);
2764 return 0;
2765}
2766
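/*
 * main entry point for sending a bio down to the right devices.  The bio
 * is mapped through __btrfs_map_block(), cloned once per extra stripe,
 * and either submitted directly or queued for the async submit workers.
 */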
2767int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2768 int mirror_num, int async_submit)
2769{
2770 struct btrfs_mapping_tree *map_tree;
2771 struct btrfs_device *dev;
2772 struct bio *first_bio = bio;
2773 u64 logical = (u64)bio->bi_sector << 9;
2774 u64 length = 0;
2775 u64 map_length;
2776 struct btrfs_multi_bio *multi = NULL;
2777 int ret;
2778 int dev_nr = 0;
2779 int total_devs = 1;
2780
2781 length = bio->bi_size;
2782 map_tree = &root->fs_info->mapping_tree;
2783 map_length = length;
2784
2785 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2786 mirror_num);
2787 BUG_ON(ret);
2788
2789 total_devs = multi->num_stripes;
2790 if (map_length < length) {
2791 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
2792 "len %llu\n", (unsigned long long)logical,
2793 (unsigned long long)length,
2794 (unsigned long long)map_length);
2795 BUG();
2796 }
2797 multi->end_io = first_bio->bi_end_io;
2798 multi->private = first_bio->bi_private;
2799 multi->orig_bio = first_bio;
2800 atomic_set(&multi->stripes_pending, multi->num_stripes);
2801
2802 while (dev_nr < total_devs) {
2803 if (total_devs > 1) {
2804 if (dev_nr < total_devs - 1) {
2805 bio = bio_clone(first_bio, GFP_NOFS);
2806 BUG_ON(!bio);
2807 } else {
2808 bio = first_bio;
2809 }
2810 bio->bi_private = multi;
2811 bio->bi_end_io = end_bio_multi_stripe;
2812 }
2813 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2814 dev = multi->stripes[dev_nr].dev;
2815 BUG_ON(rw == WRITE && !dev->writeable);
2816 if (dev && dev->bdev) {
2817 bio->bi_bdev = dev->bdev;
2818 if (async_submit)
2819 schedule_bio(root, dev, rw, bio);
2820 else
2821 submit_bio(rw, bio);
2822 } else {
2823 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2824 bio->bi_sector = logical >> 9;
2825 bio_endio(bio, -EIO);
2826 }
2827 dev_nr++;
2828 }
2829 if (total_devs == 1)
2830 kfree(multi);
2831 return 0;
2832}
2833
2834struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2835 u8 *uuid, u8 *fsid)
2836{
2837 struct btrfs_device *device;
2838 struct btrfs_fs_devices *cur_devices;
2839
2840 cur_devices = root->fs_info->fs_devices;
2841 while (cur_devices) {
2842 if (!fsid ||
2843 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2844 device = __find_device(&cur_devices->devices,
2845 devid, uuid);
2846 if (device)
2847 return device;
2848 }
2849 cur_devices = cur_devices->seed;
2850 }
2851 return NULL;
2852}
2853
2854static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2855 u64 devid, u8 *dev_uuid)
2856{
2857 struct btrfs_device *device;
2858 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2859
2860 device = kzalloc(sizeof(*device), GFP_NOFS);
2861 if (!device)
2862 return NULL;
2863 list_add(&device->dev_list,
2864 &fs_devices->devices);
2865 device->barriers = 1;
2866 device->dev_root = root->fs_info->dev_root;
2867 device->devid = devid;
2868 device->work.func = pending_bios_fn;
2869 device->fs_devices = fs_devices;
2870 fs_devices->num_devices++;
2871 spin_lock_init(&device->io_lock);
2872 INIT_LIST_HEAD(&device->dev_alloc_list);
2873 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2874 return device;
2875}
2876
2877static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2878 struct extent_buffer *leaf,
2879 struct btrfs_chunk *chunk)
2880{
2881 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2882 struct map_lookup *map;
2883 struct extent_map *em;
2884 u64 logical;
2885 u64 length;
2886 u64 devid;
2887 u8 uuid[BTRFS_UUID_SIZE];
2888 int num_stripes;
2889 int ret;
2890 int i;
2891
2892 logical = key->offset;
2893 length = btrfs_chunk_length(leaf, chunk);
2894
2895 spin_lock(&map_tree->map_tree.lock);
2896 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2897 spin_unlock(&map_tree->map_tree.lock);
2898
2899 /* already mapped? */
2900 if (em && em->start <= logical && em->start + em->len > logical) {
2901 free_extent_map(em);
2902 return 0;
2903 } else if (em) {
2904 free_extent_map(em);
2905 }
2906
2911 em = alloc_extent_map(GFP_NOFS);
2912 if (!em)
2913 return -ENOMEM;
2914 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2915 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2916 if (!map) {
2917 free_extent_map(em);
2918 return -ENOMEM;
2919 }
2920
2921 em->bdev = (struct block_device *)map;
2922 em->start = logical;
2923 em->len = length;
2924 em->block_start = 0;
2925 em->block_len = em->len;
2926
2927 map->num_stripes = num_stripes;
2928 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2929 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2930 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2931 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2932 map->type = btrfs_chunk_type(leaf, chunk);
2933 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2934 for (i = 0; i < num_stripes; i++) {
2935 map->stripes[i].physical =
2936 btrfs_stripe_offset_nr(leaf, chunk, i);
2937 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2938 read_extent_buffer(leaf, uuid, (unsigned long)
2939 btrfs_stripe_dev_uuid_nr(chunk, i),
2940 BTRFS_UUID_SIZE);
2941 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2942 NULL);
2943 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2944 kfree(map);
2945 free_extent_map(em);
2946 return -EIO;
2947 }
2948 if (!map->stripes[i].dev) {
2949 map->stripes[i].dev =
2950 add_missing_dev(root, devid, uuid);
2951 if (!map->stripes[i].dev) {
2952 kfree(map);
2953 free_extent_map(em);
2954 return -EIO;
2955 }
2956 }
2957 map->stripes[i].dev->in_fs_metadata = 1;
2958 }
2959
2960 spin_lock(&map_tree->map_tree.lock);
2961 ret = add_extent_mapping(&map_tree->map_tree, em);
2962 spin_unlock(&map_tree->map_tree.lock);
2963 BUG_ON(ret);
2964 free_extent_map(em);
2965
2966 return 0;
2967}
2968
2969static int fill_device_from_item(struct extent_buffer *leaf,
2970 struct btrfs_dev_item *dev_item,
2971 struct btrfs_device *device)
2972{
2973 unsigned long ptr;
2974
2975 device->devid = btrfs_device_id(leaf, dev_item);
2976 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2977 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2978 device->type = btrfs_device_type(leaf, dev_item);
2979 device->io_align = btrfs_device_io_align(leaf, dev_item);
2980 device->io_width = btrfs_device_io_width(leaf, dev_item);
2981 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2982
2983 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2984 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2985
2986 return 0;
2987}
2988
2989static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
2990{
2991 struct btrfs_fs_devices *fs_devices;
2992 int ret;
2993
2994 mutex_lock(&uuid_mutex);
2995
2996 fs_devices = root->fs_info->fs_devices->seed;
2997 while (fs_devices) {
2998 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2999 ret = 0;
3000 goto out;
3001 }
3002 fs_devices = fs_devices->seed;
3003 }
3004
3005 fs_devices = find_fsid(fsid);
3006 if (!fs_devices) {
3007 ret = -ENOENT;
3008 goto out;
3009 }
3010
3011 fs_devices = clone_fs_devices(fs_devices);
3012 if (IS_ERR(fs_devices)) {
3013 ret = PTR_ERR(fs_devices);
3014 goto out;
3015 }
3016
3017 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3018 root->fs_info->bdev_holder);
3019 if (ret)
3020 goto out;
3021
3022 if (!fs_devices->seeding) {
3023 __btrfs_close_devices(fs_devices);
3024 free_fs_devices(fs_devices);
3025 ret = -EINVAL;
3026 goto out;
3027 }
3028
3029 fs_devices->seed = root->fs_info->fs_devices->seed;
3030 root->fs_info->fs_devices->seed = fs_devices;
3031out:
3032 mutex_unlock(&uuid_mutex);
3033 return ret;
3034}
3035
3036static int read_one_dev(struct btrfs_root *root,
3037 struct extent_buffer *leaf,
3038 struct btrfs_dev_item *dev_item)
3039{
3040 struct btrfs_device *device;
3041 u64 devid;
3042 int ret;
3043 u8 fs_uuid[BTRFS_UUID_SIZE];
3044 u8 dev_uuid[BTRFS_UUID_SIZE];
3045
3046 devid = btrfs_device_id(leaf, dev_item);
3047 read_extent_buffer(leaf, dev_uuid,
3048 (unsigned long)btrfs_device_uuid(dev_item),
3049 BTRFS_UUID_SIZE);
3050 read_extent_buffer(leaf, fs_uuid,
3051 (unsigned long)btrfs_device_fsid(dev_item),
3052 BTRFS_UUID_SIZE);
3053
3054 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3055 ret = open_seed_devices(root, fs_uuid);
3056 if (ret && !btrfs_test_opt(root, DEGRADED))
3057 return ret;
3058 }
3059
3060 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3061 if (!device || !device->bdev) {
3062 if (!btrfs_test_opt(root, DEGRADED))
3063 return -EIO;
3064
3065 if (!device) {
3066 printk(KERN_WARNING "devid %llu missing\n",
3067 (unsigned long long)devid);
3068 device = add_missing_dev(root, devid, dev_uuid);
3069 if (!device)
3070 return -ENOMEM;
3071 }
3072 }
3073
3074 if (device->fs_devices != root->fs_info->fs_devices) {
3075 BUG_ON(device->writeable);
3076 if (device->generation !=
3077 btrfs_device_generation(leaf, dev_item))
3078 return -EINVAL;
3079 }
3080
3081 fill_device_from_item(leaf, dev_item, device);
3082 device->dev_root = root->fs_info->dev_root;
3083 device->in_fs_metadata = 1;
3084 if (device->writeable)
3085 device->fs_devices->total_rw_bytes += device->total_bytes;
3086 ret = 0;
3087 return ret;
3088}
3089
3090int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3091{
3092 struct btrfs_dev_item *dev_item;
3093
3094 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3095 dev_item);
3096 return read_one_dev(root, buf, dev_item);
3097}
3098
3099int btrfs_read_sys_array(struct btrfs_root *root)
3100{
3101 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3102 struct extent_buffer *sb;
3103 struct btrfs_disk_key *disk_key;
3104 struct btrfs_chunk *chunk;
3105 u8 *ptr;
3106 unsigned long sb_ptr;
3107 int ret = 0;
3108 u32 num_stripes;
3109 u32 array_size;
3110 u32 len = 0;
3111 u32 cur;
3112 struct btrfs_key key;
3113
3114 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3115 BTRFS_SUPER_INFO_SIZE);
3116 if (!sb)
3117 return -ENOMEM;
3118 btrfs_set_buffer_uptodate(sb);
3119 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3120 array_size = btrfs_super_sys_array_size(super_copy);
3121
3122 ptr = super_copy->sys_chunk_array;
3123 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3124 cur = 0;
3125
3126 while (cur < array_size) {
3127 disk_key = (struct btrfs_disk_key *)ptr;
3128 btrfs_disk_key_to_cpu(&key, disk_key);
3129
3130 len = sizeof(*disk_key); ptr += len;
3131 sb_ptr += len;
3132 cur += len;
3133
3134 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3135 chunk = (struct btrfs_chunk *)sb_ptr;
3136 ret = read_one_chunk(root, &key, sb, chunk);
3137 if (ret)
3138 break;
3139 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3140 len = btrfs_chunk_item_size(num_stripes);
3141 } else {
3142 ret = -EIO;
3143 break;
3144 }
3145 ptr += len;
3146 sb_ptr += len;
3147 cur += len;
3148 }
3149 free_extent_buffer(sb);
3150 return ret;
3151}
3152
3153int btrfs_read_chunk_tree(struct btrfs_root *root)
3154{
3155 struct btrfs_path *path;
3156 struct extent_buffer *leaf;
3157 struct btrfs_key key;
3158 struct btrfs_key found_key;
3159 int ret;
3160 int slot;
3161
3162 root = root->fs_info->chunk_root;
3163
3164 path = btrfs_alloc_path();
3165 if (!path)
3166 return -ENOMEM;
3167
3168 /* first we search for all of the device items, and then we
3169 * read in all of the chunk items. This way we can create chunk
3170 * mappings that reference all of the devices that are found
3171 */
3172 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3173 key.offset = 0;
3174 key.type = 0;
3175again:
3176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3177 while (1) {
3178 leaf = path->nodes[0];
3179 slot = path->slots[0];
3180 if (slot >= btrfs_header_nritems(leaf)) {
3181 ret = btrfs_next_leaf(root, path);
3182 if (ret == 0)
3183 continue;
3184 if (ret < 0)
3185 goto error;
3186 break;
3187 }
3188 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3189 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3190 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3191 break;
3192 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3193 struct btrfs_dev_item *dev_item;
3194 dev_item = btrfs_item_ptr(leaf, slot,
3195 struct btrfs_dev_item);
3196 ret = read_one_dev(root, leaf, dev_item);
3197 if (ret)
3198 goto error;
3199 }
3200 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3201 struct btrfs_chunk *chunk;
3202 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3203 ret = read_one_chunk(root, &found_key, leaf, chunk);
3204 if (ret)
3205 goto error;
3206 }
3207 path->slots[0]++;
3208 }
3209 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3210 key.objectid = 0;
3211 btrfs_release_path(root, path);
3212 goto again;
3213 }
3214 ret = 0;
3215error:
3216 btrfs_free_path(path);
3217 return ret;
3218}
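
btrfs_map_bio() above fans one logical bio out across the stripes returned by btrfs_map_block(), cloning the bio for all but the last stripe, while end_bio_multi_stripe() earlier in this file counts completions and errors so the original bio completes exactly once, succeeding as long as no more than max_errors stripes failed. A user-space sketch of that clone-count-complete pattern (C11 atomics; the names and the two-mirror driver are illustrative, not kernel code):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct multi_req {
	atomic_int pending;	/* clones still in flight */
	atomic_int errors;	/* clones that failed */
	int max_errors;		/* tolerated failures, e.g. 1 for RAID1 */
	void (*end_io)(int err);/* the original completion callback */
};

/* called once per clone as it completes; err is nonzero on failure */
static void clone_end_io(struct multi_req *multi, int err)
{
	if (err)
		atomic_fetch_add(&multi->errors, 1);
	/* the last clone to finish reports the aggregate result */
	if (atomic_fetch_sub(&multi->pending, 1) == 1) {
		int final = atomic_load(&multi->errors) > multi->max_errors
				? -EIO : 0;
		multi->end_io(final);
		free(multi);
	}
}

static void orig_done(int err)
{
	printf("original request completed: %d\n", err);
}

int main(void)
{
	struct multi_req *multi = malloc(sizeof(*multi));

	atomic_init(&multi->pending, 2);	/* two mirrored stripes */
	atomic_init(&multi->errors, 0);
	multi->max_errors = 1;
	multi->end_io = orig_done;

	clone_end_io(multi, -EIO);	/* one mirror fails ... */
	clone_end_io(multi, 0);		/* ... the other succeeds: reports 0 */
	return 0;
}
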
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..86c44e9ae110
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,162 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
26struct btrfs_device {
27 struct list_head dev_list;
28 struct list_head dev_alloc_list;
29 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root;
31 struct bio *pending_bios;
32 struct bio *pending_bio_tail;
33 int running_pending;
34 u64 generation;
35
36 int barriers;
37 int writeable;
38 int in_fs_metadata;
39
40 spinlock_t io_lock;
41
42 struct block_device *bdev;
43
44 /* the mode sent to open_bdev_exclusive */
45 fmode_t mode;
46
47 char *name;
48
49 /* the internal btrfs device id */
50 u64 devid;
51
52 /* size of the device */
53 u64 total_bytes;
54
55 /* bytes used */
56 u64 bytes_used;
57
58 /* optimal io alignment for this device */
59 u32 io_align;
60
61 /* optimal io width for this device */
62 u32 io_width;
63
64 /* minimal io size for this device */
65 u32 sector_size;
66
67 /* type and info about this device */
68 u64 type;
69
70 /* physical drive uuid (or lvm uuid) */
71 u8 uuid[BTRFS_UUID_SIZE];
72
73 struct btrfs_work work;
74};
75
76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78
79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid;
81 u64 latest_trans;
82 u64 num_devices;
83 u64 open_devices;
84 u64 rw_devices;
85 u64 total_rw_bytes;
86 struct block_device *latest_bdev;
87 /* all of the devices in the FS */
88 struct list_head devices;
89
90 /* devices not currently being allocated */
91 struct list_head alloc_list;
92 struct list_head list;
93
94 struct btrfs_fs_devices *seed;
95 int seeding;
96
97 int opened;
98};
99
100struct btrfs_bio_stripe {
101 struct btrfs_device *dev;
102 u64 physical;
103};
104
105struct btrfs_multi_bio {
106 atomic_t stripes_pending;
107 bio_end_io_t *end_io;
108 struct bio *orig_bio;
109 void *private;
110 atomic_t error;
111 int max_errors;
112 int num_stripes;
113 struct btrfs_bio_stripe stripes[];
114};
115
116#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
117 (sizeof(struct btrfs_bio_stripe) * (n)))
118
119int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
120 struct btrfs_device *device,
121 u64 chunk_tree, u64 chunk_objectid,
122 u64 chunk_offset, u64 start, u64 num_bytes);
123int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
124 u64 logical, u64 *length,
125 struct btrfs_multi_bio **multi_ret, int mirror_num);
126int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
127 u64 chunk_start, u64 physical, u64 devid,
128 u64 **logical, int *naddrs, int *stripe_len);
129int btrfs_read_sys_array(struct btrfs_root *root);
130int btrfs_read_chunk_tree(struct btrfs_root *root);
131int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
132 struct btrfs_root *extent_root, u64 type);
133void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
134void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
135int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
136 int mirror_num, int async_submit);
137int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
138int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
139 fmode_t flags, void *holder);
140int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
141 struct btrfs_fs_devices **fs_devices_ret);
142int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
143int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
144int btrfs_add_device(struct btrfs_trans_handle *trans,
145 struct btrfs_root *root,
146 struct btrfs_device *device);
147int btrfs_rm_device(struct btrfs_root *root, char *device_path);
148int btrfs_cleanup_fs_uuids(void);
149int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
150int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
151 u64 logical, struct page *page);
152int btrfs_grow_device(struct btrfs_trans_handle *trans,
153 struct btrfs_device *device, u64 new_size);
154struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
155 u8 *uuid, u8 *fsid);
156int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
157int btrfs_init_new_device(struct btrfs_root *root, char *path);
158int btrfs_balance(struct btrfs_root *dev_root);
159void btrfs_unlock_volumes(void);
160void btrfs_lock_volumes(void);
161int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
162#endif
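
struct btrfs_multi_bio above ends in a flexible array member, and btrfs_multi_bio_size(n) sizes a single allocation that holds the header plus n stripe records. A stand-alone sketch of the same allocation idiom (illustrative struct names, not the kernel types):

#include <stdlib.h>

struct stripe {
	void *dev;
	unsigned long long physical;
};

struct multi {
	int num_stripes;
	struct stripe stripes[];	/* flexible array member */
};

#define multi_size(n) (sizeof(struct multi) + sizeof(struct stripe) * (n))

/* one allocation covers the header and all n stripe records */
static struct multi *alloc_multi(int n)
{
	struct multi *m = calloc(1, multi_size(n));

	if (m)
		m->num_stripes = n;
	return m;
}
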
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..7f332e270894
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,322 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
75
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
92 /* first let's see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
100 /* ok we already have this xattr, let's remove it */
101 if (di) {
102 /* if the caller wanted create only, exit */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
152 struct btrfs_item *item;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * ok we want all objects associated with this id.
163 * NOTE: we set key.offset = 0; because we want to start with the
164 * first xattr that we find and walk forward
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems-1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
209 item = btrfs_item_nr(leaf, slot);
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 goto err;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
246
247/*
248 * List of handlers for synthetic system.* attributes. All real ondisk
249 * attributes are handled directly.
250 */
251struct xattr_handler *btrfs_xattr_handlers[] = {
252#ifdef CONFIG_FS_POSIX_ACL
253 &btrfs_xattr_acl_access_handler,
254 &btrfs_xattr_acl_default_handler,
255#endif
256 NULL,
257};
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
262 * This is applied after the check for the synthetic attributes in the system
263 * namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX,
268 XATTR_SECURITY_PREFIX_LEN) ||
269 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
270 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
271 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
272}
273
274ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
275 void *buffer, size_t size)
276{
277 /*
278 * If this is a request for a synthetic attribute in the system.*
279 * namespace use the generic infrastructure to resolve a handler
280 * for it via sb->s_xattr.
281 */
282 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
283 return generic_getxattr(dentry, name, buffer, size);
284
285 if (!btrfs_is_valid_xattr(name))
286 return -EOPNOTSUPP;
287 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
288}
289
290int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
291 size_t size, int flags)
292{
293 /*
294 * If this is a request for a synthetic attribute in the system.*
295 * namespace use the generic infrastructure to resolve a handler
296 * for it via sb->s_xattr.
297 */
298 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
299 return generic_setxattr(dentry, name, value, size, flags);
300
301 if (!btrfs_is_valid_xattr(name))
302 return -EOPNOTSUPP;
303
304 if (size == 0)
305 value = ""; /* empty EA, do not remove */
306 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
307}
308
309int btrfs_removexattr(struct dentry *dentry, const char *name)
310{
311 /*
312 * If this is a request for a synthetic attribute in the system.*
313 * namespace use the generic infrastructure to resolve a handler
314 * for it via sb->s_xattr.
315 */
316 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
317 return generic_removexattr(dentry, name);
318
319 if (!btrfs_is_valid_xattr(name))
320 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322}
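
The three VFS entry points above share one dispatch rule: system.* names are routed through the generic handler table, the other recognized prefixes go straight to the on-disk implementation, and anything else is rejected with -EOPNOTSUPP. A compact user-space sketch of that prefix dispatch (the two handler functions are placeholders, not the kernel API):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* placeholder backends: in the kernel these would be the generic
 * sb->s_xattr handlers and the on-disk __btrfs_*xattr() helpers */
static int generic_handler(const char *name) { printf("generic: %s\n", name); return 0; }
static int disk_handler(const char *name)    { printf("on-disk: %s\n", name); return 0; }

static int is_valid_prefix(const char *name)
{
	static const char *prefixes[] = {
		"security.", "system.", "trusted.", "user.",
	};
	size_t i;

	for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++)
		if (!strncmp(name, prefixes[i], strlen(prefixes[i])))
			return 1;
	return 0;
}

static int dispatch_xattr(const char *name)
{
	if (!strncmp(name, "system.", strlen("system.")))
		return generic_handler(name);	/* synthetic attrs, e.g. ACLs */
	if (!is_valid_prefix(name))
		return -EOPNOTSUPP;		/* unknown namespace */
	return disk_handler(name);		/* stored as a btrfs dir item */
}

int main(void)
{
	dispatch_xattr("user.comment");			/* on-disk */
	dispatch_xattr("system.posix_acl_access");	/* generic */
	printf("%d\n", dispatch_xattr("bogus.attr"));	/* -EOPNOTSUPP */
	return 0;
}
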
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __XATTR__
20#define __XATTR__
21
22#include <linux/xattr.h>
23
24extern struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[];
27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name,
31 const void *value, size_t size, int flags);
32
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38
39#endif /* __XATTR__ */
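
__btrfs_getxattr() declared above follows the usual getxattr contract: a zero-size call returns the attribute's length, and a buffer that is too small fails with -ERANGE. A sketch of how a caller typically sizes its buffer under that contract (getx is a hypothetical stand-in for any function with this signature, not a kernel API):

#include <stddef.h>
#include <stdlib.h>

/* any function with the getxattr contract: returns the attribute
 * length when called with a NULL/zero-size buffer, and a negative
 * error such as -ERANGE when the supplied buffer is too small */
typedef long (*getx_fn)(const char *name, void *buf, size_t size);

static void *read_attr(getx_fn getx, const char *name, long *len_out)
{
	long len = getx(name, NULL, 0);	/* probe for the size first */
	void *buf;

	if (len < 0)
		return NULL;
	buf = malloc(len);
	if (!buf)
		return NULL;
	len = getx(name, buf, len);	/* now fetch the data */
	if (len < 0) {			/* e.g. the attr changed meanwhile */
		free(buf);
		return NULL;
	}
	*len_out = len;
	return buf;
}
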
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..ecfbce836d32
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33#include "compression.h"
34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace {
45 z_stream inf_strm;
46 z_stream def_strm;
47 char *buf;
48 struct list_head list;
49};
50
51static LIST_HEAD(idle_workspace);
52static DEFINE_SPINLOCK(workspace_lock);
53static unsigned long num_workspace;
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56
57/*
58 * this finds an available zlib workspace or allocates a new one.
59 * An ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{
63 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148
149/*
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165}
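
find_zlib_workspace() and free_workspace() above implement a bounded pool: idle workspaces are reused, the total number allocated is capped near num_online_cpus(), and callers sleep until a workspace is freed. A user-space sketch of the same bounded-pool discipline using a mutex and condition variable (illustrative only; the kernel version uses a spinlock plus a wait queue and allocates outside the lock):

#include <pthread.h>
#include <stddef.h>

struct item {
	struct item *next;
};

/* a bounded pool: reuse idle items, cap total allocations at 'limit',
 * and sleep until something is freed once the cap is reached */
struct pool {
	pthread_mutex_t lock;
	pthread_cond_t freed;
	struct item *idle;	/* singly linked idle list */
	int alloced;
	int limit;
};

/* returns an idle item, or NULL once the caller has been granted the
 * right to allocate a fresh one outside the lock */
static struct item *pool_get(struct pool *p)
{
	struct item *it = NULL;

	pthread_mutex_lock(&p->lock);
	for (;;) {
		if (p->idle) {			/* reuse an idle item */
			it = p->idle;
			p->idle = it->next;
			break;
		}
		if (p->alloced < p->limit) {	/* room for a new one */
			p->alloced++;
			break;
		}
		/* pool exhausted: wait for pool_put() to signal */
		pthread_cond_wait(&p->freed, &p->lock);
	}
	pthread_mutex_unlock(&p->lock);
	return it;
}

static void pool_put(struct pool *p, struct item *it)
{
	pthread_mutex_lock(&p->lock);
	it->next = p->idle;	/* keep it around for the next caller */
	p->idle = it;
	pthread_mutex_unlock(&p->lock);
	pthread_cond_signal(&p->freed);
}
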
166
167/*
168 * given an address space and start/len, compress the bytes.
169 *
170 * pages are allocated to hold the compressed result and stored
171 * in 'pages'
172 *
173 * out_pages is used to return the number of pages allocated. There
174 * may be pages allocated even if we return an error
175 *
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller than len if we had to exit early because we
178 * ran out of room in the pages array or because we crossed the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{
195 int ret;
196 struct workspace *workspace;
197 char *data_in;
198 char *cpage_out;
199 int nr_pages = 0;
200 struct page *in_page = NULL;
201 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left;
205
206 *out_pages = 0;
207 *total_out = 0;
208 *total_in = 0;
209
210 workspace = find_zlib_workspace();
211 if (IS_ERR(workspace))
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1;
217 goto out;
218 }
219
220 workspace->def_strm.total_in = 0;
221 workspace->def_strm.total_out = 0;
222
223 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
224 data_in = kmap(in_page);
225
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
227 cpage_out = kmap(out_page);
228 pages[0] = out_page;
229 nr_pages = 1;
230
231 workspace->def_strm.next_in = data_in;
232 workspace->def_strm.next_out = cpage_out;
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) {
242 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
243 ret);
244 zlib_deflateEnd(&workspace->def_strm);
245 ret = -1;
246 goto out;
247 }
248
249 /* we're making it bigger, give up */
250 if (workspace->def_strm.total_in > 8192 &&
251 workspace->def_strm.total_in <
252 workspace->def_strm.total_out) {
253 ret = -1;
254 goto out;
255 }
256 /* we need another page for writing out. Test this
257 * before the total_in so we will pull in a new page for
258 * the stream end if required
259 */
260 if (workspace->def_strm.avail_out == 0) {
261 kunmap(out_page);
262 if (nr_pages == nr_dest_pages) {
263 out_page = NULL;
264 ret = -1;
265 goto out;
266 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
268 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page;
270 nr_pages++;
271 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
272 workspace->def_strm.next_out = cpage_out;
273 }
274 /* we're all done */
275 if (workspace->def_strm.total_in >= len)
276 break;
277
278 /* we've read in a full page, get a new one */
279 if (workspace->def_strm.avail_in == 0) {
280 if (workspace->def_strm.total_out > max_out)
281 break;
282
283 bytes_left = len - workspace->def_strm.total_in;
284 kunmap(in_page);
285 page_cache_release(in_page);
286
287 start += PAGE_CACHE_SIZE;
288 in_page = find_get_page(mapping,
289 start >> PAGE_CACHE_SHIFT);
290 data_in = kmap(in_page);
291 workspace->def_strm.avail_in = min(bytes_left,
292 PAGE_CACHE_SIZE);
293 workspace->def_strm.next_in = data_in;
294 }
295 }
296 workspace->def_strm.avail_in = 0;
297 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
298 zlib_deflateEnd(&workspace->def_strm);
299
300 if (ret != Z_STREAM_END) {
301 ret = -1;
302 goto out;
303 }
304
305 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
306 ret = -1;
307 goto out;
308 }
309
310 ret = 0;
311 *total_out = workspace->def_strm.total_out;
312 *total_in = workspace->def_strm.total_in;
313out:
314 *out_pages = nr_pages;
315 if (out_page)
316 kunmap(out_page);
317
318 if (in_page) {
319 kunmap(in_page);
320 page_cache_release(in_page);
321 }
322 free_workspace(workspace);
323 return ret;
324}
325
326/*
327 * pages_in is an array of pages with compressed data.
328 *
329 * disk_start is the starting logical offset of this array in the file
330 *
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{
348 int ret = 0;
349 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in;
352 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE;
359 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (IS_ERR(workspace))
370 return -ENOMEM;
371
372 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in;
374 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
375 workspace->inf_strm.total_in = 0;
376
377 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0;
383
384 /* If it's deflate, and it's got no preset dictionary, then
385 we can tell zlib to skip the adler32 check. */
386 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
387 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
388 !(((data_in[0]<<8) + data_in[1]) % 31)) {
389
390 wbits = -((data_in[0] >> 4) + 8);
391 workspace->inf_strm.next_in += 2;
392 workspace->inf_strm.avail_in -= 2;
393 }
394
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1;
398 goto out;
399 }
400 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END)
403 break;
404 /*
405 * buf_start is the offset in the decompressed stream of the
406 * data currently held in our workspace buffer
407 */
408 buf_start = total_out;
409
410 /* total_out is the total number of decompressed bytes so far */
411 total_out = workspace->inf_strm.total_out;
412
413 working_bytes = total_out - buf_start;
414
415 /*
416 * start_byte is the first byte of the page we're currently
417 * copying into, relative to the start of the uncompressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break;
428 }
429
430 /* we haven't yet hit data corresponding to this page */
431 if (total_out <= start_byte)
432 goto next;
433
434 /*
435 * the start of the data we care about is offset into
436 * the middle of our working buffer
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 }
497next:
498 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500
501 if (workspace->inf_strm.avail_in == 0) {
502 unsigned long tmp;
503 kunmap(pages_in[page_in_index]);
504 page_in_index++;
505 if (page_in_index >= total_pages_in) {
506 data_in = NULL;
507 break;
508 }
509 data_in = kmap(pages_in[page_in_index]);
510 workspace->inf_strm.next_in = data_in;
511 tmp = srclen - workspace->inf_strm.total_in;
512 workspace->inf_strm.avail_in = min(tmp,
513 PAGE_CACHE_SIZE);
514 }
515 }
516 if (ret != Z_STREAM_END)
517 ret = -1;
518 else
519 ret = 0;
520done:
521 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in)
523 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret;
527}
528
529/*
530 * a less complex decompression routine. Our compressed data fits in a
531 * single page, and we want to read a single page out of it.
532 * start_byte tells us the offset into the compressed data we're interested in
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{
539 int ret = 0;
540 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0;
544 char *kaddr;
545
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (IS_ERR(workspace))
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0;
556
557 workspace->inf_strm.next_out = workspace->buf;
558 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
559 workspace->inf_strm.total_out = 0;
560 /* If it's deflate, and it's got no preset dictionary, then
561 we can tell zlib to skip the adler32 check. */
562 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
563 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
564 !(((data_in[0]<<8) + data_in[1]) % 31)) {
565
566 wbits = -((data_in[0] >> 4) + 8);
567 workspace->inf_strm.next_in += 2;
568 workspace->inf_strm.avail_in -= 2;
569 }
570
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1;
574 goto out;
575 }
576
577 while (bytes_left > 0) {
578 unsigned long buf_start;
579 unsigned long buf_offset;
580 unsigned long bytes;
581 unsigned long pg_offset = 0;
582
583 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
584 if (ret != Z_OK && ret != Z_STREAM_END)
585 break;
586
587 buf_start = total_out;
588 total_out = workspace->inf_strm.total_out;
589
590 if (total_out == buf_start) {
591 ret = -1;
592 break;
593 }
594
595 if (total_out <= start_byte)
596 goto next;
597
598 if (total_out > start_byte && buf_start < start_byte)
599 buf_offset = start_byte - buf_start;
600 else
601 buf_offset = 0;
602
603 bytes = min(PAGE_CACHE_SIZE - pg_offset,
604 PAGE_CACHE_SIZE - buf_offset);
605 bytes = min(bytes, bytes_left);
606
607 kaddr = kmap_atomic(dest_page, KM_USER0);
608 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
609 kunmap_atomic(kaddr, KM_USER0);
610
611 pg_offset += bytes;
612 bytes_left -= bytes;
613next:
614 workspace->inf_strm.next_out = workspace->buf;
615 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
616 }
617
618 if (ret != Z_STREAM_END && bytes_left != 0)
619 ret = -1;
620 else
621 ret = 0;
622
623 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret;
627}
628
629void btrfs_zlib_exit(void)
630{
631 free_workspaces();
632}
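
Both decompression paths above use the same probe: if the buffer starts with a valid zlib header (the CM field is Z_DEFLATED, the header checksum divides by 31) and carries no preset dictionary, the two header bytes are skipped and zlib is initialized with negative windowBits, which selects raw-deflate mode and skips the adler32 verification. A user-space sketch of that probe against the ordinary zlib API (assumes the RFC 1950 stream layout; illustrative, not the kernel helper):

#include <stddef.h>
#include <zlib.h>

/* Return the windowBits to pass to inflateInit2(): negative (raw
 * deflate, no adler32) when buf carries a zlib header without a
 * preset dictionary, MAX_WBITS otherwise; *consumed is the number of
 * header bytes to skip */
static int probe_wbits(const unsigned char *buf, size_t len, size_t *consumed)
{
	*consumed = 0;
	if (len > 2 &&
	    (buf[0] & 0x0f) == Z_DEFLATED &&	  /* CM == 8: deflate */
	    !(buf[1] & 0x20) &&			  /* FDICT: no preset dict */
	    ((buf[0] << 8) + buf[1]) % 31 == 0) { /* FCHECK passes */
		*consumed = 2;
		/* CINFO is log2(window size) - 8; negative selects raw mode */
		return -((buf[0] >> 4) + 8);
	}
	return MAX_WBITS;
}
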
diff --git a/fs/buffer.c b/fs/buffer.c
index 10179cfa1152..b6e8b8632e2f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page)
 	page_cache_release(page);
 }
 
+
+static int quiet_error(struct buffer_head *bh)
+{
+	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
+		return 0;
+	return 1;
+}
+
+
 static void buffer_io_error(struct buffer_head *bh)
 {
 	char b[BDEVNAME_SIZE];
-
 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 			bdevname(bh->b_bdev, b),
 			(unsigned long long)bh->b_blocknr);
@@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -195,10 +203,25 @@ int fsync_bdev(struct block_device *bdev)
  * happen on bdev until thaw_bdev() is called.
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process can unfreeze the frozen filesystem actually when multiple
+ * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
+ * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * actually.
  */
 struct super_block *freeze_bdev(struct block_device *bdev)
 {
 	struct super_block *sb;
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		bdev->bd_fsfreeze_count++;
+		sb = get_super(bdev);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return sb;
+	}
+	bdev->bd_fsfreeze_count++;
 
 	down(&bdev->bd_mount_sem);
 	sb = get_super(bdev);
@@ -213,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 
 		sync_blockdev(sb->s_bdev);
 
-		if (sb->s_op->write_super_lockfs)
-			sb->s_op->write_super_lockfs(sb);
+		if (sb->s_op->freeze_fs) {
+			error = sb->s_op->freeze_fs(sb);
+			if (error) {
+				printk(KERN_ERR
+					"VFS:Filesystem freeze failed\n");
+				sb->s_frozen = SB_UNFROZEN;
+				drop_super(sb);
+				up(&bdev->bd_mount_sem);
+				bdev->bd_fsfreeze_count--;
+				mutex_unlock(&bdev->bd_fsfreeze_mutex);
+				return ERR_PTR(error);
+			}
+		}
 	}
 
 	sync_blockdev(bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+
 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
 }
 EXPORT_SYMBOL(freeze_bdev);
@@ -229,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev);
  *
  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
  */
-void thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (!bdev->bd_fsfreeze_count) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return -EINVAL;
+	}
+
+	bdev->bd_fsfreeze_count--;
+	if (bdev->bd_fsfreeze_count > 0) {
+		if (sb)
+			drop_super(sb);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return 0;
+	}
+
 	if (sb) {
 		BUG_ON(sb->s_bdev != bdev);
-
-		if (sb->s_op->unlockfs)
-			sb->s_op->unlockfs(sb);
-		sb->s_frozen = SB_UNFROZEN;
-		smp_wmb();
-		wake_up(&sb->s_wait_unfrozen);
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (sb->s_op->unfreeze_fs) {
+				error = sb->s_op->unfreeze_fs(sb);
+				if (error) {
+					printk(KERN_ERR
+						"VFS:Filesystem thaw failed\n");
+					sb->s_frozen = SB_FREEZE_TRANS;
+					bdev->bd_fsfreeze_count++;
+					mutex_unlock(&bdev->bd_fsfreeze_mutex);
+					return error;
+				}
+			}
+			sb->s_frozen = SB_UNFROZEN;
+			smp_wmb();
+			wake_up(&sb->s_wait_unfrozen);
+		}
 		drop_super(sb);
 	}
 
 	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	return 0;
 }
 EXPORT_SYMBOL(thaw_bdev);
 
@@ -394,7 +458,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		set_buffer_uptodate(bh);
 	} else {
 		clear_buffer_uptodate(bh);
-		if (printk_ratelimit())
+		if (!quiet_error(bh))
 			buffer_io_error(bh);
 		SetPageError(page);
 	}
@@ -455,7 +519,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (printk_ratelimit()) {
+		if (!quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -1988,7 +2052,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 	page = *pagep;
 	if (page == NULL) {
 		ownpage = 1;
-		page = __grab_cache_page(mapping, index);
+		page = grab_cache_page_write_begin(mapping, index, flags);
 		if (!page) {
 			status = -ENOMEM;
 			goto out;
@@ -2014,7 +2078,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 			if (pos + len > inode->i_size)
 				vmtruncate(inode, inode->i_size);
 		}
-		goto out;
 	}
 
 out:
@@ -2494,7 +2557,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
@@ -2913,6 +2976,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 		set_bit(BH_Eopnotsupp, &bh->b_state);
 	}
 
+	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+		set_bit(BH_Quiet, &bh->b_state);
+
 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
 	bio_put(bio);
 }
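
The freeze_bdev()/thaw_bdev() rework above introduces a nesting counter, bd_fsfreeze_count: only the first freeze actually does the work and only the balancing last thaw unfreezes, with a mutex serializing the transitions. A minimal sketch of that counting discipline (user-space pthread version; do_freeze/do_thaw are placeholders for the real work):

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static int freeze_count;

static void do_freeze(void) { /* flush data and block writers */ }
static void do_thaw(void)   { /* let writers through again */ }

static void freeze(void)
{
	pthread_mutex_lock(&freeze_lock);
	if (freeze_count++ == 0)	/* only the first caller freezes */
		do_freeze();
	pthread_mutex_unlock(&freeze_lock);
}

static int thaw(void)
{
	int ret = 0;

	pthread_mutex_lock(&freeze_lock);
	if (freeze_count == 0)
		ret = -EINVAL;			/* unbalanced thaw */
	else if (--freeze_count == 0)		/* only the last caller thaws */
		do_thaw();
	pthread_mutex_unlock(&freeze_lock);
	return ret;
}
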
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a72618..38f71222a552 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 	cd->major = major;
 	cd->baseminor = baseminor;
 	cd->minorct = minorct;
-	strncpy(cd->name,name, 64);
+	strlcpy(cd->name, name, sizeof(cd->name));
 
 	i = major_to_index(major);
 
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb346fb..9948c0030e86 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o cifsacl.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0005a194a75c..13ea53251dcf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
 	.readdir = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 	.unlocked_ioctl  = cifs_ioctl,
 	.llseek = generic_file_llseek,
 };
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2ce04c73d74e..7ac481841f87 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
76extern const struct file_operations cifs_dir_ops; 76extern const struct file_operations cifs_dir_ops;
77extern int cifs_dir_open(struct inode *inode, struct file *file); 77extern int cifs_dir_open(struct inode *inode, struct file *file);
78extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 78extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
79extern int cifs_dir_notify(struct file *, unsigned long arg);
80 79
81/* Functions related to dir entries */ 80/* Functions related to dir entries */
82extern struct dentry_operations cifs_dentry_ops; 81extern struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581eb4b2..000000000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * fs/cifs/fcntl.c
3 *
4 * vfs operations that deal with the file control API
5 *
6 * Copyright (C) International Business Machines Corp., 2003,2004
7 * Author(s): Steve French (sfrench@us.ibm.com)
8 *
9 * This library is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as published
11 * by the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17 * the GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this library; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23#include <linux/fs.h>
24#include <linux/stat.h>
25#include <linux/fcntl.h>
26#include "cifsglob.h"
27#include "cifsproto.h"
28#include "cifs_unicode.h"
29#include "cifs_debug.h"
30#include "cifsfs.h"
31
32static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
33{
34 __u32 cifs_ntfy_flags = 0;
35
36 /* No way on Linux VFS to ask to monitor xattr
37 changes (and no stream support either) */
38 if (fcntl_notify_flags & DN_ACCESS)
39 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
40 if (fcntl_notify_flags & DN_MODIFY) {
41 /* What does this mean on directories? */
42 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
43 FILE_NOTIFY_CHANGE_SIZE;
44 }
45 if (fcntl_notify_flags & DN_CREATE) {
46 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
47 FILE_NOTIFY_CHANGE_LAST_WRITE;
48 }
49 if (fcntl_notify_flags & DN_DELETE)
50 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
51 if (fcntl_notify_flags & DN_RENAME) {
52 /* BB review this - checking various server behaviors */
53 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
54 FILE_NOTIFY_CHANGE_FILE_NAME;
55 }
56 if (fcntl_notify_flags & DN_ATTRIB) {
57 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
58 FILE_NOTIFY_CHANGE_ATTRIBUTES;
59 }
60/* if (fcntl_notify_flags & DN_MULTISHOT) {
61 cifs_ntfy_flags |= ;
62 } */ /* BB fixme - not sure how to handle this with CIFS yet */
63
64 return cifs_ntfy_flags;
65}
66
67int cifs_dir_notify(struct file *file, unsigned long arg)
68{
69 int xid;
70 int rc = -EINVAL;
71 int oplock = 0;
72 struct cifs_sb_info *cifs_sb;
73 struct cifsTconInfo *pTcon;
74 char *full_path = NULL;
75 __u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
76 __u16 netfid;
77
78 if (experimEnabled == 0)
79 return 0;
80
81 xid = GetXid();
82 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
83 pTcon = cifs_sb->tcon;
84
85 full_path = build_path_from_dentry(file->f_path.dentry);
86
87 if (full_path == NULL) {
88 rc = -ENOMEM;
89 } else {
90 cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
91 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
92 GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
93 &netfid, &oplock, NULL, cifs_sb->local_nls,
94 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
95 /* BB fixme - add this handle to a notify handle list */
96 if (rc) {
97 cFYI(1, ("Could not open directory for notify"));
98 } else {
99 filter = convert_to_cifs_notify_flags(arg);
100 if (filter != 0) {
101 rc = CIFSSMBNotify(xid, pTcon,
102 0 /* no subdirs */, netfid,
103 filter, file, arg & DN_MULTISHOT,
104 cifs_sb->local_nls);
105 } else {
106 rc = -EINVAL;
107 }
108 /* BB add code to close file eventually (at unmount
109 it would close automatically but may be a way
110 to do it easily when inode freed or when
111 notify info is cleared/changed) */
112 cFYI(1, ("notify rc %d", rc));
113 }
114 }
115
116 FreeXid(xid);
117 return rc;
118}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1e1fc6a6e6a..12bb656fbe75 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
2074 2074
2075 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); 2075 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
2076 2076
2077 page = __grab_cache_page(mapping, index); 2077 page = grab_cache_page_write_begin(mapping, index, flags);
2078 if (!page) { 2078 if (!page) {
2079 rc = -ENOMEM; 2079 rc = -ENOMEM;
2080 goto out; 2080 goto out;
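grab_cache_page_write_begin() supersedes __grab_cache_page() so that the AOP flags handed to ->write_begin (notably AOP_FLAG_NOFS) reach the page allocation, and reclaim cannot recurse back into the filesystem. A sketch of the 2.6.29-era calling pattern; foo_write_begin is illustrative, not a function from this patch:

static int foo_write_begin(struct file *file, struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned flags,
			   struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	/* flags is forwarded so AOP_FLAG_NOFS affects the allocation */
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	*pagep = page;
	return 0;
}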
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4edc..5ab9896fdcb2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1641,7 +1641,7 @@ do_expand:
1641 i_size_write(inode, offset); 1641 i_size_write(inode, offset);
1642 spin_unlock(&inode->i_lock); 1642 spin_unlock(&inode->i_lock);
1643out_truncate: 1643out_truncate:
1644 if (inode->i_op && inode->i_op->truncate) 1644 if (inode->i_op->truncate)
1645 inode->i_op->truncate(inode); 1645 inode->i_op->truncate(inode);
1646 return 0; 1646 return 0;
1647out_sig: 1647out_sig:
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df6..6a347fbc998a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
201int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 201int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
202{ 202{
203 struct file *host_file; 203 struct file *host_file;
204 struct dentry *host_dentry; 204 struct inode *coda_inode = coda_dentry->d_inode;
205 struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
206 struct coda_file_info *cfi; 205 struct coda_file_info *cfi;
207 int err = 0; 206 int err = 0;
208 207
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
214 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 213 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
215 host_file = cfi->cfi_container; 214 host_file = cfi->cfi_container;
216 215
217 if (host_file->f_op && host_file->f_op->fsync) { 216 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
218 host_dentry = host_file->f_path.dentry;
219 host_inode = host_dentry->d_inode;
220 mutex_lock(&host_inode->i_mutex);
221 err = host_file->f_op->fsync(host_file, host_dentry, datasync);
222 mutex_unlock(&host_inode->i_mutex);
223 }
224
225 if ( !err && !datasync ) { 217 if ( !err && !datasync ) {
226 lock_kernel(); 218 lock_kernel();
227 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 219 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
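vfs_fsync() folds in what coda_fsync() used to open-code: checking that the host file implements ->fsync, taking i_mutex around the call, and also writing back dirty pagecache first. A simplified sketch of roughly what the helper does with the 2.6.29-era three-argument prototype (the real implementation lives in fs/sync.c and does more):

int vfs_fsync_sketch(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err = -EINVAL;

	if (file->f_op && file->f_op->fsync) {
		/* the real helper also flushes dirty pages beforehand */
		mutex_lock(&inode->i_mutex);
		err = file->f_op->fsync(file, dentry, datasync);
		mutex_unlock(&inode->i_mutex);
	}
	return err;
}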
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
11 11
12#include "coda_int.h" 12#include "coda_int.h"
13 13
14#ifdef CONFIG_SYSCTL
14static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
15 17
16static ctl_table coda_table[] = { 18static ctl_table coda_table[] = {
17 { 19 {
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
41 {} 43 {}
42}; 44};
43 45
46#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 47static ctl_table fs_table[] = {
45 { 48 {
46 .ctl_name = CTL_UNNUMBERED, 49 .ctl_name = CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
50 }, 53 },
51 {} 54 {}
52}; 55};
53 56#endif
54 57
55void coda_sysctl_init(void) 58void coda_sysctl_init(void)
56{ 59{
diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b6411..30f2faa22f5c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
1188 1188
1189out: 1189out:
1190 if (ret > 0)
1191 add_rchar(current, ret);
1192 inc_syscr(current);
1190 fput(file); 1193 fput(file);
1191 return ret; 1194 return ret;
1192} 1195}
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1210 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1213 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
1211 1214
1212out: 1215out:
1216 if (ret > 0)
1217 add_wchar(current, ret);
1218 inc_syscw(current);
1213 fput(file); 1219 fput(file);
1214 return ret; 1220 return ret;
1215} 1221}
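These two hunks bring the compat readv/writev paths in line with the native ones, which already account per-task bytes and syscall counts; the numbers surface in /proc/<pid>/io. A quick userspace way to watch the counters (assumes the kernel was built with the task I/O accounting options):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/io", "r");

	if (!f) {
		perror("fopen /proc/self/io");
		return 1;
	}
	/* prints rchar/wchar/syscr/syscw among others */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}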
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc94480..5d349d38e056 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
117static inline void set_default_inode_attr(struct inode * inode, mode_t mode) 117static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
118{ 118{
119 inode->i_mode = mode; 119 inode->i_mode = mode;
120 inode->i_uid = 0;
121 inode->i_gid = 0;
122 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 120 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
123} 121}
124 122
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
136{ 134{
137 struct inode * inode = new_inode(configfs_sb); 135 struct inode * inode = new_inode(configfs_sb);
138 if (inode) { 136 if (inode) {
139 inode->i_blocks = 0;
140 inode->i_mapping->a_ops = &configfs_aops; 137 inode->i_mapping->a_ops = &configfs_aops;
141 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 138 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
142 inode->i_op = &configfs_inode_operations; 139 inode->i_op = &configfs_inode_operations;
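The deleted assignments here (and in the cramfs, dcache and debugfs hunks below) are redundant: new_inode() hands back a fully zeroed inode, so i_uid, i_gid, i_blocks and i_size start at 0 and only non-default fields need touching. A minimal sketch of the resulting idiom (foo_new_inode is illustrative):

static struct inode *foo_new_inode(struct super_block *sb, mode_t mode)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		/* set only what differs from the zeroed defaults */
		inode->i_mode = mode;
		inode->i_atime = inode->i_mtime = inode->i_ctime =
			CURRENT_TIME;
	}
	return inode;
}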
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a14..a07338d2d140 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
83 inode->i_op = &page_symlink_inode_operations; 83 inode->i_op = &page_symlink_inode_operations;
84 inode->i_data.a_ops = &cramfs_aops; 84 inode->i_data.a_ops = &cramfs_aops;
85 } else { 85 } else {
86 inode->i_size = 0;
87 inode->i_blocks = 0;
88 init_special_inode(inode, inode->i_mode, 86 init_special_inode(inode, inode->i_mode,
89 old_decode_dev(cramfs_inode->size)); 87 old_decode_dev(cramfs_inode->size));
90 } 88 }
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e66..4547f66884a0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include "internal.h" 35#include "internal.h"
36 36
37
38int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
39EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 38EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
40 39
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
948 dentry->d_op = NULL; 947 dentry->d_op = NULL;
949 dentry->d_fsdata = NULL; 948 dentry->d_fsdata = NULL;
950 dentry->d_mounted = 0; 949 dentry->d_mounted = 0;
951#ifdef CONFIG_PROFILING
952 dentry->d_cookie = NULL;
953#endif
954 INIT_HLIST_NODE(&dentry->d_hash); 950 INIT_HLIST_NODE(&dentry->d_hash);
955 INIT_LIST_HEAD(&dentry->d_lru); 951 INIT_LIST_HEAD(&dentry->d_lru);
956 INIT_LIST_HEAD(&dentry->d_subdirs); 952 INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -1336,7 +1332,7 @@ err_out:
1336 * 1332 *
1337 * Searches the children of the parent dentry for the name in question. If 1333 * Searches the children of the parent dentry for the name in question. If
1338 * the dentry is found its reference count is incremented and the dentry 1334 * the dentry is found its reference count is incremented and the dentry
1339 * is returned. The caller must use d_put to free the entry when it has 1335 * is returned. The caller must use dput to free the entry when it has
1340 * finished using it. %NULL is returned on failure. 1336 * finished using it. %NULL is returned on failure.
1341 * 1337 *
1342 * __d_lookup is dcache_lock free. The hash list is protected using RCU. 1338 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
@@ -1571,10 +1567,6 @@ void d_rehash(struct dentry * entry)
1571 spin_unlock(&dcache_lock); 1567 spin_unlock(&dcache_lock);
1572} 1568}
1573 1569
1574#define do_switch(x,y) do { \
1575 __typeof__ (x) __tmp = x; \
1576 x = y; y = __tmp; } while (0)
1577
1578/* 1570/*
1579 * When switching names, the actual string doesn't strictly have to 1571 * When switching names, the actual string doesn't strictly have to
1580 * be preserved in the target - because we're dropping the target 1572 * be preserved in the target - because we're dropping the target
@@ -1593,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1593 /* 1585 /*
1594 * Both external: swap the pointers 1586 * Both external: swap the pointers
1595 */ 1587 */
1596 do_switch(target->d_name.name, dentry->d_name.name); 1588 swap(target->d_name.name, dentry->d_name.name);
1597 } else { 1589 } else {
1598 /* 1590 /*
1599 * dentry:internal, target:external. Steal target's 1591 * dentry:internal, target:external. Steal target's
@@ -1620,8 +1612,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1620 */ 1612 */
1621 memcpy(dentry->d_iname, target->d_name.name, 1613 memcpy(dentry->d_iname, target->d_name.name,
1622 target->d_name.len + 1); 1614 target->d_name.len + 1);
1615 dentry->d_name.len = target->d_name.len;
1616 return;
1623 } 1617 }
1624 } 1618 }
1619 swap(dentry->d_name.len, target->d_name.len);
1625} 1620}
1626 1621
1627/* 1622/*
@@ -1681,8 +1676,7 @@ already_unhashed:
1681 1676
1682 /* Switch the names.. */ 1677 /* Switch the names.. */
1683 switch_names(dentry, target); 1678 switch_names(dentry, target);
1684 do_switch(dentry->d_name.len, target->d_name.len); 1679 swap(dentry->d_name.hash, target->d_name.hash);
1685 do_switch(dentry->d_name.hash, target->d_name.hash);
1686 1680
1687 /* ... and switch the parents */ 1681 /* ... and switch the parents */
1688 if (IS_ROOT(dentry)) { 1682 if (IS_ROOT(dentry)) {
@@ -1690,7 +1684,7 @@ already_unhashed:
1690 target->d_parent = target; 1684 target->d_parent = target;
1691 INIT_LIST_HEAD(&target->d_u.d_child); 1685 INIT_LIST_HEAD(&target->d_u.d_child);
1692 } else { 1686 } else {
1693 do_switch(dentry->d_parent, target->d_parent); 1687 swap(dentry->d_parent, target->d_parent);
1694 1688
1695 /* And add them back to the (new) parent lists */ 1689 /* And add them back to the (new) parent lists */
1696 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); 1690 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1791,8 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1791 struct dentry *dparent, *aparent; 1785 struct dentry *dparent, *aparent;
1792 1786
1793 switch_names(dentry, anon); 1787 switch_names(dentry, anon);
1794 do_switch(dentry->d_name.len, anon->d_name.len); 1788 swap(dentry->d_name.hash, anon->d_name.hash);
1795 do_switch(dentry->d_name.hash, anon->d_name.hash);
1796 1789
1797 dparent = dentry->d_parent; 1790 dparent = dentry->d_parent;
1798 aparent = anon->d_parent; 1791 aparent = anon->d_parent;
@@ -1911,7 +1904,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1911 * Convert a dentry into an ASCII path name. If the entry has been deleted 1904 * Convert a dentry into an ASCII path name. If the entry has been deleted
1912 * the string " (deleted)" is appended. Note that this is ambiguous. 1905 * the string " (deleted)" is appended. Note that this is ambiguous.
1913 * 1906 *
1914 * Returns the buffer or an error code if the path was too long. 1907 * Returns a pointer into the buffer or an error code if the
1908 * path was too long.
1915 * 1909 *
1916 * "buflen" should be positive. Caller holds the dcache_lock. 1910 * "buflen" should be positive. Caller holds the dcache_lock.
1917 * 1911 *
@@ -1987,7 +1981,10 @@ Elong:
1987 * Convert a dentry into an ASCII path name. If the entry has been deleted 1981 * Convert a dentry into an ASCII path name. If the entry has been deleted
1988 * the string " (deleted)" is appended. Note that this is ambiguous. 1982 * the string " (deleted)" is appended. Note that this is ambiguous.
1989 * 1983 *
1990 * Returns the buffer or an error code if the path was too long. 1984 * Returns a pointer into the buffer or an error code if the path was
1985 * too long. Note: Callers should use the returned pointer, not the passed
1986 * in buffer, to use the name! The implementation often starts at an offset
1987 * into the buffer, and may leave 0 bytes at the start.
1991 * 1988 *
1992 * "buflen" should be positive. 1989 * "buflen" should be positive.
1993 */ 1990 */
@@ -2313,9 +2310,6 @@ static void __init dcache_init(void)
2313/* SLAB cache for __getname() consumers */ 2310/* SLAB cache for __getname() consumers */
2314struct kmem_cache *names_cachep __read_mostly; 2311struct kmem_cache *names_cachep __read_mostly;
2315 2312
2316/* SLAB cache for file structures */
2317struct kmem_cache *filp_cachep __read_mostly;
2318
2319EXPORT_SYMBOL(d_genocide); 2313EXPORT_SYMBOL(d_genocide);
2320 2314
2321void __init vfs_caches_init_early(void) 2315void __init vfs_caches_init_early(void)
@@ -2337,9 +2331,6 @@ void __init vfs_caches_init(unsigned long mempages)
2337 names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, 2331 names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
2338 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 2332 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2339 2333
2340 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
2341 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2342
2343 dcache_init(); 2334 dcache_init();
2344 inode_init(); 2335 inode_init();
2345 files_init(mempages); 2336 files_init(mempages);
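The private do_switch() macro duplicated the generic swap() helper from <linux/kernel.h>, which works for pointers, lengths and hashes alike because typeof captures the operand type. A runnable userspace equivalent of the macro:

#include <stdio.h>

#define swap(a, b) \
	do { __typeof__(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	const char *x = "dentry", *y = "target";
	unsigned int hx = 0x1234, hy = 0xabcd;

	swap(x, y);	/* pointer swap, as in switch_names() */
	swap(hx, hy);	/* integer swap, as for d_name.hash */

	printf("%s %s %x %x\n", x, y, hx, hy);
	return 0;
}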
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1d619a..180e9fec4ad8 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
93{ 93{
94 struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, 94 struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
95 GFP_KERNEL); 95 GFP_KERNEL);
96 struct dentry *d;
96 if (!dcs) 97 if (!dcs)
97 return NULL; 98 return NULL;
98 99
99 path->dentry->d_cookie = dcs; 100 d = path->dentry;
101 spin_lock(&d->d_lock);
102 d->d_flags |= DCACHE_COOKIE;
103 spin_unlock(&d->d_lock);
104
100 dcs->path = *path; 105 dcs->path = *path;
101 path_get(path); 106 path_get(path);
102 hash_dcookie(dcs); 107 hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
119 goto out; 124 goto out;
120 } 125 }
121 126
122 dcs = path->dentry->d_cookie; 127 if (path->dentry->d_flags & DCACHE_COOKIE) {
123 128 dcs = find_dcookie((unsigned long)path->dentry);
124 if (!dcs) 129 } else {
125 dcs = alloc_dcookie(path); 130 dcs = alloc_dcookie(path);
126 131 if (!dcs) {
127 if (!dcs) { 132 err = -ENOMEM;
128 err = -ENOMEM; 133 goto out;
129 goto out; 134 }
130 } 135 }
131 136
132 *cookie = dcookie_value(dcs); 137 *cookie = dcookie_value(dcs);
@@ -251,7 +256,12 @@ out_kmem:
251 256
252static void free_dcookie(struct dcookie_struct * dcs) 257static void free_dcookie(struct dcookie_struct * dcs)
253{ 258{
254 dcs->path.dentry->d_cookie = NULL; 259 struct dentry *d = dcs->path.dentry;
260
261 spin_lock(&d->d_lock);
262 d->d_flags &= ~DCACHE_COOKIE;
263 spin_unlock(&d->d_lock);
264
255 path_put(&dcs->path); 265 path_put(&dcs->path);
256 kmem_cache_free(dcookie_cache, dcs); 266 kmem_cache_free(dcookie_cache, dcs);
257} 267}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8a..33a90120f6ad 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
294} 294}
295EXPORT_SYMBOL_GPL(debugfs_create_x32); 295EXPORT_SYMBOL_GPL(debugfs_create_x32);
296 296
297
298static int debugfs_size_t_set(void *data, u64 val)
299{
300 *(size_t *)data = val;
301 return 0;
302}
303static int debugfs_size_t_get(void *data, u64 *val)
304{
305 *val = *(size_t *)data;
306 return 0;
307}
308DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
309 "%llu\n"); /* %llu and %zu are more or less the same */
310
311/**
312 * debugfs_create_size_t - create a debugfs file that is used to read and write a size_t value
313 * @name: a pointer to a string containing the name of the file to create.
314 * @mode: the permission that the file should have
315 * @parent: a pointer to the parent dentry for this file. This should be a
316 * directory dentry if set. If this parameter is %NULL, then the
317 * file will be created in the root of the debugfs filesystem.
318 * @value: a pointer to the variable that the file should read from and
319 * write to.
320 */
321struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
322 struct dentry *parent, size_t *value)
323{
324 return debugfs_create_file(name, mode, parent, value, &fops_size_t);
325}
326EXPORT_SYMBOL_GPL(debugfs_create_size_t);
327
328
297static ssize_t read_file_bool(struct file *file, char __user *user_buf, 329static ssize_t read_file_bool(struct file *file, char __user *user_buf,
298 size_t count, loff_t *ppos) 330 size_t count, loff_t *ppos)
299{ 331{
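debugfs_create_size_t() lets a driver expose a size_t variable directly instead of casting it through a u32 or u64 attribute. A hypothetical caller (the names are invented; in this era the helper returns NULL on failure, or an ERR_PTR when debugfs is not compiled in):

static size_t foo_cache_bytes;
static struct dentry *foo_dbg_file;

static int __init foo_debugfs_init(void)
{
	/* a NULL parent places the file in the debugfs root */
	foo_dbg_file = debugfs_create_size_t("cache_bytes", 0644, NULL,
					     &foo_cache_bytes);
	if (!foo_dbg_file)
		return -ENOMEM;
	return 0;
}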
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf36..81ae9ea3c6e1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
37 37
38 if (inode) { 38 if (inode) {
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = 0;
41 inode->i_gid = 0;
42 inode->i_blocks = 0;
43 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 40 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
44 switch (mode & S_IFMT) { 41 switch (mode & S_IFMT) {
45 default: 42 default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5d61b7c06e13..5f3231b9633f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,25 +27,32 @@
27#define DEVPTS_SUPER_MAGIC 0x1cd1 27#define DEVPTS_SUPER_MAGIC 0x1cd1
28 28
29#define DEVPTS_DEFAULT_MODE 0600 29#define DEVPTS_DEFAULT_MODE 0600
30/*
31 * ptmx is a new node in /dev/pts and will be unused in legacy (single-
32 * instance) mode. To prevent surprises in user space, set permissions of
33 * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
34 * permissions.
35 */
36#define DEVPTS_DEFAULT_PTMX_MODE 0000
30#define PTMX_MINOR 2 37#define PTMX_MINOR 2
31 38
32extern int pty_limit; /* Config limit on Unix98 ptys */ 39extern int pty_limit; /* Config limit on Unix98 ptys */
33static DEFINE_IDA(allocated_ptys);
34static DEFINE_MUTEX(allocated_ptys_lock); 40static DEFINE_MUTEX(allocated_ptys_lock);
35 41
36static struct vfsmount *devpts_mnt; 42static struct vfsmount *devpts_mnt;
37static struct dentry *devpts_root;
38 43
39static struct { 44struct pts_mount_opts {
40 int setuid; 45 int setuid;
41 int setgid; 46 int setgid;
42 uid_t uid; 47 uid_t uid;
43 gid_t gid; 48 gid_t gid;
44 umode_t mode; 49 umode_t mode;
45} config = {.mode = DEVPTS_DEFAULT_MODE}; 50 umode_t ptmxmode;
51 int newinstance;
52};
46 53
47enum { 54enum {
48 Opt_uid, Opt_gid, Opt_mode, 55 Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,
49 Opt_err 56 Opt_err
50}; 57};
51 58
@@ -53,18 +60,50 @@ static const match_table_t tokens = {
53 {Opt_uid, "uid=%u"}, 60 {Opt_uid, "uid=%u"},
54 {Opt_gid, "gid=%u"}, 61 {Opt_gid, "gid=%u"},
55 {Opt_mode, "mode=%o"}, 62 {Opt_mode, "mode=%o"},
63#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
64 {Opt_ptmxmode, "ptmxmode=%o"},
65 {Opt_newinstance, "newinstance"},
66#endif
56 {Opt_err, NULL} 67 {Opt_err, NULL}
57}; 68};
58 69
59static int devpts_remount(struct super_block *sb, int *flags, char *data) 70struct pts_fs_info {
71 struct ida allocated_ptys;
72 struct pts_mount_opts mount_opts;
73 struct dentry *ptmx_dentry;
74};
75
76static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
77{
78 return sb->s_fs_info;
79}
80
81static inline struct super_block *pts_sb_from_inode(struct inode *inode)
82{
83#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
84 if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
85 return inode->i_sb;
86#endif
87 return devpts_mnt->mnt_sb;
88}
89
90#define PARSE_MOUNT 0
91#define PARSE_REMOUNT 1
92
93static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
60{ 94{
61 char *p; 95 char *p;
62 96
63 config.setuid = 0; 97 opts->setuid = 0;
64 config.setgid = 0; 98 opts->setgid = 0;
65 config.uid = 0; 99 opts->uid = 0;
66 config.gid = 0; 100 opts->gid = 0;
67 config.mode = DEVPTS_DEFAULT_MODE; 101 opts->mode = DEVPTS_DEFAULT_MODE;
102 opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
103
104 /* newinstance makes sense only on initial mount */
105 if (op == PARSE_MOUNT)
106 opts->newinstance = 0;
68 107
69 while ((p = strsep(&data, ",")) != NULL) { 108 while ((p = strsep(&data, ",")) != NULL) {
70 substring_t args[MAX_OPT_ARGS]; 109 substring_t args[MAX_OPT_ARGS];
@@ -79,20 +118,32 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
79 case Opt_uid: 118 case Opt_uid:
80 if (match_int(&args[0], &option)) 119 if (match_int(&args[0], &option))
81 return -EINVAL; 120 return -EINVAL;
82 config.uid = option; 121 opts->uid = option;
83 config.setuid = 1; 122 opts->setuid = 1;
84 break; 123 break;
85 case Opt_gid: 124 case Opt_gid:
86 if (match_int(&args[0], &option)) 125 if (match_int(&args[0], &option))
87 return -EINVAL; 126 return -EINVAL;
88 config.gid = option; 127 opts->gid = option;
89 config.setgid = 1; 128 opts->setgid = 1;
90 break; 129 break;
91 case Opt_mode: 130 case Opt_mode:
92 if (match_octal(&args[0], &option)) 131 if (match_octal(&args[0], &option))
93 return -EINVAL; 132 return -EINVAL;
94 config.mode = option & S_IALLUGO; 133 opts->mode = option & S_IALLUGO;
134 break;
135#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
136 case Opt_ptmxmode:
137 if (match_octal(&args[0], &option))
138 return -EINVAL;
139 opts->ptmxmode = option & S_IALLUGO;
140 break;
141 case Opt_newinstance:
142 /* newinstance makes sense only on initial mount */
143 if (op == PARSE_MOUNT)
144 opts->newinstance = 1;
95 break; 145 break;
146#endif
96 default: 147 default:
97 printk(KERN_ERR "devpts: called with bogus options\n"); 148 printk(KERN_ERR "devpts: called with bogus options\n");
98 return -EINVAL; 149 return -EINVAL;
@@ -102,13 +153,106 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
102 return 0; 153 return 0;
103} 154}
104 155
156#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
157static int mknod_ptmx(struct super_block *sb)
158{
159 int mode;
160 int rc = -ENOMEM;
161 struct dentry *dentry;
162 struct inode *inode;
163 struct dentry *root = sb->s_root;
164 struct pts_fs_info *fsi = DEVPTS_SB(sb);
165 struct pts_mount_opts *opts = &fsi->mount_opts;
166
167 mutex_lock(&root->d_inode->i_mutex);
168
169 /* If we have already created ptmx node, return */
170 if (fsi->ptmx_dentry) {
171 rc = 0;
172 goto out;
173 }
174
175 dentry = d_alloc_name(root, "ptmx");
176 if (!dentry) {
177 printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n");
178 goto out;
179 }
180
181 /*
182 * Create a new 'ptmx' node in this mount of devpts.
183 */
184 inode = new_inode(sb);
185 if (!inode) {
186 printk(KERN_ERR "Unable to alloc inode for ptmx node\n");
187 dput(dentry);
188 goto out;
189 }
190
191 inode->i_ino = 2;
192 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
193
194 mode = S_IFCHR|opts->ptmxmode;
195 init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
196
197 d_add(dentry, inode);
198
199 fsi->ptmx_dentry = dentry;
200 rc = 0;
201
202 printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
203 inode->i_ino);
204out:
205 mutex_unlock(&root->d_inode->i_mutex);
206 return rc;
207}
208
209static void update_ptmx_mode(struct pts_fs_info *fsi)
210{
211 struct inode *inode;
212 if (fsi->ptmx_dentry) {
213 inode = fsi->ptmx_dentry->d_inode;
214 inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
215 }
216}
217#else
218static inline void update_ptmx_mode(struct pts_fs_info *fsi)
219{
220 return;
221}
222#endif
223
224static int devpts_remount(struct super_block *sb, int *flags, char *data)
225{
226 int err;
227 struct pts_fs_info *fsi = DEVPTS_SB(sb);
228 struct pts_mount_opts *opts = &fsi->mount_opts;
229
230 err = parse_mount_options(data, PARSE_REMOUNT, opts);
231
232 /*
233 * parse_mount_options() restores options to default values
234 * before parsing and may have changed ptmxmode. So, update the
235 * mode in the inode too. Bogus options don't fail the remount,
236 * so do this even on error return.
237 */
238 update_ptmx_mode(fsi);
239
240 return err;
241}
242
105static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) 243static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
106{ 244{
107 if (config.setuid) 245 struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
108 seq_printf(seq, ",uid=%u", config.uid); 246 struct pts_mount_opts *opts = &fsi->mount_opts;
109 if (config.setgid) 247
110 seq_printf(seq, ",gid=%u", config.gid); 248 if (opts->setuid)
111 seq_printf(seq, ",mode=%03o", config.mode); 249 seq_printf(seq, ",uid=%u", opts->uid);
250 if (opts->setgid)
251 seq_printf(seq, ",gid=%u", opts->gid);
252 seq_printf(seq, ",mode=%03o", opts->mode);
253#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
254 seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
255#endif
112 256
113 return 0; 257 return 0;
114} 258}
@@ -119,10 +263,25 @@ static const struct super_operations devpts_sops = {
119 .show_options = devpts_show_options, 263 .show_options = devpts_show_options,
120}; 264};
121 265
266static void *new_pts_fs_info(void)
267{
268 struct pts_fs_info *fsi;
269
270 fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
271 if (!fsi)
272 return NULL;
273
274 ida_init(&fsi->allocated_ptys);
275 fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
276 fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
277
278 return fsi;
279}
280
122static int 281static int
123devpts_fill_super(struct super_block *s, void *data, int silent) 282devpts_fill_super(struct super_block *s, void *data, int silent)
124{ 283{
125 struct inode * inode; 284 struct inode *inode;
126 285
127 s->s_blocksize = 1024; 286 s->s_blocksize = 1024;
128 s->s_blocksize_bits = 10; 287 s->s_blocksize_bits = 10;
@@ -130,39 +289,240 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
130 s->s_op = &devpts_sops; 289 s->s_op = &devpts_sops;
131 s->s_time_gran = 1; 290 s->s_time_gran = 1;
132 291
292 s->s_fs_info = new_pts_fs_info();
293 if (!s->s_fs_info)
294 goto fail;
295
133 inode = new_inode(s); 296 inode = new_inode(s);
134 if (!inode) 297 if (!inode)
135 goto fail; 298 goto free_fsi;
136 inode->i_ino = 1; 299 inode->i_ino = 1;
137 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 300 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
138 inode->i_blocks = 0;
139 inode->i_uid = inode->i_gid = 0;
140 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 301 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
141 inode->i_op = &simple_dir_inode_operations; 302 inode->i_op = &simple_dir_inode_operations;
142 inode->i_fop = &simple_dir_operations; 303 inode->i_fop = &simple_dir_operations;
143 inode->i_nlink = 2; 304 inode->i_nlink = 2;
144 305
145 devpts_root = s->s_root = d_alloc_root(inode); 306 s->s_root = d_alloc_root(inode);
146 if (s->s_root) 307 if (s->s_root)
147 return 0; 308 return 0;
148 309
149 printk("devpts: get root dentry failed\n"); 310 printk(KERN_ERR "devpts: get root dentry failed\n");
150 iput(inode); 311 iput(inode);
312
313free_fsi:
314 kfree(s->s_fs_info);
151fail: 315fail:
152 return -ENOMEM; 316 return -ENOMEM;
153} 317}
154 318
319#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
320static int compare_init_pts_sb(struct super_block *s, void *p)
321{
322 if (devpts_mnt)
323 return devpts_mnt->mnt_sb == s;
324 return 0;
325}
326
327/*
328 * Safely parse the mount options in @data and update @opts.
329 *
330 * devpts ends up parsing options two times during mount, due to the
331 * two modes of operation it supports. The first parse occurs in
332 * devpts_get_sb() when determining the mode (single-instance or
333 * multi-instance mode). The second parse happens in devpts_remount()
334 * or new_pts_mount() depending on the mode.
335 *
336 * Parsing of options modifies the @data making subsequent parsing
337 * incorrect. So make a local copy of @data and parse it.
338 *
339 * Return: 0 On success, -errno on error
340 */
341static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
342{
343 int rc;
344 void *datacp;
345
346 if (!data)
347 return 0;
348
349 /* Use kstrdup() ? */
350 datacp = kmalloc(PAGE_SIZE, GFP_KERNEL);
351 if (!datacp)
352 return -ENOMEM;
353
354 memcpy(datacp, data, PAGE_SIZE);
355 rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts);
356 kfree(datacp);
357
358 return rc;
359}
360
361/*
362 * Mount a new (private) instance of devpts. PTYs created in this
363 * instance are independent of the PTYs in other devpts instances.
364 */
365static int new_pts_mount(struct file_system_type *fs_type, int flags,
366 void *data, struct vfsmount *mnt)
367{
368 int err;
369 struct pts_fs_info *fsi;
370 struct pts_mount_opts *opts;
371
372 printk(KERN_NOTICE "devpts: newinstance mount\n");
373
374 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
375 if (err)
376 return err;
377
378 fsi = DEVPTS_SB(mnt->mnt_sb);
379 opts = &fsi->mount_opts;
380
381 err = parse_mount_options(data, PARSE_MOUNT, opts);
382 if (err)
383 goto fail;
384
385 err = mknod_ptmx(mnt->mnt_sb);
386 if (err)
387 goto fail;
388
389 return 0;
390
391fail:
392 dput(mnt->mnt_sb->s_root);
393 deactivate_super(mnt->mnt_sb);
394 return err;
395}
396
397/*
398 * Check if 'newinstance' mount option was specified in @data.
399 *
400 * Return: -errno on error (eg: invalid mount options specified)
401 * : 1 if 'newinstance' mount option was specified
402 * : 0 if 'newinstance' mount option was NOT specified
403 */
404static int is_new_instance_mount(void *data)
405{
406 int rc;
407 struct pts_mount_opts opts;
408
409 if (!data)
410 return 0;
411
412 rc = safe_parse_mount_options(data, &opts);
413 if (!rc)
414 rc = opts.newinstance;
415
416 return rc;
417}
418
419/*
420 * get_init_pts_sb()
421 *
422 * This interface is needed to support multiple namespace semantics in
423 * devpts while preserving backward compatibility of the current 'single-
424 * namespace' semantics. i.e all mounts of devpts without the 'newinstance'
425 * mount option should bind to the initial kernel mount, like
426 * get_sb_single().
427 *
428 * Mounts with 'newinstance' option create a new private namespace.
429 *
430 * But for single-mount semantics, devpts cannot use get_sb_single(),
431 * because get_sb_single()/sget() find and use the super-block from
432 * the most recent mount of devpts. But that recent mount may be a
433 * 'newinstance' mount and get_sb_single() would pick the newinstance
434 * super-block instead of the initial super-block.
435 *
436 * This interface is identical to get_sb_single() except that it
437 * consistently selects the 'single-namespace' superblock even in the
438 * presence of the private namespace (i.e 'newinstance') super-blocks.
439 */
440static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
441 void *data, struct vfsmount *mnt)
442{
443 struct super_block *s;
444 int error;
445
446 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
447 if (IS_ERR(s))
448 return PTR_ERR(s);
449
450 if (!s->s_root) {
451 s->s_flags = flags;
452 error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
453 if (error) {
454 up_write(&s->s_umount);
455 deactivate_super(s);
456 return error;
457 }
458 s->s_flags |= MS_ACTIVE;
459 }
460 do_remount_sb(s, flags, data, 0);
461 return simple_set_mnt(mnt, s);
462}
463
464/*
465 * Mount or remount the initial kernel mount of devpts. This type of
466 * mount maintains the legacy, single-instance semantics, while the
467 * kernel still allows multiple-instances.
468 */
469static int init_pts_mount(struct file_system_type *fs_type, int flags,
470 void *data, struct vfsmount *mnt)
471{
472 int err;
473
474 err = get_init_pts_sb(fs_type, flags, data, mnt);
475 if (err)
476 return err;
477
478 err = mknod_ptmx(mnt->mnt_sb);
479 if (err) {
480 dput(mnt->mnt_sb->s_root);
481 deactivate_super(mnt->mnt_sb);
482 }
483
484 return err;
485}
486
155static int devpts_get_sb(struct file_system_type *fs_type, 487static int devpts_get_sb(struct file_system_type *fs_type,
156 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 488 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
157{ 489{
490 int new;
491
492 new = is_new_instance_mount(data);
493 if (new < 0)
494 return new;
495
496 if (new)
497 return new_pts_mount(fs_type, flags, data, mnt);
498
499 return init_pts_mount(fs_type, flags, data, mnt);
500}
501#else
502/*
503 * This supports only the legacy single-instance semantics (no
504 * multiple-instance semantics)
505 */
506static int devpts_get_sb(struct file_system_type *fs_type, int flags,
507 const char *dev_name, void *data, struct vfsmount *mnt)
508{
158 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); 509 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
159} 510}
511#endif
512
513static void devpts_kill_sb(struct super_block *sb)
514{
515 struct pts_fs_info *fsi = DEVPTS_SB(sb);
516
517 kfree(fsi);
518 kill_litter_super(sb);
519}
160 520
161static struct file_system_type devpts_fs_type = { 521static struct file_system_type devpts_fs_type = {
162 .owner = THIS_MODULE, 522 .owner = THIS_MODULE,
163 .name = "devpts", 523 .name = "devpts",
164 .get_sb = devpts_get_sb, 524 .get_sb = devpts_get_sb,
165 .kill_sb = kill_anon_super, 525 .kill_sb = devpts_kill_sb,
166}; 526};
167 527
168/* 528/*
@@ -172,16 +532,17 @@ static struct file_system_type devpts_fs_type = {
172 532
173int devpts_new_index(struct inode *ptmx_inode) 533int devpts_new_index(struct inode *ptmx_inode)
174{ 534{
535 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
536 struct pts_fs_info *fsi = DEVPTS_SB(sb);
175 int index; 537 int index;
176 int ida_ret; 538 int ida_ret;
177 539
178retry: 540retry:
179 if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) { 541 if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
180 return -ENOMEM; 542 return -ENOMEM;
181 }
182 543
183 mutex_lock(&allocated_ptys_lock); 544 mutex_lock(&allocated_ptys_lock);
184 ida_ret = ida_get_new(&allocated_ptys, &index); 545 ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
185 if (ida_ret < 0) { 546 if (ida_ret < 0) {
186 mutex_unlock(&allocated_ptys_lock); 547 mutex_unlock(&allocated_ptys_lock);
187 if (ida_ret == -EAGAIN) 548 if (ida_ret == -EAGAIN)
@@ -190,7 +551,7 @@ retry:
190 } 551 }
191 552
192 if (index >= pty_limit) { 553 if (index >= pty_limit) {
193 ida_remove(&allocated_ptys, index); 554 ida_remove(&fsi->allocated_ptys, index);
194 mutex_unlock(&allocated_ptys_lock); 555 mutex_unlock(&allocated_ptys_lock);
195 return -EIO; 556 return -EIO;
196 } 557 }
@@ -200,18 +561,26 @@ retry:
200 561
201void devpts_kill_index(struct inode *ptmx_inode, int idx) 562void devpts_kill_index(struct inode *ptmx_inode, int idx)
202{ 563{
564 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
565 struct pts_fs_info *fsi = DEVPTS_SB(sb);
566
203 mutex_lock(&allocated_ptys_lock); 567 mutex_lock(&allocated_ptys_lock);
204 ida_remove(&allocated_ptys, idx); 568 ida_remove(&fsi->allocated_ptys, idx);
205 mutex_unlock(&allocated_ptys_lock); 569 mutex_unlock(&allocated_ptys_lock);
206} 570}
207 571
208int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) 572int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
209{ 573{
210 int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ 574 /* tty layer puts index from devpts_new_index() in here */
575 int number = tty->index;
211 struct tty_driver *driver = tty->driver; 576 struct tty_driver *driver = tty->driver;
212 dev_t device = MKDEV(driver->major, driver->minor_start+number); 577 dev_t device = MKDEV(driver->major, driver->minor_start+number);
213 struct dentry *dentry; 578 struct dentry *dentry;
214 struct inode *inode = new_inode(devpts_mnt->mnt_sb); 579 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
580 struct inode *inode = new_inode(sb);
581 struct dentry *root = sb->s_root;
582 struct pts_fs_info *fsi = DEVPTS_SB(sb);
583 struct pts_mount_opts *opts = &fsi->mount_opts;
215 char s[12]; 584 char s[12];
216 585
217 /* We're supposed to be given the slave end of a pty */ 586 /* We're supposed to be given the slave end of a pty */
@@ -221,25 +590,25 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
221 if (!inode) 590 if (!inode)
222 return -ENOMEM; 591 return -ENOMEM;
223 592
224 inode->i_ino = number+2; 593 inode->i_ino = number + 3;
225 inode->i_uid = config.setuid ? config.uid : current_fsuid(); 594 inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
226 inode->i_gid = config.setgid ? config.gid : current_fsgid(); 595 inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
227 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 596 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
228 init_special_inode(inode, S_IFCHR|config.mode, device); 597 init_special_inode(inode, S_IFCHR|opts->mode, device);
229 inode->i_private = tty; 598 inode->i_private = tty;
230 tty->driver_data = inode; 599 tty->driver_data = inode;
231 600
232 sprintf(s, "%d", number); 601 sprintf(s, "%d", number);
233 602
234 mutex_lock(&devpts_root->d_inode->i_mutex); 603 mutex_lock(&root->d_inode->i_mutex);
235 604
236 dentry = d_alloc_name(devpts_root, s); 605 dentry = d_alloc_name(root, s);
237 if (!IS_ERR(dentry)) { 606 if (!IS_ERR(dentry)) {
238 d_add(dentry, inode); 607 d_add(dentry, inode);
239 fsnotify_create(devpts_root->d_inode, dentry); 608 fsnotify_create(root->d_inode, dentry);
240 } 609 }
241 610
242 mutex_unlock(&devpts_root->d_inode->i_mutex); 611 mutex_unlock(&root->d_inode->i_mutex);
243 612
244 return 0; 613 return 0;
245} 614}
@@ -256,20 +625,27 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
256void devpts_pty_kill(struct tty_struct *tty) 625void devpts_pty_kill(struct tty_struct *tty)
257{ 626{
258 struct inode *inode = tty->driver_data; 627 struct inode *inode = tty->driver_data;
628 struct super_block *sb = pts_sb_from_inode(inode);
629 struct dentry *root = sb->s_root;
259 struct dentry *dentry; 630 struct dentry *dentry;
260 631
261 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 632 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
262 633
263 mutex_lock(&devpts_root->d_inode->i_mutex); 634 mutex_lock(&root->d_inode->i_mutex);
264 635
265 dentry = d_find_alias(inode); 636 dentry = d_find_alias(inode);
266 if (dentry && !IS_ERR(dentry)) { 637 if (IS_ERR(dentry))
638 goto out;
639
640 if (dentry) {
267 inode->i_nlink--; 641 inode->i_nlink--;
268 d_delete(dentry); 642 d_delete(dentry);
269 dput(dentry); 643 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
270 } 644 }
271 645
272 mutex_unlock(&devpts_root->d_inode->i_mutex); 646 dput(dentry); /* d_find_alias above */
647out:
648 mutex_unlock(&root->d_inode->i_mutex);
273} 649}
274 650
275static int __init init_devpts_fs(void) 651static int __init init_devpts_fs(void)
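With CONFIG_DEVPTS_MULTIPLE_INSTANCES, a mount passing 'newinstance' gets a private superblock, its own IDA of pty indices and its own ptmx node (mode 0000 by default, overridable via ptmxmode), while mounts without the option keep binding to the initial kernel mount exactly as before. A userspace sketch of creating a private instance (the path is illustrative and the caller needs CAP_SYS_ADMIN):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* /mnt/newpts must already exist */
	if (mount("devpts", "/mnt/newpts", "devpts", 0,
		  "newinstance,ptmxmode=0666") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}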
diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b7..b6d43908ff7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1209 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1210 nr_segs, blkbits, get_block, end_io, dio); 1210 nr_segs, blkbits, get_block, end_io, dio);
1211 1211
1212 /*
1213 * In case of error, an extending write may have instantiated a few
1214 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1215 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this on
1216 * their own.
1217 */
1218 if (unlikely(retval < 0 && (rw & WRITE))) {
1219 loff_t isize = i_size_read(inode);
1220
1221 if (end > isize && dio_lock_type == DIO_LOCKING)
1222 vmtruncate(inode, isize);
1223 }
1224
1212 if (rw == READ && dio_lock_type == DIO_LOCKING) 1225 if (rw == READ && dio_lock_type == DIO_LOCKING)
1213 release_i_mutex = 0; 1226 release_i_mutex = 0;
1214 1227
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf01..dc2ad6008b2d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 spin_unlock(&ast_queue_lock); 33 spin_unlock(&ast_queue_lock);
34} 34}
35 35
36void dlm_add_ast(struct dlm_lkb *lkb, int type) 36void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
37{ 37{
38 if (lkb->lkb_flags & DLM_IFL_USER) { 38 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type); 39 dlm_user_add_ast(lkb, type, bastmode);
40 return; 40 return;
41 } 41 }
42 42
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 } 47 }
48 lkb->lkb_ast_type |= type; 48 lkb->lkb_ast_type |= type;
49 if (bastmode)
50 lkb->lkb_bastmode = bastmode;
49 spin_unlock(&ast_queue_lock); 51 spin_unlock(&ast_queue_lock);
50 52
51 set_bit(WAKE_ASTS, &astd_wakeflags); 53 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,50 +61,40 @@ static void process_asts(void)
59 struct dlm_lkb *lkb; 61 struct dlm_lkb *lkb;
60 void (*cast) (void *astparam); 62 void (*cast) (void *astparam);
61 void (*bast) (void *astparam, int mode); 63 void (*bast) (void *astparam, int mode);
62 int type = 0, found, bmode; 64 int type = 0, bastmode;
63 65
64 for (;;) { 66repeat:
65 found = 0; 67 spin_lock(&ast_queue_lock);
66 spin_lock(&ast_queue_lock); 68 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
67 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { 69 r = lkb->lkb_resource;
68 r = lkb->lkb_resource; 70 ls = r->res_ls;
69 ls = r->res_ls; 71
70 72 if (dlm_locking_stopped(ls))
71 if (dlm_locking_stopped(ls)) 73 continue;
72 continue;
73
74 list_del(&lkb->lkb_astqueue);
75 type = lkb->lkb_ast_type;
76 lkb->lkb_ast_type = 0;
77 found = 1;
78 break;
79 }
80 spin_unlock(&ast_queue_lock);
81 74
82 if (!found) 75 list_del(&lkb->lkb_astqueue);
83 break; 76 type = lkb->lkb_ast_type;
77 lkb->lkb_ast_type = 0;
78 bastmode = lkb->lkb_bastmode;
84 79
80 spin_unlock(&ast_queue_lock);
85 cast = lkb->lkb_astfn; 81 cast = lkb->lkb_astfn;
86 bast = lkb->lkb_bastfn; 82 bast = lkb->lkb_bastfn;
87 bmode = lkb->lkb_bastmode;
88 83
89 if ((type & AST_COMP) && cast) 84 if ((type & AST_COMP) && cast)
90 cast(lkb->lkb_astparam); 85 cast(lkb->lkb_astparam);
91 86
92 /* FIXME: Is it safe to look at lkb_grmode here
93 without doing a lock_rsb() ?
94 Look at other checks in v1 to avoid basts. */
95
96 if ((type & AST_BAST) && bast) 87 if ((type & AST_BAST) && bast)
97 if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) 88 bast(lkb->lkb_astparam, bastmode);
98 bast(lkb->lkb_astparam, bmode);
99 89
100 /* this removes the reference added by dlm_add_ast 90 /* this removes the reference added by dlm_add_ast
101 and may result in the lkb being freed */ 91 and may result in the lkb being freed */
102 dlm_put_lkb(lkb); 92 dlm_put_lkb(lkb);
103 93
104 schedule(); 94 cond_resched();
95 goto repeat;
105 } 96 }
97 spin_unlock(&ast_queue_lock);
106} 98}
107 99
108static inline int no_asts(void) 100static inline int no_asts(void)
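Latching lkb_bastmode under ast_queue_lock at queue time means the delivery thread no longer reads a mode that may have changed since the bast was queued; callers now hand the mode to dlm_add_ast() with the request. A sketch of the updated call sites, inferred from the new prototype (pass 0 for completion ASTs, where no blocking mode applies):

	/* completion AST: no blocking mode involved */
	dlm_add_ast(lkb, AST_COMP, 0);

	/* blocking AST: record the contending request's mode */
	dlm_add_ast(lkb, AST_BAST, rqmode);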
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 6ee276c74c52..1b5fc5f428fd 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type); 16void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
17void dlm_del_ast(struct dlm_lkb *lkb); 17void dlm_del_ast(struct dlm_lkb *lkb);
18 18
19void dlm_astd_wake(void); 19void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fc24f4507a3..2f107d1a6a45 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,7 @@ static struct dentry *dlm_root;
27 27
28struct rsb_iter { 28struct rsb_iter {
29 int entry; 29 int entry;
30 int locks; 30 int format;
31 int header; 31 int header;
32 struct dlm_ls *ls; 32 struct dlm_ls *ls;
33 struct list_head *next; 33 struct list_head *next;
@@ -60,8 +60,8 @@ static char *print_lockmode(int mode)
60 } 60 }
61} 61}
62 62
63static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, 63static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
64 struct dlm_rsb *res) 64 struct dlm_rsb *res)
65{ 65{
66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); 66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
67 67
@@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
83 seq_printf(s, "\n"); 83 seq_printf(s, "\n");
84} 84}
85 85
86static int print_resource(struct dlm_rsb *res, struct seq_file *s) 86static int print_format1(struct dlm_rsb *res, struct seq_file *s)
87{ 87{
88 struct dlm_lkb *lkb; 88 struct dlm_lkb *lkb;
89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; 89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
@@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
134 /* Print the locks attached to this resource */ 134 /* Print the locks attached to this resource */
135 seq_printf(s, "Granted Queue\n"); 135 seq_printf(s, "Granted Queue\n");
136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) 136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
137 print_resource_lock(s, lkb, res); 137 print_format1_lock(s, lkb, res);
138 138
139 seq_printf(s, "Conversion Queue\n"); 139 seq_printf(s, "Conversion Queue\n");
140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) 140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
141 print_resource_lock(s, lkb, res); 141 print_format1_lock(s, lkb, res);
142 142
143 seq_printf(s, "Waiting Queue\n"); 143 seq_printf(s, "Waiting Queue\n");
144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) 144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
145 print_resource_lock(s, lkb, res); 145 print_format1_lock(s, lkb, res);
146 146
147 if (list_empty(&res->res_lookup)) 147 if (list_empty(&res->res_lookup))
148 goto out; 148 goto out;
@@ -160,23 +160,24 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
160 return 0; 160 return 0;
161} 161}
162 162
163static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) 163static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
164 struct dlm_rsb *r)
164{ 165{
165 unsigned int waiting = 0; 166 u64 xid = 0;
166 uint64_t xid = 0; 167 u64 us;
167 168
168 if (lkb->lkb_flags & DLM_IFL_USER) { 169 if (lkb->lkb_flags & DLM_IFL_USER) {
169 if (lkb->lkb_ua) 170 if (lkb->lkb_ua)
170 xid = lkb->lkb_ua->xid; 171 xid = lkb->lkb_ua->xid;
171 } 172 }
172 173
173 if (lkb->lkb_timestamp) 174 /* microseconds since lkb was added to current queue */
174 waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); 175 us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
175 176
176 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms 177 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
177 r_nodeid r_len r_name */ 178 r_nodeid r_len r_name */
178 179
179 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n", 180 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
180 lkb->lkb_id, 181 lkb->lkb_id,
181 lkb->lkb_nodeid, 182 lkb->lkb_nodeid,
182 lkb->lkb_remid, 183 lkb->lkb_remid,
@@ -187,26 +188,114 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *
187 lkb->lkb_status, 188 lkb->lkb_status,
188 lkb->lkb_grmode, 189 lkb->lkb_grmode,
189 lkb->lkb_rqmode, 190 lkb->lkb_rqmode,
190 waiting, 191 (unsigned long long)us,
191 r->res_nodeid, 192 r->res_nodeid,
192 r->res_length, 193 r->res_length,
193 r->res_name); 194 r->res_name);
194} 195}
195 196
196static int print_locks(struct dlm_rsb *r, struct seq_file *s) 197static int print_format2(struct dlm_rsb *r, struct seq_file *s)
197{ 198{
198 struct dlm_lkb *lkb; 199 struct dlm_lkb *lkb;
199 200
200 lock_rsb(r); 201 lock_rsb(r);
201 202
202 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) 203 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
203 print_lock(s, lkb, r); 204 print_format2_lock(s, lkb, r);
204 205
205 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) 206 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
206 print_lock(s, lkb, r); 207 print_format2_lock(s, lkb, r);
207 208
208 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) 209 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
209 print_lock(s, lkb, r); 210 print_format2_lock(s, lkb, r);
211
212 unlock_rsb(r);
213 return 0;
214}
215
216static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
217 int rsb_lookup)
218{
219 u64 xid = 0;
220
221 if (lkb->lkb_flags & DLM_IFL_USER) {
222 if (lkb->lkb_ua)
223 xid = lkb->lkb_ua->xid;
224 }
225
226 seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
227 lkb->lkb_id,
228 lkb->lkb_nodeid,
229 lkb->lkb_remid,
230 lkb->lkb_ownpid,
231 (unsigned long long)xid,
232 lkb->lkb_exflags,
233 lkb->lkb_flags,
234 lkb->lkb_status,
235 lkb->lkb_grmode,
236 lkb->lkb_rqmode,
237 lkb->lkb_highbast,
238 rsb_lookup,
239 lkb->lkb_wait_type,
240 lkb->lkb_lvbseq,
241 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
242 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
243}
244
245static int print_format3(struct dlm_rsb *r, struct seq_file *s)
246{
247 struct dlm_lkb *lkb;
248 int i, lvblen = r->res_ls->ls_lvblen;
249 int print_name = 1;
250
251 lock_rsb(r);
252
253 seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
254 r,
255 r->res_nodeid,
256 r->res_first_lkid,
257 r->res_flags,
258 !list_empty(&r->res_root_list),
259 !list_empty(&r->res_recover_list),
260 r->res_recover_locks_count,
261 r->res_length);
262
263 for (i = 0; i < r->res_length; i++) {
264 if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
265 print_name = 0;
266 }
267
268 seq_printf(s, "%s", print_name ? "str " : "hex");
269
270 for (i = 0; i < r->res_length; i++) {
271 if (print_name)
272 seq_printf(s, "%c", r->res_name[i]);
273 else
274 seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
275 }
276 seq_printf(s, "\n");
277
278 if (!r->res_lvbptr)
279 goto do_locks;
280
281 seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
282
283 for (i = 0; i < lvblen; i++)
284 seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
285 seq_printf(s, "\n");
286
287 do_locks:
288 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
289 print_format3_lock(s, lkb, 0);
290
291 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
292 print_format3_lock(s, lkb, 0);
293
294 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
295 print_format3_lock(s, lkb, 0);
296
297 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
298 print_format3_lock(s, lkb, 1);
210 299
211 unlock_rsb(r); 300 unlock_rsb(r);
212 return 0; 301 return 0;
@@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
231 break; 320 break;
232 } 321 }
233 read_unlock(&ls->ls_rsbtbl[i].lock); 322 read_unlock(&ls->ls_rsbtbl[i].lock);
234 } 323 }
235 ri->entry = i; 324 ri->entry = i;
236 325
237 if (ri->entry >= ls->ls_rsbtbl_size) 326 if (ri->entry >= ls->ls_rsbtbl_size)
@@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
248 read_unlock(&ls->ls_rsbtbl[i].lock); 337 read_unlock(&ls->ls_rsbtbl[i].lock);
249 dlm_put_rsb(old); 338 dlm_put_rsb(old);
250 goto top; 339 goto top;
251 } 340 }
252 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); 341 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
253 dlm_hold_rsb(ri->rsb); 342 dlm_hold_rsb(ri->rsb);
254 read_unlock(&ls->ls_rsbtbl[i].lock); 343 read_unlock(&ls->ls_rsbtbl[i].lock);
@@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
274 ri->ls = ls; 363 ri->ls = ls;
275 ri->entry = 0; 364 ri->entry = 0;
276 ri->next = NULL; 365 ri->next = NULL;
366 ri->format = 1;
277 367
278 if (rsb_iter_next(ri)) { 368 if (rsb_iter_next(ri)) {
279 rsb_iter_free(ri); 369 rsb_iter_free(ri);
@@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
325{ 415{
326 struct rsb_iter *ri = iter_ptr; 416 struct rsb_iter *ri = iter_ptr;
327 417
328 if (ri->locks) { 418 switch (ri->format) {
419 case 1:
420 print_format1(ri->rsb, file);
421 break;
422 case 2:
329 if (ri->header) { 423 if (ri->header) {
330 seq_printf(file, "id nodeid remid pid xid exflags flags " 424 seq_printf(file, "id nodeid remid pid xid exflags "
331 "sts grmode rqmode time_ms r_nodeid " 425 "flags sts grmode rqmode time_ms "
332 "r_len r_name\n"); 426 "r_nodeid r_len r_name\n");
333 ri->header = 0; 427 ri->header = 0;
334 } 428 }
335 print_locks(ri->rsb, file); 429 print_format2(ri->rsb, file);
336 } else { 430 break;
337 print_resource(ri->rsb, file); 431 case 3:
432 if (ri->header) {
433 seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
434 ri->header = 0;
435 }
436 print_format3(ri->rsb, file);
437 break;
338 } 438 }
339 439
340 return 0; 440 return 0;
@@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
385 ri->ls = ls; 485 ri->ls = ls;
386 ri->entry = 0; 486 ri->entry = 0;
387 ri->next = NULL; 487 ri->next = NULL;
388 ri->locks = 1; 488 ri->format = 2;
389 489
390 if (*pos == 0) 490 if (*pos == 0)
391 ri->header = 1; 491 ri->header = 1;
@@ -448,6 +548,84 @@ static const struct file_operations locks_fops = {
448}; 548};
449 549
450/* 550/*
551 * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks
552 * This can replace both formats 1 and 2 eventually.
553 */
554
555static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos)
556{
557 struct rsb_iter *ri;
558
559 ri = kzalloc(sizeof *ri, GFP_KERNEL);
560 if (!ri)
561 return NULL;
562
563 ri->ls = ls;
564 ri->entry = 0;
565 ri->next = NULL;
566 ri->format = 3;
567
568 if (*pos == 0)
569 ri->header = 1;
570
571 if (rsb_iter_next(ri)) {
572 rsb_iter_free(ri);
573 return NULL;
574 }
575
576 return ri;
577}
578
579static void *all_seq_start(struct seq_file *file, loff_t *pos)
580{
581 struct rsb_iter *ri;
582 loff_t n = *pos;
583
584 ri = all_iter_init(file->private, pos);
585 if (!ri)
586 return NULL;
587
588 while (n--) {
589 if (rsb_iter_next(ri)) {
590 rsb_iter_free(ri);
591 return NULL;
592 }
593 }
594
595 return ri;
596}
597
598static struct seq_operations all_seq_ops = {
599 .start = all_seq_start,
600 .next = rsb_seq_next,
601 .stop = rsb_seq_stop,
602 .show = rsb_seq_show,
603};
604
605static int all_open(struct inode *inode, struct file *file)
606{
607 struct seq_file *seq;
608 int ret;
609
610 ret = seq_open(file, &all_seq_ops);
611 if (ret)
612 return ret;
613
614 seq = file->private_data;
615 seq->private = inode->i_private;
616
617 return 0;
618}
619
620static const struct file_operations all_fops = {
621 .owner = THIS_MODULE,
622 .open = all_open,
623 .read = seq_read,
624 .llseek = seq_lseek,
625 .release = seq_release
626};
627
628/*
451 * dump lkb's on the ls_waiters list 629 * dump lkb's on the ls_waiters list
452 */ 630 */
453 631
@@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = {
489 .read = waiters_read 667 .read = waiters_read
490}; 668};
491 669
670void dlm_delete_debug_file(struct dlm_ls *ls)
671{
672 if (ls->ls_debug_rsb_dentry)
673 debugfs_remove(ls->ls_debug_rsb_dentry);
674 if (ls->ls_debug_waiters_dentry)
675 debugfs_remove(ls->ls_debug_waiters_dentry);
676 if (ls->ls_debug_locks_dentry)
677 debugfs_remove(ls->ls_debug_locks_dentry);
678 if (ls->ls_debug_all_dentry)
679 debugfs_remove(ls->ls_debug_all_dentry);
680}
681
492int dlm_create_debug_file(struct dlm_ls *ls) 682int dlm_create_debug_file(struct dlm_ls *ls)
493{ 683{
494 char name[DLM_LOCKSPACE_LEN+8]; 684 char name[DLM_LOCKSPACE_LEN+8];
495 685
686 /* format 1 */
687
496 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, 688 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
497 S_IFREG | S_IRUGO, 689 S_IFREG | S_IRUGO,
498 dlm_root, 690 dlm_root,
499 ls, 691 ls,
500 &rsb_fops); 692 &rsb_fops);
501 if (!ls->ls_debug_rsb_dentry) 693 if (!ls->ls_debug_rsb_dentry)
502 return -ENOMEM; 694 goto fail;
503 695
504 memset(name, 0, sizeof(name)); 696 /* format 2 */
505 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
506
507 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
508 S_IFREG | S_IRUGO,
509 dlm_root,
510 ls,
511 &waiters_fops);
512 if (!ls->ls_debug_waiters_dentry) {
513 debugfs_remove(ls->ls_debug_rsb_dentry);
514 return -ENOMEM;
515 }
516 697
517 memset(name, 0, sizeof(name)); 698 memset(name, 0, sizeof(name));
518 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); 699 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
@@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls)
522 dlm_root, 703 dlm_root,
523 ls, 704 ls,
524 &locks_fops); 705 &locks_fops);
525 if (!ls->ls_debug_locks_dentry) { 706 if (!ls->ls_debug_locks_dentry)
526 debugfs_remove(ls->ls_debug_waiters_dentry); 707 goto fail;
527 debugfs_remove(ls->ls_debug_rsb_dentry); 708
528 return -ENOMEM; 709 /* format 3 */
529 } 710
711 memset(name, 0, sizeof(name));
712 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
713
714 ls->ls_debug_all_dentry = debugfs_create_file(name,
715 S_IFREG | S_IRUGO,
716 dlm_root,
717 ls,
718 &all_fops);
719 if (!ls->ls_debug_all_dentry)
720 goto fail;
721
722 memset(name, 0, sizeof(name));
723 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
724
725 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
726 S_IFREG | S_IRUGO,
727 dlm_root,
728 ls,
729 &waiters_fops);
730 if (!ls->ls_debug_waiters_dentry)
731 goto fail;
530 732
531 return 0; 733 return 0;
532}
533 734
534void dlm_delete_debug_file(struct dlm_ls *ls) 735 fail:
535{ 736 dlm_delete_debug_file(ls);
536 if (ls->ls_debug_rsb_dentry) 737 return -ENOMEM;
537 debugfs_remove(ls->ls_debug_rsb_dentry);
538 if (ls->ls_debug_waiters_dentry)
539 debugfs_remove(ls->ls_debug_waiters_dentry);
540 if (ls->ls_debug_locks_dentry)
541 debugfs_remove(ls->ls_debug_locks_dentry);
542} 738}
543 739
544int __init dlm_register_debugfs(void) 740int __init dlm_register_debugfs(void)
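
The debug.c rework above ends with a single error path: dlm_create_debug_file() now creates four debugfs entries (the format-1 rsb file, the format-2 "_locks" file, the new format-3 "_all" file, and "_waiters"), and any failed debugfs_create_file() jumps to one fail label that calls dlm_delete_debug_file(), which checks each dentry before removing it and is therefore safe on partially-created state. A minimal userspace sketch of that goto-fail pattern, with calloc/free standing in for debugfs_create_file/debugfs_remove (struct and helper names are invented):

#include <stdlib.h>

struct dbg_files {
        void *rsb;      /* stands in for ls_debug_rsb_dentry */
        void *locks;    /* stands in for ls_debug_locks_dentry */
        void *all;      /* stands in for ls_debug_all_dentry */
};

static void delete_dbg_files(struct dbg_files *d)
{
        /* one teardown path, safe on partially-built state
         * (free(NULL) is a no-op, like the NULL checks in
         * dlm_delete_debug_file()) */
        free(d->rsb);
        free(d->locks);
        free(d->all);
}

static int create_dbg_files(struct dbg_files *d)
{
        d->rsb = calloc(1, 16);
        if (!d->rsb)
                goto fail;
        d->locks = calloc(1, 16);
        if (!d->locks)
                goto fail;
        d->all = calloc(1, 16);
        if (!d->all)
                goto fail;
        return 0;
fail:
        delete_dbg_files(d);
        return -1;      /* the kernel code returns -ENOMEM */
}

int main(void)
{
        struct dbg_files d = { 0 };

        return create_dbg_files(&d);
}

Centralizing teardown this way removes the duplicated, order-sensitive debugfs_remove() calls that the old error paths carried.
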
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 85defeb64df4..92969f879a17 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
374 struct list_head *list; 374 struct list_head *list;
375 struct dlm_rsb *r; 375 struct dlm_rsb *r;
376 int offset = 0, dir_nodeid; 376 int offset = 0, dir_nodeid;
377 uint16_t be_namelen; 377 __be16 be_namelen;
378 378
379 down_read(&ls->ls_root_sem); 379 down_read(&ls->ls_root_sem);
380 380
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
410 410
411 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { 411 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
412 /* Write end-of-block record */ 412 /* Write end-of-block record */
413 be_namelen = 0; 413 be_namelen = cpu_to_be16(0);
414 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 414 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
415 offset += sizeof(uint16_t); 415 offset += sizeof(__be16);
416 goto out; 416 goto out;
417 } 417 }
418 418
419 be_namelen = cpu_to_be16(r->res_length); 419 be_namelen = cpu_to_be16(r->res_length);
420 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 420 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
421 offset += sizeof(uint16_t); 421 offset += sizeof(__be16);
422 memcpy(outbuf + offset, r->res_name, r->res_length); 422 memcpy(outbuf + offset, r->res_name, r->res_length);
423 offset += r->res_length; 423 offset += r->res_length;
424 } 424 }
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
430 430
431 if ((list == &ls->ls_root_list) && 431 if ((list == &ls->ls_root_list) &&
432 (offset + sizeof(uint16_t) <= outlen)) { 432 (offset + sizeof(uint16_t) <= outlen)) {
433 be_namelen = 0xFFFF; 433 be_namelen = cpu_to_be16(0xFFFF);
434 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 434 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
435 offset += sizeof(uint16_t); 435 offset += sizeof(__be16);
436 } 436 }
437 437
438 out: 438 out:
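
The dir.c hunks are an endianness-annotation cleanup rather than a behavior change: the 16-bit length words written into the master-names buffer were already big-endian on the wire, and switching be_namelen to __be16 with explicit cpu_to_be16() lets sparse verify that no native-endian value leaks in (the old code stored 0 and 0xFFFF unconverted, which only happened to be byte-order invariant). A standalone sketch of the resulting layout, with the byte swap written out by hand (put_be16 is illustrative, not a kernel helper):

#include <stdint.h>
#include <stdio.h>

static size_t put_be16(unsigned char *buf, size_t off, uint16_t v)
{
        buf[off]     = v >> 8;          /* most significant byte first */
        buf[off + 1] = v & 0xff;
        return off + 2;
}

int main(void)
{
        unsigned char out[4];
        size_t off = 0;

        off = put_be16(out, off, 5);            /* a record length */
        off = put_be16(out, off, 0xFFFF);       /* end-of-list marker */
        printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
        return 0;
}
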
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 868e4c9ef127..ef2f1e353966 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -245,7 +245,8 @@ struct dlm_lkb {
245 struct list_head lkb_astqueue; /* need ast to be sent */ 245 struct list_head lkb_astqueue; /* need ast to be sent */
246 struct list_head lkb_ownqueue; /* list of locks for a process */ 246 struct list_head lkb_ownqueue; /* list of locks for a process */
247 struct list_head lkb_time_list; 247 struct list_head lkb_time_list;
248 unsigned long lkb_timestamp; 248 ktime_t lkb_time_bast; /* for debugging */
249 ktime_t lkb_timestamp;
249 unsigned long lkb_timeout_cs; 250 unsigned long lkb_timeout_cs;
250 251
251 char *lkb_lvbptr; 252 char *lkb_lvbptr;
@@ -481,6 +482,7 @@ struct dlm_ls {
481 struct dentry *ls_debug_rsb_dentry; /* debugfs */ 482 struct dentry *ls_debug_rsb_dentry; /* debugfs */
482 struct dentry *ls_debug_waiters_dentry; /* debugfs */ 483 struct dentry *ls_debug_waiters_dentry; /* debugfs */
483 struct dentry *ls_debug_locks_dentry; /* debugfs */ 484 struct dentry *ls_debug_locks_dentry; /* debugfs */
485 struct dentry *ls_debug_all_dentry; /* debugfs */
484 486
485 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ 487 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
486 int ls_uevent_result; 488 int ls_uevent_result;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 724ddac91538..6cfe65bbf4a2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
307 lkb->lkb_lksb->sb_status = rv; 307 lkb->lkb_lksb->sb_status = rv;
308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
309 309
310 dlm_add_ast(lkb, AST_COMP); 310 dlm_add_ast(lkb, AST_COMP, 0);
311} 311}
312 312
313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
318 318
319static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 319static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
320{ 320{
321 lkb->lkb_time_bast = ktime_get();
322
321 if (is_master_copy(lkb)) 323 if (is_master_copy(lkb))
322 send_bast(r, lkb, rqmode); 324 send_bast(r, lkb, rqmode);
323 else { 325 else
324 lkb->lkb_bastmode = rqmode; 326 dlm_add_ast(lkb, AST_BAST, rqmode);
325 dlm_add_ast(lkb, AST_BAST);
326 }
327} 327}
328 328
329/* 329/*
@@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
744 744
745 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); 745 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
746 746
747 lkb->lkb_timestamp = ktime_get();
748
747 lkb->lkb_status = status; 749 lkb->lkb_status = status;
748 750
749 switch (status) { 751 switch (status) {
@@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb)
1013{ 1015{
1014 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 1016 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1015 1017
1016 if (is_master_copy(lkb)) { 1018 if (is_master_copy(lkb))
1017 lkb->lkb_timestamp = jiffies;
1018 return; 1019 return;
1019 }
1020 1020
1021 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && 1021 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
1022 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { 1022 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
@@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb)
1031 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); 1031 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
1032 mutex_lock(&ls->ls_timeout_mutex); 1032 mutex_lock(&ls->ls_timeout_mutex);
1033 hold_lkb(lkb); 1033 hold_lkb(lkb);
1034 lkb->lkb_timestamp = jiffies;
1035 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); 1034 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
1036 mutex_unlock(&ls->ls_timeout_mutex); 1035 mutex_unlock(&ls->ls_timeout_mutex);
1037} 1036}
@@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1059 struct dlm_rsb *r; 1058 struct dlm_rsb *r;
1060 struct dlm_lkb *lkb; 1059 struct dlm_lkb *lkb;
1061 int do_cancel, do_warn; 1060 int do_cancel, do_warn;
1061 s64 wait_us;
1062 1062
1063 for (;;) { 1063 for (;;) {
1064 if (dlm_locking_stopped(ls)) 1064 if (dlm_locking_stopped(ls))
@@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1069 mutex_lock(&ls->ls_timeout_mutex); 1069 mutex_lock(&ls->ls_timeout_mutex);
1070 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { 1070 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
1071 1071
1072 wait_us = ktime_to_us(ktime_sub(ktime_get(),
1073 lkb->lkb_timestamp));
1074
1072 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && 1075 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
1073 time_after_eq(jiffies, lkb->lkb_timestamp + 1076 wait_us >= (lkb->lkb_timeout_cs * 10000))
1074 lkb->lkb_timeout_cs * HZ/100))
1075 do_cancel = 1; 1077 do_cancel = 1;
1076 1078
1077 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && 1079 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
1078 time_after_eq(jiffies, lkb->lkb_timestamp + 1080 wait_us >= dlm_config.ci_timewarn_cs * 10000)
1079 dlm_config.ci_timewarn_cs * HZ/100))
1080 do_warn = 1; 1081 do_warn = 1;
1081 1082
1082 if (!do_cancel && !do_warn) 1083 if (!do_cancel && !do_warn)
@@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1122void dlm_adjust_timeouts(struct dlm_ls *ls) 1123void dlm_adjust_timeouts(struct dlm_ls *ls)
1123{ 1124{
1124 struct dlm_lkb *lkb; 1125 struct dlm_lkb *lkb;
1125 long adj = jiffies - ls->ls_recover_begin; 1126 u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
1126 1127
1127 ls->ls_recover_begin = 0; 1128 ls->ls_recover_begin = 0;
1128 mutex_lock(&ls->ls_timeout_mutex); 1129 mutex_lock(&ls->ls_timeout_mutex);
1129 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) 1130 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1130 lkb->lkb_timestamp += adj; 1131 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
1131 mutex_unlock(&ls->ls_timeout_mutex); 1132 mutex_unlock(&ls->ls_timeout_mutex);
1132} 1133}
1133 1134
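
The lock.c changes move lock timestamps from jiffies to ktime_t: add_lkb() stamps lkb_timestamp with ktime_get() whenever an lkb is placed on a queue, elapsed time becomes a plain subtraction converted to microseconds, and the configured centisecond timeouts are compared after scaling by 10,000 (1 cs = 10,000 us, replacing the old HZ/100 arithmetic). A rough userspace analogue using a monotonic clock (dlm names are used loosely; this is a sketch, not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int64_t now_us(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);    /* like ktime_get() */
        return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}

int main(void)
{
        uint64_t timeout_cs = 1;                /* 10 ms timeout */
        int64_t timestamp = now_us();           /* lkb_timestamp at queue time */

        usleep(20000);                          /* the lock waits 20 ms */

        int64_t wait_us = now_us() - timestamp;
        if (wait_us >= (int64_t)(timeout_cs * 10000))
                printf("cancel after %lld us\n", (long long)wait_us);
        return 0;
}

Because every queued lkb is now stamped, dlm_adjust_timeouts() can compensate for time spent in recovery with ktime_add_us() instead of adding raw jiffies.
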
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3962262f991a..103a5ebd1371 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con)
295 con->sock->sk->sk_write_space = lowcomms_write_space; 295 con->sock->sk->sk_write_space = lowcomms_write_space;
296 con->sock->sk->sk_state_change = lowcomms_state_change; 296 con->sock->sk->sk_state_change = lowcomms_state_change;
297 con->sock->sk->sk_user_data = con; 297 con->sock->sk->sk_user_data = con;
298 con->sock->sk->sk_allocation = GFP_NOFS;
298 return 0; 299 return 0;
299} 300}
300 301
@@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con)
823 len = e->len; 824 len = e->len;
824 offset = e->offset; 825 offset = e->offset;
825 spin_unlock(&con->writequeue_lock); 826 spin_unlock(&con->writequeue_lock);
826 kmap(e->page);
827 827
828 /* Send the first block off the write queue */ 828 /* Send the first block off the write queue */
829 iov[0].iov_base = page_address(e->page)+offset; 829 iov[0].iov_base = page_address(e->page)+offset;
@@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con)
854 854
855 if (e->len == 0 && e->users == 0) { 855 if (e->len == 0 && e->users == 0) {
856 list_del(&e->list); 856 list_del(&e->list);
857 kunmap(e->page);
858 free_entry(e); 857 free_entry(e);
859 } 858 }
860 spin_unlock(&con->writequeue_lock); 859 spin_unlock(&con->writequeue_lock);
@@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1203 1202
1204 if (e) { 1203 if (e) {
1205 got_one: 1204 got_one:
1206 if (users == 0)
1207 kmap(e->page);
1208 *ppc = page_address(e->page) + offset; 1205 *ppc = page_address(e->page) + offset;
1209 return e; 1206 return e;
1210 } 1207 }
@@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
1233 if (users) 1230 if (users)
1234 goto out; 1231 goto out;
1235 e->len = e->end - e->offset; 1232 e->len = e->end - e->offset;
1236 kunmap(e->page);
1237 spin_unlock(&con->writequeue_lock); 1233 spin_unlock(&con->writequeue_lock);
1238 1234
1239 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { 1235 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
@@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con)
1272 offset = e->offset; 1268 offset = e->offset;
1273 BUG_ON(len == 0 && e->users == 0); 1269 BUG_ON(len == 0 && e->users == 0);
1274 spin_unlock(&con->writequeue_lock); 1270 spin_unlock(&con->writequeue_lock);
1275 kmap(e->page);
1276 1271
1277 ret = 0; 1272 ret = 0;
1278 if (len) { 1273 if (len) {
@@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con)
1294 1289
1295 if (e->len == 0 && e->users == 0) { 1290 if (e->len == 0 && e->users == 0) {
1296 list_del(&e->list); 1291 list_del(&e->list);
1297 kunmap(e->page);
1298 free_entry(e); 1292 free_entry(e);
1299 continue; 1293 continue;
1300 } 1294 }
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 54c14c6d06cb..c1775b84ebab 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
42 p = kzalloc(ls->ls_lvblen, GFP_KERNEL); 42 p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
43 return p; 43 return p;
44} 44}
45 45
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
57 57
58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
59 59
60 r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL); 60 r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
61 return r; 61 return r;
62} 62}
63 63
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
75 lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL); 75 lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
76 return lkb; 76 return lkb;
77} 77}
78 78
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 07ac709f3ed7..f3396c622aec 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
112 ordinary messages). */ 112 ordinary messages). */
113 113
114 if (msglen > sizeof(__tmp) && p == &__tmp.p) { 114 if (msglen > sizeof(__tmp) && p == &__tmp.p) {
115 p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 115 p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
116 if (p == NULL) 116 if (p == NULL)
117 return ret; 117 return ret;
118 } 118 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index aa2a5775a027..ccc9d62c462d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
115 data->status = lkb->lkb_status; 115 data->status = lkb->lkb_status;
116 data->grmode = lkb->lkb_grmode; 116 data->grmode = lkb->lkb_grmode;
117 data->rqmode = lkb->lkb_rqmode; 117 data->rqmode = lkb->lkb_rqmode;
118 data->timestamp = lkb->lkb_timestamp;
119 if (lkb->lkb_ua) 118 if (lkb->lkb_ua)
120 data->xid = lkb->lkb_ua->xid; 119 data->xid = lkb->lkb_ua->xid;
121 if (r) { 120 if (r) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b3832c67194a..065149e84f42 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
175/* we could possibly check if the cancel of an orphan has resulted in the lkb 175/* we could possibly check if the cancel of an orphan has resulted in the lkb
176 being removed and then remove that lkb from the orphans list and free it */ 176 being removed and then remove that lkb from the orphans list and free it */
177 177
178void dlm_user_add_ast(struct dlm_lkb *lkb, int type) 178void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
179{ 179{
180 struct dlm_ls *ls; 180 struct dlm_ls *ls;
181 struct dlm_user_args *ua; 181 struct dlm_user_args *ua;
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
208 208
209 ast_type = lkb->lkb_ast_type; 209 ast_type = lkb->lkb_ast_type;
210 lkb->lkb_ast_type |= type; 210 lkb->lkb_ast_type |= type;
211 if (bastmode)
212 lkb->lkb_bastmode = bastmode;
211 213
212 if (!ast_type) { 214 if (!ast_type) {
213 kref_get(&lkb->lkb_ref); 215 kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 35eb6a13d616..1c9686492286 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,7 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 15int dlm_device_deregister(struct dlm_ls *ls);
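
In user.c, pending callback types accumulate as bit flags in lkb_ast_type, and the blocking-ast mode is now passed into dlm_user_add_ast() and stored there, together with the type update, instead of being written into lkb_bastmode by the caller before queuing (queue_bast() in lock.c now hands rqmode through dlm_add_ast()). A minimal model of that flow, with plain ints in place of the kernel's locking and kref handling (the names mirror dlm fields, but the code is illustrative):

#include <stdio.h>

#define AST_COMP 1
#define AST_BAST 2

struct lkb {
        int ast_type;   /* pending callback types, OR-ed together */
        int bastmode;   /* mode from the most recent blocking ast */
};

static void add_ast(struct lkb *lkb, int type, int bastmode)
{
        int was_queued = lkb->ast_type;

        lkb->ast_type |= type;
        if (bastmode)
                lkb->bastmode = bastmode;
        if (!was_queued) {
                /* first pending callback: the kernel takes a kref and
                 * wakes the reader here */
        }
}

int main(void)
{
        struct lkb lkb = { 0, 0 };

        add_ast(&lkb, AST_BAST, 3);     /* blocking ast, requested mode 3 */
        add_ast(&lkb, AST_COMP, 0);     /* completion ast, no mode */
        printf("type=%d bastmode=%d\n", lkb.ast_type, lkb.bastmode);
        return 0;
}
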
diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581c..48c0571f831d 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
211 211
212struct dqstats dqstats; 212struct dqstats dqstats;
213 213
214static void dqput(struct dquot *dquot);
215
216static inline unsigned int 214static inline unsigned int
217hashfn(const struct super_block *sb, unsigned int id, int type) 215hashfn(const struct super_block *sb, unsigned int id, int type)
218{ 216{
@@ -415,6 +413,17 @@ out_dqlock:
415 return ret; 413 return ret;
416} 414}
417 415
416void dquot_destroy(struct dquot *dquot)
417{
418 kmem_cache_free(dquot_cachep, dquot);
419}
420EXPORT_SYMBOL(dquot_destroy);
421
422static inline void do_destroy_dquot(struct dquot *dquot)
423{
424 dquot->dq_sb->dq_op->destroy_dquot(dquot);
425}
426
418/* Invalidate all dquots on the list. Note that this function is called after 427/* Invalidate all dquots on the list. Note that this function is called after
419 * quota is disabled and pointers from inodes removed so there cannot be new 428 * quota is disabled and pointers from inodes removed so there cannot be new
420 * quota users. There can still be some users of quotas due to inodes being 429 * quota users. There can still be some users of quotas due to inodes being
@@ -463,9 +472,44 @@ restart:
463 remove_dquot_hash(dquot); 472 remove_dquot_hash(dquot);
464 remove_free_dquot(dquot); 473 remove_free_dquot(dquot);
465 remove_inuse(dquot); 474 remove_inuse(dquot);
466 kmem_cache_free(dquot_cachep, dquot); 475 do_destroy_dquot(dquot);
476 }
477 spin_unlock(&dq_list_lock);
478}
479
480/* Call callback for every active dquot on given filesystem */
481int dquot_scan_active(struct super_block *sb,
482 int (*fn)(struct dquot *dquot, unsigned long priv),
483 unsigned long priv)
484{
485 struct dquot *dquot, *old_dquot = NULL;
486 int ret = 0;
487
488 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
489 spin_lock(&dq_list_lock);
490 list_for_each_entry(dquot, &inuse_list, dq_inuse) {
491 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
492 continue;
493 if (dquot->dq_sb != sb)
494 continue;
495 /* Now we have active dquot so we can just increase use count */
496 atomic_inc(&dquot->dq_count);
497 dqstats.lookups++;
498 spin_unlock(&dq_list_lock);
499 dqput(old_dquot);
500 old_dquot = dquot;
501 ret = fn(dquot, priv);
502 if (ret < 0)
503 goto out;
504 spin_lock(&dq_list_lock);
505 /* We are safe to continue now because our dquot could not
506 * be moved out of the inuse list while we hold the reference */
467 } 507 }
468 spin_unlock(&dq_list_lock); 508 spin_unlock(&dq_list_lock);
509out:
510 dqput(old_dquot);
511 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
512 return ret;
469} 513}
470 514
471int vfs_quota_sync(struct super_block *sb, int type) 515int vfs_quota_sync(struct super_block *sb, int type)
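
dquot_scan_active() is a standard pinned-iteration pattern: walk the in-use list under dq_list_lock, take a reference on each active dquot, drop the spinlock for the callback (which may sleep), then re-take the lock and continue from the pinned entry, which the held reference keeps on the list; the previous entry is released only after the next one is pinned. A simplified userspace model, with a mutex and plain counters in place of the spinlock and atomic refcount:

#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *next;
        int refs;
        int active;
        int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static int scan_active(struct node *head, int (*fn)(struct node *))
{
        struct node *n, *old = NULL;
        int ret = 0;

        pthread_mutex_lock(&list_lock);
        for (n = head; n; n = n->next) {
                if (!n->active)
                        continue;
                n->refs++;                      /* pin the entry */
                pthread_mutex_unlock(&list_lock);
                if (old)
                        old->refs--;            /* like dqput(old_dquot) */
                old = n;
                ret = fn(n);                    /* callback may sleep */
                if (ret < 0)
                        goto out;
                pthread_mutex_lock(&list_lock); /* resume from pinned node */
        }
        pthread_mutex_unlock(&list_lock);
out:
        if (old)
                old->refs--;
        return ret;
}

static int show(struct node *n)
{
        printf("dquot %d\n", n->id);
        return 0;
}

int main(void)
{
        struct node c = { NULL, 0, 1, 3 };
        struct node b = { &c, 0, 0, 2 };
        struct node a = { &b, 0, 1, 1 };

        return scan_active(&a, show);
}
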
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
479 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 523 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
480 if (type != -1 && cnt != type) 524 if (type != -1 && cnt != type)
481 continue; 525 continue;
482 if (!sb_has_quota_enabled(sb, cnt)) 526 if (!sb_has_quota_active(sb, cnt))
483 continue; 527 continue;
484 spin_lock(&dq_list_lock); 528 spin_lock(&dq_list_lock);
485 dirty = &dqopt->info[cnt].dqi_dirty_list; 529 dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
504 } 548 }
505 549
506 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 550 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
507 if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt) 551 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
508 && info_dirty(&dqopt->info[cnt])) 552 && info_dirty(&dqopt->info[cnt]))
509 sb->dq_op->write_info(sb, cnt); 553 sb->dq_op->write_info(sb, cnt);
510 spin_lock(&dq_list_lock); 554 spin_lock(&dq_list_lock);
511 dqstats.syncs++; 555 dqstats.syncs++;
@@ -527,7 +571,7 @@ static void prune_dqcache(int count)
527 remove_dquot_hash(dquot); 571 remove_dquot_hash(dquot);
528 remove_free_dquot(dquot); 572 remove_free_dquot(dquot);
529 remove_inuse(dquot); 573 remove_inuse(dquot);
530 kmem_cache_free(dquot_cachep, dquot); 574 do_destroy_dquot(dquot);
531 count--; 575 count--;
532 head = free_dquots.prev; 576 head = free_dquots.prev;
533 } 577 }
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = {
558 * NOTE: If you change this function please check whether dqput_blocks() works right... 602 * NOTE: If you change this function please check whether dqput_blocks() works right...
559 * MUST be called with either dqptr_sem or dqonoff_mutex held 603 * MUST be called with either dqptr_sem or dqonoff_mutex held
560 */ 604 */
561static void dqput(struct dquot *dquot) 605void dqput(struct dquot *dquot)
562{ 606{
563 int ret; 607 int ret;
564 608
@@ -584,7 +628,7 @@ we_slept:
584 /* We have more than one user... nothing to do */ 628 /* We have more than one user... nothing to do */
585 atomic_dec(&dquot->dq_count); 629 atomic_dec(&dquot->dq_count);
586 /* Releasing dquot during quotaoff phase? */ 630 /* Releasing dquot during quotaoff phase? */
587 if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) && 631 if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
588 atomic_read(&dquot->dq_count) == 1) 632 atomic_read(&dquot->dq_count) == 1)
589 wake_up(&dquot->dq_wait_unused); 633 wake_up(&dquot->dq_wait_unused);
590 spin_unlock(&dq_list_lock); 634 spin_unlock(&dq_list_lock);
@@ -625,11 +669,17 @@ we_slept:
625 spin_unlock(&dq_list_lock); 669 spin_unlock(&dq_list_lock);
626} 670}
627 671
672struct dquot *dquot_alloc(struct super_block *sb, int type)
673{
674 return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
675}
676EXPORT_SYMBOL(dquot_alloc);
677
628static struct dquot *get_empty_dquot(struct super_block *sb, int type) 678static struct dquot *get_empty_dquot(struct super_block *sb, int type)
629{ 679{
630 struct dquot *dquot; 680 struct dquot *dquot;
631 681
632 dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS); 682 dquot = sb->dq_op->alloc_dquot(sb, type);
633 if(!dquot) 683 if(!dquot)
634 return NODQUOT; 684 return NODQUOT;
635 685
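
Turning allocation and freeing into operations (alloc_dquot/destroy_dquot, with the exported dquot_alloc()/dquot_destroy() as defaults) lets a filesystem embed the generic struct dquot inside a larger structure of its own and hand the quota core only the embedded part. A sketch of that shape (the myfs wrapper and its fields are invented):

#include <stdlib.h>

struct dquot { int id; };

struct myfs_dquot {                     /* hypothetical wrapper */
        struct dquot dq;                /* placed first; a real kernel
                                         * user could also use container_of */
        int fs_private;
};

struct dquot_operations {
        struct dquot *(*alloc_dquot)(void);
        void (*destroy_dquot)(struct dquot *);
};

static struct dquot *myfs_alloc_dquot(void)
{
        struct myfs_dquot *d = calloc(1, sizeof(*d));

        return d ? &d->dq : NULL;
}

static void myfs_destroy_dquot(struct dquot *dq)
{
        /* dq sits at offset 0, so the cast recovers the wrapper */
        free((struct myfs_dquot *)dq);
}

static const struct dquot_operations myfs_ops = {
        .alloc_dquot   = myfs_alloc_dquot,
        .destroy_dquot = myfs_destroy_dquot,
};

int main(void)
{
        struct dquot *dq = myfs_ops.alloc_dquot();

        if (dq)
                myfs_ops.destroy_dquot(dq);
        return 0;
}
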
@@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
647} 697}
648 698
649/* 699/*
700 * Check whether dquot is in memory.
701 * MUST be called with either dqptr_sem or dqonoff_mutex held
702 */
703int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
704{
705 unsigned int hashent = hashfn(sb, id, type);
706 int ret = 0;
707
708 if (!sb_has_quota_active(sb, type))
709 return 0;
710 spin_lock(&dq_list_lock);
711 if (find_dquot(hashent, sb, id, type) != NODQUOT)
712 ret = 1;
713 spin_unlock(&dq_list_lock);
714 return ret;
715}
716
717/*
650 * Get reference to dquot 718 * Get reference to dquot
651 * MUST be called with either dqptr_sem or dqonoff_mutex held 719 * MUST be called with either dqptr_sem or dqonoff_mutex held
652 */ 720 */
653static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) 721struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
654{ 722{
655 unsigned int hashent = hashfn(sb, id, type); 723 unsigned int hashent = hashfn(sb, id, type);
656 struct dquot *dquot, *empty = NODQUOT; 724 struct dquot *dquot, *empty = NODQUOT;
657 725
658 if (!sb_has_quota_enabled(sb, type)) 726 if (!sb_has_quota_active(sb, type))
659 return NODQUOT; 727 return NODQUOT;
660we_slept: 728we_slept:
661 spin_lock(&dq_list_lock); 729 spin_lock(&dq_list_lock);
@@ -682,7 +750,7 @@ we_slept:
682 dqstats.lookups++; 750 dqstats.lookups++;
683 spin_unlock(&dq_list_lock); 751 spin_unlock(&dq_list_lock);
684 if (empty) 752 if (empty)
685 kmem_cache_free(dquot_cachep, empty); 753 do_destroy_dquot(empty);
686 } 754 }
687 /* Wait for dq_lock - after this we know that either dquot_release() is already 755 /* Wait for dq_lock - after this we know that either dquot_release() is already
688 * finished or it will be canceled due to dq_count > 1 test */ 756 * finished or it will be canceled due to dq_count > 1 test */
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
820 } 888 }
821} 889}
822 890
823static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) 891static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
824{ 892{
825 dquot->dq_dqb.dqb_curinodes += number; 893 dquot->dq_dqb.dqb_curinodes += number;
826} 894}
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
830 dquot->dq_dqb.dqb_curspace += number; 898 dquot->dq_dqb.dqb_curspace += number;
831} 899}
832 900
833static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) 901static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
834{ 902{
835 if (dquot->dq_dqb.dqb_curinodes > number) 903 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
904 dquot->dq_dqb.dqb_curinodes >= number)
836 dquot->dq_dqb.dqb_curinodes -= number; 905 dquot->dq_dqb.dqb_curinodes -= number;
837 else 906 else
838 dquot->dq_dqb.dqb_curinodes = 0; 907 dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
843 912
844static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) 913static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
845{ 914{
846 if (dquot->dq_dqb.dqb_curspace > number) 915 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
916 dquot->dq_dqb.dqb_curspace >= number)
847 dquot->dq_dqb.dqb_curspace -= number; 917 dquot->dq_dqb.dqb_curspace -= number;
848 else 918 else
849 dquot->dq_dqb.dqb_curspace = 0; 919 dquot->dq_dqb.dqb_curspace = 0;
850 if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) 920 if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
851 dquot->dq_dqb.dqb_btime = (time_t) 0; 921 dquot->dq_dqb.dqb_btime = (time_t) 0;
852 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 922 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
853} 923}
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
1023} 1093}
1024 1094
1025/* needs dq_data_lock */ 1095/* needs dq_data_lock */
1026static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) 1096static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1027{ 1097{
1028 *warntype = QUOTA_NL_NOWARN; 1098 *warntype = QUOTA_NL_NOWARN;
1029 if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1099 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1100 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1030 return QUOTA_OK; 1101 return QUOTA_OK;
1031 1102
1032 if (dquot->dq_dqb.dqb_ihardlimit && 1103 if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
1058static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) 1129static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
1059{ 1130{
1060 *warntype = QUOTA_NL_NOWARN; 1131 *warntype = QUOTA_NL_NOWARN;
1061 if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1132 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1133 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1062 return QUOTA_OK; 1134 return QUOTA_OK;
1063 1135
1064 if (dquot->dq_dqb.dqb_bhardlimit && 1136 if (dquot->dq_dqb.dqb_bhardlimit &&
1065 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit && 1137 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
1066 !ignore_hardlimit(dquot)) { 1138 !ignore_hardlimit(dquot)) {
1067 if (!prealloc) 1139 if (!prealloc)
1068 *warntype = QUOTA_NL_BHARDWARN; 1140 *warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1070 } 1142 }
1071 1143
1072 if (dquot->dq_dqb.dqb_bsoftlimit && 1144 if (dquot->dq_dqb.dqb_bsoftlimit &&
1073 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && 1145 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
1074 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && 1146 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
1075 !ignore_hardlimit(dquot)) { 1147 !ignore_hardlimit(dquot)) {
1076 if (!prealloc) 1148 if (!prealloc)
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1079 } 1151 }
1080 1152
1081 if (dquot->dq_dqb.dqb_bsoftlimit && 1153 if (dquot->dq_dqb.dqb_bsoftlimit &&
1082 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && 1154 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
1083 dquot->dq_dqb.dqb_btime == 0) { 1155 dquot->dq_dqb.dqb_btime == 0) {
1084 if (!prealloc) { 1156 if (!prealloc) {
1085 *warntype = QUOTA_NL_BSOFTWARN; 1157 *warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1096 return QUOTA_OK; 1168 return QUOTA_OK;
1097} 1169}
1098 1170
1099static int info_idq_free(struct dquot *dquot, ulong inodes) 1171static int info_idq_free(struct dquot *dquot, qsize_t inodes)
1100{ 1172{
1101 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1173 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1102 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) 1174 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
1175 !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
1103 return QUOTA_NL_NOWARN; 1176 return QUOTA_NL_NOWARN;
1104 1177
1105 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) 1178 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
1113static int info_bdq_free(struct dquot *dquot, qsize_t space) 1186static int info_bdq_free(struct dquot *dquot, qsize_t space)
1114{ 1187{
1115 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1188 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1116 toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) 1189 dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
1117 return QUOTA_NL_NOWARN; 1190 return QUOTA_NL_NOWARN;
1118 1191
1119 if (toqb(dquot->dq_dqb.dqb_curspace - space) <= 1192 if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
1120 dquot->dq_dqb.dqb_bsoftlimit)
1121 return QUOTA_NL_BSOFTBELOW; 1193 return QUOTA_NL_BSOFTBELOW;
1122 if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && 1194 if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
1123 toqb(dquot->dq_dqb.dqb_curspace - space) < 1195 dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
1124 dquot->dq_dqb.dqb_bhardlimit)
1125 return QUOTA_NL_BHARDBELOW; 1196 return QUOTA_NL_BHARDBELOW;
1126 return QUOTA_NL_NOWARN; 1197 return QUOTA_NL_NOWARN;
1127} 1198}
@@ -1166,17 +1237,23 @@ out_err:
1166 * Release all quotas referenced by inode 1237 * Release all quotas referenced by inode
1167 * Transaction must be started at an entry 1238 * Transaction must be started at an entry
1168 */ 1239 */
1169int dquot_drop(struct inode *inode) 1240int dquot_drop_locked(struct inode *inode)
1170{ 1241{
1171 int cnt; 1242 int cnt;
1172 1243
1173 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1174 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1244 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1175 if (inode->i_dquot[cnt] != NODQUOT) { 1245 if (inode->i_dquot[cnt] != NODQUOT) {
1176 dqput(inode->i_dquot[cnt]); 1246 dqput(inode->i_dquot[cnt]);
1177 inode->i_dquot[cnt] = NODQUOT; 1247 inode->i_dquot[cnt] = NODQUOT;
1178 } 1248 }
1179 } 1249 }
1250 return 0;
1251}
1252
1253int dquot_drop(struct inode *inode)
1254{
1255 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1256 dquot_drop_locked(inode);
1180 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1257 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1181 return 0; 1258 return 0;
1182} 1259}
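
Splitting dquot_drop() into dquot_drop_locked() plus a thin wrapper is the usual locked/unlocked API split: the _locked variant assumes the caller already holds dqptr_sem, so a caller that batches several operations under one lock does not re-acquire it. Userspace model of the shape:

#include <pthread.h>

#define MAXQUOTAS 2

static pthread_mutex_t dqptr_sem = PTHREAD_MUTEX_INITIALIZER;
static int i_dquot[MAXQUOTAS];

static int drop_locked(void)
{
        int cnt;

        /* caller must hold dqptr_sem */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                i_dquot[cnt] = 0;       /* dqput() + NODQUOT in the kernel */
        return 0;
}

static int drop(void)
{
        pthread_mutex_lock(&dqptr_sem);
        drop_locked();
        pthread_mutex_unlock(&dqptr_sem);
        return 0;
}

int main(void)
{
        return drop();
}
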
@@ -1264,7 +1341,7 @@ warn_put_all:
1264/* 1341/*
1265 * This operation can block, but only after everything is updated 1342 * This operation can block, but only after everything is updated
1266 */ 1343 */
1267int dquot_alloc_inode(const struct inode *inode, unsigned long number) 1344int dquot_alloc_inode(const struct inode *inode, qsize_t number)
1268{ 1345{
1269 int cnt, ret = NO_QUOTA; 1346 int cnt, ret = NO_QUOTA;
1270 char warntype[MAXQUOTAS]; 1347 char warntype[MAXQUOTAS];
@@ -1349,7 +1426,7 @@ out_sub:
1349/* 1426/*
1350 * This operation can block, but only after everything is updated 1427 * This operation can block, but only after everything is updated
1351 */ 1428 */
1352int dquot_free_inode(const struct inode *inode, unsigned long number) 1429int dquot_free_inode(const struct inode *inode, qsize_t number)
1353{ 1430{
1354 unsigned int cnt; 1431 unsigned int cnt;
1355 char warntype[MAXQUOTAS]; 1432 char warntype[MAXQUOTAS];
@@ -1495,7 +1572,7 @@ warn_put_all:
1495/* Wrapper for transferring ownership of an inode */ 1572/* Wrapper for transferring ownership of an inode */
1496int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1573int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
1497{ 1574{
1498 if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { 1575 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
1499 vfs_dq_init(inode); 1576 vfs_dq_init(inode);
1500 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) 1577 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
1501 return 1; 1578 return 1;
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = {
1533 .acquire_dquot = dquot_acquire, 1610 .acquire_dquot = dquot_acquire,
1534 .release_dquot = dquot_release, 1611 .release_dquot = dquot_release,
1535 .mark_dirty = dquot_mark_dquot_dirty, 1612 .mark_dirty = dquot_mark_dquot_dirty,
1536 .write_info = dquot_commit_info 1613 .write_info = dquot_commit_info,
1614 .alloc_dquot = dquot_alloc,
1615 .destroy_dquot = dquot_destroy,
1537}; 1616};
1538 1617
1539static inline void set_enable_flags(struct quota_info *dqopt, int type)
1540{
1541 switch (type) {
1542 case USRQUOTA:
1543 dqopt->flags |= DQUOT_USR_ENABLED;
1544 dqopt->flags &= ~DQUOT_USR_SUSPENDED;
1545 break;
1546 case GRPQUOTA:
1547 dqopt->flags |= DQUOT_GRP_ENABLED;
1548 dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
1549 break;
1550 }
1551}
1552
1553static inline void reset_enable_flags(struct quota_info *dqopt, int type,
1554 int remount)
1555{
1556 switch (type) {
1557 case USRQUOTA:
1558 dqopt->flags &= ~DQUOT_USR_ENABLED;
1559 if (remount)
1560 dqopt->flags |= DQUOT_USR_SUSPENDED;
1561 else
1562 dqopt->flags &= ~DQUOT_USR_SUSPENDED;
1563 break;
1564 case GRPQUOTA:
1565 dqopt->flags &= ~DQUOT_GRP_ENABLED;
1566 if (remount)
1567 dqopt->flags |= DQUOT_GRP_SUSPENDED;
1568 else
1569 dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
1570 break;
1571 }
1572}
1573
1574
1575/* 1618/*
1576 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1619 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1577 */ 1620 */
1578int vfs_quota_off(struct super_block *sb, int type, int remount) 1621int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1579{ 1622{
1580 int cnt, ret = 0; 1623 int cnt, ret = 0;
1581 struct quota_info *dqopt = sb_dqopt(sb); 1624 struct quota_info *dqopt = sb_dqopt(sb);
1582 struct inode *toputinode[MAXQUOTAS]; 1625 struct inode *toputinode[MAXQUOTAS];
1583 1626
1627 /* Cannot turn off usage accounting without turning off limits, or
1628 * suspend quotas and simultaneously turn quotas off. */
1629 if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
1630 || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
1631 DQUOT_USAGE_ENABLED)))
1632 return -EINVAL;
1633
1584 /* We need to serialize quota_off() for device */ 1634 /* We need to serialize quota_off() for device */
1585 mutex_lock(&dqopt->dqonoff_mutex); 1635 mutex_lock(&dqopt->dqonoff_mutex);
1586 1636
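
The per-type state above (DQUOT_USAGE_ENABLED, DQUOT_LIMITS_ENABLED, DQUOT_SUSPENDED) replaces the old per-type ENABLED/SUSPENDED flag pairs, and vfs_quota_disable() begins by rejecting incoherent requests: usage accounting cannot be turned off while limits stay enabled, and a suspend cannot be combined with a plain disable. The dquot_state_flag() helper used throughout packs one set of state bits into a lane per quota type; a model of that packing (the 3-bit lane width is invented for illustration, not taken from the kernel headers):

#include <stdio.h>

#define DQUOT_USAGE_ENABLED  (1u << 0)
#define DQUOT_LIMITS_ENABLED (1u << 1)
#define DQUOT_SUSPENDED      (1u << 2)
#define STATE_BITS 3

static unsigned int state_flag(unsigned int flags, int type)
{
        return flags << (STATE_BITS * type);    /* one lane per quota type */
}

int main(void)
{
        unsigned int dq_flags = 0;

        /* user quota fully on, group quota suspended */
        dq_flags |= state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, 0);
        dq_flags |= state_flag(DQUOT_SUSPENDED, 1);
        printf("%#x\n", dq_flags);
        return 0;
}
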
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1589 * sometimes we are called when fill_super() failed and calling 1639 * sometimes we are called when fill_super() failed and calling
1590 * sync_fs() in such cases does no good. 1640 * sync_fs() in such cases does no good.
1591 */ 1641 */
1592 if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) { 1642 if (!sb_any_quota_loaded(sb)) {
1593 mutex_unlock(&dqopt->dqonoff_mutex); 1643 mutex_unlock(&dqopt->dqonoff_mutex);
1594 return 0; 1644 return 0;
1595 } 1645 }
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1597 toputinode[cnt] = NULL; 1647 toputinode[cnt] = NULL;
1598 if (type != -1 && cnt != type) 1648 if (type != -1 && cnt != type)
1599 continue; 1649 continue;
1600 /* If we keep inodes of quota files after remount and quotaoff 1650 if (!sb_has_quota_loaded(sb, cnt))
1601 * is called, drop kept inodes. */
1602 if (!remount && sb_has_quota_suspended(sb, cnt)) {
1603 iput(dqopt->files[cnt]);
1604 dqopt->files[cnt] = NULL;
1605 reset_enable_flags(dqopt, cnt, 0);
1606 continue; 1651 continue;
1652
1653 if (flags & DQUOT_SUSPENDED) {
1654 dqopt->flags |=
1655 dquot_state_flag(DQUOT_SUSPENDED, cnt);
1656 } else {
1657 dqopt->flags &= ~dquot_state_flag(flags, cnt);
1658 /* Turning off suspended quotas? */
1659 if (!sb_has_quota_loaded(sb, cnt) &&
1660 sb_has_quota_suspended(sb, cnt)) {
1661 dqopt->flags &= ~dquot_state_flag(
1662 DQUOT_SUSPENDED, cnt);
1663 iput(dqopt->files[cnt]);
1664 dqopt->files[cnt] = NULL;
1665 continue;
1666 }
1607 } 1667 }
1608 if (!sb_has_quota_enabled(sb, cnt)) 1668
1669 /* We still have to keep quota loaded? */
1670 if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
1609 continue; 1671 continue;
1610 reset_enable_flags(dqopt, cnt, remount);
1611 1672
1612 /* Note: these are blocking operations */ 1673 /* Note: these are blocking operations */
1613 drop_dquot_ref(sb, cnt); 1674 drop_dquot_ref(sb, cnt);
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1623 put_quota_format(dqopt->info[cnt].dqi_format); 1684 put_quota_format(dqopt->info[cnt].dqi_format);
1624 1685
1625 toputinode[cnt] = dqopt->files[cnt]; 1686 toputinode[cnt] = dqopt->files[cnt];
1626 if (!remount) 1687 if (!sb_has_quota_loaded(sb, cnt))
1627 dqopt->files[cnt] = NULL; 1688 dqopt->files[cnt] = NULL;
1628 dqopt->info[cnt].dqi_flags = 0; 1689 dqopt->info[cnt].dqi_flags = 0;
1629 dqopt->info[cnt].dqi_igrace = 0; 1690 dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1631 dqopt->ops[cnt] = NULL; 1692 dqopt->ops[cnt] = NULL;
1632 } 1693 }
1633 mutex_unlock(&dqopt->dqonoff_mutex); 1694 mutex_unlock(&dqopt->dqonoff_mutex);
1695
1696 /* Skip syncing and setting flags if quota files are hidden */
1697 if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
1698 goto put_inodes;
1699
1634 /* Sync the superblock so that buffers with quota data are written to 1700 /* Sync the superblock so that buffers with quota data are written to
1635 * disk (and so userspace sees correct data afterwards). */ 1701 * disk (and so userspace sees correct data afterwards). */
1636 if (sb->s_op->sync_fs) 1702 if (sb->s_op->sync_fs)
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1646 mutex_lock(&dqopt->dqonoff_mutex); 1712 mutex_lock(&dqopt->dqonoff_mutex);
1647 /* If quota was reenabled in the meantime, we have 1713 /* If quota was reenabled in the meantime, we have
1648 * nothing to do */ 1714 * nothing to do */
1649 if (!sb_has_quota_enabled(sb, cnt)) { 1715 if (!sb_has_quota_loaded(sb, cnt)) {
1650 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); 1716 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
1651 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | 1717 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
1652 S_NOATIME | S_NOQUOTA); 1718 S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1655 mark_inode_dirty(toputinode[cnt]); 1721 mark_inode_dirty(toputinode[cnt]);
1656 } 1722 }
1657 mutex_unlock(&dqopt->dqonoff_mutex); 1723 mutex_unlock(&dqopt->dqonoff_mutex);
1724 }
1725 if (sb->s_bdev)
1726 invalidate_bdev(sb->s_bdev);
1727put_inodes:
1728 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1729 if (toputinode[cnt]) {
1658 /* On remount RO, we keep the inode pointer so that we 1730 /* On remount RO, we keep the inode pointer so that we
1659 * can reenable quota on the subsequent remount RW. 1731 * can reenable quota on the subsequent remount RW. We
1660 * But we have better not keep inode pointer when there 1732 * have to check 'flags' variable and not use sb_has_
1661 * is pending delete on the quota file... */ 1733 * function because another quotaon / quotaoff could
1662 if (!remount) 1734 * change global state before we got here. We refuse
1735 * to suspend quotas when there is pending delete on
1736 * the quota file... */
1737 if (!(flags & DQUOT_SUSPENDED))
1663 iput(toputinode[cnt]); 1738 iput(toputinode[cnt]);
1664 else if (!toputinode[cnt]->i_nlink) 1739 else if (!toputinode[cnt]->i_nlink)
1665 ret = -EBUSY; 1740 ret = -EBUSY;
1666 } 1741 }
1667 if (sb->s_bdev)
1668 invalidate_bdev(sb->s_bdev);
1669 return ret; 1742 return ret;
1670} 1743}
1671 1744
1745int vfs_quota_off(struct super_block *sb, int type, int remount)
1746{
1747 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
1748 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
1749}
1750
1672/* 1751/*
1673 * Turn quotas on on a device 1752 * Turn quotas on on a device
1674 */ 1753 */
1675 1754
1676/* Helper function when we already have the inode */ 1755/*
1677static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) 1756 * Helper function to turn quotas on when we already have the inode of
1757 * quota file and no quota information is loaded.
1758 */
1759static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1760 unsigned int flags)
1678{ 1761{
1679 struct quota_format_type *fmt = find_quota_format(format_id); 1762 struct quota_format_type *fmt = find_quota_format(format_id);
1680 struct super_block *sb = inode->i_sb; 1763 struct super_block *sb = inode->i_sb;
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1696 error = -EINVAL; 1779 error = -EINVAL;
1697 goto out_fmt; 1780 goto out_fmt;
1698 } 1781 }
1782 /* Usage always has to be set... */
1783 if (!(flags & DQUOT_USAGE_ENABLED)) {
1784 error = -EINVAL;
1785 goto out_fmt;
1786 }
1699 1787
1700 /* As we bypass the pagecache we must now flush the inode so that 1788 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1701 * we see all the changes from userspace... */ 1789 /* As we bypass the pagecache we must now flush the inode so
1702 write_inode_now(inode, 1); 1790 * that we see all the changes from userspace... */
1703 /* And now flush the block cache so that kernel sees the changes */ 1791 write_inode_now(inode, 1);
1704 invalidate_bdev(sb->s_bdev); 1792 /* And now flush the block cache so that kernel sees the
1793 * changes */
1794 invalidate_bdev(sb->s_bdev);
1795 }
1705 mutex_lock(&inode->i_mutex); 1796 mutex_lock(&inode->i_mutex);
1706 mutex_lock(&dqopt->dqonoff_mutex); 1797 mutex_lock(&dqopt->dqonoff_mutex);
1707 if (sb_has_quota_enabled(sb, type) || 1798 if (sb_has_quota_loaded(sb, type)) {
1708 sb_has_quota_suspended(sb, type)) {
1709 error = -EBUSY; 1799 error = -EBUSY;
1710 goto out_lock; 1800 goto out_lock;
1711 } 1801 }
1712 /* We don't want quota and atime on quota files (deadlocks possible) 1802
1713 * Also nobody should write to the file - we use special IO operations 1803 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1714 * which ignore the immutable bit. */ 1804 /* We don't want quota and atime on quota files (deadlocks
1715 down_write(&dqopt->dqptr_sem); 1805 * possible) Also nobody should write to the file - we use
1716 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); 1806 * special IO operations which ignore the immutable bit. */
1717 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 1807 down_write(&dqopt->dqptr_sem);
1718 up_write(&dqopt->dqptr_sem); 1808 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
1719 sb->dq_op->drop(inode); 1809 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
1810 up_write(&dqopt->dqptr_sem);
1811 sb->dq_op->drop(inode);
1812 }
1720 1813
1721 error = -EIO; 1814 error = -EIO;
1722 dqopt->files[type] = igrab(inode); 1815 dqopt->files[type] = igrab(inode);
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1737 } 1830 }
1738 mutex_unlock(&dqopt->dqio_mutex); 1831 mutex_unlock(&dqopt->dqio_mutex);
1739 mutex_unlock(&inode->i_mutex); 1832 mutex_unlock(&inode->i_mutex);
1740 set_enable_flags(dqopt, type); 1833 dqopt->flags |= dquot_state_flag(flags, type);
1741 1834
1742 add_dquot_ref(sb, type); 1835 add_dquot_ref(sb, type);
1743 mutex_unlock(&dqopt->dqonoff_mutex); 1836 mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
1770 struct quota_info *dqopt = sb_dqopt(sb); 1863 struct quota_info *dqopt = sb_dqopt(sb);
1771 struct inode *inode; 1864 struct inode *inode;
1772 int ret; 1865 int ret;
1866 unsigned int flags;
1773 1867
1774 mutex_lock(&dqopt->dqonoff_mutex); 1868 mutex_lock(&dqopt->dqonoff_mutex);
1775 if (!sb_has_quota_suspended(sb, type)) { 1869 if (!sb_has_quota_suspended(sb, type)) {
1776 mutex_unlock(&dqopt->dqonoff_mutex); 1870 mutex_unlock(&dqopt->dqonoff_mutex);
1777 return 0; 1871 return 0;
1778 } 1872 }
1779 BUG_ON(sb_has_quota_enabled(sb, type));
1780
1781 inode = dqopt->files[type]; 1873 inode = dqopt->files[type];
1782 dqopt->files[type] = NULL; 1874 dqopt->files[type] = NULL;
1783 reset_enable_flags(dqopt, type, 0); 1875 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
1876 DQUOT_LIMITS_ENABLED, type);
1877 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
1784 mutex_unlock(&dqopt->dqonoff_mutex); 1878 mutex_unlock(&dqopt->dqonoff_mutex);
1785 1879
1786 ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id); 1880 flags = dquot_generic_flag(flags, type);
1881 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
1882 flags);
1787 iput(inode); 1883 iput(inode);
1788 1884
1789 return ret; 1885 return ret;
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
1799 if (path->mnt->mnt_sb != sb) 1895 if (path->mnt->mnt_sb != sb)
1800 error = -EXDEV; 1896 error = -EXDEV;
1801 else 1897 else
1802 error = vfs_quota_on_inode(path->dentry->d_inode, type, 1898 error = vfs_load_quota_inode(path->dentry->d_inode, type,
1803 format_id); 1899 format_id, DQUOT_USAGE_ENABLED |
1900 DQUOT_LIMITS_ENABLED);
1804 return error; 1901 return error;
1805} 1902}
1806 1903
1807/* Actual function called from quotactl() */
1808int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 1904int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1809 int remount) 1905 int remount)
1810{ 1906{
@@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1823} 1919}
1824 1920
1825/* 1921/*
1922 * More powerful function for turning on quotas allowing setting
1923 * of individual quota flags
1924 */
1925int vfs_quota_enable(struct inode *inode, int type, int format_id,
1926 unsigned int flags)
1927{
1928 int ret = 0;
1929 struct super_block *sb = inode->i_sb;
1930 struct quota_info *dqopt = sb_dqopt(sb);
1931
1932 /* Just unsuspend quotas? */
1933 if (flags & DQUOT_SUSPENDED)
1934 return vfs_quota_on_remount(sb, type);
1935 if (!flags)
1936 return 0;
1937 /* Just updating flags needed? */
1938 if (sb_has_quota_loaded(sb, type)) {
1939 mutex_lock(&dqopt->dqonoff_mutex);
1940 /* Now do a reliable test... */
1941 if (!sb_has_quota_loaded(sb, type)) {
1942 mutex_unlock(&dqopt->dqonoff_mutex);
1943 goto load_quota;
1944 }
1945 if (flags & DQUOT_USAGE_ENABLED &&
1946 sb_has_quota_usage_enabled(sb, type)) {
1947 ret = -EBUSY;
1948 goto out_lock;
1949 }
1950 if (flags & DQUOT_LIMITS_ENABLED &&
1951 sb_has_quota_limits_enabled(sb, type)) {
1952 ret = -EBUSY;
1953 goto out_lock;
1954 }
1955 sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
1956out_lock:
1957 mutex_unlock(&dqopt->dqonoff_mutex);
1958 return ret;
1959 }
1960
1961load_quota:
1962 return vfs_load_quota_inode(inode, type, format_id, flags);
1963}
1964
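vfs_quota_enable() splits the old single on/off state into independent usage-accounting and limit-enforcement flags. A hedged sketch of a filesystem-side caller, assuming the declarations from linux/quota.h and linux/quotaops.h; example_enable_quota and quota_inode are illustrative names, and only vfs_quota_enable() and the DQUOT_* flags come from the code above:

#include <linux/fs.h>
#include <linux/quota.h>
#include <linux/quotaops.h>

static int example_enable_quota(struct inode *quota_inode)
{
	int err;

	/* Start accounting blocks and inodes without enforcing limits. */
	err = vfs_quota_enable(quota_inode, USRQUOTA, QFMT_VFS_V0,
			       DQUOT_USAGE_ENABLED);
	if (err)
		return err;
	/* Later, additionally enforce limits.  Re-passing a flag that is
	 * already set would return -EBUSY, so only the new flag goes in. */
	return vfs_quota_enable(quota_inode, USRQUOTA, QFMT_VFS_V0,
				DQUOT_LIMITS_ENABLED);
}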
1965/*
1826 * This function is used when filesystem needs to initialize quotas 1966 * This function is used when filesystem needs to initialize quotas
1827 * during mount time. 1967 * during mount time.
1828 */ 1968 */
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
1843 1983
1844 error = security_quota_on(dentry); 1984 error = security_quota_on(dentry);
1845 if (!error) 1985 if (!error)
1846 error = vfs_quota_on_inode(dentry->d_inode, type, format_id); 1986 error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
1987 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
1847 1988
1848out: 1989out:
1849 dput(dentry); 1990 dput(dentry);
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
1866 return ret; 2007 return ret;
1867} 2008}
1868 2009
2010static inline qsize_t qbtos(qsize_t blocks)
2011{
2012 return blocks << QIF_DQBLKSIZE_BITS;
2013}
2014
2015static inline qsize_t stoqb(qsize_t space)
2016{
2017 return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
2018}
2019
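These two helpers convert between the 1 KiB quota blocks exposed in struct if_dqblk and the byte counts the quota core now keeps in struct mem_dqblk; do_get_dqblk() and do_set_dqblk() below apply them at the quotactl boundary. A minimal userspace sketch of the arithmetic, assuming QIF_DQBLKSIZE_BITS == 10 as defined in include/linux/quota.h:

#include <stdio.h>

#define QIF_DQBLKSIZE_BITS 10
#define QIF_DQBLKSIZE (1ULL << QIF_DQBLKSIZE_BITS)

typedef unsigned long long qsize_t;

static qsize_t qbtos(qsize_t blocks)	/* quota blocks -> bytes */
{
	return blocks << QIF_DQBLKSIZE_BITS;
}

static qsize_t stoqb(qsize_t space)	/* bytes -> quota blocks, rounded up */
{
	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
}

int main(void)
{
	/* 5 blocks are 5120 bytes; 5121 bytes round up to 6 blocks, so a
	 * partially used block still counts against the block limit. */
	printf("%llu %llu %llu\n", qbtos(5), stoqb(5120), stoqb(5121));
	return 0;
}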
1869/* Generic routine for getting common part of quota structure */ 2020/* Generic routine for getting common part of quota structure */
1870static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2021static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
1871{ 2022{
1872 struct mem_dqblk *dm = &dquot->dq_dqb; 2023 struct mem_dqblk *dm = &dquot->dq_dqb;
1873 2024
1874 spin_lock(&dq_data_lock); 2025 spin_lock(&dq_data_lock);
1875 di->dqb_bhardlimit = dm->dqb_bhardlimit; 2026 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
1876 di->dqb_bsoftlimit = dm->dqb_bsoftlimit; 2027 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
1877 di->dqb_curspace = dm->dqb_curspace; 2028 di->dqb_curspace = dm->dqb_curspace;
1878 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2029 di->dqb_ihardlimit = dm->dqb_ihardlimit;
1879 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2030 di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1918,28 +2069,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
1918 if (di->dqb_valid & QIF_SPACE) { 2069 if (di->dqb_valid & QIF_SPACE) {
1919 dm->dqb_curspace = di->dqb_curspace; 2070 dm->dqb_curspace = di->dqb_curspace;
1920 check_blim = 1; 2071 check_blim = 1;
2072 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
1921 } 2073 }
1922 if (di->dqb_valid & QIF_BLIMITS) { 2074 if (di->dqb_valid & QIF_BLIMITS) {
1923 dm->dqb_bsoftlimit = di->dqb_bsoftlimit; 2075 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
1924 dm->dqb_bhardlimit = di->dqb_bhardlimit; 2076 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
1925 check_blim = 1; 2077 check_blim = 1;
2078 __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
1926 } 2079 }
1927 if (di->dqb_valid & QIF_INODES) { 2080 if (di->dqb_valid & QIF_INODES) {
1928 dm->dqb_curinodes = di->dqb_curinodes; 2081 dm->dqb_curinodes = di->dqb_curinodes;
1929 check_ilim = 1; 2082 check_ilim = 1;
2083 __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
1930 } 2084 }
1931 if (di->dqb_valid & QIF_ILIMITS) { 2085 if (di->dqb_valid & QIF_ILIMITS) {
1932 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2086 dm->dqb_isoftlimit = di->dqb_isoftlimit;
1933 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2087 dm->dqb_ihardlimit = di->dqb_ihardlimit;
1934 check_ilim = 1; 2088 check_ilim = 1;
2089 __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
1935 } 2090 }
1936 if (di->dqb_valid & QIF_BTIME) 2091 if (di->dqb_valid & QIF_BTIME) {
1937 dm->dqb_btime = di->dqb_btime; 2092 dm->dqb_btime = di->dqb_btime;
1938 if (di->dqb_valid & QIF_ITIME) 2093 check_blim = 1;
2094 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2095 }
2096 if (di->dqb_valid & QIF_ITIME) {
1939 dm->dqb_itime = di->dqb_itime; 2097 dm->dqb_itime = di->dqb_itime;
2098 check_ilim = 1;
2099 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2100 }
1940 2101
1941 if (check_blim) { 2102 if (check_blim) {
1942 if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) { 2103 if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
1943 dm->dqb_btime = 0; 2104 dm->dqb_btime = 0;
1944 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2105 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
1945 } 2106 }
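The quotactl interface keeps expressing limits in QIF_DQBLKSIZE (1 KiB) units; only the in-memory copy is byte-based, with qbtos()/stoqb() doing the translation in do_set_dqblk()/do_get_dqblk(). A hedged userspace sketch of setting block limits; the device path and uid are illustrative:

#include <sys/quota.h>
#include <sys/types.h>
#include <stdio.h>

int main(void)
{
	/* Soft limit 1 MiB, hard limit 2 MiB, in 1 KiB quota blocks;
	 * do_set_dqblk() converts both to bytes via qbtos(). */
	struct dqblk dq = {
		.dqb_bsoftlimit	= 1024,
		.dqb_bhardlimit	= 2048,
		.dqb_valid	= QIF_BLIMITS,
	};

	if (quotactl(QCMD(Q_SETQUOTA, USRQUOTA), "/dev/sda1", 1000,
		     (caddr_t)&dq) != 0)
		perror("quotactl");
	return 0;
}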
@@ -1970,12 +2131,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
1970 int rc; 2131 int rc;
1971 2132
1972 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2133 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
1973 if (!(dquot = dqget(sb, id, type))) { 2134 dquot = dqget(sb, id, type);
1974 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2135 if (!dquot) {
1975 return -ESRCH; 2136 rc = -ESRCH;
2137 goto out;
1976 } 2138 }
1977 rc = do_set_dqblk(dquot, di); 2139 rc = do_set_dqblk(dquot, di);
1978 dqput(dquot); 2140 dqput(dquot);
2141out:
1979 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2142 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
1980 return rc; 2143 return rc;
1981} 2144}
@@ -1986,7 +2149,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
1986 struct mem_dqinfo *mi; 2149 struct mem_dqinfo *mi;
1987 2150
1988 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2151 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
1989 if (!sb_has_quota_enabled(sb, type)) { 2152 if (!sb_has_quota_active(sb, type)) {
1990 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2153 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
1991 return -ESRCH; 2154 return -ESRCH;
1992 } 2155 }
@@ -2005,11 +2168,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2005int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2168int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2006{ 2169{
2007 struct mem_dqinfo *mi; 2170 struct mem_dqinfo *mi;
2171 int err = 0;
2008 2172
2009 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2173 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
2010 if (!sb_has_quota_enabled(sb, type)) { 2174 if (!sb_has_quota_active(sb, type)) {
2011 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2175 err = -ESRCH;
2012 return -ESRCH; 2176 goto out;
2013 } 2177 }
2014 mi = sb_dqopt(sb)->info + type; 2178 mi = sb_dqopt(sb)->info + type;
2015 spin_lock(&dq_data_lock); 2179 spin_lock(&dq_data_lock);
@@ -2023,8 +2187,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2023 mark_info_dirty(sb, type); 2187 mark_info_dirty(sb, type);
2024 /* Force write to disk */ 2188 /* Force write to disk */
2025 sb->dq_op->write_info(sb, type); 2189 sb->dq_op->write_info(sb, type);
2190out:
2026 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2191 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2027 return 0; 2192 return err;
2028} 2193}
2029 2194
2030struct quotactl_ops vfs_quotactl_ops = { 2195struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2351,13 @@ EXPORT_SYMBOL(register_quota_format);
2186EXPORT_SYMBOL(unregister_quota_format); 2351EXPORT_SYMBOL(unregister_quota_format);
2187EXPORT_SYMBOL(dqstats); 2352EXPORT_SYMBOL(dqstats);
2188EXPORT_SYMBOL(dq_data_lock); 2353EXPORT_SYMBOL(dq_data_lock);
2354EXPORT_SYMBOL(vfs_quota_enable);
2189EXPORT_SYMBOL(vfs_quota_on); 2355EXPORT_SYMBOL(vfs_quota_on);
2190EXPORT_SYMBOL(vfs_quota_on_path); 2356EXPORT_SYMBOL(vfs_quota_on_path);
2191EXPORT_SYMBOL(vfs_quota_on_mount); 2357EXPORT_SYMBOL(vfs_quota_on_mount);
2358EXPORT_SYMBOL(vfs_quota_disable);
2192EXPORT_SYMBOL(vfs_quota_off); 2359EXPORT_SYMBOL(vfs_quota_off);
2360EXPORT_SYMBOL(dquot_scan_active);
2193EXPORT_SYMBOL(vfs_quota_sync); 2361EXPORT_SYMBOL(vfs_quota_sync);
2194EXPORT_SYMBOL(vfs_get_dqinfo); 2362EXPORT_SYMBOL(vfs_get_dqinfo);
2195EXPORT_SYMBOL(vfs_set_dqinfo); 2363EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2202,7 +2370,11 @@ EXPORT_SYMBOL(dquot_release);
2202EXPORT_SYMBOL(dquot_mark_dquot_dirty); 2370EXPORT_SYMBOL(dquot_mark_dquot_dirty);
2203EXPORT_SYMBOL(dquot_initialize); 2371EXPORT_SYMBOL(dquot_initialize);
2204EXPORT_SYMBOL(dquot_drop); 2372EXPORT_SYMBOL(dquot_drop);
2373EXPORT_SYMBOL(dquot_drop_locked);
2205EXPORT_SYMBOL(vfs_dq_drop); 2374EXPORT_SYMBOL(vfs_dq_drop);
2375EXPORT_SYMBOL(dqget);
2376EXPORT_SYMBOL(dqput);
2377EXPORT_SYMBOL(dquot_is_cached);
2206EXPORT_SYMBOL(dquot_alloc_space); 2378EXPORT_SYMBOL(dquot_alloc_space);
2207EXPORT_SYMBOL(dquot_alloc_inode); 2379EXPORT_SYMBOL(dquot_alloc_inode);
2208EXPORT_SYMBOL(dquot_free_space); 2380EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 6046239465a1..c01e043670e2 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -175,8 +175,8 @@ out:
175 * 175 *
176 * Returns zero on success; non-zero on error. 176 * Returns zero on success; non-zero on error.
177 */ 177 */
178static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, 178int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
179 loff_t offset) 179 loff_t offset)
180{ 180{
181 int rc = 0; 181 int rc = 0;
182 char dst[MD5_DIGEST_SIZE]; 182 char dst[MD5_DIGEST_SIZE];
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
924 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; 924 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
925 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) 925 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
926 crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; 926 crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
927 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
928 crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
929 if (mount_crypt_stat->flags
930 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
931 crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
932 else if (mount_crypt_stat->flags
933 & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
934 crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
935 }
927} 936}
928 937
929static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( 938static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem {
1060static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { 1069static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
1061 {0x00000001, ECRYPTFS_ENABLE_HMAC}, 1070 {0x00000001, ECRYPTFS_ENABLE_HMAC},
1062 {0x00000002, ECRYPTFS_ENCRYPTED}, 1071 {0x00000002, ECRYPTFS_ENCRYPTED},
1063 {0x00000004, ECRYPTFS_METADATA_IN_XATTR} 1072 {0x00000004, ECRYPTFS_METADATA_IN_XATTR},
1073 {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
1064}; 1074};
1065 1075
1066/** 1076/**
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = {
1149 1159
1150/** 1160/**
1151 * ecryptfs_code_for_cipher_string 1161 * ecryptfs_code_for_cipher_string
1152 * @crypt_stat: The cryptographic context 1162 * @cipher_name: The string alias for the cipher
1163 * @key_bytes: Length of key in bytes; used for AES code selection
1153 * 1164 *
1154 * Returns zero on no match, or the cipher code on match 1165 * Returns zero on no match, or the cipher code on match
1155 */ 1166 */
1156u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) 1167u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
1157{ 1168{
1158 int i; 1169 int i;
1159 u8 code = 0; 1170 u8 code = 0;
1160 struct ecryptfs_cipher_code_str_map_elem *map = 1171 struct ecryptfs_cipher_code_str_map_elem *map =
1161 ecryptfs_cipher_code_str_map; 1172 ecryptfs_cipher_code_str_map;
1162 1173
1163 if (strcmp(crypt_stat->cipher, "aes") == 0) { 1174 if (strcmp(cipher_name, "aes") == 0) {
1164 switch (crypt_stat->key_size) { 1175 switch (key_bytes) {
1165 case 16: 1176 case 16:
1166 code = RFC2440_CIPHER_AES_128; 1177 code = RFC2440_CIPHER_AES_128;
1167 break; 1178 break;
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
1173 } 1184 }
1174 } else { 1185 } else {
1175 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) 1186 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
1176 if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ 1187 if (strcmp(cipher_name, map[i].cipher_str) == 0) {
1177 code = map[i].cipher_code; 1188 code = map[i].cipher_code;
1178 break; 1189 break;
1179 } 1190 }
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data,
1212 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 1223 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
1213 int rc; 1224 int rc;
1214 1225
1226 if (crypt_stat->extent_size == 0)
1227 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
1215 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, 1228 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
1216 ecryptfs_inode); 1229 ecryptfs_inode);
1217 if (rc) { 1230 if (rc) {
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data,
1221 } 1234 }
1222 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { 1235 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
1223 rc = -EINVAL; 1236 rc = -EINVAL;
1224 ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n");
1225 } 1237 }
1226out: 1238out:
1227 return rc; 1239 return rc;
@@ -1628,95 +1640,95 @@ out:
1628} 1640}
1629 1641
1630/** 1642/**
1631 * ecryptfs_encode_filename - converts a plaintext file name to cipher text 1643 * ecryptfs_encrypt_filename - encrypt filename
1632 * @crypt_stat: The crypt_stat struct associated with the file anem to encode
1633 * @name: The plaintext name
1634 * @length: The length of the plaintext
1635 * @encoded_name: The encypted name
1636 * 1644 *
1637 * Encrypts and encodes a filename into something that constitutes a 1645 * CBC-encrypts the filename. We do not want to encrypt the same
1638 * valid filename for a filesystem, with printable characters. 1646 * filename with the same key and IV, which may happen with hard
1647 * links, so we prepend random bits to each filename.
1639 * 1648 *
1640 * We assume that we have a properly initialized crypto context, 1649 * Returns zero on success; non-zero otherwise
1641 * pointed to by crypt_stat->tfm.
1642 *
1643 * TODO: Implement filename decoding and decryption here, in place of
1644 * memcpy. We are keeping the framework around for now to (1)
1645 * facilitate testing of the components needed to implement filename
1646 * encryption and (2) to provide a code base from which other
1647 * developers in the community can easily implement this feature.
1648 *
1649 * Returns the length of encoded filename; negative if error
1650 */ 1650 */
1651int 1651static int
1652ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, 1652ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
1653 const char *name, int length, char **encoded_name) 1653 struct ecryptfs_crypt_stat *crypt_stat,
1654 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
1654{ 1655{
1655 int error = 0; 1656 int rc = 0;
1656 1657
1657 (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); 1658 filename->encrypted_filename = NULL;
1658 if (!(*encoded_name)) { 1659 filename->encrypted_filename_size = 0;
1659 error = -ENOMEM; 1660 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
1661 || (mount_crypt_stat && (mount_crypt_stat->flags
1662 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
1663 size_t packet_size;
1664 size_t remaining_bytes;
1665
1666 rc = ecryptfs_write_tag_70_packet(
1667 NULL, NULL,
1668 &filename->encrypted_filename_size,
1669 mount_crypt_stat, NULL,
1670 filename->filename_size);
1671 if (rc) {
1672 printk(KERN_ERR "%s: Error attempting to get packet "
 1673                                "size for tag 70; rc = [%d]\n", __func__,
1674 rc);
1675 filename->encrypted_filename_size = 0;
1676 goto out;
1677 }
1678 filename->encrypted_filename =
1679 kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
1680 if (!filename->encrypted_filename) {
1681 printk(KERN_ERR "%s: Out of memory whilst attempting "
1682 "to kmalloc [%zd] bytes\n", __func__,
1683 filename->encrypted_filename_size);
1684 rc = -ENOMEM;
1685 goto out;
1686 }
1687 remaining_bytes = filename->encrypted_filename_size;
1688 rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
1689 &remaining_bytes,
1690 &packet_size,
1691 mount_crypt_stat,
1692 filename->filename,
1693 filename->filename_size);
1694 if (rc) {
1695 printk(KERN_ERR "%s: Error attempting to generate "
1696 "tag 70 packet; rc = [%d]\n", __func__,
1697 rc);
1698 kfree(filename->encrypted_filename);
1699 filename->encrypted_filename = NULL;
1700 filename->encrypted_filename_size = 0;
1701 goto out;
1702 }
1703 filename->encrypted_filename_size = packet_size;
1704 } else {
1705 printk(KERN_ERR "%s: No support for requested filename "
1706 "encryption method in this release\n", __func__);
1707 rc = -ENOTSUPP;
1660 goto out; 1708 goto out;
1661 } 1709 }
1662 /* TODO: Filename encryption is a scheduled feature for a
1663 * future version of eCryptfs. This function is here only for
1664 * the purpose of providing a framework for other developers
1665 * to easily implement filename encryption. Hint: Replace this
1666 * memcpy() with a call to encrypt and encode the
1667 * filename, the set the length accordingly. */
1668 memcpy((void *)(*encoded_name), (void *)name, length);
1669 (*encoded_name)[length] = '\0';
1670 error = length + 1;
1671out: 1710out:
1672 return error; 1711 return rc;
1673} 1712}
1674 1713
1675/** 1714static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1676 * ecryptfs_decode_filename - converts the cipher text name to plaintext 1715 const char *name, size_t name_size)
1677 * @crypt_stat: The crypt_stat struct associated with the file
1678 * @name: The filename in cipher text
1679 * @length: The length of the cipher text name
1680 * @decrypted_name: The plaintext name
1681 *
1682 * Decodes and decrypts the filename.
1683 *
1684 * We assume that we have a properly initialized crypto context,
1685 * pointed to by crypt_stat->tfm.
1686 *
1687 * TODO: Implement filename decoding and decryption here, in place of
1688 * memcpy. We are keeping the framework around for now to (1)
1689 * facilitate testing of the components needed to implement filename
1690 * encryption and (2) to provide a code base from which other
1691 * developers in the community can easily implement this feature.
1692 *
1693 * Returns the length of decoded filename; negative if error
1694 */
1695int
1696ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
1697 const char *name, int length, char **decrypted_name)
1698{ 1716{
1699 int error = 0; 1717 int rc = 0;
1700 1718
1701 (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); 1719 (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
1702 if (!(*decrypted_name)) { 1720 if (!(*copied_name)) {
1703 error = -ENOMEM; 1721 rc = -ENOMEM;
1704 goto out; 1722 goto out;
1705 } 1723 }
1706 /* TODO: Filename encryption is a scheduled feature for a 1724 memcpy((void *)(*copied_name), (void *)name, name_size);
1707 * future version of eCryptfs. This function is here only for 1725 (*copied_name)[(name_size)] = '\0'; /* Only for convenience
1708 * the purpose of providing a framework for other developers
1709 * to easily implement filename encryption. Hint: Replace this
1710 * memcpy() with a call to decode and decrypt the
1711 * filename, the set the length accordingly. */
1712 memcpy((void *)(*decrypted_name), (void *)name, length);
1713 (*decrypted_name)[length + 1] = '\0'; /* Only for convenience
1714 * in printing out the 1726 * in printing out the
1715 * string in debug 1727 * string in debug
1716 * messages */ 1728 * messages */
1717 error = length; 1729 (*copied_name_size) = (name_size + 1);
1718out: 1730out:
1719 return error; 1731 return rc;
1720} 1732}
1721 1733
1722/** 1734/**
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1740 *key_tfm = NULL; 1752 *key_tfm = NULL;
1741 if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { 1753 if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
1742 rc = -EINVAL; 1754 rc = -EINVAL;
1743 printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " 1755 printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
1744 "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); 1756 "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
1745 goto out; 1757 goto out;
1746 } 1758 }
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1765 get_random_bytes(dummy_key, *key_size); 1777 get_random_bytes(dummy_key, *key_size);
1766 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); 1778 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
1767 if (rc) { 1779 if (rc) {
1768 printk(KERN_ERR "Error attempting to set key of size [%Zd] for " 1780 printk(KERN_ERR "Error attempting to set key of size [%zd] for "
1769 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); 1781 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
1770 rc = -EINVAL; 1782 rc = -EINVAL;
1771 goto out; 1783 goto out;
@@ -1910,3 +1922,341 @@ out:
1910 mutex_unlock(&key_tfm_list_mutex); 1922 mutex_unlock(&key_tfm_list_mutex);
1911 return rc; 1923 return rc;
1912} 1924}
1925
1926/* 64 characters forming a 6-bit target field */
1927static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
1928 "EFGHIJKLMNOPQRST"
1929 "UVWXYZabcdefghij"
1930 "klmnopqrstuvwxyz");
1931
1932/* We could either offset on every reverse map or just pad some 0x00's
1933 * at the front here */
1934static const unsigned char filename_rev_map[] = {
1935 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
1936 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
1937 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
1938 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
1939 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
1940 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
1941 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
1942 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
1943 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
1944 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
1945 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
1946 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
1947 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
1948 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
1949 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
1950 0x3D, 0x3E, 0x3F
1951};
1952
1953/**
1954 * ecryptfs_encode_for_filename
1955 * @dst: Destination location for encoded filename
1956 * @dst_size: Size of the encoded filename in bytes
1957 * @src: Source location for the filename to encode
1958 * @src_size: Size of the source in bytes
1959 */
1960void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
1961 unsigned char *src, size_t src_size)
1962{
1963 size_t num_blocks;
1964 size_t block_num = 0;
1965 size_t dst_offset = 0;
1966 unsigned char last_block[3];
1967
1968 if (src_size == 0) {
1969 (*dst_size) = 0;
1970 goto out;
1971 }
1972 num_blocks = (src_size / 3);
1973 if ((src_size % 3) == 0) {
1974 memcpy(last_block, (&src[src_size - 3]), 3);
1975 } else {
1976 num_blocks++;
1977 last_block[2] = 0x00;
1978 switch (src_size % 3) {
1979 case 1:
1980 last_block[0] = src[src_size - 1];
1981 last_block[1] = 0x00;
1982 break;
1983 case 2:
1984 last_block[0] = src[src_size - 2];
1985 last_block[1] = src[src_size - 1];
1986 }
1987 }
1988 (*dst_size) = (num_blocks * 4);
1989 if (!dst)
1990 goto out;
1991 while (block_num < num_blocks) {
1992 unsigned char *src_block;
1993 unsigned char dst_block[4];
1994
1995 if (block_num == (num_blocks - 1))
1996 src_block = last_block;
1997 else
1998 src_block = &src[block_num * 3];
1999 dst_block[0] = ((src_block[0] >> 2) & 0x3F);
2000 dst_block[1] = (((src_block[0] << 4) & 0x30)
2001 | ((src_block[1] >> 4) & 0x0F));
2002 dst_block[2] = (((src_block[1] << 2) & 0x3C)
2003 | ((src_block[2] >> 6) & 0x03));
2004 dst_block[3] = (src_block[2] & 0x3F);
2005 dst[dst_offset++] = portable_filename_chars[dst_block[0]];
2006 dst[dst_offset++] = portable_filename_chars[dst_block[1]];
2007 dst[dst_offset++] = portable_filename_chars[dst_block[2]];
2008 dst[dst_offset++] = portable_filename_chars[dst_block[3]];
2009 block_num++;
2010 }
2011out:
2012 return;
2013}
2014
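The encoder above is a base64 variant over a filesystem-safe alphabet: every 3 source bytes become 4 characters from the 64-character set, and the final partial block is zero-padded. A standalone sketch of the same packing; only the alphabet is copied from portable_filename_chars, the rest is illustrative:

#include <stdio.h>
#include <string.h>

static const char charset[] =
	"-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

int main(void)
{
	const unsigned char src[] = "ABC";	/* 0x41 0x42 0x43 */
	size_t src_size = 3;
	size_t i;

	for (i = 0; i < src_size; i += 3) {
		unsigned char b[3] = { 0, 0, 0 };

		memcpy(b, src + i, src_size - i < 3 ? src_size - i : 3);
		putchar(charset[(b[0] >> 2) & 0x3F]);
		putchar(charset[((b[0] << 4) & 0x30) | ((b[1] >> 4) & 0x0F)]);
		putchar(charset[((b[1] << 2) & 0x3C) | ((b[2] >> 6) & 0x03)]);
		putchar(charset[b[2] & 0x3F]);
	}
	putchar('\n');	/* prints "EI71" for the bytes 0x41 0x42 0x43 */
	return 0;
}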
2015/**
2016 * ecryptfs_decode_from_filename
2017 * @dst: If NULL, this function only sets @dst_size and returns. If
2018 * non-NULL, this function decodes the encoded octets in @src
2019 * into the memory that @dst points to.
2020 * @dst_size: Set to the size of the decoded string.
2021 * @src: The encoded set of octets to decode.
2022 * @src_size: The size of the encoded set of octets to decode.
2023 */
2024static void
2025ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
2026 const unsigned char *src, size_t src_size)
2027{
2028 u8 current_bit_offset = 0;
2029 size_t src_byte_offset = 0;
2030 size_t dst_byte_offset = 0;
2031
2032 if (dst == NULL) {
2033 /* Not exact; conservatively long. Every block of 4
2034 * encoded characters decodes into a block of 3
2035 * decoded characters. This segment of code provides
2036 * the caller with the maximum amount of allocated
2037 * space that @dst will need to point to in a
2038 * subsequent call. */
2039 (*dst_size) = (((src_size + 1) * 3) / 4);
2040 goto out;
2041 }
2042 while (src_byte_offset < src_size) {
2043 unsigned char src_byte =
2044 filename_rev_map[(int)src[src_byte_offset]];
2045
2046 switch (current_bit_offset) {
2047 case 0:
2048 dst[dst_byte_offset] = (src_byte << 2);
2049 current_bit_offset = 6;
2050 break;
2051 case 6:
2052 dst[dst_byte_offset++] |= (src_byte >> 4);
2053 dst[dst_byte_offset] = ((src_byte & 0xF)
2054 << 4);
2055 current_bit_offset = 4;
2056 break;
2057 case 4:
2058 dst[dst_byte_offset++] |= (src_byte >> 2);
2059 dst[dst_byte_offset] = (src_byte << 6);
2060 current_bit_offset = 2;
2061 break;
2062 case 2:
2063 dst[dst_byte_offset++] |= (src_byte);
2064 dst[dst_byte_offset] = 0;
2065 current_bit_offset = 0;
2066 break;
2067 }
2068 src_byte_offset++;
2069 }
2070 (*dst_size) = dst_byte_offset;
2071out:
2072 return;
2073}
2074
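The NULL-dst sizing pass reserves ((src_size + 1) * 3) / 4 bytes, roughly 3 output bytes per 4 encoded characters, and the second pass then reports the exact count in dst_byte_offset. A quick standalone check of the two quantities (illustrative only):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	size_t s;

	for (s = 1; s <= 12; s++) {
		size_t reserved = ((s + 1) * 3) / 4;	/* NULL-dst estimate */
		size_t exact = (s * 6) / 8;	/* whole bytes of 6-bit payload */

		assert(reserved >= exact);
		printf("src=%zu reserved=%zu exact=%zu\n", s, reserved, exact);
	}
	return 0;
}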
2075/**
2076 * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
 2077 * @encoded_name: Set to the encrypted and encoded name
 2078 * @encoded_name_size: Set to the size of the encrypted, encoded name
 2079 * @crypt_stat: The crypt_stat struct associated with the file name to encode
 2080 * @mount_crypt_stat: The mount-wide cryptographic context
2081 *
2082 * Encrypts and encodes a filename into something that constitutes a
2083 * valid filename for a filesystem, with printable characters.
2084 *
2085 * We assume that we have a properly initialized crypto context,
2086 * pointed to by crypt_stat->tfm.
2087 *
 2088 * Returns zero on success; non-zero otherwise
2089 */
2090int ecryptfs_encrypt_and_encode_filename(
2091 char **encoded_name,
2092 size_t *encoded_name_size,
2093 struct ecryptfs_crypt_stat *crypt_stat,
2094 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2095 const char *name, size_t name_size)
2096{
2097 size_t encoded_name_no_prefix_size;
2098 int rc = 0;
2099
2100 (*encoded_name) = NULL;
2101 (*encoded_name_size) = 0;
2102 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
2103 || (mount_crypt_stat && (mount_crypt_stat->flags
2104 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
2105 struct ecryptfs_filename *filename;
2106
2107 filename = kzalloc(sizeof(*filename), GFP_KERNEL);
2108 if (!filename) {
2109 printk(KERN_ERR "%s: Out of memory whilst attempting "
2110 "to kzalloc [%zd] bytes\n", __func__,
2111 sizeof(*filename));
2112 rc = -ENOMEM;
2113 goto out;
2114 }
2115 filename->filename = (char *)name;
2116 filename->filename_size = name_size;
2117 rc = ecryptfs_encrypt_filename(filename, crypt_stat,
2118 mount_crypt_stat);
2119 if (rc) {
2120 printk(KERN_ERR "%s: Error attempting to encrypt "
2121 "filename; rc = [%d]\n", __func__, rc);
2122 kfree(filename);
2123 goto out;
2124 }
2125 ecryptfs_encode_for_filename(
2126 NULL, &encoded_name_no_prefix_size,
2127 filename->encrypted_filename,
2128 filename->encrypted_filename_size);
2129 if ((crypt_stat && (crypt_stat->flags
2130 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2131 || (mount_crypt_stat
2132 && (mount_crypt_stat->flags
2133 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
2134 (*encoded_name_size) =
2135 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2136 + encoded_name_no_prefix_size);
2137 else
2138 (*encoded_name_size) =
2139 (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2140 + encoded_name_no_prefix_size);
2141 (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
2142 if (!(*encoded_name)) {
2143 printk(KERN_ERR "%s: Out of memory whilst attempting "
 2144                                "to kmalloc [%zd] bytes\n", __func__,
2145 (*encoded_name_size));
2146 rc = -ENOMEM;
2147 kfree(filename->encrypted_filename);
2148 kfree(filename);
2149 goto out;
2150 }
2151 if ((crypt_stat && (crypt_stat->flags
2152 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2153 || (mount_crypt_stat
2154 && (mount_crypt_stat->flags
2155 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
2156 memcpy((*encoded_name),
2157 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2158 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
2159 ecryptfs_encode_for_filename(
2160 ((*encoded_name)
2161 + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
2162 &encoded_name_no_prefix_size,
2163 filename->encrypted_filename,
2164 filename->encrypted_filename_size);
2165 (*encoded_name_size) =
2166 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2167 + encoded_name_no_prefix_size);
2168 (*encoded_name)[(*encoded_name_size)] = '\0';
2169 (*encoded_name_size)++;
2170 } else {
2171 rc = -ENOTSUPP;
2172 }
2173 if (rc) {
2174 printk(KERN_ERR "%s: Error attempting to encode "
2175 "encrypted filename; rc = [%d]\n", __func__,
2176 rc);
2177 kfree((*encoded_name));
2178 (*encoded_name) = NULL;
2179 (*encoded_name_size) = 0;
2180 }
2181 kfree(filename->encrypted_filename);
2182 kfree(filename);
2183 } else {
2184 rc = ecryptfs_copy_filename(encoded_name,
2185 encoded_name_size,
2186 name, name_size);
2187 }
2188out:
2189 return rc;
2190}
2191
2192/**
2193 * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
2194 * @plaintext_name: The plaintext name
2195 * @plaintext_name_size: The plaintext name size
2196 * @ecryptfs_dir_dentry: eCryptfs directory dentry
2197 * @name: The filename in cipher text
2198 * @name_size: The cipher text name size
2199 *
2200 * Decrypts and decodes the filename.
2201 *
 2202 * Returns zero on success; non-zero otherwise
2203 */
2204int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2205 size_t *plaintext_name_size,
2206 struct dentry *ecryptfs_dir_dentry,
2207 const char *name, size_t name_size)
2208{
2209 char *decoded_name;
2210 size_t decoded_name_size;
2211 size_t packet_size;
2212 int rc = 0;
2213
2214 if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
2215 && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2216 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
2217 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2218 &ecryptfs_superblock_to_private(
2219 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2220 const char *orig_name = name;
2221 size_t orig_name_size = name_size;
2222
2223 name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
2224 name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
2225 ecryptfs_decode_from_filename(NULL, &decoded_name_size,
2226 name, name_size);
2227 decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
2228 if (!decoded_name) {
2229 printk(KERN_ERR "%s: Out of memory whilst attempting "
2230 "to kmalloc [%zd] bytes\n", __func__,
2231 decoded_name_size);
2232 rc = -ENOMEM;
2233 goto out;
2234 }
2235 ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
2236 name, name_size);
2237 rc = ecryptfs_parse_tag_70_packet(plaintext_name,
2238 plaintext_name_size,
2239 &packet_size,
2240 mount_crypt_stat,
2241 decoded_name,
2242 decoded_name_size);
2243 if (rc) {
2244 printk(KERN_INFO "%s: Could not parse tag 70 packet "
2245 "from filename; copying through filename "
2246 "as-is\n", __func__);
2247 rc = ecryptfs_copy_filename(plaintext_name,
2248 plaintext_name_size,
2249 orig_name, orig_name_size);
2250 goto out_free;
2251 }
2252 } else {
2253 rc = ecryptfs_copy_filename(plaintext_name,
2254 plaintext_name_size,
2255 name, name_size);
2256 goto out;
2257 }
2258out_free:
2259 kfree(decoded_name);
2260out:
2261 return rc;
2262}
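Only names carrying the literal FNEK prefix are routed through decode-and-decrypt; everything else is copied through verbatim. A userspace sketch of that routing test, with the prefix and its 24-byte length taken from ecryptfs_kernel.h below; the sample names are illustrative:

#include <stdio.h>
#include <string.h>

#define PREFIX		"ECRYPTFS_FNEK_ENCRYPTED."
#define PREFIX_SIZE	24

static int is_encrypted_name(const char *name, size_t name_size)
{
	return name_size > PREFIX_SIZE &&
	       strncmp(name, PREFIX, PREFIX_SIZE) == 0;
}

int main(void)
{
	const char *a = "ECRYPTFS_FNEK_ENCRYPTED.EI71";
	const char *b = "plain.txt";

	printf("%d %d\n", is_encrypted_name(a, strlen(a)),
	       is_encrypted_name(b, strlen(b)));	/* prints "1 0" */
	return 0;
}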
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d16..c11fc95714ab 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12 +51,16 @@
51#define ECRYPTFS_VERSIONING_XATTR 0x00000010 51#define ECRYPTFS_VERSIONING_XATTR 0x00000010
52#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 52#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020
53#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 53#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040
54#define ECRYPTFS_VERSIONING_HMAC 0x00000080
55#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100
56#define ECRYPTFS_VERSIONING_GCM 0x00000200
54#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ 57#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
55 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ 58 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
56 | ECRYPTFS_VERSIONING_PUBKEY \ 59 | ECRYPTFS_VERSIONING_PUBKEY \
57 | ECRYPTFS_VERSIONING_XATTR \ 60 | ECRYPTFS_VERSIONING_XATTR \
58 | ECRYPTFS_VERSIONING_MULTKEY \ 61 | ECRYPTFS_VERSIONING_MULTKEY \
59 | ECRYPTFS_VERSIONING_DEVMISC) 62 | ECRYPTFS_VERSIONING_DEVMISC \
63 | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
60#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 64#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
61#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH 65#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
62#define ECRYPTFS_SALT_SIZE 8 66#define ECRYPTFS_SALT_SIZE 8
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key)
199#define ECRYPTFS_DEFAULT_CIPHER "aes" 203#define ECRYPTFS_DEFAULT_CIPHER "aes"
200#define ECRYPTFS_DEFAULT_KEY_BYTES 16 204#define ECRYPTFS_DEFAULT_KEY_BYTES 16
201#define ECRYPTFS_DEFAULT_HASH "md5" 205#define ECRYPTFS_DEFAULT_HASH "md5"
206#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
202#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 207#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
203#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C 208#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
204#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED 209#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key)
206#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 211#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
207#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 212#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
208#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 213#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
214#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
215 * as dentry name */
216#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
217 * metadata */
218#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
219 * dentry name */
220#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
221 * metadata */
222/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
223 * ECRYPTFS_MAX_IV_BYTES */
224#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
225#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
209#define MD5_DIGEST_SIZE 16 226#define MD5_DIGEST_SIZE 16
227#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
228#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
229#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
230#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
231#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
232#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
210 233
211struct ecryptfs_key_sig { 234struct ecryptfs_key_sig {
212 struct list_head crypt_stat_list; 235 struct list_head crypt_stat_list;
213 char keysig[ECRYPTFS_SIG_SIZE_HEX]; 236 char keysig[ECRYPTFS_SIG_SIZE_HEX];
214}; 237};
215 238
239struct ecryptfs_filename {
240 struct list_head crypt_stat_list;
241#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001
242 u32 flags;
243 u32 seq_no;
244 char *filename;
245 char *encrypted_filename;
246 size_t filename_size;
247 size_t encrypted_filename_size;
248 char fnek_sig[ECRYPTFS_SIG_SIZE_HEX];
249 char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1];
250};
251
216/** 252/**
217 * This is the primary struct associated with each encrypted file. 253 * This is the primary struct associated with each encrypted file.
218 * 254 *
219 * TODO: cache align/pack? 255 * TODO: cache align/pack?
220 */ 256 */
221struct ecryptfs_crypt_stat { 257struct ecryptfs_crypt_stat {
222#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 258#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
223#define ECRYPTFS_POLICY_APPLIED 0x00000002 259#define ECRYPTFS_POLICY_APPLIED 0x00000002
224#define ECRYPTFS_NEW_FILE 0x00000004 260#define ECRYPTFS_NEW_FILE 0x00000004
225#define ECRYPTFS_ENCRYPTED 0x00000008 261#define ECRYPTFS_ENCRYPTED 0x00000008
226#define ECRYPTFS_SECURITY_WARNING 0x00000010 262#define ECRYPTFS_SECURITY_WARNING 0x00000010
227#define ECRYPTFS_ENABLE_HMAC 0x00000020 263#define ECRYPTFS_ENABLE_HMAC 0x00000020
228#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 264#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
229#define ECRYPTFS_KEY_VALID 0x00000080 265#define ECRYPTFS_KEY_VALID 0x00000080
230#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 266#define ECRYPTFS_METADATA_IN_XATTR 0x00000100
231#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 267#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200
232#define ECRYPTFS_KEY_SET 0x00000400 268#define ECRYPTFS_KEY_SET 0x00000400
269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800
270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000
233 u32 flags; 272 u32 flags;
234 unsigned int file_version; 273 unsigned int file_version;
235 size_t iv_bytes; 274 size_t iv_bytes;
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat {
332#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 371#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002
333#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 372#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004
334#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 373#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008
374#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
375#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
376#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
335 u32 flags; 377 u32 flags;
336 struct list_head global_auth_tok_list; 378 struct list_head global_auth_tok_list;
337 struct mutex global_auth_tok_list_mutex; 379 struct mutex global_auth_tok_list_mutex;
338 size_t num_global_auth_toks; 380 size_t num_global_auth_toks;
339 size_t global_default_cipher_key_size; 381 size_t global_default_cipher_key_size;
382 size_t global_default_fn_cipher_key_bytes;
340 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE 383 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
341 + 1]; 384 + 1];
385 unsigned char global_default_fn_cipher_name[
386 ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
387 char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
342}; 388};
343 389
344/* superblock private data. */ 390/* superblock private data. */
@@ -571,13 +617,22 @@ struct ecryptfs_open_req {
571int ecryptfs_interpose(struct dentry *hidden_dentry, 617int ecryptfs_interpose(struct dentry *hidden_dentry,
572 struct dentry *this_dentry, struct super_block *sb, 618 struct dentry *this_dentry, struct super_block *sb,
573 u32 flags); 619 u32 flags);
620int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
621 struct dentry *lower_dentry,
622 struct ecryptfs_crypt_stat *crypt_stat,
623 struct inode *ecryptfs_dir_inode,
624 struct nameidata *ecryptfs_nd);
625int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
626 size_t *decrypted_name_size,
627 struct dentry *ecryptfs_dentry,
628 const char *name, size_t name_size);
574int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 629int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
575int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, 630int ecryptfs_encrypt_and_encode_filename(
576 const char *name, int length, 631 char **encoded_name,
577 char **decrypted_name); 632 size_t *encoded_name_size,
578int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, 633 struct ecryptfs_crypt_stat *crypt_stat,
579 const char *name, int length, 634 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
580 char **encoded_name); 635 const char *name, size_t name_size);
581struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); 636struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
582void ecryptfs_dump_hex(char *data, int bytes); 637void ecryptfs_dump_hex(char *data, int bytes);
583int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, 638int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
599 struct inode *ecryptfs_inode); 654 struct inode *ecryptfs_inode);
600int ecryptfs_read_and_validate_xattr_region(char *page_virt, 655int ecryptfs_read_and_validate_xattr_region(char *page_virt,
601 struct dentry *ecryptfs_dentry); 656 struct dentry *ecryptfs_dentry);
602u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); 657u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
603int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); 658int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
604void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); 659void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
605int ecryptfs_generate_key_packet_set(char *dest_base, 660int ecryptfs_generate_key_packet_set(char *dest_base,
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file,
694 struct vfsmount *lower_mnt, 749 struct vfsmount *lower_mnt,
695 const struct cred *cred); 750 const struct cred *cred);
696int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); 751int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
752int
753ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
754 size_t *packet_size,
755 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
756 char *filename, size_t filename_size);
757int
758ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
759 size_t *packet_size,
760 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
761 char *data, size_t max_packet_size);
762int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
763 loff_t offset);
697 764
698#endif /* #ifndef ECRYPTFS_KERNEL_H */ 765#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac06..9e944057001b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback {
77 77
78/* Inspired by generic filldir in fs/readdir.c */ 78/* Inspired by generic filldir in fs/readdir.c */
79static int 79static int
80ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, 80ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
81 u64 ino, unsigned int d_type) 81 loff_t offset, u64 ino, unsigned int d_type)
82{ 82{
83 struct ecryptfs_crypt_stat *crypt_stat;
84 struct ecryptfs_getdents_callback *buf = 83 struct ecryptfs_getdents_callback *buf =
85 (struct ecryptfs_getdents_callback *)dirent; 84 (struct ecryptfs_getdents_callback *)dirent;
85 size_t name_size;
86 char *name;
86 int rc; 87 int rc;
87 int decoded_length;
88 char *decoded_name;
89 88
90 crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
91 buf->filldir_called++; 89 buf->filldir_called++;
92 decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, 90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
93 &decoded_name); 91 buf->dentry, lower_name,
94 if (decoded_length < 0) { 92 lower_namelen);
95 rc = decoded_length; 93 if (rc) {
94 printk(KERN_ERR "%s: Error attempting to decode and decrypt "
95 "filename [%s]; rc = [%d]\n", __func__, lower_name,
96 rc);
96 goto out; 97 goto out;
97 } 98 }
98 rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, 99 rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
99 ino, d_type); 100 kfree(name);
100 kfree(decoded_name);
101 if (rc >= 0) 101 if (rc >= 0)
102 buf->entries_written++; 102 buf->entries_written++;
103out: 103out:
@@ -106,8 +106,8 @@ out:
106 106
107/** 107/**
108 * ecryptfs_readdir 108 * ecryptfs_readdir
109 * @file: The ecryptfs file struct 109 * @file: The eCryptfs directory file
110 * @dirent: Directory entry 110 * @dirent: Directory entry handle
111 * @filldir: The filldir callback function 111 * @filldir: The filldir callback function
112 */ 112 */
113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) 113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
275static int 275static int
276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
277{ 277{
278 struct file *lower_file = ecryptfs_file_to_lower(file); 278 return vfs_fsync(ecryptfs_file_to_lower(file),
279 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 279 ecryptfs_dentry_to_lower(dentry),
280 struct inode *lower_inode = lower_dentry->d_inode; 280 datasync);
281 int rc = -EINVAL;
282
283 if (lower_inode->i_fop->fsync) {
284 mutex_lock(&lower_inode->i_mutex);
285 rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
286 datasync);
287 mutex_unlock(&lower_inode->i_mutex);
288 }
289 return rc;
290} 281}
291 282
292static int ecryptfs_fasync(int fd, struct file *file, int flag) 283static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 89209f00f9c7..5697899a168d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir)
52/** 52/**
53 * ecryptfs_create_underlying_file 53 * ecryptfs_create_underlying_file
54 * @lower_dir_inode: inode of the parent in the lower fs of the new file 54 * @lower_dir_inode: inode of the parent in the lower fs of the new file
55 * @lower_dentry: New file's dentry in the lower fs 55 * @dentry: New file's dentry
56 * @ecryptfs_dentry: New file's dentry in ecryptfs
57 * @mode: The mode of the new file 56 * @mode: The mode of the new file
58 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount 57 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
59 * 58 *
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
228{ 227{
229 int rc; 228 int rc;
230 229
231 /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens 230 /* ecryptfs_do_create() calls ecryptfs_interpose() */
232 * the crypt_stat->lower_file (persistent file) */
233 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); 231 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
234 if (unlikely(rc)) { 232 if (unlikely(rc)) {
235 ecryptfs_printk(KERN_WARNING, "Failed to create file in" 233 ecryptfs_printk(KERN_WARNING, "Failed to create file in"
@@ -244,141 +242,91 @@ out:
244} 242}
245 243
246/** 244/**
247 * ecryptfs_lookup 245 * ecryptfs_lookup_and_interpose_lower - Perform a lookup
248 * @dir: inode
249 * @dentry: The dentry
250 * @nd: nameidata, may be NULL
251 *
252 * Find a file on disk. If the file does not exist, then we'll add it to the
253 * dentry cache and continue on to read it from the disk.
254 */ 246 */
255static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, 247int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
256 struct nameidata *nd) 248 struct dentry *lower_dentry,
249 struct ecryptfs_crypt_stat *crypt_stat,
250 struct inode *ecryptfs_dir_inode,
251 struct nameidata *ecryptfs_nd)
257{ 252{
258 int rc = 0;
259 struct dentry *lower_dir_dentry; 253 struct dentry *lower_dir_dentry;
260 struct dentry *lower_dentry;
261 struct vfsmount *lower_mnt; 254 struct vfsmount *lower_mnt;
262 char *encoded_name; 255 struct inode *lower_inode;
263 int encoded_namelen;
264 struct ecryptfs_crypt_stat *crypt_stat = NULL;
265 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 256 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
266 char *page_virt = NULL; 257 char *page_virt = NULL;
267 struct inode *lower_inode;
268 u64 file_size; 258 u64 file_size;
259 int rc = 0;
269 260
270 lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); 261 lower_dir_dentry = lower_dentry->d_parent;
271 dentry->d_op = &ecryptfs_dops; 262 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
272 if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) 263 ecryptfs_dentry->d_parent));
273 || (dentry->d_name.len == 2
274 && !strcmp(dentry->d_name.name, ".."))) {
275 d_drop(dentry);
276 goto out;
277 }
278 encoded_namelen = ecryptfs_encode_filename(crypt_stat,
279 dentry->d_name.name,
280 dentry->d_name.len,
281 &encoded_name);
282 if (encoded_namelen < 0) {
283 rc = encoded_namelen;
284 d_drop(dentry);
285 goto out;
286 }
287 ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
288 "= [%d]\n", encoded_name, encoded_namelen);
289 lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
290 encoded_namelen - 1);
291 kfree(encoded_name);
292 if (IS_ERR(lower_dentry)) {
293 ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
294 rc = PTR_ERR(lower_dentry);
295 d_drop(dentry);
296 goto out;
297 }
298 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
299 ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
300 "d_name.name = [%s]\n", lower_dentry,
301 lower_dentry->d_name.name);
302 lower_inode = lower_dentry->d_inode; 264 lower_inode = lower_dentry->d_inode;
303 fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); 265 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
304 BUG_ON(!atomic_read(&lower_dentry->d_count)); 266 BUG_ON(!atomic_read(&lower_dentry->d_count));
305 ecryptfs_set_dentry_private(dentry, 267 ecryptfs_set_dentry_private(ecryptfs_dentry,
306 kmem_cache_alloc(ecryptfs_dentry_info_cache, 268 kmem_cache_alloc(ecryptfs_dentry_info_cache,
307 GFP_KERNEL)); 269 GFP_KERNEL));
308 if (!ecryptfs_dentry_to_private(dentry)) { 270 if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
309 rc = -ENOMEM; 271 rc = -ENOMEM;
310 ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " 272 printk(KERN_ERR "%s: Out of memory whilst attempting "
311 "to allocate ecryptfs_dentry_info struct\n"); 273 "to allocate ecryptfs_dentry_info struct\n",
274 __func__);
312 goto out_dput; 275 goto out_dput;
313 } 276 }
314 ecryptfs_set_dentry_lower(dentry, lower_dentry); 277 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
315 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); 278 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
316 if (!lower_dentry->d_inode) { 279 if (!lower_dentry->d_inode) {
317 /* We want to add because we couldn't find in lower */ 280 /* We want to add because we couldn't find in lower */
318 d_add(dentry, NULL); 281 d_add(ecryptfs_dentry, NULL);
319 goto out; 282 goto out;
320 } 283 }
321 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 284 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
322 ECRYPTFS_INTERPOSE_FLAG_D_ADD); 285 ecryptfs_dir_inode->i_sb, 1);
323 if (rc) { 286 if (rc) {
324 ecryptfs_printk(KERN_ERR, "Error interposing\n"); 287 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
288 __func__, rc);
325 goto out; 289 goto out;
326 } 290 }
327 if (S_ISDIR(lower_inode->i_mode)) { 291 if (S_ISDIR(lower_inode->i_mode))
328 ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
329 goto out; 292 goto out;
330 } 293 if (S_ISLNK(lower_inode->i_mode))
331 if (S_ISLNK(lower_inode->i_mode)) {
332 ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
333 goto out; 294 goto out;
334 } 295 if (special_file(lower_inode->i_mode))
335 if (special_file(lower_inode->i_mode)) {
336 ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
337 goto out; 296 goto out;
338 } 297 if (!ecryptfs_nd)
339 if (!nd) {
340 ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
341 "as we *think* we are about to unlink\n");
342 goto out; 298 goto out;
343 }
344 /* Released in this function */ 299 /* Released in this function */
345 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, 300 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
346 GFP_USER);
347 if (!page_virt) { 301 if (!page_virt) {
302 printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
303 __func__);
348 rc = -ENOMEM; 304 rc = -ENOMEM;
349 ecryptfs_printk(KERN_ERR,
350 "Cannot ecryptfs_kmalloc a page\n");
351 goto out; 305 goto out;
352 } 306 }
353 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 307 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
354 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) 308 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
355 ecryptfs_set_default_sizes(crypt_stat);
356 if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
357 rc = ecryptfs_init_persistent_file(dentry);
358 if (rc) { 309 if (rc) {
359 printk(KERN_ERR "%s: Error attempting to initialize " 310 printk(KERN_ERR "%s: Error attempting to initialize "
360 "the persistent file for the dentry with name " 311 "the persistent file for the dentry with name "
361 "[%s]; rc = [%d]\n", __func__, 312 "[%s]; rc = [%d]\n", __func__,
362 dentry->d_name.name, rc); 313 ecryptfs_dentry->d_name.name, rc);
363 goto out; 314 goto out_free_kmem;
364 } 315 }
365 } 316 }
366 rc = ecryptfs_read_and_validate_header_region(page_virt, 317 rc = ecryptfs_read_and_validate_header_region(page_virt,
367 dentry->d_inode); 318 ecryptfs_dentry->d_inode);
368 if (rc) { 319 if (rc) {
369 rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); 320 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
321 ecryptfs_dentry);
370 if (rc) { 322 if (rc) {
371 printk(KERN_DEBUG "Valid metadata not found in header "
372 "region or xattr region; treating file as "
373 "unencrypted\n");
374 rc = 0; 323 rc = 0;
375 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 324 goto out_free_kmem;
376 goto out;
377 } 325 }
378 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; 326 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
379 } 327 }
380 mount_crypt_stat = &ecryptfs_superblock_to_private( 328 mount_crypt_stat = &ecryptfs_superblock_to_private(
381 dentry->d_sb)->mount_crypt_stat; 329 ecryptfs_dentry->d_sb)->mount_crypt_stat;
382 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 330 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
383 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 331 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
384 file_size = (crypt_stat->num_header_bytes_at_front 332 file_size = (crypt_stat->num_header_bytes_at_front
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
388 } else { 336 } else {
389 file_size = get_unaligned_be64(page_virt); 337 file_size = get_unaligned_be64(page_virt);
390 } 338 }
391 i_size_write(dentry->d_inode, (loff_t)file_size); 339 i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
340out_free_kmem:
392 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 341 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
393 goto out; 342 goto out;
394
395out_dput: 343out_dput:
396 dput(lower_dentry); 344 dput(lower_dentry);
397 d_drop(dentry); 345 d_drop(ecryptfs_dentry);
398out: 346out:
347 return rc;
348}
349
350/**
351 * ecryptfs_lookup
352 * @ecryptfs_dir_inode: The eCryptfs directory inode
353 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
354 * @ecryptfs_nd: nameidata; may be NULL
355 *
 356 * Find a file on disk. If the file does not exist, a (negative) dentry is
 357 * still added to the dentry cache; otherwise its metadata is read from disk.
358 */
359static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
360 struct dentry *ecryptfs_dentry,
361 struct nameidata *ecryptfs_nd)
362{
363 char *encrypted_and_encoded_name = NULL;
364 size_t encrypted_and_encoded_name_size;
365 struct ecryptfs_crypt_stat *crypt_stat = NULL;
366 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
367 struct ecryptfs_inode_info *inode_info;
368 struct dentry *lower_dir_dentry, *lower_dentry;
369 int rc = 0;
370
371 ecryptfs_dentry->d_op = &ecryptfs_dops;
372 if ((ecryptfs_dentry->d_name.len == 1
373 && !strcmp(ecryptfs_dentry->d_name.name, "."))
374 || (ecryptfs_dentry->d_name.len == 2
375 && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
376 goto out_d_drop;
377 }
378 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
379 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
380 lower_dir_dentry,
381 ecryptfs_dentry->d_name.len);
382 if (IS_ERR(lower_dentry)) {
383 rc = PTR_ERR(lower_dentry);
384 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
385 "lower_dentry = [%s]\n", __func__, rc,
386 ecryptfs_dentry->d_name.name);
387 goto out_d_drop;
388 }
389 if (lower_dentry->d_inode)
390 goto lookup_and_interpose;
391 inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
392 if (inode_info) {
393 crypt_stat = &inode_info->crypt_stat;
394 /* TODO: lock for crypt_stat comparison */
395 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
396 ecryptfs_set_default_sizes(crypt_stat);
397 }
398 if (crypt_stat)
399 mount_crypt_stat = crypt_stat->mount_crypt_stat;
400 else
401 mount_crypt_stat = &ecryptfs_superblock_to_private(
402 ecryptfs_dentry->d_sb)->mount_crypt_stat;
403 if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
404 && !(mount_crypt_stat && (mount_crypt_stat->flags
405 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
406 goto lookup_and_interpose;
407 dput(lower_dentry);
408 rc = ecryptfs_encrypt_and_encode_filename(
409 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
410 crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
411 ecryptfs_dentry->d_name.len);
412 if (rc) {
413 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
414 "filename; rc = [%d]\n", __func__, rc);
415 goto out_d_drop;
416 }
417 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
418 lower_dir_dentry,
419 encrypted_and_encoded_name_size - 1);
420 if (IS_ERR(lower_dentry)) {
421 rc = PTR_ERR(lower_dentry);
422 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
423 "lower_dentry = [%s]\n", __func__, rc,
424 encrypted_and_encoded_name);
425 goto out_d_drop;
426 }
427lookup_and_interpose:
428 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
429 crypt_stat, ecryptfs_dir_inode,
430 ecryptfs_nd);
431 goto out;
432out_d_drop:
433 d_drop(ecryptfs_dentry);
434out:
435 kfree(encrypted_and_encoded_name);
399 return ERR_PTR(rc); 436 return ERR_PTR(rc);
400} 437}
401 438
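The lookup path above reduces to one predicate: look up the plaintext name unless either the inode's crypt_stat or the mount-wide crypt_stat enables filename encryption. A minimal sketch of that decision as a helper (hypothetical; not part of this patch):

static bool ecryptfs_name_needs_encoding(
	struct ecryptfs_crypt_stat *crypt_stat,
	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
{
	/* Encode if either the per-inode or the mount-wide flag is set */
	return (crypt_stat
		&& (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
	       || (mount_crypt_stat
		   && (mount_crypt_stat->flags
		       & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES));
}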
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
466 struct dentry *lower_dentry; 503 struct dentry *lower_dentry;
467 struct dentry *lower_dir_dentry; 504 struct dentry *lower_dir_dentry;
468 char *encoded_symname; 505 char *encoded_symname;
469 int encoded_symlen; 506 size_t encoded_symlen;
470 struct ecryptfs_crypt_stat *crypt_stat = NULL; 507 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
471 508
472 lower_dentry = ecryptfs_dentry_to_lower(dentry); 509 lower_dentry = ecryptfs_dentry_to_lower(dentry);
473 dget(lower_dentry); 510 dget(lower_dentry);
474 lower_dir_dentry = lock_parent(lower_dentry); 511 lower_dir_dentry = lock_parent(lower_dentry);
475 encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, 512 mount_crypt_stat = &ecryptfs_superblock_to_private(
476 strlen(symname), 513 dir->i_sb)->mount_crypt_stat;
477 &encoded_symname); 514 rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
478 if (encoded_symlen < 0) { 515 &encoded_symlen,
479 rc = encoded_symlen; 516 NULL,
517 mount_crypt_stat, symname,
518 strlen(symname));
519 if (rc)
480 goto out_lock; 520 goto out_lock;
481 }
482 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, 521 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
483 encoded_symname); 522 encoded_symname);
484 kfree(encoded_symname); 523 kfree(encoded_symname);
@@ -602,53 +641,54 @@ out_lock:
602} 641}
603 642
604static int 643static int
605ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) 644ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
606{ 645{
607 int rc;
608 struct dentry *lower_dentry;
609 char *decoded_name;
610 char *lower_buf; 646 char *lower_buf;
611 mm_segment_t old_fs; 647 struct dentry *lower_dentry;
612 struct ecryptfs_crypt_stat *crypt_stat; 648 struct ecryptfs_crypt_stat *crypt_stat;
649 char *plaintext_name;
650 size_t plaintext_name_size;
651 mm_segment_t old_fs;
652 int rc;
613 653
614 lower_dentry = ecryptfs_dentry_to_lower(dentry); 654 lower_dentry = ecryptfs_dentry_to_lower(dentry);
615 if (!lower_dentry->d_inode->i_op || 655 if (!lower_dentry->d_inode->i_op->readlink) {
616 !lower_dentry->d_inode->i_op->readlink) {
617 rc = -EINVAL; 656 rc = -EINVAL;
618 goto out; 657 goto out;
619 } 658 }
659 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
620 /* Released in this function */ 660 /* Released in this function */
621 lower_buf = kmalloc(bufsiz, GFP_KERNEL); 661 lower_buf = kmalloc(bufsiz, GFP_KERNEL);
622 if (lower_buf == NULL) { 662 if (lower_buf == NULL) {
623 ecryptfs_printk(KERN_ERR, "Out of memory\n"); 663 printk(KERN_ERR "%s: Out of memory whilst attempting to "
664 "kmalloc [%d] bytes\n", __func__, bufsiz);
624 rc = -ENOMEM; 665 rc = -ENOMEM;
625 goto out; 666 goto out;
626 } 667 }
627 old_fs = get_fs(); 668 old_fs = get_fs();
628 set_fs(get_ds()); 669 set_fs(get_ds());
629 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
630 "lower_dentry->d_name.name = [%s]\n",
631 lower_dentry->d_name.name);
632 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 670 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
633 (char __user *)lower_buf, 671 (char __user *)lower_buf,
634 bufsiz); 672 bufsiz);
635 set_fs(old_fs); 673 set_fs(old_fs);
636 if (rc >= 0) { 674 if (rc >= 0) {
637 crypt_stat = NULL; 675 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
638 rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, 676 &plaintext_name_size,
639 &decoded_name); 677 dentry, lower_buf,
640 if (rc == -ENOMEM) 678 rc);
679 if (rc) {
680 printk(KERN_ERR "%s: Error attempting to decode and "
681 "decrypt filename; rc = [%d]\n", __func__,
682 rc);
641 goto out_free_lower_buf; 683 goto out_free_lower_buf;
642 if (rc > 0) {
643 ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
644 "to userspace: [%*s]\n", rc,
645 decoded_name);
646 if (copy_to_user(buf, decoded_name, rc))
647 rc = -EFAULT;
648 } 684 }
649 kfree(decoded_name); 685 rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
650 fsstack_copy_attr_atime(dentry->d_inode, 686 if (rc)
651 lower_dentry->d_inode); 687 rc = -EFAULT;
688 else
689 rc = plaintext_name_size;
690 kfree(plaintext_name);
691 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
652 } 692 }
653out_free_lower_buf: 693out_free_lower_buf:
654 kfree(lower_buf); 694 kfree(lower_buf);
@@ -670,13 +710,12 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
670 } 710 }
671 old_fs = get_fs(); 711 old_fs = get_fs();
672 set_fs(get_ds()); 712 set_fs(get_ds());
673 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
674 "dentry->d_name.name = [%s]\n", dentry->d_name.name);
675 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); 713 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
676 buf[rc] = '\0';
677 set_fs(old_fs); 714 set_fs(old_fs);
678 if (rc < 0) 715 if (rc < 0)
679 goto out_free; 716 goto out_free;
717 else
718 buf[rc] = '\0';
680 rc = 0; 719 rc = 0;
681 nd_set_link(nd, buf); 720 nd_set_link(nd, buf);
682 goto out; 721 goto out;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 0d713b691941..ff539420cc6f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
358 /* verify that everything through the encrypted FEK size is present */ 358 /* verify that everything through the encrypted FEK size is present */
359 if (message_len < 4) { 359 if (message_len < 4) {
360 rc = -EIO; 360 rc = -EIO;
361 printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " 361 printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
362 "message length is [%d]\n", __func__, message_len, 4); 362 "message length is [%d]\n", __func__, message_len, 4);
363 goto out; 363 goto out;
364 } 364 }
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
385 i += data_len; 385 i += data_len;
386 if (message_len < (i + key_rec->enc_key_size)) { 386 if (message_len < (i + key_rec->enc_key_size)) {
387 rc = -EIO; 387 rc = -EIO;
388 printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", 388 printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
389 __func__, message_len, (i + key_rec->enc_key_size)); 389 __func__, message_len, (i + key_rec->enc_key_size));
390 goto out; 390 goto out;
391 } 391 }
392 if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { 392 if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
393 rc = -EIO; 393 rc = -EIO;
394 printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " 394 printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
395 "the maximum key size [%d]\n", __func__, 395 "the maximum key size [%d]\n", __func__,
396 key_rec->enc_key_size, 396 key_rec->enc_key_size,
397 ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); 397 ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
@@ -403,6 +403,580 @@ out:
403} 403}
404 404
405static int 405static int
406ecryptfs_find_global_auth_tok_for_sig(
407 struct ecryptfs_global_auth_tok **global_auth_tok,
408 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
409{
410 struct ecryptfs_global_auth_tok *walker;
411 int rc = 0;
412
413 (*global_auth_tok) = NULL;
414 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
415 list_for_each_entry(walker,
416 &mount_crypt_stat->global_auth_tok_list,
417 mount_crypt_stat_list) {
418 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
419 (*global_auth_tok) = walker;
420 goto out;
421 }
422 }
423 rc = -EINVAL;
424out:
425 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
426 return rc;
427}
428
429/**
430 * ecryptfs_find_auth_tok_for_sig
431 * @auth_tok: Set to the matching auth_tok; NULL if not found
 432 * @mount_crypt_stat: Mount-wide crypto context holding the registered auth toks
433 * @sig: Sig of auth_tok to find
434 *
435 * For now, this function simply looks at the registered auth_tok's
436 * linked off the mount_crypt_stat, so all the auth_toks that can be
437 * used must be registered at mount time. This function could
438 * potentially try a lot harder to find auth_tok's (e.g., by calling
439 * out to ecryptfsd to dynamically retrieve an auth_tok object) so
440 * that static registration of auth_tok's will no longer be necessary.
441 *
442 * Returns zero on no error; non-zero on error
443 */
444static int
445ecryptfs_find_auth_tok_for_sig(
446 struct ecryptfs_auth_tok **auth_tok,
447 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
448 char *sig)
449{
450 struct ecryptfs_global_auth_tok *global_auth_tok;
451 int rc = 0;
452
453 (*auth_tok) = NULL;
454 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
455 mount_crypt_stat, sig)) {
456 struct key *auth_tok_key;
457
458 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
459 sig);
460 } else
461 (*auth_tok) = global_auth_tok->global_auth_tok;
462 return rc;
463}
464
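A caller-side sketch of the two-stage lookup this implements (mount-registered auth toks first, then the kernel keyring); the error handling is illustrative only:

	struct ecryptfs_auth_tok *auth_tok;
	int rc;

	rc = ecryptfs_find_auth_tok_for_sig(
		&auth_tok, mount_crypt_stat,
		mount_crypt_stat->global_default_fnek_sig);
	if (rc || !auth_tok)
		printk(KERN_ERR "No auth tok found for sig [%s]; rc = [%d]\n",
		       mount_crypt_stat->global_default_fnek_sig, rc);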
465/**
466 * write_tag_70_packet can gobble a lot of stack space. We stuff most
467 * of the function's parameters in a kmalloc'd struct to help reduce
468 * eCryptfs' overall stack usage.
469 */
470struct ecryptfs_write_tag_70_packet_silly_stack {
471 u8 cipher_code;
472 size_t max_packet_size;
473 size_t packet_size_len;
474 size_t block_aligned_filename_size;
475 size_t block_size;
476 size_t i;
477 size_t j;
478 size_t num_rand_bytes;
479 struct mutex *tfm_mutex;
480 char *block_aligned_filename;
481 struct ecryptfs_auth_tok *auth_tok;
482 struct scatterlist src_sg;
483 struct scatterlist dst_sg;
484 struct blkcipher_desc desc;
485 char iv[ECRYPTFS_MAX_IV_BYTES];
486 char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
487 char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
488 struct hash_desc hash_desc;
489 struct scatterlist hash_sg;
490};
491
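The kmalloc'd-context trick above is a general pattern for keeping large scratch state off the limited kernel stack; a minimal sketch (struct name and sizes are hypothetical):

struct frob_ctx {			/* would otherwise cost ~530 stack bytes */
	char scratch[512];
	struct scatterlist sg;
	size_t len;
};

static int frobnicate(void)
{
	struct frob_ctx *c = kmalloc(sizeof(*c), GFP_KERNEL);
	int rc = 0;

	if (!c)
		return -ENOMEM;
	/* ... work with c->scratch, c->sg, c->len instead of locals ... */
	kfree(c);
	return rc;
}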
492/**
493 * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
494 * @filename: NULL-terminated filename string
495 *
496 * This is the simplest mechanism for achieving filename encryption in
497 * eCryptfs. It encrypts the given filename with the mount-wide
498 * filename encryption key (FNEK) and stores it in a packet to @dest,
 499 * which the caller will encode and write directly into the dentry
500 * name.
501 */
502int
503ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
504 size_t *packet_size,
505 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
506 char *filename, size_t filename_size)
507{
508 struct ecryptfs_write_tag_70_packet_silly_stack *s;
509 int rc = 0;
510
511 s = kmalloc(sizeof(*s), GFP_KERNEL);
512 if (!s) {
513 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
514 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
 515 rc = -ENOMEM; goto out;
516 }
517 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
518 (*packet_size) = 0;
519 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
520 &s->desc.tfm,
521 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
522 if (unlikely(rc)) {
523 printk(KERN_ERR "Internal error whilst attempting to get "
524 "tfm and mutex for cipher name [%s]; rc = [%d]\n",
525 mount_crypt_stat->global_default_fn_cipher_name, rc);
526 goto out;
527 }
528 mutex_lock(s->tfm_mutex);
529 s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
530 /* Plus one for the \0 separator between the random prefix
531 * and the plaintext filename */
532 s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
533 s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
534 if ((s->block_aligned_filename_size % s->block_size) != 0) {
535 s->num_rand_bytes += (s->block_size
536 - (s->block_aligned_filename_size
537 % s->block_size));
538 s->block_aligned_filename_size = (s->num_rand_bytes
539 + filename_size);
540 }
541 /* Octet 0: Tag 70 identifier
542 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
543 * and block-aligned encrypted filename size)
544 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
545 * Octet N2-N3: Cipher identifier (1 octet)
546 * Octets N3-N4: Block-aligned encrypted filename
547 * - Consists of a minimum number of random characters, a \0
548 * separator, and then the filename */
549 s->max_packet_size = (1 /* Tag 70 identifier */
550 + 3 /* Max Tag 70 packet size */
551 + ECRYPTFS_SIG_SIZE /* FNEK sig */
552 + 1 /* Cipher identifier */
553 + s->block_aligned_filename_size);
554 if (dest == NULL) {
555 (*packet_size) = s->max_packet_size;
556 goto out_unlock;
557 }
558 if (s->max_packet_size > (*remaining_bytes)) {
559 printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
560 "[%zd] available\n", __func__, s->max_packet_size,
561 (*remaining_bytes));
562 rc = -EINVAL;
563 goto out_unlock;
564 }
565 s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
566 GFP_KERNEL);
567 if (!s->block_aligned_filename) {
568 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
569 "kzalloc [%zd] bytes\n", __func__,
570 s->block_aligned_filename_size);
571 rc = -ENOMEM;
572 goto out_unlock;
573 }
574 s->i = 0;
575 dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
576 rc = ecryptfs_write_packet_length(&dest[s->i],
577 (ECRYPTFS_SIG_SIZE
578 + 1 /* Cipher code */
579 + s->block_aligned_filename_size),
580 &s->packet_size_len);
581 if (rc) {
582 printk(KERN_ERR "%s: Error generating tag 70 packet "
583 "header; cannot generate packet length; rc = [%d]\n",
584 __func__, rc);
585 goto out_free_unlock;
586 }
587 s->i += s->packet_size_len;
588 ecryptfs_from_hex(&dest[s->i],
589 mount_crypt_stat->global_default_fnek_sig,
590 ECRYPTFS_SIG_SIZE);
591 s->i += ECRYPTFS_SIG_SIZE;
592 s->cipher_code = ecryptfs_code_for_cipher_string(
593 mount_crypt_stat->global_default_fn_cipher_name,
594 mount_crypt_stat->global_default_fn_cipher_key_bytes);
595 if (s->cipher_code == 0) {
596 printk(KERN_WARNING "%s: Unable to generate code for "
597 "cipher [%s] with key bytes [%zd]\n", __func__,
598 mount_crypt_stat->global_default_fn_cipher_name,
599 mount_crypt_stat->global_default_fn_cipher_key_bytes);
600 rc = -EINVAL;
601 goto out_free_unlock;
602 }
603 dest[s->i++] = s->cipher_code;
604 rc = ecryptfs_find_auth_tok_for_sig(
605 &s->auth_tok, mount_crypt_stat,
606 mount_crypt_stat->global_default_fnek_sig);
607 if (rc) {
608 printk(KERN_ERR "%s: Error attempting to find auth tok for "
609 "fnek sig [%s]; rc = [%d]\n", __func__,
610 mount_crypt_stat->global_default_fnek_sig, rc);
611 goto out_free_unlock;
612 }
613 /* TODO: Support other key modules than passphrase for
614 * filename encryption */
615 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
616 sg_init_one(
617 &s->hash_sg,
618 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
619 s->auth_tok->token.password.session_key_encryption_key_bytes);
620 s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
621 s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
622 CRYPTO_ALG_ASYNC);
623 if (IS_ERR(s->hash_desc.tfm)) {
624 rc = PTR_ERR(s->hash_desc.tfm);
625 printk(KERN_ERR "%s: Error attempting to "
626 "allocate hash crypto context; rc = [%d]\n",
627 __func__, rc);
628 goto out_free_unlock;
629 }
630 rc = crypto_hash_init(&s->hash_desc);
631 if (rc) {
632 printk(KERN_ERR
633 "%s: Error initializing crypto hash; rc = [%d]\n",
634 __func__, rc);
635 goto out_release_free_unlock;
636 }
637 rc = crypto_hash_update(
638 &s->hash_desc, &s->hash_sg,
639 s->auth_tok->token.password.session_key_encryption_key_bytes);
640 if (rc) {
641 printk(KERN_ERR
642 "%s: Error updating crypto hash; rc = [%d]\n",
643 __func__, rc);
644 goto out_release_free_unlock;
645 }
646 rc = crypto_hash_final(&s->hash_desc, s->hash);
647 if (rc) {
648 printk(KERN_ERR
649 "%s: Error finalizing crypto hash; rc = [%d]\n",
650 __func__, rc);
651 goto out_release_free_unlock;
652 }
653 for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
654 s->block_aligned_filename[s->j] =
655 s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
656 if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
657 == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
658 sg_init_one(&s->hash_sg, (u8 *)s->hash,
659 ECRYPTFS_TAG_70_DIGEST_SIZE);
660 rc = crypto_hash_init(&s->hash_desc);
661 if (rc) {
662 printk(KERN_ERR
663 "%s: Error initializing crypto hash; "
664 "rc = [%d]\n", __func__, rc);
665 goto out_release_free_unlock;
666 }
667 rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
668 ECRYPTFS_TAG_70_DIGEST_SIZE);
669 if (rc) {
670 printk(KERN_ERR
671 "%s: Error updating crypto hash; "
672 "rc = [%d]\n", __func__, rc);
673 goto out_release_free_unlock;
674 }
675 rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
676 if (rc) {
677 printk(KERN_ERR
678 "%s: Error finalizing crypto hash; "
679 "rc = [%d]\n", __func__, rc);
680 goto out_release_free_unlock;
681 }
682 memcpy(s->hash, s->tmp_hash,
683 ECRYPTFS_TAG_70_DIGEST_SIZE);
684 }
685 if (s->block_aligned_filename[s->j] == '\0')
686 s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
687 }
688 memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
689 filename_size);
690 rc = virt_to_scatterlist(s->block_aligned_filename,
691 s->block_aligned_filename_size, &s->src_sg, 1);
692 if (rc != 1) {
693 printk(KERN_ERR "%s: Internal error whilst attempting to "
694 "convert filename memory to scatterlist; "
695 "expected rc = 1; got rc = [%d]. "
696 "block_aligned_filename_size = [%zd]\n", __func__, rc,
697 s->block_aligned_filename_size);
698 goto out_release_free_unlock;
699 }
700 rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
701 &s->dst_sg, 1);
702 if (rc != 1) {
703 printk(KERN_ERR "%s: Internal error whilst attempting to "
704 "convert encrypted filename memory to scatterlist; "
705 "expected rc = 1; got rc = [%d]. "
706 "block_aligned_filename_size = [%zd]\n", __func__, rc,
707 s->block_aligned_filename_size);
708 goto out_release_free_unlock;
709 }
710 /* The characters in the first block effectively do the job
711 * of the IV here, so we just use 0's for the IV. Note the
712 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
713 * >= ECRYPTFS_MAX_IV_BYTES. */
714 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
715 s->desc.info = s->iv;
716 rc = crypto_blkcipher_setkey(
717 s->desc.tfm,
718 s->auth_tok->token.password.session_key_encryption_key,
719 mount_crypt_stat->global_default_fn_cipher_key_bytes);
720 if (rc < 0) {
721 printk(KERN_ERR "%s: Error setting key for crypto context; "
722 "rc = [%d]. s->auth_tok->token.password.session_key_"
723 "encryption_key = [0x%p]; mount_crypt_stat->"
724 "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
725 rc,
726 s->auth_tok->token.password.session_key_encryption_key,
727 mount_crypt_stat->global_default_fn_cipher_key_bytes);
728 goto out_release_free_unlock;
729 }
730 rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
731 s->block_aligned_filename_size);
732 if (rc) {
733 printk(KERN_ERR "%s: Error attempting to encrypt filename; "
734 "rc = [%d]\n", __func__, rc);
735 goto out_release_free_unlock;
736 }
737 s->i += s->block_aligned_filename_size;
738 (*packet_size) = s->i;
739 (*remaining_bytes) -= (*packet_size);
740out_release_free_unlock:
741 crypto_free_hash(s->hash_desc.tfm);
742out_free_unlock:
743 memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
744 kfree(s->block_aligned_filename);
745out_unlock:
746 mutex_unlock(s->tfm_mutex);
747out:
748 kfree(s);
749 return rc;
750}
751
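Working the size arithmetic from write_tag_70_packet by hand is useful; this standalone sketch assumes ECRYPTFS_SIG_SIZE == 8, a 16-byte cipher block, and a 16-byte minimum random prepend (the values used in this tree) and reproduces max_packet_size for a 7-byte filename:

#include <stdio.h>

int main(void)
{
	size_t filename_size = 7;	/* e.g. "foo.txt" */
	size_t block_size = 16;		/* AES block size */
	size_t sig_size = 8;		/* ECRYPTFS_SIG_SIZE */
	size_t num_rand_bytes = 16 + 1;	/* random prefix + '\0' separator */
	size_t aligned = num_rand_bytes + filename_size;	/* 24 */

	if (aligned % block_size)
		num_rand_bytes += block_size - (aligned % block_size);
	aligned = num_rand_bytes + filename_size;		/* 32 */
	/* tag byte + up to 3 length octets + FNEK sig + cipher code + name */
	printf("max_packet_size = %zu\n",
	       (size_t)(1 + 3 + sig_size + 1 + aligned));	/* prints 45 */
	return 0;
}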
752struct ecryptfs_parse_tag_70_packet_silly_stack {
753 u8 cipher_code;
754 size_t max_packet_size;
755 size_t packet_size_len;
756 size_t parsed_tag_70_packet_size;
757 size_t block_aligned_filename_size;
758 size_t block_size;
759 size_t i;
760 struct mutex *tfm_mutex;
761 char *decrypted_filename;
762 struct ecryptfs_auth_tok *auth_tok;
763 struct scatterlist src_sg;
764 struct scatterlist dst_sg;
765 struct blkcipher_desc desc;
766 char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
767 char iv[ECRYPTFS_MAX_IV_BYTES];
768 char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
769};
770
771/**
772 * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
773 * @filename: This function kmalloc's the memory for the filename
 774 * @filename_size: This function sets this to the length of the decrypted
 775 * filename (one byte less than the amount kmalloc'd for it)
 776 * @packet_size: This function sets this to the number of octets
777 * in the packet parsed
778 * @mount_crypt_stat: The mount-wide cryptographic context
779 * @data: The memory location containing the start of the tag 70
780 * packet
781 * @max_packet_size: The maximum legal size of the packet to be parsed
782 * from @data
783 *
784 * Returns zero on success; non-zero otherwise
785 */
786int
787ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
788 size_t *packet_size,
789 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
790 char *data, size_t max_packet_size)
791{
792 struct ecryptfs_parse_tag_70_packet_silly_stack *s;
793 int rc = 0;
794
795 (*packet_size) = 0;
796 (*filename_size) = 0;
797 (*filename) = NULL;
798 s = kmalloc(sizeof(*s), GFP_KERNEL);
799 if (!s) {
800 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
801 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
 802 rc = -ENOMEM; goto out;
803 }
804 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
805 if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
806 printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
807 "at least [%d]\n", __func__, max_packet_size,
808 (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
809 rc = -EINVAL;
810 goto out;
811 }
812 /* Octet 0: Tag 70 identifier
813 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
814 * and block-aligned encrypted filename size)
815 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
816 * Octet N2-N3: Cipher identifier (1 octet)
817 * Octets N3-N4: Block-aligned encrypted filename
 818 * - Consists of a minimum number of random characters, a \0
819 * separator, and then the filename */
820 if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
821 printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
822 "tag [0x%.2x]\n", __func__,
823 data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
824 rc = -EINVAL;
825 goto out;
826 }
827 rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
828 &s->parsed_tag_70_packet_size,
829 &s->packet_size_len);
830 if (rc) {
831 printk(KERN_WARNING "%s: Error parsing packet length; "
832 "rc = [%d]\n", __func__, rc);
833 goto out;
834 }
835 s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
836 - ECRYPTFS_SIG_SIZE - 1);
837 if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
838 > max_packet_size) {
839 printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
840 "size is [%zd]\n", __func__, max_packet_size,
841 (1 + s->packet_size_len + 1
842 + s->block_aligned_filename_size));
843 rc = -EINVAL;
844 goto out;
845 }
846 (*packet_size) += s->packet_size_len;
847 ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
848 ECRYPTFS_SIG_SIZE);
849 s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
850 (*packet_size) += ECRYPTFS_SIG_SIZE;
851 s->cipher_code = data[(*packet_size)++];
852 rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
853 if (rc) {
854 printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
855 __func__, s->cipher_code);
856 goto out;
857 }
858 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
859 &s->tfm_mutex,
860 s->cipher_string);
861 if (unlikely(rc)) {
862 printk(KERN_ERR "Internal error whilst attempting to get "
863 "tfm and mutex for cipher name [%s]; rc = [%d]\n",
864 s->cipher_string, rc);
865 goto out;
866 }
867 mutex_lock(s->tfm_mutex);
868 rc = virt_to_scatterlist(&data[(*packet_size)],
869 s->block_aligned_filename_size, &s->src_sg, 1);
870 if (rc != 1) {
871 printk(KERN_ERR "%s: Internal error whilst attempting to "
872 "convert encrypted filename memory to scatterlist; "
873 "expected rc = 1; got rc = [%d]. "
874 "block_aligned_filename_size = [%zd]\n", __func__, rc,
875 s->block_aligned_filename_size);
876 goto out_unlock;
877 }
878 (*packet_size) += s->block_aligned_filename_size;
879 s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
880 GFP_KERNEL);
881 if (!s->decrypted_filename) {
882 printk(KERN_ERR "%s: Out of memory whilst attempting to "
883 "kmalloc [%zd] bytes\n", __func__,
884 s->block_aligned_filename_size);
885 rc = -ENOMEM;
886 goto out_unlock;
887 }
888 rc = virt_to_scatterlist(s->decrypted_filename,
889 s->block_aligned_filename_size, &s->dst_sg, 1);
890 if (rc != 1) {
891 printk(KERN_ERR "%s: Internal error whilst attempting to "
892 "convert decrypted filename memory to scatterlist; "
893 "expected rc = 1; got rc = [%d]. "
894 "block_aligned_filename_size = [%zd]\n", __func__, rc,
895 s->block_aligned_filename_size);
896 goto out_free_unlock;
897 }
898 /* The characters in the first block effectively do the job of
899 * the IV here, so we just use 0's for the IV. Note the
900 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
901 * >= ECRYPTFS_MAX_IV_BYTES. */
902 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
903 s->desc.info = s->iv;
904 rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
905 s->fnek_sig_hex);
906 if (rc) {
907 printk(KERN_ERR "%s: Error attempting to find auth tok for "
908 "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
909 rc);
910 goto out_free_unlock;
911 }
912 /* TODO: Support other key modules than passphrase for
913 * filename encryption */
914 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
915 rc = crypto_blkcipher_setkey(
916 s->desc.tfm,
917 s->auth_tok->token.password.session_key_encryption_key,
918 mount_crypt_stat->global_default_fn_cipher_key_bytes);
919 if (rc < 0) {
920 printk(KERN_ERR "%s: Error setting key for crypto context; "
921 "rc = [%d]. s->auth_tok->token.password.session_key_"
922 "encryption_key = [0x%p]; mount_crypt_stat->"
923 "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
924 rc,
925 s->auth_tok->token.password.session_key_encryption_key,
926 mount_crypt_stat->global_default_fn_cipher_key_bytes);
927 goto out_free_unlock;
928 }
929 rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
930 s->block_aligned_filename_size);
931 if (rc) {
932 printk(KERN_ERR "%s: Error attempting to decrypt filename; "
933 "rc = [%d]\n", __func__, rc);
934 goto out_free_unlock;
935 }
936 s->i = 0;
 937 while (s->i < s->block_aligned_filename_size
 938        && s->decrypted_filename[s->i] != '\0')
939 s->i++;
940 if (s->i == s->block_aligned_filename_size) {
941 printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
942 "find valid separator between random characters and "
943 "the filename\n", __func__);
944 rc = -EINVAL;
945 goto out_free_unlock;
946 }
947 s->i++;
948 (*filename_size) = (s->block_aligned_filename_size - s->i);
 949 if (!((*filename_size) > 0 && (*filename_size) < PATH_MAX)) {
950 printk(KERN_WARNING "%s: Filename size is [%zd], which is "
951 "invalid\n", __func__, (*filename_size));
952 rc = -EINVAL;
953 goto out_free_unlock;
954 }
955 (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
956 if (!(*filename)) {
957 printk(KERN_ERR "%s: Out of memory whilst attempting to "
958 "kmalloc [%zd] bytes\n", __func__,
959 ((*filename_size) + 1));
960 rc = -ENOMEM;
961 goto out_free_unlock;
962 }
963 memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
964 (*filename)[(*filename_size)] = '\0';
965out_free_unlock:
966 kfree(s->decrypted_filename);
967out_unlock:
968 mutex_unlock(s->tfm_mutex);
969out:
970 if (rc) {
971 (*packet_size) = 0;
972 (*filename_size) = 0;
973 (*filename) = NULL;
974 }
975 kfree(s);
976 return rc;
977}
978
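A caller-side sketch of the parse entry point, mirroring how ecryptfs_decode_and_decrypt_filename() is expected to consume it (buffer names are illustrative):

	char *filename;
	size_t filename_size, packet_size;
	int rc;

	rc = ecryptfs_parse_tag_70_packet(&filename, &filename_size,
					  &packet_size, mount_crypt_stat,
					  data, data_size);
	if (!rc) {
		/* filename is NUL-terminated; the caller owns and must free it */
		printk(KERN_DEBUG "Decrypted [%zu]-byte filename [%s]\n",
		       filename_size, filename);
		kfree(filename);
	}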
979static int
406ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) 980ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
407{ 981{
408 int rc = 0; 982 int rc = 0;
@@ -897,30 +1471,6 @@ out:
897 return rc; 1471 return rc;
898} 1472}
899 1473
900static int
901ecryptfs_find_global_auth_tok_for_sig(
902 struct ecryptfs_global_auth_tok **global_auth_tok,
903 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
904{
905 struct ecryptfs_global_auth_tok *walker;
906 int rc = 0;
907
908 (*global_auth_tok) = NULL;
909 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
910 list_for_each_entry(walker,
911 &mount_crypt_stat->global_auth_tok_list,
912 mount_crypt_stat_list) {
913 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
914 (*global_auth_tok) = walker;
915 goto out;
916 }
917 }
918 rc = -EINVAL;
919out:
920 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
921 return rc;
922}
923
924/** 1474/**
925 * ecryptfs_verify_version 1475 * ecryptfs_verify_version
926 * @version: The version number to confirm 1476 * @version: The version number to confirm
@@ -990,43 +1540,6 @@ out:
990} 1540}
991 1541
992/** 1542/**
993 * ecryptfs_find_auth_tok_for_sig
994 * @auth_tok: Set to the matching auth_tok; NULL if not found
995 * @crypt_stat: inode crypt_stat crypto context
996 * @sig: Sig of auth_tok to find
997 *
998 * For now, this function simply looks at the registered auth_tok's
999 * linked off the mount_crypt_stat, so all the auth_toks that can be
1000 * used must be registered at mount time. This function could
1001 * potentially try a lot harder to find auth_tok's (e.g., by calling
1002 * out to ecryptfsd to dynamically retrieve an auth_tok object) so
1003 * that static registration of auth_tok's will no longer be necessary.
1004 *
1005 * Returns zero on no error; non-zero on error
1006 */
1007static int
1008ecryptfs_find_auth_tok_for_sig(
1009 struct ecryptfs_auth_tok **auth_tok,
1010 struct ecryptfs_crypt_stat *crypt_stat, char *sig)
1011{
1012 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
1013 crypt_stat->mount_crypt_stat;
1014 struct ecryptfs_global_auth_tok *global_auth_tok;
1015 int rc = 0;
1016
1017 (*auth_tok) = NULL;
1018 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
1019 mount_crypt_stat, sig)) {
1020 struct key *auth_tok_key;
1021
1022 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
1023 sig);
1024 } else
1025 (*auth_tok) = global_auth_tok->global_auth_tok;
1026 return rc;
1027}
1028
1029/**
1030 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. 1543 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
1031 * @auth_tok: The passphrase authentication token to use to encrypt the FEK 1544 * @auth_tok: The passphrase authentication token to use to encrypt the FEK
1032 * @crypt_stat: The cryptographic context 1545 * @crypt_stat: The cryptographic context
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok:
1256 rc = -EINVAL; 1769 rc = -EINVAL;
1257 goto out_wipe_list; 1770 goto out_wipe_list;
1258 } 1771 }
1259 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, 1772 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
1773 crypt_stat->mount_crypt_stat,
1260 candidate_auth_tok_sig); 1774 candidate_auth_tok_sig);
1261 if (matching_auth_tok) { 1775 if (matching_auth_tok) {
1262 found_auth_tok = 1; 1776 found_auth_tok = 1;
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
1336 int rc; 1850 int rc;
1337 1851
1338 rc = write_tag_66_packet(auth_tok->token.private_key.signature, 1852 rc = write_tag_66_packet(auth_tok->token.private_key.signature,
1339 ecryptfs_code_for_cipher_string(crypt_stat), 1853 ecryptfs_code_for_cipher_string(
1854 crypt_stat->cipher,
1855 crypt_stat->key_size),
1340 crypt_stat, &payload, &payload_len); 1856 crypt_stat, &payload, &payload_len);
1341 if (rc) { 1857 if (rc) {
1342 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); 1858 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
@@ -1696,7 +2212,8 @@ encrypted_session_key_set:
1696 dest[(*packet_size)++] = 0x04; /* version 4 */ 2212 dest[(*packet_size)++] = 0x04; /* version 4 */
1697 /* TODO: Break from RFC2440 so that arbitrary ciphers can be 2213 /* TODO: Break from RFC2440 so that arbitrary ciphers can be
1698 * specified with strings */ 2214 * specified with strings */
1699 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); 2215 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
2216 crypt_stat->key_size);
1700 if (cipher_code == 0) { 2217 if (cipher_code == 0) {
1701 ecryptfs_printk(KERN_WARNING, "Unable to generate code for " 2218 ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
1702 "cipher [%s]\n", crypt_stat->cipher); 2219 "cipher [%s]\n", crypt_stat->cipher);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fd630713c5c7..789cf2e1be1e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
206 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, 206 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
207 ecryptfs_opt_ecryptfs_key_bytes, 207 ecryptfs_opt_ecryptfs_key_bytes,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_err };
210 212
211static const match_table_t tokens = { 213static const match_table_t tokens = {
212 {ecryptfs_opt_sig, "sig=%s"}, 214 {ecryptfs_opt_sig, "sig=%s"},
@@ -217,6 +219,9 @@ static const match_table_t tokens = {
217 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, 219 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
218 {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, 220 {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
219 {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, 221 {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
220 {ecryptfs_opt_err, NULL} 225 {ecryptfs_opt_err, NULL}
221}; 226};
222 227
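Taken together, the three new tokens let filename encryption be configured entirely from the mount data string; a hypothetical example (signature values shortened for illustration):

/* e.g. the options string handed to ecryptfs_parse_options() below */
static const char example_opts[] =
	"sig=0123456789abcdef,"
	"ecryptfs_fnek_sig=0123456789abcdef,"
	"ecryptfs_fn_cipher=aes,"
	"ecryptfs_fn_key_bytes=16";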
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
281 int rc = 0; 286 int rc = 0;
282 int sig_set = 0; 287 int sig_set = 0;
283 int cipher_name_set = 0; 288 int cipher_name_set = 0;
289 int fn_cipher_name_set = 0;
284 int cipher_key_bytes; 290 int cipher_key_bytes;
285 int cipher_key_bytes_set = 0; 291 int cipher_key_bytes_set = 0;
292 int fn_cipher_key_bytes;
293 int fn_cipher_key_bytes_set = 0;
286 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 294 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
287 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 295 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
288 substring_t args[MAX_OPT_ARGS]; 296 substring_t args[MAX_OPT_ARGS];
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
290 char *sig_src; 298 char *sig_src;
291 char *cipher_name_dst; 299 char *cipher_name_dst;
292 char *cipher_name_src; 300 char *cipher_name_src;
301 char *fn_cipher_name_dst;
302 char *fn_cipher_name_src;
303 char *fnek_dst;
304 char *fnek_src;
293 char *cipher_key_bytes_src; 305 char *cipher_key_bytes_src;
306 char *fn_cipher_key_bytes_src;
294 307
295 if (!options) { 308 if (!options) {
296 rc = -EINVAL; 309 rc = -EINVAL;
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
322 global_default_cipher_name; 335 global_default_cipher_name;
323 strncpy(cipher_name_dst, cipher_name_src, 336 strncpy(cipher_name_dst, cipher_name_src,
324 ECRYPTFS_MAX_CIPHER_NAME_SIZE); 337 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
325 ecryptfs_printk(KERN_DEBUG, 338 cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
326 "The mount_crypt_stat "
327 "global_default_cipher_name set to: "
328 "[%s]\n", cipher_name_dst);
329 cipher_name_set = 1; 339 cipher_name_set = 1;
330 break; 340 break;
331 case ecryptfs_opt_ecryptfs_key_bytes: 341 case ecryptfs_opt_ecryptfs_key_bytes:
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
335 &cipher_key_bytes_src, 0); 345 &cipher_key_bytes_src, 0);
336 mount_crypt_stat->global_default_cipher_key_size = 346 mount_crypt_stat->global_default_cipher_key_size =
337 cipher_key_bytes; 347 cipher_key_bytes;
338 ecryptfs_printk(KERN_DEBUG,
339 "The mount_crypt_stat "
340 "global_default_cipher_key_size "
341 "set to: [%d]\n", mount_crypt_stat->
342 global_default_cipher_key_size);
343 cipher_key_bytes_set = 1; 348 cipher_key_bytes_set = 1;
344 break; 349 break;
345 case ecryptfs_opt_passthrough: 350 case ecryptfs_opt_passthrough:
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
356 mount_crypt_stat->flags |= 361 mount_crypt_stat->flags |=
357 ECRYPTFS_ENCRYPTED_VIEW_ENABLED; 362 ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
358 break; 363 break;
364 case ecryptfs_opt_fnek_sig:
365 fnek_src = args[0].from;
366 fnek_dst =
367 mount_crypt_stat->global_default_fnek_sig;
368 strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
369 mount_crypt_stat->global_default_fnek_sig[
370 ECRYPTFS_SIG_SIZE_HEX] = '\0';
371 rc = ecryptfs_add_global_auth_tok(
372 mount_crypt_stat,
373 mount_crypt_stat->global_default_fnek_sig);
374 if (rc) {
375 printk(KERN_ERR "Error attempting to register "
376 "global fnek sig [%s]; rc = [%d]\n",
377 mount_crypt_stat->global_default_fnek_sig,
378 rc);
379 goto out;
380 }
381 mount_crypt_stat->flags |=
382 (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
383 | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
384 break;
385 case ecryptfs_opt_fn_cipher:
386 fn_cipher_name_src = args[0].from;
387 fn_cipher_name_dst =
388 mount_crypt_stat->global_default_fn_cipher_name;
389 strncpy(fn_cipher_name_dst, fn_cipher_name_src,
390 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
391 mount_crypt_stat->global_default_fn_cipher_name[
392 ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
393 fn_cipher_name_set = 1;
394 break;
395 case ecryptfs_opt_fn_cipher_key_bytes:
396 fn_cipher_key_bytes_src = args[0].from;
397 fn_cipher_key_bytes =
398 (int)simple_strtol(fn_cipher_key_bytes_src,
399 &fn_cipher_key_bytes_src, 0);
400 mount_crypt_stat->global_default_fn_cipher_key_bytes =
401 fn_cipher_key_bytes;
402 fn_cipher_key_bytes_set = 1;
403 break;
359 case ecryptfs_opt_err: 404 case ecryptfs_opt_err:
360 default: 405 default:
361 ecryptfs_printk(KERN_WARNING, 406 printk(KERN_WARNING
362 "eCryptfs: unrecognized option '%s'\n", 407 "%s: eCryptfs: unrecognized option [%s]\n",
363 p); 408 __func__, p);
364 } 409 }
365 } 410 }
366 if (!sig_set) { 411 if (!sig_set) {
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
374 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); 419 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
375 420
376 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); 421 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
377
378 strcpy(mount_crypt_stat->global_default_cipher_name, 422 strcpy(mount_crypt_stat->global_default_cipher_name,
379 ECRYPTFS_DEFAULT_CIPHER); 423 ECRYPTFS_DEFAULT_CIPHER);
380 } 424 }
381 if (!cipher_key_bytes_set) { 425 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
426 && !fn_cipher_name_set)
427 strcpy(mount_crypt_stat->global_default_fn_cipher_name,
428 mount_crypt_stat->global_default_cipher_name);
429 if (!cipher_key_bytes_set)
382 mount_crypt_stat->global_default_cipher_key_size = 0; 430 mount_crypt_stat->global_default_cipher_key_size = 0;
383 } 431 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
432 && !fn_cipher_key_bytes_set)
433 mount_crypt_stat->global_default_fn_cipher_key_bytes =
434 mount_crypt_stat->global_default_cipher_key_size;
384 mutex_lock(&key_tfm_list_mutex); 435 mutex_lock(&key_tfm_list_mutex);
385 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, 436 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
386 NULL)) 437 NULL)) {
387 rc = ecryptfs_add_new_key_tfm( 438 rc = ecryptfs_add_new_key_tfm(
388 NULL, mount_crypt_stat->global_default_cipher_name, 439 NULL, mount_crypt_stat->global_default_cipher_name,
389 mount_crypt_stat->global_default_cipher_key_size); 440 mount_crypt_stat->global_default_cipher_key_size);
390 mutex_unlock(&key_tfm_list_mutex); 441 if (rc) {
391 if (rc) { 442 printk(KERN_ERR "Error attempting to initialize "
392 printk(KERN_ERR "Error attempting to initialize cipher with " 443 "cipher with name = [%s] and key size = [%td]; "
393 "name = [%s] and key size = [%td]; rc = [%d]\n", 444 "rc = [%d]\n",
394 mount_crypt_stat->global_default_cipher_name, 445 mount_crypt_stat->global_default_cipher_name,
395 mount_crypt_stat->global_default_cipher_key_size, rc); 446 mount_crypt_stat->global_default_cipher_key_size,
396 rc = -EINVAL; 447 rc);
397 goto out; 448 rc = -EINVAL;
449 mutex_unlock(&key_tfm_list_mutex);
450 goto out;
451 }
398 } 452 }
453 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
454 && !ecryptfs_tfm_exists(
455 mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
456 rc = ecryptfs_add_new_key_tfm(
457 NULL, mount_crypt_stat->global_default_fn_cipher_name,
458 mount_crypt_stat->global_default_fn_cipher_key_bytes);
459 if (rc) {
460 printk(KERN_ERR "Error attempting to initialize "
461 "cipher with name = [%s] and key size = [%td]; "
462 "rc = [%d]\n",
463 mount_crypt_stat->global_default_fn_cipher_name,
464 mount_crypt_stat->global_default_fn_cipher_key_bytes,
465 rc);
466 rc = -EINVAL;
467 mutex_unlock(&key_tfm_list_mutex);
468 goto out;
469 }
470 }
471 mutex_unlock(&key_tfm_list_mutex);
399 rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); 472 rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
400 if (rc) { 473 if (rc)
401 printk(KERN_WARNING "One or more global auth toks could not " 474 printk(KERN_WARNING "One or more global auth toks could not "
402 "properly register; rc = [%d]\n", rc); 475 "properly register; rc = [%d]\n", rc);
403 }
404out: 476out:
405 return rc; 477 return rc;
406} 478}
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 6913f727624d..96ef51489e01 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
193 (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); 193 (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
194 if (!(*daemon)) { 194 if (!(*daemon)) {
195 rc = -ENOMEM; 195 rc = -ENOMEM;
196 printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " 196 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
197 "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); 197 "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
198 goto out; 198 goto out;
199 } 199 }
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
435 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); 435 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
436 if (!msg_ctx->msg) { 436 if (!msg_ctx->msg) {
437 rc = -ENOMEM; 437 rc = -ENOMEM;
438 printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " 438 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
439 "GFP_KERNEL memory\n", __func__, msg_size); 439 "GFP_KERNEL memory\n", __func__, msg_size);
440 goto unlock; 440 goto unlock;
441 } 441 }
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index efd95a0ed1ea..a67fea655f49 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
199 if (!msg_ctx->msg) { 199 if (!msg_ctx->msg) {
200 rc = -ENOMEM; 200 rc = -ENOMEM;
201 printk(KERN_ERR "%s: Out of memory whilst attempting " 201 printk(KERN_ERR "%s: Out of memory whilst attempting "
202 "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, 202 "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
203 (sizeof(*msg_ctx->msg) + data_size)); 203 (sizeof(*msg_ctx->msg) + data_size));
204 goto out_unlock; 204 goto out_unlock;
205 } 205 }
@@ -322,7 +322,7 @@ check_list:
322 if (count < total_length) { 322 if (count < total_length) {
323 rc = 0; 323 rc = 0;
324 printk(KERN_WARNING "%s: Only given user buffer of " 324 printk(KERN_WARNING "%s: Only given user buffer of "
325 "size [%Zd], but we need [%Zd] to read the " 325 "size [%zd], but we need [%zd] to read the "
326 "pending message\n", __func__, count, total_length); 326 "pending message\n", __func__, count, total_length);
327 goto out_unlock_msg_ctx; 327 goto out_unlock_msg_ctx;
328 } 328 }
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
376 376
377 if ((sizeof(*msg) + msg->data_len) != data_size) { 377 if ((sizeof(*msg) + msg->data_len) != data_size) {
378 printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " 378 printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
379 "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, 379 "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
380 (sizeof(*msg) + msg->data_len), data_size); 380 (sizeof(*msg) + msg->data_len), data_size);
381 rc = -EINVAL; 381 rc = -EINVAL;
382 goto out; 382 goto out;
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
421 data = kmalloc(count, GFP_KERNEL); 421 data = kmalloc(count, GFP_KERNEL);
422 if (!data) { 422 if (!data) {
423 printk(KERN_ERR "%s: Out of memory whilst attempting to " 423 printk(KERN_ERR "%s: Out of memory whilst attempting to "
424 "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); 424 "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
425 goto out; 425 goto out;
426 } 426 }
427 rc = copy_from_user(data, buf, count); 427 rc = copy_from_user(data, buf, count);
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
436 case ECRYPTFS_MSG_RESPONSE: 436 case ECRYPTFS_MSG_RESPONSE:
437 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { 437 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
438 printk(KERN_WARNING "%s: Minimum acceptable packet " 438 printk(KERN_WARNING "%s: Minimum acceptable packet "
439 "size is [%Zd], but amount of data written is " 439 "size is [%zd], but amount of data written is "
440 "only [%Zd]. Discarding response packet.\n", 440 "only [%zd]. Discarding response packet.\n",
441 __func__, 441 __func__,
442 (1 + 4 + 1 + sizeof(struct ecryptfs_message)), 442 (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
443 count); 443 count);
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
455 } 455 }
456 i += packet_size_length; 456 i += packet_size_length;
457 if ((1 + 4 + packet_size_length + packet_size) != count) { 457 if ((1 + 4 + packet_size_length + packet_size) != count) {
458 printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" 458 printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
459 " + packet_size([%Zd]))([%Zd]) != " 459 " + packet_size([%zd]))([%zd]) != "
460 "count([%Zd]). Invalid packet format.\n", 460 "count([%zd]). Invalid packet format.\n",
461 __func__, packet_size_length, packet_size, 461 __func__, packet_size_length, packet_size,
462 (1 + packet_size_length + packet_size), count); 462 (1 + packet_size_length + packet_size), count);
463 goto out_free; 463 goto out_free;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac6..46cec2b69796 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
288 loff_t prev_page_end_size; 288 loff_t prev_page_end_size;
289 int rc = 0; 289 int rc = 0;
290 290
291 page = __grab_cache_page(mapping, index); 291 page = grab_cache_page_write_begin(mapping, index, flags);
292 if (!page) 292 if (!page)
293 return -ENOMEM; 293 return -ENOMEM;
294 *pagep = page; 294 *pagep = page;
diff --git a/fs/exec.c b/fs/exec.c
index 911dd0fd7e09..605be573fe87 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -52,17 +52,13 @@
52#include <linux/audit.h> 52#include <linux/audit.h>
53#include <linux/tracehook.h> 53#include <linux/tracehook.h>
54#include <linux/kmod.h> 54#include <linux/kmod.h>
55#include <linux/fsnotify.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/mmu_context.h> 58#include <asm/mmu_context.h>
58#include <asm/tlb.h> 59#include <asm/tlb.h>
59#include "internal.h" 60#include "internal.h"
60 61
61#ifdef __alpha__
62/* for /sbin/loader handling in search_binary_handler() */
63#include <linux/a.out.h>
64#endif
65
66int core_uses_pid; 62int core_uses_pid;
67char core_pattern[CORENAME_MAX_SIZE] = "core"; 63char core_pattern[CORENAME_MAX_SIZE] = "core";
68int suid_dumpable = 0; 64int suid_dumpable = 0;
@@ -128,7 +124,8 @@ asmlinkage long sys_uselib(const char __user * library)
128 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 124 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
129 goto exit; 125 goto exit;
130 126
131 error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); 127 error = inode_permission(nd.path.dentry->d_inode,
128 MAY_READ | MAY_EXEC | MAY_OPEN);
132 if (error) 129 if (error)
133 goto exit; 130 goto exit;
134 131
@@ -137,6 +134,8 @@ asmlinkage long sys_uselib(const char __user * library)
137 if (IS_ERR(file)) 134 if (IS_ERR(file))
138 goto out; 135 goto out;
139 136
137 fsnotify_open(file->f_path.dentry);
138
140 error = -ENOEXEC; 139 error = -ENOEXEC;
141 if(file->f_op) { 140 if(file->f_op) {
142 struct linux_binfmt * fmt; 141 struct linux_binfmt * fmt;
@@ -234,13 +233,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
234 233
235static int __bprm_mm_init(struct linux_binprm *bprm) 234static int __bprm_mm_init(struct linux_binprm *bprm)
236{ 235{
237 int err = -ENOMEM; 236 int err;
238 struct vm_area_struct *vma = NULL; 237 struct vm_area_struct *vma = NULL;
239 struct mm_struct *mm = bprm->mm; 238 struct mm_struct *mm = bprm->mm;
240 239
241 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 240 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
242 if (!vma) 241 if (!vma)
243 goto err; 242 return -ENOMEM;
244 243
245 down_write(&mm->mmap_sem); 244 down_write(&mm->mmap_sem);
246 vma->vm_mm = mm; 245 vma->vm_mm = mm;
@@ -253,28 +252,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
253 */ 252 */
254 vma->vm_end = STACK_TOP_MAX; 253 vma->vm_end = STACK_TOP_MAX;
255 vma->vm_start = vma->vm_end - PAGE_SIZE; 254 vma->vm_start = vma->vm_end - PAGE_SIZE;
256
257 vma->vm_flags = VM_STACK_FLAGS; 255 vma->vm_flags = VM_STACK_FLAGS;
258 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 256 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
259 err = insert_vm_struct(mm, vma); 257 err = insert_vm_struct(mm, vma);
260 if (err) { 258 if (err)
261 up_write(&mm->mmap_sem);
262 goto err; 259 goto err;
263 }
264 260
265 mm->stack_vm = mm->total_vm = 1; 261 mm->stack_vm = mm->total_vm = 1;
266 up_write(&mm->mmap_sem); 262 up_write(&mm->mmap_sem);
267
268 bprm->p = vma->vm_end - sizeof(void *); 263 bprm->p = vma->vm_end - sizeof(void *);
269
270 return 0; 264 return 0;
271
272err: 265err:
273 if (vma) { 266 up_write(&mm->mmap_sem);
274 bprm->vma = NULL; 267 bprm->vma = NULL;
275 kmem_cache_free(vm_area_cachep, vma); 268 kmem_cache_free(vm_area_cachep, vma);
276 }
277
278 return err; 269 return err;
279} 270}
280 271
@@ -681,7 +672,7 @@ struct file *open_exec(const char *name)
681 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 672 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
682 goto out_path_put; 673 goto out_path_put;
683 674
684 err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); 675 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
685 if (err) 676 if (err)
686 goto out_path_put; 677 goto out_path_put;
687 678
@@ -689,6 +680,8 @@ struct file *open_exec(const char *name)
689 if (IS_ERR(file)) 680 if (IS_ERR(file))
690 return file; 681 return file;
691 682
683 fsnotify_open(file->f_path.dentry);
684
692 err = deny_write_access(file); 685 err = deny_write_access(file);
693 if (err) { 686 if (err) {
694 fput(file); 687 fput(file);
@@ -774,7 +767,6 @@ static int de_thread(struct task_struct *tsk)
774 struct signal_struct *sig = tsk->signal; 767 struct signal_struct *sig = tsk->signal;
775 struct sighand_struct *oldsighand = tsk->sighand; 768 struct sighand_struct *oldsighand = tsk->sighand;
776 spinlock_t *lock = &oldsighand->siglock; 769 spinlock_t *lock = &oldsighand->siglock;
777 struct task_struct *leader = NULL;
778 int count; 770 int count;
779 771
780 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
@@ -812,7 +804,7 @@ static int de_thread(struct task_struct *tsk)
812 * and to assume its PID: 804 * and to assume its PID:
813 */ 805 */
814 if (!thread_group_leader(tsk)) { 806 if (!thread_group_leader(tsk)) {
815 leader = tsk->group_leader; 807 struct task_struct *leader = tsk->group_leader;
816 808
817 sig->notify_count = -1; /* for exit_notify() */ 809 sig->notify_count = -1; /* for exit_notify() */
818 for (;;) { 810 for (;;) {
@@ -864,8 +856,9 @@ static int de_thread(struct task_struct *tsk)
864 856
865 BUG_ON(leader->exit_state != EXIT_ZOMBIE); 857 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
866 leader->exit_state = EXIT_DEAD; 858 leader->exit_state = EXIT_DEAD;
867
868 write_unlock_irq(&tasklist_lock); 859 write_unlock_irq(&tasklist_lock);
860
861 release_task(leader);
869 } 862 }
870 863
871 sig->group_exit_task = NULL; 864 sig->group_exit_task = NULL;
@@ -874,8 +867,6 @@ static int de_thread(struct task_struct *tsk)
874no_thread_group: 867no_thread_group:
875 exit_itimers(sig); 868 exit_itimers(sig);
876 flush_itimer_signals(); 869 flush_itimer_signals();
877 if (leader)
878 release_task(leader);
879 870
880 if (atomic_read(&oldsighand->count) != 1) { 871 if (atomic_read(&oldsighand->count) != 1) {
881 struct sighand_struct *newsighand; 872 struct sighand_struct *newsighand;
@@ -1181,41 +1172,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1181 unsigned int depth = bprm->recursion_depth; 1172 unsigned int depth = bprm->recursion_depth;
1182 int try,retval; 1173 int try,retval;
1183 struct linux_binfmt *fmt; 1174 struct linux_binfmt *fmt;
1184#ifdef __alpha__
1185 /* handle /sbin/loader.. */
1186 {
1187 struct exec * eh = (struct exec *) bprm->buf;
1188
1189 if (!bprm->loader && eh->fh.f_magic == 0x183 &&
1190 (eh->fh.f_flags & 0x3000) == 0x3000)
1191 {
1192 struct file * file;
1193 unsigned long loader;
1194 1175
1195 allow_write_access(bprm->file);
1196 fput(bprm->file);
1197 bprm->file = NULL;
1198
1199 loader = bprm->vma->vm_end - sizeof(void *);
1200
1201 file = open_exec("/sbin/loader");
1202 retval = PTR_ERR(file);
1203 if (IS_ERR(file))
1204 return retval;
1205
1206 /* Remember if the application is TASO. */
1207 bprm->taso = eh->ah.entry < 0x100000000UL;
1208
1209 bprm->file = file;
1210 bprm->loader = loader;
1211 retval = prepare_binprm(bprm);
1212 if (retval<0)
1213 return retval;
1214 /* should call search_binary_handler recursively here,
1215 but it does not matter */
1216 }
1217 }
1218#endif
1219 retval = security_bprm_check(bprm); 1176 retval = security_bprm_check(bprm);
1220 if (retval) 1177 if (retval)
1221 return retval; 1178 return retval;
@@ -1737,7 +1694,7 @@ int get_dumpable(struct mm_struct *mm)
1737 return (ret >= 2) ? 2 : ret; 1694 return (ret >= 2) ? 2 : ret;
1738} 1695}
1739 1696
1740int do_coredump(long signr, int exit_code, struct pt_regs * regs) 1697void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1741{ 1698{
1742 struct core_state core_state; 1699 struct core_state core_state;
1743 char corename[CORENAME_MAX_SIZE + 1]; 1700 char corename[CORENAME_MAX_SIZE + 1];
@@ -1821,6 +1778,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1821 1778
1822 if (ispipe) { 1779 if (ispipe) {
1823 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1780 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1781 if (!helper_argv) {
1782 printk(KERN_WARNING "%s failed to allocate memory\n",
1783 __func__);
1784 goto fail_unlock;
1785 }
1824 /* Terminate the string before the first option */ 1786 /* Terminate the string before the first option */
1825 delimit = strchr(corename, ' '); 1787 delimit = strchr(corename, ' ');
1826 if (delimit) 1788 if (delimit)
@@ -1888,5 +1850,5 @@ fail_unlock:
1888 put_cred(cred); 1850 put_cred(cred);
1889 coredump_finish(mm); 1851 coredump_finish(mm);
1890fail: 1852fail:
1891 return retval; 1853 return;
1892} 1854}
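The do_coredump() hunk adds a NULL check on the argv_split() result before the pipe helper is used. A standalone sketch of the same guard, with strdup/strtok standing in for the kernel's argv_split() (assumed names, not the kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        char *corename = strdup("|/usr/bin/coredump-helper --fast");

        if (!corename)
                return 1;
        char *helper = strtok(corename + 1, " ");   /* skip the '|' */
        if (!helper) {                              /* mirrors the new NULL check */
                fprintf(stderr, "%s failed to parse helper\n", __func__);
                free(corename);
                return 1;
        }
        printf("helper = %s\n", helper);
        free(corename);
        return 0;
}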
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8d0add625870..66321a877e74 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
565 inode->i_blocks = 0; 565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567 memset(ei->i_data, 0, sizeof(ei->i_data)); 567 memset(ei->i_data, 0, sizeof(ei->i_data));
568 ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; 568 ei->i_flags =
569 if (S_ISLNK(mode)) 569 ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
570 ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
571 /* dirsync is only applied to directories */
572 if (!S_ISDIR(mode))
573 ei->i_flags &= ~EXT2_DIRSYNC_FL;
574 ei->i_faddr = 0; 570 ei->i_faddr = 0;
575 ei->i_frag_no = 0; 571 ei->i_frag_no = 0;
576 ei->i_frag_size = 0; 572 ei->i_frag_size = 0;
@@ -585,7 +581,10 @@ got:
585 spin_lock(&sbi->s_next_gen_lock); 581 spin_lock(&sbi->s_next_gen_lock);
586 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
587 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
588 insert_inode_hash(inode); 584 if (insert_inode_locked(inode) < 0) {
585 err = -EINVAL;
586 goto fail_drop;
587 }
589 588
590 if (DQUOT_ALLOC_INODE(inode)) { 589 if (DQUOT_ALLOC_INODE(inode)) {
591 err = -EDQUOT; 590 err = -EDQUOT;
@@ -612,6 +611,7 @@ fail_drop:
612 DQUOT_DROP(inode); 611 DQUOT_DROP(inode);
613 inode->i_flags |= S_NOQUOTA; 612 inode->i_flags |= S_NOQUOTA;
614 inode->i_nlink = 0; 613 inode->i_nlink = 0;
614 unlock_new_inode(inode);
615 iput(inode); 615 iput(inode);
616 return ERR_PTR(err); 616 return ERR_PTR(err);
617 617
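Both ext2 hunks above replace open-coded flag stripping with ext2_mask_flags(), which filters inherited flags by file type. A hedged sketch of what such a helper centralizes; the flag values and exact masks here are illustrative:

#include <stdio.h>
#include <sys/stat.h>

#define FL_DIRSYNC   0x00010000
#define FL_IMMUTABLE 0x00000010
#define FL_APPEND    0x00000020

static unsigned int mask_flags(mode_t mode, unsigned int flags)
{
        if (S_ISDIR(mode))
                return flags;                   /* directories keep everything */
        if (S_ISREG(mode))
                return flags & ~FL_DIRSYNC;     /* dirsync is dir-only */
        /* symlinks, devices, ...: also drop immutable/append */
        return flags & ~(FL_DIRSYNC | FL_IMMUTABLE | FL_APPEND);
}

int main(void)
{
        unsigned int dirflags = FL_DIRSYNC | FL_IMMUTABLE;

        printf("reg: %#x link: %#x\n",
               mask_flags(S_IFREG, dirflags), mask_flags(S_IFLNK, dirflags));
        return 0;
}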
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7658b33e2653..23fff2f87783 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include <linux/mpage.h> 33#include <linux/mpage.h>
34#include <linux/fiemap.h> 34#include <linux/fiemap.h>
35#include <linux/namei.h>
35#include "ext2.h" 36#include "ext2.h"
36#include "acl.h" 37#include "acl.h"
37#include "xip.h" 38#include "xip.h"
@@ -497,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
497 * ext2_splice_branch - splice the allocated branch onto inode. 498 * ext2_splice_branch - splice the allocated branch onto inode.
498 * @inode: owner 499 * @inode: owner
499 * @block: (logical) number of block we are adding 500 * @block: (logical) number of block we are adding
500 * @chain: chain of indirect blocks (with a missing link - see
501 * ext2_alloc_branch)
502 * @where: location of missing link 501 * @where: location of missing link
503 * @num: number of indirect blocks we are adding 502 * @num: number of indirect blocks we are adding
504 * @blks: number of direct blocks we are adding 503 * @blks: number of direct blocks we are adding
@@ -1286,9 +1285,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1286 else 1285 else
1287 inode->i_mapping->a_ops = &ext2_aops; 1286 inode->i_mapping->a_ops = &ext2_aops;
1288 } else if (S_ISLNK(inode->i_mode)) { 1287 } else if (S_ISLNK(inode->i_mode)) {
1289 if (ext2_inode_is_fast_symlink(inode)) 1288 if (ext2_inode_is_fast_symlink(inode)) {
1290 inode->i_op = &ext2_fast_symlink_inode_operations; 1289 inode->i_op = &ext2_fast_symlink_inode_operations;
1291 else { 1290 nd_terminate_link(ei->i_data, inode->i_size,
1291 sizeof(ei->i_data) - 1);
1292 } else {
1292 inode->i_op = &ext2_symlink_inode_operations; 1293 inode->i_op = &ext2_symlink_inode_operations;
1293 if (test_opt(inode->i_sb, NOBH)) 1294 if (test_opt(inode->i_sb, NOBH))
1294 inode->i_mapping->a_ops = &ext2_nobh_aops; 1295 inode->i_mapping->a_ops = &ext2_nobh_aops;
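nd_terminate_link() exists because a fast symlink's target is stored raw in the inode's i_data and is not guaranteed to be NUL-terminated on disk. A minimal sketch of the termination step (sizes illustrative):

#include <stdio.h>
#include <string.h>

static void terminate_link(char *name, size_t len, size_t maxlen)
{
        /* clamp to the buffer and cut the string at i_size */
        name[len > maxlen ? maxlen : len] = '\0';
}

int main(void)
{
        char i_data[60];

        /* 28 bytes, no NUL: simulates stale bytes after the target */
        memcpy(i_data, "/etc/alternatives/editorJUNK", 28);
        terminate_link(i_data, 24, sizeof(i_data) - 1);   /* i_size = 24 */
        printf("target = %s\n", i_data);
        return 0;
}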
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e1..7cb4badef927 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
50 goto setflags_out; 50 goto setflags_out;
51 } 51 }
52 52
53 if (!S_ISDIR(inode->i_mode)) 53 flags = ext2_mask_flags(inode->i_mode, flags);
54 flags &= ~EXT2_DIRSYNC_FL;
55 54
56 mutex_lock(&inode->i_mutex); 55 mutex_lock(&inode->i_mutex);
57 /* Is it quota file? Do not allow user to mess with it */ 56 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2a747252ec12..90ea17998a73 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
41 int err = ext2_add_link(dentry, inode); 41 int err = ext2_add_link(dentry, inode);
42 if (!err) { 42 if (!err) {
43 d_instantiate(dentry, inode); 43 d_instantiate(dentry, inode);
44 unlock_new_inode(inode);
44 return 0; 45 return 0;
45 } 46 }
46 inode_dec_link_count(inode); 47 inode_dec_link_count(inode);
48 unlock_new_inode(inode);
47 iput(inode); 49 iput(inode);
48 return err; 50 return err;
49} 51}
@@ -170,6 +172,7 @@ out:
170 172
171out_fail: 173out_fail:
172 inode_dec_link_count(inode); 174 inode_dec_link_count(inode);
175 unlock_new_inode(inode);
173 iput (inode); 176 iput (inode);
174 goto out; 177 goto out;
175} 178}
@@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
178 struct dentry *dentry) 181 struct dentry *dentry)
179{ 182{
180 struct inode *inode = old_dentry->d_inode; 183 struct inode *inode = old_dentry->d_inode;
184 int err;
181 185
182 if (inode->i_nlink >= EXT2_LINK_MAX) 186 if (inode->i_nlink >= EXT2_LINK_MAX)
183 return -EMLINK; 187 return -EMLINK;
@@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
186 inode_inc_link_count(inode); 190 inode_inc_link_count(inode);
187 atomic_inc(&inode->i_count); 191 atomic_inc(&inode->i_count);
188 192
189 return ext2_add_nondir(dentry, inode); 193 err = ext2_add_link(dentry, inode);
194 if (!err) {
195 d_instantiate(dentry, inode);
196 return 0;
197 }
198 inode_dec_link_count(inode);
199 iput(inode);
200 return err;
190} 201}
191 202
192static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) 203static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
@@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
222 goto out_fail; 233 goto out_fail;
223 234
224 d_instantiate(dentry, inode); 235 d_instantiate(dentry, inode);
236 unlock_new_inode(inode);
225out: 237out:
226 return err; 238 return err;
227 239
228out_fail: 240out_fail:
229 inode_dec_link_count(inode); 241 inode_dec_link_count(inode);
230 inode_dec_link_count(inode); 242 inode_dec_link_count(inode);
243 unlock_new_inode(inode);
231 iput(inode); 244 iput(inode);
232out_dir: 245out_dir:
233 inode_dec_link_count(dir); 246 inode_dec_link_count(dir);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..da8bdeaa2e6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
132 percpu_counter_destroy(&sbi->s_dirs_counter); 132 percpu_counter_destroy(&sbi->s_dirs_counter);
133 brelse (sbi->s_sbh); 133 brelse (sbi->s_sbh);
134 sb->s_fs_info = NULL; 134 sb->s_fs_info = NULL;
135 kfree(sbi->s_blockgroup_lock);
135 kfree(sbi); 136 kfree(sbi);
136 137
137 return; 138 return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
756 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 757 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
757 if (!sbi) 758 if (!sbi)
758 return -ENOMEM; 759 return -ENOMEM;
760
761 sbi->s_blockgroup_lock =
762 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
763 if (!sbi->s_blockgroup_lock) {
764 kfree(sbi);
765 return -ENOMEM;
766 }
759 sb->s_fs_info = sbi; 767 sb->s_fs_info = sbi;
760 sbi->s_sb_block = sb_block; 768 sbi->s_sb_block = sb_block;
761 769
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
983 printk ("EXT2-fs: not enough memory\n"); 991 printk ("EXT2-fs: not enough memory\n");
984 goto failed_mount; 992 goto failed_mount;
985 } 993 }
986 bgl_lock_init(&sbi->s_blockgroup_lock); 994 bgl_lock_init(sbi->s_blockgroup_lock);
987 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); 995 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
988 if (!sbi->s_debts) { 996 if (!sbi->s_debts) {
989 printk ("EXT2-fs: not enough memory\n"); 997 printk ("EXT2-fs: not enough memory\n");
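The super.c hunks move s_blockgroup_lock out of the sbi structure into its own allocation (it is an array of spinlocks and can be large), so ext2_fill_super() must unwind the partial allocation on failure and ext2_put_super() must free it. A user-space sketch of that alloc/free pairing, with illustrative types:

#include <stdio.h>
#include <stdlib.h>

struct biglock { long slots[128]; };    /* stands in for blockgroup_lock */
struct sb_info { struct biglock *s_blockgroup_lock; /* ... */ };

static struct sb_info *fill_super(void)
{
        struct sb_info *sbi = calloc(1, sizeof(*sbi));

        if (!sbi)
                return NULL;
        sbi->s_blockgroup_lock = calloc(1, sizeof(struct biglock));
        if (!sbi->s_blockgroup_lock) {
                free(sbi);                      /* mirrors kfree(sbi) */
                return NULL;
        }
        return sbi;
}

static void put_super(struct sb_info *sbi)
{
        free(sbi->s_blockgroup_lock);           /* mirrors the new kfree() */
        free(sbi);
}

int main(void)
{
        struct sb_info *sbi = fill_super();

        printf("mounted: %s\n", sbi ? "yes" : "no");
        if (sbi)
                put_super(sbi);
        return 0;
}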
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2e..7d215b4d4f2e 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
49} 68}
50 69
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
52{ 71{
53 __u32 pad, val; 72 __u32 pad, val;
54 int i; 73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
96}
97
98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
99{
100 __u32 pad, val;
101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i=0; i < len; i++) { 110 for (i=0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
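The hash.c change splits each legacy hash into signed and unsigned variants because plain `char` signedness is architecture-dependent: name bytes >= 0x80 promote to different int values, changing the hash and therefore where htree directory entries land. A standalone sketch showing the two results diverging on a non-ASCII name:

#include <stdio.h>

static unsigned int dx_hack_hash(const char *name, int len, int is_signed)
{
        unsigned int hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;

        while (len--) {
                int c = is_signed ? (int)(signed char)*name
                                  : (int)(unsigned char)*name;
                name++;
                hash = hash1 + (hash0 ^ (c * 7152373));
                if (hash & 0x80000000)
                        hash -= 0x7fffffff;
                hash1 = hash0;
                hash0 = hash;
        }
        return hash0 << 1;
}

int main(void)
{
        const char name[] = "caf\xc3\xa9";      /* UTF-8 "café": bytes >= 0x80 */

        printf("signed:   %#010x\n", dx_hack_hash(name, 5, 1));
        printf("unsigned: %#010x\n", dx_hack_hash(name, 5, 0));
        return 0;
}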
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 490bd0ed7896..8de6c720e510 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
559 ei->i_dir_start_lookup = 0; 559 ei->i_dir_start_lookup = 0;
560 ei->i_disksize = 0; 560 ei->i_disksize = 0;
561 561
562 ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; 562 ei->i_flags =
563 if (S_ISLNK(mode)) 563 ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
564 ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
565 /* dirsync only applies to directories */
566 if (!S_ISDIR(mode))
567 ei->i_flags &= ~EXT3_DIRSYNC_FL;
568#ifdef EXT3_FRAGMENTS 564#ifdef EXT3_FRAGMENTS
569 ei->i_faddr = 0; 565 ei->i_faddr = 0;
570 ei->i_frag_no = 0; 566 ei->i_frag_no = 0;
@@ -579,7 +575,10 @@ got:
579 ext3_set_inode_flags(inode); 575 ext3_set_inode_flags(inode);
580 if (IS_DIRSYNC(inode)) 576 if (IS_DIRSYNC(inode))
581 handle->h_sync = 1; 577 handle->h_sync = 1;
582 insert_inode_hash(inode); 578 if (insert_inode_locked(inode) < 0) {
579 err = -EINVAL;
580 goto fail_drop;
581 }
583 spin_lock(&sbi->s_next_gen_lock); 582 spin_lock(&sbi->s_next_gen_lock);
584 inode->i_generation = sbi->s_next_generation++; 583 inode->i_generation = sbi->s_next_generation++;
585 spin_unlock(&sbi->s_next_gen_lock); 584 spin_unlock(&sbi->s_next_gen_lock);
@@ -627,6 +626,7 @@ fail_drop:
627 DQUOT_DROP(inode); 626 DQUOT_DROP(inode);
628 inode->i_flags |= S_NOQUOTA; 627 inode->i_flags |= S_NOQUOTA;
629 inode->i_nlink = 0; 628 inode->i_nlink = 0;
629 unlock_new_inode(inode);
630 iput(inode); 630 iput(inode);
631 brelse(bitmap_bh); 631 brelse(bitmap_bh);
632 return ERR_PTR(err); 632 return ERR_PTR(err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f8424ad89971..5fa453b49a64 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -37,6 +37,7 @@
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h> 39#include <linux/fiemap.h>
40#include <linux/namei.h>
40#include "xattr.h" 41#include "xattr.h"
41#include "acl.h" 42#include "acl.h"
42 43
@@ -1160,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
1160 to = from + len; 1161 to = from + len;
1161 1162
1162retry: 1163retry:
1163 page = __grab_cache_page(mapping, index); 1164 page = grab_cache_page_write_begin(mapping, index, flags);
1164 if (!page) 1165 if (!page)
1165 return -ENOMEM; 1166 return -ENOMEM;
1166 *pagep = page; 1167 *pagep = page;
@@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2817 inode->i_op = &ext3_dir_inode_operations; 2818 inode->i_op = &ext3_dir_inode_operations;
2818 inode->i_fop = &ext3_dir_operations; 2819 inode->i_fop = &ext3_dir_operations;
2819 } else if (S_ISLNK(inode->i_mode)) { 2820 } else if (S_ISLNK(inode->i_mode)) {
2820 if (ext3_inode_is_fast_symlink(inode)) 2821 if (ext3_inode_is_fast_symlink(inode)) {
2821 inode->i_op = &ext3_fast_symlink_inode_operations; 2822 inode->i_op = &ext3_fast_symlink_inode_operations;
2822 else { 2823 nd_terminate_link(ei->i_data, inode->i_size,
2824 sizeof(ei->i_data) - 1);
2825 } else {
2823 inode->i_op = &ext3_symlink_inode_operations; 2826 inode->i_op = &ext3_symlink_inode_operations;
2824 ext3_set_aops(inode); 2827 ext3_set_aops(inode);
2825 } 2828 }
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8e..5e86ce9a86e0 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
53 goto flags_out; 53 goto flags_out;
54 } 54 }
55 55
56 if (!S_ISDIR(inode->i_mode)) 56 flags = ext3_mask_flags(inode->i_mode, flags);
57 flags &= ~EXT3_DIRSYNC_FL;
58 57
59 mutex_lock(&inode->i_mutex); 58 mutex_lock(&inode->i_mutex);
60 /* Is it quota file? Do not allow user to mess with it */ 59 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3e5edc92aa0b..69a3d19ca9fd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
@@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
368 goto fail; 364 goto fail;
369 } 365 }
370 hinfo->hash_version = root->info.hash_version; 366 hinfo->hash_version = root->info.hash_version;
367 if (hinfo->hash_version <= DX_HASH_TEA)
368 hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
371 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; 369 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
372 if (entry) 370 if (entry)
373 ext3fs_dirhash(entry->name, entry->len, hinfo); 371 ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
636 dir = dir_file->f_path.dentry->d_inode; 634 dir = dir_file->f_path.dentry->d_inode;
637 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { 635 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
638 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; 636 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
637 if (hinfo.hash_version <= DX_HASH_TEA)
638 hinfo.hash_version +=
639 EXT3_SB(dir->i_sb)->s_hash_unsigned;
639 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 640 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
640 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 641 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
641 start_hash, start_minor_hash); 642 start_hash, start_minor_hash);
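The dx_probe() and ext3_htree_fill_tree() hunks bump the hash version by s_hash_unsigned (0 or 3). This assumes the three *_UNSIGNED constants are numbered exactly three above their signed counterparts, matching the new enum ordering in hash.c. A toy sketch of the mapping:

#include <stdio.h>

enum { HASH_LEGACY, HASH_HALF_MD4, HASH_TEA,
       HASH_LEGACY_UNSIGNED, HASH_HALF_MD4_UNSIGNED, HASH_TEA_UNSIGNED };

int main(void)
{
        int s_hash_unsigned = 3;    /* set when the fs was created with
                                       unsigned-char hashing */

        for (int v = HASH_LEGACY; v <= HASH_TEA; v++)
                printf("version %d -> %d\n", v, v + s_hash_unsigned);
        return 0;
}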
@@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1156 u32 hash2; 1157 u32 hash2;
1157 struct dx_map_entry *map; 1158 struct dx_map_entry *map;
1158 char *data1 = (*bh)->b_data, *data2; 1159 char *data1 = (*bh)->b_data, *data2;
1159 unsigned split, move, size, i; 1160 unsigned split, move, size;
1160 struct ext3_dir_entry_2 *de = NULL, *de2; 1161 struct ext3_dir_entry_2 *de = NULL, *de2;
1161 int err = 0; 1162 int err = 0, i;
1162 1163
1163 bh2 = ext3_append (handle, dir, &newblock, &err); 1164 bh2 = ext3_append (handle, dir, &newblock, &err);
1164 if (!(bh2)) { 1165 if (!(bh2)) {
@@ -1398,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1398 1399
1399 /* Initialize as for dx_probe */ 1400 /* Initialize as for dx_probe */
1400 hinfo.hash_version = root->info.hash_version; 1401 hinfo.hash_version = root->info.hash_version;
1402 if (hinfo.hash_version <= DX_HASH_TEA)
1403 hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
1401 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 1404 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1402 ext3fs_dirhash(name, namelen, &hinfo); 1405 ext3fs_dirhash(name, namelen, &hinfo);
1403 frame = frames; 1406 frame = frames;
@@ -1652,9 +1655,11 @@ static int ext3_add_nondir(handle_t *handle,
1652 if (!err) { 1655 if (!err) {
1653 ext3_mark_inode_dirty(handle, inode); 1656 ext3_mark_inode_dirty(handle, inode);
1654 d_instantiate(dentry, inode); 1657 d_instantiate(dentry, inode);
1658 unlock_new_inode(inode);
1655 return 0; 1659 return 0;
1656 } 1660 }
1657 drop_nlink(inode); 1661 drop_nlink(inode);
1662 unlock_new_inode(inode);
1658 iput(inode); 1663 iput(inode);
1659 return err; 1664 return err;
1660} 1665}
@@ -1765,6 +1770,7 @@ retry:
1765 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1770 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1766 if (!dir_block) { 1771 if (!dir_block) {
1767 drop_nlink(inode); /* is this nlink == 0? */ 1772 drop_nlink(inode); /* is this nlink == 0? */
1773 unlock_new_inode(inode);
1768 ext3_mark_inode_dirty(handle, inode); 1774 ext3_mark_inode_dirty(handle, inode);
1769 iput (inode); 1775 iput (inode);
1770 goto out_stop; 1776 goto out_stop;
@@ -1792,6 +1798,7 @@ retry:
1792 err = ext3_add_entry (handle, dentry, inode); 1798 err = ext3_add_entry (handle, dentry, inode);
1793 if (err) { 1799 if (err) {
1794 inode->i_nlink = 0; 1800 inode->i_nlink = 0;
1801 unlock_new_inode(inode);
1795 ext3_mark_inode_dirty(handle, inode); 1802 ext3_mark_inode_dirty(handle, inode);
1796 iput (inode); 1803 iput (inode);
1797 goto out_stop; 1804 goto out_stop;
@@ -1800,6 +1807,7 @@ retry:
1800 ext3_update_dx_flag(dir); 1807 ext3_update_dx_flag(dir);
1801 ext3_mark_inode_dirty(handle, dir); 1808 ext3_mark_inode_dirty(handle, dir);
1802 d_instantiate(dentry, inode); 1809 d_instantiate(dentry, inode);
1810 unlock_new_inode(inode);
1803out_stop: 1811out_stop:
1804 ext3_journal_stop(handle); 1812 ext3_journal_stop(handle);
1805 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1813 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
@@ -2170,10 +2178,10 @@ retry:
2170 * We have a transaction open. All is sweetness. It also sets 2178 * We have a transaction open. All is sweetness. It also sets
2171 * i_size in generic_commit_write(). 2179 * i_size in generic_commit_write().
2172 */ 2180 */
2173 err = __page_symlink(inode, symname, l, 2181 err = __page_symlink(inode, symname, l, 1);
2174 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2175 if (err) { 2182 if (err) {
2176 drop_nlink(inode); 2183 drop_nlink(inode);
2184 unlock_new_inode(inode);
2177 ext3_mark_inode_dirty(handle, inode); 2185 ext3_mark_inode_dirty(handle, inode);
2178 iput (inode); 2186 iput (inode);
2179 goto out_stop; 2187 goto out_stop;
@@ -2221,7 +2229,14 @@ retry:
2221 inc_nlink(inode); 2229 inc_nlink(inode);
2222 atomic_inc(&inode->i_count); 2230 atomic_inc(&inode->i_count);
2223 2231
2224 err = ext3_add_nondir(handle, dentry, inode); 2232 err = ext3_add_entry(handle, dentry, inode);
2233 if (!err) {
2234 ext3_mark_inode_dirty(handle, inode);
2235 d_instantiate(dentry, inode);
2236 } else {
2237 drop_nlink(inode);
2238 iput(inode);
2239 }
2225 ext3_journal_stop(handle); 2240 ext3_journal_stop(handle);
2226 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 2241 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2227 goto retry; 2242 goto retry;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec1..b70d90e08a3c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48 unsigned long journal_devnum); 48 unsigned long journal_devnum);
49static int ext3_create_journal(struct super_block *, struct ext3_super_block *, 49static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
50 unsigned int); 50 unsigned int);
51static void ext3_commit_super (struct super_block * sb, 51static int ext3_commit_super(struct super_block *sb,
52 struct ext3_super_block * es, 52 struct ext3_super_block *es,
53 int sync); 53 int sync);
54static void ext3_mark_recovery_complete(struct super_block * sb, 54static void ext3_mark_recovery_complete(struct super_block * sb,
55 struct ext3_super_block * es); 55 struct ext3_super_block * es);
@@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
60 char nbuf[16]); 60 char nbuf[16]);
61static int ext3_remount (struct super_block * sb, int * flags, char * data); 61static int ext3_remount (struct super_block * sb, int * flags, char * data);
62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); 62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
63static void ext3_unlockfs(struct super_block *sb); 63static int ext3_unfreeze(struct super_block *sb);
64static void ext3_write_super (struct super_block * sb); 64static void ext3_write_super (struct super_block * sb);
65static void ext3_write_super_lockfs(struct super_block *sb); 65static int ext3_freeze(struct super_block *sb);
66 66
67/* 67/*
68 * Wrappers for journal_start/end. 68 * Wrappers for journal_start/end.
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
439 ext3_blkdev_remove(sbi); 439 ext3_blkdev_remove(sbi);
440 } 440 }
441 sb->s_fs_info = NULL; 441 sb->s_fs_info = NULL;
442 kfree(sbi->s_blockgroup_lock);
442 kfree(sbi); 443 kfree(sbi);
443 return; 444 return;
444} 445}
@@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
682 ext3_nfs_get_inode); 683 ext3_nfs_get_inode);
683} 684}
684 685
686/*
687 * Try to release metadata pages (indirect blocks, directories) which are
688 * mapped via the block device. Since these pages could have journal heads
689 * which would prevent try_to_free_buffers() from freeing them, we must use
690 * jbd layer's try_to_free_buffers() function to release them.
691 */
692static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
693 gfp_t wait)
694{
695 journal_t *journal = EXT3_SB(sb)->s_journal;
696
697 WARN_ON(PageChecked(page));
698 if (!page_has_buffers(page))
699 return 0;
700 if (journal)
701 return journal_try_to_free_buffers(journal, page,
702 wait & ~__GFP_WAIT);
703 return try_to_free_buffers(page);
704}
705
685#ifdef CONFIG_QUOTA 706#ifdef CONFIG_QUOTA
686#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") 707#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
687#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 708#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -713,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = {
713 .acquire_dquot = ext3_acquire_dquot, 734 .acquire_dquot = ext3_acquire_dquot,
714 .release_dquot = ext3_release_dquot, 735 .release_dquot = ext3_release_dquot,
715 .mark_dirty = ext3_mark_dquot_dirty, 736 .mark_dirty = ext3_mark_dquot_dirty,
716 .write_info = ext3_write_info 737 .write_info = ext3_write_info,
738 .alloc_dquot = dquot_alloc,
739 .destroy_dquot = dquot_destroy,
717}; 740};
718 741
719static struct quotactl_ops ext3_qctl_operations = { 742static struct quotactl_ops ext3_qctl_operations = {
@@ -736,8 +759,8 @@ static const struct super_operations ext3_sops = {
736 .put_super = ext3_put_super, 759 .put_super = ext3_put_super,
737 .write_super = ext3_write_super, 760 .write_super = ext3_write_super,
738 .sync_fs = ext3_sync_fs, 761 .sync_fs = ext3_sync_fs,
739 .write_super_lockfs = ext3_write_super_lockfs, 762 .freeze_fs = ext3_freeze,
740 .unlockfs = ext3_unlockfs, 763 .unfreeze_fs = ext3_unfreeze,
741 .statfs = ext3_statfs, 764 .statfs = ext3_statfs,
742 .remount_fs = ext3_remount, 765 .remount_fs = ext3_remount,
743 .clear_inode = ext3_clear_inode, 766 .clear_inode = ext3_clear_inode,
@@ -746,6 +769,7 @@ static const struct super_operations ext3_sops = {
746 .quota_read = ext3_quota_read, 769 .quota_read = ext3_quota_read,
747 .quota_write = ext3_quota_write, 770 .quota_write = ext3_quota_write,
748#endif 771#endif
772 .bdev_try_to_free_page = bdev_try_to_free_page,
749}; 773};
750 774
751static const struct export_operations ext3_export_ops = { 775static const struct export_operations ext3_export_ops = {
@@ -1035,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb,
1035 case Opt_grpjquota: 1059 case Opt_grpjquota:
1036 qtype = GRPQUOTA; 1060 qtype = GRPQUOTA;
1037set_qf_name: 1061set_qf_name:
1038 if ((sb_any_quota_enabled(sb) || 1062 if (sb_any_quota_loaded(sb) &&
1039 sb_any_quota_suspended(sb)) &&
1040 !sbi->s_qf_names[qtype]) { 1063 !sbi->s_qf_names[qtype]) {
1041 printk(KERN_ERR 1064 printk(KERN_ERR
1042 "EXT3-fs: Cannot change journaled " 1065 "EXT3-fs: Cannot change journaled "
@@ -1075,8 +1098,7 @@ set_qf_name:
1075 case Opt_offgrpjquota: 1098 case Opt_offgrpjquota:
1076 qtype = GRPQUOTA; 1099 qtype = GRPQUOTA;
1077clear_qf_name: 1100clear_qf_name:
1078 if ((sb_any_quota_enabled(sb) || 1101 if (sb_any_quota_loaded(sb) &&
1079 sb_any_quota_suspended(sb)) &&
1080 sbi->s_qf_names[qtype]) { 1102 sbi->s_qf_names[qtype]) {
1081 printk(KERN_ERR "EXT3-fs: Cannot change " 1103 printk(KERN_ERR "EXT3-fs: Cannot change "
1082 "journaled quota options when " 1104 "journaled quota options when "
@@ -1095,8 +1117,7 @@ clear_qf_name:
1095 case Opt_jqfmt_vfsv0: 1117 case Opt_jqfmt_vfsv0:
1096 qfmt = QFMT_VFS_V0; 1118 qfmt = QFMT_VFS_V0;
1097set_qf_format: 1119set_qf_format:
1098 if ((sb_any_quota_enabled(sb) || 1120 if (sb_any_quota_loaded(sb) &&
1099 sb_any_quota_suspended(sb)) &&
1100 sbi->s_jquota_fmt != qfmt) { 1121 sbi->s_jquota_fmt != qfmt) {
1101 printk(KERN_ERR "EXT3-fs: Cannot change " 1122 printk(KERN_ERR "EXT3-fs: Cannot change "
1102 "journaled quota options when " 1123 "journaled quota options when "
@@ -1115,8 +1136,7 @@ set_qf_format:
1115 set_opt(sbi->s_mount_opt, GRPQUOTA); 1136 set_opt(sbi->s_mount_opt, GRPQUOTA);
1116 break; 1137 break;
1117 case Opt_noquota: 1138 case Opt_noquota:
1118 if (sb_any_quota_enabled(sb) || 1139 if (sb_any_quota_loaded(sb)) {
1119 sb_any_quota_suspended(sb)) {
1120 printk(KERN_ERR "EXT3-fs: Cannot change quota " 1140 printk(KERN_ERR "EXT3-fs: Cannot change quota "
1121 "options when quota turned on.\n"); 1141 "options when quota turned on.\n");
1122 return 0; 1142 return 0;
@@ -1548,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1548 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 1568 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1549 if (!sbi) 1569 if (!sbi)
1550 return -ENOMEM; 1570 return -ENOMEM;
1571
1572 sbi->s_blockgroup_lock =
1573 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
1574 if (!sbi->s_blockgroup_lock) {
1575 kfree(sbi);
1576 return -ENOMEM;
1577 }
1551 sb->s_fs_info = sbi; 1578 sb->s_fs_info = sbi;
1552 sbi->s_mount_opt = 0; 1579 sbi->s_mount_opt = 0;
1553 sbi->s_resuid = EXT3_DEF_RESUID; 1580 sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1744,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1744 for (i=0; i < 4; i++) 1771 for (i=0; i < 4; i++)
1745 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 1772 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1746 sbi->s_def_hash_version = es->s_def_hash_version; 1773 sbi->s_def_hash_version = es->s_def_hash_version;
1774 i = le32_to_cpu(es->s_flags);
1775 if (i & EXT2_FLAGS_UNSIGNED_HASH)
1776 sbi->s_hash_unsigned = 3;
1777 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
1778#ifdef __CHAR_UNSIGNED__
1779 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
1780 sbi->s_hash_unsigned = 3;
1781#else
1782 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
1783#endif
1784 sb->s_dirt = 1;
1785 }
1747 1786
1748 if (sbi->s_blocks_per_group > blocksize * 8) { 1787 if (sbi->s_blocks_per_group > blocksize * 8) {
1749 printk (KERN_ERR 1788 printk (KERN_ERR
@@ -1788,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1788 goto failed_mount; 1827 goto failed_mount;
1789 } 1828 }
1790 1829
1791 bgl_lock_init(&sbi->s_blockgroup_lock); 1830 bgl_lock_init(sbi->s_blockgroup_lock);
1792 1831
1793 for (i = 0; i < db_count; i++) { 1832 for (i = 0; i < db_count; i++) {
1794 block = descriptor_loc(sb, logic_sb_block, i); 1833 block = descriptor_loc(sb, logic_sb_block, i);
@@ -2272,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb,
2272 return 0; 2311 return 0;
2273} 2312}
2274 2313
2275static void ext3_commit_super (struct super_block * sb, 2314static int ext3_commit_super(struct super_block *sb,
2276 struct ext3_super_block * es, 2315 struct ext3_super_block *es,
2277 int sync) 2316 int sync)
2278{ 2317{
2279 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; 2318 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
2319 int error = 0;
2280 2320
2281 if (!sbh) 2321 if (!sbh)
2282 return; 2322 return error;
2283 es->s_wtime = cpu_to_le32(get_seconds()); 2323 es->s_wtime = cpu_to_le32(get_seconds());
2284 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); 2324 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
2285 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2325 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2286 BUFFER_TRACE(sbh, "marking dirty"); 2326 BUFFER_TRACE(sbh, "marking dirty");
2287 mark_buffer_dirty(sbh); 2327 mark_buffer_dirty(sbh);
2288 if (sync) 2328 if (sync)
2289 sync_dirty_buffer(sbh); 2329 error = sync_dirty_buffer(sbh);
2330 return error;
2290} 2331}
2291 2332
2292 2333
@@ -2400,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
2400 * LVM calls this function before a (read-only) snapshot is created. This 2441 * LVM calls this function before a (read-only) snapshot is created. This
2401 * gives us a chance to flush the journal completely and mark the fs clean. 2442 * gives us a chance to flush the journal completely and mark the fs clean.
2402 */ 2443 */
2403static void ext3_write_super_lockfs(struct super_block *sb) 2444static int ext3_freeze(struct super_block *sb)
2404{ 2445{
2446 int error = 0;
2447 journal_t *journal;
2405 sb->s_dirt = 0; 2448 sb->s_dirt = 0;
2406 2449
2407 if (!(sb->s_flags & MS_RDONLY)) { 2450 if (!(sb->s_flags & MS_RDONLY)) {
2408 journal_t *journal = EXT3_SB(sb)->s_journal; 2451 journal = EXT3_SB(sb)->s_journal;
2409 2452
2410 /* Now we set up the journal barrier. */ 2453 /* Now we set up the journal barrier. */
2411 journal_lock_updates(journal); 2454 journal_lock_updates(journal);
@@ -2414,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb)
2414 * We don't want to clear needs_recovery flag when we failed 2457 * We don't want to clear needs_recovery flag when we failed
2415 * to flush the journal. 2458 * to flush the journal.
2416 */ 2459 */
2417 if (journal_flush(journal) < 0) 2460 error = journal_flush(journal);
2418 return; 2461 if (error < 0)
2462 goto out;
2419 2463
2420 /* Journal blocked and flushed, clear needs_recovery flag. */ 2464 /* Journal blocked and flushed, clear needs_recovery flag. */
2421 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2465 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2422 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); 2466 error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2467 if (error)
2468 goto out;
2423 } 2469 }
2470 return 0;
2471
2472out:
2473 journal_unlock_updates(journal);
2474 return error;
2424} 2475}
2425 2476
2426/* 2477/*
2427 * Called by LVM after the snapshot is done. We need to reset the RECOVER 2478 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2428 * flag here, even though the filesystem is not technically dirty yet. 2479 * flag here, even though the filesystem is not technically dirty yet.
2429 */ 2480 */
2430static void ext3_unlockfs(struct super_block *sb) 2481static int ext3_unfreeze(struct super_block *sb)
2431{ 2482{
2432 if (!(sb->s_flags & MS_RDONLY)) { 2483 if (!(sb->s_flags & MS_RDONLY)) {
2433 lock_super(sb); 2484 lock_super(sb);
@@ -2437,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb)
2437 unlock_super(sb); 2488 unlock_super(sb);
2438 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2489 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2439 } 2490 }
2491 return 0;
2440} 2492}
2441 2493
2442static int ext3_remount (struct super_block * sb, int * flags, char * data) 2494static int ext3_remount (struct super_block * sb, int * flags, char * data)
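The old write_super_lockfs/unlockfs hooks returned void, so a failed journal flush could leave the filesystem frozen with updates still blocked. The renamed freeze_fs/unfreeze_fs hooks return an error, and ext3_freeze() now unblocks journal updates on any failure path. A minimal sketch of that contract (names illustrative, -5 simulating -EIO):

#include <stdio.h>

static int journal_locked;

static void journal_lock(void)       { journal_locked = 1; }
static void journal_unlock(void)     { journal_locked = 0; }
static int  journal_flush_fail(void) { return -5; /* simulated -EIO */ }

static int fs_freeze(void)
{
        int error;

        journal_lock();                 /* block new transactions */
        error = journal_flush_fail();
        if (error < 0)
                goto out;               /* propagate instead of bare return */
        return 0;
out:
        journal_unlock();               /* don't leave updates blocked */
        return error;
}

int main(void)
{
        printf("freeze=%d journal_locked=%d\n", fs_freeze(), journal_locked);
        return 0;
}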
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683b..6bba06b09dd1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "group.h" 22#include "group.h"
23#include "mballoc.h"
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
100 * essentially implementing a per-group read-only flag. */ 101 * essentially implementing a per-group read-only flag. */
101 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 102 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
102 ext4_error(sb, __func__, 103 ext4_error(sb, __func__,
103 "Checksum bad for group %lu\n", block_group); 104 "Checksum bad for group %u", block_group);
104 gdp->bg_free_blocks_count = 0; 105 ext4_free_blks_set(sb, gdp, 0);
105 gdp->bg_free_inodes_count = 0; 106 ext4_free_inodes_set(sb, gdp, 0);
106 gdp->bg_itable_unused = 0; 107 ext4_itable_unused_set(sb, gdp, 0);
107 memset(bh->b_data, 0xff, sb->s_blocksize); 108 memset(bh->b_data, 0xff, sb->s_blocksize);
108 return 0; 109 return 0;
109 } 110 }
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
205 ext4_group_t block_group, 206 ext4_group_t block_group,
206 struct buffer_head **bh) 207 struct buffer_head **bh)
207{ 208{
208 unsigned long group_desc; 209 unsigned int group_desc;
209 unsigned long offset; 210 unsigned int offset;
210 struct ext4_group_desc *desc; 211 struct ext4_group_desc *desc;
211 struct ext4_sb_info *sbi = EXT4_SB(sb); 212 struct ext4_sb_info *sbi = EXT4_SB(sb);
212 213
213 if (block_group >= sbi->s_groups_count) { 214 if (block_group >= sbi->s_groups_count) {
214 ext4_error(sb, "ext4_get_group_desc", 215 ext4_error(sb, "ext4_get_group_desc",
215 "block_group >= groups_count - " 216 "block_group >= groups_count - "
216 "block_group = %lu, groups_count = %lu", 217 "block_group = %u, groups_count = %u",
217 block_group, sbi->s_groups_count); 218 block_group, sbi->s_groups_count);
218 219
219 return NULL; 220 return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
225 if (!sbi->s_group_desc[group_desc]) { 226 if (!sbi->s_group_desc[group_desc]) {
226 ext4_error(sb, "ext4_get_group_desc", 227 ext4_error(sb, "ext4_get_group_desc",
227 "Group descriptor not loaded - " 228 "Group descriptor not loaded - "
228 "block_group = %lu, group_desc = %lu, desc = %lu", 229 "block_group = %u, group_desc = %u, desc = %u",
229 block_group, group_desc, offset); 230 block_group, group_desc, offset);
230 return NULL; 231 return NULL;
231 } 232 }
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
315 if (unlikely(!bh)) { 316 if (unlikely(!bh)) {
316 ext4_error(sb, __func__, 317 ext4_error(sb, __func__,
317 "Cannot read block bitmap - " 318 "Cannot read block bitmap - "
318 "block_group = %lu, block_bitmap = %llu", 319 "block_group = %u, block_bitmap = %llu",
319 block_group, bitmap_blk); 320 block_group, bitmap_blk);
320 return NULL; 321 return NULL;
321 } 322 }
322 if (buffer_uptodate(bh) && 323
323 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) 324 if (bitmap_uptodate(bh))
324 return bh; 325 return bh;
325 326
326 lock_buffer(bh); 327 lock_buffer(bh);
328 if (bitmap_uptodate(bh)) {
329 unlock_buffer(bh);
330 return bh;
331 }
327 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 332 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
328 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 333 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
329 ext4_init_block_bitmap(sb, bh, block_group, desc); 334 ext4_init_block_bitmap(sb, bh, block_group, desc);
335 set_bitmap_uptodate(bh);
330 set_buffer_uptodate(bh); 336 set_buffer_uptodate(bh);
331 unlock_buffer(bh);
332 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 337 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
338 unlock_buffer(bh);
333 return bh; 339 return bh;
334 } 340 }
335 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 341 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
342 if (buffer_uptodate(bh)) {
343 /*
344 * if not uninit if bh is uptodate,
345 * bitmap is also uptodate
346 */
347 set_bitmap_uptodate(bh);
348 unlock_buffer(bh);
349 return bh;
350 }
351 /*
352 * submit the buffer_head for read. We can
353 * safely mark the bitmap as uptodate now.
354 * We do it here so the bitmap uptodate bit
 355 * gets set with the buffer lock held.
356 */
357 set_bitmap_uptodate(bh);
336 if (bh_submit_read(bh) < 0) { 358 if (bh_submit_read(bh) < 0) {
337 put_bh(bh); 359 put_bh(bh);
338 ext4_error(sb, __func__, 360 ext4_error(sb, __func__,
339 "Cannot read block bitmap - " 361 "Cannot read block bitmap - "
340 "block_group = %lu, block_bitmap = %llu", 362 "block_group = %u, block_bitmap = %llu",
341 block_group, bitmap_blk); 363 block_group, bitmap_blk);
342 return NULL; 364 return NULL;
343 } 365 }
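ext4_read_block_bitmap() now tracks two conditions: buffer_uptodate (the bytes were read) and a new bitmap_uptodate bit (the bitmap content is valid, possibly built in memory for an uninit group without any I/O), with the bitmap bit always set under the buffer lock. A user-space sketch of the flag dance, with booleans standing in for the buffer flags:

#include <stdio.h>
#include <stdbool.h>

struct bh { bool buffer_uptodate; bool bitmap_uptodate; bool locked; };

static struct bh *read_bitmap(struct bh *bh, bool group_uninit)
{
        if (bh->bitmap_uptodate)        /* fast path, no lock needed */
                return bh;
        bh->locked = true;              /* lock_buffer() */
        if (bh->bitmap_uptodate) {      /* re-check under the lock */
                bh->locked = false;
                return bh;
        }
        if (group_uninit) {             /* build the bitmap in memory */
                bh->bitmap_uptodate = bh->buffer_uptodate = true;
                bh->locked = false;
                return bh;
        }
        if (bh->buffer_uptodate) {      /* bytes already read from disk */
                bh->bitmap_uptodate = true;
                bh->locked = false;
                return bh;
        }
        bh->bitmap_uptodate = true;     /* set before submitting the read,
                                           while the buffer is locked */
        /* bh_submit_read(bh) would go here */
        bh->buffer_uptodate = true;
        bh->locked = false;
        return bh;
}

int main(void)
{
        struct bh bh = {0};

        read_bitmap(&bh, false);
        printf("buffer=%d bitmap=%d\n", bh.buffer_uptodate, bh.bitmap_uptodate);
        return 0;
}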
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
350} 372}
351 373
352/** 374/**
353 * ext4_free_blocks_sb() -- Free given blocks and update quota 375 * ext4_add_groupblocks() -- Add given blocks to an existing group
354 * @handle: handle to this transaction 376 * @handle: handle to this transaction
355 * @sb: super block 377 * @sb: super block
356 * @block: start physical block to free 378 * @block: start physical block to add to the block group
357 * @count: number of blocks to free 379 * @count: number of blocks to free
358 * @pdquot_freed_blocks: pointer to quota
359 * 380 *
360 * XXX This function is only used by the on-line resizing code, which 381 * This marks the blocks as free in the bitmap. We ask the
361 * should probably be fixed up to call the mballoc variant. There 382 * mballoc to reload the buddy after this by setting group
362 * this needs to be cleaned up later; in fact, I'm not convinced this 383 * EXT4_GROUP_INFO_NEED_INIT_BIT flag
363 * is 100% correct in the face of the mballoc code. The online resizing
364 * code needs to be fixed up to more tightly (and correctly) interlock
365 * with the mballoc code.
366 */ 384 */
367void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 385void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
368 ext4_fsblk_t block, unsigned long count, 386 ext4_fsblk_t block, unsigned long count)
369 unsigned long *pdquot_freed_blocks)
370{ 387{
371 struct buffer_head *bitmap_bh = NULL; 388 struct buffer_head *bitmap_bh = NULL;
372 struct buffer_head *gd_bh; 389 struct buffer_head *gd_bh;
373 ext4_group_t block_group; 390 ext4_group_t block_group;
374 ext4_grpblk_t bit; 391 ext4_grpblk_t bit;
375 unsigned long i; 392 unsigned int i;
376 unsigned long overflow;
377 struct ext4_group_desc *desc; 393 struct ext4_group_desc *desc;
378 struct ext4_super_block *es; 394 struct ext4_super_block *es;
379 struct ext4_sb_info *sbi; 395 struct ext4_sb_info *sbi;
380 int err = 0, ret; 396 int err = 0, ret, blk_free_count;
381 ext4_grpblk_t group_freed; 397 ext4_grpblk_t blocks_freed;
398 struct ext4_group_info *grp;
382 399
383 *pdquot_freed_blocks = 0;
384 sbi = EXT4_SB(sb); 400 sbi = EXT4_SB(sb);
385 es = sbi->s_es; 401 es = sbi->s_es;
386 if (block < le32_to_cpu(es->s_first_data_block) || 402 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
387 block + count < block ||
388 block + count > ext4_blocks_count(es)) {
389 ext4_error(sb, "ext4_free_blocks",
390 "Freeing blocks not in datazone - "
391 "block = %llu, count = %lu", block, count);
392 goto error_return;
393 }
394
395 ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
396 403
397do_more:
398 overflow = 0;
399 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 404 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
405 grp = ext4_get_group_info(sb, block_group);
400 /* 406 /*
401 * Check to see if we are freeing blocks across a group 407 * Check to see if we are freeing blocks across a group
402 * boundary. 408 * boundary.
403 */ 409 */
404 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 410 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
405 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 411 goto error_return;
406 count -= overflow;
407 } 412 }
408 brelse(bitmap_bh);
409 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 413 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
410 if (!bitmap_bh) 414 if (!bitmap_bh)
411 goto error_return; 415 goto error_return;
@@ -418,18 +422,17 @@ do_more:
418 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 422 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
419 in_range(block + count - 1, ext4_inode_table(sb, desc), 423 in_range(block + count - 1, ext4_inode_table(sb, desc),
420 sbi->s_itb_per_group)) { 424 sbi->s_itb_per_group)) {
421 ext4_error(sb, "ext4_free_blocks", 425 ext4_error(sb, __func__,
422 "Freeing blocks in system zones - " 426 "Adding blocks in system zones - "
423 "Block = %llu, count = %lu", 427 "Block = %llu, count = %lu",
424 block, count); 428 block, count);
425 goto error_return; 429 goto error_return;
426 } 430 }
427 431
428 /* 432 /*
429 * We are about to start releasing blocks in the bitmap, 433 * We are about to add blocks to the bitmap,
430 * so we need undo access. 434 * so we need undo access.
431 */ 435 */
432 /* @@@ check errors */
433 BUFFER_TRACE(bitmap_bh, "getting undo access"); 436 BUFFER_TRACE(bitmap_bh, "getting undo access");
434 err = ext4_journal_get_undo_access(handle, bitmap_bh); 437 err = ext4_journal_get_undo_access(handle, bitmap_bh);
435 if (err) 438 if (err)
@@ -444,107 +447,55 @@ do_more:
444 err = ext4_journal_get_write_access(handle, gd_bh); 447 err = ext4_journal_get_write_access(handle, gd_bh);
445 if (err) 448 if (err)
446 goto error_return; 449 goto error_return;
447 450 /*
448 jbd_lock_bh_state(bitmap_bh); 451 * make sure we don't allow a parallel init on other groups in the
449 452 * same buddy cache
450 for (i = 0, group_freed = 0; i < count; i++) { 453 */
451 /* 454 down_write(&grp->alloc_sem);
452 * An HJ special. This is expensive... 455 for (i = 0, blocks_freed = 0; i < count; i++) {
453 */
454#ifdef CONFIG_JBD2_DEBUG
455 jbd_unlock_bh_state(bitmap_bh);
456 {
457 struct buffer_head *debug_bh;
458 debug_bh = sb_find_get_block(sb, block + i);
459 if (debug_bh) {
460 BUFFER_TRACE(debug_bh, "Deleted!");
461 if (!bh2jh(bitmap_bh)->b_committed_data)
462 BUFFER_TRACE(debug_bh,
463 "No commited data in bitmap");
464 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
465 __brelse(debug_bh);
466 }
467 }
468 jbd_lock_bh_state(bitmap_bh);
469#endif
470 if (need_resched()) {
471 jbd_unlock_bh_state(bitmap_bh);
472 cond_resched();
473 jbd_lock_bh_state(bitmap_bh);
474 }
475 /* @@@ This prevents newly-allocated data from being
476 * freed and then reallocated within the same
477 * transaction.
478 *
479 * Ideally we would want to allow that to happen, but to
480 * do so requires making jbd2_journal_forget() capable of
481 * revoking the queued write of a data block, which
482 * implies blocking on the journal lock. *forget()
483 * cannot block due to truncate races.
484 *
485 * Eventually we can fix this by making jbd2_journal_forget()
486 * return a status indicating whether or not it was able
487 * to revoke the buffer. On successful revoke, it is
488 * safe not to set the allocation bit in the committed
489 * bitmap, because we know that there is no outstanding
490 * activity on the buffer any more and so it is safe to
491 * reallocate it.
492 */
493 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
494 J_ASSERT_BH(bitmap_bh,
495 bh2jh(bitmap_bh)->b_committed_data != NULL);
496 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
497 bh2jh(bitmap_bh)->b_committed_data);
498
499 /*
500 * We clear the bit in the bitmap after setting the committed
501 * data bit, because this is the reverse order to that which
502 * the allocator uses.
503 */
504 BUFFER_TRACE(bitmap_bh, "clear bit"); 456 BUFFER_TRACE(bitmap_bh, "clear bit");
505 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 457 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
506 bit + i, bitmap_bh->b_data)) { 458 bit + i, bitmap_bh->b_data)) {
507 jbd_unlock_bh_state(bitmap_bh);
508 ext4_error(sb, __func__, 459 ext4_error(sb, __func__,
509 "bit already cleared for block %llu", 460 "bit already cleared for block %llu",
510 (ext4_fsblk_t)(block + i)); 461 (ext4_fsblk_t)(block + i));
511 jbd_lock_bh_state(bitmap_bh);
512 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 462 BUFFER_TRACE(bitmap_bh, "bit already cleared");
513 } else { 463 } else {
514 group_freed++; 464 blocks_freed++;
515 } 465 }
516 } 466 }
517 jbd_unlock_bh_state(bitmap_bh);
518
519 spin_lock(sb_bgl_lock(sbi, block_group)); 467 spin_lock(sb_bgl_lock(sbi, block_group));
520 le16_add_cpu(&desc->bg_free_blocks_count, group_freed); 468 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
469 ext4_free_blks_set(sb, desc, blk_free_count);
521 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 470 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
522 spin_unlock(sb_bgl_lock(sbi, block_group)); 471 spin_unlock(sb_bgl_lock(sbi, block_group));
523 percpu_counter_add(&sbi->s_freeblocks_counter, count); 472 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
524 473
525 if (sbi->s_log_groups_per_flex) { 474 if (sbi->s_log_groups_per_flex) {
526 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 475 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
527 spin_lock(sb_bgl_lock(sbi, flex_group)); 476 spin_lock(sb_bgl_lock(sbi, flex_group));
528 sbi->s_flex_groups[flex_group].free_blocks += count; 477 sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
529 spin_unlock(sb_bgl_lock(sbi, flex_group)); 478 spin_unlock(sb_bgl_lock(sbi, flex_group));
530 } 479 }
480 /*
481 * request to reload the buddy with the
482 * new bitmap information
483 */
484 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
485 ext4_mb_update_group_info(grp, blocks_freed);
486 up_write(&grp->alloc_sem);
531 487
532 /* We dirtied the bitmap block */ 488 /* We dirtied the bitmap block */
533 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 489 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
534 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 490 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
535 491
536 /* And the group descriptor block */ 492 /* And the group descriptor block */
537 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 493 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
538 ret = ext4_journal_dirty_metadata(handle, gd_bh); 494 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
539 if (!err) err = ret; 495 if (!err)
540 *pdquot_freed_blocks += group_freed; 496 err = ret;
541
542 if (overflow && !err) {
543 block += count;
544 count = overflow;
545 goto do_more;
546 }
547 sb->s_dirt = 1; 497 sb->s_dirt = 1;
498
548error_return: 499error_return:
549 brelse(bitmap_bh); 500 brelse(bitmap_bh);
550 ext4_std_error(sb, err); 501 ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
614 if (dirty_blocks < 0) { 565 if (dirty_blocks < 0) {
615 printk(KERN_CRIT "Dirty block accounting " 566 printk(KERN_CRIT "Dirty block accounting "
616 "went wrong %lld\n", 567 "went wrong %lld\n",
617 dirty_blocks); 568 (long long)dirty_blocks);
618 } 569 }
619 } 570 }
620 /* Check whether we have space after 571 /* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
666 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 617 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
667} 618}
668 619
669#define EXT4_META_BLOCK 0x1
670
671static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
672 ext4_lblk_t iblock, ext4_fsblk_t goal,
673 unsigned long *count, int *errp, int flags)
674{
675 struct ext4_allocation_request ar;
676 ext4_fsblk_t ret;
677
678 memset(&ar, 0, sizeof(ar));
679 /* Fill with neighbour allocated blocks */
680
681 ar.inode = inode;
682 ar.goal = goal;
683 ar.len = *count;
684 ar.logical = iblock;
685
686 if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
687 /* enable in-core preallocation for data block allocation */
688 ar.flags = EXT4_MB_HINT_DATA;
689 else
690 /* disable in-core preallocation for non-regular files */
691 ar.flags = 0;
692
693 ret = ext4_mb_new_blocks(handle, &ar, errp);
694 *count = ar.len;
695 return ret;
696}
697
698/* 620/*
699 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks 621 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
700 * 622 *
701 * @handle: handle to this transaction 623 * @handle: handle to this transaction
702 * @inode: file inode 624 * @inode: file inode
703 * @goal: given target block(filesystem wide) 625 * @goal: given target block(filesystem wide)
704 * @count: total number of blocks need 626 * @count: pointer to total number of blocks needed
705 * @errp: error code 627 * @errp: error code
706 * 628 *
707 * Return 1st allocated block numberon success, *count stores total account 629 * Return 1st allocated block number on success, *count stores total account
708 * error stores in errp pointer 630 * error stores in errp pointer
709 */ 631 */
710ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 632ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
711 ext4_fsblk_t goal, unsigned long *count, int *errp) 633 ext4_fsblk_t goal, unsigned long *count, int *errp)
712{ 634{
635 struct ext4_allocation_request ar;
713 ext4_fsblk_t ret; 636 ext4_fsblk_t ret;
714 ret = do_blk_alloc(handle, inode, 0, goal, 637
715 count, errp, EXT4_META_BLOCK); 638 memset(&ar, 0, sizeof(ar));
639 /* Fill with neighbour allocated blocks */
640 ar.inode = inode;
641 ar.goal = goal;
642 ar.len = count ? *count : 1;
643
644 ret = ext4_mb_new_blocks(handle, &ar, errp);
645 if (count)
646 *count = ar.len;
647
716 /* 648 /*
717 * Account for the allocated meta blocks 649 * Account for the allocated meta blocks
718 */ 650 */
719 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 651 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
720 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 652 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
721 EXT4_I(inode)->i_allocated_meta_blocks += *count; 653 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
722 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 654 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
723 } 655 }
724 return ret; 656 return ret;
725} 657}
726 658
727/*
728 * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
729 *
730 * @handle: handle to this transaction
731 * @inode: file inode
732 * @goal: given target block(filesystem wide)
733 * @errp: error code
734 *
735 * Return allocated block number on success
736 */
737ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
738 ext4_fsblk_t goal, int *errp)
739{
740 unsigned long count = 1;
741 return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
742}
743
744/*
745 * ext4_new_blocks() -- allocate data blocks
746 *
747 * @handle: handle to this transaction
748 * @inode: file inode
749 * @goal: given target block(filesystem wide)
750 * @count: total number of blocks need
751 * @errp: error code
752 *
753 * Return 1st allocated block numberon success, *count stores total account
754 * error stores in errp pointer
755 */
756
757ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
758 ext4_lblk_t iblock, ext4_fsblk_t goal,
759 unsigned long *count, int *errp)
760{
761 return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
762}
763
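With do_blk_alloc(), ext4_new_meta_block() and ext4_new_blocks() gone, single-block metadata allocations go through ext4_new_meta_blocks() directly, passing NULL for count (which the new code above defaults to one block). A minimal caller sketch, with a hypothetical helper name:

	/* Allocate exactly one metadata block near "goal"; a NULL count
	 * makes ext4_new_meta_blocks() default ar.len to 1. */
	static ext4_fsblk_t alloc_one_meta_block(handle_t *handle,
						 struct inode *inode,
						 ext4_fsblk_t goal, int *errp)
	{
		return ext4_new_meta_blocks(handle, inode, goal, NULL, errp);
	}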
764/** 659/**
765 * ext4_count_free_blocks() -- count filesystem free blocks 660 * ext4_count_free_blocks() -- count filesystem free blocks
766 * @sb: superblock 661 * @sb: superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
776#ifdef EXT4FS_DEBUG 671#ifdef EXT4FS_DEBUG
777 struct ext4_super_block *es; 672 struct ext4_super_block *es;
778 ext4_fsblk_t bitmap_count; 673 ext4_fsblk_t bitmap_count;
779 unsigned long x; 674 unsigned int x;
780 struct buffer_head *bitmap_bh = NULL; 675 struct buffer_head *bitmap_bh = NULL;
781 676
782 es = EXT4_SB(sb)->s_es; 677 es = EXT4_SB(sb)->s_es;
@@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
796 continue; 691 continue;
797 692
798 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 693 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
799 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 694 printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
800 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 695 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
801 bitmap_count += x; 696 bitmap_count += x;
802 } 697 }
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
812 gdp = ext4_get_group_desc(sb, i, NULL); 707 gdp = ext4_get_group_desc(sb, i, NULL);
813 if (!gdp) 708 if (!gdp)
814 continue; 709 continue;
815 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 710 desc_count += ext4_free_blks_count(sb, gdp);
816 } 711 }
817 712
818 return desc_count; 713 return desc_count;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c190..fa3af81ac565 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) 18unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i, sum = 0;
21 unsigned long sum = 0;
22 21
23 if (!map) 22 if (!map)
24 return 0; 23 return 0;
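For reference, the counting loop itself (not shown in this hunk) sums the zero bits of each nibble using the table above; a sketch of that idea, assuming the usual buffer_head layout:

	for (i = 0; i < numchars; i++)
		/* e.g. byte 0xF0: low nibble 0x0 contributes nibblemap[0] = 4
		 * free bits, high nibble 0xF contributes nibblemap[15] = 0 */
		sum += nibblemap[map->b_data[i] & 0xf] +
		       nibblemap[(map->b_data[i] >> 4) & 0xf];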
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5a..2df2e40b01af 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 *de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head *bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned int offset)
68{ 68{
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error(dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%u, inode=%u, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
95 void *dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned int offset;
99 int i, stored; 99 int i, stored;
100 struct ext4_dir_entry_2 *de; 100 struct ext4_dir_entry_2 *de;
101 struct super_block *sb; 101 struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
405 sb = inode->i_sb; 405 sb = inode->i_sb;
406 406
407 if (!fname) { 407 if (!fname) {
408 printk(KERN_ERR "ext4: call_filldir: called with " 408 printk(KERN_ERR "EXT4-fs: call_filldir: called with "
409 "null fname?!?\n"); 409 "null fname?!?\n");
410 return 0; 410 return 0;
411 } 411 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c827024..c668e4377d76 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
19#include <linux/types.h> 19#include <linux/types.h>
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h>
22#include "ext4_i.h" 23#include "ext4_i.h"
23 24
24/* 25/*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
94 /* phys. block for ^^^ */ 95 /* phys. block for ^^^ */
95 ext4_fsblk_t pright; 96 ext4_fsblk_t pright;
96 /* how many blocks we want to allocate */ 97 /* how many blocks we want to allocate */
97 unsigned long len; 98 unsigned int len;
98 /* flags. see above EXT4_MB_HINT_* */ 99 /* flags. see above EXT4_MB_HINT_* */
99 unsigned long flags; 100 unsigned int flags;
100}; 101};
101 102
102/* 103/*
@@ -156,12 +157,12 @@ struct ext4_group_desc
156 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ 157 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */
157 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ 158 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
158 __le32 bg_inode_table_lo; /* Inodes table block */ 159 __le32 bg_inode_table_lo; /* Inodes table block */
159 __le16 bg_free_blocks_count; /* Free blocks count */ 160 __le16 bg_free_blocks_count_lo;/* Free blocks count */
160 __le16 bg_free_inodes_count; /* Free inodes count */ 161 __le16 bg_free_inodes_count_lo;/* Free inodes count */
161 __le16 bg_used_dirs_count; /* Directories count */ 162 __le16 bg_used_dirs_count_lo; /* Directories count */
162 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ 163 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
163 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ 164 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
164 __le16 bg_itable_unused; /* Unused inodes count */ 165 __le16 bg_itable_unused_lo; /* Unused inodes count */
165 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ 166 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
166 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ 167 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
167 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ 168 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
169 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ 170 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
170 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ 171 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
171 __le16 bg_used_dirs_count_hi; /* Directories count MSB */ 172 __le16 bg_used_dirs_count_hi; /* Directories count MSB */
172 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ 173 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
173 __u32 bg_reserved2[3]; 174 __u32 bg_reserved2[3];
174}; 175};
175 176
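The renamed _lo fields pair with the existing _hi fields, and the new ext4_free_blks_count()-style accessors declared later in this header combine the halves. A plausible shape for one of them, assuming the 64-bit descriptor-size check ext4 uses elsewhere:

	__u32 ext4_free_blks_count(struct super_block *sb,
				   struct ext4_group_desc *bg)
	{
		__u32 count = le16_to_cpu(bg->bg_free_blocks_count_lo);

		/* Wide descriptors carry the upper 16 bits in the _hi field. */
		if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
			count |= (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16;
		return count;
	}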
@@ -328,6 +329,7 @@ struct ext4_mount_options {
328 uid_t s_resuid; 329 uid_t s_resuid;
329 gid_t s_resgid; 330 gid_t s_resgid;
330 unsigned long s_commit_interval; 331 unsigned long s_commit_interval;
332 u32 s_min_batch_time, s_max_batch_time;
331#ifdef CONFIG_QUOTA 333#ifdef CONFIG_QUOTA
332 int s_jquota_fmt; 334 int s_jquota_fmt;
333 char *s_qf_names[MAXQUOTAS]; 335 char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do { \
534#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 536#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
535#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 537#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
536#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 538#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
537#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
726 */ 727 */
727 728
728#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
729 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) 730 ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
730#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
731 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) 732 ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
732#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 733#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
733 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) 734 ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
734#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
735 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 736 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
736#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 737#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
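The added "!= 0" matters because the masked value may have only a high bit set, and assigning the raw result to a narrower type silently drops it. An illustrative case (not from this diff):

	/* EXT4_FEATURE_INCOMPAT_FLEX_BG is 0x0200: with the old macro,
	 * storing the result in a char truncates 0x0200 to 0, wrongly
	 * reporting the feature absent; "!= 0" yields a clean 0/1 first. */
	char has_flex_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
					EXT4_FEATURE_INCOMPAT_FLEX_BG);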
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
806#define EXT4_DEFM_JMODE_WBACK 0x0060 807#define EXT4_DEFM_JMODE_WBACK 0x0060
807 808
808/* 809/*
810 * Default journal batch times
811 */
812#define EXT4_DEF_MIN_BATCH_TIME 0
813#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
814
815/*
809 * Structure of a directory entry 816 * Structure of a directory entry
810 */ 817 */
811#define EXT4_NAME_LEN 255 818#define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
891#define DX_HASH_LEGACY 0 898#define DX_HASH_LEGACY 0
892#define DX_HASH_HALF_MD4 1 899#define DX_HASH_HALF_MD4 1
893#define DX_HASH_TEA 2 900#define DX_HASH_TEA 2
901#define DX_HASH_LEGACY_UNSIGNED 3
902#define DX_HASH_HALF_MD4_UNSIGNED 4
903#define DX_HASH_TEA_UNSIGNED 5
894 904
895#ifdef __KERNEL__ 905#ifdef __KERNEL__
896 906
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
955#define ERR_BAD_DX_DIR -75000 965#define ERR_BAD_DX_DIR -75000
956 966
957void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 967void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
958 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 968 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
959 969
960extern struct proc_dir_entry *ext4_proc_root; 970extern struct proc_dir_entry *ext4_proc_root;
961 971
@@ -987,6 +997,9 @@ do { \
987# define ATTRIB_NORET __attribute__((noreturn)) 997# define ATTRIB_NORET __attribute__((noreturn))
988# define NORET_AND noreturn, 998# define NORET_AND noreturn,
989 999
1000/* bitmap.c */
1001extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
1002
990/* balloc.c */ 1003/* balloc.c */
991extern unsigned int ext4_block_group(struct super_block *sb, 1004extern unsigned int ext4_block_group(struct super_block *sb,
992 ext4_fsblk_t blocknr); 1005 ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
995extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); 1008extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
996extern unsigned long ext4_bg_num_gdb(struct super_block *sb, 1009extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
997 ext4_group_t group); 1010 ext4_group_t group);
998extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
999 ext4_fsblk_t goal, int *errp);
1000extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 1011extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1001 ext4_fsblk_t goal, unsigned long *count, int *errp); 1012 ext4_fsblk_t goal, unsigned long *count, int *errp);
1002extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1003 ext4_lblk_t iblock, ext4_fsblk_t goal,
1004 unsigned long *count, int *errp);
1005extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1013extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1006extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1014extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1007extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1015extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1008 ext4_fsblk_t block, unsigned long count, int metadata); 1016 ext4_fsblk_t block, unsigned long count, int metadata);
1009extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 1017extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1010 ext4_fsblk_t block, unsigned long count, 1018 ext4_fsblk_t block, unsigned long count);
1011 unsigned long *pdquot_freed_blocks);
1012extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1019extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
1013extern void ext4_check_blocks_bitmap(struct super_block *); 1020extern void ext4_check_blocks_bitmap(struct super_block *);
1014extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1021extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1019/* dir.c */ 1026/* dir.c */
1020extern int ext4_check_dir_entry(const char *, struct inode *, 1027extern int ext4_check_dir_entry(const char *, struct inode *,
1021 struct ext4_dir_entry_2 *, 1028 struct ext4_dir_entry_2 *,
1022 struct buffer_head *, unsigned long); 1029 struct buffer_head *, unsigned int);
1023extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1030extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1024 __u32 minor_hash, 1031 __u32 minor_hash,
1025 struct ext4_dir_entry_2 *dirent); 1032 struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1039extern unsigned long ext4_count_free_inodes(struct super_block *); 1046extern unsigned long ext4_count_free_inodes(struct super_block *);
1040extern unsigned long ext4_count_dirs(struct super_block *); 1047extern unsigned long ext4_count_dirs(struct super_block *);
1041extern void ext4_check_inodes_bitmap(struct super_block *); 1048extern void ext4_check_inodes_bitmap(struct super_block *);
1042extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1043 1049
1044/* mballoc.c */ 1050/* mballoc.c */
1045extern long ext4_mb_stats; 1051extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
1054extern void exit_ext4_mballoc(void); 1060extern void exit_ext4_mballoc(void);
1055extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1061extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1056 unsigned long, unsigned long, int, unsigned long *); 1062 unsigned long, unsigned long, int, unsigned long *);
1057extern int ext4_mb_add_more_groupinfo(struct super_block *sb, 1063extern int ext4_mb_add_groupinfo(struct super_block *sb,
1058 ext4_group_t i, struct ext4_group_desc *desc); 1064 ext4_group_t i, struct ext4_group_desc *desc);
1059extern void ext4_mb_update_group_info(struct ext4_group_info *grp, 1065extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1060 ext4_grpblk_t add); 1066 ext4_grpblk_t add);
1061 1067extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1062 1068extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1069 ext4_group_t, int);
1063/* inode.c */ 1070/* inode.c */
1064int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1071int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
1065 struct buffer_head *bh, ext4_fsblk_t blocknr); 1072 struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
1069 ext4_lblk_t, int, int *); 1076 ext4_lblk_t, int, int *);
1070int ext4_get_block(struct inode *inode, sector_t iblock, 1077int ext4_get_block(struct inode *inode, sector_t iblock,
1071 struct buffer_head *bh_result, int create); 1078 struct buffer_head *bh_result, int create);
1072int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1073 ext4_lblk_t iblock, unsigned long maxblocks,
1074 struct buffer_head *bh_result,
1075 int create, int extend_disksize);
1076 1079
1077extern struct inode *ext4_iget(struct super_block *, unsigned long); 1080extern struct inode *ext4_iget(struct super_block *, unsigned long);
1078extern int ext4_write_inode(struct inode *, int); 1081extern int ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1123 __attribute__ ((format (printf, 3, 4))); 1126 __attribute__ ((format (printf, 3, 4)));
1124extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1127extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1125 __attribute__ ((format (printf, 3, 4))); 1128 __attribute__ ((format (printf, 3, 4)));
1129extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
1130 const char *, const char *, ...)
1131 __attribute__ ((format (printf, 4, 5)));
1126extern void ext4_update_dynamic_rev(struct super_block *sb); 1132extern void ext4_update_dynamic_rev(struct super_block *sb);
1127extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1133extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1128 __u32 compat); 1134 __u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
1136 struct ext4_group_desc *bg); 1142 struct ext4_group_desc *bg);
1137extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, 1143extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
1138 struct ext4_group_desc *bg); 1144 struct ext4_group_desc *bg);
1145extern __u32 ext4_free_blks_count(struct super_block *sb,
1146 struct ext4_group_desc *bg);
1147extern __u32 ext4_free_inodes_count(struct super_block *sb,
1148 struct ext4_group_desc *bg);
1149extern __u32 ext4_used_dirs_count(struct super_block *sb,
1150 struct ext4_group_desc *bg);
1151extern __u32 ext4_itable_unused_count(struct super_block *sb,
1152 struct ext4_group_desc *bg);
1139extern void ext4_block_bitmap_set(struct super_block *sb, 1153extern void ext4_block_bitmap_set(struct super_block *sb,
1140 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1154 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1141extern void ext4_inode_bitmap_set(struct super_block *sb, 1155extern void ext4_inode_bitmap_set(struct super_block *sb,
1142 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1156 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1143extern void ext4_inode_table_set(struct super_block *sb, 1157extern void ext4_inode_table_set(struct super_block *sb,
1144 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1158 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1159extern void ext4_free_blks_set(struct super_block *sb,
1160 struct ext4_group_desc *bg, __u32 count);
1161extern void ext4_free_inodes_set(struct super_block *sb,
1162 struct ext4_group_desc *bg, __u32 count);
1163extern void ext4_used_dirs_set(struct super_block *sb,
1164 struct ext4_group_desc *bg, __u32 count);
1165extern void ext4_itable_unused_set(struct super_block *sb,
1166 struct ext4_group_desc *bg, __u32 count);
1145 1167
1146static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 1168static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
1147{ 1169{
@@ -1225,11 +1247,11 @@ do { \
1225} while (0) 1247} while (0)
1226 1248
1227#ifdef CONFIG_SMP 1249#ifdef CONFIG_SMP
1228/* Each CPU can accumulate FBC_BATCH blocks in its local 1250/* Each CPU can accumulate percpu_counter_batch blocks in its local
1229 * counters. So we need to make sure we have more free blocks 1251 * counters. So we need to make sure we have more free blocks
1230 * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. 1252 * than percpu_counter_batch * nr_cpu_ids, plus a 4x safety margin.
1231 */ 1253 */
1232#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) 1254#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
1233#else 1255#else
1234#define EXT4_FREEBLOCKS_WATERMARK 0 1256#define EXT4_FREEBLOCKS_WATERMARK 0
1235#endif 1257#endif
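As a worked example with assumed values: if percpu_counter_batch is 32 and nr_cpu_ids is 4, EXT4_FREEBLOCKS_WATERMARK = 4 * (32 * 4) = 512 free blocks. Below roughly that point, each CPU's unflushed per-CPU delta could hide up to a batch worth of blocks, so the cheap percpu estimate can no longer be trusted and the exact summed counter must be consulted instead.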
@@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1246 return ; 1268 return ;
1247} 1269}
1248 1270
1271struct ext4_group_info {
1272 unsigned long bb_state;
1273 struct rb_root bb_free_root;
1274 unsigned short bb_first_free;
1275 unsigned short bb_free;
1276 unsigned short bb_fragments;
1277 struct list_head bb_prealloc_list;
1278#ifdef DOUBLE_CHECK
1279 void *bb_bitmap;
1280#endif
1281 struct rw_semaphore alloc_sem;
1282 unsigned short bb_counters[];
1283};
1284
1285#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
1286#define EXT4_GROUP_INFO_LOCKED_BIT 1
1287
1288#define EXT4_MB_GRP_NEED_INIT(grp) \
1289 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1290
1291static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1292{
1293 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1294
1295 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1296}
1297
1298static inline void ext4_unlock_group(struct super_block *sb,
1299 ext4_group_t group)
1300{
1301 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1302
1303 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1304}
1305
1306static inline int ext4_is_group_locked(struct super_block *sb,
1307 ext4_group_t group)
1308{
1309 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1310
1311 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
1312 &(grinfo->bb_state));
1313}
1314
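Typical use of the new group-lock helpers, as a sketch (the accounting in the middle is illustrative):

	ext4_lock_group(sb, group);
	/* mutate this group's bitmap/buddy-derived state under the
	 * per-group bit spinlock rather than a filesystem-wide lock */
	ext4_unlock_group(sb, group);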
1249/* 1315/*
1250 * Inodes and files operations 1316 * Inodes and files operations
1251 */ 1317 */
@@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1271extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1337extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1272 int chunk); 1338 int chunk);
1273extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1339extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1274 ext4_lblk_t iblock, 1340 ext4_lblk_t iblock, unsigned int max_blocks,
1275 unsigned long max_blocks, struct buffer_head *bh_result, 1341 struct buffer_head *bh_result,
1276 int create, int extend_disksize); 1342 int create, int extend_disksize);
1277extern void ext4_ext_truncate(struct inode *); 1343extern void ext4_ext_truncate(struct inode *);
1278extern void ext4_ext_init(struct super_block *); 1344extern void ext4_ext_init(struct super_block *);
1279extern void ext4_ext_release(struct super_block *); 1345extern void ext4_ext_release(struct super_block *);
1280extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1346extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1281 loff_t len); 1347 loff_t len);
1282extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1348extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1283 sector_t block, unsigned long max_blocks, 1349 sector_t block, unsigned int max_blocks,
1284 struct buffer_head *bh, int create, 1350 struct buffer_head *bh, int create,
1285 int extend_disksize, int flag); 1351 int extend_disksize, int flag);
1352extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1353 __u64 start, __u64 len);
1354
1355/*
1356 * Add a new method to test whether block and inode bitmaps are properly
1357 * initialized. With uninit_bg, reading the block from disk is not enough
1358 * to mark the bitmap uptodate; we also need to zero out the bitmap.
1359 */
1360#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
1361
1362static inline int bitmap_uptodate(struct buffer_head *bh)
1363{
1364 return (buffer_uptodate(bh) &&
1365 test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
1366}
1367static inline void set_bitmap_uptodate(struct buffer_head *bh)
1368{
1369 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1370}
1371
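A sketch of the intended read path, under the assumption described in the comment above (the initialization step is illustrative):

	if (!bitmap_uptodate(bh)) {
		/* uninit_bg: the on-disk block may be stale, so the bitmap
		 * contents are constructed in memory rather than trusted */
		/* ... zero-fill / initialize bh->b_data here ... */
		set_bitmap_uptodate(bh);
	}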
1286#endif /* __KERNEL__ */ 1372#endif /* __KERNEL__ */
1287 1373
1288#endif /* _EXT4_H */ 1374#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0d..18cb67b2cbbc 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); 194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
195} 195}
196 196
197static inline void ext4_ext_tree_changed(struct inode *inode)
198{
199 EXT4_I(inode)->i_ext_generation++;
200}
201
202static inline void 197static inline void
203ext4_ext_invalidate_cache(struct inode *inode) 198ext4_ext_invalidate_cache(struct inode *inode)
204{ 199{
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d3..e69acc16f5c4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
31typedef __u32 ext4_lblk_t; 31typedef __u32 ext4_lblk_t;
32 32
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned int ext4_group_t;
35 35
36#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
37#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
100 */ 100 */
101 loff_t i_disksize; 101 loff_t i_disksize;
102 102
103 /* on-disk additional length */
104 __u16 i_extra_isize;
105
106 /* 103 /*
107 * i_data_sem is for serialising ext4_truncate() against 104 * i_data_sem is for serialising ext4_truncate() against
108 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's 105 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
117 struct inode vfs_inode; 114 struct inode vfs_inode;
118 struct jbd2_inode jinode; 115 struct jbd2_inode jinode;
119 116
120 unsigned long i_ext_generation;
121 struct ext4_ext_cache i_cached_extent; 117 struct ext4_ext_cache i_cached_extent;
122 /* 118 /*
123 * File creation time. Its function is same as that of 119 * File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
130 spinlock_t i_prealloc_lock; 126 spinlock_t i_prealloc_lock;
131 127
132 /* allocation reservation info for delalloc */ 128 /* allocation reservation info for delalloc */
133 unsigned long i_reserved_data_blocks; 129 unsigned int i_reserved_data_blocks;
134 unsigned long i_reserved_meta_blocks; 130 unsigned int i_reserved_meta_blocks;
135 unsigned long i_allocated_meta_blocks; 131 unsigned int i_allocated_meta_blocks;
136 unsigned short i_delalloc_reserved_flag; 132 unsigned short i_delalloc_reserved_flag;
133
134 /* on-disk additional length */
135 __u16 i_extra_isize;
136
137 spinlock_t i_block_reservation_lock; 137 spinlock_t i_block_reservation_lock;
138}; 138};
139 139
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2c..ad13a84644e1 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
7int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 7int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh) 8 struct buffer_head *bh)
9{ 9{
10 int err = jbd2_journal_get_undo_access(handle, bh); 10 int err = 0;
11 if (err) 11
12 ext4_journal_abort_handle(where, __func__, bh, handle, err); 12 if (ext4_handle_valid(handle)) {
13 err = jbd2_journal_get_undo_access(handle, bh);
14 if (err)
15 ext4_journal_abort_handle(where, __func__, bh,
16 handle, err);
17 }
13 return err; 18 return err;
14} 19}
15 20
16int __ext4_journal_get_write_access(const char *where, handle_t *handle, 21int __ext4_journal_get_write_access(const char *where, handle_t *handle,
17 struct buffer_head *bh) 22 struct buffer_head *bh)
18{ 23{
19 int err = jbd2_journal_get_write_access(handle, bh); 24 int err = 0;
20 if (err) 25
21 ext4_journal_abort_handle(where, __func__, bh, handle, err); 26 if (ext4_handle_valid(handle)) {
27 err = jbd2_journal_get_write_access(handle, bh);
28 if (err)
29 ext4_journal_abort_handle(where, __func__, bh,
30 handle, err);
31 }
22 return err; 32 return err;
23} 33}
24 34
25int __ext4_journal_forget(const char *where, handle_t *handle, 35int __ext4_journal_forget(const char *where, handle_t *handle,
26 struct buffer_head *bh) 36 struct buffer_head *bh)
27{ 37{
28 int err = jbd2_journal_forget(handle, bh); 38 int err = 0;
29 if (err) 39
30 ext4_journal_abort_handle(where, __func__, bh, handle, err); 40 if (ext4_handle_valid(handle)) {
41 err = jbd2_journal_forget(handle, bh);
42 if (err)
43 ext4_journal_abort_handle(where, __func__, bh,
44 handle, err);
45 }
31 return err; 46 return err;
32} 47}
33 48
34int __ext4_journal_revoke(const char *where, handle_t *handle, 49int __ext4_journal_revoke(const char *where, handle_t *handle,
35 ext4_fsblk_t blocknr, struct buffer_head *bh) 50 ext4_fsblk_t blocknr, struct buffer_head *bh)
36{ 51{
37 int err = jbd2_journal_revoke(handle, blocknr, bh); 52 int err = 0;
38 if (err) 53
39 ext4_journal_abort_handle(where, __func__, bh, handle, err); 54 if (ext4_handle_valid(handle)) {
55 err = jbd2_journal_revoke(handle, blocknr, bh);
56 if (err)
57 ext4_journal_abort_handle(where, __func__, bh,
58 handle, err);
59 }
40 return err; 60 return err;
41} 61}
42 62
43int __ext4_journal_get_create_access(const char *where, 63int __ext4_journal_get_create_access(const char *where,
44 handle_t *handle, struct buffer_head *bh) 64 handle_t *handle, struct buffer_head *bh)
45{ 65{
46 int err = jbd2_journal_get_create_access(handle, bh); 66 int err = 0;
47 if (err) 67
48 ext4_journal_abort_handle(where, __func__, bh, handle, err); 68 if (ext4_handle_valid(handle)) {
69 err = jbd2_journal_get_create_access(handle, bh);
70 if (err)
71 ext4_journal_abort_handle(where, __func__, bh,
72 handle, err);
73 }
49 return err; 74 return err;
50} 75}
51 76
52int __ext4_journal_dirty_metadata(const char *where, 77int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
53 handle_t *handle, struct buffer_head *bh) 78 struct inode *inode, struct buffer_head *bh)
54{ 79{
55 int err = jbd2_journal_dirty_metadata(handle, bh); 80 int err = 0;
56 if (err) 81
57 ext4_journal_abort_handle(where, __func__, bh, handle, err); 82 if (ext4_handle_valid(handle)) {
83 err = jbd2_journal_dirty_metadata(handle, bh);
84 if (err)
85 ext4_journal_abort_handle(where, __func__, bh,
86 handle, err);
87 } else {
88 mark_buffer_dirty(bh);
89 if (inode && inode_needs_sync(inode)) {
90 sync_dirty_buffer(bh);
91 if (buffer_req(bh) && !buffer_uptodate(bh)) {
92 ext4_error(inode->i_sb, __func__,
93 "IO error syncing inode, "
94 "inode=%lu, block=%llu",
95 inode->i_ino,
96 (unsigned long long) bh->b_blocknr);
97 err = -EIO;
98 }
99 }
100 }
58 return err; 101 return err;
59} 102}
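Passing the inode lets the handle-less branch honor inode_needs_sync(): without a journal the call degrades to an ordinary mark_buffer_dirty(), syncing immediately only for inodes that require it. Callers keep the same one-line pattern (sketch):

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;	/* journaled or not, errors surface here */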
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98b..be2f426f6805 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
32 * 5 levels of tree + root which are stored in the inode. */ 32 * 5 levels of tree + root which are stored in the inode. */
33 33
34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ 34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ 35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
36 || test_opt(sb, EXTENTS) ? 27U : 8U) 36 ? 27U : 8U)
37 37
38/* Extended attribute operations touch at most two data buffers, 38/* Extended attribute operations touch at most two data buffers,
39 * two bitmap buffers, and two group summaries, in addition to the inode 39 * two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122 * been done yet. 122 * been done yet.
123 */ 123 */
124 124
125static inline void ext4_journal_release_buffer(handle_t *handle,
126 struct buffer_head *bh)
127{
128 jbd2_journal_release_buffer(handle, bh);
129}
130
131void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, const char *err_fn,
132 struct buffer_head *bh, handle_t *handle, int err); 126 struct buffer_head *bh, handle_t *handle, int err);
133 127
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
146int __ext4_journal_get_create_access(const char *where, 140int __ext4_journal_get_create_access(const char *where,
147 handle_t *handle, struct buffer_head *bh); 141 handle_t *handle, struct buffer_head *bh);
148 142
149int __ext4_journal_dirty_metadata(const char *where, 143int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
150 handle_t *handle, struct buffer_head *bh); 144 struct inode *inode, struct buffer_head *bh);
151 145
152#define ext4_journal_get_undo_access(handle, bh) \ 146#define ext4_journal_get_undo_access(handle, bh) \
153 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 147 __ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
157 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 151 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
158#define ext4_journal_get_create_access(handle, bh) \ 152#define ext4_journal_get_create_access(handle, bh) \
159 __ext4_journal_get_create_access(__func__, (handle), (bh)) 153 __ext4_journal_get_create_access(__func__, (handle), (bh))
160#define ext4_journal_dirty_metadata(handle, bh) \
161 __ext4_journal_dirty_metadata(__func__, (handle), (bh))
162#define ext4_journal_forget(handle, bh) \ 154#define ext4_journal_forget(handle, bh) \
163 __ext4_journal_forget(__func__, (handle), (bh)) 155 __ext4_journal_forget(__func__, (handle), (bh))
156#define ext4_handle_dirty_metadata(handle, inode, bh) \
157 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
164 158
165handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 159handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
166int __ext4_journal_stop(const char *where, handle_t *handle); 160int __ext4_journal_stop(const char *where, handle_t *handle);
167 161
162#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
163
164static inline int ext4_handle_valid(handle_t *handle)
165{
166 if (handle == EXT4_NOJOURNAL_HANDLE)
167 return 0;
168 return 1;
169}
170
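How the sentinel is handed out is implied rather than shown in this hunk; presumably the journal-start path does something like the following when no journal is present (assumption, not part of this diff):

	if (EXT4_SB(sb)->s_journal == NULL) {
		/* no-journal mode: return the magic cookie so every
		 * ext4_handle_*() wrapper above becomes a no-op */
		return EXT4_NOJOURNAL_HANDLE;
	}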
171static inline void ext4_handle_sync(handle_t *handle)
172{
173 if (ext4_handle_valid(handle))
174 handle->h_sync = 1;
175}
176
177static inline void ext4_handle_release_buffer(handle_t *handle,
178 struct buffer_head *bh)
179{
180 if (ext4_handle_valid(handle))
181 jbd2_journal_release_buffer(handle, bh);
182}
183
184static inline int ext4_handle_is_aborted(handle_t *handle)
185{
186 if (ext4_handle_valid(handle))
187 return is_handle_aborted(handle);
188 return 0;
189}
190
191static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
192{
193 if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
194 return 0;
195 return 1;
196}
197
198static inline void ext4_journal_release_buffer(handle_t *handle,
199 struct buffer_head *bh)
200{
201 if (ext4_handle_valid(handle))
202 jbd2_journal_release_buffer(handle, bh);
203}
204
168static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 205static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
169{ 206{
170 return ext4_journal_start_sb(inode->i_sb, nblocks); 207 return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
180 217
181static inline int ext4_journal_extend(handle_t *handle, int nblocks) 218static inline int ext4_journal_extend(handle_t *handle, int nblocks)
182{ 219{
183 return jbd2_journal_extend(handle, nblocks); 220 if (ext4_handle_valid(handle))
221 return jbd2_journal_extend(handle, nblocks);
222 return 0;
184} 223}
185 224
186static inline int ext4_journal_restart(handle_t *handle, int nblocks) 225static inline int ext4_journal_restart(handle_t *handle, int nblocks)
187{ 226{
188 return jbd2_journal_restart(handle, nblocks); 227 if (ext4_handle_valid(handle))
228 return jbd2_journal_restart(handle, nblocks);
229 return 0;
189} 230}
190 231
191static inline int ext4_journal_blocks_per_page(struct inode *inode) 232static inline int ext4_journal_blocks_per_page(struct inode *inode)
192{ 233{
193 return jbd2_journal_blocks_per_page(inode); 234 if (EXT4_JOURNAL(inode) != NULL)
235 return jbd2_journal_blocks_per_page(inode);
236 return 0;
194} 237}
195 238
196static inline int ext4_journal_force_commit(journal_t *journal) 239static inline int ext4_journal_force_commit(journal_t *journal)
197{ 240{
198 return jbd2_journal_force_commit(journal); 241 if (journal)
242 return jbd2_journal_force_commit(journal);
243 return 0;
199} 244}
200 245
201static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 246static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
202{ 247{
203 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 248 if (ext4_handle_valid(handle))
249 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
250 return 0;
204} 251}
205 252
206/* super.c */ 253/* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
208 255
209static inline int ext4_should_journal_data(struct inode *inode) 256static inline int ext4_should_journal_data(struct inode *inode)
210{ 257{
258 if (EXT4_JOURNAL(inode) == NULL)
259 return 0;
211 if (!S_ISREG(inode->i_mode)) 260 if (!S_ISREG(inode->i_mode))
212 return 1; 261 return 1;
213 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 262 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
219 268
220static inline int ext4_should_order_data(struct inode *inode) 269static inline int ext4_should_order_data(struct inode *inode)
221{ 270{
271 if (EXT4_JOURNAL(inode) == NULL)
272 return 0;
222 if (!S_ISREG(inode->i_mode)) 273 if (!S_ISREG(inode->i_mode))
223 return 0; 274 return 0;
224 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 275 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
230 281
231static inline int ext4_should_writeback_data(struct inode *inode) 282static inline int ext4_should_writeback_data(struct inode *inode)
232{ 283{
284 if (EXT4_JOURNAL(inode) == NULL)
285 return 0;
233 if (!S_ISREG(inode->i_mode)) 286 if (!S_ISREG(inode->i_mode))
234 return 0; 287 return 0;
235 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 288 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df8..039b6ea1a042 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
57 u32 s_next_generation; 57 u32 s_next_generation;
58 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
59 int s_def_hash_version; 59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
60 struct percpu_counter s_freeblocks_counter; 61 struct percpu_counter s_freeblocks_counter;
61 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
62 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
73 struct journal_s *s_journal; 74 struct journal_s *s_journal;
74 struct list_head s_orphan; 75 struct list_head s_orphan;
75 unsigned long s_commit_interval; 76 unsigned long s_commit_interval;
77 u32 s_max_batch_time;
78 u32 s_min_batch_time;
76 struct block_device *journal_bdev; 79 struct block_device *journal_bdev;
77#ifdef CONFIG_JBD2_DEBUG 80#ifdef CONFIG_JBD2_DEBUG
78 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ 81 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
101 spinlock_t s_reserve_lock; 104 spinlock_t s_reserve_lock;
102 spinlock_t s_md_lock; 105 spinlock_t s_md_lock;
103 tid_t s_last_transaction; 106 tid_t s_last_transaction;
104 unsigned short *s_mb_offsets, *s_mb_maxs; 107 unsigned short *s_mb_offsets;
108 unsigned int *s_mb_maxs;
105 109
106 /* tunables */ 110 /* tunables */
107 unsigned long s_stripe; 111 unsigned long s_stripe;
@@ -146,4 +150,10 @@ struct ext4_sb_info {
146 struct flex_groups *s_flex_groups; 150 struct flex_groups *s_flex_groups;
147}; 151};
148 152
153static inline spinlock_t *
154sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
155{
156 return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
157}
158
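sb_bgl_lock() just resolves the per-group spinlock out of the shared blockgroup lock; usage follows the usual pattern (sketch):

	spin_lock(sb_bgl_lock(sbi, group));
	/* ... update this group's descriptor fields ... */
	spin_unlock(sb_bgl_lock(sbi, group));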
149#endif /* _EXT4_SB */ 159#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae66..54bf0623a9ae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
97{ 97{
98 int err; 98 int err;
99 99
100 if (!ext4_handle_valid(handle))
101 return 0;
100 if (handle->h_buffer_credits > needed) 102 if (handle->h_buffer_credits > needed)
101 return 0; 103 return 0;
102 err = ext4_journal_extend(handle, needed); 104 err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
134 int err; 136 int err;
135 if (path->p_bh) { 137 if (path->p_bh) {
136 /* path points to block */ 138 /* path points to block */
137 err = ext4_journal_dirty_metadata(handle, path->p_bh); 139 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
138 } else { 140 } else {
139 /* path points to leaf/index in inode body */ 141 /* path points to leaf/index in inode body */
140 err = ext4_mark_inode_dirty(handle, inode); 142 err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
191 ext4_fsblk_t goal, newblock; 193 ext4_fsblk_t goal, newblock;
192 194
193 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 195 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
194 newblock = ext4_new_meta_block(handle, inode, goal, err); 196 newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
195 return newblock; 197 return newblock;
196} 198}
197 199
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
780 set_buffer_uptodate(bh); 782 set_buffer_uptodate(bh);
781 unlock_buffer(bh); 783 unlock_buffer(bh);
782 784
783 err = ext4_journal_dirty_metadata(handle, bh); 785 err = ext4_handle_dirty_metadata(handle, inode, bh);
784 if (err) 786 if (err)
785 goto cleanup; 787 goto cleanup;
786 brelse(bh); 788 brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
859 set_buffer_uptodate(bh); 861 set_buffer_uptodate(bh);
860 unlock_buffer(bh); 862 unlock_buffer(bh);
861 863
862 err = ext4_journal_dirty_metadata(handle, bh); 864 err = ext4_handle_dirty_metadata(handle, inode, bh);
863 if (err) 865 if (err)
864 goto cleanup; 866 goto cleanup;
865 brelse(bh); 867 brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
955 set_buffer_uptodate(bh); 957 set_buffer_uptodate(bh);
956 unlock_buffer(bh); 958 unlock_buffer(bh);
957 959
958 err = ext4_journal_dirty_metadata(handle, bh); 960 err = ext4_handle_dirty_metadata(handle, inode, bh);
959 if (err) 961 if (err)
960 goto out; 962 goto out;
961 963
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1160 while (--depth >= 0) { 1162 while (--depth >= 0) {
1161 ix = path[depth].p_idx; 1163 ix = path[depth].p_idx;
1162 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1164 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1163 break; 1165 goto got_index;
1164 } 1166 }
1165 1167
1166 if (depth < 0) { 1168 /* we've gone up to the root and found no index to the right */
1167 /* we've gone up to the root and 1169 return 0;
1168 * found no index to the right */
1169 return 0;
1170 }
1171 1170
1171got_index:
1172 /* we've found index to the right, let's 1172 /* we've found index to the right, let's
1173 * follow it and find the closest allocated 1173 * follow it and find the closest allocated
1174 * block to the right */ 1174 * block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1201 *phys = ext_pblock(ex); 1201 *phys = ext_pblock(ex);
1202 put_bh(bh); 1202 put_bh(bh);
1203 return 0; 1203 return 0;
1204
1205} 1204}
1206 1205
1207/* 1206/*
@@ -1622,7 +1621,6 @@ cleanup:
1622 ext4_ext_drop_refs(npath); 1621 ext4_ext_drop_refs(npath);
1623 kfree(npath); 1622 kfree(npath);
1624 } 1623 }
1625 ext4_ext_tree_changed(inode);
1626 ext4_ext_invalidate_cache(inode); 1624 ext4_ext_invalidate_cache(inode);
1627 return err; 1625 return err;
1628} 1626}
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2233 } 2231 }
2234 } 2232 }
2235out: 2233out:
2236 ext4_ext_tree_changed(inode);
2237 ext4_ext_drop_refs(path); 2234 ext4_ext_drop_refs(path);
2238 kfree(path); 2235 kfree(path);
2239 ext4_journal_stop(handle); 2236 ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
2250 * possible initialization would be here 2247 * possible initialization would be here
2251 */ 2248 */
2252 2249
2253 if (test_opt(sb, EXTENTS)) { 2250 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2254 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2251 printk(KERN_INFO "EXT4-fs: file extents enabled");
2255#ifdef AGGRESSIVE_TEST 2252#ifdef AGGRESSIVE_TEST
2256 printk(", aggressive tests"); 2253 printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
2275 */ 2272 */
2276void ext4_ext_release(struct super_block *sb) 2273void ext4_ext_release(struct super_block *sb)
2277{ 2274{
2278 if (!test_opt(sb, EXTENTS)) 2275 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2279 return; 2276 return;
2280 2277
2281#ifdef EXTENTS_STATS 2278#ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2380 struct inode *inode, 2377 struct inode *inode,
2381 struct ext4_ext_path *path, 2378 struct ext4_ext_path *path,
2382 ext4_lblk_t iblock, 2379 ext4_lblk_t iblock,
2383 unsigned long max_blocks) 2380 unsigned int max_blocks)
2384{ 2381{
2385 struct ext4_extent *ex, newex, orig_ex; 2382 struct ext4_extent *ex, newex, orig_ex;
2386 struct ext4_extent *ex1 = NULL; 2383 struct ext4_extent *ex1 = NULL;
@@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2536 */ 2533 */
2537 newdepth = ext_depth(inode); 2534 newdepth = ext_depth(inode);
2538 /* 2535 /*
2539 * update the extent length after successfull insert of the 2536 * update the extent length after successful insert of the
2540 * split extent 2537 * split extent
2541 */ 2538 */
2542 orig_ex.ee_len = cpu_to_le16(ee_len - 2539 orig_ex.ee_len = cpu_to_le16(ee_len -
@@ -2678,26 +2675,26 @@ fix_extent_len:
2678 */ 2675 */
2679int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2676int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2680 ext4_lblk_t iblock, 2677 ext4_lblk_t iblock,
2681 unsigned long max_blocks, struct buffer_head *bh_result, 2678 unsigned int max_blocks, struct buffer_head *bh_result,
2682 int create, int extend_disksize) 2679 int create, int extend_disksize)
2683{ 2680{
2684 struct ext4_ext_path *path = NULL; 2681 struct ext4_ext_path *path = NULL;
2685 struct ext4_extent_header *eh; 2682 struct ext4_extent_header *eh;
2686 struct ext4_extent newex, *ex; 2683 struct ext4_extent newex, *ex;
2687 ext4_fsblk_t goal, newblock; 2684 ext4_fsblk_t newblock;
2688 int err = 0, depth, ret; 2685 int err = 0, depth, ret, cache_type;
2689 unsigned long allocated = 0; 2686 unsigned int allocated = 0;
2690 struct ext4_allocation_request ar; 2687 struct ext4_allocation_request ar;
2691 loff_t disksize; 2688 loff_t disksize;
2692 2689
2693 __clear_bit(BH_New, &bh_result->b_state); 2690 __clear_bit(BH_New, &bh_result->b_state);
2694 ext_debug("blocks %u/%lu requested for inode %u\n", 2691 ext_debug("blocks %u/%u requested for inode %u\n",
2695 iblock, max_blocks, inode->i_ino); 2692 iblock, max_blocks, inode->i_ino);
2696 2693
2697 /* check in cache */ 2694 /* check in cache */
2698 goal = ext4_ext_in_cache(inode, iblock, &newex); 2695 cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2699 if (goal) { 2696 if (cache_type) {
2700 if (goal == EXT4_EXT_CACHE_GAP) { 2697 if (cache_type == EXT4_EXT_CACHE_GAP) {
2701 if (!create) { 2698 if (!create) {
2702 /* 2699 /*
2703 * block isn't allocated yet and 2700 * block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2706 goto out2; 2703 goto out2;
2707 } 2704 }
2708 /* we should allocate requested block */ 2705 /* we should allocate requested block */
2709 } else if (goal == EXT4_EXT_CACHE_EXTENT) { 2706 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
2710 /* block is already allocated */ 2707 /* block is already allocated */
2711 newblock = iblock 2708 newblock = iblock
2712 - le32_to_cpu(newex.ee_block) 2709 - le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2854 if (!newblock) 2851 if (!newblock)
2855 goto out2; 2852 goto out2;
2856 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2853 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2857 goal, newblock, allocated); 2854 ar.goal, newblock, allocated);
2858 2855
2859 /* try to insert new extent into found leaf and return */ 2856 /* try to insert new extent into found leaf and return */
2860 ext4_ext_store_pblock(&newex, newblock); 2857 ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
2950 * transaction synchronous. 2947 * transaction synchronous.
2951 */ 2948 */
2952 if (IS_SYNC(inode)) 2949 if (IS_SYNC(inode))
2953 handle->h_sync = 1; 2950 ext4_handle_sync(handle);
2954 2951
2955out_stop: 2952out_stop:
2956 up_write(&EXT4_I(inode)->i_data_sem); 2953 up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3004 handle_t *handle; 3001 handle_t *handle;
3005 ext4_lblk_t block; 3002 ext4_lblk_t block;
3006 loff_t new_size; 3003 loff_t new_size;
3007 unsigned long max_blocks; 3004 unsigned int max_blocks;
3008 int ret = 0; 3005 int ret = 0;
3009 int ret2 = 0; 3006 int ret2 = 0;
3010 int retries = 0; 3007 int retries = 0;
@@ -3083,7 +3080,7 @@ retry:
3083/* 3080/*
3084 * Callback function called for each extent to gather FIEMAP information. 3081 * Callback function called for each extent to gather FIEMAP information.
3085 */ 3082 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3083static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3084 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data) 3085 void *data)
3089{ 3086{
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3152/* fiemap flags we can handle specified here */ 3149/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 3150#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154 3151
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) 3152static int ext4_xattr_fiemap(struct inode *inode,
3153 struct fiemap_extent_info *fieinfo)
3156{ 3154{
3157 __u64 physical = 0; 3155 __u64 physical = 0;
3158 __u64 length; 3156 __u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f7..f731cb545a03 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
146const struct file_operations ext4_file_operations = { 143const struct file_operations ext4_file_operations = {
147 .llseek = generic_file_llseek, 144 .llseek = generic_file_llseek,
148 .read = do_sync_read, 145 .read = do_sync_read,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3db..ac8f168c8ab4 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash(const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
68}
69
70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
71{
72 __u32 pad, val;
73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
49} 96}
50 97
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
52{ 99{
53 __u32 pad, val; 100 __u32 pad, val;
54 int i; 101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i = 0; i < len; i++) { 110 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
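
The split into _signed and _unsigned variants above exists because plain char is signed on x86 but unsigned on several architectures (ARM, PowerPC, s390), so any filename byte >= 0x80 used to hash differently depending on where the directory was created; the new DX_HASH_*_UNSIGNED variants let a filesystem record which convention its htrees were built with. A standalone userspace sketch of the divergence for a single byte:

	#include <stdio.h>

	/* one round of the legacy hash for byte c, as in dx_hack_hash_* above */
	static unsigned int round_one(int c)
	{
		unsigned int hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
		unsigned int hash = hash1 + (hash0 ^ (c * 7152373));

		if (hash & 0x80000000)
			hash -= 0x7fffffff;
		return hash << 1;
	}

	int main(void)
	{
		char b = '\xe9';	/* e.g. latin-1 e-acute in a name */

		/* signed char promotes to -23, unsigned char to 233 */
		printf("signed:   %08x\n", round_one((signed char) b));
		printf("unsigned: %08x\n", round_one((unsigned char) b));
		return 0;
	}
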
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 08cac9fcace2..4fb86a0061d0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 74 /* If checksum is bad mark all blocks and inodes in use to prevent 74 /* If checksum is bad mark all blocks and inodes in use to prevent
75 * allocation, essentially implementing a per-group read-only flag. */ 75 * allocation, essentially implementing a per-group read-only flag. */
76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
77 ext4_error(sb, __func__, "Checksum bad for group %lu\n", 77 ext4_error(sb, __func__, "Checksum bad for group %u",
78 block_group); 78 block_group);
79 gdp->bg_free_blocks_count = 0; 79 ext4_free_blks_set(sb, gdp, 0);
80 gdp->bg_free_inodes_count = 0; 80 ext4_free_inodes_set(sb, gdp, 0);
81 gdp->bg_itable_unused = 0; 81 ext4_itable_unused_set(sb, gdp, 0);
82 memset(bh->b_data, 0xff, sb->s_blocksize); 82 memset(bh->b_data, 0xff, sb->s_blocksize);
83 return 0; 83 return 0;
84 } 84 }
85 85
86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
88 bh->b_data); 88 bh->b_data);
89 89
90 return EXT4_INODES_PER_GROUP(sb); 90 return EXT4_INODES_PER_GROUP(sb);
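
The second argument to mark_bitmap_end() changes because the inode bitmap occupies exactly one block, so the padding of unusable bits has to run to the last bit of that block; EXT4_BLOCKS_PER_GROUP(sb) only coincides with that bound by accident. A worked instance of the new bound, assuming 4 KiB blocks and 8192 inodes per group:

	/* bits mark_bitmap_end() pads with 1s in a one-block inode bitmap */
	static unsigned int padded_bits(unsigned int blocksize,
					unsigned int inodes_per_group)
	{
		unsigned int bits_in_bitmap = blocksize * 8; /* 4096*8 = 32768 */

		/* bits [8192, 32767] get set so a bitmap scan can never
		 * return an inode past the end of the group */
		return bits_in_bitmap - inodes_per_group;    /* 24576 */
	}
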
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 if (unlikely(!bh)) { 111 if (unlikely(!bh)) {
112 ext4_error(sb, __func__, 112 ext4_error(sb, __func__,
113 "Cannot read inode bitmap - " 113 "Cannot read inode bitmap - "
114 "block_group = %lu, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (buffer_uptodate(bh) && 118 if (bitmap_uptodate(bh))
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
120 return bh; 119 return bh;
121 120
122 lock_buffer(bh); 121 lock_buffer(bh);
122 if (bitmap_uptodate(bh)) {
123 unlock_buffer(bh);
124 return bh;
125 }
123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 126 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
125 ext4_init_inode_bitmap(sb, bh, block_group, desc); 128 ext4_init_inode_bitmap(sb, bh, block_group, desc);
129 set_bitmap_uptodate(bh);
126 set_buffer_uptodate(bh); 130 set_buffer_uptodate(bh);
127 unlock_buffer(bh);
128 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
132 unlock_buffer(bh);
129 return bh; 133 return bh;
130 } 134 }
131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 135 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
136 if (buffer_uptodate(bh)) {
137 /*
 138 * if the group is not uninit and bh is uptodate,
 139 * the bitmap is also uptodate
140 */
141 set_bitmap_uptodate(bh);
142 unlock_buffer(bh);
143 return bh;
144 }
145 /*
146 * submit the buffer_head for read. We can
147 * safely mark the bitmap as uptodate now.
148 * We do it here so the bitmap uptodate bit
 149 * gets set with the buffer lock held.
150 */
151 set_bitmap_uptodate(bh);
132 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
133 put_bh(bh); 153 put_bh(bh);
134 ext4_error(sb, __func__, 154 ext4_error(sb, __func__,
135 "Cannot read inode bitmap - " 155 "Cannot read inode bitmap - "
136 "block_group = %lu, inode_bitmap = %llu", 156 "block_group = %u, inode_bitmap = %llu",
137 block_group, bitmap_blk); 157 block_group, bitmap_blk);
138 return NULL; 158 return NULL;
139 } 159 }
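
The rework above separates "the buffer was read" (buffer_uptodate) from "the bitmap contents are valid" (bitmap_uptodate, an ext4-private buffer_head state bit this series introduces in fs/ext4/ext4.h), and re-tests the latter under the buffer lock so two CPUs entering ext4_read_inode_bitmap() for an uninit group cannot both initialize it. Reduced to its skeleton, this is the classic re-check-under-lock pattern:

	#include <linux/buffer_head.h>

	/* sketch only: bitmap_uptodate()/set_bitmap_uptodate() are the
	 * ext4-private helpers used in the hunk above */
	static struct buffer_head *read_bitmap_once(struct buffer_head *bh)
	{
		if (bitmap_uptodate(bh))	/* unlocked fast path */
			return bh;
		lock_buffer(bh);
		if (bitmap_uptodate(bh)) {	/* lost the race; already done */
			unlock_buffer(bh);
			return bh;
		}
		/* ... initialize the bitmap or submit the read here ... */
		set_bitmap_uptodate(bh);	/* only ever set under the lock */
		unlock_buffer(bh);
		return bh;
	}
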
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
168 struct ext4_group_desc *gdp; 188 struct ext4_group_desc *gdp;
169 struct ext4_super_block *es; 189 struct ext4_super_block *es;
170 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
171 int fatal = 0, err; 191 int fatal = 0, err, count;
172 ext4_group_t flex_group; 192 ext4_group_t flex_group;
173 193
174 if (atomic_read(&inode->i_count) > 1) { 194 if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
190 210
191 ino = inode->i_ino; 211 ino = inode->i_ino;
192 ext4_debug("freeing inode %lu\n", ino); 212 ext4_debug("freeing inode %lu\n", ino);
213 trace_mark(ext4_free_inode,
214 "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
215 sb->s_id, inode->i_ino, inode->i_mode,
216 (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
217 (unsigned long long) inode->i_blocks);
193 218
194 /* 219 /*
195 * Note: we must free any quota before locking the superblock, 220 * Note: we must free any quota before locking the superblock,
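
trace_mark() is the kernel markers API of this era (<linux/marker.h>): a named instrumentation point with a printf-style format that costs almost nothing until a probe attaches. Roughly how a tracer would have hooked the marker added above; the signatures are from memory of 2.6.29, so treat this as a sketch:

	#include <linux/module.h>
	#include <linux/marker.h>

	static void probe_free_inode(void *probe_private, void *call_private,
				     const char *fmt, va_list *args)
	{
		/* pull the marker's arguments out of *args and log them */
	}

	static int __init hook_init(void)
	{
		/* the format string must match the marker byte for byte */
		return marker_probe_register("ext4_free_inode",
			"dev %s ino %lu mode %d uid %lu gid %lu blocks %llu",
			probe_free_inode, NULL);
	}
	module_init(hook_init);
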
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
236 261
237 if (gdp) { 262 if (gdp) {
238 spin_lock(sb_bgl_lock(sbi, block_group)); 263 spin_lock(sb_bgl_lock(sbi, block_group));
239 le16_add_cpu(&gdp->bg_free_inodes_count, 1); 264 count = ext4_free_inodes_count(sb, gdp) + 1;
240 if (is_directory) 265 ext4_free_inodes_set(sb, gdp, count);
241 le16_add_cpu(&gdp->bg_used_dirs_count, -1); 266 if (is_directory) {
267 count = ext4_used_dirs_count(sb, gdp) - 1;
268 ext4_used_dirs_set(sb, gdp, count);
269 }
242 gdp->bg_checksum = ext4_group_desc_csum(sbi, 270 gdp->bg_checksum = ext4_group_desc_csum(sbi,
243 block_group, gdp); 271 block_group, gdp);
244 spin_unlock(sb_bgl_lock(sbi, block_group)); 272 spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
253 spin_unlock(sb_bgl_lock(sbi, flex_group)); 281 spin_unlock(sb_bgl_lock(sbi, flex_group));
254 } 282 }
255 } 283 }
256 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
257 err = ext4_journal_dirty_metadata(handle, bh2); 285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
258 if (!fatal) fatal = err; 286 if (!fatal) fatal = err;
259 } 287 }
260 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); 288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
261 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
262 if (!fatal) 290 if (!fatal)
263 fatal = err; 291 fatal = err;
264 sb->s_dirt = 1; 292 sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
291 319
292 for (group = 0; group < ngroups; group++) { 320 for (group = 0; group < ngroups; group++) {
293 desc = ext4_get_group_desc(sb, group, NULL); 321 desc = ext4_get_group_desc(sb, group, NULL);
294 if (!desc || !desc->bg_free_inodes_count) 322 if (!desc || !ext4_free_inodes_count(sb, desc))
295 continue; 323 continue;
296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 324 if (ext4_free_inodes_count(sb, desc) < avefreei)
297 continue; 325 continue;
298 if (!best_desc || 326 if (!best_desc ||
299 (le16_to_cpu(desc->bg_free_blocks_count) > 327 (ext4_free_blks_count(sb, desc) >
300 le16_to_cpu(best_desc->bg_free_blocks_count))) { 328 ext4_free_blks_count(sb, best_desc))) {
301 *best_group = group; 329 *best_group = group;
302 best_desc = desc; 330 best_desc = desc;
303 ret = 0; 331 ret = 0;
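
The switch from raw le16_to_cpu(desc->bg_free_inodes_count) reads to accessors like ext4_free_inodes_count(sb, desc), here and in the hunks below, is not cosmetic: with 64-bit support the group descriptor's count fields gained _hi halves that only exist when the on-disk descriptor is at least 64 bytes, so every reader has to splice the halves together. My reconstruction of the accessor's shape (a sketch; the real definition lives with the other descriptor helpers):

	__u32 ext4_free_inodes_count(struct super_block *sb,
				     struct ext4_group_desc *bg)
	{
		__u32 count = le16_to_cpu(bg->bg_free_inodes_count_lo);

		/* the _hi word only exists with 64-byte-or-larger descriptors */
		if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
			count |= (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16;
		return count;
	}
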
@@ -369,7 +397,7 @@ found_flexbg:
369 for (i = best_flex * flex_size; i < ngroups && 397 for (i = best_flex * flex_size; i < ngroups &&
370 i < (best_flex + 1) * flex_size; i++) { 398 i < (best_flex + 1) * flex_size; i++) {
371 desc = ext4_get_group_desc(sb, i, &bh); 399 desc = ext4_get_group_desc(sb, i, &bh);
372 if (le16_to_cpu(desc->bg_free_inodes_count)) { 400 if (ext4_free_inodes_count(sb, desc)) {
373 *best_group = i; 401 *best_group = i;
374 goto out; 402 goto out;
375 } 403 }
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
443 for (i = 0; i < ngroups; i++) { 471 for (i = 0; i < ngroups; i++) {
444 grp = (parent_group + i) % ngroups; 472 grp = (parent_group + i) % ngroups;
445 desc = ext4_get_group_desc(sb, grp, NULL); 473 desc = ext4_get_group_desc(sb, grp, NULL);
446 if (!desc || !desc->bg_free_inodes_count) 474 if (!desc || !ext4_free_inodes_count(sb, desc))
447 continue; 475 continue;
448 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) 476 if (ext4_used_dirs_count(sb, desc) >= best_ndir)
449 continue; 477 continue;
450 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 478 if (ext4_free_inodes_count(sb, desc) < avefreei)
451 continue; 479 continue;
452 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) 480 if (ext4_free_blks_count(sb, desc) < avefreeb)
453 continue; 481 continue;
454 *group = grp; 482 *group = grp;
455 ret = 0; 483 ret = 0;
456 best_ndir = le16_to_cpu(desc->bg_used_dirs_count); 484 best_ndir = ext4_used_dirs_count(sb, desc);
457 } 485 }
458 if (ret == 0) 486 if (ret == 0)
459 return ret; 487 return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
479 for (i = 0; i < ngroups; i++) { 507 for (i = 0; i < ngroups; i++) {
480 *group = (parent_group + i) % ngroups; 508 *group = (parent_group + i) % ngroups;
481 desc = ext4_get_group_desc(sb, *group, NULL); 509 desc = ext4_get_group_desc(sb, *group, NULL);
482 if (!desc || !desc->bg_free_inodes_count) 510 if (!desc || !ext4_free_inodes_count(sb, desc))
483 continue; 511 continue;
484 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) 512 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
485 continue; 513 continue;
486 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) 514 if (ext4_free_inodes_count(sb, desc) < min_inodes)
487 continue; 515 continue;
488 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) 516 if (ext4_free_blks_count(sb, desc) < min_blocks)
489 continue; 517 continue;
490 return 0; 518 return 0;
491 } 519 }
@@ -494,8 +522,8 @@ fallback:
494 for (i = 0; i < ngroups; i++) { 522 for (i = 0; i < ngroups; i++) {
495 *group = (parent_group + i) % ngroups; 523 *group = (parent_group + i) % ngroups;
496 desc = ext4_get_group_desc(sb, *group, NULL); 524 desc = ext4_get_group_desc(sb, *group, NULL);
497 if (desc && desc->bg_free_inodes_count && 525 if (desc && ext4_free_inodes_count(sb, desc) &&
498 le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) 526 ext4_free_inodes_count(sb, desc) >= avefreei)
499 return 0; 527 return 0;
500 } 528 }
501 529
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
524 */ 552 */
525 *group = parent_group; 553 *group = parent_group;
526 desc = ext4_get_group_desc(sb, *group, NULL); 554 desc = ext4_get_group_desc(sb, *group, NULL);
527 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 555 if (desc && ext4_free_inodes_count(sb, desc) &&
528 le16_to_cpu(desc->bg_free_blocks_count)) 556 ext4_free_blks_count(sb, desc))
529 return 0; 557 return 0;
530 558
531 /* 559 /*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
548 if (*group >= ngroups) 576 if (*group >= ngroups)
549 *group -= ngroups; 577 *group -= ngroups;
550 desc = ext4_get_group_desc(sb, *group, NULL); 578 desc = ext4_get_group_desc(sb, *group, NULL);
551 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 579 if (desc && ext4_free_inodes_count(sb, desc) &&
552 le16_to_cpu(desc->bg_free_blocks_count)) 580 ext4_free_blks_count(sb, desc))
553 return 0; 581 return 0;
554 } 582 }
555 583
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
562 if (++*group >= ngroups) 590 if (++*group >= ngroups)
563 *group = 0; 591 *group = 0;
564 desc = ext4_get_group_desc(sb, *group, NULL); 592 desc = ext4_get_group_desc(sb, *group, NULL);
565 if (desc && le16_to_cpu(desc->bg_free_inodes_count)) 593 if (desc && ext4_free_inodes_count(sb, desc))
566 return 0; 594 return 0;
567 } 595 }
568 596
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
570} 598}
571 599
572/* 600/*
 601 * Claim the inode from the inode bitmap. If the group
 602 * is uninit we need to take the group's sb_bgl_lock
 603 * and clear the uninit flag. The inode bitmap update
 604 * and group desc uninit flag clear should be done
 605 * while holding sb_bgl_lock so that ext4_read_inode_bitmap
 606 * doesn't race with ext4_claim_inode.
607 */
608static int ext4_claim_inode(struct super_block *sb,
609 struct buffer_head *inode_bitmap_bh,
610 unsigned long ino, ext4_group_t group, int mode)
611{
612 int free = 0, retval = 0, count;
613 struct ext4_sb_info *sbi = EXT4_SB(sb);
614 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
615
616 spin_lock(sb_bgl_lock(sbi, group));
617 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
618 /* not a free inode */
619 retval = 1;
620 goto err_ret;
621 }
622 ino++;
623 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
624 ino > EXT4_INODES_PER_GROUP(sb)) {
625 spin_unlock(sb_bgl_lock(sbi, group));
626 ext4_error(sb, __func__,
627 "reserved inode or inode > inodes count - "
628 "block_group = %u, inode=%lu", group,
629 ino + group * EXT4_INODES_PER_GROUP(sb));
630 return 1;
631 }
632 /* If we didn't allocate from within the initialized part of the inode
633 * table then we need to initialize up to this inode. */
634 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
635
636 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
637 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
638 /* When marking the block group with
639 * ~EXT4_BG_INODE_UNINIT we don't want to depend
640 * on the value of bg_itable_unused even though
641 * mke2fs could have initialized the same for us.
642 * Instead we calculated the value below
643 */
644
645 free = 0;
646 } else {
647 free = EXT4_INODES_PER_GROUP(sb) -
648 ext4_itable_unused_count(sb, gdp);
649 }
650
651 /*
652 * Check the relative inode number against the last used
 653 * relative inode number in this group. If it is greater
 654 * we need to update the bg_itable_unused count.
655 *
656 */
657 if (ino > free)
658 ext4_itable_unused_set(sb, gdp,
659 (EXT4_INODES_PER_GROUP(sb) - ino));
660 }
661 count = ext4_free_inodes_count(sb, gdp) - 1;
662 ext4_free_inodes_set(sb, gdp, count);
663 if (S_ISDIR(mode)) {
664 count = ext4_used_dirs_count(sb, gdp) + 1;
665 ext4_used_dirs_set(sb, gdp, count);
666 }
667 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
668err_ret:
669 spin_unlock(sb_bgl_lock(sbi, group));
670 return retval;
671}
672
673/*
573 * There are two policies for allocating an inode. If the new inode is 674 * There are two policies for allocating an inode. If the new inode is
574 * a directory, then a forward search is made for a block group with both 675 * a directory, then a forward search is made for a block group with both
575 * free space and a low directory-to-inode ratio; if that fails, then of 676 * free space and a low directory-to-inode ratio; if that fails, then of
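
Note what sb_bgl_lock(sbi, group) in ext4_claim_inode() above actually returns: not a literal per-group lock but a slot in a small cacheline-padded spinlock table that all groups hash into (include/linux/blockgroup_lock.h), which bounds lock memory on filesystems with huge group counts. Approximately:

	#include <linux/spinlock.h>

	#define NR_BG_LOCKS	128	/* scales with NR_CPUS in the real header */

	struct bgl_lock {
		spinlock_t lock;
	} ____cacheline_aligned_in_smp;

	struct blockgroup_lock {
		struct bgl_lock locks[NR_BG_LOCKS];
	};

	static inline spinlock_t *bgl_lock_ptr(struct blockgroup_lock *bgl,
					       unsigned int block_group)
	{
		/* many groups share one lock; harmless, since collisions
		 * only cost contention, never correctness */
		return &bgl->locks[block_group & (NR_BG_LOCKS - 1)].lock;
	}
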
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) 683struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
583{ 684{
584 struct super_block *sb; 685 struct super_block *sb;
585 struct buffer_head *bitmap_bh = NULL; 686 struct buffer_head *inode_bitmap_bh = NULL;
586 struct buffer_head *bh2; 687 struct buffer_head *group_desc_bh;
587 ext4_group_t group = 0; 688 ext4_group_t group = 0;
588 unsigned long ino = 0; 689 unsigned long ino = 0;
589 struct inode *inode; 690 struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
602 return ERR_PTR(-EPERM); 703 return ERR_PTR(-EPERM);
603 704
604 sb = dir->i_sb; 705 sb = dir->i_sb;
706 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
707 dir->i_ino, mode);
605 inode = new_inode(sb); 708 inode = new_inode(sb);
606 if (!inode) 709 if (!inode)
607 return ERR_PTR(-ENOMEM); 710 return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
631 for (i = 0; i < sbi->s_groups_count; i++) { 734 for (i = 0; i < sbi->s_groups_count; i++) {
632 err = -EIO; 735 err = -EIO;
633 736
634 gdp = ext4_get_group_desc(sb, group, &bh2); 737 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
635 if (!gdp) 738 if (!gdp)
636 goto fail; 739 goto fail;
637 740
638 brelse(bitmap_bh); 741 brelse(inode_bitmap_bh);
639 bitmap_bh = ext4_read_inode_bitmap(sb, group); 742 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
640 if (!bitmap_bh) 743 if (!inode_bitmap_bh)
641 goto fail; 744 goto fail;
642 745
643 ino = 0; 746 ino = 0;
644 747
645repeat_in_this_group: 748repeat_in_this_group:
646 ino = ext4_find_next_zero_bit((unsigned long *) 749 ino = ext4_find_next_zero_bit((unsigned long *)
647 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); 750 inode_bitmap_bh->b_data,
751 EXT4_INODES_PER_GROUP(sb), ino);
752
648 if (ino < EXT4_INODES_PER_GROUP(sb)) { 753 if (ino < EXT4_INODES_PER_GROUP(sb)) {
649 754
650 BUFFER_TRACE(bitmap_bh, "get_write_access"); 755 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
651 err = ext4_journal_get_write_access(handle, bitmap_bh); 756 err = ext4_journal_get_write_access(handle,
757 inode_bitmap_bh);
652 if (err) 758 if (err)
653 goto fail; 759 goto fail;
654 760
655 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), 761 BUFFER_TRACE(group_desc_bh, "get_write_access");
656 ino, bitmap_bh->b_data)) { 762 err = ext4_journal_get_write_access(handle,
763 group_desc_bh);
764 if (err)
765 goto fail;
766 if (!ext4_claim_inode(sb, inode_bitmap_bh,
767 ino, group, mode)) {
657 /* we won it */ 768 /* we won it */
658 BUFFER_TRACE(bitmap_bh, 769 BUFFER_TRACE(inode_bitmap_bh,
659 "call ext4_journal_dirty_metadata"); 770 "call ext4_handle_dirty_metadata");
660 err = ext4_journal_dirty_metadata(handle, 771 err = ext4_handle_dirty_metadata(handle,
661 bitmap_bh); 772 inode,
773 inode_bitmap_bh);
662 if (err) 774 if (err)
663 goto fail; 775 goto fail;
 776 /* zero bit is inode number 1 */
777 ino++;
664 goto got; 778 goto got;
665 } 779 }
666 /* we lost it */ 780 /* we lost it */
667 jbd2_journal_release_buffer(handle, bitmap_bh); 781 ext4_handle_release_buffer(handle, inode_bitmap_bh);
782 ext4_handle_release_buffer(handle, group_desc_bh);
668 783
669 if (++ino < EXT4_INODES_PER_GROUP(sb)) 784 if (++ino < EXT4_INODES_PER_GROUP(sb))
670 goto repeat_in_this_group; 785 goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
684 goto out; 799 goto out;
685 800
686got: 801got:
687 ino++;
688 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
689 ino > EXT4_INODES_PER_GROUP(sb)) {
690 ext4_error(sb, __func__,
691 "reserved inode or inode > inodes count - "
692 "block_group = %lu, inode=%lu", group,
693 ino + group * EXT4_INODES_PER_GROUP(sb));
694 err = -EIO;
695 goto fail;
696 }
697
698 BUFFER_TRACE(bh2, "get_write_access");
699 err = ext4_journal_get_write_access(handle, bh2);
700 if (err) goto fail;
701
702 /* We may have to initialize the block bitmap if it isn't already */ 802 /* We may have to initialize the block bitmap if it isn't already */
703 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 803 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
704 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 804 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
705 struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); 805 struct buffer_head *block_bitmap_bh;
706 806
707 BUFFER_TRACE(block_bh, "get block bitmap access"); 807 block_bitmap_bh = ext4_read_block_bitmap(sb, group);
708 err = ext4_journal_get_write_access(handle, block_bh); 808 BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
809 err = ext4_journal_get_write_access(handle, block_bitmap_bh);
709 if (err) { 810 if (err) {
710 brelse(block_bh); 811 brelse(block_bitmap_bh);
711 goto fail; 812 goto fail;
712 } 813 }
713 814
@@ -715,9 +816,9 @@ got:
715 spin_lock(sb_bgl_lock(sbi, group)); 816 spin_lock(sb_bgl_lock(sbi, group));
716 /* recheck and clear flag under lock if we still need to */ 817 /* recheck and clear flag under lock if we still need to */
717 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 818 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
718 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
719 free = ext4_free_blocks_after_init(sb, group, gdp); 819 free = ext4_free_blocks_after_init(sb, group, gdp);
720 gdp->bg_free_blocks_count = cpu_to_le16(free); 820 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
821 ext4_free_blks_set(sb, gdp, free);
721 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 822 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
722 gdp); 823 gdp);
723 } 824 }
@@ -725,55 +826,19 @@ got:
725 826
726 /* Don't need to dirty bitmap block if we didn't change it */ 827 /* Don't need to dirty bitmap block if we didn't change it */
727 if (free) { 828 if (free) {
728 BUFFER_TRACE(block_bh, "dirty block bitmap"); 829 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
729 err = ext4_journal_dirty_metadata(handle, block_bh); 830 err = ext4_handle_dirty_metadata(handle,
831 NULL, block_bitmap_bh);
730 } 832 }
731 833
732 brelse(block_bh); 834 brelse(block_bitmap_bh);
733 if (err) 835 if (err)
734 goto fail; 836 goto fail;
735 } 837 }
736 838 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
737 spin_lock(sb_bgl_lock(sbi, group)); 839 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
738 /* If we didn't allocate from within the initialized part of the inode 840 if (err)
739 * table then we need to initialize up to this inode. */ 841 goto fail;
740 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
741 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
742 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
743
744 /* When marking the block group with
745 * ~EXT4_BG_INODE_UNINIT we don't want to depend
746 * on the value of bg_itable_unused even though
747 * mke2fs could have initialized the same for us.
748 * Instead we calculated the value below
749 */
750
751 free = 0;
752 } else {
753 free = EXT4_INODES_PER_GROUP(sb) -
754 le16_to_cpu(gdp->bg_itable_unused);
755 }
756
757 /*
758 * Check the relative inode number against the last used
759 * relative inode number in this group. if it is greater
760 * we need to update the bg_itable_unused count
761 *
762 */
763 if (ino > free)
764 gdp->bg_itable_unused =
765 cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
766 }
767
768 le16_add_cpu(&gdp->bg_free_inodes_count, -1);
769 if (S_ISDIR(mode)) {
770 le16_add_cpu(&gdp->bg_used_dirs_count, 1);
771 }
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773 spin_unlock(sb_bgl_lock(sbi, group));
774 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
775 err = ext4_journal_dirty_metadata(handle, bh2);
776 if (err) goto fail;
777 842
778 percpu_counter_dec(&sbi->s_freeinodes_counter); 843 percpu_counter_dec(&sbi->s_freeinodes_counter);
779 if (S_ISDIR(mode)) 844 if (S_ISDIR(mode))
@@ -825,8 +890,11 @@ got:
825 890
826 ext4_set_inode_flags(inode); 891 ext4_set_inode_flags(inode);
827 if (IS_DIRSYNC(inode)) 892 if (IS_DIRSYNC(inode))
828 handle->h_sync = 1; 893 ext4_handle_sync(handle);
829 insert_inode_hash(inode); 894 if (insert_inode_locked(inode) < 0) {
895 err = -EINVAL;
896 goto fail_drop;
897 }
830 spin_lock(&sbi->s_next_gen_lock); 898 spin_lock(&sbi->s_next_gen_lock);
831 inode->i_generation = sbi->s_next_generation++; 899 inode->i_generation = sbi->s_next_generation++;
832 spin_unlock(&sbi->s_next_gen_lock); 900 spin_unlock(&sbi->s_next_gen_lock);
@@ -849,7 +917,7 @@ got:
849 if (err) 917 if (err)
850 goto fail_free_drop; 918 goto fail_free_drop;
851 919
852 if (test_opt(sb, EXTENTS)) { 920 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
853 /* set extent flag only for directory, file and normal symlink*/ 921 /* set extent flag only for directory, file and normal symlink*/
854 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 922 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
855 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 923 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -864,6 +932,8 @@ got:
864 } 932 }
865 933
866 ext4_debug("allocating inode %lu\n", inode->i_ino); 934 ext4_debug("allocating inode %lu\n", inode->i_ino);
935 trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
936 sb->s_id, inode->i_ino, dir->i_ino, mode);
867 goto really_out; 937 goto really_out;
868fail: 938fail:
869 ext4_std_error(sb, err); 939 ext4_std_error(sb, err);
@@ -871,7 +941,7 @@ out:
871 iput(inode); 941 iput(inode);
872 ret = ERR_PTR(err); 942 ret = ERR_PTR(err);
873really_out: 943really_out:
874 brelse(bitmap_bh); 944 brelse(inode_bitmap_bh);
875 return ret; 945 return ret;
876 946
877fail_free_drop: 947fail_free_drop:
@@ -881,8 +951,9 @@ fail_drop:
881 DQUOT_DROP(inode); 951 DQUOT_DROP(inode);
882 inode->i_flags |= S_NOQUOTA; 952 inode->i_flags |= S_NOQUOTA;
883 inode->i_nlink = 0; 953 inode->i_nlink = 0;
954 unlock_new_inode(inode);
884 iput(inode); 955 iput(inode);
885 brelse(bitmap_bh); 956 brelse(inode_bitmap_bh);
886 return ERR_PTR(err); 957 return ERR_PTR(err);
887} 958}
888 959
@@ -981,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
981 gdp = ext4_get_group_desc(sb, i, NULL); 1052 gdp = ext4_get_group_desc(sb, i, NULL);
982 if (!gdp) 1053 if (!gdp)
983 continue; 1054 continue;
984 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1055 desc_count += ext4_free_inodes_count(sb, gdp);
985 brelse(bitmap_bh); 1056 brelse(bitmap_bh);
986 bitmap_bh = ext4_read_inode_bitmap(sb, i); 1057 bitmap_bh = ext4_read_inode_bitmap(sb, i);
987 if (!bitmap_bh) 1058 if (!bitmap_bh)
@@ -989,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
989 1060
990 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1061 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
991 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1062 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
992 i, le16_to_cpu(gdp->bg_free_inodes_count), x); 1063 i, ext4_free_inodes_count(sb, gdp), x);
993 bitmap_count += x; 1064 bitmap_count += x;
994 } 1065 }
995 brelse(bitmap_bh); 1066 brelse(bitmap_bh);
@@ -1003,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1003 gdp = ext4_get_group_desc(sb, i, NULL); 1074 gdp = ext4_get_group_desc(sb, i, NULL);
1004 if (!gdp) 1075 if (!gdp)
1005 continue; 1076 continue;
1006 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1077 desc_count += ext4_free_inodes_count(sb, gdp);
1007 cond_resched(); 1078 cond_resched();
1008 } 1079 }
1009 return desc_count; 1080 return desc_count;
@@ -1020,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1020 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1091 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1021 if (!gdp) 1092 if (!gdp)
1022 continue; 1093 continue;
1023 count += le16_to_cpu(gdp->bg_used_dirs_count); 1094 count += ext4_used_dirs_count(sb, gdp);
1024 } 1095 }
1025 return count; 1096 return count;
1026} 1097}
1027
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be21a5ae33cb..a6444cee0c7e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -34,6 +34,7 @@
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/pagevec.h> 35#include <linux/pagevec.h>
36#include <linux/mpage.h> 36#include <linux/mpage.h>
37#include <linux/namei.h>
37#include <linux/uio.h> 38#include <linux/uio.h>
38#include <linux/bio.h> 39#include <linux/bio.h>
39#include "ext4_jbd2.h" 40#include "ext4_jbd2.h"
@@ -71,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
71 * "bh" may be NULL: a metadata block may have been freed from memory 72 * "bh" may be NULL: a metadata block may have been freed from memory
72 * but there may still be a record of it in the journal, and that record 73 * but there may still be a record of it in the journal, and that record
73 * still needs to be revoked. 74 * still needs to be revoked.
75 *
76 * If the handle isn't valid we're not journaling so there's nothing to do.
74 */ 77 */
75int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 78int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
76 struct buffer_head *bh, ext4_fsblk_t blocknr) 79 struct buffer_head *bh, ext4_fsblk_t blocknr)
77{ 80{
78 int err; 81 int err;
79 82
83 if (!ext4_handle_valid(handle))
84 return 0;
85
80 might_sleep(); 86 might_sleep();
81 87
82 BUFFER_TRACE(bh, "enter"); 88 BUFFER_TRACE(bh, "enter");
@@ -169,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
169 */ 175 */
170static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 176static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
171{ 177{
172 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 178 if (!ext4_handle_valid(handle))
179 return 0;
180 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
173 return 0; 181 return 0;
174 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 182 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
175 return 0; 183 return 0;
@@ -183,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
183 */ 191 */
184static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 192static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
185{ 193{
194 BUG_ON(EXT4_JOURNAL(inode) == NULL);
186 jbd_debug(2, "restarting handle %p\n", handle); 195 jbd_debug(2, "restarting handle %p\n", handle);
187 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 196 return ext4_journal_restart(handle, blocks_for_truncate(inode));
188} 197}
@@ -215,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
215 } 224 }
216 225
217 if (IS_SYNC(inode)) 226 if (IS_SYNC(inode))
218 handle->h_sync = 1; 227 ext4_handle_sync(handle);
219 inode->i_size = 0; 228 inode->i_size = 0;
220 err = ext4_mark_inode_dirty(handle, inode); 229 err = ext4_mark_inode_dirty(handle, inode);
221 if (err) { 230 if (err) {
@@ -232,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
232 * enough credits left in the handle to remove the inode from 241 * enough credits left in the handle to remove the inode from
233 * the orphan list and set the dtime field. 242 * the orphan list and set the dtime field.
234 */ 243 */
235 if (handle->h_buffer_credits < 3) { 244 if (!ext4_handle_has_enough_credits(handle, 3)) {
236 err = ext4_journal_extend(handle, 3); 245 err = ext4_journal_extend(handle, 3);
237 if (err > 0) 246 if (err > 0)
238 err = ext4_journal_restart(handle, 3); 247 err = ext4_journal_restart(handle, 3);
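
ext4_handle_valid() and ext4_handle_has_enough_credits(), used in the two hunks above, come from the no-journal support merged in this window: without a journal, ext4_journal_start() returns a sentinel rather than a real jbd2 handle, and the journaling wrappers short-circuit on it. From fs/ext4/ext4_jbd2.h of this era, approximately:

	/* the sentinel handed out when there is no journal */
	#define EXT4_NOJOURNAL_HANDLE	((handle_t *) 0x1)

	static inline int ext4_handle_valid(handle_t *handle)
	{
		if (handle == EXT4_NOJOURNAL_HANDLE)
			return 0;
		return 1;
	}

	static inline int ext4_handle_has_enough_credits(handle_t *handle,
							 int needed)
	{
		/* with no journal there are no credits to run out of */
		if (ext4_handle_valid(handle) &&
		    handle->h_buffer_credits < needed)
			return 0;
		return 1;
	}
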
@@ -505,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
505 * return the total number of blocks to be allocate, including the 514 * return the total number of blocks to be allocate, including the
506 * direct and indirect blocks. 515 * direct and indirect blocks.
507 */ 516 */
508static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, 517static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
509 int blocks_to_boundary) 518 int blocks_to_boundary)
510{ 519{
511 unsigned long count = 0; 520 unsigned int count = 0;
512 521
513 /* 522 /*
514 * Simple case, [t,d]Indirect block(s) has not allocated yet 523 * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -546,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
546 int indirect_blks, int blks, 555 int indirect_blks, int blks,
547 ext4_fsblk_t new_blocks[4], int *err) 556 ext4_fsblk_t new_blocks[4], int *err)
548{ 557{
558 struct ext4_allocation_request ar;
549 int target, i; 559 int target, i;
550 unsigned long count = 0, blk_allocated = 0; 560 unsigned long count = 0, blk_allocated = 0;
551 int index = 0; 561 int index = 0;
@@ -594,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
594 if (!target) 604 if (!target)
595 goto allocated; 605 goto allocated;
596 /* Now allocate data blocks */ 606 /* Now allocate data blocks */
597 count = target; 607 memset(&ar, 0, sizeof(ar));
598 /* allocating blocks for data blocks */ 608 ar.inode = inode;
599 current_block = ext4_new_blocks(handle, inode, iblock, 609 ar.goal = goal;
600 goal, &count, err); 610 ar.len = target;
611 ar.logical = iblock;
612 if (S_ISREG(inode->i_mode))
613 /* enable in-core preallocation only for regular files */
614 ar.flags = EXT4_MB_HINT_DATA;
615
616 current_block = ext4_mb_new_blocks(handle, &ar, err);
617
601 if (*err && (target == blks)) { 618 if (*err && (target == blks)) {
602 /* 619 /*
603 * if the allocation failed and we didn't allocate 620 * if the allocation failed and we didn't allocate
@@ -613,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
613 */ 630 */
614 new_blocks[index] = current_block; 631 new_blocks[index] = current_block;
615 } 632 }
616 blk_allocated += count; 633 blk_allocated += ar.len;
617 } 634 }
618allocated: 635allocated:
619 /* total number of blocks allocated for direct blocks */ 636 /* total number of blocks allocated for direct blocks */
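
This hunk moves the old indirect-block path onto mballoc: ext4_mb_new_blocks() takes its inputs as a struct ext4_allocation_request rather than a bare goal/count pair, and the memset() matters because the struct also carries left/right neighbor hints (lleft/pleft, lright/pright) that this caller has nothing to put in. EXT4_MB_HINT_DATA marks the request as file data so it is eligible for the per-inode preallocation logic. The call pattern, isolated from the hunk:

	struct ext4_allocation_request ar;
	ext4_fsblk_t block;
	int err;

	memset(&ar, 0, sizeof(ar));	/* zero the hints we cannot supply */
	ar.inode = inode;
	ar.logical = iblock;		/* file-relative block being mapped */
	ar.goal = goal;			/* preferred physical block */
	ar.len = target;		/* how many blocks we would like */
	if (S_ISREG(inode->i_mode))
		ar.flags = EXT4_MB_HINT_DATA;

	block = ext4_mb_new_blocks(handle, &ar, &err);
	/* on return ar.len holds the count actually allocated, which is
	 * why the hunk above switches to blk_allocated += ar.len */
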
@@ -708,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
708 set_buffer_uptodate(bh); 725 set_buffer_uptodate(bh);
709 unlock_buffer(bh); 726 unlock_buffer(bh);
710 727
711 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 728 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
712 err = ext4_journal_dirty_metadata(handle, bh); 729 err = ext4_handle_dirty_metadata(handle, inode, bh);
713 if (err) 730 if (err)
714 goto failed; 731 goto failed;
715 } 732 }
@@ -791,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
791 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 808 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
792 */ 809 */
793 jbd_debug(5, "splicing indirect only\n"); 810 jbd_debug(5, "splicing indirect only\n");
794 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); 811 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
795 err = ext4_journal_dirty_metadata(handle, where->bh); 812 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
796 if (err) 813 if (err)
797 goto err_out; 814 goto err_out;
798 } else { 815 } else {
@@ -839,10 +856,10 @@ err_out:
839 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 856 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
840 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 857 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
841 */ 858 */
842int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 859static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
843 ext4_lblk_t iblock, unsigned long maxblocks, 860 ext4_lblk_t iblock, unsigned int maxblocks,
844 struct buffer_head *bh_result, 861 struct buffer_head *bh_result,
845 int create, int extend_disksize) 862 int create, int extend_disksize)
846{ 863{
847 int err = -EIO; 864 int err = -EIO;
848 ext4_lblk_t offsets[4]; 865 ext4_lblk_t offsets[4];
@@ -1044,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1044 * It returns the error in case of allocation failure. 1061 * It returns the error in case of allocation failure.
1045 */ 1062 */
1046int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1063int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1047 unsigned long max_blocks, struct buffer_head *bh, 1064 unsigned int max_blocks, struct buffer_head *bh,
1048 int create, int extend_disksize, int flag) 1065 int create, int extend_disksize, int flag)
1049{ 1066{
1050 int retval; 1067 int retval;
@@ -1220,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1220 set_buffer_uptodate(bh); 1237 set_buffer_uptodate(bh);
1221 } 1238 }
1222 unlock_buffer(bh); 1239 unlock_buffer(bh);
1223 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1240 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1224 err = ext4_journal_dirty_metadata(handle, bh); 1241 err = ext4_handle_dirty_metadata(handle, inode, bh);
1225 if (!fatal) 1242 if (!fatal)
1226 fatal = err; 1243 fatal = err;
1227 } else { 1244 } else {
@@ -1334,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1334 pgoff_t index; 1351 pgoff_t index;
1335 unsigned from, to; 1352 unsigned from, to;
1336 1353
1354 trace_mark(ext4_write_begin,
1355 "dev %s ino %lu pos %llu len %u flags %u",
1356 inode->i_sb->s_id, inode->i_ino,
1357 (unsigned long long) pos, len, flags);
1337 index = pos >> PAGE_CACHE_SHIFT; 1358 index = pos >> PAGE_CACHE_SHIFT;
1338 from = pos & (PAGE_CACHE_SIZE - 1); 1359 from = pos & (PAGE_CACHE_SIZE - 1);
1339 to = from + len; 1360 to = from + len;
@@ -1345,7 +1366,7 @@ retry:
1345 goto out; 1366 goto out;
1346 } 1367 }
1347 1368
1348 page = __grab_cache_page(mapping, index); 1369 page = grab_cache_page_write_begin(mapping, index, flags);
1349 if (!page) { 1370 if (!page) {
1350 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1351 ret = -ENOMEM; 1372 ret = -ENOMEM;
@@ -1386,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1386 if (!buffer_mapped(bh) || buffer_freed(bh)) 1407 if (!buffer_mapped(bh) || buffer_freed(bh))
1387 return 0; 1408 return 0;
1388 set_buffer_uptodate(bh); 1409 set_buffer_uptodate(bh);
1389 return ext4_journal_dirty_metadata(handle, bh); 1410 return ext4_handle_dirty_metadata(handle, NULL, bh);
1390} 1411}
1391 1412
1392/* 1413/*
@@ -1405,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
1405 struct inode *inode = mapping->host; 1426 struct inode *inode = mapping->host;
1406 int ret = 0, ret2; 1427 int ret = 0, ret2;
1407 1428
1429 trace_mark(ext4_ordered_write_end,
1430 "dev %s ino %lu pos %llu len %u copied %u",
1431 inode->i_sb->s_id, inode->i_ino,
1432 (unsigned long long) pos, len, copied);
1408 ret = ext4_jbd2_file_inode(handle, inode); 1433 ret = ext4_jbd2_file_inode(handle, inode);
1409 1434
1410 if (ret == 0) { 1435 if (ret == 0) {
@@ -1443,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
1443 int ret = 0, ret2; 1468 int ret = 0, ret2;
1444 loff_t new_i_size; 1469 loff_t new_i_size;
1445 1470
1471 trace_mark(ext4_writeback_write_end,
1472 "dev %s ino %lu pos %llu len %u copied %u",
1473 inode->i_sb->s_id, inode->i_ino,
1474 (unsigned long long) pos, len, copied);
1446 new_i_size = pos + copied; 1475 new_i_size = pos + copied;
1447 if (new_i_size > EXT4_I(inode)->i_disksize) { 1476 if (new_i_size > EXT4_I(inode)->i_disksize) {
1448 ext4_update_i_disksize(inode, new_i_size); 1477 ext4_update_i_disksize(inode, new_i_size);
@@ -1478,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
1478 unsigned from, to; 1507 unsigned from, to;
1479 loff_t new_i_size; 1508 loff_t new_i_size;
1480 1509
1510 trace_mark(ext4_journalled_write_end,
1511 "dev %s ino %lu pos %llu len %u copied %u",
1512 inode->i_sb->s_id, inode->i_ino,
1513 (unsigned long long) pos, len, copied);
1481 from = pos & (PAGE_CACHE_SIZE - 1); 1514 from = pos & (PAGE_CACHE_SIZE - 1);
1482 to = from + len; 1515 to = from + len;
1483 1516
@@ -1624,7 +1657,7 @@ struct mpage_da_data {
1624 get_block_t *get_block; 1657 get_block_t *get_block;
1625 struct writeback_control *wbc; 1658 struct writeback_control *wbc;
1626 int io_done; 1659 int io_done;
1627 long pages_written; 1660 int pages_written;
1628 int retval; 1661 int retval;
1629}; 1662};
1630 1663
@@ -1644,35 +1677,39 @@ struct mpage_da_data {
1644 */ 1677 */
1645static int mpage_da_submit_io(struct mpage_da_data *mpd) 1678static int mpage_da_submit_io(struct mpage_da_data *mpd)
1646{ 1679{
1647 struct address_space *mapping = mpd->inode->i_mapping;
1648 int ret = 0, err, nr_pages, i;
1649 unsigned long index, end;
1650 struct pagevec pvec;
1651 long pages_skipped; 1680 long pages_skipped;
1681 struct pagevec pvec;
1682 unsigned long index, end;
1683 int ret = 0, err, nr_pages, i;
1684 struct inode *inode = mpd->inode;
1685 struct address_space *mapping = inode->i_mapping;
1652 1686
1653 BUG_ON(mpd->next_page <= mpd->first_page); 1687 BUG_ON(mpd->next_page <= mpd->first_page);
1654 pagevec_init(&pvec, 0); 1688 /*
1689 * We need to start from the first_page to the next_page - 1
1690 * to make sure we also write the mapped dirty buffer_heads.
1691 * If we look at mpd->lbh.b_blocknr we would only be looking
1692 * at the currently mapped buffer_heads.
1693 */
1655 index = mpd->first_page; 1694 index = mpd->first_page;
1656 end = mpd->next_page - 1; 1695 end = mpd->next_page - 1;
1657 1696
1697 pagevec_init(&pvec, 0);
1658 while (index <= end) { 1698 while (index <= end) {
1659 /* 1699 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1660 * We can use PAGECACHE_TAG_DIRTY lookup here because
1661 * even though we have cleared the dirty flag on the page
1662 * We still keep the page in the radix tree with tag
1663 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
1664 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
1665 * which is called via the below writepage callback.
1666 */
1667 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1668 PAGECACHE_TAG_DIRTY,
1669 min(end - index,
1670 (pgoff_t)PAGEVEC_SIZE-1) + 1);
1671 if (nr_pages == 0) 1700 if (nr_pages == 0)
1672 break; 1701 break;
1673 for (i = 0; i < nr_pages; i++) { 1702 for (i = 0; i < nr_pages; i++) {
1674 struct page *page = pvec.pages[i]; 1703 struct page *page = pvec.pages[i];
1675 1704
1705 index = page->index;
1706 if (index > end)
1707 break;
1708 index++;
1709
1710 BUG_ON(!PageLocked(page));
1711 BUG_ON(PageWriteback(page));
1712
1676 pages_skipped = mpd->wbc->pages_skipped; 1713 pages_skipped = mpd->wbc->pages_skipped;
1677 err = mapping->a_ops->writepage(page, mpd->wbc); 1714 err = mapping->a_ops->writepage(page, mpd->wbc);
1678 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1715 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1830,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
1830 ext4_count_free_blocks(inode->i_sb)); 1867 ext4_count_free_blocks(inode->i_sb));
1831 printk(KERN_EMERG "Free/Dirty block details\n"); 1868 printk(KERN_EMERG "Free/Dirty block details\n");
1832 printk(KERN_EMERG "free_blocks=%lld\n", 1869 printk(KERN_EMERG "free_blocks=%lld\n",
1833 percpu_counter_sum(&sbi->s_freeblocks_counter)); 1870 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
1834 printk(KERN_EMERG "dirty_blocks=%lld\n", 1871 printk(KERN_EMERG "dirty_blocks=%lld\n",
1835 percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1872 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1836 printk(KERN_EMERG "Block reservation details\n"); 1873 printk(KERN_EMERG "Block reservation details\n");
1837 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", 1874 printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
1838 EXT4_I(inode)->i_reserved_data_blocks); 1875 EXT4_I(inode)->i_reserved_data_blocks);
1839 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", 1876 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
1840 EXT4_I(inode)->i_reserved_meta_blocks); 1877 EXT4_I(inode)->i_reserved_meta_blocks);
1841 return; 1878 return;
1842} 1879}
@@ -2086,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
2086 bh = head; 2123 bh = head;
2087 do { 2124 do {
2088 BUG_ON(buffer_locked(bh)); 2125 BUG_ON(buffer_locked(bh));
2126 /*
2127 * We need to try to allocate
2128 * unmapped blocks in the same page.
2129 * Otherwise we won't make progress
2130 * with the page in ext4_da_writepage
2131 */
2089 if (buffer_dirty(bh) && 2132 if (buffer_dirty(bh) &&
2090 (!buffer_mapped(bh) || buffer_delay(bh))) { 2133 (!buffer_mapped(bh) || buffer_delay(bh))) {
2091 mpage_add_bh_to_extent(mpd, logical, bh); 2134 mpage_add_bh_to_extent(mpd, logical, bh);
2092 if (mpd->io_done) 2135 if (mpd->io_done)
2093 return MPAGE_DA_EXTENT_TAIL; 2136 return MPAGE_DA_EXTENT_TAIL;
2137 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2138 /*
2139 * mapped dirty buffer. We need to update
2140 * the b_state because we look at
2141 * b_state in mpage_da_map_blocks. We don't
2142 * update b_size because if we find an
2143 * unmapped buffer_head later we need to
2144 * use the b_state flag of that buffer_head.
2145 */
2146 if (mpd->lbh.b_size == 0)
2147 mpd->lbh.b_state =
2148 bh->b_state & BH_FLAGS;
2094 } 2149 }
2095 logical++; 2150 logical++;
2096 } while ((bh = bh->b_this_page) != head); 2151 } while ((bh = bh->b_this_page) != head);
@@ -2268,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
2268{ 2323{
2269 int ret = 0; 2324 int ret = 0;
2270 loff_t size; 2325 loff_t size;
2271 unsigned long len; 2326 unsigned int len;
2272 struct buffer_head *page_bufs; 2327 struct buffer_head *page_bufs;
2273 struct inode *inode = page->mapping->host; 2328 struct inode *inode = page->mapping->host;
2274 2329
2330 trace_mark(ext4_da_writepage,
2331 "dev %s ino %lu page_index %lu",
2332 inode->i_sb->s_id, inode->i_ino, page->index);
2275 size = i_size_read(inode); 2333 size = i_size_read(inode);
2276 if (page->index == size >> PAGE_CACHE_SHIFT) 2334 if (page->index == size >> PAGE_CACHE_SHIFT)
2277 len = size & ~PAGE_CACHE_MASK; 2335 len = size & ~PAGE_CACHE_MASK;
@@ -2377,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
2377 struct mpage_da_data mpd; 2435 struct mpage_da_data mpd;
2378 struct inode *inode = mapping->host; 2436 struct inode *inode = mapping->host;
2379 int no_nrwrite_index_update; 2437 int no_nrwrite_index_update;
2380 long pages_written = 0, pages_skipped; 2438 int pages_written = 0;
2439 long pages_skipped;
2381 int needed_blocks, ret = 0, nr_to_writebump = 0; 2440 int needed_blocks, ret = 0, nr_to_writebump = 0;
2382 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2441 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2383 2442
2443 trace_mark(ext4_da_writepages,
2444 "dev %s ino %lu nr_t_write %ld "
2445 "pages_skipped %ld range_start %llu "
2446 "range_end %llu nonblocking %d "
2447 "for_kupdate %d for_reclaim %d "
2448 "for_writepages %d range_cyclic %d",
2449 inode->i_sb->s_id, inode->i_ino,
2450 wbc->nr_to_write, wbc->pages_skipped,
2451 (unsigned long long) wbc->range_start,
2452 (unsigned long long) wbc->range_end,
2453 wbc->nonblocking, wbc->for_kupdate,
2454 wbc->for_reclaim, wbc->for_writepages,
2455 wbc->range_cyclic);
2456
2384 /* 2457 /*
2385 * No pages to write? This is mainly a kludge to avoid starting 2458 * No pages to write? This is mainly a kludge to avoid starting
2386 * a transaction for special inodes like journal inode on last iput() 2459 * a transaction for special inodes like journal inode on last iput()
@@ -2388,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
2388 */ 2461 */
2389 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2462 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2390 return 0; 2463 return 0;
2464
2465 /*
2466 * If the filesystem has aborted, it is read-only, so return
2467 * right away instead of dumping stack traces later on that
2468 * will obscure the real source of the problem. We test
 2469 * EXT4_MOUNT_ABORT instead of MS_RDONLY in sb->s_flags because
2470 * the latter could be true if the filesystem is mounted
2471 * read-only, and in that case, ext4_da_writepages should
2472 * *never* be called, so if that ever happens, we would want
2473 * the stack trace.
2474 */
2475 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
2476 return -EROFS;
2477
2391 /* 2478 /*
2392 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2479 * Make sure nr_to_write is >= sbi->s_mb_stream_request
 2393 * This makes sure small file blocks are allocated in 2480 * This makes sure small file blocks are allocated in
@@ -2432,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2432 handle = ext4_journal_start(inode, needed_blocks); 2519 handle = ext4_journal_start(inode, needed_blocks);
2433 if (IS_ERR(handle)) { 2520 if (IS_ERR(handle)) {
2434 ret = PTR_ERR(handle); 2521 ret = PTR_ERR(handle);
2435 printk(KERN_EMERG "%s: jbd2_start: " 2522 printk(KERN_CRIT "%s: jbd2_start: "
2436 "%ld pages, ino %lu; err %d\n", __func__, 2523 "%ld pages, ino %lu; err %d\n", __func__,
2437 wbc->nr_to_write, inode->i_ino, ret); 2524 wbc->nr_to_write, inode->i_ino, ret);
2438 dump_stack(); 2525 dump_stack();
@@ -2485,6 +2572,14 @@ out_writepages:
2485 if (!no_nrwrite_index_update) 2572 if (!no_nrwrite_index_update)
2486 wbc->no_nrwrite_index_update = 0; 2573 wbc->no_nrwrite_index_update = 0;
2487 wbc->nr_to_write -= nr_to_writebump; 2574 wbc->nr_to_write -= nr_to_writebump;
2575 trace_mark(ext4_da_writepage_result,
2576 "dev %s ino %lu ret %d pages_written %d "
2577 "pages_skipped %ld congestion %d "
2578 "more_io %d no_nrwrite_index_update %d",
2579 inode->i_sb->s_id, inode->i_ino, ret,
2580 pages_written, wbc->pages_skipped,
2581 wbc->encountered_congestion, wbc->more_io,
2582 wbc->no_nrwrite_index_update);
2488 return ret; 2583 return ret;
2489} 2584}
2490 2585
@@ -2497,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2497 /* 2592 /*
2498 * switch to non delalloc mode if we are running low 2593 * switch to non delalloc mode if we are running low
2499 * on free block. The free block accounting via percpu 2594 * on free block. The free block accounting via percpu
2500 * counters can get slightly wrong with FBC_BATCH getting 2595 * counters can get slightly wrong with percpu_counter_batch getting
2501 * accumulated on each CPU without updating global counters 2596 * accumulated on each CPU without updating global counters
 2502 * Delalloc needs an accurate free block accounting. So switch 2597 * Delalloc needs an accurate free block accounting. So switch
2503 * to non delalloc when we are near to error range. 2598 * to non delalloc when we are near to error range.
@@ -2536,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2536 len, flags, pagep, fsdata); 2631 len, flags, pagep, fsdata);
2537 } 2632 }
2538 *fsdata = (void *)0; 2633 *fsdata = (void *)0;
2634
2635 trace_mark(ext4_da_write_begin,
2636 "dev %s ino %lu pos %llu len %u flags %u",
2637 inode->i_sb->s_id, inode->i_ino,
2638 (unsigned long long) pos, len, flags);
2539retry: 2639retry:
2540 /* 2640 /*
2541 * With delayed allocation, we don't log the i_disksize update 2641 * With delayed allocation, we don't log the i_disksize update
@@ -2549,7 +2649,7 @@ retry:
2549 goto out; 2649 goto out;
2550 } 2650 }
2551 2651
2552 page = __grab_cache_page(mapping, index); 2652 page = grab_cache_page_write_begin(mapping, index, flags);
2553 if (!page) { 2653 if (!page) {
2554 ext4_journal_stop(handle); 2654 ext4_journal_stop(handle);
2555 ret = -ENOMEM; 2655 ret = -ENOMEM;
@@ -2625,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
2625 } 2725 }
2626 } 2726 }
2627 2727
2728 trace_mark(ext4_da_write_end,
2729 "dev %s ino %lu pos %llu len %u copied %u",
2730 inode->i_sb->s_id, inode->i_ino,
2731 (unsigned long long) pos, len, copied);
2628 start = pos & (PAGE_CACHE_SIZE - 1); 2732 start = pos & (PAGE_CACHE_SIZE - 1);
2629 end = start + copied - 1; 2733 end = start + copied - 1;
2630 2734
@@ -2717,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2717 filemap_write_and_wait(mapping); 2821 filemap_write_and_wait(mapping);
2718 } 2822 }
2719 2823
2720 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2824 BUG_ON(!EXT4_JOURNAL(inode) &&
2825 EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
2826
2827 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2721 /* 2828 /*
2722 * This is a REALLY heavyweight approach, but the use of 2829 * This is a REALLY heavyweight approach, but the use of
2723 * bmap on dirty files is expected to be extremely rare: 2830 * bmap on dirty files is expected to be extremely rare:
@@ -2835,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page,
2835 loff_t size = i_size_read(inode); 2942 loff_t size = i_size_read(inode);
2836 loff_t len; 2943 loff_t len;
2837 2944
2945 trace_mark(ext4_normal_writepage,
2946 "dev %s ino %lu page_index %lu",
2947 inode->i_sb->s_id, inode->i_ino, page->index);
2838 J_ASSERT(PageLocked(page)); 2948 J_ASSERT(PageLocked(page));
2839 if (page->index == size >> PAGE_CACHE_SHIFT) 2949 if (page->index == size >> PAGE_CACHE_SHIFT)
2840 len = size & ~PAGE_CACHE_MASK; 2950 len = size & ~PAGE_CACHE_MASK;
@@ -2920,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page,
2920 loff_t size = i_size_read(inode); 3030 loff_t size = i_size_read(inode);
2921 loff_t len; 3031 loff_t len;
2922 3032
3033 trace_mark(ext4_journalled_writepage,
3034 "dev %s ino %lu page_index %lu",
3035 inode->i_sb->s_id, inode->i_ino, page->index);
2923 J_ASSERT(PageLocked(page)); 3036 J_ASSERT(PageLocked(page));
2924 if (page->index == size >> PAGE_CACHE_SHIFT) 3037 if (page->index == size >> PAGE_CACHE_SHIFT)
2925 len = size & ~PAGE_CACHE_MASK; 3038 len = size & ~PAGE_CACHE_MASK;
@@ -2988,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
2988 if (offset == 0) 3101 if (offset == 0)
2989 ClearPageChecked(page); 3102 ClearPageChecked(page);
2990 3103
2991 jbd2_journal_invalidatepage(journal, page, offset); 3104 if (journal)
3105 jbd2_journal_invalidatepage(journal, page, offset);
3106 else
3107 block_invalidatepage(page, offset);
2992} 3108}
2993 3109
2994static int ext4_releasepage(struct page *page, gfp_t wait) 3110static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2998,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
2998 WARN_ON(PageChecked(page)); 3114 WARN_ON(PageChecked(page));
2999 if (!page_has_buffers(page)) 3115 if (!page_has_buffers(page))
3000 return 0; 3116 return 0;
3001 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3117 if (journal)
3118 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3119 else
3120 return try_to_free_buffers(page);
3002} 3121}
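
The invalidatepage and releasepage hunks above both follow the pattern this series introduces for no-journal operation: call into jbd2 only when the inode actually has a journal, otherwise fall back to the generic buffer-layer helper. For context, EXT4_JOURNAL() (paraphrased from ext4_jbd2.h in the same series) is just the superblock's journal pointer, which is NULL on a journal-less ext4:

	#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
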
3003 3122
3004/* 3123/*
@@ -3270,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle,
3270 3389
3271 err = 0; 3390 err = 0;
3272 if (ext4_should_journal_data(inode)) { 3391 if (ext4_should_journal_data(inode)) {
3273 err = ext4_journal_dirty_metadata(handle, bh); 3392 err = ext4_handle_dirty_metadata(handle, inode, bh);
3274 } else { 3393 } else {
3275 if (ext4_should_order_data(inode)) 3394 if (ext4_should_order_data(inode))
3276 err = ext4_jbd2_file_inode(handle, inode); 3395 err = ext4_jbd2_file_inode(handle, inode);
@@ -3394,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3394 __le32 *p; 3513 __le32 *p;
3395 if (try_to_extend_transaction(handle, inode)) { 3514 if (try_to_extend_transaction(handle, inode)) {
3396 if (bh) { 3515 if (bh) {
3397 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 3516 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
3398 ext4_journal_dirty_metadata(handle, bh); 3517 ext4_handle_dirty_metadata(handle, inode, bh);
3399 } 3518 }
3400 ext4_mark_inode_dirty(handle, inode); 3519 ext4_mark_inode_dirty(handle, inode);
3401 ext4_journal_test_restart(handle, inode); 3520 ext4_journal_test_restart(handle, inode);
@@ -3495,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3495 count, block_to_free_p, p); 3614 count, block_to_free_p, p);
3496 3615
3497 if (this_bh) { 3616 if (this_bh) {
3498 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3617 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
3499 3618
3500 /* 3619 /*
3501 * The buffer head should have an attached journal head at this 3620 * The buffer head should have an attached journal head at this
@@ -3504,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3504 * the block was cleared. Check for this instead of OOPSing. 3623 * the block was cleared. Check for this instead of OOPSing.
3505 */ 3624 */
3506 if (bh2jh(this_bh)) 3625 if (bh2jh(this_bh))
3507 ext4_journal_dirty_metadata(handle, this_bh); 3626 ext4_handle_dirty_metadata(handle, inode, this_bh);
3508 else 3627 else
3509 ext4_error(inode->i_sb, __func__, 3628 ext4_error(inode->i_sb, __func__,
3510 "circular indirect block detected, " 3629 "circular indirect block detected, "
@@ -3534,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3534 ext4_fsblk_t nr; 3653 ext4_fsblk_t nr;
3535 __le32 *p; 3654 __le32 *p;
3536 3655
3537 if (is_handle_aborted(handle)) 3656 if (ext4_handle_is_aborted(handle))
3538 return; 3657 return;
3539 3658
3540 if (depth--) { 3659 if (depth--) {
@@ -3604,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3604 * will merely complain about releasing a free block, 3723 * will merely complain about releasing a free block,
3605 * rather than leaking blocks. 3724 * rather than leaking blocks.
3606 */ 3725 */
3607 if (is_handle_aborted(handle)) 3726 if (ext4_handle_is_aborted(handle))
3608 return; 3727 return;
3609 if (try_to_extend_transaction(handle, inode)) { 3728 if (try_to_extend_transaction(handle, inode)) {
3610 ext4_mark_inode_dirty(handle, inode); 3729 ext4_mark_inode_dirty(handle, inode);
@@ -3623,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3623 parent_bh)){ 3742 parent_bh)){
3624 *p = 0; 3743 *p = 0;
3625 BUFFER_TRACE(parent_bh, 3744 BUFFER_TRACE(parent_bh,
3626 "call ext4_journal_dirty_metadata"); 3745 "call ext4_handle_dirty_metadata");
3627 ext4_journal_dirty_metadata(handle, 3746 ext4_handle_dirty_metadata(handle,
3628 parent_bh); 3747 inode,
3748 parent_bh);
3629 } 3749 }
3630 } 3750 }
3631 } 3751 }
@@ -3813,7 +3933,7 @@ do_indirects:
3813 * synchronous 3933 * synchronous
3814 */ 3934 */
3815 if (IS_SYNC(inode)) 3935 if (IS_SYNC(inode))
3816 handle->h_sync = 1; 3936 ext4_handle_sync(handle);
3817out_stop: 3937out_stop:
3818 /* 3938 /*
3819 * If this was a simple ftruncate(), and the file will remain alive 3939 * If this was a simple ftruncate(), and the file will remain alive
@@ -3843,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
3843 ext4_fsblk_t block; 3963 ext4_fsblk_t block;
3844 int inodes_per_block, inode_offset; 3964 int inodes_per_block, inode_offset;
3845 3965
3846 iloc->bh = 0; 3966 iloc->bh = NULL;
3847 if (!ext4_valid_inum(sb, inode->i_ino)) 3967 if (!ext4_valid_inum(sb, inode->i_ino))
3848 return -EIO; 3968 return -EIO;
3849 3969
@@ -3950,7 +4070,7 @@ make_io:
3950 num = EXT4_INODES_PER_GROUP(sb); 4070 num = EXT4_INODES_PER_GROUP(sb);
3951 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4071 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3952 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4072 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3953 num -= le16_to_cpu(gdp->bg_itable_unused); 4073 num -= ext4_itable_unused_count(sb, gdp);
3954 table += num / inodes_per_block; 4074 table += num / inodes_per_block;
3955 if (end > table) 4075 if (end > table)
3956 end = table; 4076 end = table;
@@ -4164,9 +4284,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4164 inode->i_op = &ext4_dir_inode_operations; 4284 inode->i_op = &ext4_dir_inode_operations;
4165 inode->i_fop = &ext4_dir_operations; 4285 inode->i_fop = &ext4_dir_operations;
4166 } else if (S_ISLNK(inode->i_mode)) { 4286 } else if (S_ISLNK(inode->i_mode)) {
4167 if (ext4_inode_is_fast_symlink(inode)) 4287 if (ext4_inode_is_fast_symlink(inode)) {
4168 inode->i_op = &ext4_fast_symlink_inode_operations; 4288 inode->i_op = &ext4_fast_symlink_inode_operations;
4169 else { 4289 nd_terminate_link(ei->i_data, inode->i_size,
4290 sizeof(ei->i_data) - 1);
4291 } else {
4170 inode->i_op = &ext4_symlink_inode_operations; 4292 inode->i_op = &ext4_symlink_inode_operations;
4171 ext4_set_aops(inode); 4293 ext4_set_aops(inode);
4172 } 4294 }
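
nd_terminate_link() NUL-terminates the fast-symlink target stored inline in ei->i_data, so a corrupt on-disk size cannot make link traversal read past the buffer. Its definition in include/linux/namei.h of this era is essentially (paraphrased):

	static inline void nd_terminate_link(void *name, size_t len,
					     size_t maxlen)
	{
		((char *) name)[min(len, maxlen)] = '\0';
	}
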
@@ -4310,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle,
4310 EXT4_SET_RO_COMPAT_FEATURE(sb, 4432 EXT4_SET_RO_COMPAT_FEATURE(sb,
4311 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4433 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
4312 sb->s_dirt = 1; 4434 sb->s_dirt = 1;
4313 handle->h_sync = 1; 4435 ext4_handle_sync(handle);
4314 err = ext4_journal_dirty_metadata(handle, 4436 err = ext4_handle_dirty_metadata(handle, inode,
4315 EXT4_SB(sb)->s_sbh); 4437 EXT4_SB(sb)->s_sbh);
4316 } 4438 }
4317 } 4439 }
@@ -4338,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle,
4338 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4460 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4339 } 4461 }
4340 4462
4341 4463 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4342 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 4464 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4343 rc = ext4_journal_dirty_metadata(handle, bh);
4344 if (!err) 4465 if (!err)
4345 err = rc; 4466 err = rc;
4346 ei->i_state &= ~EXT4_STATE_NEW; 4467 ei->i_state &= ~EXT4_STATE_NEW;
@@ -4403,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait)
4403 return ext4_force_commit(inode->i_sb); 4524 return ext4_force_commit(inode->i_sb);
4404} 4525}
4405 4526
4527int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4528{
4529 int err = 0;
4530
4531 mark_buffer_dirty(bh);
4532 if (inode && inode_needs_sync(inode)) {
4533 sync_dirty_buffer(bh);
4534 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4535 ext4_error(inode->i_sb, __func__,
4536 "IO error syncing inode, "
4537 "inode=%lu, block=%llu",
4538 inode->i_ino,
4539 (unsigned long long)bh->b_blocknr);
4540 err = -EIO;
4541 }
4542 }
4543 return err;
4544}
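
The new helper gives journal-less code a way to push modified metadata straight to disk, honouring O_SYNC-style inodes via inode_needs_sync(). A hypothetical caller for illustration (the function and the mutation are invented):

	static int example_stamp_block(struct inode *inode,
				       struct buffer_head *bh, __le32 val)
	{
		*(__le32 *) bh->b_data = val;	/* mutate the metadata block */
		/* no handle to log against: write through instead */
		return __ext4_write_dirty_metadata(inode, bh);
	}
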
4545
4406/* 4546/*
4407 * ext4_setattr() 4547 * ext4_setattr()
4408 * 4548 *
@@ -4707,16 +4847,15 @@ int
4707ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 4847ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4708 struct ext4_iloc *iloc) 4848 struct ext4_iloc *iloc)
4709{ 4849{
4710 int err = 0; 4850 int err;
4711 if (handle) { 4851
4712 err = ext4_get_inode_loc(inode, iloc); 4852 err = ext4_get_inode_loc(inode, iloc);
4713 if (!err) { 4853 if (!err) {
4714 BUFFER_TRACE(iloc->bh, "get_write_access"); 4854 BUFFER_TRACE(iloc->bh, "get_write_access");
4715 err = ext4_journal_get_write_access(handle, iloc->bh); 4855 err = ext4_journal_get_write_access(handle, iloc->bh);
4716 if (err) { 4856 if (err) {
4717 brelse(iloc->bh); 4857 brelse(iloc->bh);
4718 iloc->bh = NULL; 4858 iloc->bh = NULL;
4719 }
4720 } 4859 }
4721 } 4860 }
4722 ext4_std_error(inode->i_sb, err); 4861 ext4_std_error(inode->i_sb, err);
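
The if (handle) test could be dropped because, with this series, ext4_journal_start() returns a non-NULL handle even without a journal: in no-journal mode the "handle" is a small fake pointer that encodes a reference count, and ext4_handle_valid() tells the two apart. Roughly, paraphrased from ext4_jbd2.h (the threshold constant's exact value is from memory):

	#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

	/* not for NULL handles: distinguishes a jbd2 handle from a fake one */
	static inline int ext4_handle_valid(handle_t *handle)
	{
		if ((unsigned long) handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
			return 0;
		return 1;
	}
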
@@ -4788,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4788 4927
4789 might_sleep(); 4928 might_sleep();
4790 err = ext4_reserve_inode_write(handle, inode, &iloc); 4929 err = ext4_reserve_inode_write(handle, inode, &iloc);
4791 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4930 if (ext4_handle_valid(handle) &&
4931 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4792 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4932 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
4793 /* 4933 /*
4794 * We need extra buffer credits since we may write into EA block 4934 * We need extra buffer credits since we may write into EA block
@@ -4840,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode)
4840 handle_t *current_handle = ext4_journal_current_handle(); 4980 handle_t *current_handle = ext4_journal_current_handle();
4841 handle_t *handle; 4981 handle_t *handle;
4842 4982
4983 if (!ext4_handle_valid(current_handle)) {
4984 ext4_mark_inode_dirty(current_handle, inode);
4985 return;
4986 }
4987
4843 handle = ext4_journal_start(inode, 2); 4988 handle = ext4_journal_start(inode, 2);
4844 if (IS_ERR(handle)) 4989 if (IS_ERR(handle))
4845 goto out; 4990 goto out;
@@ -4877,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
4877 BUFFER_TRACE(iloc.bh, "get_write_access"); 5022 BUFFER_TRACE(iloc.bh, "get_write_access");
4878 err = jbd2_journal_get_write_access(handle, iloc.bh); 5023 err = jbd2_journal_get_write_access(handle, iloc.bh);
4879 if (!err) 5024 if (!err)
4880 err = ext4_journal_dirty_metadata(handle, 5025 err = ext4_handle_dirty_metadata(handle,
4881 iloc.bh); 5026 inode,
5027 iloc.bh);
4882 brelse(iloc.bh); 5028 brelse(iloc.bh);
4883 } 5029 }
4884 } 5030 }
@@ -4904,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4904 */ 5050 */
4905 5051
4906 journal = EXT4_JOURNAL(inode); 5052 journal = EXT4_JOURNAL(inode);
5053 if (!journal)
5054 return 0;
4907 if (is_journal_aborted(journal)) 5055 if (is_journal_aborted(journal))
4908 return -EROFS; 5056 return -EROFS;
4909 5057
@@ -4933,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4933 return PTR_ERR(handle); 5081 return PTR_ERR(handle);
4934 5082
4935 err = ext4_mark_inode_dirty(handle, inode); 5083 err = ext4_mark_inode_dirty(handle, inode);
4936 handle->h_sync = 1; 5084 ext4_handle_sync(handle);
4937 ext4_journal_stop(handle); 5085 ext4_journal_stop(handle);
4938 ext4_std_error(inode->i_sb, err); 5086 ext4_std_error(inode->i_sb, err);
4939 5087
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d58..42dc83fb247a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
99 goto flags_out; 99 goto flags_out;
100 } 100 }
101 if (IS_SYNC(inode)) 101 if (IS_SYNC(inode))
102 handle->h_sync = 1; 102 ext4_handle_sync(handle);
103 err = ext4_reserve_inode_write(handle, inode, &iloc); 103 err = ext4_reserve_inode_write(handle, inode, &iloc);
104 if (err) 104 if (err)
105 goto flags_err; 105 goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72e..918aec0c8a11 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
100 * inode as: 100 * inode as:
101 * 101 *
102 * { page } 102 * { page }
103 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 103 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
104 * 104 *
105 * 105 *
106 * one block each for bitmap and buddy information. So for each group we 106 * one block each for bitmap and buddy information. So for each group we
@@ -330,6 +330,18 @@
330 * object 330 * object
331 * 331 *
332 */ 332 */
333static struct kmem_cache *ext4_pspace_cachep;
334static struct kmem_cache *ext4_ac_cachep;
335static struct kmem_cache *ext4_free_ext_cachep;
336static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343
344
333 345
334static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 346static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
335{ 347{
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
445 blocknr += first + i; 457 blocknr += first + i;
446 blocknr += 458 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 459 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 460 ext4_grp_locked_error(sb, e4b->bd_group,
449 ext4_error(sb, __func__, "double-free of inode" 461 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %lu)\n", 462 " %lu's block %llu(bit %u in group %u)",
451 inode ? inode->i_ino : 0, blocknr, 463 inode ? inode->i_ino : 0, blocknr,
452 first + i, e4b->bd_group); 464 first + i, e4b->bd_group);
453 } 465 }
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 489 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 490 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 491 if (b1[i] != b2[i]) {
480 printk(KERN_ERR "corruption in group %lu " 492 printk(KERN_ERR "corruption in group %u "
481 "at byte %u(%u): %x in copy != %x " 493 "at byte %u(%u): %x in copy != %x "
482 "on disk/prealloc\n", 494 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]); 495 e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
690 grp->bb_fragments = fragments; 702 grp->bb_fragments = fragments;
691 703
692 if (free != grp->bb_free) { 704 if (free != grp->bb_free) {
693 ext4_error(sb, __func__, 705 ext4_grp_locked_error(sb, group, __func__,
694 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", 706 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
695 group, free, grp->bb_free); 707 group, free, grp->bb_free);
696 /* 708 /*
697 * If we intend to continue, we consider the group descriptor 709 * If we intend to continue, we consider the group descriptor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
716 * stored in the inode as 728 * stored in the inode as
717 * 729 *
718 * { page } 730 * { page }
719 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 731 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
720 * 732 *
721 * 733 *
722 * one block each for bitmap and buddy information. 734 * one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 if (bh[i] == NULL) 794 if (bh[i] == NULL)
783 goto out; 795 goto out;
784 796
785 if (buffer_uptodate(bh[i]) && 797 if (bitmap_uptodate(bh[i]))
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
787 continue; 798 continue;
788 799
789 lock_buffer(bh[i]); 800 lock_buffer(bh[i]);
801 if (bitmap_uptodate(bh[i])) {
802 unlock_buffer(bh[i]);
803 continue;
804 }
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 805 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 806 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 807 ext4_init_block_bitmap(sb, bh[i],
793 first_group + i, desc); 808 first_group + i, desc);
809 set_bitmap_uptodate(bh[i]);
794 set_buffer_uptodate(bh[i]); 810 set_buffer_uptodate(bh[i]);
795 unlock_buffer(bh[i]);
796 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 811 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
812 unlock_buffer(bh[i]);
797 continue; 813 continue;
798 } 814 }
799 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 815 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
816 if (buffer_uptodate(bh[i])) {
817 /*
818 * if the group is not uninit and bh is uptodate,
819 * the bitmap is also uptodate
820 */
821 set_bitmap_uptodate(bh[i]);
822 unlock_buffer(bh[i]);
823 continue;
824 }
800 get_bh(bh[i]); 825 get_bh(bh[i]);
826 /*
827 * submit the buffer_head for read. We can
828 * safely mark the bitmap as uptodate now.
829 * We do it here so the bitmap uptodate bit
830 * gets set with the buffer lock held.
831 */
832 set_bitmap_uptodate(bh[i]);
801 bh[i]->b_end_io = end_buffer_read_sync; 833 bh[i]->b_end_io = end_buffer_read_sync;
802 submit_bh(READ, bh[i]); 834 submit_bh(READ, bh[i]);
803 mb_debug("read bitmap for group %lu\n", first_group + i); 835 mb_debug("read bitmap for group %u\n", first_group + i);
804 } 836 }
805 837
806 /* wait for I/O completion */ 838 /* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
814 846
815 err = 0; 847 err = 0;
816 first_block = page->index * blocks_per_page; 848 first_block = page->index * blocks_per_page;
849 /* init the page */
850 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
817 for (i = 0; i < blocks_per_page; i++) { 851 for (i = 0; i < blocks_per_page; i++) {
818 int group; 852 int group;
819 struct ext4_group_info *grinfo; 853 struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
840 BUG_ON(incore == NULL); 874 BUG_ON(incore == NULL);
841 mb_debug("put buddy for group %u in page %lu/%x\n", 875 mb_debug("put buddy for group %u in page %lu/%x\n",
842 group, page->index, i * blocksize); 876 group, page->index, i * blocksize);
843 memset(data, 0xff, blocksize);
844 grinfo = ext4_get_group_info(sb, group); 877 grinfo = ext4_get_group_info(sb, group);
845 grinfo->bb_fragments = 0; 878 grinfo->bb_fragments = 0;
846 memset(grinfo->bb_counters, 0, 879 memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
848 /* 881 /*
849 * incore got set to the group block bitmap below 882 * incore got set to the group block bitmap below
850 */ 883 */
884 ext4_lock_group(sb, group);
851 ext4_mb_generate_buddy(sb, data, incore, group); 885 ext4_mb_generate_buddy(sb, data, incore, group);
886 ext4_unlock_group(sb, group);
852 incore = NULL; 887 incore = NULL;
853 } else { 888 } else {
854 /* this is block of bitmap */ 889 /* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 897
863 /* mark all preallocated blks used in in-core bitmap */ 898 /* mark all preallocated blks used in in-core bitmap */
864 ext4_mb_generate_from_pa(sb, data, group); 899 ext4_mb_generate_from_pa(sb, data, group);
900 ext4_mb_generate_from_freelist(sb, data, group);
865 ext4_unlock_group(sb, group); 901 ext4_unlock_group(sb, group);
866 902
867 /* set incore so that the buddy information can be 903 /* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
886ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 922ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
887 struct ext4_buddy *e4b) 923 struct ext4_buddy *e4b)
888{ 924{
889 struct ext4_sb_info *sbi = EXT4_SB(sb);
890 struct inode *inode = sbi->s_buddy_cache;
891 int blocks_per_page; 925 int blocks_per_page;
892 int block; 926 int block;
893 int pnum; 927 int pnum;
894 int poff; 928 int poff;
895 struct page *page; 929 struct page *page;
896 int ret; 930 int ret;
931 struct ext4_group_info *grp;
932 struct ext4_sb_info *sbi = EXT4_SB(sb);
933 struct inode *inode = sbi->s_buddy_cache;
897 934
898 mb_debug("load group %lu\n", group); 935 mb_debug("load group %u\n", group);
899 936
900 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 937 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
938 grp = ext4_get_group_info(sb, group);
901 939
902 e4b->bd_blkbits = sb->s_blocksize_bits; 940 e4b->bd_blkbits = sb->s_blocksize_bits;
903 e4b->bd_info = ext4_get_group_info(sb, group); 941 e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
905 e4b->bd_group = group; 943 e4b->bd_group = group;
906 e4b->bd_buddy_page = NULL; 944 e4b->bd_buddy_page = NULL;
907 e4b->bd_bitmap_page = NULL; 945 e4b->bd_bitmap_page = NULL;
946 e4b->alloc_semp = &grp->alloc_sem;
947
948 /* Take the read lock on the group alloc
949 * sem. This makes sure a parallel
950 * ext4_mb_init_group happening on other
951 * groups mapped by the page is blocked
952 * till we are done with the allocation
953 */
954 down_read(e4b->alloc_semp);
908 955
909 /* 956 /*
910 * the buddy cache inode stores the block bitmap 957 * the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
920 page = find_get_page(inode->i_mapping, pnum); 967 page = find_get_page(inode->i_mapping, pnum);
921 if (page == NULL || !PageUptodate(page)) { 968 if (page == NULL || !PageUptodate(page)) {
922 if (page) 969 if (page)
970 /*
971 * drop the page reference and try
972 * to get the page with the lock held. If the
973 * page is not uptodate, that implies
974 * somebody just created the page but
975 * has not yet initialized it. So
976 * wait for it to be initialized.
977 */
923 page_cache_release(page); 978 page_cache_release(page);
924 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 979 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
925 if (page) { 980 if (page) {
@@ -985,6 +1040,9 @@ err:
985 page_cache_release(e4b->bd_buddy_page); 1040 page_cache_release(e4b->bd_buddy_page);
986 e4b->bd_buddy = NULL; 1041 e4b->bd_buddy = NULL;
987 e4b->bd_bitmap = NULL; 1042 e4b->bd_bitmap = NULL;
1043
1044 /* Done with the buddy cache */
1045 up_read(e4b->alloc_semp);
988 return ret; 1046 return ret;
989} 1047}
990 1048
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
994 page_cache_release(e4b->bd_bitmap_page); 1052 page_cache_release(e4b->bd_bitmap_page);
995 if (e4b->bd_buddy_page) 1053 if (e4b->bd_buddy_page)
996 page_cache_release(e4b->bd_buddy_page); 1054 page_cache_release(e4b->bd_buddy_page);
1055 /* Done with the buddy cache */
1056 if (e4b->alloc_semp)
1057 up_read(e4b->alloc_semp);
997} 1058}
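
With alloc_semp in play, ext4_mb_load_buddy() and ext4_mb_release_desc() now also bracket a read-side critical section on the group's alloc_sem, so every load must be paired with a release. A hypothetical caller showing the expected pairing:

	static void example_scan_group(struct super_block *sb, ext4_group_t group)
	{
		struct ext4_buddy e4b;

		if (ext4_mb_load_buddy(sb, group, &e4b)) /* down_read(alloc_sem) */
			return;
		/* ... inspect or modify the in-core bitmap/buddy ... */
		ext4_mb_release_desc(&e4b);		 /* up_read(alloc_sem) */
	}
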
998 1059
999 1060
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1031 cur += 32; 1092 cur += 32;
1032 continue; 1093 continue;
1033 } 1094 }
1034 mb_clear_bit_atomic(lock, cur, bm); 1095 if (lock)
1096 mb_clear_bit_atomic(lock, cur, bm);
1097 else
1098 mb_clear_bit(cur, bm);
1035 cur++; 1099 cur++;
1036 } 1100 }
1037} 1101}
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1049 cur += 32; 1113 cur += 32;
1050 continue; 1114 continue;
1051 } 1115 }
1052 mb_set_bit_atomic(lock, cur, bm); 1116 if (lock)
1117 mb_set_bit_atomic(lock, cur, bm);
1118 else
1119 mb_set_bit(cur, bm);
1053 cur++; 1120 cur++;
1054 } 1121 }
1055} 1122}
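
The NULL-lock variants let a caller that is already serialized (typically by ext4_lock_group()) use the cheaper non-atomic bit operations. A hypothetical snippet illustrating the two conventions:

	static void example_mark_range(struct super_block *sb, ext4_group_t group,
				       void *bitmap, int start, int len)
	{
		/* already serialized by the group lock: non-atomic path */
		ext4_lock_group(sb, group);
		mb_set_bits(NULL, bitmap, start, len);
		ext4_unlock_group(sb, group);
	}

	/* unserialized callers instead pass the per-group spinlock:
	 *	mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), bitmap, start, len);
	 */
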
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1094 blocknr += block; 1161 blocknr += block;
1095 blocknr += 1162 blocknr +=
1096 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1163 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1097 ext4_unlock_group(sb, e4b->bd_group); 1164 ext4_grp_locked_error(sb, e4b->bd_group,
1098 ext4_error(sb, __func__, "double-free of inode" 1165 __func__, "double-free of inode"
1099 " %lu's block %llu(bit %u in group %lu)\n", 1166 " %lu's block %llu(bit %u in group %u)",
1100 inode ? inode->i_ino : 0, blocknr, block, 1167 inode ? inode->i_ino : 0, blocknr, block,
1101 e4b->bd_group); 1168 e4b->bd_group);
1102 ext4_lock_group(sb, e4b->bd_group);
1103 } 1169 }
1104 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1170 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1105 e4b->bd_info->bb_counters[order]++; 1171 e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1296 ac->ac_tail = ret & 0xffff; 1362 ac->ac_tail = ret & 0xffff;
1297 ac->ac_buddy = ret >> 16; 1363 ac->ac_buddy = ret >> 16;
1298 1364
1299 /* XXXXXXX: SUCH A HORRIBLE **CK */ 1365 /*
1300 /*FIXME!! Why ? */ 1366 * take the page reference. We want the page to be pinned
1367 * so that we don't get an ext4_mb_init_cache call for this
1368 * group until we update the bitmap. Otherwise we
1369 * could double-allocate blocks. The reference is dropped
1370 * in ext4_mb_release_context
1371 */
1301 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1372 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1302 get_page(ac->ac_bitmap_page); 1373 get_page(ac->ac_bitmap_page);
1303 ac->ac_buddy_page = e4b->bd_buddy_page; 1374 ac->ac_buddy_page = e4b->bd_buddy_page;
1304 get_page(ac->ac_buddy_page); 1375 get_page(ac->ac_buddy_page);
1305 1376 /* on allocation we use ac to track the held semaphore */
1377 ac->alloc_semp = e4b->alloc_semp;
1378 e4b->alloc_semp = NULL;
1306 /* store last allocated for subsequent stream allocation */ 1379 /* store last allocated for subsequent stream allocation */
1307 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1380 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1308 spin_lock(&sbi->s_md_lock); 1381 spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1326 struct ext4_free_extent ex; 1399 struct ext4_free_extent ex;
1327 int max; 1400 int max;
1328 1401
1402 if (ac->ac_status == AC_STATUS_FOUND)
1403 return;
1329 /* 1404 /*
1330 * We don't want to scan for a whole year 1405 * We don't want to scan for a whole year
1331 */ 1406 */
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1575 * free blocks even though group info says we 1650 * free blocks even though group info says we
1576 * have free blocks 1651 * have free blocks
1577 */ 1652 */
1578 ext4_error(sb, __func__, "%d free blocks as per " 1653 ext4_grp_locked_error(sb, e4b->bd_group,
1579 "group info. But bitmap says 0\n", 1654 __func__, "%d free blocks as per "
1655 "group info. But bitmap says 0",
1580 free); 1656 free);
1581 break; 1657 break;
1582 } 1658 }
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1584 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1660 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1585 BUG_ON(ex.fe_len <= 0); 1661 BUG_ON(ex.fe_len <= 0);
1586 if (free < ex.fe_len) { 1662 if (free < ex.fe_len) {
1587 ext4_error(sb, __func__, "%d free blocks as per " 1663 ext4_grp_locked_error(sb, e4b->bd_group,
1588 "group info. But got %d blocks\n", 1664 __func__, "%d free blocks as per "
1665 "group info. But got %d blocks",
1589 free, ex.fe_len); 1666 free, ex.fe_len);
1590 /* 1667 /*
1591 * The number of free blocks differs. This mostly 1668 * The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1692 return 0; 1769 return 0;
1693} 1770}
1694 1771
1772/*
1773 * lock the group_info alloc_sem of all the groups
1774 * belonging to the same buddy cache page. This
1775 * makes sure other parallel operations on the buddy
1776 * cache don't happen while holding the buddy cache
1777 * lock
1778 */
1779int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1780{
1781 int i;
1782 int block, pnum;
1783 int blocks_per_page;
1784 int groups_per_page;
1785 ext4_group_t first_group;
1786 struct ext4_group_info *grp;
1787
1788 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1789 /*
1790 * the buddy cache inode stores the block bitmap
1791 * and buddy information in consecutive blocks.
1792 * So for each group we need two blocks.
1793 */
1794 block = group * 2;
1795 pnum = block / blocks_per_page;
1796 first_group = pnum * blocks_per_page / 2;
1797
1798 groups_per_page = blocks_per_page >> 1;
1799 if (groups_per_page == 0)
1800 groups_per_page = 1;
1801 /* read all groups the page covers into the cache */
1802 for (i = 0; i < groups_per_page; i++) {
1803
1804 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1805 break;
1806 grp = ext4_get_group_info(sb, first_group + i);
1807 /* take each group's write allocation
1808 * semaphore. This makes sure there is
1809 * no block allocation going on in any
1810 * of those groups
1811 */
1812 down_write_nested(&grp->alloc_sem, i);
1813 }
1814 return i;
1815}
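
A worked example of the block-to-page arithmetic above: each group owns two consecutive blocks in the buddy-cache inode (bitmap, then buddy). With 4K pages and 1K blocks, blocks_per_page = 4, so page pnum holds blocks 4*pnum through 4*pnum+3, i.e. the bitmap/buddy pairs of groups 2*pnum and 2*pnum+1 (first_group = pnum * blocks_per_page / 2 and groups_per_page = 2). With 4K blocks, blocks_per_page = 1, groups_per_page is clamped to 1, and a group's bitmap and buddy land on two consecutive pages.
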
1816
1817void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1818 ext4_group_t group, int locked_group)
1819{
1820 int i;
1821 int block, pnum;
1822 int blocks_per_page;
1823 ext4_group_t first_group;
1824 struct ext4_group_info *grp;
1825
1826 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1827 /*
1828 * the buddy cache inode stores the block bitmap
1829 * and buddy information in consecutive blocks.
1830 * So for each group we need two blocks.
1831 */
1832 block = group * 2;
1833 pnum = block / blocks_per_page;
1834 first_group = pnum * blocks_per_page / 2;
1835 /* release locks on all the groups */
1836 for (i = 0; i < locked_group; i++) {
1837
1838 grp = ext4_get_group_info(sb, first_group + i);
1839 /* release each group's write allocation
1840 * semaphore taken in
1841 * ext4_mb_get_buddy_cache_lock, letting
1842 * block allocation in those groups resume
1843 */
1844 up_write(&grp->alloc_sem);
1845 }
1846
1847}
1848
1849static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1850{
1851
1852 int ret;
1853 void *bitmap;
1854 int blocks_per_page;
1855 int block, pnum, poff;
1856 int num_grp_locked = 0;
1857 struct ext4_group_info *this_grp;
1858 struct ext4_sb_info *sbi = EXT4_SB(sb);
1859 struct inode *inode = sbi->s_buddy_cache;
1860 struct page *page = NULL, *bitmap_page = NULL;
1861
1862 mb_debug("init group %u\n", group);
1863 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1864 this_grp = ext4_get_group_info(sb, group);
1865 /*
1866 * This ensures we don't add a group
1867 * to this buddy cache via resize
1868 */
1869 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1870 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1871 /*
1872 * somebody initialized the group
1873 * return without doing anything
1874 */
1875 ret = 0;
1876 goto err;
1877 }
1878 /*
1879 * the buddy cache inode stores the block bitmap
1880 * and buddy information in consecutive blocks.
1881 * So for each group we need two blocks.
1882 */
1883 block = group * 2;
1884 pnum = block / blocks_per_page;
1885 poff = block % blocks_per_page;
1886 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1887 if (page) {
1888 BUG_ON(page->mapping != inode->i_mapping);
1889 ret = ext4_mb_init_cache(page, NULL);
1890 if (ret) {
1891 unlock_page(page);
1892 goto err;
1893 }
1894 unlock_page(page);
1895 }
1896 if (page == NULL || !PageUptodate(page)) {
1897 ret = -EIO;
1898 goto err;
1899 }
1900 mark_page_accessed(page);
1901 bitmap_page = page;
1902 bitmap = page_address(page) + (poff * sb->s_blocksize);
1903
1904 /* init buddy cache */
1905 block++;
1906 pnum = block / blocks_per_page;
1907 poff = block % blocks_per_page;
1908 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1909 if (page == bitmap_page) {
1910 /*
1911 * If both the bitmap and buddy are in
1912 * the same page we don't need to force
1913 * init the buddy
1914 */
1915 unlock_page(page);
1916 } else if (page) {
1917 BUG_ON(page->mapping != inode->i_mapping);
1918 ret = ext4_mb_init_cache(page, bitmap);
1919 if (ret) {
1920 unlock_page(page);
1921 goto err;
1922 }
1923 unlock_page(page);
1924 }
1925 if (page == NULL || !PageUptodate(page)) {
1926 ret = -EIO;
1927 goto err;
1928 }
1929 mark_page_accessed(page);
1930err:
1931 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1932 if (bitmap_page)
1933 page_cache_release(bitmap_page);
1934 if (page)
1935 page_cache_release(page);
1936 return ret;
1937}
1938
1695static noinline_for_stack int 1939static noinline_for_stack int
1696ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1940ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1697{ 1941{
@@ -1775,7 +2019,7 @@ repeat:
1775 group = 0; 2019 group = 0;
1776 2020
1777 /* quick check to skip empty groups */ 2021 /* quick check to skip empty groups */
1778 grp = ext4_get_group_info(ac->ac_sb, group); 2022 grp = ext4_get_group_info(sb, group);
1779 if (grp->bb_free == 0) 2023 if (grp->bb_free == 0)
1780 continue; 2024 continue;
1781 2025
@@ -1788,10 +2032,9 @@ repeat:
1788 * we need full data about the group 2032 * we need full data about the group
1789 * to make a good selection 2033 * to make a good selection
1790 */ 2034 */
1791 err = ext4_mb_load_buddy(sb, group, &e4b); 2035 err = ext4_mb_init_group(sb, group);
1792 if (err) 2036 if (err)
1793 goto out; 2037 goto out;
1794 ext4_mb_release_desc(&e4b);
1795 } 2038 }
1796 2039
1797 /* 2040 /*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1932 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2175 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
1933 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2176 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
1934 "%-5u %-5s %-5u %-6u\n"; 2177 "%-5u %-5s %-5u %-6u\n";
1935 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2178 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1936 hs->result.fe_start, hs->result.fe_len, 2179 hs->result.fe_start, hs->result.fe_len,
1937 hs->result.fe_logical); 2180 hs->result.fe_logical);
1938 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2181 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1939 hs->orig.fe_start, hs->orig.fe_len, 2182 hs->orig.fe_start, hs->orig.fe_len,
1940 hs->orig.fe_logical); 2183 hs->orig.fe_logical);
1941 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, 2184 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
1942 hs->goal.fe_start, hs->goal.fe_len, 2185 hs->goal.fe_start, hs->goal.fe_len,
1943 hs->goal.fe_logical); 2186 hs->goal.fe_logical);
1944 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, 2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1947 hs->buddy ? 1 << hs->buddy : 0); 2190 hs->buddy ? 1 << hs->buddy : 0);
1948 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { 2191 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
1949 fmt = "%-5u %-8u %-23s %-23s %-23s\n"; 2192 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
1950 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2193 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1951 hs->result.fe_start, hs->result.fe_len, 2194 hs->result.fe_start, hs->result.fe_len,
1952 hs->result.fe_logical); 2195 hs->result.fe_logical);
1953 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2196 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1954 hs->orig.fe_start, hs->orig.fe_len, 2197 hs->orig.fe_start, hs->orig.fe_len,
1955 hs->orig.fe_logical); 2198 hs->orig.fe_logical);
1956 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); 2199 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
1957 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { 2200 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
1958 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2201 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1959 hs->result.fe_start, hs->result.fe_len); 2202 hs->result.fe_start, hs->result.fe_len);
1960 seq_printf(seq, "%-5u %-8u %-23s discard\n", 2203 seq_printf(seq, "%-5u %-8u %-23s discard\n",
1961 hs->pid, hs->ino, buf2); 2204 hs->pid, hs->ino, buf2);
1962 } else if (hs->op == EXT4_MB_HISTORY_FREE) { 2205 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
1963 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2206 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1964 hs->result.fe_start, hs->result.fe_len); 2207 hs->result.fe_start, hs->result.fe_len);
1965 seq_printf(seq, "%-5u %-8u %-23s free\n", 2208 seq_printf(seq, "%-5u %-8u %-23s free\n",
1966 hs->pid, hs->ino, buf2); 2209 hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2073 return NULL; 2316 return NULL;
2074 2317
2075 group = *pos + 1; 2318 group = *pos + 1;
2076 return (void *) group; 2319 return (void *) ((unsigned long) group);
2077} 2320}
2078 2321
2079static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2322static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2086 if (*pos < 0 || *pos >= sbi->s_groups_count) 2329 if (*pos < 0 || *pos >= sbi->s_groups_count)
2087 return NULL; 2330 return NULL;
2088 group = *pos + 1; 2331 group = *pos + 1;
2089 return (void *) group;; 2332 return (void *) ((unsigned long) group);
2090} 2333}
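
The double cast here is deliberate: ext4_group_t is now a 32-bit unsigned int, and casting it straight to void * would trip GCC's integer-to-pointer size warning on 64-bit builds. Widening through unsigned long first keeps the round trip warning-free, and ext4_mb_seq_groups_show() narrows the cookie back the same way.
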
2091 2334
2092static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2335static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2093{ 2336{
2094 struct super_block *sb = seq->private; 2337 struct super_block *sb = seq->private;
2095 long group = (long) v; 2338 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2096 int i; 2339 int i;
2097 int err; 2340 int err;
2098 struct ext4_buddy e4b; 2341 struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2114 sizeof(struct ext4_group_info); 2357 sizeof(struct ext4_group_info);
2115 err = ext4_mb_load_buddy(sb, group, &e4b); 2358 err = ext4_mb_load_buddy(sb, group, &e4b);
2116 if (err) { 2359 if (err) {
2117 seq_printf(seq, "#%-5lu: I/O error\n", group); 2360 seq_printf(seq, "#%-5u: I/O error\n", group);
2118 return 0; 2361 return 0;
2119 } 2362 }
2120 ext4_lock_group(sb, group); 2363 ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2122 ext4_unlock_group(sb, group); 2365 ext4_unlock_group(sb, group);
2123 ext4_mb_release_desc(&e4b); 2366 ext4_mb_release_desc(&e4b);
2124 2367
2125 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, 2368 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2126 sg.info.bb_fragments, sg.info.bb_first_free); 2369 sg.info.bb_fragments, sg.info.bb_first_free);
2127 for (i = 0; i <= 13; i++) 2370 for (i = 0; i <= 13; i++)
2128 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 2371 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2296 ext4_free_blocks_after_init(sb, group, desc); 2539 ext4_free_blocks_after_init(sb, group, desc);
2297 } else { 2540 } else {
2298 meta_group_info[i]->bb_free = 2541 meta_group_info[i]->bb_free =
2299 le16_to_cpu(desc->bg_free_blocks_count); 2542 ext4_free_blks_count(sb, desc);
2300 } 2543 }
2301 2544
2302 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2545 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2546 init_rwsem(&meta_group_info[i]->alloc_sem);
2303 meta_group_info[i]->bb_free_root.rb_node = NULL; 2547 meta_group_info[i]->bb_free_root.rb_node = NULL;
2304 2548
2305#ifdef DOUBLE_CHECK 2549#ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
2327} /* ext4_mb_add_groupinfo */ 2571} /* ext4_mb_add_groupinfo */
2328 2572
2329/* 2573/*
2330 * Add a group to the existing groups.
2331 * This function is used for online resize
2332 */
2333int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
2334 struct ext4_group_desc *desc)
2335{
2336 struct ext4_sb_info *sbi = EXT4_SB(sb);
2337 struct inode *inode = sbi->s_buddy_cache;
2338 int blocks_per_page;
2339 int block;
2340 int pnum;
2341 struct page *page;
2342 int err;
2343
2344 /* Add group based on group descriptor*/
2345 err = ext4_mb_add_groupinfo(sb, group, desc);
2346 if (err)
2347 return err;
2348
2349 /*
2350 * Cache pages containing dynamic mb_alloc data (buddy and bitmap
2351 * data) are marked not up to date so that they will be re-initialized
2352 * during the next call to ext4_mb_load_buddy
2353 */
2354
2355 /* Set buddy page as not up to date */
2356 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2357 block = group * 2;
2358 pnum = block / blocks_per_page;
2359 page = find_get_page(inode->i_mapping, pnum);
2360 if (page != NULL) {
2361 ClearPageUptodate(page);
2362 page_cache_release(page);
2363 }
2364
2365 /* Set bitmap page as not up to date */
2366 block++;
2367 pnum = block / blocks_per_page;
2368 page = find_get_page(inode->i_mapping, pnum);
2369 if (page != NULL) {
2370 ClearPageUptodate(page);
2371 page_cache_release(page);
2372 }
2373
2374 return 0;
2375}
2376
2377/*
2378 * Update an existing group. 2574 * Update an existing group.
2379 * This function is used for online resize 2575 * This function is used for online resize
2380 */ 2576 */
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2457 desc = ext4_get_group_desc(sb, i, NULL); 2653 desc = ext4_get_group_desc(sb, i, NULL);
2458 if (desc == NULL) { 2654 if (desc == NULL) {
2459 printk(KERN_ERR 2655 printk(KERN_ERR
2460 "EXT4-fs: can't read descriptor %lu\n", i); 2656 "EXT4-fs: can't read descriptor %u\n", i);
2461 goto err_freebuddy; 2657 goto err_freebuddy;
2462 } 2658 }
2463 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2659 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2493 if (sbi->s_mb_offsets == NULL) { 2689 if (sbi->s_mb_offsets == NULL) {
2494 return -ENOMEM; 2690 return -ENOMEM;
2495 } 2691 }
2692
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2496 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2497 if (sbi->s_mb_maxs == NULL) { 2695 if (sbi->s_mb_maxs == NULL) {
2498 kfree(sbi->s_mb_offsets); 2696 kfree(sbi->s_mb_offsets);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2551 ext4_mb_init_per_dev_proc(sb); 2749 ext4_mb_init_per_dev_proc(sb);
2552 ext4_mb_history_init(sb); 2750 ext4_mb_history_init(sb);
2553 2751
2554 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2752 if (sbi->s_journal)
2753 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2555 2754
2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2755 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2557 return 0; 2756 return 0;
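
The new guard matters for no-journal mode, where sbi->s_journal is NULL and there is no commit to hook. For reference, jbd2 fires this callback at the end of transaction commit, approximately as follows (recalled from the jbd2 commit path, not part of this patch):

	/* tail of jbd2_journal_commit_transaction(), approximately */
	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);
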
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2652 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2851 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2653 entry = list_entry(l, struct ext4_free_data, list); 2852 entry = list_entry(l, struct ext4_free_data, list);
2654 2853
2655 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2854 mb_debug("gonna free %u blocks in group %u (0x%p):",
2656 entry->count, entry->group, entry); 2855 entry->count, entry->group, entry);
2657 2856
2658 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2857 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2679 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2878 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2680 + entry->start_blk 2879 + entry->start_blk
2681 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2880 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2682 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, 2881 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
2683 (unsigned long long) discard_block, entry->count); 2882 sb->s_id, (unsigned long long) discard_block,
2883 entry->count);
2684 sb_issue_discard(sb, discard_block, entry->count); 2884 sb_issue_discard(sb, discard_block, entry->count);
2685 2885
2686 kmem_cache_free(ext4_free_ext_cachep, entry); 2886 kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
2791 */ 2991 */
2792static noinline_for_stack int 2992static noinline_for_stack int
2793ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2993ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2794 handle_t *handle, unsigned long reserv_blks) 2994 handle_t *handle, unsigned int reserv_blks)
2795{ 2995{
2796 struct buffer_head *bitmap_bh = NULL; 2996 struct buffer_head *bitmap_bh = NULL;
2797 struct ext4_super_block *es; 2997 struct ext4_super_block *es;
@@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2824 if (!gdp) 3024 if (!gdp)
2825 goto out_err; 3025 goto out_err;
2826 3026
2827 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, 3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2828 gdp->bg_free_blocks_count); 3028 gdp->bg_free_blocks_count);
2829 3029
2830 err = ext4_journal_get_write_access(handle, gdp_bh); 3030 err = ext4_journal_get_write_access(handle, gdp_bh);
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2843 in_range(block + len - 1, ext4_inode_table(sb, gdp), 3043 in_range(block + len - 1, ext4_inode_table(sb, gdp),
2844 EXT4_SB(sb)->s_itb_per_group)) { 3044 EXT4_SB(sb)->s_itb_per_group)) {
2845 ext4_error(sb, __func__, 3045 ext4_error(sb, __func__,
2846 "Allocating block in system zone - block = %llu", 3046 "Allocating block %llu in system zone of %d group\n",
2847 block); 3047 block, ac->ac_b_ex.fe_group);
2848 /* File system mounted not to panic on error 3048 /* File system mounted not to panic on error
2849 * Fix the bitmap and repeat the block allocation 3049 * Fix the bitmap and repeat the block allocation
2850 * We leak some of the blocks here. 3050 * We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2852 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 3052 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
2853 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3053 bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2854 ac->ac_b_ex.fe_len); 3054 ac->ac_b_ex.fe_len);
2855 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3055 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2856 if (!err) 3056 if (!err)
2857 err = -EAGAIN; 3057 err = -EAGAIN;
2858 goto out_err; 3058 goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2866 } 3066 }
2867 } 3067 }
2868#endif 3068#endif
2869 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
2870 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2871
2872 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3069 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3070 mb_set_bits(NULL, bitmap_bh->b_data,
3071 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2873 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 3072 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2874 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3073 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2875 gdp->bg_free_blocks_count = 3074 ext4_free_blks_set(sb, gdp,
2876 cpu_to_le16(ext4_free_blocks_after_init(sb, 3075 ext4_free_blocks_after_init(sb,
2877 ac->ac_b_ex.fe_group, 3076 ac->ac_b_ex.fe_group, gdp));
2878 gdp));
2879 } 3077 }
2880 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 3078 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3079 ext4_free_blks_set(sb, gdp, len);
2881 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3080 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2882 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3081 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2883 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3082 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2899 spin_unlock(sb_bgl_lock(sbi, flex_group)); 3098 spin_unlock(sb_bgl_lock(sbi, flex_group));
2900 } 3099 }
2901 3100
2902 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3101 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2903 if (err) 3102 if (err)
2904 goto out_err; 3103 goto out_err;
2905 err = ext4_journal_dirty_metadata(handle, gdp_bh); 3104 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2906 3105
2907out_err: 3106out_err:
2908 sb->s_dirt = 1; 3107 sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3031 /* check we don't cross already preallocated blocks */ 3230 /* check we don't cross already preallocated blocks */
3032 rcu_read_lock(); 3231 rcu_read_lock();
3033 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3232 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3034 unsigned long pa_end; 3233 ext4_lblk_t pa_end;
3035 3234
3036 if (pa->pa_deleted) 3235 if (pa->pa_deleted)
3037 continue; 3236 continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3075 /* XXX: extra loop to check we really don't overlap preallocations */ 3274 /* XXX: extra loop to check we really don't overlap preallocations */
3076 rcu_read_lock(); 3275 rcu_read_lock();
3077 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3276 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3078 unsigned long pa_end; 3277 ext4_lblk_t pa_end;
3079 spin_lock(&pa->pa_lock); 3278 spin_lock(&pa->pa_lock);
3080 if (pa->pa_deleted == 0) { 3279 if (pa->pa_deleted == 0) {
3081 pa_end = pa->pa_lstart + pa->pa_len; 3280 pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3307} 3506}
3308 3507
3309/* 3508/*
3509 * the function goes through all blocks freed in the group
3510 * but not yet committed and marks them used in the in-core bitmap,
3511 * so the buddy must be generated from this bitmap.
3512 * Needs to be called with the ext4 group lock held (ext4_lock_group)
3513 */
3514static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3515 ext4_group_t group)
3516{
3517 struct rb_node *n;
3518 struct ext4_group_info *grp;
3519 struct ext4_free_data *entry;
3520
3521 grp = ext4_get_group_info(sb, group);
3522 n = rb_first(&(grp->bb_free_root));
3523
3524 while (n) {
3525 entry = rb_entry(n, struct ext4_free_data, node);
3526 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
3527 bitmap, entry->start_blk,
3528 entry->count);
3529 n = rb_next(n);
3530 }
3531 return;
3532}
3533
3534/*
3310 * the function goes through all preallocations in this group and marks them 3535 * the function goes through all preallocations in this group and marks them
3311 * used in the in-core bitmap; the buddy must be generated from this bitmap 3536 * used in the in-core bitmap; the buddy must be generated from this bitmap
3312 * Needs to be called with the ext4 group lock held (ext4_lock_group) 3537 * Needs to be called with the ext4 group lock held (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3346 preallocated += len; 3571 preallocated += len;
3347 count++; 3572 count++;
3348 } 3573 }
3349 mb_debug("preallocated %u for group %lu\n", preallocated, group); 3574 mb_debug("preallocated %u for group %u\n", preallocated, group);
3350} 3575}
3351 3576
3352static void ext4_mb_pa_callback(struct rcu_head *head) 3577static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
3363static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3588static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3364 struct super_block *sb, struct ext4_prealloc_space *pa) 3589 struct super_block *sb, struct ext4_prealloc_space *pa)
3365{ 3590{
3366 unsigned long grp; 3591 ext4_group_t grp;
3367 3592
3368 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3593 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3369 return; 3594 return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3473 3698
3474 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3699 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3475 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3700 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3701 trace_mark(ext4_mb_new_inode_pa,
3702 "dev %s ino %lu pstart %llu len %u lstart %u",
3703 sb->s_id, ac->ac_inode->i_ino,
3704 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3476 3705
3477 ext4_mb_use_inode_pa(ac, pa); 3706 ext4_mb_use_inode_pa(ac, pa);
3478 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3707 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3530 pa->pa_linear = 1; 3759 pa->pa_linear = 1;
3531 3760
3532 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3761 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3533 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3762 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3763 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
3764 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3534 3765
3535 ext4_mb_use_group_pa(ac, pa); 3766 ext4_mb_use_group_pa(ac, pa);
3536 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3767 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3579{ 3810{
3580 struct super_block *sb = e4b->bd_sb; 3811 struct super_block *sb = e4b->bd_sb;
3581 struct ext4_sb_info *sbi = EXT4_SB(sb); 3812 struct ext4_sb_info *sbi = EXT4_SB(sb);
3582 unsigned long end; 3813 unsigned int end;
3583 unsigned long next; 3814 unsigned int next;
3584 ext4_group_t group; 3815 ext4_group_t group;
3585 ext4_grpblk_t bit; 3816 ext4_grpblk_t bit;
3817 unsigned long long grp_blk_start;
3586 sector_t start; 3818 sector_t start;
3587 int err = 0; 3819 int err = 0;
3588 int free = 0; 3820 int free = 0;
3589 3821
3590 BUG_ON(pa->pa_deleted == 0); 3822 BUG_ON(pa->pa_deleted == 0);
3591 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3823 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3824 grp_blk_start = pa->pa_pstart - bit;
3592 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3825 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3593 end = bit + pa->pa_len; 3826 end = bit + pa->pa_len;
3594 3827
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3618 ext4_mb_store_history(ac); 3851 ext4_mb_store_history(ac);
3619 } 3852 }
3620 3853
3854 trace_mark(ext4_mb_release_inode_pa,
3855 "dev %s ino %lu block %llu count %u",
3856 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
3857 next - bit);
3621 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3858 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3622 bit = next + 1; 3859 bit = next + 1;
3623 } 3860 }
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3626 pa, (unsigned long) pa->pa_lstart, 3863 pa, (unsigned long) pa->pa_lstart,
3627 (unsigned long) pa->pa_pstart, 3864 (unsigned long) pa->pa_pstart,
3628 (unsigned long) pa->pa_len); 3865 (unsigned long) pa->pa_len);
3629 ext4_error(sb, __func__, "free %u, pa_free %u\n", 3866 ext4_grp_locked_error(sb, group,
3630 free, pa->pa_free); 3867 __func__, "free %u, pa_free %u",
3868 free, pa->pa_free);
3631 /* 3869 /*
3632 * pa is already deleted so we use the value obtained 3870 * pa is already deleted so we use the value obtained
3633 * from the bitmap and continue. 3871 * from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3650 if (ac) 3888 if (ac)
3651 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3889 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3652 3890
3891 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
3892 sb->s_id, pa->pa_pstart, pa->pa_len);
3653 BUG_ON(pa->pa_deleted == 0); 3893 BUG_ON(pa->pa_deleted == 0);
3654 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3894 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3655 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3895 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3692 int busy = 0; 3932 int busy = 0;
3693 int free = 0; 3933 int free = 0;
3694 3934
3695 mb_debug("discard preallocation for group %lu\n", group); 3935 mb_debug("discard preallocation for group %u\n", group);
3696 3936
3697 if (list_empty(&grp->bb_prealloc_list)) 3937 if (list_empty(&grp->bb_prealloc_list))
3698 return 0; 3938 return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3700 bitmap_bh = ext4_read_block_bitmap(sb, group); 3940 bitmap_bh = ext4_read_block_bitmap(sb, group);
3701 if (bitmap_bh == NULL) { 3941 if (bitmap_bh == NULL) {
3702 ext4_error(sb, __func__, "Error in reading block " 3942 ext4_error(sb, __func__, "Error in reading block "
3703 "bitmap for %lu\n", group); 3943 "bitmap for %u", group);
3704 return 0; 3944 return 0;
3705 } 3945 }
3706 3946
3707 err = ext4_mb_load_buddy(sb, group, &e4b); 3947 err = ext4_mb_load_buddy(sb, group, &e4b);
3708 if (err) { 3948 if (err) {
3709 ext4_error(sb, __func__, "Error in loading buddy " 3949 ext4_error(sb, __func__, "Error in loading buddy "
3710 "information for %lu\n", group); 3950 "information for %u", group);
3711 put_bh(bitmap_bh); 3951 put_bh(bitmap_bh);
3712 return 0; 3952 return 0;
3713 } 3953 }
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
3815 } 4055 }
3816 4056
3817 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4057 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
4058 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
4059 inode->i_ino);
3818 4060
3819 INIT_LIST_HEAD(&list); 4061 INIT_LIST_HEAD(&list);
3820 4062
@@ -3874,14 +4116,14 @@ repeat:
3874 err = ext4_mb_load_buddy(sb, group, &e4b); 4116 err = ext4_mb_load_buddy(sb, group, &e4b);
3875 if (err) { 4117 if (err) {
3876 ext4_error(sb, __func__, "Error in loading buddy " 4118 ext4_error(sb, __func__, "Error in loading buddy "
3877 "information for %lu\n", group); 4119 "information for %u", group);
3878 continue; 4120 continue;
3879 } 4121 }
3880 4122
3881 bitmap_bh = ext4_read_block_bitmap(sb, group); 4123 bitmap_bh = ext4_read_block_bitmap(sb, group);
3882 if (bitmap_bh == NULL) { 4124 if (bitmap_bh == NULL) {
3883 ext4_error(sb, __func__, "Error in reading block " 4125 ext4_error(sb, __func__, "Error in reading block "
3884 "bitmap for %lu\n", group); 4126 "bitmap for %u", group);
3885 ext4_mb_release_desc(&e4b); 4127 ext4_mb_release_desc(&e4b);
3886 continue; 4128 continue;
3887 } 4129 }
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4024 struct ext4_sb_info *sbi = EXT4_SB(sb); 4266 struct ext4_sb_info *sbi = EXT4_SB(sb);
4025 struct ext4_super_block *es = sbi->s_es; 4267 struct ext4_super_block *es = sbi->s_es;
4026 ext4_group_t group; 4268 ext4_group_t group;
4027 unsigned long len; 4269 unsigned int len;
4028 unsigned long goal; 4270 ext4_fsblk_t goal;
4029 ext4_grpblk_t block; 4271 ext4_grpblk_t block;
4030 4272
4031 /* we can't allocate > group size */ 4273 /* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4068 ac->ac_pa = NULL; 4310 ac->ac_pa = NULL;
4069 ac->ac_bitmap_page = NULL; 4311 ac->ac_bitmap_page = NULL;
4070 ac->ac_buddy_page = NULL; 4312 ac->ac_buddy_page = NULL;
4313 ac->alloc_semp = NULL;
4071 ac->ac_lg = NULL; 4314 ac->ac_lg = NULL;
4072 4315
4073 /* we have to define context: will we work with a file or 4316 /* we have to define context: will we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4146 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4389 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4147 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4390 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4148 ext4_error(sb, __func__, "Error in loading buddy " 4391 ext4_error(sb, __func__, "Error in loading buddy "
4149 "information for %lu\n", group); 4392 "information for %u", group);
4150 continue; 4393 continue;
4151 } 4394 }
4152 ext4_lock_group(sb, group); 4395 ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4248 } 4491 }
4249 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4492 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4250 } 4493 }
4494 if (ac->alloc_semp)
4495 up_read(ac->alloc_semp);
4251 if (ac->ac_bitmap_page) 4496 if (ac->ac_bitmap_page)
4252 page_cache_release(ac->ac_bitmap_page); 4497 page_cache_release(ac->ac_bitmap_page);
4253 if (ac->ac_buddy_page) 4498 if (ac->ac_buddy_page)
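The new ac->alloc_semp field (declared in the mballoc.h hunk further down) records a rw_semaphore that is read-acquired while an allocation uses a buddy group and must be dropped exactly once when the context is released, as here. A minimal sketch of the pairing, with illustrative names:

#include <linux/rwsem.h>

struct alloc_ctx {
	struct rw_semaphore *alloc_semp;	/* NULL when not held */
};

/* read-acquired when the buddy group is loaded for this allocation ... */
static void alloc_ctx_pin(struct alloc_ctx *ac, struct rw_semaphore *sem)
{
	down_read(sem);
	ac->alloc_semp = sem;
}

/* ... and released exactly once when the context is torn down */
static void alloc_ctx_unpin(struct alloc_ctx *ac)
{
	if (ac->alloc_semp) {
		up_read(ac->alloc_semp);
		ac->alloc_semp = NULL;
	}
}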
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4264 int ret; 4509 int ret;
4265 int freed = 0; 4510 int freed = 0;
4266 4511
4512 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
4513 sb->s_id, needed);
4267 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { 4514 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4268 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4515 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4269 freed += ret; 4516 freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4286 struct ext4_sb_info *sbi; 4533 struct ext4_sb_info *sbi;
4287 struct super_block *sb; 4534 struct super_block *sb;
4288 ext4_fsblk_t block = 0; 4535 ext4_fsblk_t block = 0;
4289 unsigned long inquota; 4536 unsigned int inquota;
4290 unsigned long reserv_blks = 0; 4537 unsigned int reserv_blks = 0;
4291 4538
4292 sb = ar->inode->i_sb; 4539 sb = ar->inode->i_sb;
4293 sbi = EXT4_SB(sb); 4540 sbi = EXT4_SB(sb);
4294 4541
4542 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
4543 "lblk %llu goal %llu lleft %llu lright %llu "
4544 "pleft %llu pright %llu ",
4545 sb->s_id, ar->flags, ar->len,
4546 ar->inode ? ar->inode->i_ino : 0,
4547 (unsigned long long) ar->logical,
4548 (unsigned long long) ar->goal,
4549 (unsigned long long) ar->lleft,
4550 (unsigned long long) ar->lright,
4551 (unsigned long long) ar->pleft,
4552 (unsigned long long) ar->pright);
4553
4295 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4554 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4296 /* 4555 /*
4297 * With delalloc we already reserved the blocks 4556 * With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 } 4572 }
4314 if (ar->len == 0) { 4573 if (ar->len == 0) {
4315 *errp = -EDQUOT; 4574 *errp = -EDQUOT;
4316 return 0; 4575 goto out3;
4317 } 4576 }
4318 inquota = ar->len; 4577 inquota = ar->len;
4319 4578
@@ -4348,10 +4607,14 @@ repeat:
4348 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4607 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4349 ext4_mb_new_preallocation(ac); 4608 ext4_mb_new_preallocation(ac);
4350 } 4609 }
4351
4352 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4610 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4353 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4611 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4354 if (*errp == -EAGAIN) { 4612 if (*errp == -EAGAIN) {
4613 /*
4614 * drop the reference that we took
4615 * in ext4_mb_use_best_found
4616 */
4617 ext4_mb_release_context(ac);
4355 ac->ac_b_ex.fe_group = 0; 4618 ac->ac_b_ex.fe_group = 0;
4356 ac->ac_b_ex.fe_start = 0; 4619 ac->ac_b_ex.fe_start = 0;
4357 ac->ac_b_ex.fe_len = 0; 4620 ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
4382out1: 4645out1:
4383 if (ar->len < inquota) 4646 if (ar->len < inquota)
4384 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4647 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4648out3:
4649 if (!ar->len) {
4650 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4651 /* release all the reserved blocks if non delalloc */
4652 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4653 reserv_blks);
4654 }
4655
4656 trace_mark(ext4_allocate_blocks,
4657 "dev %s block %llu flags %u len %u ino %lu "
4658 "logical %llu goal %llu lleft %llu lright %llu "
4659 "pleft %llu pright %llu ",
4660 sb->s_id, (unsigned long long) block,
4661 ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
4662 (unsigned long long) ar->logical,
4663 (unsigned long long) ar->goal,
4664 (unsigned long long) ar->lleft,
4665 (unsigned long long) ar->lright,
4666 (unsigned long long) ar->pleft,
4667 (unsigned long long) ar->pright);
4385 4668
4386 return block; 4669 return block;
4387} 4670}
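The new out3 label gives back the delayed-allocation reservation through sbi->s_dirtyblocks_counter, a percpu_counter: adds and subtracts touch only a per-CPU slot, and the global value is approximate unless explicitly summed. A sketch of that counter type's basic use, assuming only the <linux/percpu_counter.h> API:

#include <linux/percpu_counter.h>

static struct percpu_counter dirty_blocks;

static int __init demo_counters_init(void)
{
	return percpu_counter_init(&dirty_blocks, 0);	/* initial value 0 */
}

static void demo_reserve(unsigned int nblocks)
{
	percpu_counter_add(&dirty_blocks, nblocks);	/* cheap per-CPU add */
}

static void demo_unreserve(unsigned int nblocks)
{
	percpu_counter_sub(&dirty_blocks, nblocks);	/* as on the out3 path */
}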
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
4403 4686
4404static noinline_for_stack int 4687static noinline_for_stack int
4405ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4688ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4406 ext4_group_t group, ext4_grpblk_t block, int count) 4689 struct ext4_free_data *new_entry)
4407{ 4690{
4691 ext4_grpblk_t block;
4692 struct ext4_free_data *entry;
4408 struct ext4_group_info *db = e4b->bd_info; 4693 struct ext4_group_info *db = e4b->bd_info;
4409 struct super_block *sb = e4b->bd_sb; 4694 struct super_block *sb = e4b->bd_sb;
4410 struct ext4_sb_info *sbi = EXT4_SB(sb); 4695 struct ext4_sb_info *sbi = EXT4_SB(sb);
4411 struct ext4_free_data *entry, *new_entry;
4412 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4696 struct rb_node **n = &db->bb_free_root.rb_node, *node;
4413 struct rb_node *parent = NULL, *new_node; 4697 struct rb_node *parent = NULL, *new_node;
4414 4698
4415 4699 BUG_ON(!ext4_handle_valid(handle));
4416 BUG_ON(e4b->bd_bitmap_page == NULL); 4700 BUG_ON(e4b->bd_bitmap_page == NULL);
4417 BUG_ON(e4b->bd_buddy_page == NULL); 4701 BUG_ON(e4b->bd_buddy_page == NULL);
4418 4702
4419 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4420 new_entry->start_blk = block;
4421 new_entry->group = group;
4422 new_entry->count = count;
4423 new_entry->t_tid = handle->h_transaction->t_tid;
4424 new_node = &new_entry->node; 4703 new_node = &new_entry->node;
4704 block = new_entry->start_blk;
4425 4705
4426 ext4_lock_group(sb, group);
4427 if (!*n) { 4706 if (!*n) {
4428 /* first free block extent. We need to 4707 /* first free block extent. We need to
4429 protect buddy cache from being freed, 4708 protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4441 else if (block >= (entry->start_blk + entry->count)) 4720 else if (block >= (entry->start_blk + entry->count))
4442 n = &(*n)->rb_right; 4721 n = &(*n)->rb_right;
4443 else { 4722 else {
4444 ext4_unlock_group(sb, group); 4723 ext4_grp_locked_error(sb, e4b->bd_group, __func__,
4445 ext4_error(sb, __func__, 4724 "Double free of blocks %d (%d %d)",
4446 "Double free of blocks %d (%d %d)\n", 4725 block, entry->start_blk, entry->count);
4447 block, entry->start_blk, entry->count);
4448 return 0; 4726 return 0;
4449 } 4727 }
4450 } 4728 }
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4483 spin_lock(&sbi->s_md_lock); 4761 spin_lock(&sbi->s_md_lock);
4484 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4762 list_add(&new_entry->list, &handle->h_transaction->t_private_list);
4485 spin_unlock(&sbi->s_md_lock); 4763 spin_unlock(&sbi->s_md_lock);
4486 ext4_unlock_group(sb, group);
4487 return 0; 4764 return 0;
4488} 4765}
4489 4766
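ext4_mb_free_metadata() now takes a caller-built ext4_free_data and only performs the rb-tree insert, under a group lock the caller already holds. The insertion follows the standard <linux/rbtree.h> pattern; a self-contained sketch, keyed by start block as above (type and function names are illustrative):

#include <linux/rbtree.h>
#include <linux/errno.h>

struct free_extent {
	struct rb_node node;
	unsigned int start_blk;
	unsigned int count;
};

/* Insert a pre-allocated extent record, ordered by start block.
 * An overlap means the same blocks were freed twice. */
static int free_extent_insert(struct rb_root *root, struct free_extent *new)
{
	struct rb_node **n = &root->rb_node, *parent = NULL;
	struct free_extent *e;

	while (*n) {
		parent = *n;
		e = rb_entry(parent, struct free_extent, node);
		if (new->start_blk < e->start_blk)
			n = &(*n)->rb_left;
		else if (new->start_blk >= e->start_blk + e->count)
			n = &(*n)->rb_right;
		else
			return -EEXIST;	/* double free */
	}
	rb_link_node(&new->node, parent, n);
	rb_insert_color(&new->node, root);
	return 0;
}

Moving the kmem_cache_alloc() out to the caller matters because the allocation may sleep, which is not allowed once the group lock (a bit spinlock) is taken.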
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4499 struct ext4_allocation_context *ac = NULL; 4776 struct ext4_allocation_context *ac = NULL;
4500 struct ext4_group_desc *gdp; 4777 struct ext4_group_desc *gdp;
4501 struct ext4_super_block *es; 4778 struct ext4_super_block *es;
4502 unsigned long overflow; 4779 unsigned int overflow;
4503 ext4_grpblk_t bit; 4780 ext4_grpblk_t bit;
4504 struct buffer_head *gd_bh; 4781 struct buffer_head *gd_bh;
4505 ext4_group_t block_group; 4782 ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4522 } 4799 }
4523 4800
4524 ext4_debug("freeing block %lu\n", block); 4801 ext4_debug("freeing block %lu\n", block);
4802 trace_mark(ext4_free_blocks,
4803 "dev %s block %llu count %lu metadata %d ino %lu",
4804 sb->s_id, (unsigned long long) block, count, metadata,
4805 inode ? inode->i_ino : 0);
4525 4806
4526 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4807 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4527 if (ac) { 4808 if (ac) {
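ext4_ac_cachep here (and ext4_free_ext_cachep in a later hunk) are slab caches, and GFP_NOFS keeps the allocator from re-entering the filesystem while fs locks may be held. A minimal sketch of the slab-cache lifecycle these allocations assume, with illustrative names:

#include <linux/slab.h>
#include <linux/errno.h>

struct demo_rec {
	unsigned int start, count;
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	demo_cachep = kmem_cache_create("demo_rec", sizeof(struct demo_rec),
					0, SLAB_RECLAIM_ACCOUNT, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}

static struct demo_rec *demo_rec_get(unsigned int start, unsigned int count)
{
	/* GFP_NOFS: this path may already hold filesystem locks */
	struct demo_rec *r = kmem_cache_alloc(demo_cachep, GFP_NOFS);

	if (r) {
		r->start = start;
		r->count = count;
	}
	return r;
}

static void demo_rec_put(struct demo_rec *r)
{
	kmem_cache_free(demo_cachep, r);
}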
@@ -4581,11 +4862,6 @@ do_more:
4581 err = ext4_journal_get_write_access(handle, gd_bh); 4862 err = ext4_journal_get_write_access(handle, gd_bh);
4582 if (err) 4863 if (err)
4583 goto error_return; 4864 goto error_return;
4584
4585 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4586 if (err)
4587 goto error_return;
4588
4589#ifdef AGGRESSIVE_CHECK 4865#ifdef AGGRESSIVE_CHECK
4590 { 4866 {
4591 int i; 4867 int i;
@@ -4593,13 +4869,6 @@ do_more:
4593 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4869 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4594 } 4870 }
4595#endif 4871#endif
4596 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4597 bit, count);
4598
4599 /* We dirtied the bitmap block */
4600 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4601 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
4602
4603 if (ac) { 4872 if (ac) {
4604 ac->ac_b_ex.fe_group = block_group; 4873 ac->ac_b_ex.fe_group = block_group;
4605 ac->ac_b_ex.fe_start = bit; 4874 ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
4607 ext4_mb_store_history(ac); 4876 ext4_mb_store_history(ac);
4608 } 4877 }
4609 4878
4610 if (metadata) { 4879 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4611 /* blocks being freed are metadata. these blocks shouldn't 4880 if (err)
4612 * be used until this transaction is committed */ 4881 goto error_return;
4613 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4882 if (metadata && ext4_handle_valid(handle)) {
4883 struct ext4_free_data *new_entry;
4884 /*
4885 * blocks being freed are metadata. these blocks shouldn't
4886 * be used until this transaction is committed
4887 */
4888 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4889 new_entry->start_blk = bit;
4890 new_entry->group = block_group;
4891 new_entry->count = count;
4892 new_entry->t_tid = handle->h_transaction->t_tid;
4893 ext4_lock_group(sb, block_group);
4894 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4895 bit, count);
4896 ext4_mb_free_metadata(handle, &e4b, new_entry);
4897 ext4_unlock_group(sb, block_group);
4614 } else { 4898 } else {
4615 ext4_lock_group(sb, block_group); 4899 ext4_lock_group(sb, block_group);
4900 /* need to update group_info->bb_free and bitmap
4901 * with group lock held. generate_buddy looks at
4902 * them with the group lock held
4903 */
4904 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4905 bit, count);
4616 mb_free_blocks(inode, &e4b, bit, count); 4906 mb_free_blocks(inode, &e4b, bit, count);
4617 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4907 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4618 ext4_unlock_group(sb, block_group); 4908 ext4_unlock_group(sb, block_group);
4619 } 4909 }
4620 4910
4621 spin_lock(sb_bgl_lock(sbi, block_group)); 4911 spin_lock(sb_bgl_lock(sbi, block_group));
4622 le16_add_cpu(&gdp->bg_free_blocks_count, count); 4912 ret = ext4_free_blks_count(sb, gdp) + count;
4913 ext4_free_blks_set(sb, gdp, ret);
4623 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4914 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4624 spin_unlock(sb_bgl_lock(sbi, block_group)); 4915 spin_unlock(sb_bgl_lock(sbi, block_group));
4625 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4916 percpu_counter_add(&sbi->s_freeblocks_counter, count);
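ext4_free_blks_count()/ext4_free_blks_set() replace the direct le16 arithmetic on bg_free_blocks_count because 64-bit-capable group descriptors split the counter into low and high 16-bit halves. A sketch of the general shape of such accessors; the field and helper names are illustrative, not the exact ext4 ones:

#include <linux/types.h>
#include <asm/byteorder.h>

struct demo_desc {
	__le16 free_blocks_lo;
	__le16 free_blocks_hi;	/* only present on large descriptors */
};

static inline __u32 demo_free_blks_count(int has_hi, struct demo_desc *d)
{
	__u32 n = le16_to_cpu(d->free_blocks_lo);

	if (has_hi)
		n |= (__u32)le16_to_cpu(d->free_blocks_hi) << 16;
	return n;
}

static inline void demo_free_blks_set(int has_hi, struct demo_desc *d,
				      __u32 n)
{
	d->free_blocks_lo = cpu_to_le16(n & 0xffff);
	if (has_hi)
		d->free_blocks_hi = cpu_to_le16(n >> 16);
}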
@@ -4635,9 +4926,13 @@ do_more:
4635 4926
4636 *freed += count; 4927 *freed += count;
4637 4928
4929 /* We dirtied the bitmap block */
4930 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4931 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4932
4638 /* And the group descriptor block */ 4933 /* And the group descriptor block */
4639 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4934 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4640 ret = ext4_journal_dirty_metadata(handle, gd_bh); 4935 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4641 if (!err) 4936 if (!err)
4642 err = ret; 4937 err = ret;
4643 4938
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e5..10a2921baf14 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/marker.h> 22#include <linux/marker.h>
23#include <linux/mutex.h>
23#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
24#include "ext4.h" 25#include "ext4.h"
25#include "group.h" 26#include "group.h"
@@ -98,9 +99,6 @@
98 */ 99 */
99#define MB_DEFAULT_GROUP_PREALLOC 512 100#define MB_DEFAULT_GROUP_PREALLOC 512
100 101
101static struct kmem_cache *ext4_pspace_cachep;
102static struct kmem_cache *ext4_ac_cachep;
103static struct kmem_cache *ext4_free_ext_cachep;
104 102
105struct ext4_free_data { 103struct ext4_free_data {
106 /* this links the free block information from group_info */ 104 /* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
120 tid_t t_tid; 118 tid_t t_tid;
121}; 119};
122 120
123struct ext4_group_info {
124 unsigned long bb_state;
125 struct rb_root bb_free_root;
126 unsigned short bb_first_free;
127 unsigned short bb_free;
128 unsigned short bb_fragments;
129 struct list_head bb_prealloc_list;
130#ifdef DOUBLE_CHECK
131 void *bb_bitmap;
132#endif
133 unsigned short bb_counters[];
134};
135
136#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
137#define EXT4_GROUP_INFO_LOCKED_BIT 1
138
139#define EXT4_MB_GRP_NEED_INIT(grp) \
140 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
141
142
143struct ext4_prealloc_space { 121struct ext4_prealloc_space {
144 struct list_head pa_inode_list; 122 struct list_head pa_inode_list;
145 struct list_head pa_group_list; 123 struct list_head pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
217 __u8 ac_op; /* operation, for history only */ 195 __u8 ac_op; /* operation, for history only */
218 struct page *ac_bitmap_page; 196 struct page *ac_bitmap_page;
219 struct page *ac_buddy_page; 197 struct page *ac_buddy_page;
198 /*
199 * pointer to the held semaphore upon successful
200 * block allocation
201 */
202 struct rw_semaphore *alloc_semp;
220 struct ext4_prealloc_space *ac_pa; 203 struct ext4_prealloc_space *ac_pa;
221 struct ext4_locality_group *ac_lg; 204 struct ext4_locality_group *ac_lg;
222}; 205};
@@ -250,6 +233,7 @@ struct ext4_buddy {
250 struct super_block *bd_sb; 233 struct super_block *bd_sb;
251 __u16 bd_blkbits; 234 __u16 bd_blkbits;
252 ext4_group_t bd_group; 235 ext4_group_t bd_group;
236 struct rw_semaphore *alloc_semp;
253}; 237};
254#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 238#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
255#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 239#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
259{ 243{
260 return; 244 return;
261} 245}
262#else
263static void ext4_mb_store_history(struct ext4_allocation_context *ac);
264#endif 246#endif
265 247
266#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
267 249
268struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
269 251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
270static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
271 ext4_group_t group);
272static void ext4_mb_return_to_preallocation(struct inode *inode,
273 struct ext4_buddy *e4b, sector_t block,
274 int count);
275static void ext4_mb_put_pa(struct ext4_allocation_context *,
276 struct super_block *, struct ext4_prealloc_space *pa);
277static int ext4_mb_init_per_dev_proc(struct super_block *sb);
278static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
279static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
280
281
282static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
283{
284 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
285
286 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
287}
288
289static inline void ext4_unlock_group(struct super_block *sb,
290 ext4_group_t group)
291{
292 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
293
294 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
295}
296
297static inline int ext4_is_group_locked(struct super_block *sb,
298 ext4_group_t group)
299{
300 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
301
302 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
303 &(grinfo->bb_state));
304}
305
306static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
307 struct ext4_free_extent *fex) 252 struct ext4_free_extent *fex)
308{ 253{
309 ext4_fsblk_t block; 254 ext4_fsblk_t block;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ecd..734abca25e35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
59 /* 59 /*
60 * Make sure the credits we accumulated are not really high 60 * Make sure the credits we accumulated are not really high
61 */ 61 */
62 if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { 62 if (needed && ext4_handle_has_enough_credits(handle,
63 EXT4_RESERVE_TRANS_BLOCKS)) {
63 retval = ext4_journal_restart(handle, needed); 64 retval = ext4_journal_restart(handle, needed);
64 if (retval) 65 if (retval)
65 goto err_out; 66 goto err_out;
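ext4_handle_has_enough_credits() wraps the raw h_buffer_credits comparison so the test is also safe on the fake handles used by journal-less mounts. The usual pattern around it, sketched below; ext4_journal_extend()/ext4_journal_restart() are the real helpers, the wrapper function is illustrative:

/* sketch for a file under fs/ext4/, where this header lives */
#include "ext4_jbd2.h"

static int demo_ensure_credits(handle_t *handle, int thresh, int needed)
{
	if (ext4_handle_has_enough_credits(handle, thresh))
		return 0;
	/* try to extend the running handle in place ... */
	if (!ext4_journal_extend(handle, needed))
		return 0;
	/* ... else commit what we have and start a fresh handle */
	return ext4_journal_restart(handle, needed);
}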
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
229{ 230{
230 int retval = 0, needed; 231 int retval = 0, needed;
231 232
232 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 233 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
233 return 0; 234 return 0;
234 /* 235 /*
235 * We are freeing blocks. During this we touch 236 * We are freeing blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
458 struct list_blocks_struct lb; 459 struct list_blocks_struct lb;
459 unsigned long max_entries; 460 unsigned long max_entries;
460 461
461 if (!test_opt(inode->i_sb, EXTENTS)) 462 /*
462 /* 463 * If the filesystem does not support extents, or the inode
463 * if mounted with noextents we don't allow the migrate 464 * already is extent-based, error out.
464 */ 465 */
465 return -EINVAL; 466 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
466 467 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
467 if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 468 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
468 return -EINVAL; 469 return -EINVAL;
469 470
470 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 471 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 63adcb792988..fec0b4c2f5f1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
@@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
372 goto fail; 368 goto fail;
373 } 369 }
374 hinfo->hash_version = root->info.hash_version; 370 hinfo->hash_version = root->info.hash_version;
371 if (hinfo->hash_version <= DX_HASH_TEA)
372 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 373 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
376 if (d_name) 374 if (d_name)
377 ext4fs_dirhash(d_name->name, d_name->len, hinfo); 375 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
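The s_hash_unsigned adjustment exists because the legacy dirhash folded filename bytes through plain char, whose signedness differs between architectures (signed on x86, unsigned on ARM and PowerPC), so the same directory could hash differently across machines. A toy, runnable illustration of the divergence; the fold below is deliberately simplified and is not the real TEA/half-MD4 mix:

#include <stdio.h>

static unsigned int fold(const char *name, int len, int treat_signed)
{
	unsigned int h = 0x12a3fe2d;
	int i, c;

	for (i = 0; i < len; i++) {
		c = treat_signed ? (signed char)name[i]
				 : (unsigned char)name[i];
		h = h * 7 + c;	/* sign of c changes h for bytes >= 0x80 */
	}
	return h;
}

int main(void)
{
	const char name[] = "caf\xe9";	/* 0xe9 is -23 signed, 233 unsigned */

	printf("signed char fold:   %#010x\n", fold(name, 4, 1));
	printf("unsigned char fold: %#010x\n", fold(name, 4, 0));
	return 0;
}

A directory created with one variant must keep using it, which is why the stored hash_version is bumped per filesystem rather than the algorithm being changed silently.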
@@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
641 dir = dir_file->f_path.dentry->d_inode; 639 dir = dir_file->f_path.dentry->d_inode;
642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 640 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 641 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
642 if (hinfo.hash_version <= DX_HASH_TEA)
643 hinfo.hash_version +=
644 EXT4_SB(dir->i_sb)->s_hash_unsigned;
644 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 645 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
645 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 646 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
646 start_hash, start_minor_hash); 647 start_hash, start_minor_hash);
@@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
806static inline int search_dirblock(struct buffer_head *bh, 807static inline int search_dirblock(struct buffer_head *bh,
807 struct inode *dir, 808 struct inode *dir,
808 const struct qstr *d_name, 809 const struct qstr *d_name,
809 unsigned long offset, 810 unsigned int offset,
810 struct ext4_dir_entry_2 ** res_dir) 811 struct ext4_dir_entry_2 ** res_dir)
811{ 812{
812 struct ext4_dir_entry_2 * de; 813 struct ext4_dir_entry_2 * de;
@@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1043 bh = ext4_find_entry(dir, &dentry->d_name, &de); 1044 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1044 inode = NULL; 1045 inode = NULL;
1045 if (bh) { 1046 if (bh) {
1046 unsigned long ino = le32_to_cpu(de->inode); 1047 __u32 ino = le32_to_cpu(de->inode);
1047 brelse(bh); 1048 brelse(bh);
1048 if (!ext4_valid_inum(dir->i_sb, ino)) { 1049 if (!ext4_valid_inum(dir->i_sb, ino)) {
1049 ext4_error(dir->i_sb, "ext4_lookup", 1050 ext4_error(dir->i_sb, "ext4_lookup",
1050 "bad inode number: %lu", ino); 1051 "bad inode number: %u", ino);
1051 return ERR_PTR(-EIO); 1052 return ERR_PTR(-EIO);
1052 } 1053 }
1053 inode = ext4_iget(dir->i_sb, ino); 1054 inode = ext4_iget(dir->i_sb, ino);
@@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1060 1061
1061struct dentry *ext4_get_parent(struct dentry *child) 1062struct dentry *ext4_get_parent(struct dentry *child)
1062{ 1063{
1063 unsigned long ino; 1064 __u32 ino;
1064 struct inode *inode; 1065 struct inode *inode;
1065 static const struct qstr dotdot = { 1066 static const struct qstr dotdot = {
1066 .name = "..", 1067 .name = "..",
@@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1078 1079
1079 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1080 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1080 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1081 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1081 "bad inode number: %lu", ino); 1082 "bad inode number: %u", ino);
1082 return ERR_PTR(-EIO); 1083 return ERR_PTR(-EIO);
1083 } 1084 }
1084 1085
@@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1166 u32 hash2; 1167 u32 hash2;
1167 struct dx_map_entry *map; 1168 struct dx_map_entry *map;
1168 char *data1 = (*bh)->b_data, *data2; 1169 char *data1 = (*bh)->b_data, *data2;
1169 unsigned split, move, size, i; 1170 unsigned split, move, size;
1170 struct ext4_dir_entry_2 *de = NULL, *de2; 1171 struct ext4_dir_entry_2 *de = NULL, *de2;
1171 int err = 0; 1172 int err = 0, i;
1172 1173
1173 bh2 = ext4_append (handle, dir, &newblock, &err); 1174 bh2 = ext4_append (handle, dir, &newblock, &err);
1174 if (!(bh2)) { 1175 if (!(bh2)) {
@@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1228 de = de2; 1229 de = de2;
1229 } 1230 }
1230 dx_insert_block(frame, hash2 + continued, newblock); 1231 dx_insert_block(frame, hash2 + continued, newblock);
1231 err = ext4_journal_dirty_metadata(handle, bh2); 1232 err = ext4_handle_dirty_metadata(handle, dir, bh2);
1232 if (err) 1233 if (err)
1233 goto journal_error; 1234 goto journal_error;
1234 err = ext4_journal_dirty_metadata(handle, frame->bh); 1235 err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
1235 if (err) 1236 if (err)
1236 goto journal_error; 1237 goto journal_error;
1237 brelse(bh2); 1238 brelse(bh2);
@@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1266 struct inode *dir = dentry->d_parent->d_inode; 1267 struct inode *dir = dentry->d_parent->d_inode;
1267 const char *name = dentry->d_name.name; 1268 const char *name = dentry->d_name.name;
1268 int namelen = dentry->d_name.len; 1269 int namelen = dentry->d_name.len;
1269 unsigned long offset = 0; 1270 unsigned int offset = 0;
1270 unsigned short reclen; 1271 unsigned short reclen;
1271 int nlen, rlen, err; 1272 int nlen, rlen, err;
1272 char *top; 1273 char *top;
@@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1335 ext4_update_dx_flag(dir); 1336 ext4_update_dx_flag(dir);
1336 dir->i_version++; 1337 dir->i_version++;
1337 ext4_mark_inode_dirty(handle, dir); 1338 ext4_mark_inode_dirty(handle, dir);
1338 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1339 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1339 err = ext4_journal_dirty_metadata(handle, bh); 1340 err = ext4_handle_dirty_metadata(handle, dir, bh);
1340 if (err) 1341 if (err)
1341 ext4_std_error(dir->i_sb, err); 1342 ext4_std_error(dir->i_sb, err);
1342 brelse(bh); 1343 brelse(bh);
@@ -1408,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1408 1409
1409 /* Initialize as for dx_probe */ 1410 /* Initialize as for dx_probe */
1410 hinfo.hash_version = root->info.hash_version; 1411 hinfo.hash_version = root->info.hash_version;
1412 if (hinfo.hash_version <= DX_HASH_TEA)
1413 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1411 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 1414 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1412 ext4fs_dirhash(name, namelen, &hinfo); 1415 ext4fs_dirhash(name, namelen, &hinfo);
1413 frame = frames; 1416 frame = frames;
@@ -1437,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1437 struct inode *inode) 1440 struct inode *inode)
1438{ 1441{
1439 struct inode *dir = dentry->d_parent->d_inode; 1442 struct inode *dir = dentry->d_parent->d_inode;
1440 unsigned long offset;
1441 struct buffer_head *bh; 1443 struct buffer_head *bh;
1442 struct ext4_dir_entry_2 *de; 1444 struct ext4_dir_entry_2 *de;
1443 struct super_block *sb; 1445 struct super_block *sb;
@@ -1459,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1459 ext4_mark_inode_dirty(handle, dir); 1461 ext4_mark_inode_dirty(handle, dir);
1460 } 1462 }
1461 blocks = dir->i_size >> sb->s_blocksize_bits; 1463 blocks = dir->i_size >> sb->s_blocksize_bits;
1462 for (block = 0, offset = 0; block < blocks; block++) { 1464 for (block = 0; block < blocks; block++) {
1463 bh = ext4_bread(handle, dir, block, 0, &retval); 1465 bh = ext4_bread(handle, dir, block, 0, &retval);
1464 if(!bh) 1466 if(!bh)
1465 return retval; 1467 return retval;
@@ -1574,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1574 dxtrace(dx_show_index("node", frames[1].entries)); 1576 dxtrace(dx_show_index("node", frames[1].entries));
1575 dxtrace(dx_show_index("node", 1577 dxtrace(dx_show_index("node",
1576 ((struct dx_node *) bh2->b_data)->entries)); 1578 ((struct dx_node *) bh2->b_data)->entries));
1577 err = ext4_journal_dirty_metadata(handle, bh2); 1579 err = ext4_handle_dirty_metadata(handle, inode, bh2);
1578 if (err) 1580 if (err)
1579 goto journal_error; 1581 goto journal_error;
1580 brelse (bh2); 1582 brelse (bh2);
@@ -1600,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1600 if (err) 1602 if (err)
1601 goto journal_error; 1603 goto journal_error;
1602 } 1604 }
1603 ext4_journal_dirty_metadata(handle, frames[0].bh); 1605 ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1604 } 1606 }
1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1607 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1606 if (!de) 1608 if (!de)
@@ -1646,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle,
1646 else 1648 else
1647 de->inode = 0; 1649 de->inode = 0;
1648 dir->i_version++; 1650 dir->i_version++;
1649 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1651 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1650 ext4_journal_dirty_metadata(handle, bh); 1652 ext4_handle_dirty_metadata(handle, dir, bh);
1651 return 0; 1653 return 0;
1652 } 1654 }
1653 i += ext4_rec_len_from_disk(de->rec_len); 1655 i += ext4_rec_len_from_disk(de->rec_len);
@@ -1693,9 +1695,11 @@ static int ext4_add_nondir(handle_t *handle,
1693 if (!err) { 1695 if (!err) {
1694 ext4_mark_inode_dirty(handle, inode); 1696 ext4_mark_inode_dirty(handle, inode);
1695 d_instantiate(dentry, inode); 1697 d_instantiate(dentry, inode);
1698 unlock_new_inode(inode);
1696 return 0; 1699 return 0;
1697 } 1700 }
1698 drop_nlink(inode); 1701 drop_nlink(inode);
1702 unlock_new_inode(inode);
1699 iput(inode); 1703 iput(inode);
1700 return err; 1704 return err;
1701} 1705}
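The unlock_new_inode() calls pair with inodes coming back from ext4_new_inode() in the locked I_NEW state; every exit path has to unlock exactly once, success or failure. A condensed sketch of the contract mirrored by ext4_add_nondir() above (the helper name is illustrative):

#include <linux/fs.h>
#include <linux/dcache.h>

static int demo_finish_new_inode(struct dentry *dentry,
				 struct inode *inode, int err)
{
	if (!err) {
		d_instantiate(dentry, inode);
		unlock_new_inode(inode);	/* success: visible and unlocked */
		return 0;
	}
	drop_nlink(inode);
	unlock_new_inode(inode);		/* error path still unlocks once */
	iput(inode);
	return err;
}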
@@ -1723,7 +1727,7 @@ retry:
1723 return PTR_ERR(handle); 1727 return PTR_ERR(handle);
1724 1728
1725 if (IS_DIRSYNC(dir)) 1729 if (IS_DIRSYNC(dir))
1726 handle->h_sync = 1; 1730 ext4_handle_sync(handle);
1727 1731
1728 inode = ext4_new_inode (handle, dir, mode); 1732 inode = ext4_new_inode (handle, dir, mode);
1729 err = PTR_ERR(inode); 1733 err = PTR_ERR(inode);
@@ -1757,7 +1761,7 @@ retry:
1757 return PTR_ERR(handle); 1761 return PTR_ERR(handle);
1758 1762
1759 if (IS_DIRSYNC(dir)) 1763 if (IS_DIRSYNC(dir))
1760 handle->h_sync = 1; 1764 ext4_handle_sync(handle);
1761 1765
1762 inode = ext4_new_inode(handle, dir, mode); 1766 inode = ext4_new_inode(handle, dir, mode);
1763 err = PTR_ERR(inode); 1767 err = PTR_ERR(inode);
@@ -1793,7 +1797,7 @@ retry:
1793 return PTR_ERR(handle); 1797 return PTR_ERR(handle);
1794 1798
1795 if (IS_DIRSYNC(dir)) 1799 if (IS_DIRSYNC(dir))
1796 handle->h_sync = 1; 1800 ext4_handle_sync(handle);
1797 1801
1798 inode = ext4_new_inode(handle, dir, S_IFDIR | mode); 1802 inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
1799 err = PTR_ERR(inode); 1803 err = PTR_ERR(inode);
@@ -1822,14 +1826,15 @@ retry:
1822 strcpy(de->name, ".."); 1826 strcpy(de->name, "..");
1823 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1827 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1824 inode->i_nlink = 2; 1828 inode->i_nlink = 2;
1825 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); 1829 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1826 ext4_journal_dirty_metadata(handle, dir_block); 1830 ext4_handle_dirty_metadata(handle, dir, dir_block);
1827 brelse(dir_block); 1831 brelse(dir_block);
1828 ext4_mark_inode_dirty(handle, inode); 1832 ext4_mark_inode_dirty(handle, inode);
1829 err = ext4_add_entry(handle, dentry, inode); 1833 err = ext4_add_entry(handle, dentry, inode);
1830 if (err) { 1834 if (err) {
1831out_clear_inode: 1835out_clear_inode:
1832 clear_nlink(inode); 1836 clear_nlink(inode);
1837 unlock_new_inode(inode);
1833 ext4_mark_inode_dirty(handle, inode); 1838 ext4_mark_inode_dirty(handle, inode);
1834 iput(inode); 1839 iput(inode);
1835 goto out_stop; 1840 goto out_stop;
@@ -1838,6 +1843,7 @@ out_clear_inode:
1838 ext4_update_dx_flag(dir); 1843 ext4_update_dx_flag(dir);
1839 ext4_mark_inode_dirty(handle, dir); 1844 ext4_mark_inode_dirty(handle, dir);
1840 d_instantiate(dentry, inode); 1845 d_instantiate(dentry, inode);
1846 unlock_new_inode(inode);
1841out_stop: 1847out_stop:
1842 ext4_journal_stop(handle); 1848 ext4_journal_stop(handle);
1843 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1849 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
@@ -1850,7 +1856,7 @@ out_stop:
1850 */ 1856 */
1851static int empty_dir(struct inode *inode) 1857static int empty_dir(struct inode *inode)
1852{ 1858{
1853 unsigned long offset; 1859 unsigned int offset;
1854 struct buffer_head *bh; 1860 struct buffer_head *bh;
1855 struct ext4_dir_entry_2 *de, *de1; 1861 struct ext4_dir_entry_2 *de, *de1;
1856 struct super_block *sb; 1862 struct super_block *sb;
@@ -1895,7 +1901,7 @@ static int empty_dir(struct inode *inode)
1895 if (err) 1901 if (err)
1896 ext4_error(sb, __func__, 1902 ext4_error(sb, __func__,
1897 "error %d reading directory" 1903 "error %d reading directory"
1898 " #%lu offset %lu", 1904 " #%lu offset %u",
1899 err, inode->i_ino, offset); 1905 err, inode->i_ino, offset);
1900 offset += sb->s_blocksize; 1906 offset += sb->s_blocksize;
1901 continue; 1907 continue;
@@ -1933,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1933 struct ext4_iloc iloc; 1939 struct ext4_iloc iloc;
1934 int err = 0, rc; 1940 int err = 0, rc;
1935 1941
1942 if (!ext4_handle_valid(handle))
1943 return 0;
1944
1936 lock_super(sb); 1945 lock_super(sb);
1937 if (!list_empty(&EXT4_I(inode)->i_orphan)) 1946 if (!list_empty(&EXT4_I(inode)->i_orphan))
1938 goto out_unlock; 1947 goto out_unlock;
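ext4_handle_valid() distinguishes a real jbd2 handle from the magic cookie handed out on journal-less ext4 mounts; the orphan list is journal bookkeeping, so it is skipped entirely without a journal. Roughly, as a sketch (the cookie value follows what ext4 uses, but treat the names as illustrative):

#include <linux/jbd2.h>

/* never dereferenced; just a recognizable non-NULL token */
#define DEMO_NOJOURNAL_HANDLE	((handle_t *)0x1)

static inline int demo_handle_valid(handle_t *handle)
{
	return handle != DEMO_NOJOURNAL_HANDLE;
}

/* callers bail out early, as ext4_orphan_add() does above */
static int demo_orphan_add(handle_t *handle)
{
	if (!demo_handle_valid(handle))
		return 0;
	/* ... journalled on-disk orphan-list update ... */
	return 0;
}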
@@ -1961,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1961 /* Insert this inode at the head of the on-disk orphan list... */ 1970 /* Insert this inode at the head of the on-disk orphan list... */
1962 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 1971 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1963 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 1972 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1964 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1973 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
1965 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 1974 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1966 if (!err) 1975 if (!err)
1967 err = rc; 1976 err = rc;
@@ -1995,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
1995 struct list_head *prev; 2004 struct list_head *prev;
1996 struct ext4_inode_info *ei = EXT4_I(inode); 2005 struct ext4_inode_info *ei = EXT4_I(inode);
1997 struct ext4_sb_info *sbi; 2006 struct ext4_sb_info *sbi;
1998 unsigned long ino_next; 2007 __u32 ino_next;
1999 struct ext4_iloc iloc; 2008 struct ext4_iloc iloc;
2000 int err = 0; 2009 int err = 0;
2001 2010
2011 if (!ext4_handle_valid(handle))
2012 return 0;
2013
2002 lock_super(inode->i_sb); 2014 lock_super(inode->i_sb);
2003 if (list_empty(&ei->i_orphan)) { 2015 if (list_empty(&ei->i_orphan)) {
2004 unlock_super(inode->i_sb); 2016 unlock_super(inode->i_sb);
@@ -2017,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2017 * transaction handle with which to update the orphan list on 2029 * transaction handle with which to update the orphan list on
2018 * disk, but we still need to remove the inode from the linked 2030 * disk, but we still need to remove the inode from the linked
2019 * list in memory. */ 2031 * list in memory. */
2020 if (!handle) 2032 if (sbi->s_journal && !handle)
2021 goto out; 2033 goto out;
2022 2034
2023 err = ext4_reserve_inode_write(handle, inode, &iloc); 2035 err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2025,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2025 goto out_err; 2037 goto out_err;
2026 2038
2027 if (prev == &sbi->s_orphan) { 2039 if (prev == &sbi->s_orphan) {
2028 jbd_debug(4, "superblock will point to %lu\n", ino_next); 2040 jbd_debug(4, "superblock will point to %u\n", ino_next);
2029 BUFFER_TRACE(sbi->s_sbh, "get_write_access"); 2041 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2030 err = ext4_journal_get_write_access(handle, sbi->s_sbh); 2042 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
2031 if (err) 2043 if (err)
2032 goto out_brelse; 2044 goto out_brelse;
2033 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2045 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2034 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); 2046 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
2035 } else { 2047 } else {
2036 struct ext4_iloc iloc2; 2048 struct ext4_iloc iloc2;
2037 struct inode *i_prev = 2049 struct inode *i_prev =
2038 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; 2050 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
2039 2051
2040 jbd_debug(4, "orphan inode %lu will point to %lu\n", 2052 jbd_debug(4, "orphan inode %lu will point to %u\n",
2041 i_prev->i_ino, ino_next); 2053 i_prev->i_ino, ino_next);
2042 err = ext4_reserve_inode_write(handle, i_prev, &iloc2); 2054 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
2043 if (err) 2055 if (err)
@@ -2082,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2082 goto end_rmdir; 2094 goto end_rmdir;
2083 2095
2084 if (IS_DIRSYNC(dir)) 2096 if (IS_DIRSYNC(dir))
2085 handle->h_sync = 1; 2097 ext4_handle_sync(handle);
2086 2098
2087 inode = dentry->d_inode; 2099 inode = dentry->d_inode;
2088 2100
@@ -2136,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2136 return PTR_ERR(handle); 2148 return PTR_ERR(handle);
2137 2149
2138 if (IS_DIRSYNC(dir)) 2150 if (IS_DIRSYNC(dir))
2139 handle->h_sync = 1; 2151 ext4_handle_sync(handle);
2140 2152
2141 retval = -ENOENT; 2153 retval = -ENOENT;
2142 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2154 bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2193,7 +2205,7 @@ retry:
2193 return PTR_ERR(handle); 2205 return PTR_ERR(handle);
2194 2206
2195 if (IS_DIRSYNC(dir)) 2207 if (IS_DIRSYNC(dir))
2196 handle->h_sync = 1; 2208 ext4_handle_sync(handle);
2197 2209
2198 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); 2210 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
2199 err = PTR_ERR(inode); 2211 err = PTR_ERR(inode);
@@ -2208,10 +2220,10 @@ retry:
2208 * We have a transaction open. All is sweetness. It also sets 2220 * We have a transaction open. All is sweetness. It also sets
2209 * i_size in generic_commit_write(). 2221 * i_size in generic_commit_write().
2210 */ 2222 */
2211 err = __page_symlink(inode, symname, l, 2223 err = __page_symlink(inode, symname, l, 1);
2212 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2213 if (err) { 2224 if (err) {
2214 clear_nlink(inode); 2225 clear_nlink(inode);
2226 unlock_new_inode(inode);
2215 ext4_mark_inode_dirty(handle, inode); 2227 ext4_mark_inode_dirty(handle, inode);
2216 iput(inode); 2228 iput(inode);
2217 goto out_stop; 2229 goto out_stop;
@@ -2256,13 +2268,20 @@ retry:
2256 return PTR_ERR(handle); 2268 return PTR_ERR(handle);
2257 2269
2258 if (IS_DIRSYNC(dir)) 2270 if (IS_DIRSYNC(dir))
2259 handle->h_sync = 1; 2271 ext4_handle_sync(handle);
2260 2272
2261 inode->i_ctime = ext4_current_time(inode); 2273 inode->i_ctime = ext4_current_time(inode);
2262 ext4_inc_count(handle, inode); 2274 ext4_inc_count(handle, inode);
2263 atomic_inc(&inode->i_count); 2275 atomic_inc(&inode->i_count);
2264 2276
2265 err = ext4_add_nondir(handle, dentry, inode); 2277 err = ext4_add_entry(handle, dentry, inode);
2278 if (!err) {
2279 ext4_mark_inode_dirty(handle, inode);
2280 d_instantiate(dentry, inode);
2281 } else {
2282 drop_nlink(inode);
2283 iput(inode);
2284 }
2266 ext4_journal_stop(handle); 2285 ext4_journal_stop(handle);
2267 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2286 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2268 goto retry; 2287 goto retry;
@@ -2298,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2298 return PTR_ERR(handle); 2317 return PTR_ERR(handle);
2299 2318
2300 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 2319 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2301 handle->h_sync = 1; 2320 ext4_handle_sync(handle);
2302 2321
2303 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 2322 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2304 /* 2323 /*
@@ -2352,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2352 new_dir->i_ctime = new_dir->i_mtime = 2371 new_dir->i_ctime = new_dir->i_mtime =
2353 ext4_current_time(new_dir); 2372 ext4_current_time(new_dir);
2354 ext4_mark_inode_dirty(handle, new_dir); 2373 ext4_mark_inode_dirty(handle, new_dir);
2355 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); 2374 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2356 ext4_journal_dirty_metadata(handle, new_bh); 2375 ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2357 brelse(new_bh); 2376 brelse(new_bh);
2358 new_bh = NULL; 2377 new_bh = NULL;
2359 } 2378 }
@@ -2403,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2403 BUFFER_TRACE(dir_bh, "get_write_access"); 2422 BUFFER_TRACE(dir_bh, "get_write_access");
2404 ext4_journal_get_write_access(handle, dir_bh); 2423 ext4_journal_get_write_access(handle, dir_bh);
2405 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2424 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2406 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); 2425 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2407 ext4_journal_dirty_metadata(handle, dir_bh); 2426 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2408 ext4_dec_count(handle, old_dir); 2427 ext4_dec_count(handle, old_dir);
2409 if (new_inode) { 2428 if (new_inode) {
2410 /* checked empty_dir above, can't have another parent, 2429 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a015..c328be5d6885 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
50 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 50 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
51 if (group != sbi->s_groups_count) 51 if (group != sbi->s_groups_count)
52 ext4_warning(sb, __func__, 52 ext4_warning(sb, __func__,
53 "Cannot add at group %u (only %lu groups)", 53 "Cannot add at group %u (only %u groups)",
54 input->group, sbi->s_groups_count); 54 input->group, sbi->s_groups_count);
55 else if (offset != 0) 55 else if (offset != 0)
56 ext4_warning(sb, __func__, "Last group not full"); 56 ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
149{ 149{
150 int err; 150 int err;
151 151
152 if (handle->h_buffer_credits >= thresh) 152 if (ext4_handle_has_enough_credits(handle, thresh))
153 return 0; 153 return 0;
154 154
155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); 155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
233 set_buffer_uptodate(gdb); 233 set_buffer_uptodate(gdb);
234 unlock_buffer(gdb); 234 unlock_buffer(gdb);
235 ext4_journal_dirty_metadata(handle, gdb); 235 ext4_handle_dirty_metadata(handle, NULL, gdb);
236 ext4_set_bit(bit, bh->b_data); 236 ext4_set_bit(bit, bh->b_data);
237 brelse(gdb); 237 brelse(gdb);
238 } 238 }
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
251 err = PTR_ERR(bh); 251 err = PTR_ERR(bh);
252 goto exit_bh; 252 goto exit_bh;
253 } 253 }
254 ext4_journal_dirty_metadata(handle, gdb); 254 ext4_handle_dirty_metadata(handle, NULL, gdb);
255 ext4_set_bit(bit, bh->b_data); 255 ext4_set_bit(bit, bh->b_data);
256 brelse(gdb); 256 brelse(gdb);
257 } 257 }
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
276 err = PTR_ERR(it); 276 err = PTR_ERR(it);
277 goto exit_bh; 277 goto exit_bh;
278 } 278 }
279 ext4_journal_dirty_metadata(handle, it); 279 ext4_handle_dirty_metadata(handle, NULL, it);
280 brelse(it); 280 brelse(it);
281 ext4_set_bit(bit, bh->b_data); 281 ext4_set_bit(bit, bh->b_data);
282 } 282 }
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
284 if ((err = extend_or_restart_transaction(handle, 2, bh))) 284 if ((err = extend_or_restart_transaction(handle, 2, bh)))
285 goto exit_bh; 285 goto exit_bh;
286 286
287 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), 287 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
288 bh->b_data); 288 ext4_handle_dirty_metadata(handle, NULL, bh);
289 ext4_journal_dirty_metadata(handle, bh);
290 brelse(bh); 289 brelse(bh);
291
292 /* Mark unused entries in inode bitmap used */ 290 /* Mark unused entries in inode bitmap used */
293 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 291 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
294 input->inode_bitmap, input->inode_bitmap - start); 292 input->inode_bitmap, input->inode_bitmap - start);
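Both mark_bitmap_end() call sites now pad out to sb->s_blocksize * 8 bits (the full bitmap block) rather than EXT4_BLOCKS_PER_GROUP(sb), so the tail is marked in-use even when the group holds fewer items than the block has bits. The padding itself just sets every trailing bit; roughly (ext4's real helper also fills whole bytes with memset):

static void demo_mark_bitmap_end(int count, int nbits, unsigned char *bitmap)
{
	int i;

	/* everything past `count` is not a real block/inode: mark it used */
	for (i = count; i < nbits; i++)
		bitmap[i >> 3] |= 1 << (i & 7);
}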
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
297 goto exit_journal; 295 goto exit_journal;
298 } 296 }
299 297
300 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 298 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
301 bh->b_data); 299 bh->b_data);
302 ext4_journal_dirty_metadata(handle, bh); 300 ext4_handle_dirty_metadata(handle, NULL, bh);
303exit_bh: 301exit_bh:
304 brelse(bh); 302 brelse(bh);
305 303
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
486 * reserved inode, and will become GDT blocks (primary and backup). 484 * reserved inode, and will become GDT blocks (primary and backup).
487 */ 485 */
488 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 486 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
489 ext4_journal_dirty_metadata(handle, dind); 487 ext4_handle_dirty_metadata(handle, NULL, dind);
490 brelse(dind); 488 brelse(dind);
491 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 489 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
492 ext4_mark_iloc_dirty(handle, inode, &iloc); 490 ext4_mark_iloc_dirty(handle, inode, &iloc);
493 memset((*primary)->b_data, 0, sb->s_blocksize); 491 memset((*primary)->b_data, 0, sb->s_blocksize);
494 ext4_journal_dirty_metadata(handle, *primary); 492 ext4_handle_dirty_metadata(handle, NULL, *primary);
495 493
496 o_group_desc = EXT4_SB(sb)->s_group_desc; 494 o_group_desc = EXT4_SB(sb)->s_group_desc;
497 memcpy(n_group_desc, o_group_desc, 495 memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
502 kfree(o_group_desc); 500 kfree(o_group_desc);
503 501
504 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 502 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
505 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 503 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
506 504
507 return 0; 505 return 0;
508 506
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
618 primary[i]->b_blocknr, gdbackups, 616 primary[i]->b_blocknr, gdbackups,
619 blk + primary[i]->b_blocknr); */ 617 blk + primary[i]->b_blocknr); */
620 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); 618 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
621 err2 = ext4_journal_dirty_metadata(handle, primary[i]); 619 err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
622 if (!err) 620 if (!err)
623 err = err2; 621 err = err2;
624 } 622 }
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
676 struct buffer_head *bh; 674 struct buffer_head *bh;
677 675
678 /* Out of journal space, and can't get more - abort - so sad */ 676 /* Out of journal space, and can't get more - abort - so sad */
679 if (handle->h_buffer_credits == 0 && 677 if (ext4_handle_valid(handle) &&
678 handle->h_buffer_credits == 0 &&
680 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && 679 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
681 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 680 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
682 break; 681 break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
696 memset(bh->b_data + size, 0, rest); 695 memset(bh->b_data + size, 0, rest);
697 set_buffer_uptodate(bh); 696 set_buffer_uptodate(bh);
698 unlock_buffer(bh); 697 unlock_buffer(bh);
699 ext4_journal_dirty_metadata(handle, bh); 698 ext4_handle_dirty_metadata(handle, NULL, bh);
700 brelse(bh); 699 brelse(bh);
701 } 700 }
702 if ((err2 = ext4_journal_stop(handle)) && !err) 701 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
715exit_err: 714exit_err:
716 if (err) { 715 if (err) {
717 ext4_warning(sb, __func__, 716 ext4_warning(sb, __func__,
718 "can't update backup for group %lu (err %d), " 717 "can't update backup for group %u (err %d), "
719 "forcing fsck on next reboot", group, err); 718 "forcing fsck on next reboot", group, err);
720 sbi->s_mount_state &= ~EXT4_VALID_FS; 719 sbi->s_mount_state &= ~EXT4_VALID_FS;
721 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
747 struct inode *inode = NULL; 746 struct inode *inode = NULL;
748 handle_t *handle; 747 handle_t *handle;
749 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 750 int err, err2;
751 751
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
761 761
762 if (ext4_blocks_count(es) + input->blocks_count < 762 if (ext4_blocks_count(es) + input->blocks_count <
763 ext4_blocks_count(es)) { 763 ext4_blocks_count(es)) {
764 ext4_warning(sb, __func__, "blocks_count overflow\n"); 764 ext4_warning(sb, __func__, "blocks_count overflow");
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
769 le32_to_cpu(es->s_inodes_count)) { 769 le32_to_cpu(es->s_inodes_count)) {
770 ext4_warning(sb, __func__, "inodes_count overflow\n"); 770 ext4_warning(sb, __func__, "inodes_count overflow");
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 773
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
787 } 787 }
788 } 788 }
789 789
790
790 if ((err = verify_group_input(sb, input))) 791 if ((err = verify_group_input(sb, input)))
791 goto exit_put; 792 goto exit_put;
792 793
@@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
855 * using the new disk blocks. 856 * using the new disk blocks.
856 */ 857 */
857 858
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
858 /* Update group descriptor block for new group */ 860 /* Update group descriptor block for new group */
859 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 861 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
860 gdb_off * EXT4_DESC_SIZE(sb)); 862 gdb_off * EXT4_DESC_SIZE(sb));
@@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
862 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 864 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
863 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 865 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
864 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 866 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
865 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); 867 ext4_free_blks_set(sb, gdp, input->free_blocks_count);
866 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); 868 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
869 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
867 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 870 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
868 871
869 /* 872 /*
870 * We can allocate memory for mb_alloc based on the new group 873 * We can allocate memory for mb_alloc based on the new group
871 * descriptor 874 * descriptor
872 */ 875 */
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); 876 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
874 if (err) 877 if (err) {
878 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
875 goto exit_journal; 879 goto exit_journal;
880 }
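The get/put pair brackets the window in which the new group's descriptor and its mballoc state disagree, so buddy-cache pages for the affected groups cannot be loaded half-initialized; every exit from that window, including this error path, must release exactly the locks it took. Schematically (names as in the patch):

	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
	/* ... set up gdp, register the group with mballoc ... */
	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);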
876 881
877 /* 882 /*
878 * Make the new blocks and inodes valid next. We do this before 883 * Make the new blocks and inodes valid next. We do this before
@@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
914 919
915 /* Update the global fs size fields */ 920 /* Update the global fs size fields */
916 sbi->s_groups_count++; 921 sbi->s_groups_count++;
922 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
917 923
918 ext4_journal_dirty_metadata(handle, primary); 924 ext4_handle_dirty_metadata(handle, NULL, primary);
919 925
920 /* Update the reserved block counts only once the new group is 926 /* Update the reserved block counts only once the new group is
921 * active. */ 927 * active. */
@@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 EXT4_INODES_PER_GROUP(sb); 943 EXT4_INODES_PER_GROUP(sb);
938 } 944 }
939 945
940 ext4_journal_dirty_metadata(handle, sbi->s_sbh); 946 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
941 sb->s_dirt = 1; 947 sb->s_dirt = 1;
942 948
943exit_journal: 949exit_journal:
@@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
975 struct buffer_head *bh; 981 struct buffer_head *bh;
976 handle_t *handle; 982 handle_t *handle;
977 int err; 983 int err;
978 unsigned long freed_blocks;
979 ext4_group_t group; 984 ext4_group_t group;
980 struct ext4_group_info *grp;
981 985
982 /* We don't need to worry about locking wrt other resizers just 986 /* We don't need to worry about locking wrt other resizers just
983 * yet: we're going to revalidate es->s_blocks_count after 987 * yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 1001 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 1002 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 1003 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, 1004 ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
1001 "CONFIG_LBD not enabled\n");
1002 return -EINVAL; 1005 return -EINVAL;
1003 } 1006 }
1004 1007
@@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1071 goto exit_put; 1074 goto exit_put;
1072 } 1075 }
1073 ext4_blocks_count_set(es, o_blocks_count + add); 1076 ext4_blocks_count_set(es, o_blocks_count + add);
1074 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1077 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1075 sb->s_dirt = 1; 1078 sb->s_dirt = 1;
1076 unlock_super(sb); 1079 unlock_super(sb);
1077 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1080 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1078 o_blocks_count + add); 1081 o_blocks_count + add);
1079 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1082 /* We add the blocks to the bitmap and set the group need init bit */
1083 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1080 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1084 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1081 o_blocks_count + add); 1085 o_blocks_count + add);
1082 if ((err = ext4_journal_stop(handle))) 1086 if ((err = ext4_journal_stop(handle)))
1083 goto exit_put; 1087 goto exit_put;
1084 1088
1085 /*
1086 * Mark mballoc pages as not up to date so that they will be updated
1087 * next time they are loaded by ext4_mb_load_buddy.
1088 *
1089 * XXX Bad, Bad, BAD!!! We should not be overloading the
1086 * Uptodate flag, particularly on the bitmap bh, as a way of
1091 * hinting to ext4_mb_load_buddy() that it needs to be
1092 * reloaded. A user could take an LVM snapshot, then do an
1093 * on-line fsck, and clear the uptodate flag, and this would
1094 * not be a bug in userspace, but a bug in the kernel. FIXME!!!
1095 */
1096 {
1097 struct ext4_sb_info *sbi = EXT4_SB(sb);
1098 struct inode *inode = sbi->s_buddy_cache;
1099 int blocks_per_page;
1100 int block;
1101 int pnum;
1102 struct page *page;
1103
1104 /* Set buddy page as not up to date */
1105 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1106 block = group * 2;
1107 pnum = block / blocks_per_page;
1108 page = find_get_page(inode->i_mapping, pnum);
1109 if (page != NULL) {
1110 ClearPageUptodate(page);
1111 page_cache_release(page);
1112 }
1113
1114 /* Set bitmap page as not up to date */
1115 block++;
1116 pnum = block / blocks_per_page;
1117 page = find_get_page(inode->i_mapping, pnum);
1118 if (page != NULL) {
1119 ClearPageUptodate(page);
1120 page_cache_release(page);
1121 }
1122
1123 /* Get the info on the last group */
1124 grp = ext4_get_group_info(sb, group);
1125
1126 /* Update free blocks in group info */
1127 ext4_mb_update_group_info(grp, add);
1128 }
1129
1130 if (test_opt(sb, DEBUG)) 1089 if (test_opt(sb, DEBUG))
1131 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1090 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1132 ext4_blocks_count(es)); 1091 ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c65dbe..e5f06a5f045e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,9 +51,7 @@ struct proc_dir_entry *ext4_proc_root;
51 51
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 52static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 53 unsigned long journal_devnum);
54static int ext4_create_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_commit_super(struct super_block *sb,
55 unsigned int);
56static void ext4_commit_super(struct super_block *sb,
57 struct ext4_super_block *es, int sync); 55 struct ext4_super_block *es, int sync);
58static void ext4_mark_recovery_complete(struct super_block *sb, 56static void ext4_mark_recovery_complete(struct super_block *sb,
59 struct ext4_super_block *es); 57 struct ext4_super_block *es);
@@ -64,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
64 char nbuf[16]); 62 char nbuf[16]);
65static int ext4_remount(struct super_block *sb, int *flags, char *data); 63static int ext4_remount(struct super_block *sb, int *flags, char *data);
66static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 64static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
67static void ext4_unlockfs(struct super_block *sb); 65static int ext4_unfreeze(struct super_block *sb);
68static void ext4_write_super(struct super_block *sb); 66static void ext4_write_super(struct super_block *sb);
69static void ext4_write_super_lockfs(struct super_block *sb); 67static int ext4_freeze(struct super_block *sb);
70 68
71 69
72ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 70ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
93 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 91 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
94} 92}
95 93
94__u32 ext4_free_blks_count(struct super_block *sb,
95 struct ext4_group_desc *bg)
96{
97 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
98 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
99 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
100}
101
102__u32 ext4_free_inodes_count(struct super_block *sb,
103 struct ext4_group_desc *bg)
104{
105 return le16_to_cpu(bg->bg_free_inodes_count_lo) |
106 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
107 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
108}
109
110__u32 ext4_used_dirs_count(struct super_block *sb,
111 struct ext4_group_desc *bg)
112{
113 return le16_to_cpu(bg->bg_used_dirs_count_lo) |
114 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
115 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
116}
117
118__u32 ext4_itable_unused_count(struct super_block *sb,
119 struct ext4_group_desc *bg)
120{
121 return le16_to_cpu(bg->bg_itable_unused_lo) |
122 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
123 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
124}
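These accessors splice a 32-bit count out of two little-endian 16-bit halves; the high half is only meaningful when the descriptor size indicates the 64-bit layout. A self-contained sketch of the arithmetic (the values are made up):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint16_t lo = 0x3400;   /* bg_..._lo, already converted to cpu order */
		uint16_t hi = 0x0012;   /* bg_..._hi, present only on 64-bit layouts */
		uint32_t count = (uint32_t)lo | ((uint32_t)hi << 16);

		printf("0x%x\n", count);        /* prints 0x123400 */
		return 0;
	}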
125
96void ext4_block_bitmap_set(struct super_block *sb, 126void ext4_block_bitmap_set(struct super_block *sb,
97 struct ext4_group_desc *bg, ext4_fsblk_t blk) 127 struct ext4_group_desc *bg, ext4_fsblk_t blk)
98{ 128{
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
117 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 147 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
118} 148}
119 149
150void ext4_free_blks_set(struct super_block *sb,
151 struct ext4_group_desc *bg, __u32 count)
152{
153 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
154 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
155 bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
156}
157
158void ext4_free_inodes_set(struct super_block *sb,
159 struct ext4_group_desc *bg, __u32 count)
160{
161 bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
162 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
163 bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
164}
165
166void ext4_used_dirs_set(struct super_block *sb,
167 struct ext4_group_desc *bg, __u32 count)
168{
169 bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
170 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
171 bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
172}
173
174void ext4_itable_unused_set(struct super_block *sb,
175 struct ext4_group_desc *bg, __u32 count)
176{
177 bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
178 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
179 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
180}
181
120/* 182/*
121 * Wrappers for jbd2_journal_start/end. 183 * Wrappers for jbd2_journal_start/end.
122 * 184 *
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
136 * backs (eg. EIO in the commit thread), then we still need to 198 * backs (eg. EIO in the commit thread), then we still need to
137 * take the FS itself readonly cleanly. */ 199 * take the FS itself readonly cleanly. */
138 journal = EXT4_SB(sb)->s_journal; 200 journal = EXT4_SB(sb)->s_journal;
139 if (is_journal_aborted(journal)) { 201 if (journal) {
140 ext4_abort(sb, __func__, 202 if (is_journal_aborted(journal)) {
141 "Detected aborted journal"); 203 ext4_abort(sb, __func__,
142 return ERR_PTR(-EROFS); 204 "Detected aborted journal");
205 return ERR_PTR(-EROFS);
206 }
207 return jbd2_journal_start(journal, nblocks);
143 } 208 }
144 209 /*
145 return jbd2_journal_start(journal, nblocks); 210 * We're not journaling; return the appropriate indication.
211 */
212 current->journal_info = EXT4_NOJOURNAL_HANDLE;
213 return current->journal_info;
146} 214}
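EXT4_NOJOURNAL_HANDLE is a sentinel pointer parked in current->journal_info so that later ext4_journal_* calls can recognize a no-journal mount; ext4_handle_valid() is the corresponding test. The companion definitions in ext4_jbd2.h look roughly like this (a sketch; the exact sentinel value is an implementation detail of the series):

	#define EXT4_NOJOURNAL_HANDLE   ((handle_t *) 0x1)      /* never dereferenced */

	static inline int ext4_handle_valid(handle_t *handle)
	{
		if (handle == EXT4_NOJOURNAL_HANDLE)
			return 0;
		return 1;
	}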
147 215
148/* 216/*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
157 int err; 225 int err;
158 int rc; 226 int rc;
159 227
228 if (!ext4_handle_valid(handle)) {
229 /*
230 * Do this here since we don't call jbd2_journal_stop() in
231 * no-journal mode.
232 */
233 current->journal_info = NULL;
234 return 0;
235 }
160 sb = handle->h_transaction->t_journal->j_private; 236 sb = handle->h_transaction->t_journal->j_private;
161 err = handle->h_err; 237 err = handle->h_err;
162 rc = jbd2_journal_stop(handle); 238 rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
174 char nbuf[16]; 250 char nbuf[16];
175 const char *errstr = ext4_decode_error(NULL, err, nbuf); 251 const char *errstr = ext4_decode_error(NULL, err, nbuf);
176 252
253 BUG_ON(!ext4_handle_valid(handle));
254
177 if (bh) 255 if (bh)
178 BUFFER_TRACE(bh, "abort"); 256 BUFFER_TRACE(bh, "abort");
179 257
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
350 va_end(args); 428 va_end(args);
351} 429}
352 430
431void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
432 const char *function, const char *fmt, ...)
433__releases(bitlock)
434__acquires(bitlock)
435{
436 va_list args;
437 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
438
439 va_start(args, fmt);
440 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
441 vprintk(fmt, args);
442 printk("\n");
443 va_end(args);
444
445 if (test_opt(sb, ERRORS_CONT)) {
446 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
447 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
448 ext4_commit_super(sb, es, 0);
449 return;
450 }
451 ext4_unlock_group(sb, grp);
452 ext4_handle_error(sb);
453 /*
454 * We only get here in the ERRORS_RO case; relocking the group
455 * may be dangerous, but nothing bad will happen since the
456 * filesystem will have already been marked read-only and the
457 * journal has been aborted. We return 1 as a hint to callers
458 * who might want to use the return value from
459 * ext4_grp_locked_error() to distinguish between the
460 * ERRORS_CONT and ERRORS_RO cases, and perhaps return more
461 * aggressively from the ext4 function in question, with a
462 * more appropriate error code.
463 */
464 ext4_lock_group(sb, grp);
465 return;
466}
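The locking contract is that the caller already holds the group's bitlock, and in the ERRORS_RO path the helper drops and retakes it around ext4_handle_error(). A hypothetical call site (the consistency check itself is made up):

	ext4_lock_group(sb, group);
	if (free != expected)   /* hypothetical on-disk vs. computed mismatch */
		ext4_grp_locked_error(sb, group, __func__,
				      "free count %u != computed %u",
				      free, expected);
	ext4_unlock_group(sb, group);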
467
468
353void ext4_update_dynamic_rev(struct super_block *sb) 469void ext4_update_dynamic_rev(struct super_block *sb)
354{ 470{
355 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 471 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
389 return bdev; 505 return bdev;
390 506
391fail: 507fail:
392 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", 508 printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
393 __bdevname(dev, b), PTR_ERR(bdev)); 509 __bdevname(dev, b), PTR_ERR(bdev));
394 return NULL; 510 return NULL;
395} 511}
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
448 ext4_mb_release(sb); 564 ext4_mb_release(sb);
449 ext4_ext_release(sb); 565 ext4_ext_release(sb);
450 ext4_xattr_put_super(sb); 566 ext4_xattr_put_super(sb);
451 err = jbd2_journal_destroy(sbi->s_journal); 567 if (sbi->s_journal) {
452 sbi->s_journal = NULL; 568 err = jbd2_journal_destroy(sbi->s_journal);
453 if (err < 0) 569 sbi->s_journal = NULL;
454 ext4_abort(sb, __func__, "Couldn't clean up the journal"); 570 if (err < 0)
455 571 ext4_abort(sb, __func__,
572 "Couldn't clean up the journal");
573 }
456 if (!(sb->s_flags & MS_RDONLY)) { 574 if (!(sb->s_flags & MS_RDONLY)) {
457 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 575 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
458 es->s_state = cpu_to_le16(sbi->s_mount_state); 576 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
522 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 640 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
523 INIT_LIST_HEAD(&ei->i_prealloc_list); 641 INIT_LIST_HEAD(&ei->i_prealloc_list);
524 spin_lock_init(&ei->i_prealloc_lock); 642 spin_lock_init(&ei->i_prealloc_lock);
643 /*
644 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
645 * therefore it can be null here. Don't check it, just initialize
646 * jinode.
647 */
525 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); 648 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
526 ei->i_reserved_data_blocks = 0; 649 ei->i_reserved_data_blocks = 0;
527 ei->i_reserved_meta_blocks = 0; 650 ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
588 } 711 }
589#endif 712#endif
590 ext4_discard_preallocations(inode); 713 ext4_discard_preallocations(inode);
591 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 714 if (EXT4_JOURNAL(inode))
715 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
592 &EXT4_I(inode)->jinode); 716 &EXT4_I(inode)->jinode);
593} 717}
594 718
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
681#endif 805#endif
682 if (!test_opt(sb, RESERVATION)) 806 if (!test_opt(sb, RESERVATION))
683 seq_puts(seq, ",noreservation"); 807 seq_puts(seq, ",noreservation");
684 if (sbi->s_commit_interval) { 808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
685 seq_printf(seq, ",commit=%u", 809 seq_printf(seq, ",commit=%u",
686 (unsigned) (sbi->s_commit_interval / HZ)); 810 (unsigned) (sbi->s_commit_interval / HZ));
687 } 811 }
812 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
813 seq_printf(seq, ",min_batch_time=%u",
814 (unsigned) sbi->s_min_batch_time);
815 }
816 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
817 seq_printf(seq, ",max_batch_time=%u",
818 (unsigned) sbi->s_max_batch_time);
819 }
820
688 /* 821 /*
689 * We're changing the default of barrier mount option, so 822 * We're changing the default of barrier mount option, so
690 * let's always display its mount state so it's clear what its 823 * let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
696 seq_puts(seq, ",journal_async_commit"); 829 seq_puts(seq, ",journal_async_commit");
697 if (test_opt(sb, NOBH)) 830 if (test_opt(sb, NOBH))
698 seq_puts(seq, ",nobh"); 831 seq_puts(seq, ",nobh");
699 if (!test_opt(sb, EXTENTS))
700 seq_puts(seq, ",noextents");
701 if (test_opt(sb, I_VERSION)) 832 if (test_opt(sb, I_VERSION))
702 seq_puts(seq, ",i_version"); 833 seq_puts(seq, ",i_version");
703 if (!test_opt(sb, DELALLOC)) 834 if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
772 ext4_nfs_get_inode); 903 ext4_nfs_get_inode);
773} 904}
774 905
906/*
907 * Try to release metadata pages (indirect blocks, directories) which are
908 * mapped via the block device. Since these pages could have journal heads
909 * which would prevent try_to_free_buffers() from freeing them, we must use
910 * the jbd2 layer's try_to_free_buffers() function to release them.
911 */
912static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
913{
914 journal_t *journal = EXT4_SB(sb)->s_journal;
915
916 WARN_ON(PageChecked(page));
917 if (!page_has_buffers(page))
918 return 0;
919 if (journal)
920 return jbd2_journal_try_to_free_buffers(journal, page,
921 wait & ~__GFP_WAIT);
922 return try_to_free_buffers(page);
923}
924
775#ifdef CONFIG_QUOTA 925#ifdef CONFIG_QUOTA
776#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") 926#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
777#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 927#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -803,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = {
803 .acquire_dquot = ext4_acquire_dquot, 953 .acquire_dquot = ext4_acquire_dquot,
804 .release_dquot = ext4_release_dquot, 954 .release_dquot = ext4_release_dquot,
805 .mark_dirty = ext4_mark_dquot_dirty, 955 .mark_dirty = ext4_mark_dquot_dirty,
806 .write_info = ext4_write_info 956 .write_info = ext4_write_info,
957 .alloc_dquot = dquot_alloc,
958 .destroy_dquot = dquot_destroy,
807}; 959};
808 960
809static struct quotactl_ops ext4_qctl_operations = { 961static struct quotactl_ops ext4_qctl_operations = {
@@ -826,8 +978,8 @@ static const struct super_operations ext4_sops = {
826 .put_super = ext4_put_super, 978 .put_super = ext4_put_super,
827 .write_super = ext4_write_super, 979 .write_super = ext4_write_super,
828 .sync_fs = ext4_sync_fs, 980 .sync_fs = ext4_sync_fs,
829 .write_super_lockfs = ext4_write_super_lockfs, 981 .freeze_fs = ext4_freeze,
830 .unlockfs = ext4_unlockfs, 982 .unfreeze_fs = ext4_unfreeze,
831 .statfs = ext4_statfs, 983 .statfs = ext4_statfs,
832 .remount_fs = ext4_remount, 984 .remount_fs = ext4_remount,
833 .clear_inode = ext4_clear_inode, 985 .clear_inode = ext4_clear_inode,
@@ -836,6 +988,7 @@ static const struct super_operations ext4_sops = {
836 .quota_read = ext4_quota_read, 988 .quota_read = ext4_quota_read,
837 .quota_write = ext4_quota_write, 989 .quota_write = ext4_quota_write,
838#endif 990#endif
991 .bdev_try_to_free_page = bdev_try_to_free_page,
839}; 992};
840 993
841static const struct export_operations ext4_export_ops = { 994static const struct export_operations ext4_export_ops = {
@@ -850,16 +1003,17 @@ enum {
850 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1003 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
851 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1004 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
852 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1005 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
853 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 1006 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1007 Opt_journal_update, Opt_journal_dev,
854 Opt_journal_checksum, Opt_journal_async_commit, 1008 Opt_journal_checksum, Opt_journal_async_commit,
855 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1009 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
856 Opt_data_err_abort, Opt_data_err_ignore, 1010 Opt_data_err_abort, Opt_data_err_ignore,
857 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1011 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
858 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1012 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
859 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1013 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
860 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 1014 Opt_grpquota, Opt_i_version,
861 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1015 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
862 Opt_inode_readahead_blks 1016 Opt_inode_readahead_blks, Opt_journal_ioprio
863}; 1017};
864 1018
865static const match_table_t tokens = { 1019static const match_table_t tokens = {
@@ -889,8 +1043,9 @@ static const match_table_t tokens = {
889 {Opt_nobh, "nobh"}, 1043 {Opt_nobh, "nobh"},
890 {Opt_bh, "bh"}, 1044 {Opt_bh, "bh"},
891 {Opt_commit, "commit=%u"}, 1045 {Opt_commit, "commit=%u"},
1046 {Opt_min_batch_time, "min_batch_time=%u"},
1047 {Opt_max_batch_time, "max_batch_time=%u"},
892 {Opt_journal_update, "journal=update"}, 1048 {Opt_journal_update, "journal=update"},
893 {Opt_journal_inum, "journal=%u"},
894 {Opt_journal_dev, "journal_dev=%u"}, 1049 {Opt_journal_dev, "journal_dev=%u"},
895 {Opt_journal_checksum, "journal_checksum"}, 1050 {Opt_journal_checksum, "journal_checksum"},
896 {Opt_journal_async_commit, "journal_async_commit"}, 1051 {Opt_journal_async_commit, "journal_async_commit"},
@@ -911,14 +1066,13 @@ static const match_table_t tokens = {
911 {Opt_quota, "quota"}, 1066 {Opt_quota, "quota"},
912 {Opt_usrquota, "usrquota"}, 1067 {Opt_usrquota, "usrquota"},
913 {Opt_barrier, "barrier=%u"}, 1068 {Opt_barrier, "barrier=%u"},
914 {Opt_extents, "extents"},
915 {Opt_noextents, "noextents"},
916 {Opt_i_version, "i_version"}, 1069 {Opt_i_version, "i_version"},
917 {Opt_stripe, "stripe=%u"}, 1070 {Opt_stripe, "stripe=%u"},
918 {Opt_resize, "resize"}, 1071 {Opt_resize, "resize"},
919 {Opt_delalloc, "delalloc"}, 1072 {Opt_delalloc, "delalloc"},
920 {Opt_nodelalloc, "nodelalloc"}, 1073 {Opt_nodelalloc, "nodelalloc"},
921 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1074 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1075 {Opt_journal_ioprio, "journal_ioprio=%u"},
922 {Opt_err, NULL}, 1076 {Opt_err, NULL},
923}; 1077};
924 1078
@@ -943,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
943 return sb_block; 1097 return sb_block;
944} 1098}
945 1099
1100#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1101
946static int parse_options(char *options, struct super_block *sb, 1102static int parse_options(char *options, struct super_block *sb,
947 unsigned int *inum, unsigned long *journal_devnum, 1103 unsigned long *journal_devnum,
1104 unsigned int *journal_ioprio,
948 ext4_fsblk_t *n_blocks_count, int is_remount) 1105 ext4_fsblk_t *n_blocks_count, int is_remount)
949{ 1106{
950 struct ext4_sb_info *sbi = EXT4_SB(sb); 1107 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -956,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
956 int qtype, qfmt; 1113 int qtype, qfmt;
957 char *qname; 1114 char *qname;
958#endif 1115#endif
959 ext4_fsblk_t last_block;
960 1116
961 if (!options) 1117 if (!options)
962 return 1; 1118 return 1;
@@ -1068,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1068 } 1224 }
1069 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1225 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
1070 break; 1226 break;
1071 case Opt_journal_inum:
1072 if (is_remount) {
1073 printk(KERN_ERR "EXT4-fs: cannot specify "
1074 "journal on remount\n");
1075 return 0;
1076 }
1077 if (match_int(&args[0], &option))
1078 return 0;
1079 *inum = option;
1080 break;
1081 case Opt_journal_dev: 1227 case Opt_journal_dev:
1082 if (is_remount) { 1228 if (is_remount) {
1083 printk(KERN_ERR "EXT4-fs: cannot specify " 1229 printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1107,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
1107 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1253 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1108 sbi->s_commit_interval = HZ * option; 1254 sbi->s_commit_interval = HZ * option;
1109 break; 1255 break;
1256 case Opt_max_batch_time:
1257 if (match_int(&args[0], &option))
1258 return 0;
1259 if (option < 0)
1260 return 0;
1261 if (option == 0)
1262 option = EXT4_DEF_MAX_BATCH_TIME;
1263 sbi->s_max_batch_time = option;
1264 break;
1265 case Opt_min_batch_time:
1266 if (match_int(&args[0], &option))
1267 return 0;
1268 if (option < 0)
1269 return 0;
1270 sbi->s_min_batch_time = option;
1271 break;
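Both knobs are plain integers in microseconds and are handed through to jbd2's j_min_batch_time and j_max_batch_time in ext4_init_journal_params() further down. The matching defaults in ext4.h from this series are approximately:

	#define EXT4_DEF_MIN_BATCH_TIME 0
	#define EXT4_DEF_MAX_BATCH_TIME 15000   /* 15 ms; value assumed from the series */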
1110 case Opt_data_journal: 1272 case Opt_data_journal:
1111 data_opt = EXT4_MOUNT_JOURNAL_DATA; 1273 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1112 goto datacheck; 1274 goto datacheck;
@@ -1142,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb,
1142 case Opt_grpjquota: 1304 case Opt_grpjquota:
1143 qtype = GRPQUOTA; 1305 qtype = GRPQUOTA;
1144set_qf_name: 1306set_qf_name:
1145 if ((sb_any_quota_enabled(sb) || 1307 if (sb_any_quota_loaded(sb) &&
1146 sb_any_quota_suspended(sb)) &&
1147 !sbi->s_qf_names[qtype]) { 1308 !sbi->s_qf_names[qtype]) {
1148 printk(KERN_ERR 1309 printk(KERN_ERR
1149 "EXT4-fs: Cannot change journaled " 1310 "EXT4-fs: Cannot change journaled "
@@ -1182,8 +1343,7 @@ set_qf_name:
1182 case Opt_offgrpjquota: 1343 case Opt_offgrpjquota:
1183 qtype = GRPQUOTA; 1344 qtype = GRPQUOTA;
1184clear_qf_name: 1345clear_qf_name:
1185 if ((sb_any_quota_enabled(sb) || 1346 if (sb_any_quota_loaded(sb) &&
1186 sb_any_quota_suspended(sb)) &&
1187 sbi->s_qf_names[qtype]) { 1347 sbi->s_qf_names[qtype]) {
1188 printk(KERN_ERR "EXT4-fs: Cannot change " 1348 printk(KERN_ERR "EXT4-fs: Cannot change "
1189 "journaled quota options when " 1349 "journaled quota options when "
@@ -1202,8 +1362,7 @@ clear_qf_name:
1202 case Opt_jqfmt_vfsv0: 1362 case Opt_jqfmt_vfsv0:
1203 qfmt = QFMT_VFS_V0; 1363 qfmt = QFMT_VFS_V0;
1204set_qf_format: 1364set_qf_format:
1205 if ((sb_any_quota_enabled(sb) || 1365 if (sb_any_quota_loaded(sb) &&
1206 sb_any_quota_suspended(sb)) &&
1207 sbi->s_jquota_fmt != qfmt) { 1366 sbi->s_jquota_fmt != qfmt) {
1208 printk(KERN_ERR "EXT4-fs: Cannot change " 1367 printk(KERN_ERR "EXT4-fs: Cannot change "
1209 "journaled quota options when " 1368 "journaled quota options when "
@@ -1222,7 +1381,7 @@ set_qf_format:
1222 set_opt(sbi->s_mount_opt, GRPQUOTA); 1381 set_opt(sbi->s_mount_opt, GRPQUOTA);
1223 break; 1382 break;
1224 case Opt_noquota: 1383 case Opt_noquota:
1225 if (sb_any_quota_enabled(sb)) { 1384 if (sb_any_quota_loaded(sb)) {
1226 printk(KERN_ERR "EXT4-fs: Cannot change quota " 1385 printk(KERN_ERR "EXT4-fs: Cannot change quota "
1227 "options when quota turned on.\n"); 1386 "options when quota turned on.\n");
1228 return 0; 1387 return 0;
@@ -1280,33 +1439,6 @@ set_qf_format:
1280 case Opt_bh: 1439 case Opt_bh:
1281 clear_opt(sbi->s_mount_opt, NOBH); 1440 clear_opt(sbi->s_mount_opt, NOBH);
1282 break; 1441 break;
1283 case Opt_extents:
1284 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1285 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1286 ext4_warning(sb, __func__,
1287 "extents feature not enabled "
1288 "on this filesystem, use tune2fs\n");
1289 return 0;
1290 }
1291 set_opt(sbi->s_mount_opt, EXTENTS);
1292 break;
1293 case Opt_noextents:
1294 /*
1295 * When e2fsprogs support resizing an already existing
1296 * ext3 file system to greater than 2**32 we need to
1297 * add support to block allocator to handle growing
1298 * already existing block mapped inode so that blocks
1299 * allocated for them fall within 2**32
1300 */
1301 last_block = ext4_blocks_count(sbi->s_es) - 1;
1302 if (last_block > 0xffffffffULL) {
1303 printk(KERN_ERR "EXT4-fs: Filesystem too "
1304 "large to mount with "
1305 "-o noextents options\n");
1306 return 0;
1307 }
1308 clear_opt(sbi->s_mount_opt, EXTENTS);
1309 break;
1310 case Opt_i_version: 1442 case Opt_i_version:
1311 set_opt(sbi->s_mount_opt, I_VERSION); 1443 set_opt(sbi->s_mount_opt, I_VERSION);
1312 sb->s_flags |= MS_I_VERSION; 1444 sb->s_flags |= MS_I_VERSION;
@@ -1331,6 +1463,14 @@ set_qf_format:
1331 return 0; 1463 return 0;
1332 sbi->s_inode_readahead_blks = option; 1464 sbi->s_inode_readahead_blks = option;
1333 break; 1465 break;
1466 case Opt_journal_ioprio:
1467 if (match_int(&args[0], &option))
1468 return 0;
1469 if (option < 0 || option > 7)
1470 break;
1471 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1472 option);
1473 break;
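journal_ioprio=N picks a best-effort I/O priority level (0..7) for the kjournald2 thread. IOPRIO_PRIO_VALUE() packs the scheduling class and level into one word; a sketch of the encoding as in the mainline ioprio header (the shift width is assumed):

	#define IOPRIO_CLASS_SHIFT      13      /* assumed, per mainline ioprio.h */
	#define IOPRIO_PRIO_VALUE(class, data)  (((class) << IOPRIO_CLASS_SHIFT) | (data))

	/* DEFAULT_JOURNAL_IOPRIO above is IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3) */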
1334 default: 1474 default:
1335 printk(KERN_ERR 1475 printk(KERN_ERR
1336 "EXT4-fs: Unrecognized mount option \"%s\" " 1476 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1406,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1406 printk(KERN_WARNING 1546 printk(KERN_WARNING
1407 "EXT4-fs warning: checktime reached, " 1547 "EXT4-fs warning: checktime reached, "
1408 "running e2fsck is recommended\n"); 1548 "running e2fsck is recommended\n");
1409#if 0 1549 if (!sbi->s_journal)
1410 /* @@@ We _will_ want to clear the valid bit if we find 1550 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1411 * inconsistencies, to force a fsck at reboot. But for
1412 * a plain journaled filesystem we can keep it set as
1413 * valid forever! :)
1414 */
1415 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1416#endif
1417 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1551 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1418 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 1552 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1419 le16_add_cpu(&es->s_mnt_count, 1); 1553 le16_add_cpu(&es->s_mnt_count, 1);
1420 es->s_mtime = cpu_to_le32(get_seconds()); 1554 es->s_mtime = cpu_to_le32(get_seconds());
1421 ext4_update_dynamic_rev(sb); 1555 ext4_update_dynamic_rev(sb);
1422 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1556 if (sbi->s_journal)
1557 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1423 1558
1424 ext4_commit_super(sb, es, 1); 1559 ext4_commit_super(sb, es, 1);
1425 if (test_opt(sb, DEBUG)) 1560 if (test_opt(sb, DEBUG))
1426 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " 1561 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1427 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 1562 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1428 sb->s_blocksize, 1563 sb->s_blocksize,
1429 sbi->s_groups_count, 1564 sbi->s_groups_count,
@@ -1431,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1431 EXT4_INODES_PER_GROUP(sb), 1566 EXT4_INODES_PER_GROUP(sb),
1432 sbi->s_mount_opt); 1567 sbi->s_mount_opt);
1433 1568
1434 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", 1569 if (EXT4_SB(sb)->s_journal) {
1435 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : 1570 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1436 "external", EXT4_SB(sb)->s_journal->j_devname); 1571 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1572 "external", EXT4_SB(sb)->s_journal->j_devname);
1573 } else {
1574 printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
1575 }
1437 return res; 1576 return res;
1438} 1577}
1439 1578
@@ -1445,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1445 ext4_group_t flex_group_count; 1584 ext4_group_t flex_group_count;
1446 ext4_group_t flex_group; 1585 ext4_group_t flex_group;
1447 int groups_per_flex = 0; 1586 int groups_per_flex = 0;
1448 __u64 block_bitmap = 0;
1449 int i; 1587 int i;
1450 1588
1451 if (!sbi->s_es->s_log_groups_per_flex) { 1589 if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1464,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
1464 sizeof(struct flex_groups), GFP_KERNEL); 1602 sizeof(struct flex_groups), GFP_KERNEL);
1465 if (sbi->s_flex_groups == NULL) { 1603 if (sbi->s_flex_groups == NULL) {
1466 printk(KERN_ERR "EXT4-fs: not enough memory for " 1604 printk(KERN_ERR "EXT4-fs: not enough memory for "
1467 "%lu flex groups\n", flex_group_count); 1605 "%u flex groups\n", flex_group_count);
1468 goto failed; 1606 goto failed;
1469 } 1607 }
1470 1608
1471 gdp = ext4_get_group_desc(sb, 1, &bh);
1472 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1473
1474 for (i = 0; i < sbi->s_groups_count; i++) { 1609 for (i = 0; i < sbi->s_groups_count; i++) {
1475 gdp = ext4_get_group_desc(sb, i, &bh); 1610 gdp = ext4_get_group_desc(sb, i, &bh);
1476 1611
1477 flex_group = ext4_flex_group(sbi, i); 1612 flex_group = ext4_flex_group(sbi, i);
1478 sbi->s_flex_groups[flex_group].free_inodes += 1613 sbi->s_flex_groups[flex_group].free_inodes +=
1479 le16_to_cpu(gdp->bg_free_inodes_count); 1614 ext4_free_inodes_count(sb, gdp);
1480 sbi->s_flex_groups[flex_group].free_blocks += 1615 sbi->s_flex_groups[flex_group].free_blocks +=
1481 le16_to_cpu(gdp->bg_free_blocks_count); 1616 ext4_free_blks_count(sb, gdp);
1482 } 1617 }
1483 1618
1484 return 1; 1619 return 1;
@@ -1552,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1552 block_bitmap = ext4_block_bitmap(sb, gdp); 1687 block_bitmap = ext4_block_bitmap(sb, gdp);
1553 if (block_bitmap < first_block || block_bitmap > last_block) { 1688 if (block_bitmap < first_block || block_bitmap > last_block) {
1554 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1689 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1555 "Block bitmap for group %lu not in group " 1690 "Block bitmap for group %u not in group "
1556 "(block %llu)!\n", i, block_bitmap); 1691 "(block %llu)!\n", i, block_bitmap);
1557 return 0; 1692 return 0;
1558 } 1693 }
1559 inode_bitmap = ext4_inode_bitmap(sb, gdp); 1694 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1560 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1695 if (inode_bitmap < first_block || inode_bitmap > last_block) {
1561 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1696 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1562 "Inode bitmap for group %lu not in group " 1697 "Inode bitmap for group %u not in group "
1563 "(block %llu)!\n", i, inode_bitmap); 1698 "(block %llu)!\n", i, inode_bitmap);
1564 return 0; 1699 return 0;
1565 } 1700 }
@@ -1567,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1567 if (inode_table < first_block || 1702 if (inode_table < first_block ||
1568 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1703 inode_table + sbi->s_itb_per_group - 1 > last_block) {
1569 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1704 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1570 "Inode table for group %lu not in group " 1705 "Inode table for group %u not in group "
1571 "(block %llu)!\n", i, inode_table); 1706 "(block %llu)!\n", i, inode_table);
1572 return 0; 1707 return 0;
1573 } 1708 }
1574 spin_lock(sb_bgl_lock(sbi, i)); 1709 spin_lock(sb_bgl_lock(sbi, i));
1575 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 1710 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1576 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1711 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1577 "Checksum for group %lu failed (%u!=%u)\n", 1712 "Checksum for group %u failed (%u!=%u)\n",
1578 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1713 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1579 gdp)), le16_to_cpu(gdp->bg_checksum)); 1714 gdp)), le16_to_cpu(gdp->bg_checksum));
1580 if (!(sb->s_flags & MS_RDONLY)) { 1715 if (!(sb->s_flags & MS_RDONLY)) {
@@ -1721,7 +1856,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
1721 /* small i_blocks in vfs inode? */ 1856 /* small i_blocks in vfs inode? */
1722 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1857 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1723 /* 1858 /*
1724 * CONFIG_LSF is not enabled implies the inode 1859 * CONFIG_LBD is not enabled implies the inode
1725 * i_block represents total blocks in 512 bytes 1860 * i_block represents total blocks in 512 bytes
1726 * 32 == size of vfs inode i_blocks * 8 1861 * 32 == size of vfs inode i_blocks * 8
1727 */ 1862 */
@@ -1764,7 +1899,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
1764 1899
1765 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1900 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1766 /* 1901 /*
1767 * !has_huge_files or CONFIG_LSF is not enabled 1902 * !has_huge_files or CONFIG_LBD is not enabled
1768 * implies the inode i_block represents total blocks in 1903 * implies the inode i_block represents total blocks in
1769 * 512 bytes 32 == size of vfs inode i_blocks * 8 1904 * 512 bytes 32 == size of vfs inode i_blocks * 8
1770 */ 1905 */
@@ -1866,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1866 ext4_fsblk_t sb_block = get_sb_block(&data); 2001 ext4_fsblk_t sb_block = get_sb_block(&data);
1867 ext4_fsblk_t logical_sb_block; 2002 ext4_fsblk_t logical_sb_block;
1868 unsigned long offset = 0; 2003 unsigned long offset = 0;
1869 unsigned int journal_inum = 0;
1870 unsigned long journal_devnum = 0; 2004 unsigned long journal_devnum = 0;
1871 unsigned long def_mount_opts; 2005 unsigned long def_mount_opts;
1872 struct inode *root; 2006 struct inode *root;
1873 char *cp; 2007 char *cp;
2008 const char *descr;
1874 int ret = -EINVAL; 2009 int ret = -EINVAL;
1875 int blocksize; 2010 int blocksize;
1876 int db_count; 2011 unsigned int db_count;
1877 int i; 2012 unsigned int i;
1878 int needs_recovery, has_huge_files; 2013 int needs_recovery, has_huge_files;
1879 __le32 features; 2014 int features;
1880 __u64 blocks_count; 2015 __u64 blocks_count;
1881 int err; 2016 int err;
2017 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
1882 2018
1883 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2019 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1884 if (!sbi) 2020 if (!sbi)
@@ -1959,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1959 2095
1960 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2096 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1961 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2097 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
2098 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2099 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2100 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
1962 2101
1963 set_opt(sbi->s_mount_opt, RESERVATION); 2102 set_opt(sbi->s_mount_opt, RESERVATION);
1964 set_opt(sbi->s_mount_opt, BARRIER); 2103 set_opt(sbi->s_mount_opt, BARRIER);
1965 2104
1966 /* 2105 /*
1967 * turn on extents feature by default in ext4 filesystem
1968 * only if feature flag already set by mkfs or tune2fs.
1969 * Use -o noextents to turn it off
1970 */
1971 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
1972 set_opt(sbi->s_mount_opt, EXTENTS);
1973 else
1974 ext4_warning(sb, __func__,
1975 "extents feature not enabled on this filesystem, "
1976 "use tune2fs.\n");
1977
1978 /*
1979 * enable delayed allocation by default 2106 * enable delayed allocation by default
1980 * Use -o nodelalloc to turn it off 2107 * Use -o nodelalloc to turn it off
1981 */ 2108 */
1982 set_opt(sbi->s_mount_opt, DELALLOC); 2109 set_opt(sbi->s_mount_opt, DELALLOC);
1983 2110
1984 2111
1985 if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, 2112 if (!parse_options((char *) data, sb, &journal_devnum,
1986 NULL, 0)) 2113 &journal_ioprio, NULL, 0))
1987 goto failed_mount; 2114 goto failed_mount;
1988 2115
1989 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2116 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2005,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2005 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2132 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
2006 if (features) { 2133 if (features) {
2007 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " 2134 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
2008 "unsupported optional features (%x).\n", 2135 "unsupported optional features (%x).\n", sb->s_id,
2009 sb->s_id, le32_to_cpu(features)); 2136 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2137 ~EXT4_FEATURE_INCOMPAT_SUPP));
2010 goto failed_mount; 2138 goto failed_mount;
2011 } 2139 }
2012 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); 2140 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2013 if (!(sb->s_flags & MS_RDONLY) && features) { 2141 if (!(sb->s_flags & MS_RDONLY) && features) {
2014 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " 2142 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
2015 "unsupported optional features (%x).\n", 2143 "unsupported optional features (%x).\n", sb->s_id,
2016 sb->s_id, le32_to_cpu(features)); 2144 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2145 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2017 goto failed_mount; 2146 goto failed_mount;
2018 } 2147 }
2019 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2148 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2021,13 +2150,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2021 if (has_huge_files) { 2150 if (has_huge_files) {
2022 /* 2151 /*
2023 * Large file size enabled file system can only be 2152 * Large file size enabled file system can only be
2024 * mounted if the kernel is built with CONFIG_LSF 2153 * mounted if the kernel is built with CONFIG_LBD
2025 */ 2154 */
2026 if (sizeof(root->i_blocks) < sizeof(u64) && 2155 if (sizeof(root->i_blocks) < sizeof(u64) &&
2027 !(sb->s_flags & MS_RDONLY)) { 2156 !(sb->s_flags & MS_RDONLY)) {
2028 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " 2157 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
2029 "files cannot be mounted read-write " 2158 "files cannot be mounted read-write "
2030 "without CONFIG_LSF.\n", sb->s_id); 2159 "without CONFIG_LBD.\n", sb->s_id);
2031 goto failed_mount; 2160 goto failed_mount;
2032 } 2161 }
2033 } 2162 }
@@ -2118,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2118 for (i = 0; i < 4; i++) 2247 for (i = 0; i < 4; i++)
2119 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 2248 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2120 sbi->s_def_hash_version = es->s_def_hash_version; 2249 sbi->s_def_hash_version = es->s_def_hash_version;
2250 i = le32_to_cpu(es->s_flags);
2251 if (i & EXT2_FLAGS_UNSIGNED_HASH)
2252 sbi->s_hash_unsigned = 3;
2253 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2254#ifdef __CHAR_UNSIGNED__
2255 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2256 sbi->s_hash_unsigned = 3;
2257#else
2258 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2259#endif
2260 sb->s_dirt = 1;
2261 }
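The flag matters because the legacy dx hash folds directory-name bytes through plain char, whose signedness is implementation-defined, so the same name can hash differently on x86 (signed char) and ARM or PowerPC (unsigned char). A minimal demonstration of the divergence (a toy fold step, not the real hash):

	#include <stdio.h>

	int main(void)
	{
		char c = '\xe9';        /* high-bit byte, e.g. from a Latin-1 name */
		int folded = c * 7 + 1; /* stand-in for one hash fold step */

		/* signed char:   c == -23, folded == -160
		 * unsigned char: c == 233, folded == 1632 */
		printf("%d\n", folded);
		return 0;
	}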
2121 2262
2122 if (sbi->s_blocks_per_group > blocksize * 8) { 2263 if (sbi->s_blocks_per_group > blocksize * 8) {
2123 printk(KERN_ERR 2264 printk(KERN_ERR
@@ -2145,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2145 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2286 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2146 goto cantfind_ext4; 2287 goto cantfind_ext4;
2147 2288
2148 /* ensure blocks_count calculation below doesn't sign-extend */ 2289 /*
2149 if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < 2290 * It makes no sense for the first data block to be beyond the end
2150 le32_to_cpu(es->s_first_data_block) + 1) { 2291 * of the filesystem.
2151 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " 2292 */
2152 "first data block %u, blocks per group %lu\n", 2293 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2153 ext4_blocks_count(es), 2294 printk(KERN_WARNING "EXT4-fs: bad geometry: first data "
2154 le32_to_cpu(es->s_first_data_block), 2295 "block %u is beyond end of filesystem (%llu)\n",
2155 EXT4_BLOCKS_PER_GROUP(sb)); 2296 le32_to_cpu(es->s_first_data_block),
2297 ext4_blocks_count(es));
2156 goto failed_mount; 2298 goto failed_mount;
2157 } 2299 }
2158 blocks_count = (ext4_blocks_count(es) - 2300 blocks_count = (ext4_blocks_count(es) -
2159 le32_to_cpu(es->s_first_data_block) + 2301 le32_to_cpu(es->s_first_data_block) +
2160 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2302 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2161 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 2303 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2304 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2305 printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2306 "(block count %llu, first data block %u, "
2307 "blocks per group %lu)\n", sbi->s_groups_count,
2308 ext4_blocks_count(es),
2309 le32_to_cpu(es->s_first_data_block),
2310 EXT4_BLOCKS_PER_GROUP(sb));
2311 goto failed_mount;
2312 }
2162 sbi->s_groups_count = blocks_count; 2313 sbi->s_groups_count = blocks_count;
2163 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2314 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2164 EXT4_DESC_PER_BLOCK(sb); 2315 EXT4_DESC_PER_BLOCK(sb);
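A quick worked example of the two round-up divisions, assuming 4 KiB blocks (32768 blocks per group) and 32-byte descriptors (128 per block); the filesystem size is illustrative:

	/* hypothetical 1 TiB filesystem, first_data_block == 0 */
	uint64_t blocks   = 268435456ULL;       /* 1 TiB / 4 KiB                 */
	unsigned bpg      = 32768;              /* 8 bitmap bits * 4096 bytes    */
	unsigned per_blk  = 128;                /* 4096 / 32                     */

	uint64_t groups   = (blocks + bpg - 1) / bpg;           /* 8192 groups   */
	unsigned db_count = (groups + per_blk - 1) / per_blk;   /* 64 GDT blocks */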
@@ -2270,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2270 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2421 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2271 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2422 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2272 ext4_commit_super(sb, es, 1); 2423 ext4_commit_super(sb, es, 1);
2273 printk(KERN_CRIT
2274 "EXT4-fs (device %s): mount failed\n",
2275 sb->s_id);
2276 goto failed_mount4; 2424 goto failed_mount4;
2277 } 2425 }
2278 } 2426 }
2279 } else if (journal_inum) { 2427 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2280 if (ext4_create_journal(sb, es, journal_inum)) 2428 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2281 goto failed_mount3; 2429 printk(KERN_ERR "EXT4-fs: required journal recovery "
2430 "suppressed and not mounted read-only\n");
2431 goto failed_mount4;
2282 } else { 2432 } else {
2283 if (!silent) 2433 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2284 printk(KERN_ERR 2434 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2285 "ext4: No journal on filesystem on %s\n", 2435 sbi->s_journal = NULL;
2286 sb->s_id); 2436 needs_recovery = 0;
2287 goto failed_mount3; 2437 goto no_journal;
2288 } 2438 }
2289 2439
2290 if (ext4_blocks_count(es) > 0xffffffffULL && 2440 if (ext4_blocks_count(es) > 0xffffffffULL &&
2291 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2441 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2292 JBD2_FEATURE_INCOMPAT_64BIT)) { 2442 JBD2_FEATURE_INCOMPAT_64BIT)) {
2293 printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); 2443 printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
2294 goto failed_mount4; 2444 goto failed_mount4;
2295 } 2445 }
2296 2446
@@ -2335,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2335 default: 2485 default:
2336 break; 2486 break;
2337 } 2487 }
2488 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2489
2490no_journal:
2338 2491
2339 if (test_opt(sb, NOBH)) { 2492 if (test_opt(sb, NOBH)) {
2340 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2493 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2420,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2420 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 2573 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2421 ext4_orphan_cleanup(sb, es); 2574 ext4_orphan_cleanup(sb, es);
2422 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 2575 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2423 if (needs_recovery) 2576 if (needs_recovery) {
2424 printk(KERN_INFO "EXT4-fs: recovery complete.\n"); 2577 printk(KERN_INFO "EXT4-fs: recovery complete.\n");
2425 ext4_mark_recovery_complete(sb, es); 2578 ext4_mark_recovery_complete(sb, es);
2426 printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", 2579 }
2427 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": 2580 if (EXT4_SB(sb)->s_journal) {
2428 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2581 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
2429 "writeback"); 2582 descr = " journalled data mode";
2583 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
2584 descr = " ordered data mode";
2585 else
2586 descr = " writeback data mode";
2587 } else
2588 descr = "out journal";
2589
2590 printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
2591 sb->s_id, descr);
2430 2592
2431 lock_kernel(); 2593 lock_kernel();
2432 return 0; 2594 return 0;
@@ -2438,8 +2600,11 @@ cantfind_ext4:
2438 goto failed_mount; 2600 goto failed_mount;
2439 2601
2440failed_mount4: 2602failed_mount4:
2441 jbd2_journal_destroy(sbi->s_journal); 2603 printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
2442 sbi->s_journal = NULL; 2604 if (sbi->s_journal) {
2605 jbd2_journal_destroy(sbi->s_journal);
2606 sbi->s_journal = NULL;
2607 }
2443failed_mount3: 2608failed_mount3:
2444 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2609 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2445 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2610 percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2476,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2476{ 2641{
2477 struct ext4_sb_info *sbi = EXT4_SB(sb); 2642 struct ext4_sb_info *sbi = EXT4_SB(sb);
2478 2643
2479 if (sbi->s_commit_interval) 2644 journal->j_commit_interval = sbi->s_commit_interval;
2480 journal->j_commit_interval = sbi->s_commit_interval; 2645 journal->j_min_batch_time = sbi->s_min_batch_time;
2481 /* We could also set up an ext4-specific default for the commit 2646 journal->j_max_batch_time = sbi->s_max_batch_time;
2482 * interval here, but for now we'll just fall back to the jbd
2483 * default. */
2484 2647
2485 spin_lock(&journal->j_state_lock); 2648 spin_lock(&journal->j_state_lock);
2486 if (test_opt(sb, BARRIER)) 2649 if (test_opt(sb, BARRIER))
@@ -2500,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2500 struct inode *journal_inode; 2663 struct inode *journal_inode;
2501 journal_t *journal; 2664 journal_t *journal;
2502 2665
2666 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2667
2503 /* First, test for the existence of a valid inode on disk. Bad 2668 /* First, test for the existence of a valid inode on disk. Bad
2504 * things happen if we iget() an unused inode, as the subsequent 2669 * things happen if we iget() an unused inode, as the subsequent
2505 * iput() will try to delete it. */ 2670 * iput() will try to delete it. */
@@ -2548,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2548 struct ext4_super_block *es; 2713 struct ext4_super_block *es;
2549 struct block_device *bdev; 2714 struct block_device *bdev;
2550 2715
2716 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2717
2551 bdev = ext4_blkdev_get(j_dev); 2718 bdev = ext4_blkdev_get(j_dev);
2552 if (bdev == NULL) 2719 if (bdev == NULL)
2553 return NULL; 2720 return NULL;
2554 2721
2555 if (bd_claim(bdev, sb)) { 2722 if (bd_claim(bdev, sb)) {
2556 printk(KERN_ERR 2723 printk(KERN_ERR
2557 "EXT4: failed to claim external journal device.\n"); 2724 "EXT4-fs: failed to claim external journal device.\n");
2558 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 2725 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2559 return NULL; 2726 return NULL;
2560 } 2727 }
@@ -2635,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
2635 int err = 0; 2802 int err = 0;
2636 int really_read_only; 2803 int really_read_only;
2637 2804
2805 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2806
2638 if (journal_devnum && 2807 if (journal_devnum &&
2639 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2808 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2640 printk(KERN_INFO "EXT4-fs: external journal device major/minor " 2809 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2719,55 +2888,14 @@ static int ext4_load_journal(struct super_block *sb,
2719 return 0; 2888 return 0;
2720} 2889}
2721 2890
2722static int ext4_create_journal(struct super_block *sb, 2891static int ext4_commit_super(struct super_block *sb,
2723 struct ext4_super_block *es,
2724 unsigned int journal_inum)
2725{
2726 journal_t *journal;
2727 int err;
2728
2729 if (sb->s_flags & MS_RDONLY) {
2730 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2731 "create journal.\n");
2732 return -EROFS;
2733 }
2734
2735 journal = ext4_get_journal(sb, journal_inum);
2736 if (!journal)
2737 return -EINVAL;
2738
2739 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2740 journal_inum);
2741
2742 err = jbd2_journal_create(journal);
2743 if (err) {
2744 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2745 jbd2_journal_destroy(journal);
2746 return -EIO;
2747 }
2748
2749 EXT4_SB(sb)->s_journal = journal;
2750
2751 ext4_update_dynamic_rev(sb);
2752 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2753 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2754
2755 es->s_journal_inum = cpu_to_le32(journal_inum);
2756 sb->s_dirt = 1;
2757
2758 /* Make sure we flush the recovery flag to disk. */
2759 ext4_commit_super(sb, es, 1);
2760
2761 return 0;
2762}
2763
2764static void ext4_commit_super(struct super_block *sb,
2765 struct ext4_super_block *es, int sync) 2892 struct ext4_super_block *es, int sync)
2766{ 2893{
2767 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 2894 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2895 int error = 0;
2768 2896
2769 if (!sbh) 2897 if (!sbh)
2770 return; 2898 return error;
2771 if (buffer_write_io_error(sbh)) { 2899 if (buffer_write_io_error(sbh)) {
2772 /* 2900 /*
2773 * Oh, dear. A previous attempt to write the 2901 * Oh, dear. A previous attempt to write the
@@ -2777,25 +2905,33 @@ static void ext4_commit_super(struct super_block *sb,
2777 * be remapped. Nothing we can do but to retry the 2905 * be remapped. Nothing we can do but to retry the
2778 * write and hope for the best. 2906 * write and hope for the best.
2779 */ 2907 */
2780 printk(KERN_ERR "ext4: previous I/O error to " 2908 printk(KERN_ERR "EXT4-fs: previous I/O error to "
2781 "superblock detected for %s.\n", sb->s_id); 2909 "superblock detected for %s.\n", sb->s_id);
2782 clear_buffer_write_io_error(sbh); 2910 clear_buffer_write_io_error(sbh);
2783 set_buffer_uptodate(sbh); 2911 set_buffer_uptodate(sbh);
2784 } 2912 }
2785 es->s_wtime = cpu_to_le32(get_seconds()); 2913 es->s_wtime = cpu_to_le32(get_seconds());
2786 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2914 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2787 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2915 &EXT4_SB(sb)->s_freeblocks_counter));
2916 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
2917 &EXT4_SB(sb)->s_freeinodes_counter));
2918
2788 BUFFER_TRACE(sbh, "marking dirty"); 2919 BUFFER_TRACE(sbh, "marking dirty");
2789 mark_buffer_dirty(sbh); 2920 mark_buffer_dirty(sbh);
2790 if (sync) { 2921 if (sync) {
2791 sync_dirty_buffer(sbh); 2922 error = sync_dirty_buffer(sbh);
2792 if (buffer_write_io_error(sbh)) { 2923 if (error)
2793 printk(KERN_ERR "ext4: I/O error while writing " 2924 return error;
2925
2926 error = buffer_write_io_error(sbh);
2927 if (error) {
2928 printk(KERN_ERR "EXT4-fs: I/O error while writing "
2794 "superblock for %s.\n", sb->s_id); 2929 "superblock for %s.\n", sb->s_id);
2795 clear_buffer_write_io_error(sbh); 2930 clear_buffer_write_io_error(sbh);
2796 set_buffer_uptodate(sbh); 2931 set_buffer_uptodate(sbh);
2797 } 2932 }
2798 } 2933 }
2934 return error;
2799} 2935}
2800 2936
2801 2937
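
The hunk above is the pivot for the rest of this series: ext4_commit_super() now returns int, so a failed superblock write (from sync_dirty_buffer() or a latched write-I/O error on the buffer) can propagate to the freeze, sync and remount paths instead of being silently dropped. It also snapshots the free-block and free-inode counts from the percpu counters rather than rescanning group descriptors. A minimal sketch of the resulting error flow, with the surrounding context condensed (the -EIO mapping is an illustration; the patch itself returns the raw buffer_write_io_error() value):

	/* Sketch: let superblock write failures reach the caller. */
	static int commit_super_sketch(struct super_block *sb, int sync)
	{
		struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
		int error = 0;

		if (!sbh)
			return 0;	/* superblock buffer not mapped yet */

		mark_buffer_dirty(sbh);
		if (sync) {
			error = sync_dirty_buffer(sbh);	/* submit and wait */
			if (!error && buffer_write_io_error(sbh))
				error = -EIO;	/* completion-time failure */
		}
		return error;
	}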
@@ -2809,6 +2945,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2809{ 2945{
2810 journal_t *journal = EXT4_SB(sb)->s_journal; 2946 journal_t *journal = EXT4_SB(sb)->s_journal;
2811 2947
2948 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2949 BUG_ON(journal != NULL);
2950 return;
2951 }
2812 jbd2_journal_lock_updates(journal); 2952 jbd2_journal_lock_updates(journal);
2813 if (jbd2_journal_flush(journal) < 0) 2953 if (jbd2_journal_flush(journal) < 0)
2814 goto out; 2954 goto out;
@@ -2838,6 +2978,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
2838 int j_errno; 2978 int j_errno;
2839 const char *errstr; 2979 const char *errstr;
2840 2980
2981 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2982
2841 journal = EXT4_SB(sb)->s_journal; 2983 journal = EXT4_SB(sb)->s_journal;
2842 2984
2843 /* 2985 /*
@@ -2870,14 +3012,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
2870int ext4_force_commit(struct super_block *sb) 3012int ext4_force_commit(struct super_block *sb)
2871{ 3013{
2872 journal_t *journal; 3014 journal_t *journal;
2873 int ret; 3015 int ret = 0;
2874 3016
2875 if (sb->s_flags & MS_RDONLY) 3017 if (sb->s_flags & MS_RDONLY)
2876 return 0; 3018 return 0;
2877 3019
2878 journal = EXT4_SB(sb)->s_journal; 3020 journal = EXT4_SB(sb)->s_journal;
2879 sb->s_dirt = 0; 3021 if (journal) {
2880 ret = ext4_journal_force_commit(journal); 3022 sb->s_dirt = 0;
3023 ret = ext4_journal_force_commit(journal);
3024 }
3025
2881 return ret; 3026 return ret;
2882} 3027}
2883 3028
@@ -2889,9 +3034,13 @@ int ext4_force_commit(struct super_block *sb)
2889 */ 3034 */
2890static void ext4_write_super(struct super_block *sb) 3035static void ext4_write_super(struct super_block *sb)
2891{ 3036{
2892 if (mutex_trylock(&sb->s_lock) != 0) 3037 if (EXT4_SB(sb)->s_journal) {
2893 BUG(); 3038 if (mutex_trylock(&sb->s_lock) != 0)
2894 sb->s_dirt = 0; 3039 BUG();
3040 sb->s_dirt = 0;
3041 } else {
3042 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3043 }
2895} 3044}
2896 3045
2897static int ext4_sync_fs(struct super_block *sb, int wait) 3046static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2900,10 +3049,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2900 3049
2901 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3050 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2902 sb->s_dirt = 0; 3051 sb->s_dirt = 0;
2903 if (wait) 3052 if (EXT4_SB(sb)->s_journal) {
2904 ret = ext4_force_commit(sb); 3053 if (wait)
2905 else 3054 ret = ext4_force_commit(sb);
2906 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); 3055 else
3056 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
3057 } else {
3058 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3059 }
2907 return ret; 3060 return ret;
2908} 3061}
2909 3062
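
Both ext4_write_super() and ext4_sync_fs() now branch on EXT4_SB(sb)->s_journal, because after this series ext4 can be mounted without a journal and every jbd2 call would otherwise dereference NULL. A condensed sketch of the dispatch, restating the new right-hand side of the hunk:

	/* Sketch: one sync entry point for journalled and journal-less mounts. */
	static int sync_fs_sketch(struct super_block *sb, int wait)
	{
		int ret = 0;

		sb->s_dirt = 0;
		if (EXT4_SB(sb)->s_journal) {
			if (wait)	/* durable: wait for a full commit */
				ret = ext4_force_commit(sb);
			else		/* async: just kick the commit thread */
				jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
							  NULL);
		} else {
			/* no journal: write the superblock out directly */
			ret = ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
		}
		return ret;
	}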
@@ -2911,36 +3064,48 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2911 * LVM calls this function before a (read-only) snapshot is created. This 3064 * LVM calls this function before a (read-only) snapshot is created. This
2912 * gives us a chance to flush the journal completely and mark the fs clean. 3065 * gives us a chance to flush the journal completely and mark the fs clean.
2913 */ 3066 */
2914static void ext4_write_super_lockfs(struct super_block *sb) 3067static int ext4_freeze(struct super_block *sb)
2915{ 3068{
3069 int error = 0;
3070 journal_t *journal;
2916 sb->s_dirt = 0; 3071 sb->s_dirt = 0;
2917 3072
2918 if (!(sb->s_flags & MS_RDONLY)) { 3073 if (!(sb->s_flags & MS_RDONLY)) {
2919 journal_t *journal = EXT4_SB(sb)->s_journal; 3074 journal = EXT4_SB(sb)->s_journal;
2920 3075
2921 /* Now we set up the journal barrier. */ 3076 if (journal) {
2922 jbd2_journal_lock_updates(journal); 3077 /* Now we set up the journal barrier. */
3078 jbd2_journal_lock_updates(journal);
2923 3079
2924 /* 3080 /*

2925 * We don't want to clear the needs_recovery flag when we 3081 * We don't want to clear the needs_recovery flag when we
2926 * failed to flush the journal. 3082 * failed to flush the journal.
2927 */ 3083 */
2928 if (jbd2_journal_flush(journal) < 0) 3084 error = jbd2_journal_flush(journal);
2929 return; 3085 if (error < 0)
3086 goto out;
3087 }
2930 3088
2931 /* Journal blocked and flushed, clear needs_recovery flag. */ 3089 /* Journal blocked and flushed, clear needs_recovery flag. */
2932 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3090 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2933 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3091 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3092 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3093 if (error)
3094 goto out;
2934 } 3095 }
3096 return 0;
3097out:
3098 jbd2_journal_unlock_updates(journal);
3099 return error;
2935} 3100}
2936 3101
2937/* 3102/*
2938 * Called by LVM after the snapshot is done. We need to reset the RECOVER 3103 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2939 * flag here, even though the filesystem is not technically dirty yet. 3104 * flag here, even though the filesystem is not technically dirty yet.
2940 */ 3105 */
2941static void ext4_unlockfs(struct super_block *sb) 3106static int ext4_unfreeze(struct super_block *sb)
2942{ 3107{
2943 if (!(sb->s_flags & MS_RDONLY)) { 3108 if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
2944 lock_super(sb); 3109 lock_super(sb);
2945 /* Reset the needs_recovery flag before the fs is unlocked. */ 3110 /* Reset the needs_recovery flag before the fs is unlocked. */
2946 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3111 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2948,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb)
2948 unlock_super(sb); 3113 unlock_super(sb);
2949 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3114 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
2950 } 3115 }
3116 return 0;
2951} 3117}
2952 3118
2953static int ext4_remount(struct super_block *sb, int *flags, char *data) 3119static int ext4_remount(struct super_block *sb, int *flags, char *data)
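
The freeze conversion is more than a rename: ext4_write_super_lockfs() returned void, so a failed journal flush let LVM snapshot an inconsistent image with no error reported. ext4_freeze() keeps the jbd2 barrier held on success (ext4_unfreeze() drops it) and unwinds it on failure. A sketch of the ordering for a journalled mount; the journal-less case simply skips the jbd2 calls:

	/* Sketch: freeze = barrier, flush, mark clean on disk. */
	static int freeze_sketch(struct super_block *sb)
	{
		journal_t *journal = EXT4_SB(sb)->s_journal;
		int error;

		jbd2_journal_lock_updates(journal);	/* no new handles */
		error = jbd2_journal_flush(journal);	/* checkpoint all */
		if (error < 0)
			goto out;
		/* clean image: no replay needed if snapshotted now */
		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
		error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
	out:
		if (error)	/* failure: do not leave the fs frozen */
			jbd2_journal_unlock_updates(journal);
		return error;
	}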
@@ -2958,6 +3124,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2958 unsigned long old_sb_flags; 3124 unsigned long old_sb_flags;
2959 struct ext4_mount_options old_opts; 3125 struct ext4_mount_options old_opts;
2960 ext4_group_t g; 3126 ext4_group_t g;
3127 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
2961 int err; 3128 int err;
2962#ifdef CONFIG_QUOTA 3129#ifdef CONFIG_QUOTA
2963 int i; 3130 int i;
@@ -2969,16 +3136,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2969 old_opts.s_resuid = sbi->s_resuid; 3136 old_opts.s_resuid = sbi->s_resuid;
2970 old_opts.s_resgid = sbi->s_resgid; 3137 old_opts.s_resgid = sbi->s_resgid;
2971 old_opts.s_commit_interval = sbi->s_commit_interval; 3138 old_opts.s_commit_interval = sbi->s_commit_interval;
3139 old_opts.s_min_batch_time = sbi->s_min_batch_time;
3140 old_opts.s_max_batch_time = sbi->s_max_batch_time;
2972#ifdef CONFIG_QUOTA 3141#ifdef CONFIG_QUOTA
2973 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 3142 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2974 for (i = 0; i < MAXQUOTAS; i++) 3143 for (i = 0; i < MAXQUOTAS; i++)
2975 old_opts.s_qf_names[i] = sbi->s_qf_names[i]; 3144 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2976#endif 3145#endif
3146 if (sbi->s_journal && sbi->s_journal->j_task->io_context)
3147 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
2977 3148
2978 /* 3149 /*
2979 * Allow the "check" option to be passed as a remount option. 3150 * Allow the "check" option to be passed as a remount option.
2980 */ 3151 */
2981 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { 3152 if (!parse_options(data, sb, NULL, &journal_ioprio,
3153 &n_blocks_count, 1)) {
2982 err = -EINVAL; 3154 err = -EINVAL;
2983 goto restore_opts; 3155 goto restore_opts;
2984 } 3156 }
@@ -2991,7 +3163,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2991 3163
2992 es = sbi->s_es; 3164 es = sbi->s_es;
2993 3165
2994 ext4_init_journal_params(sb, sbi->s_journal); 3166 if (sbi->s_journal) {
3167 ext4_init_journal_params(sb, sbi->s_journal);
3168 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3169 }
2995 3170
2996 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 3171 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2997 n_blocks_count > ext4_blocks_count(es)) { 3172 n_blocks_count > ext4_blocks_count(es)) {
@@ -3020,17 +3195,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3020 * We have to unlock super so that we can wait for 3195 * We have to unlock super so that we can wait for
3021 * transactions. 3196 * transactions.
3022 */ 3197 */
3023 unlock_super(sb); 3198 if (sbi->s_journal) {
3024 ext4_mark_recovery_complete(sb, es); 3199 unlock_super(sb);
3025 lock_super(sb); 3200 ext4_mark_recovery_complete(sb, es);
3201 lock_super(sb);
3202 }
3026 } else { 3203 } else {
3027 __le32 ret; 3204 int ret;
3028 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3205 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3029 ~EXT4_FEATURE_RO_COMPAT_SUPP))) { 3206 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3030 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3207 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
3031 "remount RDWR because of unsupported " 3208 "remount RDWR because of unsupported "
3032 "optional features (%x).\n", 3209 "optional features (%x).\n", sb->s_id,
3033 sb->s_id, le32_to_cpu(ret)); 3210 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3211 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3034 err = -EROFS; 3212 err = -EROFS;
3035 goto restore_opts; 3213 goto restore_opts;
3036 } 3214 }
@@ -3047,7 +3225,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3047 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 3225 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3048 printk(KERN_ERR 3226 printk(KERN_ERR
3049 "EXT4-fs: ext4_remount: " 3227 "EXT4-fs: ext4_remount: "
3050 "Checksum for group %lu failed (%u!=%u)\n", 3228 "Checksum for group %u failed (%u!=%u)\n",
3051 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 3229 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3052 le16_to_cpu(gdp->bg_checksum)); 3230 le16_to_cpu(gdp->bg_checksum));
3053 err = -EINVAL; 3231 err = -EINVAL;
@@ -3076,7 +3254,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3076 * been changed by e2fsck since we originally mounted 3254 * been changed by e2fsck since we originally mounted
3077 * the partition.) 3255 * the partition.)
3078 */ 3256 */
3079 ext4_clear_journal_err(sb, es); 3257 if (sbi->s_journal)
3258 ext4_clear_journal_err(sb, es);
3080 sbi->s_mount_state = le16_to_cpu(es->s_state); 3259 sbi->s_mount_state = le16_to_cpu(es->s_state);
3081 if ((err = ext4_group_extend(sb, es, n_blocks_count))) 3260 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
3082 goto restore_opts; 3261 goto restore_opts;
@@ -3084,6 +3263,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3084 sb->s_flags &= ~MS_RDONLY; 3263 sb->s_flags &= ~MS_RDONLY;
3085 } 3264 }
3086 } 3265 }
3266 if (sbi->s_journal == NULL)
3267 ext4_commit_super(sb, es, 1);
3268
3087#ifdef CONFIG_QUOTA 3269#ifdef CONFIG_QUOTA
3088 /* Release old quota file names */ 3270 /* Release old quota file names */
3089 for (i = 0; i < MAXQUOTAS; i++) 3271 for (i = 0; i < MAXQUOTAS; i++)
@@ -3098,6 +3280,8 @@ restore_opts:
3098 sbi->s_resuid = old_opts.s_resuid; 3280 sbi->s_resuid = old_opts.s_resuid;
3099 sbi->s_resgid = old_opts.s_resgid; 3281 sbi->s_resgid = old_opts.s_resgid;
3100 sbi->s_commit_interval = old_opts.s_commit_interval; 3282 sbi->s_commit_interval = old_opts.s_commit_interval;
3283 sbi->s_min_batch_time = old_opts.s_min_batch_time;
3284 sbi->s_max_batch_time = old_opts.s_max_batch_time;
3101#ifdef CONFIG_QUOTA 3285#ifdef CONFIG_QUOTA
3102 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 3286 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
3103 for (i = 0; i < MAXQUOTAS; i++) { 3287 for (i = 0; i < MAXQUOTAS; i++) {
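
The restore_opts label is why the two new batch-time fields appear in both hunks: every tunable that parse_options() may touch has to be captured before parsing and put back if any later step fails, or an aborted remount would leave the mount options half-updated. A self-contained sketch of the idiom (the struct and field names here are illustrative, not the ext4 ones):

	/* Sketch: snapshot tunables, try the update, roll back on error. */
	struct tunables {
		unsigned long	commit_interval;
		unsigned int	min_batch_time;
		unsigned int	max_batch_time;
	};

	static int remount_tunables(struct tunables *cur, const char *data,
				    int (*parse)(struct tunables *, const char *))
	{
		struct tunables old = *cur;	/* capture every mutable field */
		int err = parse(cur, data);	/* may partially modify *cur */

		if (err)
			*cur = old;		/* failure: restore the lot */
		return err;
	}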
@@ -3360,7 +3544,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3360 * When we journal data on quota file, we have to flush journal to see 3544 * When we journal data on quota file, we have to flush journal to see
3361 * all updates to the file when we bypass pagecache... 3545 * all updates to the file when we bypass pagecache...
3362 */ 3546 */
3363 if (ext4_should_journal_data(path.dentry->d_inode)) { 3547 if (EXT4_SB(sb)->s_journal &&
3548 ext4_should_journal_data(path.dentry->d_inode)) {
3364 /* 3549 /*
3365 * We don't need to lock updates but journal_flush() could 3550 * We don't need to lock updates but journal_flush() could
3366 * otherwise be livelocked... 3551 * otherwise be livelocked...
@@ -3434,7 +3619,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3434 struct buffer_head *bh; 3619 struct buffer_head *bh;
3435 handle_t *handle = journal_current_handle(); 3620 handle_t *handle = journal_current_handle();
3436 3621
3437 if (!handle) { 3622 if (EXT4_SB(sb)->s_journal && !handle) {
3438 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" 3623 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3439 " cancelled because transaction is not started.\n", 3624 " cancelled because transaction is not started.\n",
3440 (unsigned long long)off, (unsigned long long)len); 3625 (unsigned long long)off, (unsigned long long)len);
@@ -3459,7 +3644,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3459 flush_dcache_page(bh->b_page); 3644 flush_dcache_page(bh->b_page);
3460 unlock_buffer(bh); 3645 unlock_buffer(bh);
3461 if (journal_quota) 3646 if (journal_quota)
3462 err = ext4_journal_dirty_metadata(handle, bh); 3647 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3463 else { 3648 else {
3464 /* Always do at least ordered writes for quotas */ 3649 /* Always do at least ordered writes for quotas */
3465 err = ext4_jbd2_file_inode(handle, inode); 3650 err = ext4_jbd2_file_inode(handle, inode);
@@ -3513,18 +3698,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3513static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, 3698static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3514 size_t cnt, loff_t *ppos) 3699 size_t cnt, loff_t *ppos)
3515{ 3700{
3516 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; 3701 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3517 char str[32]; 3702 char str[32];
3518 unsigned long value;
3519 3703
3520 if (cnt >= sizeof(str)) 3704 if (cnt >= sizeof(str))
3521 return -EINVAL; 3705 return -EINVAL;
3522 if (copy_from_user(str, buf, cnt)) 3706 if (copy_from_user(str, buf, cnt))
3523 return -EFAULT; 3707 return -EFAULT;
3524 value = simple_strtol(str, NULL, 0); 3708
3525 if (value < 0) 3709 *p = simple_strtoul(str, NULL, 0);
3526 return -ERANGE;
3527 *p = value;
3528 return cnt; 3710 return cnt;
3529} 3711}
3530 3712
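
The proc write fix is worth spelling out: the old code parsed into an unsigned long and then tested value < 0, which is always false for an unsigned type, so the -ERANGE branch was dead code and negative input silently wrapped. Switching to simple_strtoul() and dropping the check makes the (unchanged) behaviour explicit. A standalone userspace illustration of the dead test, using libc strtoul in place of the kernel helper:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		unsigned long value = strtoul("-1", NULL, 0);

		if (value < 0)			/* always false: unsigned */
			return 1;		/* never reached */
		printf("%lu\n", value);		/* prints ULONG_MAX */
		return 0;
	}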
@@ -3615,7 +3797,7 @@ static void __exit exit_ext4_fs(void)
3615} 3797}
3616 3798
3617MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3799MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
3618MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); 3800MODULE_DESCRIPTION("Fourth Extended Filesystem");
3619MODULE_LICENSE("GPL"); 3801MODULE_LICENSE("GPL");
3620module_init(init_ext4_fs) 3802module_init(init_ext4_fs)
3621module_exit(exit_ext4_fs) 3803module_exit(exit_ext4_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fee..157ce6589c54 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
459 sb->s_dirt = 1; 459 sb->s_dirt = 1;
460 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 460 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
461 } 461 }
462} 462}
463 463
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
488 } else { 488 } else {
489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 489 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
490 error = ext4_journal_dirty_metadata(handle, bh); 490 error = ext4_handle_dirty_metadata(handle, inode, bh);
491 if (IS_SYNC(inode)) 491 if (IS_SYNC(inode))
492 handle->h_sync = 1; 492 ext4_handle_sync(handle);
493 DQUOT_FREE_BLOCK(inode, 1); 493 DQUOT_FREE_BLOCK(inode, 1);
494 ea_bdebug(bh, "refcount now=%d; releasing", 494 ea_bdebug(bh, "refcount now=%d; releasing",
495 le32_to_cpu(BHDR(bh)->h_refcount)); 495 le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
724 if (error == -EIO) 724 if (error == -EIO)
725 goto bad_block; 725 goto bad_block;
726 if (!error) 726 if (!error)
727 error = ext4_journal_dirty_metadata(handle, 727 error = ext4_handle_dirty_metadata(handle,
728 bs->bh); 728 inode,
729 bs->bh);
729 if (error) 730 if (error)
730 goto cleanup; 731 goto cleanup;
731 goto inserted; 732 goto inserted;
@@ -794,8 +795,9 @@ inserted:
794 ea_bdebug(new_bh, "reusing; refcount now=%d", 795 ea_bdebug(new_bh, "reusing; refcount now=%d",
795 le32_to_cpu(BHDR(new_bh)->h_refcount)); 796 le32_to_cpu(BHDR(new_bh)->h_refcount));
796 unlock_buffer(new_bh); 797 unlock_buffer(new_bh);
797 error = ext4_journal_dirty_metadata(handle, 798 error = ext4_handle_dirty_metadata(handle,
798 new_bh); 799 inode,
800 new_bh);
799 if (error) 801 if (error)
800 goto cleanup_dquot; 802 goto cleanup_dquot;
801 } 803 }
@@ -810,8 +812,8 @@ inserted:
810 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
811 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
812 EXT4_I(inode)->i_block_group); 814 EXT4_I(inode)->i_block_group);
813 ext4_fsblk_t block = ext4_new_meta_block(handle, inode, 815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
814 goal, &error); 816 goal, NULL, &error);
815 if (error) 817 if (error)
816 goto cleanup; 818 goto cleanup;
817 ea_idebug(inode, "creating block %d", block); 819 ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
833 set_buffer_uptodate(new_bh); 835 set_buffer_uptodate(new_bh);
834 unlock_buffer(new_bh); 836 unlock_buffer(new_bh);
835 ext4_xattr_cache_insert(new_bh); 837 ext4_xattr_cache_insert(new_bh);
836 error = ext4_journal_dirty_metadata(handle, new_bh); 838 error = ext4_handle_dirty_metadata(handle,
839 inode, new_bh);
837 if (error) 840 if (error)
838 goto cleanup; 841 goto cleanup;
839 } 842 }
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1040 */ 1043 */
1041 is.iloc.bh = NULL; 1044 is.iloc.bh = NULL;
1042 if (IS_SYNC(inode)) 1045 if (IS_SYNC(inode))
1043 handle->h_sync = 1; 1046 ext4_handle_sync(handle);
1044 } 1047 }
1045 1048
1046cleanup: 1049cleanup:
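
Every ext4_journal_dirty_metadata(handle, bh) in this file becomes ext4_handle_dirty_metadata(handle, inode, bh), and direct handle->h_sync pokes become ext4_handle_sync(handle). The extra inode argument exists because the handle layer must now also work without a journal. The helper itself lives in ext4_jbd2.[ch], which this diff does not show, so the following shape is an assumption inferred from the call sites:

	/* Assumed shape of the wrapper the call sites migrate to. */
	static int handle_dirty_metadata_sketch(handle_t *handle,
						struct inode *inode,
						struct buffer_head *bh)
	{
		if (ext4_handle_valid(handle))	/* journalled mount */
			return jbd2_journal_dirty_metadata(handle, bh);

		/* no journal: plain dirtying, synchronous if the inode is */
		mark_buffer_dirty(bh);
		if (inode && IS_SYNC(inode))
			sync_dirty_buffer(bh);
		return 0;
	}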
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 67e058357098..3a7f603b6982 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -841,7 +841,6 @@ const struct file_operations fat_dir_operations = {
841 .compat_ioctl = fat_compat_dir_ioctl, 841 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 842#endif
843 .fsync = file_fsync, 843 .fsync = file_fsync,
844 .llseek = generic_file_llseek,
845}; 844};
846 845
847static int fat_get_short_entry(struct inode *dir, loff_t *pos, 846static int fat_get_short_entry(struct inode *dir, loff_t *pos,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d937aaf77374..6b74d09adbe5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -749,6 +749,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
749 brelse(bh); 749 brelse(bh);
750 750
751 parent = d_obtain_alias(inode); 751 parent = d_obtain_alias(inode);
752 if (!IS_ERR(parent))
753 parent->d_op = sb->s_root->d_op;
752out: 754out:
753 unlock_super(sb); 755 unlock_super(sb);
754 756
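
d_obtain_alias() hands back an anonymous dentry for NFS reconnection, and those dentries start with NULL d_op. FAT compares and hashes names case-insensitively through the operations installed on sb->s_root, so the reconnected parent must inherit them or lookups under it would fall back to exact-match comparison. Condensed from the hunk above:

	/* Sketch: reconnected parents inherit FAT's dentry operations. */
	static struct dentry *get_parent_sketch(struct super_block *sb,
						struct inode *inode)
	{
		struct dentry *parent = d_obtain_alias(inode);

		if (!IS_ERR(parent))
			parent->d_op = sb->s_root->d_op;
		return parent;
	}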
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bf326d4356a3..8ae32e37673c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -78,7 +78,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
78 * for creation. 78 * for creation.
79 */ 79 */
80 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { 80 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
81 if (nd->flags & LOOKUP_CREATE) 81 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
82 return 0; 82 return 0;
83 } 83 }
84 84
diff --git a/fs/file_table.c b/fs/file_table.c
index 0fbcacc3ea75..bbeeac6efa1a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -32,6 +32,9 @@ struct files_stat_struct files_stat = {
32/* public. Not pretty! */ 32/* public. Not pretty! */
33__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 33__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
34 34
35/* SLAB cache for file structures */
36static struct kmem_cache *filp_cachep __read_mostly;
37
35static struct percpu_counter nr_files __cacheline_aligned_in_smp; 38static struct percpu_counter nr_files __cacheline_aligned_in_smp;
36 39
37static inline void file_free_rcu(struct rcu_head *head) 40static inline void file_free_rcu(struct rcu_head *head)
@@ -397,7 +400,12 @@ too_bad:
397void __init files_init(unsigned long mempages) 400void __init files_init(unsigned long mempages)
398{ 401{
399 int n; 402 int n;
400 /* One file with associated inode and dcache is very roughly 1K. 403
404 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
405 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
406
407 /*
408 * One file with associated inode and dcache is very roughly 1K.
401 * By default, don't use more than 10% of our memory for files. 409 * By default, don't use more than 10% of our memory for files.
402 */ 410 */
403 411
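
files_init() now creates the filp cache itself instead of relying on it being set up elsewhere; SLAB_PANIC makes a failure to create the cache fatal at boot, which is why no NULL check follows. Restated from the hunk:

	/* Sketch: a boot-time slab cache for struct file. */
	static struct kmem_cache *filp_cachep __read_mostly;

	static void __init files_init_sketch(void)
	{
		filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
						SLAB_HWCACHE_ALIGN | SLAB_PANIC,
						NULL);	/* no constructor */
	}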
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62dd..d488dcd7f2bb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
253module_init(proc_filesystems_init); 253module_init(proc_filesystems_init);
254#endif 254#endif
255 255
256struct file_system_type *get_fs_type(const char *name) 256static struct file_system_type *__get_fs_type(const char *name, int len)
257{ 257{
258 struct file_system_type *fs; 258 struct file_system_type *fs;
259 const char *dot = strchr(name, '.');
260 unsigned len = dot ? dot - name : strlen(name);
261 259
262 read_lock(&file_systems_lock); 260 read_lock(&file_systems_lock);
263 fs = *(find_filesystem(name, len)); 261 fs = *(find_filesystem(name, len));
264 if (fs && !try_module_get(fs->owner)) 262 if (fs && !try_module_get(fs->owner))
265 fs = NULL; 263 fs = NULL;
266 read_unlock(&file_systems_lock); 264 read_unlock(&file_systems_lock);
267 if (!fs && (request_module("%.*s", len, name) == 0)) { 265 return fs;
268 read_lock(&file_systems_lock); 266}
269 fs = *(find_filesystem(name, len)); 267
270 if (fs && !try_module_get(fs->owner)) 268struct file_system_type *get_fs_type(const char *name)
271 fs = NULL; 269{
272 read_unlock(&file_systems_lock); 270 struct file_system_type *fs;
273 } 271 const char *dot = strchr(name, '.');
272 int len = dot ? dot - name : strlen(name);
273
274 fs = __get_fs_type(name, len);
275 if (!fs && (request_module("%.*s", len, name) == 0))
276 fs = __get_fs_type(name, len);
274 277
275 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { 278 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
276 put_filesystem(fs); 279 put_filesystem(fs);
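
The refactor pulls the locked table lookup into __get_fs_type() so the autoload logic reads linearly: try the registered list, ask kmod to load a module named after the filesystem, then try once more. The "%.*s" format truncates at the first dot so that subtyped names such as "fuse.sshfs" request the base module. Restated:

	/* Sketch: lookup, modprobe on miss, then retry the lookup. */
	static struct file_system_type *get_fs_type_sketch(const char *name)
	{
		const char *dot = strchr(name, '.');
		int len = dot ? dot - name : strlen(name);
		struct file_system_type *fs;

		fs = __get_fs_type(name, len);
		if (!fs && request_module("%.*s", len, name) == 0)
			fs = __get_fs_type(name, len);	/* module loaded? */
		return fs;
	}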
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 9f3f2ceb73f0..03a6ea5e99f7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
325 if (!VXFS_ISIMMED(vip)) { 325 if (!VXFS_ISIMMED(vip)) {
326 ip->i_op = &page_symlink_inode_operations; 326 ip->i_op = &page_symlink_inode_operations;
327 ip->i_mapping->a_ops = &vxfs_aops; 327 ip->i_mapping->a_ops = &vxfs_aops;
328 } else 328 } else {
329 ip->i_op = &vxfs_immed_symlink_iops; 329 ip->i_op = &vxfs_immed_symlink_iops;
330 vip->vii_immed.vi_immed[ip->i_size] = '\0';
331 }
330 } else 332 } else
331 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); 333 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
332 334
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0ff0b8cf309..e5eaa62fd17f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
421 * If we're a pdflush thread, then implement pdflush collision avoidance 421 * If we're a pdflush thread, then implement pdflush collision avoidance
422 * against the entire list. 422 * against the entire list.
423 * 423 *
424 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
425 * that it can be located for waiting on in __writeback_single_inode().
426 *
427 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 424 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
428 * This function assumes that the blockdev superblock's inodes are backed by 425 * This function assumes that the blockdev superblock's inodes are backed by
429 * a variety of queues, so all inodes are searched. For other superblocks, 426 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
443 struct writeback_control *wbc) 440 struct writeback_control *wbc)
444{ 441{
445 const unsigned long start = jiffies; /* livelock avoidance */ 442 const unsigned long start = jiffies; /* livelock avoidance */
443 int sync = wbc->sync_mode == WB_SYNC_ALL;
446 444
447 spin_lock(&inode_lock); 445 spin_lock(&inode_lock);
448 if (!wbc->for_kupdate || list_empty(&sb->s_io)) 446 if (!wbc->for_kupdate || list_empty(&sb->s_io))
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
499 __iget(inode); 497 __iget(inode);
500 pages_skipped = wbc->pages_skipped; 498 pages_skipped = wbc->pages_skipped;
501 __writeback_single_inode(inode, wbc); 499 __writeback_single_inode(inode, wbc);
502 if (wbc->sync_mode == WB_SYNC_HOLD) {
503 inode->dirtied_when = jiffies;
504 list_move(&inode->i_list, &sb->s_dirty);
505 }
506 if (current_is_pdflush()) 500 if (current_is_pdflush())
507 writeback_release(bdi); 501 writeback_release(bdi);
508 if (wbc->pages_skipped != pages_skipped) { 502 if (wbc->pages_skipped != pages_skipped) {
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb,
523 if (!list_empty(&sb->s_more_io)) 517 if (!list_empty(&sb->s_more_io))
524 wbc->more_io = 1; 518 wbc->more_io = 1;
525 } 519 }
526 spin_unlock(&inode_lock); 520
521 if (sync) {
522 struct inode *inode, *old_inode = NULL;
523
524 /*
525 * Data integrity sync. Must wait for all pages under writeback,
 526 * because pages dirtied before our sync call may already
 527 * have had writeout started; in that case the inode may no
 528 * longer be on the dirty list, but we still have to wait
 529 * for that writeout.
530 */
531 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
532 struct address_space *mapping;
533
534 if (inode->i_state & (I_FREEING|I_WILL_FREE))
535 continue;
536 mapping = inode->i_mapping;
537 if (mapping->nrpages == 0)
538 continue;
539 __iget(inode);
540 spin_unlock(&inode_lock);
541 /*
542 * We hold a reference to 'inode' so it couldn't have
 543 * been removed from the s_inodes list while we dropped
 544 * the inode_lock. We cannot iput the inode now, as we
 545 * may be holding the last reference, and we cannot iput it
546 * under inode_lock. So we keep the reference and iput
547 * it later.
548 */
549 iput(old_inode);
550 old_inode = inode;
551
552 filemap_fdatawait(mapping);
553
554 cond_resched();
555
556 spin_lock(&inode_lock);
557 }
558 spin_unlock(&inode_lock);
559 iput(old_inode);
560 } else
561 spin_unlock(&inode_lock);
562
527 return; /* Leave any unwritten inodes on s_io */ 563 return; /* Leave any unwritten inodes on s_io */
528} 564}
529EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); 565EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
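
The new wait loop uses a standard trick for walking an inode list while sleeping: pin the current inode with __iget() before dropping inode_lock, and defer the matching iput() of the previous inode until the lock is no longer held, since iput() can sleep (and may even delete the inode) and must never run under inode_lock. Extracted from the hunk into one function:

	/* Sketch: pin-walk-defer-iput over sb->s_inodes. */
	static void wait_sb_inodes_sketch(struct super_block *sb)
	{
		struct inode *inode, *old_inode = NULL;

		spin_lock(&inode_lock);
		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
			if (inode->i_state & (I_FREEING | I_WILL_FREE))
				continue;
			if (inode->i_mapping->nrpages == 0)
				continue;
			__iget(inode);			/* pin across unlock */
			spin_unlock(&inode_lock);

			iput(old_inode);		/* safe: lock dropped */
			old_inode = inode;
			filemap_fdatawait(inode->i_mapping);
			cond_resched();

			spin_lock(&inode_lock);
		}
		spin_unlock(&inode_lock);
		iput(old_inode);			/* release the last pin */
	}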
@@ -588,8 +624,7 @@ restart:
588 624
589/* 625/*
590 * writeback and wait upon the filesystem's dirty inodes. The caller will 626 * writeback and wait upon the filesystem's dirty inodes. The caller will
591 * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is 627 * do this in two passes - one to write, and one to wait.
592 * used to park the written inodes on sb->s_dirty for the wait pass.
593 * 628 *
594 * A finite limit is set on the number of pages which will be written. 629 * A finite limit is set on the number of pages which will be written.
595 * To prevent infinite livelock of sys_sync(). 630 * To prevent infinite livelock of sys_sync().
@@ -600,30 +635,21 @@ restart:
600void sync_inodes_sb(struct super_block *sb, int wait) 635void sync_inodes_sb(struct super_block *sb, int wait)
601{ 636{
602 struct writeback_control wbc = { 637 struct writeback_control wbc = {
603 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, 638 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
604 .range_start = 0, 639 .range_start = 0,
605 .range_end = LLONG_MAX, 640 .range_end = LLONG_MAX,
606 }; 641 };
607 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
608 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
609 642
610 wbc.nr_to_write = nr_dirty + nr_unstable + 643 if (!wait) {
611 (inodes_stat.nr_inodes - inodes_stat.nr_unused) + 644 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
612 nr_dirty + nr_unstable; 645 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
613 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
614 sync_sb_inodes(sb, &wbc);
615}
616 646
617/* 647 wbc.nr_to_write = nr_dirty + nr_unstable +
618 * Rather lame livelock avoidance. 648 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
619 */ 649 } else
620static void set_sb_syncing(int val) 650 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
621{ 651
622 struct super_block *sb; 652 sync_sb_inodes(sb, &wbc);
623 spin_lock(&sb_lock);
624 list_for_each_entry_reverse(sb, &super_blocks, s_list)
625 sb->s_syncing = val;
626 spin_unlock(&sb_lock);
627} 653}
628 654
629/** 655/**
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait)
652 spin_lock(&sb_lock); 678 spin_lock(&sb_lock);
653restart: 679restart:
654 list_for_each_entry(sb, &super_blocks, s_list) { 680 list_for_each_entry(sb, &super_blocks, s_list) {
655 if (sb->s_syncing)
656 continue;
657 sb->s_syncing = 1;
658 sb->s_count++; 681 sb->s_count++;
659 spin_unlock(&sb_lock); 682 spin_unlock(&sb_lock);
660 down_read(&sb->s_umount); 683 down_read(&sb->s_umount);
@@ -672,13 +695,10 @@ restart:
672 695
673void sync_inodes(int wait) 696void sync_inodes(int wait)
674{ 697{
675 set_sb_syncing(0);
676 __sync_inodes(0); 698 __sync_inodes(0);
677 699
678 if (wait) { 700 if (wait)
679 set_sb_syncing(0);
680 __sync_inodes(1); 701 __sync_inodes(1);
681 }
682} 702}
683 703
684/** 704/**
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4f3cab321415..99c99dfb0373 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
48 size_t size; 48 size_t size;
49 49
50 if (!*ppos) { 50 if (!*ppos) {
51 long value;
51 struct fuse_conn *fc = fuse_ctl_file_conn_get(file); 52 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
52 if (!fc) 53 if (!fc)
53 return 0; 54 return 0;
54 55
55 file->private_data=(void *)(long)atomic_read(&fc->num_waiting); 56 value = atomic_read(&fc->num_waiting);
57 file->private_data = (void *)value;
56 fuse_conn_put(fc); 58 fuse_conn_put(fc);
57 } 59 }
58 size = sprintf(tmp, "%ld\n", (long)file->private_data); 60 size = sprintf(tmp, "%ld\n", (long)file->private_data);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fba571648a8e..e0c7ada08a1f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
269 * Called with fc->lock, unlocks it 269 * Called with fc->lock, unlocks it
270 */ 270 */
271static void request_end(struct fuse_conn *fc, struct fuse_req *req) 271static void request_end(struct fuse_conn *fc, struct fuse_req *req)
272 __releases(fc->lock) 272__releases(&fc->lock)
273{ 273{
274 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 274 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
275 req->end = NULL; 275 req->end = NULL;
@@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
293 wake_up(&req->waitq); 293 wake_up(&req->waitq);
294 if (end) 294 if (end)
295 end(fc, req); 295 end(fc, req);
296 else 296 fuse_put_request(fc, req);
297 fuse_put_request(fc, req);
298} 297}
299 298
300static void wait_answer_interruptible(struct fuse_conn *fc, 299static void wait_answer_interruptible(struct fuse_conn *fc,
301 struct fuse_req *req) 300 struct fuse_req *req)
302 __releases(fc->lock) __acquires(fc->lock) 301__releases(&fc->lock)
302__acquires(&fc->lock)
303{ 303{
304 if (signal_pending(current)) 304 if (signal_pending(current))
305 return; 305 return;
@@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
317} 317}
318 318
319static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) 319static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
320 __releases(fc->lock) __acquires(fc->lock) 320__releases(&fc->lock)
321__acquires(&fc->lock)
321{ 322{
322 if (!fc->no_interrupt) { 323 if (!fc->no_interrupt) {
323 /* Any signal may interrupt this */ 324 /* Any signal may interrupt this */
@@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
380 } 381 }
381} 382}
382 383
383void request_send(struct fuse_conn *fc, struct fuse_req *req) 384void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
384{ 385{
385 req->isreply = 1; 386 req->isreply = 1;
386 spin_lock(&fc->lock); 387 spin_lock(&fc->lock);
@@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
399 spin_unlock(&fc->lock); 400 spin_unlock(&fc->lock);
400} 401}
401 402
402static void request_send_nowait_locked(struct fuse_conn *fc, 403static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
403 struct fuse_req *req) 404 struct fuse_req *req)
404{ 405{
405 req->background = 1; 406 req->background = 1;
406 fc->num_background++; 407 fc->num_background++;
@@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
414 flush_bg_queue(fc); 415 flush_bg_queue(fc);
415} 416}
416 417
417static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) 418static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
418{ 419{
419 spin_lock(&fc->lock); 420 spin_lock(&fc->lock);
420 if (fc->connected) { 421 if (fc->connected) {
421 request_send_nowait_locked(fc, req); 422 fuse_request_send_nowait_locked(fc, req);
422 spin_unlock(&fc->lock); 423 spin_unlock(&fc->lock);
423 } else { 424 } else {
424 req->out.h.error = -ENOTCONN; 425 req->out.h.error = -ENOTCONN;
@@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
426 } 427 }
427} 428}
428 429
429void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) 430void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
430{ 431{
431 req->isreply = 0; 432 req->isreply = 0;
432 request_send_nowait(fc, req); 433 fuse_request_send_nowait(fc, req);
433} 434}
434 435
435void request_send_background(struct fuse_conn *fc, struct fuse_req *req) 436void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
436{ 437{
437 req->isreply = 1; 438 req->isreply = 1;
438 request_send_nowait(fc, req); 439 fuse_request_send_nowait(fc, req);
439} 440}
440 441
441/* 442/*
@@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
443 * 444 *
444 * fc->connected must have been checked previously 445 * fc->connected must have been checked previously
445 */ 446 */
446void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) 447void fuse_request_send_background_locked(struct fuse_conn *fc,
448 struct fuse_req *req)
447{ 449{
448 req->isreply = 1; 450 req->isreply = 1;
449 request_send_nowait_locked(fc, req); 451 fuse_request_send_nowait_locked(fc, req);
450} 452}
451 453
452/* 454/*
@@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
539 BUG_ON(!cs->nr_segs); 541 BUG_ON(!cs->nr_segs);
540 cs->seglen = cs->iov[0].iov_len; 542 cs->seglen = cs->iov[0].iov_len;
541 cs->addr = (unsigned long) cs->iov[0].iov_base; 543 cs->addr = (unsigned long) cs->iov[0].iov_base;
542 cs->iov ++; 544 cs->iov++;
543 cs->nr_segs --; 545 cs->nr_segs--;
544 } 546 }
545 down_read(&current->mm->mmap_sem); 547 down_read(&current->mm->mmap_sem);
546 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, 548 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
@@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
589 kunmap_atomic(mapaddr, KM_USER1); 591 kunmap_atomic(mapaddr, KM_USER1);
590 } 592 }
591 while (count) { 593 while (count) {
592 int err; 594 if (!cs->len) {
593 if (!cs->len && (err = fuse_copy_fill(cs))) 595 int err = fuse_copy_fill(cs);
594 return err; 596 if (err)
597 return err;
598 }
595 if (page) { 599 if (page) {
596 void *mapaddr = kmap_atomic(page, KM_USER1); 600 void *mapaddr = kmap_atomic(page, KM_USER1);
597 void *buf = mapaddr + offset; 601 void *buf = mapaddr + offset;
@@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
631static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) 635static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
632{ 636{
633 while (size) { 637 while (size) {
634 int err; 638 if (!cs->len) {
635 if (!cs->len && (err = fuse_copy_fill(cs))) 639 int err = fuse_copy_fill(cs);
636 return err; 640 if (err)
641 return err;
642 }
637 fuse_copy_do(cs, &val, &size); 643 fuse_copy_do(cs, &val, &size);
638 } 644 }
639 return 0; 645 return 0;
@@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc)
664 670
665/* Wait until a request is available on the pending list */ 671/* Wait until a request is available on the pending list */
666static void request_wait(struct fuse_conn *fc) 672static void request_wait(struct fuse_conn *fc)
673__releases(&fc->lock)
674__acquires(&fc->lock)
667{ 675{
668 DECLARE_WAITQUEUE(wait, current); 676 DECLARE_WAITQUEUE(wait, current);
669 677
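
The annotation churn through this file is for sparse: __releases()/__acquires() declare a function's lock context so sparse's context checker can verify callers. The patch also standardizes on passing the lock's address and placing the annotations unindented between the prototype and the body. A minimal sketch of the convention:

	/* Sketch: annotate a helper that is entered with the lock held
	 * and returns having dropped it, for sparse context checking. */
	static void drops_fc_lock_sketch(struct fuse_conn *fc)
	__releases(&fc->lock)
	{
		spin_unlock(&fc->lock);
	}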
@@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc)
691 */ 699 */
692static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 700static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
693 const struct iovec *iov, unsigned long nr_segs) 701 const struct iovec *iov, unsigned long nr_segs)
694 __releases(fc->lock) 702__releases(&fc->lock)
695{ 703{
696 struct fuse_copy_state cs; 704 struct fuse_copy_state cs;
697 struct fuse_in_header ih; 705 struct fuse_in_header ih;
@@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
813 return err; 821 return err;
814} 822}
815 823
824static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
825 struct fuse_copy_state *cs)
826{
827 struct fuse_notify_poll_wakeup_out outarg;
828 int err;
829
830 if (size != sizeof(outarg))
831 return -EINVAL;
832
833 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
834 if (err)
835 return err;
836
837 return fuse_notify_poll_wakeup(fc, &outarg);
838}
839
840static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
841 unsigned int size, struct fuse_copy_state *cs)
842{
843 switch (code) {
844 case FUSE_NOTIFY_POLL:
845 return fuse_notify_poll(fc, size, cs);
846
847 default:
848 return -EINVAL;
849 }
850}
851
816/* Look up request on processing list by unique ID */ 852/* Look up request on processing list by unique ID */
817static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) 853static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
818{ 854{
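
fuse_notify() gives the device protocol a second message class: a write whose header has unique == 0 is not a reply to any request; the error field is reinterpreted as a notification code and the rest of the buffer is the payload (dispatched from fuse_dev_write() below). Each handler validates its fixed-size payload before acting, as fuse_notify_poll() does above; a handler for any such code would follow the same shape:

	/* Sketch: a notification handler (mirrors fuse_notify_poll above). */
	static int notify_sketch(struct fuse_conn *fc, unsigned int size,
				 struct fuse_copy_state *cs)
	{
		struct fuse_notify_poll_wakeup_out outarg;
		int err;

		if (size != sizeof(outarg))	/* exact-size payload only */
			return -EINVAL;
		err = fuse_copy_one(cs, &outarg, sizeof(outarg));
		if (err)
			return err;
		return fuse_notify_poll_wakeup(fc, &outarg);
	}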
@@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
876 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 912 err = fuse_copy_one(&cs, &oh, sizeof(oh));
877 if (err) 913 if (err)
878 goto err_finish; 914 goto err_finish;
915
916 err = -EINVAL;
917 if (oh.len != nbytes)
918 goto err_finish;
919
920 /*
921 * Zero oh.unique indicates unsolicited notification message
922 * and error contains notification code.
923 */
924 if (!oh.unique) {
925 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
926 fuse_copy_finish(&cs);
927 return err ? err : nbytes;
928 }
929
879 err = -EINVAL; 930 err = -EINVAL;
880 if (!oh.unique || oh.error <= -1000 || oh.error > 0 || 931 if (oh.error <= -1000 || oh.error > 0)
881 oh.len != nbytes)
882 goto err_finish; 932 goto err_finish;
883 933
884 spin_lock(&fc->lock); 934 spin_lock(&fc->lock);
@@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
966 * This function releases and reacquires fc->lock 1016 * This function releases and reacquires fc->lock
967 */ 1017 */
968static void end_requests(struct fuse_conn *fc, struct list_head *head) 1018static void end_requests(struct fuse_conn *fc, struct list_head *head)
1019__releases(&fc->lock)
1020__acquires(&fc->lock)
969{ 1021{
970 while (!list_empty(head)) { 1022 while (!list_empty(head)) {
971 struct fuse_req *req; 1023 struct fuse_req *req;
@@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
988 * locked). 1040 * locked).
989 */ 1041 */
990static void end_io_requests(struct fuse_conn *fc) 1042static void end_io_requests(struct fuse_conn *fc)
991 __releases(fc->lock) __acquires(fc->lock) 1043__releases(&fc->lock)
1044__acquires(&fc->lock)
992{ 1045{
993 while (!list_empty(&fc->io)) { 1046 while (!list_empty(&fc->io)) {
994 struct fuse_req *req = 1047 struct fuse_req *req =
@@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc)
1002 wake_up(&req->waitq); 1055 wake_up(&req->waitq);
1003 if (end) { 1056 if (end) {
1004 req->end = NULL; 1057 req->end = NULL;
1005 /* The end function will consume this reference */
1006 __fuse_get_request(req); 1058 __fuse_get_request(req);
1007 spin_unlock(&fc->lock); 1059 spin_unlock(&fc->lock);
1008 wait_event(req->waitq, !req->locked); 1060 wait_event(req->waitq, !req->locked);
1009 end(fc, req); 1061 end(fc, req);
1062 fuse_put_request(fc, req);
1010 spin_lock(&fc->lock); 1063 spin_lock(&fc->lock);
1011 } 1064 }
1012 } 1065 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 95bc22bdd060..fdff346e96fd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
189 parent = dget_parent(entry); 189 parent = dget_parent(entry);
190 fuse_lookup_init(fc, req, get_node_id(parent->d_inode), 190 fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
191 &entry->d_name, &outarg); 191 &entry->d_name, &outarg);
192 request_send(fc, req); 192 fuse_request_send(fc, req);
193 dput(parent); 193 dput(parent);
194 err = req->out.h.error; 194 err = req->out.h.error;
195 fuse_put_request(fc, req); 195 fuse_put_request(fc, req);
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
204 return 0; 204 return 0;
205 } 205 }
206 spin_lock(&fc->lock); 206 spin_lock(&fc->lock);
207 fi->nlookup ++; 207 fi->nlookup++;
208 spin_unlock(&fc->lock); 208 spin_unlock(&fc->lock);
209 } 209 }
210 fuse_put_request(fc, forget_req); 210 fuse_put_request(fc, forget_req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
283 attr_version = fuse_get_attr_version(fc); 283 attr_version = fuse_get_attr_version(fc);
284 284
285 fuse_lookup_init(fc, req, nodeid, name, outarg); 285 fuse_lookup_init(fc, req, nodeid, name, outarg);
286 request_send(fc, req); 286 fuse_request_send(fc, req);
287 err = req->out.h.error; 287 err = req->out.h.error;
288 fuse_put_request(fc, req); 288 fuse_put_request(fc, req);
289 /* Zero nodeid is same as -ENOENT, but with valid timeout */ 289 /* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
369{ 369{
370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); 370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
371 ff->reserved_req->force = 1; 371 ff->reserved_req->force = 1;
372 request_send(fc, ff->reserved_req); 372 fuse_request_send(fc, ff->reserved_req);
373 fuse_put_request(fc, ff->reserved_req); 373 fuse_put_request(fc, ff->reserved_req);
374 kfree(ff); 374 kfree(ff);
375} 375}
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
408 goto out_put_forget_req; 408 goto out_put_forget_req;
409 409
410 err = -ENOMEM; 410 err = -ENOMEM;
411 ff = fuse_file_alloc(); 411 ff = fuse_file_alloc(fc);
412 if (!ff) 412 if (!ff)
413 goto out_put_request; 413 goto out_put_request;
414 414
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
432 req->out.args[0].value = &outentry; 432 req->out.args[0].value = &outentry;
433 req->out.args[1].size = sizeof(outopen); 433 req->out.args[1].size = sizeof(outopen);
434 req->out.args[1].value = &outopen; 434 req->out.args[1].value = &outopen;
435 request_send(fc, req); 435 fuse_request_send(fc, req);
436 err = req->out.h.error; 436 err = req->out.h.error;
437 if (err) { 437 if (err) {
438 if (err == -ENOSYS) 438 if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
502 else 502 else
503 req->out.args[0].size = sizeof(outarg); 503 req->out.args[0].size = sizeof(outarg);
504 req->out.args[0].value = &outarg; 504 req->out.args[0].value = &outarg;
505 request_send(fc, req); 505 fuse_request_send(fc, req);
506 err = req->out.h.error; 506 err = req->out.h.error;
507 fuse_put_request(fc, req); 507 fuse_put_request(fc, req);
508 if (err) 508 if (err)
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
631 req->in.numargs = 1; 631 req->in.numargs = 1;
632 req->in.args[0].size = entry->d_name.len + 1; 632 req->in.args[0].size = entry->d_name.len + 1;
633 req->in.args[0].value = entry->d_name.name; 633 req->in.args[0].value = entry->d_name.name;
634 request_send(fc, req); 634 fuse_request_send(fc, req);
635 err = req->out.h.error; 635 err = req->out.h.error;
636 fuse_put_request(fc, req); 636 fuse_put_request(fc, req);
637 if (!err) { 637 if (!err) {
638 struct inode *inode = entry->d_inode; 638 struct inode *inode = entry->d_inode;
639 639
640 /* Set nlink to zero so the inode can be cleared, if 640 /*
641 the inode does have more links this will be 641 * Set nlink to zero so the inode can be cleared; if the inode
642 discovered at the next lookup/getattr */ 642 * does have more links, this will be discovered at the next
643 * lookup/getattr.
644 */
643 clear_nlink(inode); 645 clear_nlink(inode);
644 fuse_invalidate_attr(inode); 646 fuse_invalidate_attr(inode);
645 fuse_invalidate_attr(dir); 647 fuse_invalidate_attr(dir);
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
662 req->in.numargs = 1; 664 req->in.numargs = 1;
663 req->in.args[0].size = entry->d_name.len + 1; 665 req->in.args[0].size = entry->d_name.len + 1;
664 req->in.args[0].value = entry->d_name.name; 666 req->in.args[0].value = entry->d_name.name;
665 request_send(fc, req); 667 fuse_request_send(fc, req);
666 err = req->out.h.error; 668 err = req->out.h.error;
667 fuse_put_request(fc, req); 669 fuse_put_request(fc, req);
668 if (!err) { 670 if (!err) {
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
695 req->in.args[1].value = oldent->d_name.name; 697 req->in.args[1].value = oldent->d_name.name;
696 req->in.args[2].size = newent->d_name.len + 1; 698 req->in.args[2].size = newent->d_name.len + 1;
697 req->in.args[2].value = newent->d_name.name; 699 req->in.args[2].value = newent->d_name.name;
698 request_send(fc, req); 700 fuse_request_send(fc, req);
699 err = req->out.h.error; 701 err = req->out.h.error;
700 fuse_put_request(fc, req); 702 fuse_put_request(fc, req);
701 if (!err) { 703 if (!err) {
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
811 else 813 else
812 req->out.args[0].size = sizeof(outarg); 814 req->out.args[0].size = sizeof(outarg);
813 req->out.args[0].value = &outarg; 815 req->out.args[0].value = &outarg;
814 request_send(fc, req); 816 fuse_request_send(fc, req);
815 err = req->out.h.error; 817 err = req->out.h.error;
816 fuse_put_request(fc, req); 818 fuse_put_request(fc, req);
817 if (!err) { 819 if (!err) {
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask)
911 req->in.numargs = 1; 913 req->in.numargs = 1;
912 req->in.args[0].size = sizeof(inarg); 914 req->in.args[0].size = sizeof(inarg);
913 req->in.args[0].value = &inarg; 915 req->in.args[0].value = &inarg;
914 request_send(fc, req); 916 fuse_request_send(fc, req);
915 err = req->out.h.error; 917 err = req->out.h.error;
916 fuse_put_request(fc, req); 918 fuse_put_request(fc, req);
917 if (err == -ENOSYS) { 919 if (err == -ENOSYS) {
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1033 req->num_pages = 1; 1035 req->num_pages = 1;
1034 req->pages[0] = page; 1036 req->pages[0] = page;
1035 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); 1037 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
1036 request_send(fc, req); 1038 fuse_request_send(fc, req);
1037 nbytes = req->out.args[0].size; 1039 nbytes = req->out.args[0].size;
1038 err = req->out.h.error; 1040 err = req->out.h.error;
1039 fuse_put_request(fc, req); 1041 fuse_put_request(fc, req);
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry)
1067 req->out.numargs = 1; 1069 req->out.numargs = 1;
1068 req->out.args[0].size = PAGE_SIZE - 1; 1070 req->out.args[0].size = PAGE_SIZE - 1;
1069 req->out.args[0].value = link; 1071 req->out.args[0].value = link;
1070 request_send(fc, req); 1072 fuse_request_send(fc, req);
1071 if (req->out.h.error) { 1073 if (req->out.h.error) {
1072 free_page((unsigned long) link); 1074 free_page((unsigned long) link);
1073 link = ERR_PTR(req->out.h.error); 1075 link = ERR_PTR(req->out.h.error);
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1273 else 1275 else
1274 req->out.args[0].size = sizeof(outarg); 1276 req->out.args[0].size = sizeof(outarg);
1275 req->out.args[0].value = &outarg; 1277 req->out.args[0].value = &outarg;
1276 request_send(fc, req); 1278 fuse_request_send(fc, req);
1277 err = req->out.h.error; 1279 err = req->out.h.error;
1278 fuse_put_request(fc, req); 1280 fuse_put_request(fc, req);
1279 if (err) { 1281 if (err) {
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1367 req->in.args[1].value = name; 1369 req->in.args[1].value = name;
1368 req->in.args[2].size = size; 1370 req->in.args[2].size = size;
1369 req->in.args[2].value = value; 1371 req->in.args[2].value = value;
1370 request_send(fc, req); 1372 fuse_request_send(fc, req);
1371 err = req->out.h.error; 1373 err = req->out.h.error;
1372 fuse_put_request(fc, req); 1374 fuse_put_request(fc, req);
1373 if (err == -ENOSYS) { 1375 if (err == -ENOSYS) {
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
1413 req->out.args[0].size = sizeof(outarg); 1415 req->out.args[0].size = sizeof(outarg);
1414 req->out.args[0].value = &outarg; 1416 req->out.args[0].value = &outarg;
1415 } 1417 }
1416 request_send(fc, req); 1418 fuse_request_send(fc, req);
1417 ret = req->out.h.error; 1419 ret = req->out.h.error;
1418 if (!ret) 1420 if (!ret)
1419 ret = size ? req->out.args[0].size : outarg.size; 1421 ret = size ? req->out.args[0].size : outarg.size;
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
1463 req->out.args[0].size = sizeof(outarg); 1465 req->out.args[0].size = sizeof(outarg);
1464 req->out.args[0].value = &outarg; 1466 req->out.args[0].value = &outarg;
1465 } 1467 }
1466 request_send(fc, req); 1468 fuse_request_send(fc, req);
1467 ret = req->out.h.error; 1469 ret = req->out.h.error;
1468 if (!ret) 1470 if (!ret)
1469 ret = size ? req->out.args[0].size : outarg.size; 1471 ret = size ? req->out.args[0].size : outarg.size;
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1496 req->in.numargs = 1; 1498 req->in.numargs = 1;
1497 req->in.args[0].size = strlen(name) + 1; 1499 req->in.args[0].size = strlen(name) + 1;
1498 req->in.args[0].value = name; 1500 req->in.args[0].value = name;
1499 request_send(fc, req); 1501 fuse_request_send(fc, req);
1500 err = req->out.h.error; 1502 err = req->out.h.error;
1501 fuse_put_request(fc, req); 1503 fuse_put_request(fc, req);
1502 if (err == -ENOSYS) { 1504 if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b82..e8162646a9b5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
39 req->out.numargs = 1; 39 req->out.numargs = 1;
40 req->out.args[0].size = sizeof(*outargp); 40 req->out.args[0].size = sizeof(*outargp);
41 req->out.args[0].value = outargp; 41 req->out.args[0].value = outargp;
42 request_send(fc, req); 42 fuse_request_send(fc, req);
43 err = req->out.h.error; 43 err = req->out.h.error;
44 fuse_put_request(fc, req); 44 fuse_put_request(fc, req);
45 45
46 return err; 46 return err;
47} 47}
48 48
49struct fuse_file *fuse_file_alloc(void) 49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
50{ 50{
51 struct fuse_file *ff; 51 struct fuse_file *ff;
52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void)
58 } else { 58 } else {
59 INIT_LIST_HEAD(&ff->write_entry); 59 INIT_LIST_HEAD(&ff->write_entry);
60 atomic_set(&ff->count, 0); 60 atomic_set(&ff->count, 0);
61 spin_lock(&fc->lock);
62 ff->kh = ++fc->khctr;
63 spin_unlock(&fc->lock);
61 } 64 }
65 RB_CLEAR_NODE(&ff->polled_node);
66 init_waitqueue_head(&ff->poll_wait);
62 } 67 }
63 return ff; 68 return ff;
64} 69}
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
79{ 84{
80 dput(req->misc.release.dentry); 85 dput(req->misc.release.dentry);
81 mntput(req->misc.release.vfsmount); 86 mntput(req->misc.release.vfsmount);
82 fuse_put_request(fc, req);
83} 87}
84 88
85static void fuse_file_put(struct fuse_file *ff) 89static void fuse_file_put(struct fuse_file *ff)
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff)
89 struct inode *inode = req->misc.release.dentry->d_inode; 93 struct inode *inode = req->misc.release.dentry->d_inode;
90 struct fuse_conn *fc = get_fuse_conn(inode); 94 struct fuse_conn *fc = get_fuse_conn(inode);
91 req->end = fuse_release_end; 95 req->end = fuse_release_end;
92 request_send_background(fc, req); 96 fuse_request_send_background(fc, req);
93 kfree(ff); 97 kfree(ff);
94 } 98 }
95} 99}
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file,
109 113
110int fuse_open_common(struct inode *inode, struct file *file, int isdir) 114int fuse_open_common(struct inode *inode, struct file *file, int isdir)
111{ 115{
116 struct fuse_conn *fc = get_fuse_conn(inode);
112 struct fuse_open_out outarg; 117 struct fuse_open_out outarg;
113 struct fuse_file *ff; 118 struct fuse_file *ff;
114 int err; 119 int err;
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
121 if (err) 126 if (err)
122 return err; 127 return err;
123 128
124 ff = fuse_file_alloc(); 129 ff = fuse_file_alloc(fc);
125 if (!ff) 130 if (!ff)
126 return -ENOMEM; 131 return -ENOMEM;
127 132
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
167 172
168 spin_lock(&fc->lock); 173 spin_lock(&fc->lock);
169 list_del(&ff->write_entry); 174 list_del(&ff->write_entry);
175 if (!RB_EMPTY_NODE(&ff->polled_node))
176 rb_erase(&ff->polled_node, &fc->polled_files);
170 spin_unlock(&fc->lock); 177 spin_unlock(&fc->lock);
178
179 wake_up_interruptible_sync(&ff->poll_wait);
171 /* 180 /*
172 * Normally this will send the RELEASE request, 181 * Normally this will send the RELEASE request,
173 * however if some asynchronous READ or WRITE requests 182 * however if some asynchronous READ or WRITE requests
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
280 req->in.args[0].size = sizeof(inarg); 289 req->in.args[0].size = sizeof(inarg);
281 req->in.args[0].value = &inarg; 290 req->in.args[0].value = &inarg;
282 req->force = 1; 291 req->force = 1;
283 request_send(fc, req); 292 fuse_request_send(fc, req);
284 err = req->out.h.error; 293 err = req->out.h.error;
285 fuse_put_request(fc, req); 294 fuse_put_request(fc, req);
286 if (err == -ENOSYS) { 295 if (err == -ENOSYS) {
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
344 req->in.numargs = 1; 353 req->in.numargs = 1;
345 req->in.args[0].size = sizeof(inarg); 354 req->in.args[0].size = sizeof(inarg);
346 req->in.args[0].value = &inarg; 355 req->in.args[0].value = &inarg;
347 request_send(fc, req); 356 fuse_request_send(fc, req);
348 err = req->out.h.error; 357 err = req->out.h.error;
349 fuse_put_request(fc, req); 358 fuse_put_request(fc, req);
350 if (err == -ENOSYS) { 359 if (err == -ENOSYS) {
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
396 inarg->read_flags |= FUSE_READ_LOCKOWNER; 405 inarg->read_flags |= FUSE_READ_LOCKOWNER;
397 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 406 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
398 } 407 }
399 request_send(fc, req); 408 fuse_request_send(fc, req);
400 return req->out.args[0].size; 409 return req->out.args[0].size;
401} 410}
402 411
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
493 } 502 }
494 if (req->ff) 503 if (req->ff)
495 fuse_file_put(req->ff); 504 fuse_file_put(req->ff);
496 fuse_put_request(fc, req);
497} 505}
498 506
499static void fuse_send_readpages(struct fuse_req *req, struct file *file, 507static void fuse_send_readpages(struct fuse_req *req, struct file *file,
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
509 struct fuse_file *ff = file->private_data; 517 struct fuse_file *ff = file->private_data;
510 req->ff = fuse_file_get(ff); 518 req->ff = fuse_file_get(ff);
511 req->end = fuse_readpages_end; 519 req->end = fuse_readpages_end;
512 request_send_background(fc, req); 520 fuse_request_send_background(fc, req);
513 } else { 521 } else {
514 request_send(fc, req); 522 fuse_request_send(fc, req);
515 fuse_readpages_end(fc, req); 523 fuse_readpages_end(fc, req);
524 fuse_put_request(fc, req);
516 } 525 }
517} 526}
518 527
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
543 } 552 }
544 } 553 }
545 req->pages[req->num_pages] = page; 554 req->pages[req->num_pages] = page;
546 req->num_pages ++; 555 req->num_pages++;
547 return 0; 556 return 0;
548} 557}
549 558
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
636 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 645 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
637 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 646 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
638 } 647 }
639 request_send(fc, req); 648 fuse_request_send(fc, req);
640 return req->misc.write.out.size; 649 return req->misc.write.out.size;
641} 650}
642 651
@@ -646,7 +655,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
646{ 655{
647 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 656 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
648 657
649 *pagep = __grab_cache_page(mapping, index); 658 *pagep = grab_cache_page_write_begin(mapping, index, flags);
650 if (!*pagep) 659 if (!*pagep)
651 return -ENOMEM; 660 return -ENOMEM;
652 return 0; 661 return 0;
@@ -779,7 +788,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
779 break; 788 break;
780 789
781 err = -ENOMEM; 790 err = -ENOMEM;
782 page = __grab_cache_page(mapping, index); 791 page = grab_cache_page_write_begin(mapping, index, 0);
783 if (!page) 792 if (!page)
784 break; 793 break;
785 794
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1042{ 1051{
1043 __free_page(req->pages[0]); 1052 __free_page(req->pages[0]);
1044 fuse_file_put(req->ff); 1053 fuse_file_put(req->ff);
1045 fuse_put_request(fc, req);
1046} 1054}
1047 1055
1048static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1056static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1060 1068
1061/* Called under fc->lock, may release and reacquire it */ 1069/* Called under fc->lock, may release and reacquire it */
1062static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) 1070static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1071__releases(&fc->lock)
1072__acquires(&fc->lock)
1063{ 1073{
1064 struct fuse_inode *fi = get_fuse_inode(req->inode); 1074 struct fuse_inode *fi = get_fuse_inode(req->inode);
1065 loff_t size = i_size_read(req->inode); 1075 loff_t size = i_size_read(req->inode);
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1079 1089
1080 req->in.args[1].size = inarg->size; 1090 req->in.args[1].size = inarg->size;
1081 fi->writectr++; 1091 fi->writectr++;
1082 request_send_background_locked(fc, req); 1092 fuse_request_send_background_locked(fc, req);
1083 return; 1093 return;
1084 1094
1085 out_free: 1095 out_free:
1086 fuse_writepage_finish(fc, req); 1096 fuse_writepage_finish(fc, req);
1087 spin_unlock(&fc->lock); 1097 spin_unlock(&fc->lock);
1088 fuse_writepage_free(fc, req); 1098 fuse_writepage_free(fc, req);
1099 fuse_put_request(fc, req);
1089 spin_lock(&fc->lock); 1100 spin_lock(&fc->lock);
1090} 1101}
1091 1102
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1096 * Called with fc->lock 1107 * Called with fc->lock
1097 */ 1108 */
1098void fuse_flush_writepages(struct inode *inode) 1109void fuse_flush_writepages(struct inode *inode)
1110__releases(&fc->lock)
1111__acquires(&fc->lock)
1099{ 1112{
1100 struct fuse_conn *fc = get_fuse_conn(inode); 1113 struct fuse_conn *fc = get_fuse_conn(inode);
1101 struct fuse_inode *fi = get_fuse_inode(inode); 1114 struct fuse_inode *fi = get_fuse_inode(inode);
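The __releases()/__acquires() annotations added to fuse_send_writepage() and fuse_flush_writepages() above are hints for the sparse static checker: they document that the function is entered with fc->lock held, drops it internally, and retakes it before returning. A minimal sketch of the pattern; the fallback macro definitions are assumed to match the kernel's <linux/compiler.h>, where they expand to nothing outside a sparse build:

    #ifdef __CHECKER__
    # define __releases(x)  __attribute__((context(x, 1, 0)))
    # define __acquires(x)  __attribute__((context(x, 0, 1)))
    #else
    # define __releases(x)
    # define __acquires(x)
    #endif

    /* Kernel-context sketch: entered with the lock held; drops and
     * retakes it, like fuse_send_writepage() does with fc->lock. */
    static void do_unlocked_work(struct fuse_conn *fc)
    __releases(&fc->lock)
    __acquires(&fc->lock)
    {
            spin_unlock(&fc->lock);
            /* ... work that must not run under the lock ... */
            spin_lock(&fc->lock);
    }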
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
1325 req->out.numargs = 1; 1338 req->out.numargs = 1;
1326 req->out.args[0].size = sizeof(outarg); 1339 req->out.args[0].size = sizeof(outarg);
1327 req->out.args[0].value = &outarg; 1340 req->out.args[0].value = &outarg;
1328 request_send(fc, req); 1341 fuse_request_send(fc, req);
1329 err = req->out.h.error; 1342 err = req->out.h.error;
1330 fuse_put_request(fc, req); 1343 fuse_put_request(fc, req);
1331 if (!err) 1344 if (!err)
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1357 return PTR_ERR(req); 1370 return PTR_ERR(req);
1358 1371
1359 fuse_lk_fill(req, file, fl, opcode, pid, flock); 1372 fuse_lk_fill(req, file, fl, opcode, pid, flock);
1360 request_send(fc, req); 1373 fuse_request_send(fc, req);
1361 err = req->out.h.error; 1374 err = req->out.h.error;
1362 /* locking is restartable */ 1375 /* locking is restartable */
1363 if (err == -EINTR) 1376 if (err == -EINTR)
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1433 req->out.numargs = 1; 1446 req->out.numargs = 1;
1434 req->out.args[0].size = sizeof(outarg); 1447 req->out.args[0].size = sizeof(outarg);
1435 req->out.args[0].value = &outarg; 1448 req->out.args[0].value = &outarg;
1436 request_send(fc, req); 1449 fuse_request_send(fc, req);
1437 err = req->out.h.error; 1450 err = req->out.h.error;
1438 fuse_put_request(fc, req); 1451 fuse_put_request(fc, req);
1439 if (err == -ENOSYS) 1452 if (err == -ENOSYS)
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1470 return retval; 1483 return retval;
1471} 1484}
1472 1485
1486static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1487 unsigned int nr_segs, size_t bytes, bool to_user)
1488{
1489 struct iov_iter ii;
1490 int page_idx = 0;
1491
1492 if (!bytes)
1493 return 0;
1494
1495 iov_iter_init(&ii, iov, nr_segs, bytes, 0);
1496
1497 while (iov_iter_count(&ii)) {
1498 struct page *page = pages[page_idx++];
1499 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1500 void *kaddr, *map;
1501
1502 kaddr = map = kmap(page);
1503
1504 while (todo) {
1505 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
1506 size_t iov_len = ii.iov->iov_len - ii.iov_offset;
1507 size_t copy = min(todo, iov_len);
1508 size_t left;
1509
1510 if (!to_user)
1511 left = copy_from_user(kaddr, uaddr, copy);
1512 else
1513 left = copy_to_user(uaddr, kaddr, copy);
1514
1515 if (unlikely(left))
1516 return -EFAULT;
1517
1518 iov_iter_advance(&ii, copy);
1519 todo -= copy;
1520 kaddr += copy;
1521 }
1522
1523 kunmap(map);
1524 }
1525
1526 return 0;
1527}
1528
1529/*
1530 * For ioctls, there is no generic way to determine how much memory
1531 * needs to be read and/or written. Furthermore, ioctls are allowed
1532 * to dereference the passed pointer, so the parameter requires deep
1533 * copying but FUSE has no idea whatsoever about what to copy in or
1534 * out.
1535 *
1536 * This is solved by allowing FUSE server to retry ioctl with
1537 * necessary in/out iovecs. Let's assume the ioctl implementation
1538 * needs to read in the following structure.
1539 *
1540 * struct a {
1541 * char *buf;
1542 * size_t buflen;
1543 * }
1544 *
1545 * On the first callout to the FUSE server, inarg->in_size and
1546 * inarg->out_size will be zero; then, the server completes the ioctl
1547 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
1548 * the actual iov array to
1549 *
1550 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
1551 *
1552 * which tells FUSE to copy in the requested area and retry the ioctl.
1553 * On the second round, the server has access to the structure and
1554 * from that it can tell what to look for next, so on the invocation,
1555 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
1556 *
1557 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
1558 * { .iov_base = a.buf, .iov_len = a.buflen } }
1559 *
1560 * FUSE will copy both struct a and the pointed buffer from the
1561 * process doing the ioctl and retry ioctl with both struct a and the
1562 * buffer.
1563 *
1564 * This time, the FUSE server has everything it needs and completes the
1565 * ioctl without FUSE_IOCTL_RETRY, which finishes the ioctl call.
1566 *
1567 * Copying data out works the same way.
1568 *
1569 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
1570 * automatically initializes in and out iovs by decoding @cmd with
1571 * _IOC_* macros and the server is not allowed to request RETRY. This
1572 * limits ioctl data transfers to well-formed ioctls and is the forced
1573 * behavior for all FUSE servers.
1574 */
1575static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1576 unsigned long arg, unsigned int flags)
1577{
1578 struct inode *inode = file->f_dentry->d_inode;
1579 struct fuse_file *ff = file->private_data;
1580 struct fuse_conn *fc = get_fuse_conn(inode);
1581 struct fuse_ioctl_in inarg = {
1582 .fh = ff->fh,
1583 .cmd = cmd,
1584 .arg = arg,
1585 .flags = flags
1586 };
1587 struct fuse_ioctl_out outarg;
1588 struct fuse_req *req = NULL;
1589 struct page **pages = NULL;
1590 struct page *iov_page = NULL;
1591 struct iovec *in_iov = NULL, *out_iov = NULL;
1592 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1593 size_t in_size, out_size, transferred;
1594 int err;
1595
1596 /* assume all the iovs returned by the client always fit in a page */
1597 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1598
1599 if (!fuse_allow_task(fc, current))
1600 return -EACCES;
1601
1602 err = -EIO;
1603 if (is_bad_inode(inode))
1604 goto out;
1605
1606 err = -ENOMEM;
1607 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1608 iov_page = alloc_page(GFP_KERNEL);
1609 if (!pages || !iov_page)
1610 goto out;
1611
1612 /*
1613 * If restricted, initialize IO parameters as encoded in @cmd.
1614 * RETRY from server is not allowed.
1615 */
1616 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1617 struct iovec *iov = page_address(iov_page);
1618
1619 iov->iov_base = (void __user *)arg;
1620 iov->iov_len = _IOC_SIZE(cmd);
1621
1622 if (_IOC_DIR(cmd) & _IOC_WRITE) {
1623 in_iov = iov;
1624 in_iovs = 1;
1625 }
1626
1627 if (_IOC_DIR(cmd) & _IOC_READ) {
1628 out_iov = iov;
1629 out_iovs = 1;
1630 }
1631 }
1632
1633 retry:
1634 inarg.in_size = in_size = iov_length(in_iov, in_iovs);
1635 inarg.out_size = out_size = iov_length(out_iov, out_iovs);
1636
1637 /*
1638 * Out data can be used either for actual out data or iovs,
1639 * make sure there always is at least one page.
1640 */
1641 out_size = max_t(size_t, out_size, PAGE_SIZE);
1642 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
1643
1644 /* make sure there are enough buffer pages and init request with them */
1645 err = -ENOMEM;
1646 if (max_pages > FUSE_MAX_PAGES_PER_REQ)
1647 goto out;
1648 while (num_pages < max_pages) {
1649 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
1650 if (!pages[num_pages])
1651 goto out;
1652 num_pages++;
1653 }
1654
1655 req = fuse_get_req(fc);
1656 if (IS_ERR(req)) {
1657 err = PTR_ERR(req);
1658 req = NULL;
1659 goto out;
1660 }
1661 memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
1662 req->num_pages = num_pages;
1663
1664 /* okay, let's send it to the client */
1665 req->in.h.opcode = FUSE_IOCTL;
1666 req->in.h.nodeid = get_node_id(inode);
1667 req->in.numargs = 1;
1668 req->in.args[0].size = sizeof(inarg);
1669 req->in.args[0].value = &inarg;
1670 if (in_size) {
1671 req->in.numargs++;
1672 req->in.args[1].size = in_size;
1673 req->in.argpages = 1;
1674
1675 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
1676 false);
1677 if (err)
1678 goto out;
1679 }
1680
1681 req->out.numargs = 2;
1682 req->out.args[0].size = sizeof(outarg);
1683 req->out.args[0].value = &outarg;
1684 req->out.args[1].size = out_size;
1685 req->out.argpages = 1;
1686 req->out.argvar = 1;
1687
1688 fuse_request_send(fc, req);
1689 err = req->out.h.error;
1690 transferred = req->out.args[1].size;
1691 fuse_put_request(fc, req);
1692 req = NULL;
1693 if (err)
1694 goto out;
1695
1696 /* did it ask for retry? */
1697 if (outarg.flags & FUSE_IOCTL_RETRY) {
1698 char *vaddr;
1699
1700 /* no retry if in restricted mode */
1701 err = -EIO;
1702 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
1703 goto out;
1704
1705 in_iovs = outarg.in_iovs;
1706 out_iovs = outarg.out_iovs;
1707
1708 /*
1709 * Make sure things are in boundary, separate checks
1710 * are to protect against overflow.
1711 */
1712 err = -ENOMEM;
1713 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
1714 out_iovs > FUSE_IOCTL_MAX_IOV ||
1715 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1716 goto out;
1717
1718 err = -EIO;
1719 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1720 goto out;
1721
1722 /* okay, copy in iovs and retry */
1723 vaddr = kmap_atomic(pages[0], KM_USER0);
1724 memcpy(page_address(iov_page), vaddr, transferred);
1725 kunmap_atomic(vaddr, KM_USER0);
1726
1727 in_iov = page_address(iov_page);
1728 out_iov = in_iov + in_iovs;
1729
1730 goto retry;
1731 }
1732
1733 err = -EIO;
1734 if (transferred > inarg.out_size)
1735 goto out;
1736
1737 err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
1738 out:
1739 if (req)
1740 fuse_put_request(fc, req);
1741 if (iov_page)
1742 __free_page(iov_page);
1743 while (num_pages)
1744 __free_page(pages[--num_pages]);
1745 kfree(pages);
1746
1747 return err ? err : outarg.result;
1748}
1749
1750static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1751 unsigned long arg)
1752{
1753 return fuse_file_do_ioctl(file, cmd, arg, 0);
1754}
1755
1756static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1757 unsigned long arg)
1758{
1759 return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
1760}
1761
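To make the retry protocol described in the comment above fuse_file_do_ioctl() concrete, here is a sketch of the matching server-side logic for the "struct a" example from that comment. The reply_retry()/reply_done() helpers and the opaque req handle are hypothetical placeholders, not a real FUSE library API; only FUSE_IOCTL_RETRY and the iovec layout come from the protocol itself:

    #include <stddef.h>
    #include <sys/uio.h>

    /* Hypothetical reply helpers -- placeholders for whatever the
     * server framework provides, NOT a real library API. */
    extern void reply_retry(void *req,
                            const struct iovec *in_iov, int in_iovs,
                            const struct iovec *out_iov, int out_iovs);
    extern void reply_done(void *req, int result);

    struct a {
            char   *buf;
            size_t  buflen;
    };

    static void handle_ioctl(void *req, unsigned long arg,
                             const void *in_buf, size_t in_size)
    {
            if (in_size == 0) {
                    /* Round 1: nothing copied in yet; ask the kernel
                     * to copy in struct a and retry. */
                    struct iovec iov = {
                            .iov_base = (void *)arg,
                            .iov_len  = sizeof(struct a),
                    };
                    reply_retry(req, &iov, 1, NULL, 0);
            } else if (in_size == sizeof(struct a)) {
                    /* Round 2: struct a arrived; also request the
                     * buffer it points to. */
                    const struct a *a = in_buf;
                    struct iovec iov[2] = {
                            { .iov_base = (void *)arg, .iov_len = sizeof(*a) },
                            { .iov_base = a->buf,      .iov_len = a->buflen },
                    };
                    reply_retry(req, iov, 2, NULL, 0);
            } else {
                    /* Round 3: struct a and its buffer are both in
                     * in_buf; do the work and complete without
                     * FUSE_IOCTL_RETRY, which ends the ioctl. */
                    reply_done(req, 0);
            }
    }

After the final round, the kernel copies any out iovecs back to the calling process (fuse_ioctl_copy_user() with to_user set) and the ioctl returns outarg.result.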
1762/*
1763 * All files which have been polled are linked to RB tree
1764 * fuse_conn->polled_files which is indexed by kh. Walk the tree and
1765 * find the matching one.
1766 */
1767static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
1768 struct rb_node **parent_out)
1769{
1770 struct rb_node **link = &fc->polled_files.rb_node;
1771 struct rb_node *last = NULL;
1772
1773 while (*link) {
1774 struct fuse_file *ff;
1775
1776 last = *link;
1777 ff = rb_entry(last, struct fuse_file, polled_node);
1778
1779 if (kh < ff->kh)
1780 link = &last->rb_left;
1781 else if (kh > ff->kh)
1782 link = &last->rb_right;
1783 else
1784 return link;
1785 }
1786
1787 if (parent_out)
1788 *parent_out = last;
1789 return link;
1790}
1791
1792/*
1793 * The file is about to be polled. Make sure it's on the polled_files
1794 * RB tree. Note that files once added to the polled_files tree are
1795 * not removed before the file is released. This is because a file
1796 * polled once is likely to be polled again.
1797 */
1798static void fuse_register_polled_file(struct fuse_conn *fc,
1799 struct fuse_file *ff)
1800{
1801 spin_lock(&fc->lock);
1802 if (RB_EMPTY_NODE(&ff->polled_node)) {
1803 struct rb_node **link, *parent;
1804
1805 link = fuse_find_polled_node(fc, ff->kh, &parent);
1806 BUG_ON(*link);
1807 rb_link_node(&ff->polled_node, parent, link);
1808 rb_insert_color(&ff->polled_node, &fc->polled_files);
1809 }
1810 spin_unlock(&fc->lock);
1811}
1812
1813static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1814{
1815 struct inode *inode = file->f_dentry->d_inode;
1816 struct fuse_file *ff = file->private_data;
1817 struct fuse_conn *fc = get_fuse_conn(inode);
1818 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1819 struct fuse_poll_out outarg;
1820 struct fuse_req *req;
1821 int err;
1822
1823 if (fc->no_poll)
1824 return DEFAULT_POLLMASK;
1825
1826 poll_wait(file, &ff->poll_wait, wait);
1827
1828 /*
1829 * Ask for notification iff there's someone waiting for it.
1830 * The client may ignore the flag and always notify.
1831 */
1832 if (waitqueue_active(&ff->poll_wait)) {
1833 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
1834 fuse_register_polled_file(fc, ff);
1835 }
1836
1837 req = fuse_get_req(fc);
1838 if (IS_ERR(req))
1839 return PTR_ERR(req);
1840
1841 req->in.h.opcode = FUSE_POLL;
1842 req->in.h.nodeid = get_node_id(inode);
1843 req->in.numargs = 1;
1844 req->in.args[0].size = sizeof(inarg);
1845 req->in.args[0].value = &inarg;
1846 req->out.numargs = 1;
1847 req->out.args[0].size = sizeof(outarg);
1848 req->out.args[0].value = &outarg;
1849 fuse_request_send(fc, req);
1850 err = req->out.h.error;
1851 fuse_put_request(fc, req);
1852
1853 if (!err)
1854 return outarg.revents;
1855 if (err == -ENOSYS) {
1856 fc->no_poll = 1;
1857 return DEFAULT_POLLMASK;
1858 }
1859 return POLLERR;
1860}
1861
1862/*
1863 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
1864 * wakes up the poll waiters.
1865 */
1866int fuse_notify_poll_wakeup(struct fuse_conn *fc,
1867 struct fuse_notify_poll_wakeup_out *outarg)
1868{
1869 u64 kh = outarg->kh;
1870 struct rb_node **link;
1871
1872 spin_lock(&fc->lock);
1873
1874 link = fuse_find_polled_node(fc, kh, NULL);
1875 if (*link) {
1876 struct fuse_file *ff;
1877
1878 ff = rb_entry(*link, struct fuse_file, polled_node);
1879 wake_up_interruptible_sync(&ff->poll_wait);
1880 }
1881
1882 spin_unlock(&fc->lock);
1883 return 0;
1884}
1885
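fuse_notify_poll_wakeup() above is the receiving end of a notification the server writes to /dev/fuse. A sketch of that server side, assuming the FUSE device convention that a message written with unique == 0 is a notification whose code travels in the error field (the parsing lives in fs/fuse/dev.c, outside this hunk); kh is the kernel handle saved from an earlier FUSE_POLL request that had FUSE_POLL_SCHEDULE_NOTIFY set:

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <linux/fuse.h>

    static int notify_poll_wakeup(int fuse_fd, uint64_t kh)
    {
            struct fuse_out_header oh;
            struct fuse_notify_poll_wakeup_out arg;
            char buf[sizeof(oh) + sizeof(arg)];

            memset(&oh, 0, sizeof(oh));
            memset(&arg, 0, sizeof(arg));
            oh.len    = sizeof(buf);
            oh.unique = 0;                 /* unique == 0: a notification */
            oh.error  = FUSE_NOTIFY_POLL;  /* notify code, not an errno */
            arg.kh    = kh;

            memcpy(buf, &oh, sizeof(oh));
            memcpy(buf + sizeof(oh), &arg, sizeof(arg));
            return write(fuse_fd, buf, sizeof(buf)) == (ssize_t)sizeof(buf)
                    ? 0 : -1;
    }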
1473static const struct file_operations fuse_file_operations = { 1886static const struct file_operations fuse_file_operations = {
1474 .llseek = fuse_file_llseek, 1887 .llseek = fuse_file_llseek,
1475 .read = do_sync_read, 1888 .read = do_sync_read,
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = {
1484 .lock = fuse_file_lock, 1897 .lock = fuse_file_lock,
1485 .flock = fuse_file_flock, 1898 .flock = fuse_file_flock,
1486 .splice_read = generic_file_splice_read, 1899 .splice_read = generic_file_splice_read,
1900 .unlocked_ioctl = fuse_file_ioctl,
1901 .compat_ioctl = fuse_file_compat_ioctl,
1902 .poll = fuse_file_poll,
1487}; 1903};
1488 1904
1489static const struct file_operations fuse_direct_io_file_operations = { 1905static const struct file_operations fuse_direct_io_file_operations = {
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = {
1496 .fsync = fuse_fsync, 1912 .fsync = fuse_fsync,
1497 .lock = fuse_file_lock, 1913 .lock = fuse_file_lock,
1498 .flock = fuse_file_flock, 1914 .flock = fuse_file_flock,
1915 .unlocked_ioctl = fuse_file_ioctl,
1916 .compat_ioctl = fuse_file_compat_ioctl,
1917 .poll = fuse_file_poll,
1499 /* no mmap and splice_read */ 1918 /* no mmap and splice_read */
1500}; 1919};
1501 1920
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 35accfdd747f..5e64b815a5a1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -19,6 +19,8 @@
19#include <linux/backing-dev.h> 19#include <linux/backing-dev.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h>
23#include <linux/poll.h>
22 24
23/** Max number of pages that can be used in a single read request */ 25/** Max number of pages that can be used in a single read request */
24#define FUSE_MAX_PAGES_PER_REQ 32 26#define FUSE_MAX_PAGES_PER_REQ 32
@@ -100,6 +102,9 @@ struct fuse_file {
100 /** Request reserved for flush and release */ 102 /** Request reserved for flush and release */
101 struct fuse_req *reserved_req; 103 struct fuse_req *reserved_req;
102 104
105 /** Kernel file handle guaranteed to be unique */
106 u64 kh;
107
103 /** File handle used by userspace */ 108 /** File handle used by userspace */
104 u64 fh; 109 u64 fh;
105 110
@@ -108,6 +113,12 @@ struct fuse_file {
108 113
109 /** Entry on inode's write_files list */ 114 /** Entry on inode's write_files list */
110 struct list_head write_entry; 115 struct list_head write_entry;
116
117 /** RB node to be linked on fuse_conn->polled_files */
118 struct rb_node polled_node;
119
120 /** Wait queue head for poll */
121 wait_queue_head_t poll_wait;
111}; 122};
112 123
113/** One input argument of a request */ 124/** One input argument of a request */
@@ -322,6 +333,12 @@ struct fuse_conn {
322 /** The list of requests under I/O */ 333 /** The list of requests under I/O */
323 struct list_head io; 334 struct list_head io;
324 335
336 /** The next unique kernel file handle */
337 u64 khctr;
338
339 /** rbtree of fuse_files waiting for poll events indexed by kh */
340 struct rb_root polled_files;
341
325 /** Number of requests currently in the background */ 342 /** Number of requests currently in the background */
326 unsigned num_background; 343 unsigned num_background;
327 344
@@ -355,19 +372,19 @@ struct fuse_conn {
355 /** Connection failed (version mismatch). Cannot race with 372 /** Connection failed (version mismatch). Cannot race with
356 setting other bitfields since it is only set once in INIT 373 setting other bitfields since it is only set once in INIT
357 reply, before any other request, and never cleared */ 374 reply, before any other request, and never cleared */
358 unsigned conn_error : 1; 375 unsigned conn_error:1;
359 376
360 /** Connection successful. Only set in INIT */ 377 /** Connection successful. Only set in INIT */
361 unsigned conn_init : 1; 378 unsigned conn_init:1;
362 379
363 /** Do readpages asynchronously? Only set in INIT */ 380 /** Do readpages asynchronously? Only set in INIT */
364 unsigned async_read : 1; 381 unsigned async_read:1;
365 382
366 /** Do not send separate SETATTR request before open(O_TRUNC) */ 383 /** Do not send separate SETATTR request before open(O_TRUNC) */
367 unsigned atomic_o_trunc : 1; 384 unsigned atomic_o_trunc:1;
368 385
369 /** Filesystem supports NFS exporting. Only set in INIT */ 386 /** Filesystem supports NFS exporting. Only set in INIT */
370 unsigned export_support : 1; 387 unsigned export_support:1;
371 388
372 /* 389 /*
373 * The following bitfields are only for optimization purposes 390 * The following bitfields are only for optimization purposes
@@ -375,43 +392,46 @@ struct fuse_conn {
375 */ 392 */
376 393
377 /** Is fsync not implemented by fs? */ 394 /** Is fsync not implemented by fs? */
378 unsigned no_fsync : 1; 395 unsigned no_fsync:1;
379 396
380 /** Is fsyncdir not implemented by fs? */ 397 /** Is fsyncdir not implemented by fs? */
381 unsigned no_fsyncdir : 1; 398 unsigned no_fsyncdir:1;
382 399
383 /** Is flush not implemented by fs? */ 400 /** Is flush not implemented by fs? */
384 unsigned no_flush : 1; 401 unsigned no_flush:1;
385 402
386 /** Is setxattr not implemented by fs? */ 403 /** Is setxattr not implemented by fs? */
387 unsigned no_setxattr : 1; 404 unsigned no_setxattr:1;
388 405
389 /** Is getxattr not implemented by fs? */ 406 /** Is getxattr not implemented by fs? */
390 unsigned no_getxattr : 1; 407 unsigned no_getxattr:1;
391 408
392 /** Is listxattr not implemented by fs? */ 409 /** Is listxattr not implemented by fs? */
393 unsigned no_listxattr : 1; 410 unsigned no_listxattr:1;
394 411
395 /** Is removexattr not implemented by fs? */ 412 /** Is removexattr not implemented by fs? */
396 unsigned no_removexattr : 1; 413 unsigned no_removexattr:1;
397 414
398 /** Are file locking primitives not implemented by fs? */ 415 /** Are file locking primitives not implemented by fs? */
399 unsigned no_lock : 1; 416 unsigned no_lock:1;
400 417
401 /** Is access not implemented by fs? */ 418 /** Is access not implemented by fs? */
402 unsigned no_access : 1; 419 unsigned no_access:1;
403 420
404 /** Is create not implemented by fs? */ 421 /** Is create not implemented by fs? */
405 unsigned no_create : 1; 422 unsigned no_create:1;
406 423
407 /** Is interrupt not implemented by fs? */ 424 /** Is interrupt not implemented by fs? */
408 unsigned no_interrupt : 1; 425 unsigned no_interrupt:1;
409 426
410 /** Is bmap not implemented by fs? */ 427 /** Is bmap not implemented by fs? */
411 unsigned no_bmap : 1; 428 unsigned no_bmap:1;
429
430 /** Is poll not implemented by fs? */
431 unsigned no_poll:1;
412 432
413 /** Do multi-page cached writes */ 433 /** Do multi-page cached writes */
414 unsigned big_writes : 1; 434 unsigned big_writes:1;
415 435
416 /** The number of requests waiting for completion */ 436 /** The number of requests waiting for completion */
417 atomic_t num_waiting; 437 atomic_t num_waiting;
@@ -445,6 +465,9 @@ struct fuse_conn {
445 465
446 /** Version counter for attribute changes */ 466 /** Version counter for attribute changes */
447 u64 attr_version; 467 u64 attr_version;
468
469 /** Called on final put */
470 void (*release)(struct fuse_conn *);
448}; 471};
449 472
450static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 473static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
499 */ 522 */
500int fuse_open_common(struct inode *inode, struct file *file, int isdir); 523int fuse_open_common(struct inode *inode, struct file *file, int isdir);
501 524
502struct fuse_file *fuse_file_alloc(void); 525struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
503void fuse_file_free(struct fuse_file *ff); 526void fuse_file_free(struct fuse_file *ff);
504void fuse_finish_open(struct inode *inode, struct file *file, 527void fuse_finish_open(struct inode *inode, struct file *file,
505 struct fuse_file *ff, struct fuse_open_out *outarg); 528 struct fuse_file *ff, struct fuse_open_out *outarg);
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
519 int isdir); 542 int isdir);
520 543
521/** 544/**
545 * Notify poll wakeup
546 */
547int fuse_notify_poll_wakeup(struct fuse_conn *fc,
548 struct fuse_notify_poll_wakeup_out *outarg);
549
550/**
522 * Initialize file operations on a regular file 551 * Initialize file operations on a regular file
523 */ 552 */
524void fuse_init_file_inode(struct inode *inode); 553void fuse_init_file_inode(struct inode *inode);
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
593/** 622/**
594 * Send a request (synchronous) 623 * Send a request (synchronous)
595 */ 624 */
596void request_send(struct fuse_conn *fc, struct fuse_req *req); 625void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
597 626
598/** 627/**
599 * Send a request with no reply 628 * Send a request with no reply
600 */ 629 */
601void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); 630void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
602 631
603/** 632/**
604 * Send a request in the background 633 * Send a request in the background
605 */ 634 */
606void request_send_background(struct fuse_conn *fc, struct fuse_req *req); 635void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
607 636
608void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); 637void fuse_request_send_background_locked(struct fuse_conn *fc,
638 struct fuse_req *req);
609 639
610/* Abort all requests */ 640/* Abort all requests */
611void fuse_abort_conn(struct fuse_conn *fc); 641void fuse_abort_conn(struct fuse_conn *fc);
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
623struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); 653struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
624 654
625/** 655/**
656 * Initialize fuse_conn
657 */
658int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
659
660/**
626 * Release reference to fuse_conn 661 * Release reference to fuse_conn
627 */ 662 */
628void fuse_conn_put(struct fuse_conn *fc); 663void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e99f34b4435..47c96fdca1ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -37,10 +37,10 @@ struct fuse_mount_data {
37 unsigned rootmode; 37 unsigned rootmode;
38 unsigned user_id; 38 unsigned user_id;
39 unsigned group_id; 39 unsigned group_id;
40 unsigned fd_present : 1; 40 unsigned fd_present:1;
41 unsigned rootmode_present : 1; 41 unsigned rootmode_present:1;
42 unsigned user_id_present : 1; 42 unsigned user_id_present:1;
43 unsigned group_id_present : 1; 43 unsigned group_id_present:1;
44 unsigned flags; 44 unsigned flags;
45 unsigned max_read; 45 unsigned max_read;
46 unsigned blksize; 46 unsigned blksize;
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
94 req->in.numargs = 1; 94 req->in.numargs = 1;
95 req->in.args[0].size = sizeof(struct fuse_forget_in); 95 req->in.args[0].size = sizeof(struct fuse_forget_in);
96 req->in.args[0].value = inarg; 96 req->in.args[0].value = inarg;
97 request_send_noreply(fc, req); 97 fuse_request_send_noreply(fc, req);
98} 98}
99 99
100static void fuse_clear_inode(struct inode *inode) 100static void fuse_clear_inode(struct inode *inode)
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
250 250
251 fi = get_fuse_inode(inode); 251 fi = get_fuse_inode(inode);
252 spin_lock(&fc->lock); 252 spin_lock(&fc->lock);
253 fi->nlookup ++; 253 fi->nlookup++;
254 spin_unlock(&fc->lock); 254 spin_unlock(&fc->lock);
255 fuse_change_attributes(inode, attr, attr_valid, attr_version); 255 fuse_change_attributes(inode, attr, attr_valid, attr_version);
256 256
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
269 fc->destroy_req = NULL; 269 fc->destroy_req = NULL;
270 req->in.h.opcode = FUSE_DESTROY; 270 req->in.h.opcode = FUSE_DESTROY;
271 req->force = 1; 271 req->force = 1;
272 request_send(fc, req); 272 fuse_request_send(fc, req);
273 fuse_put_request(fc, req); 273 fuse_put_request(fc, req);
274 } 274 }
275} 275}
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
334 req->out.args[0].size = 334 req->out.args[0].size =
335 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); 335 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
336 req->out.args[0].value = &outarg; 336 req->out.args[0].value = &outarg;
337 request_send(fc, req); 337 fuse_request_send(fc, req);
338 err = req->out.h.error; 338 err = req->out.h.error;
339 if (!err) 339 if (!err)
340 convert_fuse_statfs(buf, &outarg.st); 340 convert_fuse_statfs(buf, &outarg.st);
@@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
462 return 0; 462 return 0;
463} 463}
464 464
465static struct fuse_conn *new_conn(struct super_block *sb) 465int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
466{ 466{
467 struct fuse_conn *fc;
468 int err; 467 int err;
469 468
470 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 469 memset(fc, 0, sizeof(*fc));
471 if (fc) { 470 spin_lock_init(&fc->lock);
472 spin_lock_init(&fc->lock); 471 mutex_init(&fc->inst_mutex);
473 mutex_init(&fc->inst_mutex); 472 atomic_set(&fc->count, 1);
474 atomic_set(&fc->count, 1); 473 init_waitqueue_head(&fc->waitq);
475 init_waitqueue_head(&fc->waitq); 474 init_waitqueue_head(&fc->blocked_waitq);
476 init_waitqueue_head(&fc->blocked_waitq); 475 init_waitqueue_head(&fc->reserved_req_waitq);
477 init_waitqueue_head(&fc->reserved_req_waitq); 476 INIT_LIST_HEAD(&fc->pending);
478 INIT_LIST_HEAD(&fc->pending); 477 INIT_LIST_HEAD(&fc->processing);
479 INIT_LIST_HEAD(&fc->processing); 478 INIT_LIST_HEAD(&fc->io);
480 INIT_LIST_HEAD(&fc->io); 479 INIT_LIST_HEAD(&fc->interrupts);
481 INIT_LIST_HEAD(&fc->interrupts); 480 INIT_LIST_HEAD(&fc->bg_queue);
482 INIT_LIST_HEAD(&fc->bg_queue); 481 INIT_LIST_HEAD(&fc->entry);
483 atomic_set(&fc->num_waiting, 0); 482 atomic_set(&fc->num_waiting, 0);
484 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 483 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
485 fc->bdi.unplug_io_fn = default_unplug_io_fn; 484 fc->bdi.unplug_io_fn = default_unplug_io_fn;
486 /* fuse does its own writeback accounting */ 485 /* fuse does its own writeback accounting */
487 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 486 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
488 fc->dev = sb->s_dev; 487 fc->khctr = 0;
489 err = bdi_init(&fc->bdi); 488 fc->polled_files = RB_ROOT;
490 if (err) 489 fc->dev = sb->s_dev;
491 goto error_kfree; 490 err = bdi_init(&fc->bdi);
492 if (sb->s_bdev) { 491 if (err)
493 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", 492 goto error_mutex_destroy;
494 MAJOR(fc->dev), MINOR(fc->dev)); 493 if (sb->s_bdev) {
495 } else { 494 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
496 err = bdi_register_dev(&fc->bdi, fc->dev); 495 MAJOR(fc->dev), MINOR(fc->dev));
497 } 496 } else {
498 if (err) 497 err = bdi_register_dev(&fc->bdi, fc->dev);
499 goto error_bdi_destroy;
500 /*
501 * For a single fuse filesystem use max 1% of dirty +
502 * writeback threshold.
503 *
504 * This gives about 1M of write buffer for memory maps on a
505 * machine with 1G and 10% dirty_ratio, which should be more
506 * than enough.
507 *
508 * Privileged users can raise it by writing to
509 *
510 * /sys/class/bdi/<bdi>/max_ratio
511 */
512 bdi_set_max_ratio(&fc->bdi, 1);
513 fc->reqctr = 0;
514 fc->blocked = 1;
515 fc->attr_version = 1;
516 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
517 } 498 }
518 return fc; 499 if (err)
500 goto error_bdi_destroy;
501 /*
502 * For a single fuse filesystem use max 1% of dirty +
503 * writeback threshold.
504 *
505 * This gives about 1M of write buffer for memory maps on a
506 * machine with 1G and 10% dirty_ratio, which should be more
507 * than enough.
508 *
509 * Privileged users can raise it by writing to
510 *
511 * /sys/class/bdi/<bdi>/max_ratio
512 */
513 bdi_set_max_ratio(&fc->bdi, 1);
514 fc->reqctr = 0;
515 fc->blocked = 1;
516 fc->attr_version = 1;
517 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
519 518
520error_bdi_destroy: 519 return 0;
520
521 error_bdi_destroy:
521 bdi_destroy(&fc->bdi); 522 bdi_destroy(&fc->bdi);
522error_kfree: 523 error_mutex_destroy:
523 mutex_destroy(&fc->inst_mutex); 524 mutex_destroy(&fc->inst_mutex);
524 kfree(fc); 525 return err;
525 return NULL;
526} 526}
527EXPORT_SYMBOL_GPL(fuse_conn_init);
527 528
528void fuse_conn_put(struct fuse_conn *fc) 529void fuse_conn_put(struct fuse_conn *fc)
529{ 530{
@@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc)
532 fuse_request_free(fc->destroy_req); 533 fuse_request_free(fc->destroy_req);
533 mutex_destroy(&fc->inst_mutex); 534 mutex_destroy(&fc->inst_mutex);
534 bdi_destroy(&fc->bdi); 535 bdi_destroy(&fc->bdi);
535 kfree(fc); 536 fc->release(fc);
536 } 537 }
537} 538}
538 539
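Together with the EXPORT_SYMBOL_GPL() below, the fuse_conn_init()/->release() split lets a caller embed struct fuse_conn in a larger object and free the whole thing on the final put, where the old new_conn() could only kzalloc()/kfree() a bare connection. A minimal sketch of such an embedding; struct my_conn and its helpers are hypothetical:

    struct my_conn {
            struct fuse_conn fc;    /* must be freed as part of my_conn */
            /* ... caller-private state ... */
    };

    static void my_conn_release(struct fuse_conn *fc)
    {
            /* Called from fuse_conn_put() on the final reference. */
            kfree(container_of(fc, struct my_conn, fc));
    }

    static struct my_conn *my_conn_alloc(struct super_block *sb)
    {
            struct my_conn *mc = kzalloc(sizeof(*mc), GFP_KERNEL);

            if (!mc)
                    return NULL;
            if (fuse_conn_init(&mc->fc, sb)) {
                    kfree(mc);
                    return NULL;
            }
            mc->fc.release = my_conn_release;
            return mc;
    }

fuse_fill_super() below is the degenerate case of the same pattern: it kmallocs a bare fuse_conn and sets fc->release to fuse_free_conn(), which is plain kfree().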
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
542 return fc; 543 return fc;
543} 544}
544 545
545static struct inode *get_root_inode(struct super_block *sb, unsigned mode) 546static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
546{ 547{
547 struct fuse_attr attr; 548 struct fuse_attr attr;
548 memset(&attr, 0, sizeof(attr)); 549 memset(&attr, 0, sizeof(attr));
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
553 return fuse_iget(sb, 1, 0, &attr, 0, 0); 554 return fuse_iget(sb, 1, 0, &attr, 0, 0);
554} 555}
555 556
556struct fuse_inode_handle 557struct fuse_inode_handle {
557{
558 u64 nodeid; 558 u64 nodeid;
559 u32 generation; 559 u32 generation;
560}; 560};
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
761 fc->max_write = max_t(unsigned, 4096, fc->max_write); 761 fc->max_write = max_t(unsigned, 4096, fc->max_write);
762 fc->conn_init = 1; 762 fc->conn_init = 1;
763 } 763 }
764 fuse_put_request(fc, req);
765 fc->blocked = 0; 764 fc->blocked = 0;
766 wake_up_all(&fc->blocked_waitq); 765 wake_up_all(&fc->blocked_waitq);
767} 766}
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
787 req->out.args[0].size = sizeof(struct fuse_init_out); 786 req->out.args[0].size = sizeof(struct fuse_init_out);
788 req->out.args[0].value = &req->misc.init_out; 787 req->out.args[0].value = &req->misc.init_out;
789 req->end = process_init_reply; 788 req->end = process_init_reply;
790 request_send_background(fc, req); 789 fuse_request_send_background(fc, req);
790}
791
792static void fuse_free_conn(struct fuse_conn *fc)
793{
794 kfree(fc);
791} 795}
792 796
793static int fuse_fill_super(struct super_block *sb, void *data, int silent) 797static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
828 if (file->f_op != &fuse_dev_operations) 832 if (file->f_op != &fuse_dev_operations)
829 return -EINVAL; 833 return -EINVAL;
830 834
831 fc = new_conn(sb); 835 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
832 if (!fc) 836 if (!fc)
833 return -ENOMEM; 837 return -ENOMEM;
834 838
839 err = fuse_conn_init(fc, sb);
840 if (err) {
841 kfree(fc);
842 return err;
843 }
844
845 fc->release = fuse_free_conn;
835 fc->flags = d.flags; 846 fc->flags = d.flags;
836 fc->user_id = d.user_id; 847 fc->user_id = d.user_id;
837 fc->group_id = d.group_id; 848 fc->group_id = d.group_id;
@@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
841 sb->s_fs_info = fc; 852 sb->s_fs_info = fc;
842 853
843 err = -ENOMEM; 854 err = -ENOMEM;
844 root = get_root_inode(sb, d.rootmode); 855 root = fuse_get_root_inode(sb, d.rootmode);
845 if (!root) 856 if (!root)
846 goto err; 857 goto err;
847 858
@@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void)
952 963
953static void fuse_inode_init_once(void *foo) 964static void fuse_inode_init_once(void *foo)
954{ 965{
955 struct inode * inode = foo; 966 struct inode *inode = foo;
956 967
957 inode_init_once(inode); 968 inode_init_once(inode);
958} 969}
@@ -1031,7 +1042,7 @@ static int __init fuse_init(void)
1031{ 1042{
1032 int res; 1043 int res;
1033 1044
1034 printk("fuse init (API version %i.%i)\n", 1045 printk(KERN_INFO "fuse init (API version %i.%i)\n",
1035 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); 1046 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
1036 1047
1037 INIT_LIST_HEAD(&fuse_conn_list); 1048 INIT_LIST_HEAD(&fuse_conn_list);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index ab2f57e3fb87..e563a6449811 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) 3 depends on EXPERIMENTAL && (64BIT || LBD)
4 select FS_POSIX_ACL 4 select FS_POSIX_ACL
5 select CRC32 5 select CRC32
6 help 6 help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index ec65851ec80a..c1b4ec6a9650 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o 1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3e9bd46f27e3..e335dceb6a4f 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
91 struct gfs2_ea_location el_this; 91 struct gfs2_ea_location el_this;
92 int error; 92 int error;
93 93
94 if (!ip->i_di.di_eattr) 94 if (!ip->i_eattr)
95 return 0; 95 return 0;
96 96
97 memset(&er, 0, sizeof(struct gfs2_ea_request)); 97 memset(&er, 0, sizeof(struct gfs2_ea_request));
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bec76b1c2bb0..11ffc56f1f81 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
75 void *kaddr = kmap(page); 75 void *kaddr = kmap(page);
76 76
77 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 77 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
78 ip->i_di.di_size); 78 ip->i_disksize);
79 memset(kaddr + ip->i_di.di_size, 0, 79 memset(kaddr + ip->i_disksize, 0,
80 PAGE_CACHE_SIZE - ip->i_di.di_size); 80 PAGE_CACHE_SIZE - ip->i_disksize);
81 kunmap(page); 81 kunmap(page);
82 82
83 SetPageUptodate(page); 83 SetPageUptodate(page);
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 132 if (error)
133 goto out; 133 goto out;
134 134
135 if (ip->i_di.di_size) { 135 if (ip->i_disksize) {
136 /* Get a free block, fill it with the stuffed data, 136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 137 and write it out to disk */
138 138
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
159 di = (struct gfs2_dinode *)dibh->b_data; 159 di = (struct gfs2_dinode *)dibh->b_data;
160 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 160 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
161 161
162 if (ip->i_di.di_size) { 162 if (ip->i_disksize) {
163 *(__be64 *)(di + 1) = cpu_to_be64(block); 163 *(__be64 *)(di + 1) = cpu_to_be64(block);
164 gfs2_add_inode_blocks(&ip->i_inode, 1); 164 gfs2_add_inode_blocks(&ip->i_inode, 1);
165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
926 } 926 }
927 } 927 }
928 928
929 ip->i_di.di_size = size; 929 ip->i_disksize = size;
930 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 930 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
931 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 931 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
932 gfs2_dinode_out(ip, dibh->b_data); 932 gfs2_dinode_out(ip, dibh->b_data);
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1033 goto out; 1033 goto out;
1034 1034
1035 if (gfs2_is_stuffed(ip)) { 1035 if (gfs2_is_stuffed(ip)) {
1036 ip->i_di.di_size = size; 1036 ip->i_disksize = size;
1037 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1037 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1038 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1038 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1039 gfs2_dinode_out(ip, dibh->b_data); 1039 gfs2_dinode_out(ip, dibh->b_data);
@@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1045 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1045 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
1046 1046
1047 if (!error) { 1047 if (!error) {
1048 ip->i_di.di_size = size; 1048 ip->i_disksize = size;
1049 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1049 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1050 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; 1050 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1051 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1051 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1052 gfs2_dinode_out(ip, dibh->b_data); 1052 gfs2_dinode_out(ip, dibh->b_data);
1053 } 1053 }
@@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip)
1114 if (error) 1114 if (error)
1115 goto out; 1115 goto out;
1116 1116
1117 if (!ip->i_di.di_size) { 1117 if (!ip->i_disksize) {
1118 ip->i_height = 0; 1118 ip->i_height = 0;
1119 ip->i_goal = ip->i_no_addr; 1119 ip->i_goal = ip->i_no_addr;
1120 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1120 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1121 } 1121 }
1122 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1122 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1123 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; 1123 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1124 1124
1125 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1125 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1126 gfs2_dinode_out(ip, dibh->b_data); 1126 gfs2_dinode_out(ip, dibh->b_data);
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1205 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1205 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
1206 return -EINVAL; 1206 return -EINVAL;
1207 1207
1208 if (size > ip->i_di.di_size) 1208 if (size > ip->i_disksize)
1209 error = do_grow(ip, size); 1209 error = do_grow(ip, size);
1210 else if (size < ip->i_di.di_size) 1210 else if (size < ip->i_disksize)
1211 error = do_shrink(ip, size); 1211 error = do_shrink(ip, size);
1212 else 1212 else
1213 /* update time stamps */ 1213 /* update time stamps */
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1219int gfs2_truncatei_resume(struct gfs2_inode *ip) 1219int gfs2_truncatei_resume(struct gfs2_inode *ip)
1220{ 1220{
1221 int error; 1221 int error;
1222 error = trunc_dealloc(ip, ip->i_di.di_size); 1222 error = trunc_dealloc(ip, ip->i_disksize);
1223 if (!error) 1223 if (!error)
1224 error = trunc_end(ip); 1224 error = trunc_end(ip);
1225 return error; 1225 return error;
@@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1231} 1231}
1232 1232
1233/** 1233/**
1234 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1235 * @ip: the file
1236 * @len: the number of bytes to be written to the file
1237 * @data_blocks: returns the number of data blocks required
1238 * @ind_blocks: returns the number of indirect blocks required
1239 *
1240 */
1241
1242void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1243 unsigned int *data_blocks, unsigned int *ind_blocks)
1244{
1245 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1246 unsigned int tmp;
1247
1248 if (gfs2_is_dir(ip)) {
1249 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1250 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1251 } else {
1252 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1253 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1254 }
1255
1256 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1257 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1258 *ind_blocks += tmp;
1259 }
1260}
1261
1262/**
1263 * gfs2_write_alloc_required - figure out if a write will require an allocation 1234 * gfs2_write_alloc_required - figure out if a write will require an allocation
1264 * @ip: the file being written to 1235 * @ip: the file being written to
1265 * @offset: the offset to write to 1236 * @offset: the offset to write to
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1276 struct buffer_head bh; 1247 struct buffer_head bh;
1277 unsigned int shift; 1248 unsigned int shift;
1278 u64 lblock, lblock_stop, size; 1249 u64 lblock, lblock_stop, size;
1250 u64 end_of_file;
1279 1251
1280 *alloc_required = 0; 1252 *alloc_required = 0;
1281 1253
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1291 1263
1292 *alloc_required = 1; 1264 *alloc_required = 1;
1293 shift = sdp->sd_sb.sb_bsize_shift; 1265 shift = sdp->sd_sb.sb_bsize_shift;
1294 if (gfs2_is_dir(ip)) { 1266 BUG_ON(gfs2_is_dir(ip));
1295 unsigned int bsize = sdp->sd_jbsize; 1267 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
1296 lblock = offset; 1268 lblock = offset >> shift;
1297 do_div(lblock, bsize); 1269 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1298 lblock_stop = offset + len + bsize - 1; 1270 if (lblock_stop > end_of_file)
1299 do_div(lblock_stop, bsize); 1271 return 0;
1300 } else {
1301 u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
1302 lblock = offset >> shift;
1303 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1304 if (lblock_stop > end_of_file)
1305 return 0;
1306 }
1307 1272
1308 size = (lblock_stop - lblock) << shift; 1273 size = (lblock_stop - lblock) << shift;
1309 do { 1274 do {
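A worked example of the simplified, regular-file-only range check (all numbers illustrative): with a 4 KiB block size (sb_bsize_shift = 12), a 5000-byte write at offset 10000 into a file whose i_disksize is 1 MiB gives

    end_of_file = (1048576 + 4096 - 1) >> 12      = 256
    lblock      = 10000 >> 12                     = 2
    lblock_stop = (10000 + 5000 + 4096 - 1) >> 12 = 4

Since lblock_stop does not exceed end_of_file, the function falls through to the block-mapping loop over blocks 2..4; a write lying entirely beyond end_of_file returns early with *alloc_required already set to 1.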
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e6cde2943bd..c983177e05ac 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,10 +10,40 @@
10#ifndef __BMAP_DOT_H__ 10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__ 11#define __BMAP_DOT_H__
12 12
13#include "inode.h"
14
13struct inode; 15struct inode;
14struct gfs2_inode; 16struct gfs2_inode;
15struct page; 17struct page;
16 18
19
20/**
21 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
22 * @ip: the file
23 * @len: the number of bytes to be written to the file
24 * @data_blocks: returns the number of data blocks required
25 * @ind_blocks: returns the number of indirect blocks required
26 *
27 */
28
29static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
30 unsigned int len,
31 unsigned int *data_blocks,
32 unsigned int *ind_blocks)
33{
34 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
35 unsigned int tmp;
36
37 BUG_ON(gfs2_is_dir(ip));
38 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
39 *ind_blocks = 3 * (sdp->sd_max_height - 1);
40
41 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
42 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
43 *ind_blocks += tmp;
44 }
45}
46
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
22int gfs2_truncatei_resume(struct gfs2_inode *ip); 52int gfs2_truncatei_resume(struct gfs2_inode *ip);
23int gfs2_file_dealloc(struct gfs2_inode *ip); 53int gfs2_file_dealloc(struct gfs2_inode *ip);
24
25void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
26 unsigned int *data_blocks,
27 unsigned int *ind_blocks);
28int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
29 unsigned int len, int *alloc_required); 55 unsigned int len, int *alloc_required);
30 56
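The inline gfs2_write_calc_reserv() above sizes a worst-case reservation: one data block per block-sized chunk of the write plus three for boundaries, a fixed three indirect blocks per tree level, and then the loop climbs the metadata tree, adding one pointer block per sd_inptrs entries until everything fits in the dinode's sd_diptrs direct slots. A worked userspace example (the pointer counts are assumed figures, not real superblock values):

/* Userspace model of the reservation math moved inline into bmap.h. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int bsize_shift = 12;      /* 4096-byte blocks (assumed) */
	unsigned int sd_diptrs = 483;       /* direct pointers in a dinode (assumed) */
	unsigned int sd_inptrs = 509;       /* pointers per indirect block (assumed) */
	unsigned int sd_max_height = 10;    /* metadata tree height limit (assumed) */
	unsigned int len = 8 * 1024 * 1024; /* bytes to be written */

	unsigned int data_blocks = (len >> bsize_shift) + 3; /* +3 covers boundary blocks */
	unsigned int ind_blocks = 3 * (sd_max_height - 1);
	unsigned int tmp;

	for (tmp = data_blocks; tmp > sd_diptrs;) {
		tmp = DIV_ROUND_UP(tmp, sd_inptrs); /* pointer blocks one level up */
		ind_blocks += tmp;
	}
	printf("len=%u -> data_blocks=%u ind_blocks=%u\n", len, data_blocks, ind_blocks);
	return 0;
}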
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
deleted file mode 100644
index e51991947d2c..000000000000
--- a/fs/gfs2/daemon.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <linux/freezer.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "daemon.h"
24#include "glock.h"
25#include "log.h"
26#include "quota.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30
31/* This uses schedule_timeout() instead of msleep() because it's good for
32 the daemons to wake up more often than the timeout when unmounting so
33 the user's unmount doesn't sit there forever.
34
35 The kthread functions used to start these daemons block and flush signals. */
36
37/**
38 * gfs2_glockd - Reclaim unused glock structures
39 * @sdp: Pointer to GFS2 superblock
40 *
41 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
42 * Number of daemons can be set by user, with num_glockd mount option.
43 */
44
45int gfs2_glockd(void *data)
46{
47 struct gfs2_sbd *sdp = data;
48
49 while (!kthread_should_stop()) {
50 while (atomic_read(&sdp->sd_reclaim_count))
51 gfs2_reclaim_glock(sdp);
52
53 wait_event_interruptible(sdp->sd_reclaim_wq,
54 (atomic_read(&sdp->sd_reclaim_count) ||
55 kthread_should_stop()));
56 if (freezing(current))
57 refrigerator();
58 }
59
60 return 0;
61}
62
63/**
64 * gfs2_recoverd - Recover dead machine's journals
65 * @sdp: Pointer to GFS2 superblock
66 *
67 */
68
69int gfs2_recoverd(void *data)
70{
71 struct gfs2_sbd *sdp = data;
72 unsigned long t;
73
74 while (!kthread_should_stop()) {
75 gfs2_check_journals(sdp);
76 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
77 if (freezing(current))
78 refrigerator();
79 schedule_timeout_interruptible(t);
80 }
81
82 return 0;
83}
84
85/**
86 * gfs2_quotad - Write cached quota changes into the quota file
87 * @sdp: Pointer to GFS2 superblock
88 *
89 */
90
91int gfs2_quotad(void *data)
92{
93 struct gfs2_sbd *sdp = data;
94 unsigned long t;
95 int error;
96
97 while (!kthread_should_stop()) {
98 /* Update the master statfs file */
99
100 t = sdp->sd_statfs_sync_time +
101 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
102
103 if (time_after_eq(jiffies, t)) {
104 error = gfs2_statfs_sync(sdp);
105 if (error &&
106 error != -EROFS &&
107 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
108 fs_err(sdp, "quotad: (1) error=%d\n", error);
109 sdp->sd_statfs_sync_time = jiffies;
110 }
111
112 /* Update quota file */
113
114 t = sdp->sd_quota_sync_time +
115 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
116
117 if (time_after_eq(jiffies, t)) {
118 error = gfs2_quota_sync(sdp);
119 if (error &&
120 error != -EROFS &&
121 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
122 fs_err(sdp, "quotad: (2) error=%d\n", error);
123 sdp->sd_quota_sync_time = jiffies;
124 }
125
126 gfs2_quota_scan(sdp);
127
128 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
129 if (freezing(current))
130 refrigerator();
131 schedule_timeout_interruptible(t);
132 }
133
134 return 0;
135}
136
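The deleted gfs2_quotad() shows the deadline pattern these daemons shared: keep the jiffies stamp of the last sync, recompute the next deadline as stamp + quantum * HZ, and fire when time_after_eq() says the deadline has passed; the signed subtraction inside time_after_eq() keeps the comparison correct across jiffies wraparound. A compact userspace model of that scheduling (HZ and the quanta are assumed example values):

/* Userspace model of the deadline pattern in the deleted gfs2_quotad(). */
#include <stdio.h>

#define HZ 100 /* assumed tick rate */
/* Wraparound-safe "a is at or past b" for free-running tick counters. */
#define time_after_eq(a, b) ((long)((a) - (b)) >= 0)

int main(void)
{
	unsigned long statfs_sync_time = 0, quota_sync_time = 0;
	const unsigned long statfs_quantum = 30, quota_quantum = 60; /* secs, assumed */
	unsigned long jiffies;

	for (jiffies = 0; jiffies <= 90 * HZ; jiffies += 5 * HZ) {
		if (time_after_eq(jiffies, statfs_sync_time + statfs_quantum * HZ)) {
			printf("t=%2lus: sync master statfs file\n", jiffies / HZ);
			statfs_sync_time = jiffies;
		}
		if (time_after_eq(jiffies, quota_sync_time + quota_quantum * HZ)) {
			printf("t=%2lus: sync quota file\n", jiffies / HZ);
			quota_sync_time = jiffies;
		}
	}
	return 0;
}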
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
deleted file mode 100644
index 4be084fb6a62..000000000000
--- a/fs/gfs2/daemon.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_glockd(void *data);
14int gfs2_recoverd(void *data);
15int gfs2_quotad(void *data);
16
17#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index eed040d8ba3a..b7c8e5c70791 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -36,7 +36,7 @@
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the 36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when 37 * beginning of the leaf block. The dirents reside in leaves when
38 * 38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true 39 * dip->i_diskflags & GFS2_DIF_EXHASH is true
40 * 40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block. 41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 * 42 *
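The comment above describes the two on-disk directory layouts this file switches between: once GFS2_DIF_EXHASH is set, dirents live in leaf blocks reached through a hash table; otherwise they sit linearly inside the stuffed dinode block. A toy dispatch mirroring gfs2_dir_read()'s top-level branch (types pared down; the flag value is an assumption standing in for the real definition in gfs2_ondisk.h):

/* Toy dispatch mirroring gfs2_dir_read()'s top-level branch. */
#include <stdio.h>

#define GFS2_DIF_EXHASH 0x00000002 /* assumed to match gfs2_ondisk.h */

struct toy_dir { unsigned int i_diskflags; unsigned int i_entries; };

static const char *dir_layout(const struct toy_dir *dip)
{
	if (!dip->i_entries)
		return "empty: nothing to read";
	if (dip->i_diskflags & GFS2_DIF_EXHASH)
		return "exhash: walk hash table -> leaf blocks";
	return "linear: dirents inside the stuffed dinode";
}

int main(void)
{
	struct toy_dir small = { 0, 12 };
	struct toy_dir big = { GFS2_DIF_EXHASH, 5000 };

	printf("%s\n%s\n", dir_layout(&small), dir_layout(&big));
	return 0;
}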
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
128 128
129 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 129 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
131 if (ip->i_di.di_size < offset + size) 131 if (ip->i_disksize < offset + size)
132 ip->i_di.di_size = offset + size; 132 ip->i_disksize = offset + size;
133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
134 gfs2_dinode_out(ip, dibh->b_data); 134 gfs2_dinode_out(ip, dibh->b_data);
135 135
@@ -226,8 +226,8 @@ out:
226 if (error) 226 if (error)
227 return error; 227 return error;
228 228
229 if (ip->i_di.di_size < offset + copied) 229 if (ip->i_disksize < offset + copied)
230 ip->i_di.di_size = offset + copied; 230 ip->i_disksize = offset + copied;
231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
232 232
233 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 233 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
277 int copied = 0; 277 int copied = 0;
278 int error = 0; 278 int error = 0;
279 279
280 if (offset >= ip->i_di.di_size) 280 if (offset >= ip->i_disksize)
281 return 0; 281 return 0;
282 282
283 if (offset + size > ip->i_di.di_size) 283 if (offset + size > ip->i_disksize)
284 size = ip->i_di.di_size - offset; 284 size = ip->i_disksize - offset;
285 285
286 if (!size) 286 if (!size)
287 return 0; 287 return 0;
@@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
755 struct gfs2_inode *ip = GFS2_I(inode); 755 struct gfs2_inode *ip = GFS2_I(inode);
756 int error; 756 int error;
757 757
758 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 758 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
759 struct gfs2_leaf *leaf; 759 struct gfs2_leaf *leaf;
760 unsigned hsize = 1 << ip->i_depth; 760 unsigned hsize = 1 << ip->i_depth;
761 unsigned index; 761 unsigned index;
762 u64 ln; 762 u64 ln;
763 if (hsize * sizeof(u64) != ip->i_di.di_size) { 763 if (hsize * sizeof(u64) != ip->i_disksize) {
764 gfs2_consist_inode(ip); 764 gfs2_consist_inode(ip);
765 return ERR_PTR(-EIO); 765 return ERR_PTR(-EIO);
766 } 766 }
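The repeated consistency check hsize * sizeof(u64) != i_disksize works because an exhash directory's data is exactly its hash table of 64-bit leaf pointers, so the table size implied by i_depth must match the on-disk size byte for byte. A small userspace illustration with assumed example values:

/* Why hsize * sizeof(u64) must equal i_disksize for an exhash directory. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int i_depth = 10;          /* assumed example depth */
	uint64_t i_disksize = 8192;         /* bytes of hash table on disk */
	unsigned int hsize = 1u << i_depth; /* 1024 hash buckets */

	if ((uint64_t)hsize * sizeof(uint64_t) != i_disksize)
		printf("mismatch: gfs2_consist_inode(), return -EIO\n");
	else
		printf("%u leaf pointers * 8 bytes = %llu bytes: consistent\n",
		       hsize, (unsigned long long)i_disksize);
	return 0;
}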
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode)
858 return -ENOSPC; 858 return -ENOSPC;
859 bn = bh->b_blocknr; 859 bn = bh->b_blocknr;
860 860
861 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); 861 gfs2_assert(sdp, dip->i_entries < (1 << 16));
862 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); 862 leaf->lf_entries = cpu_to_be16(dip->i_entries);
863 863
864 /* Copy dirents */ 864 /* Copy dirents */
865 865
@@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode)
905 for (x = sdp->sd_hash_ptrs; x--; lp++) 905 for (x = sdp->sd_hash_ptrs; x--; lp++)
906 *lp = cpu_to_be64(bn); 906 *lp = cpu_to_be64(bn);
907 907
908 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; 908 dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
909 gfs2_add_inode_blocks(&dip->i_inode, 1); 909 gfs2_add_inode_blocks(&dip->i_inode, 1);
910 dip->i_di.di_flags |= GFS2_DIF_EXHASH; 910 dip->i_diskflags |= GFS2_DIF_EXHASH;
911 911
912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; 912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
913 dip->i_depth = y; 913 dip->i_depth = y;
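The terse loop at the end of dir_make_exhash() is an ilog2 idiom: shifting x right until it reaches zero counts its highest set bit, which becomes the directory's initial hash-table depth. A userspace check with an assumed sd_hash_ptrs (a 4k block keeps its hash pointers in half the block, giving 256 u64 slots, consistent with i_disksize being set to half the block size just above):

/* The ilog2 idiom at the end of dir_make_exhash(). */
#include <stdio.h>

int main(void)
{
	unsigned int sd_hash_ptrs = 256; /* assumed: (4096 / 2) / sizeof(u64) */
	unsigned int x, y;

	for (x = sd_hash_ptrs, y = -1; x; x >>= 1, y++)
		;                        /* counts the highest set bit */
	printf("sd_hash_ptrs=%u -> initial i_depth=%u\n", sd_hash_ptrs, y);
	return 0;
}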
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1082 int error = 0; 1082 int error = 0;
1083 1083
1084 hsize = 1 << dip->i_depth; 1084 hsize = 1 << dip->i_depth;
1085 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1085 if (hsize * sizeof(u64) != dip->i_disksize) {
1086 gfs2_consist_inode(dip); 1086 gfs2_consist_inode(dip);
1087 return -EIO; 1087 return -EIO;
1088 } 1088 }
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1091 1091
1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); 1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
1093 1093
1094 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { 1094 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
1095 error = gfs2_dir_read_data(dip, (char *)buf, 1095 error = gfs2_dir_read_data(dip, (char *)buf,
1096 block * sdp->sd_hash_bsize, 1096 block * sdp->sd_hash_bsize,
1097 sdp->sd_hash_bsize, 1); 1097 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1370 unsigned depth = 0;
1371 1371
1372 hsize = 1 << dip->i_depth; 1372 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1373 if (hsize * sizeof(u64) != dip->i_disksize) {
1374 gfs2_consist_inode(dip); 1374 gfs2_consist_inode(dip);
1375 return -EIO; 1375 return -EIO;
1376 } 1376 }
@@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1426 int copied = 0; 1426 int copied = 0;
1427 int error; 1427 int error;
1428 1428
1429 if (!dip->i_di.di_entries) 1429 if (!dip->i_entries)
1430 return 0; 1430 return 0;
1431 1431
1432 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) 1432 if (dip->i_diskflags & GFS2_DIF_EXHASH)
1433 return dir_e_read(inode, offset, opaque, filldir); 1433 return dir_e_read(inode, offset, opaque, filldir);
1434 1434
1435 if (!gfs2_is_stuffed(dip)) { 1435 if (!gfs2_is_stuffed(dip)) {
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1453 error = PTR_ERR(dent); 1453 error = PTR_ERR(dent);
1454 goto out; 1454 goto out;
1455 } 1455 }
1456 if (dip->i_di.di_entries != g.offset) { 1456 if (dip->i_entries != g.offset) {
1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, " 1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1458 "ip->i_di.di_entries (%u) != g.offset (%u)\n", 1458 "ip->i_entries (%u) != g.offset (%u)\n",
1459 (unsigned long long)dip->i_no_addr, 1459 (unsigned long long)dip->i_no_addr,
1460 dip->i_di.di_entries, 1460 dip->i_entries,
1461 g.offset); 1461 g.offset);
1462 error = -EIO; 1462 error = -EIO;
1463 goto out; 1463 goto out;
1464 } 1464 }
1465 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1465 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1466 dip->i_di.di_entries, &copied); 1466 dip->i_entries, &copied);
1467out: 1467out:
1468 kfree(darr); 1468 kfree(darr);
1469 } 1469 }
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1612 dent = gfs2_init_dirent(inode, dent, name, bh); 1612 dent = gfs2_init_dirent(inode, dent, name, bh);
1613 gfs2_inum_out(nip, dent); 1613 gfs2_inum_out(nip, dent);
1614 dent->de_type = cpu_to_be16(type); 1614 dent->de_type = cpu_to_be16(type);
1615 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 1615 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1616 leaf = (struct gfs2_leaf *)bh->b_data; 1616 leaf = (struct gfs2_leaf *)bh->b_data;
1617 be16_add_cpu(&leaf->lf_entries, 1); 1617 be16_add_cpu(&leaf->lf_entries, 1);
1618 } 1618 }
@@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1621 if (error) 1621 if (error)
1622 break; 1622 break;
1623 gfs2_trans_add_bh(ip->i_gl, bh, 1); 1623 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1624 ip->i_di.di_entries++; 1624 ip->i_entries++;
1625 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1625 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1626 gfs2_dinode_out(ip, bh->b_data); 1626 gfs2_dinode_out(ip, bh->b_data);
1627 brelse(bh); 1627 brelse(bh);
1628 error = 0; 1628 error = 0;
1629 break; 1629 break;
1630 } 1630 }
1631 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { 1631 if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
1632 error = dir_make_exhash(inode); 1632 error = dir_make_exhash(inode);
1633 if (error) 1633 if (error)
1634 break; 1634 break;
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1691 } 1691 }
1692 1692
1693 dirent_del(dip, bh, prev, dent); 1693 dirent_del(dip, bh, prev, dent);
1694 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1694 if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1695 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; 1695 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1696 u16 entries = be16_to_cpu(leaf->lf_entries); 1696 u16 entries = be16_to_cpu(leaf->lf_entries);
1697 if (!entries) 1697 if (!entries)
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1704 if (error) 1704 if (error)
1705 return error; 1705 return error;
1706 1706
1707 if (!dip->i_di.di_entries) 1707 if (!dip->i_entries)
1708 gfs2_consist_inode(dip); 1708 gfs2_consist_inode(dip);
1709 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1709 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1710 dip->i_di.di_entries--; 1710 dip->i_entries--;
1711 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1711 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1712 gfs2_dinode_out(dip, bh->b_data); 1712 gfs2_dinode_out(dip, bh->b_data);
1713 brelse(bh); 1713 brelse(bh);
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1748 gfs2_inum_out(nip, dent); 1748 gfs2_inum_out(nip, dent);
1749 dent->de_type = cpu_to_be16(new_type); 1749 dent->de_type = cpu_to_be16(new_type);
1750 1750
1751 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1751 if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1752 brelse(bh); 1752 brelse(bh);
1753 error = gfs2_meta_inode_buffer(dip, &bh); 1753 error = gfs2_meta_inode_buffer(dip, &bh);
1754 if (error) 1754 if (error)
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1784 int error = 0;
1785 1785
1786 hsize = 1 << dip->i_depth; 1786 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1787 if (hsize * sizeof(u64) != dip->i_disksize) {
1788 gfs2_consist_inode(dip); 1788 gfs2_consist_inode(dip);
1789 return -EIO; 1789 return -EIO;
1790 } 1790 }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 8a468cac9328..4f919440c3be 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -11,6 +11,7 @@
11#define __DIR_DOT_H__ 11#define __DIR_DOT_H__
12 12
13#include <linux/dcache.h> 13#include <linux/dcache.h>
14#include <linux/crc32.h>
14 15
15struct inode; 16struct inode;
16struct gfs2_inode; 17struct gfs2_inode;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index e3f76f451b0a..0d1c76d906ae 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
114 __be64 *eablk, *end; 114 __be64 *eablk, *end;
115 int error; 115 int error;
116 116
117 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); 117 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
118 if (error) 118 if (error)
119 return error; 119 return error;
120 120
121 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { 121 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
122 error = ea_foreach_i(ip, bh, ea_call, data); 122 error = ea_foreach_i(ip, bh, ea_call, data);
123 goto out; 123 goto out;
124 } 124 }
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
414 if (error) 414 if (error)
415 return error; 415 return error;
416 416
417 if (ip->i_di.di_eattr) { 417 if (ip->i_eattr) {
418 struct ea_list ei = { .ei_er = er, .ei_size = 0 }; 418 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
419 419
420 error = ea_foreach(ip, ea_list_i, &ei); 420 error = ea_foreach(ip, ea_list_i, &ei);
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
514 struct gfs2_ea_location el; 514 struct gfs2_ea_location el;
515 int error; 515 int error;
516 516
517 if (!ip->i_di.di_eattr) 517 if (!ip->i_eattr)
518 return -ENODATA; 518 return -ENODATA;
519 519
520 error = gfs2_ea_find(ip, er, &el); 520 error = gfs2_ea_find(ip, er, &el);
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
741 if (error) 741 if (error)
742 return error; 742 return error;
743 743
744 ip->i_di.di_eattr = bh->b_blocknr; 744 ip->i_eattr = bh->b_blocknr;
745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); 745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
746 746
747 brelse(bh); 747 brelse(bh);
@@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
935 int error; 935 int error;
936 int mh_size = sizeof(struct gfs2_meta_header); 936 int mh_size = sizeof(struct gfs2_meta_header);
937 937
938 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { 938 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
939 __be64 *end; 939 __be64 *end;
940 940
941 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, 941 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
942 &indbh); 942 &indbh);
943 if (error) 943 if (error)
944 return error; 944 return error;
@@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
972 gfs2_buffer_clear_tail(indbh, mh_size); 972 gfs2_buffer_clear_tail(indbh, mh_size);
973 973
974 eablk = (__be64 *)(indbh->b_data + mh_size); 974 eablk = (__be64 *)(indbh->b_data + mh_size);
975 *eablk = cpu_to_be64(ip->i_di.di_eattr); 975 *eablk = cpu_to_be64(ip->i_eattr);
976 ip->i_di.di_eattr = blk; 976 ip->i_eattr = blk;
977 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; 977 ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
978 gfs2_add_inode_blocks(&ip->i_inode, 1); 978 gfs2_add_inode_blocks(&ip->i_inode, 1);
979 979
980 eablk++; 980 eablk++;
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1015 if (error) 1015 if (error)
1016 return error; 1016 return error;
1017 1017
1018 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) 1018 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
1019 blks++; 1019 blks++;
1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) 1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); 1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1040 struct gfs2_ea_location el; 1040 struct gfs2_ea_location el;
1041 int error; 1041 int error;
1042 1042
1043 if (!ip->i_di.di_eattr) { 1043 if (!ip->i_eattr) {
1044 if (er->er_flags & XATTR_REPLACE) 1044 if (er->er_flags & XATTR_REPLACE)
1045 return -ENODATA; 1045 return -ENODATA;
1046 return ea_init(ip, er); 1046 return ea_init(ip, er);
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1051 return error; 1051 return error;
1052 1052
1053 if (el.el_ea) { 1053 if (el.el_ea) {
1054 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { 1054 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1055 brelse(el.el_bh); 1055 brelse(el.el_bh);
1056 return -EPERM; 1056 return -EPERM;
1057 } 1057 }
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1145 struct gfs2_ea_location el; 1145 struct gfs2_ea_location el;
1146 int error; 1146 int error;
1147 1147
1148 if (!ip->i_di.di_eattr) 1148 if (!ip->i_eattr)
1149 return -ENODATA; 1149 return -ENODATA;
1150 1150
1151 error = gfs2_ea_find(ip, er, &el); 1151 error = gfs2_ea_find(ip, er, &el);
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1309 1309
1310 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 1310 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1311 1311
1312 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); 1312 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
1313 if (error) 1313 if (error)
1314 return error; 1314 return error;
1315 1315
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1388 if (bstart) 1388 if (bstart)
1389 gfs2_free_meta(ip, bstart, blen); 1389 gfs2_free_meta(ip, bstart, blen);
1390 1390
1391 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; 1391 ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
1392 1392
1393 error = gfs2_meta_inode_buffer(ip, &dibh); 1393 error = gfs2_meta_inode_buffer(ip, &dibh);
1394 if (!error) { 1394 if (!error) {
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1416 struct buffer_head *dibh; 1416 struct buffer_head *dibh;
1417 int error; 1417 int error;
1418 1418
1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); 1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
1420 if (!rgd) { 1420 if (!rgd) {
1421 gfs2_consist_inode(ip); 1421 gfs2_consist_inode(ip);
1422 return -EIO; 1422 return -EIO;
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1432 if (error) 1432 if (error)
1433 goto out_gunlock; 1433 goto out_gunlock;
1434 1434
1435 gfs2_free_meta(ip, ip->i_di.di_eattr, 1); 1435 gfs2_free_meta(ip, ip->i_eattr, 1);
1436 1436
1437 ip->i_di.di_eattr = 0; 1437 ip->i_eattr = 0;
1438 gfs2_add_inode_blocks(&ip->i_inode, -1); 1438 gfs2_add_inode_blocks(&ip->i_inode, -1);
1439 1439
1440 error = gfs2_meta_inode_buffer(ip, &dibh); 1440 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
1479 if (error) 1479 if (error)
1480 goto out_rindex; 1480 goto out_rindex;
1481 1481
1482 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { 1482 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
1483 error = ea_dealloc_indirect(ip); 1483 error = ea_dealloc_indirect(ip);
1484 if (error) 1484 if (error)
1485 goto out_rindex; 1485 goto out_rindex;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c962283d4e7f..6b983aef785d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -40,6 +40,7 @@
40#include "quota.h" 40#include "quota.h"
41#include "super.h" 41#include "super.h"
42#include "util.h" 42#include "util.h"
43#include "bmap.h"
43 44
44struct gfs2_gl_hash_bucket { 45struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list; 46 struct hlist_head hb_list;
@@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
61 62
62static DECLARE_RWSEM(gfs2_umount_flush_sem); 63static DECLARE_RWSEM(gfs2_umount_flush_sem);
63static struct dentry *gfs2_root; 64static struct dentry *gfs2_root;
64static struct task_struct *scand_process;
65static unsigned int scand_secs = 5;
66static struct workqueue_struct *glock_workqueue; 65static struct workqueue_struct *glock_workqueue;
66static LIST_HEAD(lru_list);
67static atomic_t lru_count = ATOMIC_INIT(0);
68static DEFINE_SPINLOCK(lru_lock);
67 69
68#define GFS2_GL_HASH_SHIFT 15 70#define GFS2_GL_HASH_SHIFT 15
69#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 71#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
174} 176}
175 177
176/** 178/**
179 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
180 * @gl: the glock
181 *
182 */
183
184static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
185{
186 spin_lock(&lru_lock);
187 if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
188 list_add_tail(&gl->gl_lru, &lru_list);
189 atomic_inc(&lru_count);
190 }
191 spin_unlock(&lru_lock);
192}
193
194/**
177 * gfs2_glock_put() - Decrement reference count on glock 195 * gfs2_glock_put() - Decrement reference count on glock
178 * @gl: The glock to put 196 * @gl: The glock to put
179 * 197 *
@@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl)
187 if (atomic_dec_and_test(&gl->gl_ref)) { 205 if (atomic_dec_and_test(&gl->gl_ref)) {
188 hlist_del(&gl->gl_list); 206 hlist_del(&gl->gl_list);
189 write_unlock(gl_lock_addr(gl->gl_hash)); 207 write_unlock(gl_lock_addr(gl->gl_hash));
208 spin_lock(&lru_lock);
209 if (!list_empty(&gl->gl_lru)) {
210 list_del_init(&gl->gl_lru);
211 atomic_dec(&lru_count);
212 }
213 spin_unlock(&lru_lock);
190 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); 214 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
191 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); 215 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
192 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 216 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
193 glock_free(gl); 217 glock_free(gl);
194 rv = 1; 218 rv = 1;
195 goto out; 219 goto out;
196 } 220 }
197 write_unlock(gl_lock_addr(gl->gl_hash)); 221 write_unlock(gl_lock_addr(gl->gl_hash));
222 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
223 if (atomic_read(&gl->gl_ref) == 2)
224 gfs2_glock_schedule_for_reclaim(gl);
198out: 225out:
199 return rv; 226 return rv;
200} 227}
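This hunk replaces the explicit reclaim list with lazy LRU maintenance: gfs2_glock_put() unhooks a dying glock from the LRU before freeing it, and otherwise, when only the two structural references remain (one for being hashed, one for holding a state other than unlocked), the glock is idle and gets scheduled for reclaim. A toy model of that policy (plain ints stand in for the kernel's atomics and lists):

/* Toy model of the lazy LRU policy in gfs2_glock_put(). */
#include <stdio.h>

struct toy_glock { int ref; int on_lru; };

static void toy_put(struct toy_glock *gl)
{
	if (--gl->ref == 0) {
		gl->on_lru = 0;          /* unlink from LRU before freeing */
		printf("ref=0: free the glock\n");
		return;
	}
	/* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
	if (gl->ref == 2 && !gl->on_lru) {
		gl->on_lru = 1;
		printf("ref=2: idle, schedule for reclaim\n");
	}
}

int main(void)
{
	struct toy_glock gl = { .ref = 3, .on_lru = 0 }; /* one active holder */
	toy_put(&gl); /* the holder drops its reference: goes onto the LRU */
	return 0;
}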
@@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
289 * do_promote - promote as many requests as possible on the current queue 316 * do_promote - promote as many requests as possible on the current queue
290 * @gl: The glock 317 * @gl: The glock
291 * 318 *
292 * Returns: true if there is a blocked holder at the head of the list 319 * Returns: 1 if there is a blocked holder at the head of the list, or 2
320 * if a type specific operation is underway.
293 */ 321 */
294 322
295static int do_promote(struct gfs2_glock *gl) 323static int do_promote(struct gfs2_glock *gl)
324__releases(&gl->gl_spin)
325__acquires(&gl->gl_spin)
296{ 326{
297 const struct gfs2_glock_operations *glops = gl->gl_ops; 327 const struct gfs2_glock_operations *glops = gl->gl_ops;
298 struct gfs2_holder *gh, *tmp; 328 struct gfs2_holder *gh, *tmp;
@@ -310,6 +340,8 @@ restart:
310 ret = glops->go_lock(gh); 340 ret = glops->go_lock(gh);
311 spin_lock(&gl->gl_spin); 341 spin_lock(&gl->gl_spin);
312 if (ret) { 342 if (ret) {
343 if (ret == 1)
344 return 2;
313 gh->gh_error = ret; 345 gh->gh_error = ret;
314 list_del_init(&gh->gh_list); 346 list_del_init(&gh->gh_list);
315 gfs2_holder_wake(gh); 347 gfs2_holder_wake(gh);
@@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
414 const struct gfs2_glock_operations *glops = gl->gl_ops; 446 const struct gfs2_glock_operations *glops = gl->gl_ops;
415 struct gfs2_holder *gh; 447 struct gfs2_holder *gh;
416 unsigned state = ret & LM_OUT_ST_MASK; 448 unsigned state = ret & LM_OUT_ST_MASK;
449 int rv;
417 450
418 spin_lock(&gl->gl_spin); 451 spin_lock(&gl->gl_spin);
419 state_change(gl, state); 452 state_change(gl, state);
@@ -468,7 +501,6 @@ retry:
468 gfs2_demote_wake(gl); 501 gfs2_demote_wake(gl);
469 if (state != LM_ST_UNLOCKED) { 502 if (state != LM_ST_UNLOCKED) {
470 if (glops->go_xmote_bh) { 503 if (glops->go_xmote_bh) {
471 int rv;
472 spin_unlock(&gl->gl_spin); 504 spin_unlock(&gl->gl_spin);
473 rv = glops->go_xmote_bh(gl, gh); 505 rv = glops->go_xmote_bh(gl, gh);
474 if (rv == -EAGAIN) 506 if (rv == -EAGAIN)
@@ -479,10 +511,13 @@ retry:
479 goto out; 511 goto out;
480 } 512 }
481 } 513 }
482 do_promote(gl); 514 rv = do_promote(gl);
515 if (rv == 2)
516 goto out_locked;
483 } 517 }
484out: 518out:
485 clear_bit(GLF_LOCK, &gl->gl_flags); 519 clear_bit(GLF_LOCK, &gl->gl_flags);
520out_locked:
486 spin_unlock(&gl->gl_spin); 521 spin_unlock(&gl->gl_spin);
487 gfs2_glock_put(gl); 522 gfs2_glock_put(gl);
488} 523}
@@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
511 */ 546 */
512 547
513static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) 548static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
549__releases(&gl->gl_spin)
550__acquires(&gl->gl_spin)
514{ 551{
515 const struct gfs2_glock_operations *glops = gl->gl_ops; 552 const struct gfs2_glock_operations *glops = gl->gl_ops;
516 struct gfs2_sbd *sdp = gl->gl_sbd; 553 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
576 */ 613 */
577 614
578static void run_queue(struct gfs2_glock *gl, const int nonblock) 615static void run_queue(struct gfs2_glock *gl, const int nonblock)
616__releases(&gl->gl_spin)
617__acquires(&gl->gl_spin)
579{ 618{
580 struct gfs2_holder *gh = NULL; 619 struct gfs2_holder *gh = NULL;
620 int ret;
581 621
582 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) 622 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
583 return; 623 return;
@@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
596 } else { 636 } else {
597 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) 637 if (test_bit(GLF_DEMOTE, &gl->gl_flags))
598 gfs2_demote_wake(gl); 638 gfs2_demote_wake(gl);
599 if (do_promote(gl) == 0) 639 ret = do_promote(gl);
640 if (ret == 0)
600 goto out; 641 goto out;
642 if (ret == 2)
643 return;
601 gh = find_first_waiter(gl); 644 gh = find_first_waiter(gl);
602 gl->gl_target = gh->gh_state; 645 gl->gl_target = gh->gh_state;
603 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 646 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
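run_queue() now has to distinguish the three do_promote() outcomes: 0 means every grantable holder was promoted, 1 means a blocked holder heads the queue and a state change is needed, and 2 means a go_lock() callback reported a type-specific operation in progress, so the function returns with GLF_LOCK still set and the pending operation re-runs the queue later. A sketch of that dispatch (the enum names are illustrative, not kernel API):

/* Illustrative dispatch on the do_promote() return value. */
#include <stdio.h>

enum promote_result {
	PROMOTE_DONE = 0,    /* holders granted, nothing blocked */
	PROMOTE_BLOCKED = 1, /* blocked holder at head: start a state change */
	PROMOTE_DEFERRED = 2 /* go_lock() returned 1: type-specific op pending */
};

static void toy_run_queue(enum promote_result ret)
{
	switch (ret) {
	case PROMOTE_DONE:
		printf("done: clear GLF_LOCK and return\n");
		break;
	case PROMOTE_DEFERRED:
		printf("deferred: return with GLF_LOCK still set;\n"
		       "          the pending operation re-runs the queue later\n");
		break;
	case PROMOTE_BLOCKED:
		printf("blocked: pick the first waiter and change lock state\n");
		break;
	}
}

int main(void)
{
	toy_run_queue(PROMOTE_DEFERRED);
	return 0;
}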
@@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl)
820 */ 863 */
821 864
822static void handle_callback(struct gfs2_glock *gl, unsigned int state, 865static void handle_callback(struct gfs2_glock *gl, unsigned int state,
823 int remote, unsigned long delay) 866 unsigned long delay)
824{ 867{
825 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; 868 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
826 869
@@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
828 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { 871 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
829 gl->gl_demote_state = state; 872 gl->gl_demote_state = state;
830 gl->gl_demote_time = jiffies; 873 gl->gl_demote_time = jiffies;
831 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
832 gl->gl_object)
833 gfs2_glock_schedule_for_reclaim(gl);
834 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 874 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
835 gl->gl_demote_state != state) { 875 gl->gl_demote_state != state) {
836 gl->gl_demote_state = LM_ST_UNLOCKED; 876 gl->gl_demote_state = LM_ST_UNLOCKED;
@@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
877 */ 917 */
878 918
879static inline void add_to_queue(struct gfs2_holder *gh) 919static inline void add_to_queue(struct gfs2_holder *gh)
920__releases(&gl->gl_spin)
921__acquires(&gl->gl_spin)
880{ 922{
881 struct gfs2_glock *gl = gh->gh_gl; 923 struct gfs2_glock *gl = gh->gh_gl;
882 struct gfs2_sbd *sdp = gl->gl_sbd; 924 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
998 1040
999 spin_lock(&gl->gl_spin); 1041 spin_lock(&gl->gl_spin);
1000 if (gh->gh_flags & GL_NOCACHE) 1042 if (gh->gh_flags & GL_NOCACHE)
1001 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1043 handle_callback(gl, LM_ST_UNLOCKED, 0);
1002 1044
1003 list_del_init(&gh->gh_list); 1045 list_del_init(&gh->gh_list);
1004 if (find_first_holder(gl) == NULL) { 1046 if (find_first_holder(gl) == NULL) {
@@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1269 delay = gl->gl_ops->go_min_hold_time; 1311 delay = gl->gl_ops->go_min_hold_time;
1270 1312
1271 spin_lock(&gl->gl_spin); 1313 spin_lock(&gl->gl_spin);
1272 handle_callback(gl, state, 1, delay); 1314 handle_callback(gl, state, delay);
1273 spin_unlock(&gl->gl_spin); 1315 spin_unlock(&gl->gl_spin);
1274 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 1316 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1275 gfs2_glock_put(gl); 1317 gfs2_glock_put(gl);
1276} 1318}
1277 1319
1320static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
1321{
1322 struct gfs2_jdesc *jd;
1323
1324 spin_lock(&sdp->sd_jindex_spin);
1325 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
1326 if (jd->jd_jid != jid)
1327 continue;
1328 jd->jd_dirty = 1;
1329 break;
1330 }
1331 spin_unlock(&sdp->sd_jindex_spin);
1332}
1333
1278/** 1334/**
1279 * gfs2_glock_cb - Callback used by locking module 1335 * gfs2_glock_cb - Callback used by locking module
1280 * @sdp: Pointer to the superblock 1336 * @sdp: Pointer to the superblock
@@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1338 * Returns: 1 if it's ok 1394 * Returns: 1 if it's ok
1339 */ 1395 */
1340 1396
1341static int demote_ok(struct gfs2_glock *gl) 1397static int demote_ok(const struct gfs2_glock *gl)
1342{ 1398{
1343 const struct gfs2_glock_operations *glops = gl->gl_ops; 1399 const struct gfs2_glock_operations *glops = gl->gl_ops;
1344 int demote = 1;
1345
1346 if (test_bit(GLF_STICKY, &gl->gl_flags))
1347 demote = 0;
1348 else if (glops->go_demote_ok)
1349 demote = glops->go_demote_ok(gl);
1350
1351 return demote;
1352}
1353
1354/**
1355 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1356 * @gl: the glock
1357 *
1358 */
1359
1360void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1361{
1362 struct gfs2_sbd *sdp = gl->gl_sbd;
1363 1400
1364 spin_lock(&sdp->sd_reclaim_lock); 1401 if (gl->gl_state == LM_ST_UNLOCKED)
1365 if (list_empty(&gl->gl_reclaim)) { 1402 return 0;
1366 gfs2_glock_hold(gl); 1403 if (!list_empty(&gl->gl_holders))
1367 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); 1404 return 0;
1368 atomic_inc(&sdp->sd_reclaim_count); 1405 if (glops->go_demote_ok)
1369 spin_unlock(&sdp->sd_reclaim_lock); 1406 return glops->go_demote_ok(gl);
1370 wake_up(&sdp->sd_reclaim_wq); 1407 return 1;
1371 } else
1372 spin_unlock(&sdp->sd_reclaim_lock);
1373} 1408}
1374 1409
1375/**
1376 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1377 * @sdp: the filesystem
1378 *
1379 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
1380 * different glock and we notice that there are a lot of glocks in the
1381 * reclaim list.
1382 *
1383 */
1384 1410
1385void gfs2_reclaim_glock(struct gfs2_sbd *sdp) 1411static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1386{ 1412{
1387 struct gfs2_glock *gl; 1413 struct gfs2_glock *gl;
1388 int done_callback = 0; 1414 int may_demote;
1415 int nr_skipped = 0;
1416 int got_ref = 0;
1417 LIST_HEAD(skipped);
1389 1418
1390 spin_lock(&sdp->sd_reclaim_lock); 1419 if (nr == 0)
1391 if (list_empty(&sdp->sd_reclaim_list)) { 1420 goto out;
1392 spin_unlock(&sdp->sd_reclaim_lock);
1393 return;
1394 }
1395 gl = list_entry(sdp->sd_reclaim_list.next,
1396 struct gfs2_glock, gl_reclaim);
1397 list_del_init(&gl->gl_reclaim);
1398 spin_unlock(&sdp->sd_reclaim_lock);
1399 1421
1400 atomic_dec(&sdp->sd_reclaim_count); 1422 if (!(gfp_mask & __GFP_FS))
1401 atomic_inc(&sdp->sd_reclaimed); 1423 return -1;
1402 1424
1403 spin_lock(&gl->gl_spin); 1425 spin_lock(&lru_lock);
1404 if (find_first_holder(gl) == NULL && 1426 while(nr && !list_empty(&lru_list)) {
1405 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { 1427 gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
1406 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1428 list_del_init(&gl->gl_lru);
1407 done_callback = 1; 1429 atomic_dec(&lru_count);
1430
1431 /* Test for being demotable */
1432 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1433 gfs2_glock_hold(gl);
1434 got_ref = 1;
1435 spin_unlock(&lru_lock);
1436 spin_lock(&gl->gl_spin);
1437 may_demote = demote_ok(gl);
1438 spin_unlock(&gl->gl_spin);
1439 clear_bit(GLF_LOCK, &gl->gl_flags);
1440 if (may_demote) {
1441 handle_callback(gl, LM_ST_UNLOCKED, 0);
1442 nr--;
1443 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1444 gfs2_glock_put(gl);
1445 }
1446 spin_lock(&lru_lock);
1447 if (may_demote)
1448 continue;
1449 }
1450 if (list_empty(&gl->gl_lru) &&
1451 (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
1452 nr_skipped++;
1453 list_add(&gl->gl_lru, &skipped);
1454 }
1455 if (got_ref) {
1456 spin_unlock(&lru_lock);
1457 gfs2_glock_put(gl);
1458 spin_lock(&lru_lock);
1459 got_ref = 0;
1460 }
1408 } 1461 }
1409 spin_unlock(&gl->gl_spin); 1462 list_splice(&skipped, &lru_list);
1410 if (!done_callback || 1463 atomic_add(nr_skipped, &lru_count);
1411 queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1464 spin_unlock(&lru_lock);
1412 gfs2_glock_put(gl); 1465out:
1466 return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
1413} 1467}
1414 1468
1469static struct shrinker glock_shrinker = {
1470 .shrink = gfs2_shrink_glock_memory,
1471 .seeks = DEFAULT_SEEKS,
1472};
1473
1415/** 1474/**
1416 * examine_bucket - Call a function for glock in a hash bucket 1475 * examine_bucket - Call a function for glock in a hash bucket
1417 * @examiner: the function 1476 * @examiner: the function
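gfs2_shrink_glock_memory() above follows the shrinker contract: nr == 0 is a pure query, allocations without __GFP_FS must not recurse into the filesystem (hence the -1), and otherwise up to nr LRU entries are scanned, demoting those the new lockless demote_ok() permits while busy entries are parked on a local skipped list and spliced back afterwards. The return value is the remaining LRU population scaled by vfs_cache_pressure. A userspace model of the scan (values are illustrative; with only a handful of entries the integer scaling rounds to zero, which real systems with thousands of glocks do not hit):

/* Userspace model of the LRU scan in gfs2_shrink_glock_memory().
 * 1 = demote_ok, 0 = busy (skipped and spliced back), -1 = reclaimed. */
#include <stdio.h>

#define N 6
static int lru[N] = { 1, 0, 1, 1, 0, 1 };
static int lru_count = N;

static int toy_shrink(int nr)
{
	const int vfs_cache_pressure = 100; /* sysctl default */
	int i, skipped = 0;

	for (i = 0; i < N && nr > 0; i++) {
		if (lru[i] < 0)
			continue;        /* slot already reclaimed */
		if (lru[i] == 1) {
			lru[i] = -1;     /* demotable: drop it from the LRU */
			lru_count--;
			nr--;
		} else {
			skipped++;       /* busy: parked, spliced back later */
		}
	}
	if (skipped)
		printf("spliced %d busy entries back onto the LRU\n", skipped);
	/* same shape as the kernel's return: population scaled by pressure */
	return (lru_count / 100) * vfs_cache_pressure;
}

int main(void)
{
	printf("query score: %d\n", toy_shrink(0));     /* nr == 0: no scan */
	printf("post-scan score: %d\n", toy_shrink(3)); /* demote up to 3 */
	return 0;
}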
@@ -1457,26 +1516,6 @@ out:
1457} 1516}
1458 1517
1459/** 1518/**
1460 * scan_glock - look at a glock and see if we can reclaim it
1461 * @gl: the glock to look at
1462 *
1463 */
1464
1465static void scan_glock(struct gfs2_glock *gl)
1466{
1467 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
1468 return;
1469 if (test_bit(GLF_LOCK, &gl->gl_flags))
1470 return;
1471
1472 spin_lock(&gl->gl_spin);
1473 if (find_first_holder(gl) == NULL &&
1474 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1475 gfs2_glock_schedule_for_reclaim(gl);
1476 spin_unlock(&gl->gl_spin);
1477}
1478
1479/**
1480 * clear_glock - look at a glock and see if we can free it from glock cache 1519 * clear_glock - look at a glock and see if we can free it from glock cache
1481 * @gl: the glock to look at 1520 * @gl: the glock to look at
1482 * 1521 *
@@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl)
1484 1523
1485static void clear_glock(struct gfs2_glock *gl) 1524static void clear_glock(struct gfs2_glock *gl)
1486{ 1525{
1487 struct gfs2_sbd *sdp = gl->gl_sbd; 1526 spin_lock(&lru_lock);
1488 int released; 1527 if (!list_empty(&gl->gl_lru)) {
1489 1528 list_del_init(&gl->gl_lru);
1490 spin_lock(&sdp->sd_reclaim_lock); 1529 atomic_dec(&lru_count);
1491 if (!list_empty(&gl->gl_reclaim)) {
1492 list_del_init(&gl->gl_reclaim);
1493 atomic_dec(&sdp->sd_reclaim_count);
1494 spin_unlock(&sdp->sd_reclaim_lock);
1495 released = gfs2_glock_put(gl);
1496 gfs2_assert(sdp, !released);
1497 } else {
1498 spin_unlock(&sdp->sd_reclaim_lock);
1499 } 1530 }
1531 spin_unlock(&lru_lock);
1500 1532
1501 spin_lock(&gl->gl_spin); 1533 spin_lock(&gl->gl_spin);
1502 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1534 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
1503 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1535 handle_callback(gl, LM_ST_UNLOCKED, 0);
1504 spin_unlock(&gl->gl_spin); 1536 spin_unlock(&gl->gl_spin);
1505 gfs2_glock_hold(gl); 1537 gfs2_glock_hold(gl);
1506 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1538 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1548 } 1580 }
1549} 1581}
1550 1582
1583void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
1584{
1585 struct gfs2_glock *gl = ip->i_gl;
1586 int ret;
1587
1588 ret = gfs2_truncatei_resume(ip);
1589 gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
1590
1591 spin_lock(&gl->gl_spin);
1592 clear_bit(GLF_LOCK, &gl->gl_flags);
1593 run_queue(gl, 1);
1594 spin_unlock(&gl->gl_spin);
1595}
1596
1551static const char *state2str(unsigned state) 1597static const char *state2str(unsigned state)
1552{ 1598{
1553 switch(state) { 1599 switch(state) {
@@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1623 char *p = buf; 1669 char *p = buf;
1624 if (test_bit(GLF_LOCK, gflags)) 1670 if (test_bit(GLF_LOCK, gflags))
1625 *p++ = 'l'; 1671 *p++ = 'l';
1626 if (test_bit(GLF_STICKY, gflags))
1627 *p++ = 's';
1628 if (test_bit(GLF_DEMOTE, gflags)) 1672 if (test_bit(GLF_DEMOTE, gflags))
1629 *p++ = 'D'; 1673 *p++ = 'D';
1630 if (test_bit(GLF_PENDING_DEMOTE, gflags)) 1674 if (test_bit(GLF_PENDING_DEMOTE, gflags))
@@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1743 return error; 1787 return error;
1744} 1788}
1745 1789
1746/**
1747 * gfs2_scand - Look for cached glocks and inodes to toss from memory
1748 * @sdp: Pointer to GFS2 superblock
1749 *
1750 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
1751 * See gfs2_glockd()
1752 */
1753
1754static int gfs2_scand(void *data)
1755{
1756 unsigned x;
1757 unsigned delay;
1758
1759 while (!kthread_should_stop()) {
1760 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1761 examine_bucket(scan_glock, NULL, x);
1762 if (freezing(current))
1763 refrigerator();
1764 delay = scand_secs;
1765 if (delay < 1)
1766 delay = 1;
1767 schedule_timeout_interruptible(delay * HZ);
1768 }
1769
1770 return 0;
1771}
1772
1773
1774 1790
1775int __init gfs2_glock_init(void) 1791int __init gfs2_glock_init(void)
1776{ 1792{
@@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void)
1784 } 1800 }
1785#endif 1801#endif
1786 1802
1787 scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
1788 if (IS_ERR(scand_process))
1789 return PTR_ERR(scand_process);
1790
1791 glock_workqueue = create_workqueue("glock_workqueue"); 1803 glock_workqueue = create_workqueue("glock_workqueue");
1792 if (IS_ERR(glock_workqueue)) { 1804 if (IS_ERR(glock_workqueue))
1793 kthread_stop(scand_process);
1794 return PTR_ERR(glock_workqueue); 1805 return PTR_ERR(glock_workqueue);
1795 } 1806
1807 register_shrinker(&glock_shrinker);
1796 1808
1797 return 0; 1809 return 0;
1798} 1810}
1799 1811
1800void gfs2_glock_exit(void) 1812void gfs2_glock_exit(void)
1801{ 1813{
1814 unregister_shrinker(&glock_shrinker);
1802 destroy_workqueue(glock_workqueue); 1815 destroy_workqueue(glock_workqueue);
1803 kthread_stop(scand_process);
1804} 1816}
1805 1817
1806module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
1807MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
1808
1809static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1818static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1810{ 1819{
1811 struct gfs2_glock *gl; 1820 struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 695c6b193611..543ec7ecfbda 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
129void gfs2_lvb_unhold(struct gfs2_glock *gl); 129void gfs2_lvb_unhold(struct gfs2_glock *gl);
130 130
131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); 131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
132void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
133void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 132void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
134void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 133void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
134void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
135 135
136int __init gfs2_glock_init(void); 136int __init gfs2_glock_init(void);
137void gfs2_glock_exit(void); 137void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c6c318c2a0f6..8522d3aa64fc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
201 * Returns: 1 if it's ok 201 * Returns: 1 if it's ok
202 */ 202 */
203 203
204static int inode_go_demote_ok(struct gfs2_glock *gl) 204static int inode_go_demote_ok(const struct gfs2_glock *gl)
205{ 205{
206 struct gfs2_sbd *sdp = gl->gl_sbd; 206 struct gfs2_sbd *sdp = gl->gl_sbd;
207 int demote = 0; 207 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
208 208 return 0;
209 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) 209 return 1;
210 demote = 1;
211 else if (!sdp->sd_args.ar_localcaching &&
212 time_after_eq(jiffies, gl->gl_stamp +
213 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
214 demote = 1;
215
216 return demote;
217} 210}
218 211
219/** 212/**
@@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl)
227static int inode_go_lock(struct gfs2_holder *gh) 220static int inode_go_lock(struct gfs2_holder *gh)
228{ 221{
229 struct gfs2_glock *gl = gh->gh_gl; 222 struct gfs2_glock *gl = gh->gh_gl;
223 struct gfs2_sbd *sdp = gl->gl_sbd;
230 struct gfs2_inode *ip = gl->gl_object; 224 struct gfs2_inode *ip = gl->gl_object;
231 int error = 0; 225 int error = 0;
232 226
@@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh)
239 return error; 233 return error;
240 } 234 }
241 235
242 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && 236 if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
243 (gl->gl_state == LM_ST_EXCLUSIVE) && 237 (gl->gl_state == LM_ST_EXCLUSIVE) &&
244 (gh->gh_state == LM_ST_EXCLUSIVE)) 238 (gh->gh_state == LM_ST_EXCLUSIVE)) {
245 error = gfs2_truncatei_resume(ip); 239 spin_lock(&sdp->sd_trunc_lock);
240 if (list_empty(&ip->i_trunc_list))
241 list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
242 spin_unlock(&sdp->sd_trunc_lock);
243 wake_up(&sdp->sd_quota_wait);
244 return 1;
245 }
246 246
247 return error; 247 return error;
248} 248}
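inode_go_lock() no longer resumes an interrupted truncate synchronously. It parks the inode on sd_trunc_list under sd_trunc_lock, wakes sd_quota_wait, and returns 1, which do_promote() surfaces as the deferred result seen earlier; a daemon later calls gfs2_glock_finish_truncate() to run gfs2_truncatei_resume(), clear GLF_LOCK and re-run the queue. A single-threaded toy of that hand-off (the array stands in for the kernel list):

/* Toy shape of the deferred-truncate hand-off introduced here. */
#include <stdio.h>

#define MAX_TRUNC 8
static int trunc_list[MAX_TRUNC], trunc_count;

static int toy_inode_go_lock(int ino, int trunc_in_prog)
{
	if (!trunc_in_prog)
		return 0;                /* normal grant */
	trunc_list[trunc_count++] = ino; /* sd_trunc_list under sd_trunc_lock */
	printf("inode %d: truncate pending, defer grant (return 1)\n", ino);
	return 1;                        /* do_promote maps this to "deferred" */
}

static void toy_finish_truncate(void)
{
	while (trunc_count) {
		int ino = trunc_list[--trunc_count];
		printf("daemon: resume truncate of inode %d, re-run queue\n", ino);
	}
}

int main(void)
{
	toy_inode_go_lock(42, 1); /* holder blocked behind unfinished truncate */
	toy_finish_truncate();    /* daemon-side completion */
	return 0;
}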
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
260 const struct gfs2_inode *ip = gl->gl_object; 260 const struct gfs2_inode *ip = gl->gl_object;
261 if (ip == NULL) 261 if (ip == NULL)
262 return 0; 262 return 0;
263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", 263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
264 (unsigned long long)ip->i_no_formal_ino, 264 (unsigned long long)ip->i_no_formal_ino,
265 (unsigned long long)ip->i_no_addr, 265 (unsigned long long)ip->i_no_addr,
266 IF2DT(ip->i_inode.i_mode), ip->i_flags); 266 IF2DT(ip->i_inode.i_mode), ip->i_flags,
267 (unsigned int)ip->i_diskflags,
268 (unsigned long long)ip->i_inode.i_size,
269 (unsigned long long)ip->i_disksize);
267 return 0; 270 return 0;
268} 271}
269 272
@@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
274 * Returns: 1 if it's ok 277 * Returns: 1 if it's ok
275 */ 278 */
276 279
277static int rgrp_go_demote_ok(struct gfs2_glock *gl) 280static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
278{ 281{
279 return !gl->gl_aspace->i_mapping->nrpages; 282 return !gl->gl_aspace->i_mapping->nrpages;
280} 283}
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
318 const struct gfs2_rgrpd *rgd = gl->gl_object; 321 const struct gfs2_rgrpd *rgd = gl->gl_object;
319 if (rgd == NULL) 322 if (rgd == NULL)
320 return 0; 323 return 0;
321 gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); 324 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
325 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
326 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
322 return 0; 327 return 0;
323} 328}
324 329
@@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
374} 379}
375 380
376/** 381/**
382 * trans_go_demote_ok
383 * @gl: the glock
384 *
385 * Always returns 0
386 */
387
388static int trans_go_demote_ok(const struct gfs2_glock *gl)
389{
390 return 0;
391}
392
393/**
377 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock 394 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
378 * @gl: the glock 395 * @gl: the glock
379 * 396 *
380 * Returns: 1 if it's ok 397 * Returns: 1 if it's ok
381 */ 398 */
382 399
383static int quota_go_demote_ok(struct gfs2_glock *gl) 400static int quota_go_demote_ok(const struct gfs2_glock *gl)
384{ 401{
385 return !atomic_read(&gl->gl_lvb_count); 402 return !atomic_read(&gl->gl_lvb_count);
386} 403}
@@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
414const struct gfs2_glock_operations gfs2_trans_glops = { 431const struct gfs2_glock_operations gfs2_trans_glops = {
415 .go_xmote_th = trans_go_sync, 432 .go_xmote_th = trans_go_sync,
416 .go_xmote_bh = trans_go_xmote_bh, 433 .go_xmote_bh = trans_go_xmote_bh,
434 .go_demote_ok = trans_go_demote_ok,
417 .go_type = LM_TYPE_NONDISK, 435 .go_type = LM_TYPE_NONDISK,
418}; 436};
419 437
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f566ec1b4e8e..608849d00021 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,12 +68,6 @@ struct gfs2_bitmap {
68 u32 bi_len; 68 u32 bi_len;
69}; 69};
70 70
71struct gfs2_rgrp_host {
72 u32 rg_free;
73 u32 rg_dinodes;
74 u64 rg_igeneration;
75};
76
77struct gfs2_rgrpd { 71struct gfs2_rgrpd {
78 struct list_head rd_list; /* Link with superblock */ 72 struct list_head rd_list; /* Link with superblock */
79 struct list_head rd_list_mru; 73 struct list_head rd_list_mru;
@@ -83,14 +77,16 @@ struct gfs2_rgrpd {
83 u32 rd_length; /* length of rgrp header in fs blocks */ 77 u32 rd_length; /* length of rgrp header in fs blocks */
84 u32 rd_data; /* num of data blocks in rgrp */ 78 u32 rd_data; /* num of data blocks in rgrp */
85 u32 rd_bitbytes; /* number of bytes in data bitmaps */ 79 u32 rd_bitbytes; /* number of bytes in data bitmaps */
86 struct gfs2_rgrp_host rd_rg; 80 u32 rd_free;
81 u32 rd_free_clone;
82 u32 rd_dinodes;
83 u64 rd_igeneration;
87 struct gfs2_bitmap *rd_bits; 84 struct gfs2_bitmap *rd_bits;
88 unsigned int rd_bh_count;
89 struct mutex rd_mutex; 85 struct mutex rd_mutex;
90 u32 rd_free_clone;
91 struct gfs2_log_element rd_le; 86 struct gfs2_log_element rd_le;
92 u32 rd_last_alloc;
93 struct gfs2_sbd *rd_sbd; 87 struct gfs2_sbd *rd_sbd;
88 unsigned int rd_bh_count;
89 u32 rd_last_alloc;
94 unsigned char rd_flags; 90 unsigned char rd_flags;
95#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ 91#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */
96#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ 92#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */
@@ -129,7 +125,7 @@ struct gfs2_glock_operations {
129 void (*go_xmote_th) (struct gfs2_glock *gl); 125 void (*go_xmote_th) (struct gfs2_glock *gl);
130 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); 126 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
131 void (*go_inval) (struct gfs2_glock *gl, int flags); 127 void (*go_inval) (struct gfs2_glock *gl, int flags);
132 int (*go_demote_ok) (struct gfs2_glock *gl); 128 int (*go_demote_ok) (const struct gfs2_glock *gl);
133 int (*go_lock) (struct gfs2_holder *gh); 129 int (*go_lock) (struct gfs2_holder *gh);
134 void (*go_unlock) (struct gfs2_holder *gh); 130 void (*go_unlock) (struct gfs2_holder *gh);
135 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 131 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
@@ -159,7 +155,6 @@ struct gfs2_holder {
159 155
160enum { 156enum {
161 GLF_LOCK = 1, 157 GLF_LOCK = 1,
162 GLF_STICKY = 2,
163 GLF_DEMOTE = 3, 158 GLF_DEMOTE = 3,
164 GLF_PENDING_DEMOTE = 4, 159 GLF_PENDING_DEMOTE = 4,
165 GLF_DEMOTE_IN_PROGRESS = 5, 160 GLF_DEMOTE_IN_PROGRESS = 5,
@@ -194,7 +189,7 @@ struct gfs2_glock {
194 unsigned long gl_tchange; 189 unsigned long gl_tchange;
195 void *gl_object; 190 void *gl_object;
196 191
197 struct list_head gl_reclaim; 192 struct list_head gl_lru;
198 193
199 struct gfs2_sbd *gl_sbd; 194 struct gfs2_sbd *gl_sbd;
200 195
@@ -233,29 +228,24 @@ enum {
233 GIF_USER = 4, /* user inode, not metadata addr space */ 228 GIF_USER = 4, /* user inode, not metadata addr space */
234}; 229};
235 230
236struct gfs2_dinode_host {
237 u64 di_size; /* number of bytes in file */
238 u64 di_generation; /* generation number for NFS */
239 u32 di_flags; /* GFS2_DIF_... */
240 /* These only apply to directories */
241 u32 di_entries; /* The number of entries in the directory */
242 u64 di_eattr; /* extended attribute block number */
243};
244 231
245struct gfs2_inode { 232struct gfs2_inode {
246 struct inode i_inode; 233 struct inode i_inode;
247 u64 i_no_addr; 234 u64 i_no_addr;
248 u64 i_no_formal_ino; 235 u64 i_no_formal_ino;
236 u64 i_generation;
237 u64 i_eattr;
238 loff_t i_disksize;
249 unsigned long i_flags; /* GIF_... */ 239 unsigned long i_flags; /* GIF_... */
250
251 struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
252
253 struct gfs2_glock *i_gl; /* Move into i_gh? */ 240 struct gfs2_glock *i_gl; /* Move into i_gh? */
254 struct gfs2_holder i_iopen_gh; 241 struct gfs2_holder i_iopen_gh;
255 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 242 struct gfs2_holder i_gh; /* for prepare/commit_write only */
256 struct gfs2_alloc *i_alloc; 243 struct gfs2_alloc *i_alloc;
257 u64 i_goal; /* goal block for allocations */ 244 u64 i_goal; /* goal block for allocations */
258 struct rw_semaphore i_rw_mutex; 245 struct rw_semaphore i_rw_mutex;
246 struct list_head i_trunc_list;
247 u32 i_entries;
248 u32 i_diskflags;
259 u8 i_height; 249 u8 i_height;
260 u8 i_depth; 250 u8 i_depth;
261}; 251};
@@ -406,13 +396,11 @@ struct gfs2_args {
406struct gfs2_tune { 396struct gfs2_tune {
407 spinlock_t gt_spin; 397 spinlock_t gt_spin;
408 398
409 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
410 unsigned int gt_incore_log_blocks; 399 unsigned int gt_incore_log_blocks;
411 unsigned int gt_log_flush_secs; 400 unsigned int gt_log_flush_secs;
412 401
413 unsigned int gt_recoverd_secs; 402 unsigned int gt_recoverd_secs;
414 unsigned int gt_logd_secs; 403 unsigned int gt_logd_secs;
415 unsigned int gt_quotad_secs;
416 404
417 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 405 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
418 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ 406 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
@@ -488,10 +476,6 @@ struct gfs2_sbd {
488 /* Lock Stuff */ 476 /* Lock Stuff */
489 477
490 struct lm_lockstruct sd_lockstruct; 478 struct lm_lockstruct sd_lockstruct;
491 struct list_head sd_reclaim_list;
492 spinlock_t sd_reclaim_lock;
493 wait_queue_head_t sd_reclaim_wq;
494 atomic_t sd_reclaim_count;
495 struct gfs2_holder sd_live_gh; 479 struct gfs2_holder sd_live_gh;
496 struct gfs2_glock *sd_rename_gl; 480 struct gfs2_glock *sd_rename_gl;
497 struct gfs2_glock *sd_trans_gl; 481 struct gfs2_glock *sd_trans_gl;
@@ -519,7 +503,6 @@ struct gfs2_sbd {
519 spinlock_t sd_statfs_spin; 503 spinlock_t sd_statfs_spin;
520 struct gfs2_statfs_change_host sd_statfs_master; 504 struct gfs2_statfs_change_host sd_statfs_master;
521 struct gfs2_statfs_change_host sd_statfs_local; 505 struct gfs2_statfs_change_host sd_statfs_local;
522 unsigned long sd_statfs_sync_time;
523 506
524 /* Resource group stuff */ 507 /* Resource group stuff */
525 508
@@ -552,8 +535,6 @@ struct gfs2_sbd {
552 struct task_struct *sd_recoverd_process; 535 struct task_struct *sd_recoverd_process;
553 struct task_struct *sd_logd_process; 536 struct task_struct *sd_logd_process;
554 struct task_struct *sd_quotad_process; 537 struct task_struct *sd_quotad_process;
555 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
556 unsigned int sd_glockd_num;
557 538
558 /* Quota stuff */ 539 /* Quota stuff */
559 540
@@ -561,13 +542,15 @@ struct gfs2_sbd {
561 atomic_t sd_quota_count; 542 atomic_t sd_quota_count;
562 spinlock_t sd_quota_spin; 543 spinlock_t sd_quota_spin;
563 struct mutex sd_quota_mutex; 544 struct mutex sd_quota_mutex;
545 wait_queue_head_t sd_quota_wait;
546 struct list_head sd_trunc_list;
547 spinlock_t sd_trunc_lock;
564 548
565 unsigned int sd_quota_slots; 549 unsigned int sd_quota_slots;
566 unsigned int sd_quota_chunks; 550 unsigned int sd_quota_chunks;
567 unsigned char **sd_quota_bitmap; 551 unsigned char **sd_quota_bitmap;
568 552
569 u64 sd_quota_sync_gen; 553 u64 sd_quota_sync_gen;
570 unsigned long sd_quota_sync_time;
571 554
572 /* Log stuff */ 555 /* Log stuff */
573 556
@@ -624,10 +607,6 @@ struct gfs2_sbd {
624 struct mutex sd_freeze_lock; 607 struct mutex sd_freeze_lock;
625 unsigned int sd_freeze_count; 608 unsigned int sd_freeze_count;
626 609
627 /* Counters */
628
629 atomic_t sd_reclaimed;
630
631 char sd_fsname[GFS2_FSNAME_LEN]; 610 char sd_fsname[GFS2_FSNAME_LEN];
632 char sd_table_name[GFS2_FSNAME_LEN]; 611 char sd_table_name[GFS2_FSNAME_LEN];
633 char sd_proto_name[GFS2_FSNAME_LEN]; 612 char sd_proto_name[GFS2_FSNAME_LEN];
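
The incore.h hunks above flatten struct gfs2_dinode_host into struct gfs2_inode itself: di_size, di_generation, di_eattr, di_flags and di_entries reappear as i_disksize, i_generation, i_eattr, i_diskflags and i_entries, and the intermediate i_di member ("To be replaced by ref to block") disappears. A minimal sketch of the access pattern this rename implies, assuming only the field names shown above (both helper functions are hypothetical):

static u64 size_before(const struct gfs2_inode *ip)
{
	return ip->i_di.di_size;	/* one extra hop through i_di */
}

static u64 size_after(const struct gfs2_inode *ip)
{
	return ip->i_disksize;		/* on-disk size held directly */
}

Every hunk below that touches ip->i_di.* is a mechanical instance of this substitution.
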
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d57616840e89..3b87c188da41 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -32,7 +32,6 @@
32#include "log.h" 32#include "log.h"
33#include "meta_io.h" 33#include "meta_io.h"
34#include "ops_address.h" 34#include "ops_address.h"
35#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
38#include "trans.h" 37#include "trans.h"
@@ -248,7 +247,6 @@ fail:
248 247
249static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 248static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
250{ 249{
251 struct gfs2_dinode_host *di = &ip->i_di;
252 const struct gfs2_dinode *str = buf; 250 const struct gfs2_dinode *str = buf;
253 struct timespec atime; 251 struct timespec atime;
254 u16 height, depth; 252 u16 height, depth;
@@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
274 * to do that. 272 * to do that.
275 */ 273 */
276 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 274 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
277 di->di_size = be64_to_cpu(str->di_size); 275 ip->i_disksize = be64_to_cpu(str->di_size);
278 i_size_write(&ip->i_inode, di->di_size); 276 i_size_write(&ip->i_inode, ip->i_disksize);
279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
280 atime.tv_sec = be64_to_cpu(str->di_atime); 278 atime.tv_sec = be64_to_cpu(str->di_atime);
281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 279 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
287 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); 285 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
288 286
289 ip->i_goal = be64_to_cpu(str->di_goal_meta); 287 ip->i_goal = be64_to_cpu(str->di_goal_meta);
290 di->di_generation = be64_to_cpu(str->di_generation); 288 ip->i_generation = be64_to_cpu(str->di_generation);
291 289
292 di->di_flags = be32_to_cpu(str->di_flags); 290 ip->i_diskflags = be32_to_cpu(str->di_flags);
293 gfs2_set_inode_flags(&ip->i_inode); 291 gfs2_set_inode_flags(&ip->i_inode);
294 height = be16_to_cpu(str->di_height); 292 height = be16_to_cpu(str->di_height);
295 if (unlikely(height > GFS2_MAX_META_HEIGHT)) 293 if (unlikely(height > GFS2_MAX_META_HEIGHT))
@@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
300 if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) 298 if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
301 goto corrupt; 299 goto corrupt;
302 ip->i_depth = (u8)depth; 300 ip->i_depth = (u8)depth;
303 di->di_entries = be32_to_cpu(str->di_entries); 301 ip->i_entries = be32_to_cpu(str->di_entries);
304 302
305 di->di_eattr = be64_to_cpu(str->di_eattr); 303 ip->i_eattr = be64_to_cpu(str->di_eattr);
306 if (S_ISREG(ip->i_inode.i_mode)) 304 if (S_ISREG(ip->i_inode.i_mode))
307 gfs2_set_aops(&ip->i_inode); 305 gfs2_set_aops(&ip->i_inode);
308 306
@@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
388 gfs2_free_di(rgd, ip); 386 gfs2_free_di(rgd, ip);
389 387
390 gfs2_trans_end(sdp); 388 gfs2_trans_end(sdp);
391 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
392 389
393out_rg_gunlock: 390out_rg_gunlock:
394 gfs2_glock_dq_uninit(&al->al_rgd_gh); 391 gfs2_glock_dq_uninit(&al->al_rgd_gh);
@@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
690 return error; 687 return error;
691 } 688 }
692 689
693 if (dip->i_di.di_entries == (u32)-1) 690 if (dip->i_entries == (u32)-1)
694 return -EFBIG; 691 return -EFBIG;
695 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) 692 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
696 return -EMLINK; 693 return -EMLINK;
@@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
790 di->di_flags = 0; 787 di->di_flags = 0;
791 788
792 if (S_ISREG(mode)) { 789 if (S_ISREG(mode)) {
793 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || 790 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
794 gfs2_tune_get(sdp, gt_new_files_jdata)) 791 gfs2_tune_get(sdp, gt_new_files_jdata))
795 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); 792 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
796 } else if (S_ISDIR(mode)) { 793 } else if (S_ISDIR(mode)) {
797 di->di_flags |= cpu_to_be32(dip->i_di.di_flags & 794 di->di_flags |= cpu_to_be32(dip->i_diskflags &
798 GFS2_DIF_INHERIT_JDATA); 795 GFS2_DIF_INHERIT_JDATA);
799 } 796 }
800 797
@@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
1068 struct qstr dotname; 1065 struct qstr dotname;
1069 int error; 1066 int error;
1070 1067
1071 if (ip->i_di.di_entries != 2) { 1068 if (ip->i_entries != 2) {
1072 if (gfs2_consist_inode(ip)) 1069 if (gfs2_consist_inode(ip))
1073 gfs2_dinode_print(ip); 1070 gfs2_dinode_print(ip);
1074 return -EIO; 1071 return -EIO;
@@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1168 return error; 1165 return error;
1169 } 1166 }
1170 1167
1171 if (!ip->i_di.di_size) { 1168 if (!ip->i_disksize) {
1172 gfs2_consist_inode(ip); 1169 gfs2_consist_inode(ip);
1173 error = -EIO; 1170 error = -EIO;
1174 goto out; 1171 goto out;
@@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1178 if (error) 1175 if (error)
1179 goto out; 1176 goto out;
1180 1177
1181 x = ip->i_di.di_size + 1; 1178 x = ip->i_disksize + 1;
1182 if (x > *len) { 1179 if (x > *len) {
1183 *buf = kmalloc(x, GFP_NOFS); 1180 *buf = kmalloc(x, GFP_NOFS);
1184 if (!*buf) { 1181 if (!*buf) {
@@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1242 1239
1243void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) 1240void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1244{ 1241{
1245 const struct gfs2_dinode_host *di = &ip->i_di;
1246 struct gfs2_dinode *str = buf; 1242 struct gfs2_dinode *str = buf;
1247 1243
1248 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 1244 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1256 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1252 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1257 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1253 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1258 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1254 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1259 str->di_size = cpu_to_be64(di->di_size); 1255 str->di_size = cpu_to_be64(ip->i_disksize);
1260 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1256 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1261 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1257 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1262 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1258 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1264 1260
1265 str->di_goal_meta = cpu_to_be64(ip->i_goal); 1261 str->di_goal_meta = cpu_to_be64(ip->i_goal);
1266 str->di_goal_data = cpu_to_be64(ip->i_goal); 1262 str->di_goal_data = cpu_to_be64(ip->i_goal);
1267 str->di_generation = cpu_to_be64(di->di_generation); 1263 str->di_generation = cpu_to_be64(ip->i_generation);
1268 1264
1269 str->di_flags = cpu_to_be32(di->di_flags); 1265 str->di_flags = cpu_to_be32(ip->i_diskflags);
1270 str->di_height = cpu_to_be16(ip->i_height); 1266 str->di_height = cpu_to_be16(ip->i_height);
1271 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && 1267 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
1272 !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? 1268 !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
1273 GFS2_FORMAT_DE : 0); 1269 GFS2_FORMAT_DE : 0);
1274 str->di_depth = cpu_to_be16(ip->i_depth); 1270 str->di_depth = cpu_to_be16(ip->i_depth);
1275 str->di_entries = cpu_to_be32(di->di_entries); 1271 str->di_entries = cpu_to_be32(ip->i_entries);
1276 1272
1277 str->di_eattr = cpu_to_be64(di->di_eattr); 1273 str->di_eattr = cpu_to_be64(ip->i_eattr);
1278 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); 1274 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1279 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); 1275 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
1280 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); 1276 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
@@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1282 1278
1283void gfs2_dinode_print(const struct gfs2_inode *ip) 1279void gfs2_dinode_print(const struct gfs2_inode *ip)
1284{ 1280{
1285 const struct gfs2_dinode_host *di = &ip->i_di;
1286
1287 printk(KERN_INFO " no_formal_ino = %llu\n", 1281 printk(KERN_INFO " no_formal_ino = %llu\n",
1288 (unsigned long long)ip->i_no_formal_ino); 1282 (unsigned long long)ip->i_no_formal_ino);
1289 printk(KERN_INFO " no_addr = %llu\n", 1283 printk(KERN_INFO " no_addr = %llu\n",
1290 (unsigned long long)ip->i_no_addr); 1284 (unsigned long long)ip->i_no_addr);
1291 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); 1285 printk(KERN_INFO " i_disksize = %llu\n",
1286 (unsigned long long)ip->i_disksize);
1292 printk(KERN_INFO " blocks = %llu\n", 1287 printk(KERN_INFO " blocks = %llu\n",
1293 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1288 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1294 printk(KERN_INFO " i_goal = %llu\n", 1289 printk(KERN_INFO " i_goal = %llu\n",
1295 (unsigned long long)ip->i_goal); 1290 (unsigned long long)ip->i_goal);
1296 printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); 1291 printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags);
1297 printk(KERN_INFO " i_height = %u\n", ip->i_height); 1292 printk(KERN_INFO " i_height = %u\n", ip->i_height);
1298 printk(KERN_INFO " i_depth = %u\n", ip->i_depth); 1293 printk(KERN_INFO " i_depth = %u\n", ip->i_depth);
1299 printk(KERN_INFO " di_entries = %u\n", di->di_entries); 1294 printk(KERN_INFO " i_entries = %u\n", ip->i_entries);
1300 printk(KERN_INFO " di_eattr = %llu\n", 1295 printk(KERN_INFO " i_eattr = %llu\n",
1301 (unsigned long long)di->di_eattr); 1296 (unsigned long long)ip->i_eattr);
1302} 1297}
1303 1298
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2d43f69610a0..d5329364cdff 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,7 @@
10#ifndef __INODE_DOT_H__ 10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13#include <linux/fs.h>
13#include "util.h" 14#include "util.h"
14 15
15static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 16static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
19 20
20static inline int gfs2_is_jdata(const struct gfs2_inode *ip) 21static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
21{ 22{
22 return ip->i_di.di_flags & GFS2_DIF_JDATA; 23 return ip->i_diskflags & GFS2_DIF_JDATA;
23} 24}
24 25
25static inline int gfs2_is_writeback(const struct gfs2_inode *ip) 26static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
97void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 98void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
98void gfs2_dinode_print(const struct gfs2_inode *ip); 99void gfs2_dinode_print(const struct gfs2_inode *ip);
99 100
101extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops;
103extern const struct inode_operations gfs2_symlink_iops;
104extern const struct file_operations gfs2_file_fops;
105extern const struct file_operations gfs2_dir_fops;
106extern const struct file_operations gfs2_file_fops_nolock;
107extern const struct file_operations gfs2_dir_fops_nolock;
108
109extern void gfs2_set_inode_flags(struct inode *inode);
110
100#endif /* __INODE_DOT_H__ */ 111#endif /* __INODE_DOT_H__ */
101 112
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 0c4cbe6c8285..1aa7eb6a0226 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -194,17 +194,25 @@ out:
194static void gdlm_recovery_done(void *lockspace, unsigned int jid, 194static void gdlm_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message) 195 unsigned int message)
196{ 196{
197 char env_jid[20];
198 char env_status[20];
199 char *envp[] = { env_jid, env_status, NULL };
197 struct gdlm_ls *ls = lockspace; 200 struct gdlm_ls *ls = lockspace;
198 ls->recover_jid_done = jid; 201 ls->recover_jid_done = jid;
199 ls->recover_jid_status = message; 202 ls->recover_jid_status = message;
200 kobject_uevent(&ls->kobj, KOBJ_CHANGE); 203 sprintf(env_jid, "JID=%d", jid);
204 sprintf(env_status, "RECOVERY=%s",
205 message == LM_RD_SUCCESS ? "Done" : "Failed");
206 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
201} 207}
202 208
203static void gdlm_others_may_mount(void *lockspace) 209static void gdlm_others_may_mount(void *lockspace)
204{ 210{
211 char *message = "FIRSTMOUNT=Done";
212 char *envp[] = { message, NULL };
205 struct gdlm_ls *ls = lockspace; 213 struct gdlm_ls *ls = lockspace;
206 ls->first_done = 1; 214 ls->first_done = 1;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE); 215 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
208} 216}
209 217
210/* Userspace gets the offline uevent, blocks new gfs locks on 218/* Userspace gets the offline uevent, blocks new gfs locks on
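
The mount.c hunk above moves lock_dlm from bare kobject_uevent() to environment-carrying notifications: kobject_uevent_env() attaches per-event JID/RECOVERY/FIRSTMOUNT variables so userspace (udev rules, gfs_controld) can match on them instead of re-reading sysfs. A hedged sketch of the per-event pattern, with illustrative names outside any GFS2 specifics:

#include <linux/kobject.h>

static void notify_recovery(struct kobject *kobj, unsigned int jid, bool ok)
{
	char env_jid[20];
	char env_status[24];
	char *envp[] = { env_jid, env_status, NULL };	/* NULL-terminated */

	snprintf(env_jid, sizeof(env_jid), "JID=%u", jid);
	snprintf(env_status, sizeof(env_status),
		 "RECOVERY=%s", ok ? "Done" : "Failed");
	kobject_uevent_env(kobj, KOBJ_CHANGE, envp);
}

The sysfs.c hunk below handles the kset-wide half: a struct kset_uevent_ops whose .uevent callback calls add_uevent_var() stamps LOCKTABLE and LOCKPROTO onto every uevent from the kset, so individual call sites never have to repeat them.
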
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4ec571c3d8a9..9b7edcf7bd49 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
195 kobject_put(&ls->kobj); 195 kobject_put(&ls->kobj);
196} 196}
197 197
198static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
199 struct kobj_uevent_env *env)
200{
201 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
202 add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
203 add_uevent_var(env, "LOCKPROTO=lock_dlm");
204 return 0;
205}
206
207static struct kset_uevent_ops gdlm_uevent_ops = {
208 .uevent = gdlm_uevent,
209};
210
211
198int gdlm_sysfs_init(void) 212int gdlm_sysfs_init(void)
199{ 213{
200 gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); 214 gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
201 if (!gdlm_kset) { 215 if (!gdlm_kset) {
202 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
203 return -ENOMEM; 217 return -ENOMEM;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bb2cc303ac29..7cacfde32194 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -19,7 +19,7 @@
19 19
20#include "gfs2.h" 20#include "gfs2.h"
21#include "incore.h" 21#include "incore.h"
22#include "ops_fstype.h" 22#include "super.h"
23#include "sys.h" 23#include "sys.h"
24#include "util.h" 24#include "util.h"
25#include "glock.h" 25#include "glock.h"
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo)
30 30
31 inode_init_once(&ip->i_inode); 31 inode_init_once(&ip->i_inode);
32 init_rwsem(&ip->i_rw_mutex); 32 init_rwsem(&ip->i_rw_mutex);
33 INIT_LIST_HEAD(&ip->i_trunc_list);
33 ip->i_alloc = NULL; 34 ip->i_alloc = NULL;
34} 35}
35 36
@@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo)
42 INIT_LIST_HEAD(&gl->gl_holders); 43 INIT_LIST_HEAD(&gl->gl_holders);
43 gl->gl_lvb = NULL; 44 gl->gl_lvb = NULL;
44 atomic_set(&gl->gl_lvb_count, 0); 45 atomic_set(&gl->gl_lvb_count, 0);
45 INIT_LIST_HEAD(&gl->gl_reclaim); 46 INIT_LIST_HEAD(&gl->gl_lru);
46 INIT_LIST_HEAD(&gl->gl_ail_list); 47 INIT_LIST_HEAD(&gl->gl_ail_list);
47 atomic_set(&gl->gl_ail_count, 0); 48 atomic_set(&gl->gl_ail_count, 0);
48} 49}
@@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void)
93 if (!gfs2_rgrpd_cachep) 94 if (!gfs2_rgrpd_cachep)
94 goto fail; 95 goto fail;
95 96
97 gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
98 sizeof(struct gfs2_quota_data),
99 0, 0, NULL);
100 if (!gfs2_quotad_cachep)
101 goto fail;
102
96 error = register_filesystem(&gfs2_fs_type); 103 error = register_filesystem(&gfs2_fs_type);
97 if (error) 104 if (error)
98 goto fail; 105 goto fail;
@@ -112,6 +119,9 @@ fail_unregister:
112fail: 119fail:
113 gfs2_glock_exit(); 120 gfs2_glock_exit();
114 121
122 if (gfs2_quotad_cachep)
123 kmem_cache_destroy(gfs2_quotad_cachep);
124
115 if (gfs2_rgrpd_cachep) 125 if (gfs2_rgrpd_cachep)
116 kmem_cache_destroy(gfs2_rgrpd_cachep); 126 kmem_cache_destroy(gfs2_rgrpd_cachep);
117 127
@@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void)
140 unregister_filesystem(&gfs2_fs_type); 150 unregister_filesystem(&gfs2_fs_type);
141 unregister_filesystem(&gfs2meta_fs_type); 151 unregister_filesystem(&gfs2meta_fs_type);
142 152
153 kmem_cache_destroy(gfs2_quotad_cachep);
143 kmem_cache_destroy(gfs2_rgrpd_cachep); 154 kmem_cache_destroy(gfs2_rgrpd_cachep);
144 kmem_cache_destroy(gfs2_bufdata_cachep); 155 kmem_cache_destroy(gfs2_bufdata_cachep);
145 kmem_cache_destroy(gfs2_inode_cachep); 156 kmem_cache_destroy(gfs2_inode_cachep);
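
main.c now gives struct gfs2_quota_data a dedicated slab cache, created during module init and destroyed both on the init failure path and at module exit. The lifecycle is the standard kmem_cache idiom; a self-contained sketch with stand-in names:

#include <linux/module.h>
#include <linux/slab.h>

struct qd_example {		/* stand-in for gfs2_quota_data */
	int dummy;
};

static struct kmem_cache *qd_cachep;

static int __init qd_example_init(void)
{
	qd_cachep = kmem_cache_create("qd_example",
				      sizeof(struct qd_example),
				      0, 0, NULL);	/* no constructor */
	return qd_cachep ? 0 : -ENOMEM;
}

static void __exit qd_example_exit(void)
{
	/* Legal only once every object has been returned to the cache. */
	kmem_cache_destroy(qd_cachep);
}

module_init(qd_example_init);
module_exit(qd_example_exit);

Note that the failure path in the hunk guards kmem_cache_destroy() with an if: on kernels of this era the function was not NULL-safe, so caches that may never have been created must be checked first.
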
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f96eb90a2cfa..3cb0a44ba023 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -32,7 +32,6 @@ enum {
32 Opt_debug, 32 Opt_debug,
33 Opt_nodebug, 33 Opt_nodebug,
34 Opt_upgrade, 34 Opt_upgrade,
35 Opt_num_glockd,
36 Opt_acl, 35 Opt_acl,
37 Opt_noacl, 36 Opt_noacl,
38 Opt_quota_off, 37 Opt_quota_off,
@@ -57,7 +56,6 @@ static const match_table_t tokens = {
57 {Opt_debug, "debug"}, 56 {Opt_debug, "debug"},
58 {Opt_nodebug, "nodebug"}, 57 {Opt_nodebug, "nodebug"},
59 {Opt_upgrade, "upgrade"}, 58 {Opt_upgrade, "upgrade"},
60 {Opt_num_glockd, "num_glockd=%d"},
61 {Opt_acl, "acl"}, 59 {Opt_acl, "acl"},
62 {Opt_noacl, "noacl"}, 60 {Opt_noacl, "noacl"},
63 {Opt_quota_off, "quota=off"}, 61 {Opt_quota_off, "quota=off"},
@@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
87 int error = 0; 85 int error = 0;
88 86
89 if (!remount) { 87 if (!remount) {
90 /* If someone preloaded options, use those instead */
91 spin_lock(&gfs2_sys_margs_lock);
92 if (gfs2_sys_margs) {
93 data = gfs2_sys_margs;
94 gfs2_sys_margs = NULL;
95 }
96 spin_unlock(&gfs2_sys_margs_lock);
97
98 /* Set some defaults */ 88 /* Set some defaults */
99 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
100 args->ar_quota = GFS2_QUOTA_DEFAULT; 89 args->ar_quota = GFS2_QUOTA_DEFAULT;
101 args->ar_data = GFS2_DATA_DEFAULT; 90 args->ar_data = GFS2_DATA_DEFAULT;
102 } 91 }
@@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
105 process them */ 94 process them */
106 95
107 for (options = data; (o = strsep(&options, ",")); ) { 96 for (options = data; (o = strsep(&options, ",")); ) {
108 int token, option; 97 int token;
109 substring_t tmp[MAX_OPT_ARGS]; 98 substring_t tmp[MAX_OPT_ARGS];
110 99
111 if (!*o) 100 if (!*o)
@@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
196 goto cant_remount; 185 goto cant_remount;
197 args->ar_upgrade = 1; 186 args->ar_upgrade = 1;
198 break; 187 break;
199 case Opt_num_glockd:
200 if ((error = match_int(&tmp[0], &option))) {
201 fs_info(sdp, "problem getting num_glockd\n");
202 goto out_error;
203 }
204
205 if (remount && option != args->ar_num_glockd)
206 goto cant_remount;
207 if (!option || option > GFS2_GLOCKD_MAX) {
208 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
209 GFS2_GLOCKD_MAX, option);
210 error = -EINVAL;
211 goto out_error;
212 }
213 args->ar_num_glockd = option;
214 break;
215 case Opt_acl: 188 case Opt_acl:
216 args->ar_posix_acl = 1; 189 args->ar_posix_acl = 1;
217 sdp->sd_vfs->s_flags |= MS_POSIXACL; 190 sdp->sd_vfs->s_flags |= MS_POSIXACL;
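
With num_glockd removed, the token table and the parse loop both shrink (the now-unused `option` int goes away too), but the parsing idiom is unchanged: strsep() walks the comma-separated mount string and match_token() classifies each fragment against the match_table_t. A minimal sketch of that idiom, with illustrative tokens:

#include <linux/parser.h>
#include <linux/string.h>

enum { Opt_debug, Opt_err };

static const match_table_t example_tokens = {
	{Opt_debug, "debug"},
	{Opt_err, NULL}		/* sentinel: anything unmatched */
};

static int parse_example(char *data, int *debug)
{
	substring_t args[MAX_OPT_ARGS];
	char *o;

	while ((o = strsep(&data, ",")) != NULL) {
		if (!*o)
			continue;	/* skip empty fragments */
		switch (match_token(o, example_tokens, args)) {
		case Opt_debug:
			*debug = 1;
			break;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
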
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c5..4ddab67867eb 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
210{ 210{
211 struct inode *inode = page->mapping->host; 211 struct inode *inode = page->mapping->host;
212 struct gfs2_sbd *sdp = GFS2_SB(inode); 212 struct gfs2_sbd *sdp = GFS2_SB(inode);
213 int error; 213 int ret;
214 int done_trans = 0; 214 int done_trans = 0;
215 215
216 error = gfs2_writepage_common(page, wbc);
217 if (error <= 0)
218 return error;
219
220 if (PageChecked(page)) { 216 if (PageChecked(page)) {
221 if (wbc->sync_mode != WB_SYNC_ALL) 217 if (wbc->sync_mode != WB_SYNC_ALL)
222 goto out_ignore; 218 goto out_ignore;
223 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); 219 ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
224 if (error) 220 if (ret)
225 goto out_ignore; 221 goto out_ignore;
226 done_trans = 1; 222 done_trans = 1;
227 } 223 }
228 error = __gfs2_jdata_writepage(page, wbc); 224 ret = gfs2_writepage_common(page, wbc);
225 if (ret > 0)
226 ret = __gfs2_jdata_writepage(page, wbc);
229 if (done_trans) 227 if (done_trans)
230 gfs2_trans_end(sdp); 228 gfs2_trans_end(sdp);
231 return error; 229 return ret;
232 230
233out_ignore: 231out_ignore:
234 redirty_page_for_writepage(wbc, page); 232 redirty_page_for_writepage(wbc, page);
@@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
453 451
454 kaddr = kmap_atomic(page, KM_USER0); 452 kaddr = kmap_atomic(page, KM_USER0);
455 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 453 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
456 ip->i_di.di_size); 454 ip->i_disksize);
457 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); 455 memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
458 kunmap_atomic(kaddr, KM_USER0); 456 kunmap_atomic(kaddr, KM_USER0);
459 flush_dcache_page(page); 457 flush_dcache_page(page);
460 brelse(dibh); 458 brelse(dibh);
@@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
627{ 625{
628 struct gfs2_inode *ip = GFS2_I(mapping->host); 626 struct gfs2_inode *ip = GFS2_I(mapping->host);
629 struct gfs2_sbd *sdp = GFS2_SB(mapping->host); 627 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
630 unsigned int data_blocks, ind_blocks, rblocks; 628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
631 int alloc_required; 629 int alloc_required;
632 int error = 0; 630 int error = 0;
633 struct gfs2_alloc *al; 631 struct gfs2_alloc *al;
@@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
641 if (unlikely(error)) 639 if (unlikely(error))
642 goto out_uninit; 640 goto out_uninit;
643 641
644 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
645 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 642 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
646 if (error) 643 if (error)
647 goto out_unlock; 644 goto out_unlock;
648 645
646 if (alloc_required || gfs2_is_jdata(ip))
647 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
648
649 if (alloc_required) { 649 if (alloc_required) {
650 al = gfs2_alloc_get(ip); 650 al = gfs2_alloc_get(ip);
651 if (!al) { 651 if (!al) {
@@ -675,7 +675,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
675 goto out_trans_fail; 675 goto out_trans_fail;
676 676
677 error = -ENOMEM; 677 error = -ENOMEM;
678 page = __grab_cache_page(mapping, index); 678 flags |= AOP_FLAG_NOFS;
679 page = grab_cache_page_write_begin(mapping, index, flags);
679 *pagep = page; 680 *pagep = page;
680 if (unlikely(!page)) 681 if (unlikely(!page))
681 goto out_endtrans; 682 goto out_endtrans;
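
Two related changes in this hunk: __grab_cache_page() is gone in 2.6.29, replaced by grab_cache_page_write_begin(), which accepts the write_begin flags; and OR-ing AOP_FLAG_NOFS into those flags makes the page allocation use GFP_NOFS, so it cannot recurse into the filesystem while the transaction opened just above is still pending. The call shape, as a sketch:

#include <linux/fs.h>
#include <linux/pagemap.h>

static struct page *get_write_page(struct address_space *mapping,
				   pgoff_t index, unsigned int flags)
{
	/* NOFS: the caller holds an open transaction; the allocation
	 * must not re-enter the filesystem and deadlock. */
	flags |= AOP_FLAG_NOFS;
	return grab_cache_page_write_begin(mapping, index, flags);
}
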
@@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
782 783
783 if (inode->i_size < to) { 784 if (inode->i_size < to) {
784 i_size_write(inode, to); 785 i_size_write(inode, to);
785 ip->i_di.di_size = inode->i_size; 786 ip->i_disksize = inode->i_size;
786 di->di_size = cpu_to_be64(inode->i_size); 787 di->di_size = cpu_to_be64(inode->i_size);
787 mark_inode_dirty(inode); 788 mark_inode_dirty(inode);
788 } 789 }
@@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
847 848
848 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 849 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
849 850
850 if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { 851 if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
851 di = (struct gfs2_dinode *)dibh->b_data; 852 di = (struct gfs2_dinode *)dibh->b_data;
852 ip->i_di.di_size = inode->i_size; 853 ip->i_disksize = inode->i_size;
853 di->di_size = cpu_to_be64(inode->i_size); 854 di->di_size = cpu_to_be64(inode->i_size);
854 mark_inode_dirty(inode); 855 mark_inode_dirty(inode);
855 } 856 }
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 4a5e676b4420..c2ad36330ca3 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -19,7 +19,7 @@
19#include "incore.h" 19#include "incore.h"
20#include "dir.h" 20#include "dir.h"
21#include "glock.h" 21#include "glock.h"
22#include "ops_dentry.h" 22#include "super.h"
23#include "util.h" 23#include "util.h"
24#include "inode.h" 24#include "inode.h"
25 25
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
deleted file mode 100644
index 5caa3db4d3f5..000000000000
--- a/fs/gfs2/ops_dentry.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13#include <linux/dcache.h>
14
15extern struct dentry_operations gfs2_dops;
16
17#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index bbb8c36403a9..7fdeb14ddd1a 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,8 +22,7 @@
22#include "glock.h" 22#include "glock.h"
23#include "glops.h" 23#include "glops.h"
24#include "inode.h" 24#include "inode.h"
25#include "ops_dentry.h" 25#include "super.h"
26#include "ops_fstype.h"
27#include "rgrp.h" 26#include "rgrp.h"
28#include "util.h" 27#include "util.h"
29 28
@@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
214 } 213 }
215 214
216 error = -EIO; 215 error = -EIO;
217 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { 216 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
218 iput(inode); 217 iput(inode);
219 goto fail; 218 goto fail;
220 } 219 }
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3a747f8e2188..93fe41b67f97 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -39,7 +39,6 @@
39#include "util.h" 39#include "util.h"
40#include "eaops.h" 40#include "eaops.h"
41#include "ops_address.h" 41#include "ops_address.h"
42#include "ops_inode.h"
43 42
44/** 43/**
45 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
158 if (error) 157 if (error)
159 return error; 158 return error;
160 159
161 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); 160 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
162 if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) 161 if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
163 fsflags |= FS_JOURNAL_DATA_FL; 162 fsflags |= FS_JOURNAL_DATA_FL;
164 if (put_user(fsflags, ptr)) 163 if (put_user(fsflags, ptr))
165 error = -EFAULT; 164 error = -EFAULT;
@@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
172void gfs2_set_inode_flags(struct inode *inode) 171void gfs2_set_inode_flags(struct inode *inode)
173{ 172{
174 struct gfs2_inode *ip = GFS2_I(inode); 173 struct gfs2_inode *ip = GFS2_I(inode);
175 struct gfs2_dinode_host *di = &ip->i_di;
176 unsigned int flags = inode->i_flags; 174 unsigned int flags = inode->i_flags;
177 175
178 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 176 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
179 if (di->di_flags & GFS2_DIF_IMMUTABLE) 177 if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
180 flags |= S_IMMUTABLE; 178 flags |= S_IMMUTABLE;
181 if (di->di_flags & GFS2_DIF_APPENDONLY) 179 if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
182 flags |= S_APPEND; 180 flags |= S_APPEND;
183 if (di->di_flags & GFS2_DIF_NOATIME) 181 if (ip->i_diskflags & GFS2_DIF_NOATIME)
184 flags |= S_NOATIME; 182 flags |= S_NOATIME;
185 if (di->di_flags & GFS2_DIF_SYNC) 183 if (ip->i_diskflags & GFS2_DIF_SYNC)
186 flags |= S_SYNC; 184 flags |= S_SYNC;
187 inode->i_flags = flags; 185 inode->i_flags = flags;
188} 186}
@@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
221 if (error) 219 if (error)
222 goto out_drop_write; 220 goto out_drop_write;
223 221
224 flags = ip->i_di.di_flags; 222 flags = ip->i_diskflags;
225 new_flags = (flags & ~mask) | (reqflags & mask); 223 new_flags = (flags & ~mask) | (reqflags & mask);
226 if ((new_flags ^ flags) == 0) 224 if ((new_flags ^ flags) == 0)
227 goto out; 225 goto out;
@@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
260 if (error) 258 if (error)
261 goto out_trans_end; 259 goto out_trans_end;
262 gfs2_trans_add_bh(ip->i_gl, bh, 1); 260 gfs2_trans_add_bh(ip->i_gl, bh, 1);
263 ip->i_di.di_flags = new_flags; 261 ip->i_diskflags = new_flags;
264 gfs2_dinode_out(ip, bh->b_data); 262 gfs2_dinode_out(ip, bh->b_data);
265 brelse(bh); 263 brelse(bh);
266 gfs2_set_inode_flags(inode); 264 gfs2_set_inode_flags(inode);
@@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
344 struct gfs2_inode *ip = GFS2_I(inode); 342 struct gfs2_inode *ip = GFS2_I(inode);
345 struct gfs2_sbd *sdp = GFS2_SB(inode); 343 struct gfs2_sbd *sdp = GFS2_SB(inode);
346 unsigned long last_index; 344 unsigned long last_index;
347 u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); 345 u64 pos = page->index << PAGE_CACHE_SHIFT;
348 unsigned int data_blocks, ind_blocks, rblocks; 346 unsigned int data_blocks, ind_blocks, rblocks;
349 int alloc_required = 0; 347 int alloc_required = 0;
350 struct gfs2_holder gh; 348 struct gfs2_holder gh;
@@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
357 goto out; 355 goto out;
358 356
359 set_bit(GIF_SW_PAGED, &ip->i_flags); 357 set_bit(GIF_SW_PAGED, &ip->i_flags);
360 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
361 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 358 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
362 if (ret || !alloc_required) 359 if (ret || !alloc_required)
363 goto out_unlock; 360 goto out_unlock;
@@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
369 ret = gfs2_quota_lock_check(ip); 366 ret = gfs2_quota_lock_check(ip);
370 if (ret) 367 if (ret)
371 goto out_alloc_put; 368 goto out_alloc_put;
369 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
372 al->al_requested = data_blocks + ind_blocks; 370 al->al_requested = data_blocks + ind_blocks;
373 ret = gfs2_inplace_reserve(ip); 371 ret = gfs2_inplace_reserve(ip);
374 if (ret) 372 if (ret)
@@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
479 goto fail; 477 goto fail;
480 478
481 if (!(file->f_flags & O_LARGEFILE) && 479 if (!(file->f_flags & O_LARGEFILE) &&
482 ip->i_di.di_size > MAX_NON_LFS) { 480 ip->i_disksize > MAX_NON_LFS) {
483 error = -EOVERFLOW; 481 error = -EOVERFLOW;
484 goto fail_gunlock; 482 goto fail_gunlock;
485 } 483 }
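
Among the ops_file.c hunks, note the gfs2_page_mkwrite() fix: the old byte offset `page->index << (PAGE_CACHE_SIZE - inode->i_blkbits)` used the page size (4096) where a shift count was needed, producing an absurd, undefined shift; the replacement shifts by PAGE_CACHE_SHIFT. A worked check of the corrected formula (values assume 4 KiB pages):

#include <linux/pagemap.h>
#include <linux/types.h>

static u64 page_byte_offset(pgoff_t index)
{
	/* PAGE_CACHE_SHIFT == 12 for 4 KiB pages,
	 * so index 3 maps to byte offset 3 << 12 == 12288. */
	return (u64)index << PAGE_CACHE_SHIFT;
}

The related reorder moves gfs2_write_calc_reserv() until after alloc_required is known, so the reservation arithmetic only runs when blocks will actually be allocated.
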
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b117fcf2c4f5..f91eebdde581 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -22,20 +22,18 @@
22#include "gfs2.h" 22#include "gfs2.h"
23#include "incore.h" 23#include "incore.h"
24#include "bmap.h" 24#include "bmap.h"
25#include "daemon.h"
26#include "glock.h" 25#include "glock.h"
27#include "glops.h" 26#include "glops.h"
28#include "inode.h" 27#include "inode.h"
29#include "mount.h" 28#include "mount.h"
30#include "ops_fstype.h"
31#include "ops_dentry.h"
32#include "ops_super.h"
33#include "recovery.h" 29#include "recovery.h"
34#include "rgrp.h" 30#include "rgrp.h"
35#include "super.h" 31#include "super.h"
36#include "sys.h" 32#include "sys.h"
37#include "util.h" 33#include "util.h"
38#include "log.h" 34#include "log.h"
35#include "quota.h"
36#include "dir.h"
39 37
40#define DO 0 38#define DO 0
41#define UNDO 1 39#define UNDO 1
@@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
58{ 56{
59 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
60 58
61 gt->gt_demote_secs = 300;
62 gt->gt_incore_log_blocks = 1024; 59 gt->gt_incore_log_blocks = 1024;
63 gt->gt_log_flush_secs = 60; 60 gt->gt_log_flush_secs = 60;
64 gt->gt_recoverd_secs = 60; 61 gt->gt_recoverd_secs = 60;
65 gt->gt_logd_secs = 1; 62 gt->gt_logd_secs = 1;
66 gt->gt_quotad_secs = 5;
67 gt->gt_quota_simul_sync = 64; 63 gt->gt_quota_simul_sync = 64;
68 gt->gt_quota_warn_period = 10; 64 gt->gt_quota_warn_period = 10;
69 gt->gt_quota_scale_num = 1; 65 gt->gt_quota_scale_num = 1;
@@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
91 87
92 gfs2_tune_init(&sdp->sd_tune); 88 gfs2_tune_init(&sdp->sd_tune);
93 89
94 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
95 spin_lock_init(&sdp->sd_reclaim_lock);
96 init_waitqueue_head(&sdp->sd_reclaim_wq);
97
98 mutex_init(&sdp->sd_inum_mutex); 90 mutex_init(&sdp->sd_inum_mutex);
99 spin_lock_init(&sdp->sd_statfs_spin); 91 spin_lock_init(&sdp->sd_statfs_spin);
100 92
@@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
110 INIT_LIST_HEAD(&sdp->sd_quota_list); 102 INIT_LIST_HEAD(&sdp->sd_quota_list);
111 spin_lock_init(&sdp->sd_quota_spin); 103 spin_lock_init(&sdp->sd_quota_spin);
112 mutex_init(&sdp->sd_quota_mutex); 104 mutex_init(&sdp->sd_quota_mutex);
105 init_waitqueue_head(&sdp->sd_quota_wait);
106 INIT_LIST_HEAD(&sdp->sd_trunc_list);
107 spin_lock_init(&sdp->sd_trunc_lock);
113 108
114 spin_lock_init(&sdp->sd_log_lock); 109 spin_lock_init(&sdp->sd_log_lock);
115 110
@@ -443,24 +438,11 @@ out:
443static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, 438static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
444 int undo) 439 int undo)
445{ 440{
446 struct task_struct *p;
447 int error = 0; 441 int error = 0;
448 442
449 if (undo) 443 if (undo)
450 goto fail_trans; 444 goto fail_trans;
451 445
452 for (sdp->sd_glockd_num = 0;
453 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
454 sdp->sd_glockd_num++) {
455 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
456 error = IS_ERR(p);
457 if (error) {
458 fs_err(sdp, "can't start glockd thread: %d\n", error);
459 goto fail;
460 }
461 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
462 }
463
464 error = gfs2_glock_nq_num(sdp, 446 error = gfs2_glock_nq_num(sdp,
465 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, 447 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
466 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, 448 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
@@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
493 fs_err(sdp, "can't create transaction glock: %d\n", error); 475 fs_err(sdp, "can't create transaction glock: %d\n", error);
494 goto fail_rename; 476 goto fail_rename;
495 } 477 }
496 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
497 478
498 return 0; 479 return 0;
499 480
@@ -506,9 +487,6 @@ fail_live:
506fail_mount: 487fail_mount:
507 gfs2_glock_dq_uninit(mount_gh); 488 gfs2_glock_dq_uninit(mount_gh);
508fail: 489fail:
509 while (sdp->sd_glockd_num--)
510 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
511
512 return error; 490 return error;
513} 491}
514 492
@@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
620 598
621 prev_db = 0; 599 prev_db = 0;
622 600
623 for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { 601 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
624 bh.b_state = 0; 602 bh.b_state = 0;
625 bh.b_blocknr = 0; 603 bh.b_blocknr = 0;
626 bh.b_size = 1 << ip->i_inode.i_blkbits; 604 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
661 sdp->sd_lockstruct.ls_lockspace); 639 sdp->sd_lockstruct.ls_lockspace);
662} 640}
663 641
642/**
643 * gfs2_jindex_hold - Grab a lock on the jindex
644 * @sdp: The GFS2 superblock
645 * @ji_gh: the holder for the jindex glock
646 *
647 * Returns: errno
648 */
649
650static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
651{
652 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
653 struct qstr name;
654 char buf[20];
655 struct gfs2_jdesc *jd;
656 int error;
657
658 name.name = buf;
659
660 mutex_lock(&sdp->sd_jindex_mutex);
661
662 for (;;) {
663 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
664 if (error)
665 break;
666
667 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
668 name.hash = gfs2_disk_hash(name.name, name.len);
669
670 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
671 if (error == -ENOENT) {
672 error = 0;
673 break;
674 }
675
676 gfs2_glock_dq_uninit(ji_gh);
677
678 if (error)
679 break;
680
681 error = -ENOMEM;
682 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
683 if (!jd)
684 break;
685
686 INIT_LIST_HEAD(&jd->extent_list);
687 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
688 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
689 if (!jd->jd_inode)
690 error = -ENOENT;
691 else
692 error = PTR_ERR(jd->jd_inode);
693 kfree(jd);
694 break;
695 }
696
697 spin_lock(&sdp->sd_jindex_spin);
698 jd->jd_jid = sdp->sd_journals++;
699 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
700 spin_unlock(&sdp->sd_jindex_spin);
701 }
702
703 mutex_unlock(&sdp->sd_jindex_mutex);
704
705 return error;
706}
707
664static int init_journal(struct gfs2_sbd *sdp, int undo) 708static int init_journal(struct gfs2_sbd *sdp, int undo)
665{ 709{
666 struct inode *master = sdp->sd_master_dir->d_inode; 710 struct inode *master = sdp->sd_master_dir->d_inode;
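
The newly added (here static) gfs2_jindex_hold() probes directory entries named journal0, journal1, ... under the jindex glock until gfs2_dir_check() returns -ENOENT, allocating a gfs2_jdesc for each journal found. One detail worth calling out is the lookup normalization: gfs2_lookupi() can yield either NULL or an ERR_PTR, and both must fold into a plain errno. That idiom, as a sketch:

#include <linux/err.h>
#include <linux/fs.h>

/* Fold a lookup result that may be NULL or ERR_PTR into an errno. */
static int lookup_errno(struct inode *inode)
{
	if (!inode)
		return -ENOENT;
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	return 0;
}
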
@@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
681 return PTR_ERR(sdp->sd_jindex); 725 return PTR_ERR(sdp->sd_jindex);
682 } 726 }
683 ip = GFS2_I(sdp->sd_jindex); 727 ip = GFS2_I(sdp->sd_jindex);
684 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
685 728
686 /* Load in the journal index special file */ 729 /* Load in the journal index special file */
687 730
@@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
832 goto fail_statfs; 875 goto fail_statfs;
833 } 876 }
834 ip = GFS2_I(sdp->sd_rindex); 877 ip = GFS2_I(sdp->sd_rindex);
835 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
836 sdp->sd_rindex_uptodate = 0; 878 sdp->sd_rindex_uptodate = 0;
837 879
838 /* Read in the quota inode */ 880 /* Read in the quota inode */
@@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
973 } 1015 }
974 sdp->sd_logd_process = p; 1016 sdp->sd_logd_process = p;
975 1017
976 sdp->sd_statfs_sync_time = jiffies;
977 sdp->sd_quota_sync_time = jiffies;
978
979 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); 1018 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
980 error = IS_ERR(p); 1019 error = IS_ERR(p);
981 if (error) { 1020 if (error) {
@@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1224static void gfs2_kill_sb(struct super_block *sb) 1263static void gfs2_kill_sb(struct super_block *sb)
1225{ 1264{
1226 struct gfs2_sbd *sdp = sb->s_fs_info; 1265 struct gfs2_sbd *sdp = sb->s_fs_info;
1227 if (sdp) { 1266
1228 gfs2_meta_syncfs(sdp); 1267 if (sdp == NULL) {
1229 dput(sdp->sd_root_dir); 1268 kill_block_super(sb);
1230 dput(sdp->sd_master_dir); 1269 return;
1231 sdp->sd_root_dir = NULL;
1232 sdp->sd_master_dir = NULL;
1233 } 1270 }
1271
1272 gfs2_meta_syncfs(sdp);
1273 dput(sdp->sd_root_dir);
1274 dput(sdp->sd_master_dir);
1275 sdp->sd_root_dir = NULL;
1276 sdp->sd_master_dir = NULL;
1234 shrink_dcache_sb(sb); 1277 shrink_dcache_sb(sb);
1235 kill_block_super(sb); 1278 kill_block_super(sb);
1236 if (sdp) 1279 gfs2_delete_debugfs_file(sdp);
1237 gfs2_delete_debugfs_file(sdp); 1280 kfree(sdp);
1238} 1281}
1239 1282
1240struct file_system_type gfs2_fs_type = { 1283struct file_system_type gfs2_fs_type = {
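
gfs2_kill_sb() is restructured around an early return for the never-fully-mounted case, and, paired with the ops_super.c hunk below that drops kfree(sdp) from gfs2_put_super(), the private superblock data is now freed here, after kill_block_super() and the debugfs teardown have finished using it. The shape, as a generic sketch (the struct name is a stand-in):

#include <linux/fs.h>
#include <linux/slab.h>

struct example_sbd { int dummy; };

static void example_kill_sb(struct super_block *sb)
{
	struct example_sbd *sdp = sb->s_fs_info;

	if (sdp == NULL) {	/* mount failed before s_fs_info was set */
		kill_block_super(sb);
		return;
	}
	/* ...sync and drop fs-private dentry references first... */
	kill_block_super(sb);	/* generic teardown, invokes ->put_super() */
	kfree(sdp);		/* freed last, after all users are done */
}
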
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
deleted file mode 100644
index da8490511836..000000000000
--- a/fs/gfs2/ops_fstype.h
+++ /dev/null
@@ -1,19 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type;
17extern const struct export_operations gfs2_export_ops;
18
19#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d232991b9046..49877546beb9 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -19,6 +19,7 @@
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/crc32.h> 20#include <linux/crc32.h>
21#include <linux/lm_interface.h> 21#include <linux/lm_interface.h>
22#include <linux/fiemap.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#include "gfs2.h" 25#include "gfs2.h"
@@ -31,12 +32,11 @@
31#include "glock.h" 32#include "glock.h"
32#include "inode.h" 33#include "inode.h"
33#include "meta_io.h" 34#include "meta_io.h"
34#include "ops_dentry.h"
35#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
38#include "trans.h" 37#include "trans.h"
39#include "util.h" 38#include "util.h"
39#include "super.h"
40 40
41/** 41/**
42 * gfs2_create - Create a file 42 * gfs2_create - Create a file
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
185 if (!dip->i_inode.i_nlink) 185 if (!dip->i_inode.i_nlink)
186 goto out_gunlock; 186 goto out_gunlock;
187 error = -EFBIG; 187 error = -EFBIG;
188 if (dip->i_di.di_entries == (u32)-1) 188 if (dip->i_entries == (u32)-1)
189 goto out_gunlock; 189 goto out_gunlock;
190 error = -EPERM; 190 error = -EPERM;
191 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 191 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
371 371
372 ip = ghs[1].gh_gl->gl_object; 372 ip = ghs[1].gh_gl->gl_object;
373 373
374 ip->i_di.di_size = size; 374 ip->i_disksize = size;
375 375
376 error = gfs2_meta_inode_buffer(ip, &dibh); 376 error = gfs2_meta_inode_buffer(ip, &dibh);
377 377
@@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
425 ip = ghs[1].gh_gl->gl_object; 425 ip = ghs[1].gh_gl->gl_object;
426 426
427 ip->i_inode.i_nlink = 2; 427 ip->i_inode.i_nlink = 2;
428 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 428 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
429 ip->i_di.di_flags |= GFS2_DIF_JDATA; 429 ip->i_diskflags |= GFS2_DIF_JDATA;
430 ip->i_di.di_entries = 2; 430 ip->i_entries = 2;
431 431
432 error = gfs2_meta_inode_buffer(ip, &dibh); 432 error = gfs2_meta_inode_buffer(ip, &dibh);
433 433
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
517 if (error) 517 if (error)
518 goto out_gunlock; 518 goto out_gunlock;
519 519
520 if (ip->i_di.di_entries < 2) { 520 if (ip->i_entries < 2) {
521 if (gfs2_consist_inode(ip)) 521 if (gfs2_consist_inode(ip))
522 gfs2_dinode_print(ip); 522 gfs2_dinode_print(ip);
523 error = -EIO; 523 error = -EIO;
524 goto out_gunlock; 524 goto out_gunlock;
525 } 525 }
526 if (ip->i_di.di_entries > 2) { 526 if (ip->i_entries > 2) {
527 error = -ENOTEMPTY; 527 error = -ENOTEMPTY;
528 goto out_gunlock; 528 goto out_gunlock;
529 } 529 }
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
726 goto out_gunlock; 726 goto out_gunlock;
727 727
728 if (S_ISDIR(nip->i_inode.i_mode)) { 728 if (S_ISDIR(nip->i_inode.i_mode)) {
729 if (nip->i_di.di_entries < 2) { 729 if (nip->i_entries < 2) {
730 if (gfs2_consist_inode(nip)) 730 if (gfs2_consist_inode(nip))
731 gfs2_dinode_print(nip); 731 gfs2_dinode_print(nip);
732 error = -EIO; 732 error = -EIO;
733 goto out_gunlock; 733 goto out_gunlock;
734 } 734 }
735 if (nip->i_di.di_entries > 2) { 735 if (nip->i_entries > 2) {
736 error = -ENOTEMPTY; 736 error = -ENOTEMPTY;
737 goto out_gunlock; 737 goto out_gunlock;
738 } 738 }
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 error = -EINVAL; 758 error = -EINVAL;
759 goto out_gunlock; 759 goto out_gunlock;
760 } 760 }
761 if (ndip->i_di.di_entries == (u32)-1) { 761 if (ndip->i_entries == (u32)-1) {
762 error = -EFBIG; 762 error = -EFBIG;
763 goto out_gunlock; 763 goto out_gunlock;
764 } 764 }
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
990 struct gfs2_sbd *sdp = GFS2_SB(inode); 990 struct gfs2_sbd *sdp = GFS2_SB(inode);
991 int error; 991 int error;
992 992
993 if (attr->ia_size != ip->i_di.di_size) { 993 if (attr->ia_size != ip->i_disksize) {
994 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 994 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
995 if (error) 995 if (error)
996 return error; 996 return error;
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1001 } 1001 }
1002 1002
1003 error = gfs2_truncatei(ip, attr->ia_size); 1003 error = gfs2_truncatei(ip, attr->ia_size);
1004 if (error && (inode->i_size != ip->i_di.di_size)) 1004 if (error && (inode->i_size != ip->i_disksize))
1005 i_size_write(inode, ip->i_di.di_size); 1005 i_size_write(inode, ip->i_disksize);
1006 1006
1007 return error; 1007 return error;
1008} 1008}
@@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1212 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); 1212 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1213} 1213}
1214 1214
1215static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1216 u64 start, u64 len)
1217{
1218 struct gfs2_inode *ip = GFS2_I(inode);
1219 struct gfs2_holder gh;
1220 int ret;
1221
1222 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
1223 if (ret)
1224 return ret;
1225
1226 mutex_lock(&inode->i_mutex);
1227
1228 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
1229 if (ret)
1230 goto out;
1231
1232 if (gfs2_is_stuffed(ip)) {
1233 u64 phys = ip->i_no_addr << inode->i_blkbits;
1234 u64 size = i_size_read(inode);
1235 u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
1236 FIEMAP_EXTENT_DATA_INLINE;
1237 phys += sizeof(struct gfs2_dinode);
1238 phys += start;
1239 if (start + len > size)
1240 len = size - start;
1241 if (start < size)
1242 ret = fiemap_fill_next_extent(fieinfo, start, phys,
1243 len, flags);
1244 if (ret == 1)
1245 ret = 0;
1246 } else {
1247 ret = __generic_block_fiemap(inode, fieinfo, start, len,
1248 gfs2_block_map);
1249 }
1250
1251 gfs2_glock_dq_uninit(&gh);
1252out:
1253 mutex_unlock(&inode->i_mutex);
1254 return ret;
1255}
1256
1215const struct inode_operations gfs2_file_iops = { 1257const struct inode_operations gfs2_file_iops = {
1216 .permission = gfs2_permission, 1258 .permission = gfs2_permission,
1217 .setattr = gfs2_setattr, 1259 .setattr = gfs2_setattr,
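
The new gfs2_fiemap() backs the FIEMAP ioctl: under a shared glock, a stuffed (inline) inode is reported as a single extent flagged DATA_INLINE | NOT_ALIGNED | LAST whose physical address is the dinode block plus the header size, while unstuffed files are handed to __generic_block_fiemap() over gfs2_block_map(). From user space the interface looks like this; a hedged sketch with error handling trimmed:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY);
	struct fiemap *fm = calloc(1, sizeof(*fm) +
				      sizeof(struct fiemap_extent));

	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* the one flag gfs2 accepts */
	fm->fm_extent_count = 1;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0 && fm->fm_mapped_extents)
		printf("extent: logical %llu phys %llu flags %#x\n",
		       (unsigned long long)fm->fm_extents[0].fe_logical,
		       (unsigned long long)fm->fm_extents[0].fe_physical,
		       fm->fm_extents[0].fe_flags);
	free(fm);
	return 0;
}
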
@@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = {
1220 .getxattr = gfs2_getxattr, 1262 .getxattr = gfs2_getxattr,
1221 .listxattr = gfs2_listxattr, 1263 .listxattr = gfs2_listxattr,
1222 .removexattr = gfs2_removexattr, 1264 .removexattr = gfs2_removexattr,
1265 .fiemap = gfs2_fiemap,
1223}; 1266};
1224 1267
1225const struct inode_operations gfs2_dir_iops = { 1268const struct inode_operations gfs2_dir_iops = {
@@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = {
1239 .getxattr = gfs2_getxattr, 1282 .getxattr = gfs2_getxattr,
1240 .listxattr = gfs2_listxattr, 1283 .listxattr = gfs2_listxattr,
1241 .removexattr = gfs2_removexattr, 1284 .removexattr = gfs2_removexattr,
1285 .fiemap = gfs2_fiemap,
1242}; 1286};
1243 1287
1244const struct inode_operations gfs2_symlink_iops = { 1288const struct inode_operations gfs2_symlink_iops = {
@@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = {
1251 .getxattr = gfs2_getxattr, 1295 .getxattr = gfs2_getxattr,
1252 .listxattr = gfs2_listxattr, 1296 .listxattr = gfs2_listxattr,
1253 .removexattr = gfs2_removexattr, 1297 .removexattr = gfs2_removexattr,
1298 .fiemap = gfs2_fiemap,
1254}; 1299};
1255 1300
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
deleted file mode 100644
index 14b4b797622a..000000000000
--- a/fs/gfs2/ops_inode.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13#include <linux/fs.h>
14
15extern const struct inode_operations gfs2_file_iops;
16extern const struct inode_operations gfs2_dir_iops;
17extern const struct inode_operations gfs2_symlink_iops;
18extern const struct file_operations gfs2_file_fops;
19extern const struct file_operations gfs2_dir_fops;
20extern const struct file_operations gfs2_file_fops_nolock;
21extern const struct file_operations gfs2_dir_fops_nolock;
22
23extern void gfs2_set_inode_flags(struct inode *inode);
24
25#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index d5355d9b5926..320323d03479 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -28,7 +28,6 @@
28#include "inode.h" 28#include "inode.h"
29#include "log.h" 29#include "log.h"
30#include "mount.h" 30#include "mount.h"
31#include "ops_super.h"
32#include "quota.h" 31#include "quota.h"
33#include "recovery.h" 32#include "recovery.h"
34#include "rgrp.h" 33#include "rgrp.h"
@@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb)
143 kthread_stop(sdp->sd_quotad_process); 142 kthread_stop(sdp->sd_quotad_process);
144 kthread_stop(sdp->sd_logd_process); 143 kthread_stop(sdp->sd_logd_process);
145 kthread_stop(sdp->sd_recoverd_process); 144 kthread_stop(sdp->sd_recoverd_process);
146 while (sdp->sd_glockd_num--)
147 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
148 145
149 if (!(sb->s_flags & MS_RDONLY)) { 146 if (!(sb->s_flags & MS_RDONLY)) {
150 error = gfs2_make_fs_ro(sdp); 147 error = gfs2_make_fs_ro(sdp);
@@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb)
185 182
186 /* At this point, we're through participating in the lockspace */ 183 /* At this point, we're through participating in the lockspace */
187 gfs2_sys_fs_del(sdp); 184 gfs2_sys_fs_del(sdp);
188 kfree(sdp);
189} 185}
190 186
191/** 187/**
@@ -215,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
215} 211}
216 212
217/** 213/**
218 * gfs2_write_super_lockfs - prevent further writes to the filesystem 214 * gfs2_freeze - prevent further writes to the filesystem
219 * @sb: the VFS structure for the filesystem 215 * @sb: the VFS structure for the filesystem
220 * 216 *
221 */ 217 */
222 218
223static void gfs2_write_super_lockfs(struct super_block *sb) 219static int gfs2_freeze(struct super_block *sb)
224{ 220{
225 struct gfs2_sbd *sdp = sb->s_fs_info; 221 struct gfs2_sbd *sdp = sb->s_fs_info;
226 int error; 222 int error;
227 223
228 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 224 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
229 return; 225 return -EINVAL;
230 226
231 for (;;) { 227 for (;;) {
232 error = gfs2_freeze_fs(sdp); 228 error = gfs2_freeze_fs(sdp);
@@ -246,17 +242,150 @@ static void gfs2_write_super_lockfs(struct super_block *sb)
246 fs_err(sdp, "retrying...\n"); 242 fs_err(sdp, "retrying...\n");
247 msleep(1000); 243 msleep(1000);
248 } 244 }
245 return 0;
249} 246}
250 247
251/** 248/**
252 * gfs2_unlockfs - reallow writes to the filesystem 249 * gfs2_unfreeze - reallow writes to the filesystem
253 * @sb: the VFS structure for the filesystem 250 * @sb: the VFS structure for the filesystem
254 * 251 *
255 */ 252 */
256 253
257static void gfs2_unlockfs(struct super_block *sb) 254static int gfs2_unfreeze(struct super_block *sb)
258{ 255{
259 gfs2_unfreeze_fs(sb->s_fs_info); 256 gfs2_unfreeze_fs(sb->s_fs_info);
257 return 0;
258}
259
260/**
261 * statfs_slow_fill - fill in the sc for a given RG
262 * @rgd: the RG
263 * @sc: the sc structure
264 *
265 * Returns: 0 on success, -ESTALE if the LVB is invalid
266 */
267
268static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
269 struct gfs2_statfs_change_host *sc)
270{
271 gfs2_rgrp_verify(rgd);
272 sc->sc_total += rgd->rd_data;
273 sc->sc_free += rgd->rd_free;
274 sc->sc_dinodes += rgd->rd_dinodes;
275 return 0;
276}
277
278/**
279 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
280 * @sdp: the filesystem
281 * @sc: the sc info that will be returned
282 *
283 * Any error (other than a signal) will cause this routine to fall back
284 * to the synchronous version.
285 *
286 * FIXME: This really shouldn't busy wait like this.
287 *
288 * Returns: errno
289 */
290
291static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
292{
293 struct gfs2_holder ri_gh;
294 struct gfs2_rgrpd *rgd_next;
295 struct gfs2_holder *gha, *gh;
296 unsigned int slots = 64;
297 unsigned int x;
298 int done;
299 int error = 0, err;
300
301 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
302 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
303 if (!gha)
304 return -ENOMEM;
305
306 error = gfs2_rindex_hold(sdp, &ri_gh);
307 if (error)
308 goto out;
309
310 rgd_next = gfs2_rgrpd_get_first(sdp);
311
312 for (;;) {
313 done = 1;
314
315 for (x = 0; x < slots; x++) {
316 gh = gha + x;
317
318 if (gh->gh_gl && gfs2_glock_poll(gh)) {
319 err = gfs2_glock_wait(gh);
320 if (err) {
321 gfs2_holder_uninit(gh);
322 error = err;
323 } else {
324 if (!error)
325 error = statfs_slow_fill(
326 gh->gh_gl->gl_object, sc);
327 gfs2_glock_dq_uninit(gh);
328 }
329 }
330
331 if (gh->gh_gl)
332 done = 0;
333 else if (rgd_next && !error) {
334 error = gfs2_glock_nq_init(rgd_next->rd_gl,
335 LM_ST_SHARED,
336 GL_ASYNC,
337 gh);
338 rgd_next = gfs2_rgrpd_get_next(rgd_next);
339 done = 0;
340 }
341
342 if (signal_pending(current))
343 error = -ERESTARTSYS;
344 }
345
346 if (done)
347 break;
348
349 yield();
350 }
351
352 gfs2_glock_dq_uninit(&ri_gh);
353
354out:
355 kfree(gha);
356 return error;
357}
358
359/**
360 * gfs2_statfs_i - Do a statfs
361 * @sdp: the filesystem
362 * @sg: the sg structure
363 *
364 * Returns: errno
365 */
366
367static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
368{
369 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
370 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
371
372 spin_lock(&sdp->sd_statfs_spin);
373
374 *sc = *m_sc;
375 sc->sc_total += l_sc->sc_total;
376 sc->sc_free += l_sc->sc_free;
377 sc->sc_dinodes += l_sc->sc_dinodes;
378
379 spin_unlock(&sdp->sd_statfs_spin);
380
381 if (sc->sc_free < 0)
382 sc->sc_free = 0;
383 if (sc->sc_free > sc->sc_total)
384 sc->sc_free = sc->sc_total;
385 if (sc->sc_dinodes < 0)
386 sc->sc_dinodes = 0;
387
388 return 0;
260} 389}
261 390
262/** 391/**
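Editor's note: gfs2_statfs_i(), newly moved into this file above, follows a common pattern: snapshot the master counters and fold in the per-node local deltas under sd_statfs_spin, and only then clamp values that can go transiently negative. A hedged standalone sketch of that aggregation (all myfs_* names are invented for illustration, not real symbols):

#include <linux/types.h>
#include <linux/spinlock.h>

struct myfs_counters {
	s64 total;
	s64 free;
	s64 dinodes;
};

/* Take a consistent snapshot of master + local under the lock,
 * then clamp transiently impossible values outside it. */
static void myfs_statfs_snapshot(spinlock_t *lock,
				 const struct myfs_counters *master,
				 const struct myfs_counters *local,
				 struct myfs_counters *out)
{
	spin_lock(lock);
	*out = *master;
	out->total += local->total;
	out->free += local->free;
	out->dinodes += local->dinodes;
	spin_unlock(lock);

	if (out->free < 0)
		out->free = 0;
	if (out->free > out->total)
		out->free = out->total;
	if (out->dinodes < 0)
		out->dinodes = 0;
}

The clamping is done outside the lock on the private copy, so the lock is held only for the struct copy and three additions.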
@@ -370,7 +499,6 @@ static void gfs2_clear_inode(struct inode *inode)
370 */ 499 */
371 if (test_bit(GIF_USER, &ip->i_flags)) { 500 if (test_bit(GIF_USER, &ip->i_flags)) {
372 ip->i_gl->gl_object = NULL; 501 ip->i_gl->gl_object = NULL;
373 gfs2_glock_schedule_for_reclaim(ip->i_gl);
374 gfs2_glock_put(ip->i_gl); 502 gfs2_glock_put(ip->i_gl);
375 ip->i_gl = NULL; 503 ip->i_gl = NULL;
376 if (ip->i_iopen_gh.gh_gl) { 504 if (ip->i_iopen_gh.gh_gl) {
@@ -423,8 +551,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
423 seq_printf(s, ",debug"); 551 seq_printf(s, ",debug");
424 if (args->ar_upgrade) 552 if (args->ar_upgrade)
425 seq_printf(s, ",upgrade"); 553 seq_printf(s, ",upgrade");
426 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
427 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
428 if (args->ar_posix_acl) 554 if (args->ar_posix_acl)
429 seq_printf(s, ",acl"); 555 seq_printf(s, ",acl");
430 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 556 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -494,16 +620,16 @@ static void gfs2_delete_inode(struct inode *inode)
494 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 620 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
495 error = gfs2_glock_nq(&ip->i_iopen_gh); 621 error = gfs2_glock_nq(&ip->i_iopen_gh);
496 if (error) 622 if (error)
497 goto out_uninit; 623 goto out_truncate;
498 624
499 if (S_ISDIR(inode->i_mode) && 625 if (S_ISDIR(inode->i_mode) &&
500 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { 626 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
501 error = gfs2_dir_exhash_dealloc(ip); 627 error = gfs2_dir_exhash_dealloc(ip);
502 if (error) 628 if (error)
503 goto out_unlock; 629 goto out_unlock;
504 } 630 }
505 631
506 if (ip->i_di.di_eattr) { 632 if (ip->i_eattr) {
507 error = gfs2_ea_dealloc(ip); 633 error = gfs2_ea_dealloc(ip);
508 if (error) 634 if (error)
509 goto out_unlock; 635 goto out_unlock;
@@ -519,6 +645,7 @@ static void gfs2_delete_inode(struct inode *inode)
519 if (error) 645 if (error)
520 goto out_unlock; 646 goto out_unlock;
521 647
648out_truncate:
522 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 649 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
523 if (error) 650 if (error)
524 goto out_unlock; 651 goto out_unlock;
@@ -527,8 +654,8 @@ static void gfs2_delete_inode(struct inode *inode)
527 gfs2_trans_end(sdp); 654 gfs2_trans_end(sdp);
528 655
529out_unlock: 656out_unlock:
530 gfs2_glock_dq(&ip->i_iopen_gh); 657 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
531out_uninit: 658 gfs2_glock_dq(&ip->i_iopen_gh);
532 gfs2_holder_uninit(&ip->i_iopen_gh); 659 gfs2_holder_uninit(&ip->i_iopen_gh);
533 gfs2_glock_dq_uninit(&gh); 660 gfs2_glock_dq_uninit(&gh);
534 if (error && error != GLR_TRYFAILED) 661 if (error && error != GLR_TRYFAILED)
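Editor's note: the gfs2_delete_inode() hunks above rework the kernel's goto-unwind idiom: a failed try-lock now jumps to the new out_truncate label so truncation still happens, and the final dequeue runs only if the iopen holder is actually held (the HIF_HOLDER test). The idiom in miniature, self-contained with hypothetical helpers:

static int take_lock_a(void) { return 0; }	/* stand-ins for glock ops */
static int take_lock_b(void) { return 0; }
static void release_lock_b(void) { }
static void release_lock_a(void) { }

static int myfs_do_op(void)
{
	int error;

	error = take_lock_a();
	if (error)
		goto out;
	error = take_lock_b();
	if (error)
		goto out_a;	/* undo only what was actually taken */

	/* ... real work ... */

	release_lock_b();
out_a:
	release_lock_a();
out:
	return error;
}

Each label releases strictly less state than the one before it, which is why re-pointing an error path at a later label (as the hunk does) changes behaviour without touching the cleanup code itself.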
@@ -563,8 +690,8 @@ const struct super_operations gfs2_super_ops = {
563 .put_super = gfs2_put_super, 690 .put_super = gfs2_put_super,
564 .write_super = gfs2_write_super, 691 .write_super = gfs2_write_super,
565 .sync_fs = gfs2_sync_fs, 692 .sync_fs = gfs2_sync_fs,
566 .write_super_lockfs = gfs2_write_super_lockfs, 693 .freeze_fs = gfs2_freeze,
567 .unlockfs = gfs2_unlockfs, 694 .unfreeze_fs = gfs2_unfreeze,
568 .statfs = gfs2_statfs, 695 .statfs = gfs2_statfs,
569 .remount_fs = gfs2_remount_fs, 696 .remount_fs = gfs2_remount_fs,
570 .clear_inode = gfs2_clear_inode, 697 .clear_inode = gfs2_clear_inode,
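Editor's note: the super_operations change just above is the point of this file's diff. The old .write_super_lockfs/.unlockfs hooks returned void, so a withdrawn filesystem could only retry or return silently; the new .freeze_fs/.unfreeze_fs hooks return int, which lets gfs2_freeze() report -EINVAL instead. A minimal sketch of a filesystem adopting the 2.6.29 hooks (the myfs_* names are hypothetical):

#include <linux/fs.h>

static int myfs_freeze(struct super_block *sb)
{
	/* quiesce writes; a real fs would flush its journal here */
	return 0;			/* or -errno on failure */
}

static int myfs_unfreeze(struct super_block *sb)
{
	/* reallow writes */
	return 0;
}

static const struct super_operations myfs_super_ops = {
	.freeze_fs	= myfs_freeze,		/* was .write_super_lockfs */
	.unfreeze_fs	= myfs_unfreeze,	/* was .unlockfs */
};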
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
deleted file mode 100644
index 442a274c6272..000000000000
--- a/fs/gfs2/ops_super.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13#include <linux/fs.h>
14
15extern const struct super_operations gfs2_super_ops;
16
17#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e073f5144fa..b08d09696b3e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -46,6 +46,8 @@
46#include <linux/bio.h> 46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h> 47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h> 48#include <linux/lm_interface.h>
49#include <linux/kthread.h>
50#include <linux/freezer.h>
49 51
50#include "gfs2.h" 52#include "gfs2.h"
51#include "incore.h" 53#include "incore.h"
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
94 struct gfs2_quota_data *qd; 96 struct gfs2_quota_data *qd;
95 int error; 97 int error;
96 98
97 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); 99 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
98 if (!qd) 100 if (!qd)
99 return -ENOMEM; 101 return -ENOMEM;
100 102
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
119 return 0; 121 return 0;
120 122
121fail: 123fail:
122 kfree(qd); 124 kmem_cache_free(gfs2_quotad_cachep, qd);
123 return error; 125 return error;
124} 126}
125 127
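Editor's note: this hunk, together with the kmem_cache_free() conversions further down, swaps a generic kzalloc()/kfree() pair for a dedicated slab cache (gfs2_quotad_cachep), the usual choice once many identically sized objects churn. The pattern in isolation, with an invented object type:

#include <linux/errno.h>
#include <linux/slab.h>

struct myfs_obj {
	int id;
};

static struct kmem_cache *myfs_obj_cachep;

/* one-time setup, typically from module_init() */
static int myfs_cache_init(void)
{
	myfs_obj_cachep = kmem_cache_create("myfs_obj",
					    sizeof(struct myfs_obj),
					    0, 0, NULL);
	return myfs_obj_cachep ? 0 : -ENOMEM;
}

/* zeroed allocation, replacing kzalloc(..., GFP_NOFS) */
static struct myfs_obj *myfs_obj_alloc(void)
{
	return kmem_cache_zalloc(myfs_obj_cachep, GFP_NOFS);
}

static void myfs_obj_free(struct myfs_obj *obj)
{
	kmem_cache_free(myfs_obj_cachep, obj);
}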
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
158 if (qd || !create) { 160 if (qd || !create) {
159 if (new_qd) { 161 if (new_qd) {
160 gfs2_lvb_unhold(new_qd->qd_gl); 162 gfs2_lvb_unhold(new_qd->qd_gl);
161 kfree(new_qd); 163 kmem_cache_free(gfs2_quotad_cachep, new_qd);
162 } 164 }
163 *qdp = qd; 165 *qdp = qd;
164 return 0; 166 return 0;
@@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1013 1015
1014 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) 1016 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
1015 return; 1017 return;
1016 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) 1018 if (ip->i_diskflags & GFS2_DIF_SYSTEM)
1017 return; 1019 return;
1018 1020
1019 for (x = 0; x < al->al_qd_num; x++) { 1021 for (x = 0; x < al->al_qd_num; x++) {
@@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1100int gfs2_quota_init(struct gfs2_sbd *sdp) 1102int gfs2_quota_init(struct gfs2_sbd *sdp)
1101{ 1103{
1102 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1104 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1103 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; 1105 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
1104 unsigned int x, slot = 0; 1106 unsigned int x, slot = 0;
1105 unsigned int found = 0; 1107 unsigned int found = 0;
1106 u64 dblock; 1108 u64 dblock;
1107 u32 extlen = 0; 1109 u32 extlen = 0;
1108 int error; 1110 int error;
1109 1111
1110 if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || 1112 if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
1111 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { 1113 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1112 gfs2_consist_inode(ip); 1114 gfs2_consist_inode(ip);
1113 return -EIO; 1115 return -EIO;
1114 } 1116 }
@@ -1195,7 +1197,7 @@ fail:
1195 return error; 1197 return error;
1196} 1198}
1197 1199
1198void gfs2_quota_scan(struct gfs2_sbd *sdp) 1200static void gfs2_quota_scan(struct gfs2_sbd *sdp)
1199{ 1201{
1200 struct gfs2_quota_data *qd, *safe; 1202 struct gfs2_quota_data *qd, *safe;
1201 LIST_HEAD(dead); 1203 LIST_HEAD(dead);
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp)
1222 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1224 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1223 1225
1224 gfs2_lvb_unhold(qd->qd_gl); 1226 gfs2_lvb_unhold(qd->qd_gl);
1225 kfree(qd); 1227 kmem_cache_free(gfs2_quotad_cachep, qd);
1226 } 1228 }
1227} 1229}
1228 1230
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1257 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1259 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1258 1260
1259 gfs2_lvb_unhold(qd->qd_gl); 1261 gfs2_lvb_unhold(qd->qd_gl);
1260 kfree(qd); 1262 kmem_cache_free(gfs2_quotad_cachep, qd);
1261 1263
1262 spin_lock(&sdp->sd_quota_spin); 1264 spin_lock(&sdp->sd_quota_spin);
1263 } 1265 }
@@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1272 } 1274 }
1273} 1275}
1274 1276
1277static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
1278{
1279 if (error == 0 || error == -EROFS)
1280 return;
1281 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
1282 fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
1283}
1284
1285static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
1286 int (*fxn)(struct gfs2_sbd *sdp),
1287 unsigned long t, unsigned long *timeo,
1288 unsigned int *new_timeo)
1289{
1290 if (t >= *timeo) {
1291 int error = fxn(sdp);
1292 quotad_error(sdp, msg, error);
1293 *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
1294 } else {
1295 *timeo -= t;
1296 }
1297}
1298
1299static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
1300{
1301 struct gfs2_inode *ip;
1302
1303 while(1) {
1304 ip = NULL;
1305 spin_lock(&sdp->sd_trunc_lock);
1306 if (!list_empty(&sdp->sd_trunc_list)) {
1307 ip = list_entry(sdp->sd_trunc_list.next,
1308 struct gfs2_inode, i_trunc_list);
1309 list_del_init(&ip->i_trunc_list);
1310 }
1311 spin_unlock(&sdp->sd_trunc_lock);
1312 if (ip == NULL)
1313 return;
1314 gfs2_glock_finish_truncate(ip);
1315 }
1316}
1317
1318/**
1319 * gfs2_quotad - Write cached quota changes into the quota file
1320 * @sdp: Pointer to GFS2 superblock
1321 *
1322 */
1323
1324int gfs2_quotad(void *data)
1325{
1326 struct gfs2_sbd *sdp = data;
1327 struct gfs2_tune *tune = &sdp->sd_tune;
1328 unsigned long statfs_timeo = 0;
1329 unsigned long quotad_timeo = 0;
1330 unsigned long t = 0;
1331 DEFINE_WAIT(wait);
1332 int empty;
1333
1334 while (!kthread_should_stop()) {
1335
1336 /* Update the master statfs file */
1337 quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
1338 &statfs_timeo, &tune->gt_statfs_quantum);
1339
1340 /* Update quota file */
1341 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
1342 &quotad_timeo, &tune->gt_quota_quantum);
1343
1344 /* FIXME: This should be turned into a shrinker */
1345 gfs2_quota_scan(sdp);
1346
1347 /* Check for & recover partially truncated inodes */
1348 quotad_check_trunc_list(sdp);
1349
1350 if (freezing(current))
1351 refrigerator();
1352 t = min(quotad_timeo, statfs_timeo);
1353
1354 prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
1355 spin_lock(&sdp->sd_trunc_lock);
1356 empty = list_empty(&sdp->sd_trunc_list);
1357 spin_unlock(&sdp->sd_trunc_lock);
1358 if (empty)
1359 t -= schedule_timeout(t);
1360 else
1361 t = 0;
1362 finish_wait(&sdp->sd_quota_wait, &wait);
1363 }
1364
1365 return 0;
1366}
1367
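Editor's note: gfs2_quotad() above multiplexes several periodic jobs (statfs sync, quota sync, scan, truncate recovery) onto one kernel thread. Stripped of the gfs2 specifics it is the standard freezable-kthread loop; a hedged sketch, with myfs_daemon as an illustrative name:

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>

static int myfs_daemon(void *data)
{
	while (!kthread_should_stop()) {
		/* ... periodic work goes here ... */

		if (freezing(current))
			refrigerator();	/* park across suspend/resume */

		schedule_timeout_interruptible(5 * HZ);
	}
	return 0;
}

Such a thread is created with kthread_run(myfs_daemon, data, "myfs_daemon") and torn down with kthread_stop(), which is what makes kthread_should_stop() return true and the loop exit.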
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 3b7f4b0e5dfe..cec9032be97d 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,22 +15,22 @@ struct gfs2_sbd;
15 15
16#define NO_QUOTA_CHANGE ((u32)-1) 16#define NO_QUOTA_CHANGE ((u32)-1)
17 17
18int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); 18extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
19void gfs2_quota_unhold(struct gfs2_inode *ip); 19extern void gfs2_quota_unhold(struct gfs2_inode *ip);
20 20
21int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); 21extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
22void gfs2_quota_unlock(struct gfs2_inode *ip); 22extern void gfs2_quota_unlock(struct gfs2_inode *ip);
23 23
24int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); 24extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid); 26 u32 uid, u32 gid);
27 27
28int gfs2_quota_sync(struct gfs2_sbd *sdp); 28extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
29int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); 29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30 30
31int gfs2_quota_init(struct gfs2_sbd *sdp); 31extern int gfs2_quota_init(struct gfs2_sbd *sdp);
32void gfs2_quota_scan(struct gfs2_sbd *sdp); 32extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
33void gfs2_quota_cleanup(struct gfs2_sbd *sdp); 33extern int gfs2_quotad(void *data);
34 34
35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) 35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
36{ 36{
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index d5e91f4f6a0b..efd09c3d2b26 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,8 @@
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h> 16#include <linux/lm_interface.h>
17#include <linux/kthread.h>
18#include <linux/freezer.h>
17 19
18#include "gfs2.h" 20#include "gfs2.h"
19#include "incore.h" 21#include "incore.h"
@@ -583,13 +585,35 @@ fail:
583 return error; 585 return error;
584} 586}
585 587
588static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
589{
590 struct gfs2_jdesc *jd;
591 int found = 0;
592
593 spin_lock(&sdp->sd_jindex_spin);
594
595 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
596 if (jd->jd_dirty) {
597 jd->jd_dirty = 0;
598 found = 1;
599 break;
600 }
601 }
602 spin_unlock(&sdp->sd_jindex_spin);
603
604 if (!found)
605 jd = NULL;
606
607 return jd;
608}
609
586/** 610/**
587 * gfs2_check_journals - Recover any dirty journals 611 * gfs2_check_journals - Recover any dirty journals
588 * @sdp: the filesystem 612 * @sdp: the filesystem
589 * 613 *
590 */ 614 */
591 615
592void gfs2_check_journals(struct gfs2_sbd *sdp) 616static void gfs2_check_journals(struct gfs2_sbd *sdp)
593{ 617{
594 struct gfs2_jdesc *jd; 618 struct gfs2_jdesc *jd;
595 619
@@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp)
603 } 627 }
604} 628}
605 629
630/**
631 * gfs2_recoverd - Recover dead machine's journals
632 * @sdp: Pointer to GFS2 superblock
633 *
634 */
635
636int gfs2_recoverd(void *data)
637{
638 struct gfs2_sbd *sdp = data;
639 unsigned long t;
640
641 while (!kthread_should_stop()) {
642 gfs2_check_journals(sdp);
643 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
644 if (freezing(current))
645 refrigerator();
646 schedule_timeout_interruptible(t);
647 }
648
649 return 0;
650}
651
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index f7235e61c723..a8218ea15b57 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
18 *blk = 0; 18 *blk = 0;
19} 19}
20 20
21int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 21extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
22 struct buffer_head **bh); 22 struct buffer_head **bh);
23 23
24int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 24extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
25int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 25extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
26void gfs2_revoke_clean(struct gfs2_sbd *sdp); 26extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27 27
28int gfs2_find_jhead(struct gfs2_jdesc *jd, 28extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 29 struct gfs2_log_header_host *head);
30int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31void gfs2_check_journals(struct gfs2_sbd *sdp); 31extern int gfs2_recoverd(void *data);
32 32
33#endif /* __RECOVERY_DOT_H__ */ 33#endif /* __RECOVERY_DOT_H__ */
34 34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d90fb253505..8b01c635d925 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
269 bi->bi_len, x); 269 bi->bi_len, x);
270 } 270 }
271 271
272 if (count[0] != rgd->rd_rg.rg_free) { 272 if (count[0] != rgd->rd_free) {
273 if (gfs2_consist_rgrpd(rgd)) 273 if (gfs2_consist_rgrpd(rgd))
274 fs_err(sdp, "free data mismatch: %u != %u\n", 274 fs_err(sdp, "free data mismatch: %u != %u\n",
275 count[0], rgd->rd_rg.rg_free); 275 count[0], rgd->rd_free);
276 return; 276 return;
277 } 277 }
278 278
279 tmp = rgd->rd_data - 279 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
280 rgd->rd_rg.rg_free -
281 rgd->rd_rg.rg_dinodes;
282 if (count[1] + count[2] != tmp) { 280 if (count[1] + count[2] != tmp) {
283 if (gfs2_consist_rgrpd(rgd)) 281 if (gfs2_consist_rgrpd(rgd))
284 fs_err(sdp, "used data mismatch: %u != %u\n", 282 fs_err(sdp, "used data mismatch: %u != %u\n",
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
286 return; 284 return;
287 } 285 }
288 286
289 if (count[3] != rgd->rd_rg.rg_dinodes) { 287 if (count[3] != rgd->rd_dinodes) {
290 if (gfs2_consist_rgrpd(rgd)) 288 if (gfs2_consist_rgrpd(rgd))
291 fs_err(sdp, "used metadata mismatch: %u != %u\n", 289 fs_err(sdp, "used metadata mismatch: %u != %u\n",
292 count[3], rgd->rd_rg.rg_dinodes); 290 count[3], rgd->rd_dinodes);
293 return; 291 return;
294 } 292 }
295 293
@@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
501 for (rgrps = 0;; rgrps++) { 499 for (rgrps = 0;; rgrps++) {
502 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 500 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
503 501
504 if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) 502 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
505 break; 503 break;
506 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 504 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
507 sizeof(struct gfs2_rindex)); 505 sizeof(struct gfs2_rindex));
@@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
590 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
591 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
592 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
593 u64 rgrp_count = ip->i_di.di_size; 591 u64 rgrp_count = ip->i_disksize;
594 int error; 592 int error;
595 593
596 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { 594 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
@@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
634 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 632 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
635 /* Ignore partials */ 633 /* Ignore partials */
636 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > 634 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
637 ip->i_di.di_size) 635 ip->i_disksize)
638 break; 636 break;
639 error = read_rindex_entry(ip, &ra_state); 637 error = read_rindex_entry(ip, &ra_state);
640 if (error) { 638 if (error) {
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
692static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) 690static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
693{ 691{
694 const struct gfs2_rgrp *str = buf; 692 const struct gfs2_rgrp *str = buf;
695 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
696 u32 rg_flags; 693 u32 rg_flags;
697 694
698 rg_flags = be32_to_cpu(str->rg_flags); 695 rg_flags = be32_to_cpu(str->rg_flags);
@@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
700 rgd->rd_flags |= GFS2_RDF_NOALLOC; 697 rgd->rd_flags |= GFS2_RDF_NOALLOC;
701 else 698 else
702 rgd->rd_flags &= ~GFS2_RDF_NOALLOC; 699 rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
703 rg->rg_free = be32_to_cpu(str->rg_free); 700 rgd->rd_free = be32_to_cpu(str->rg_free);
704 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); 701 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
705 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); 702 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
706} 703}
707 704
708static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) 705static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
709{ 706{
710 struct gfs2_rgrp *str = buf; 707 struct gfs2_rgrp *str = buf;
711 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
712 u32 rg_flags = 0; 708 u32 rg_flags = 0;
713 709
714 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 710 if (rgd->rd_flags & GFS2_RDF_NOALLOC)
715 rg_flags |= GFS2_RGF_NOALLOC; 711 rg_flags |= GFS2_RGF_NOALLOC;
716 str->rg_flags = cpu_to_be32(rg_flags); 712 str->rg_flags = cpu_to_be32(rg_flags);
717 str->rg_free = cpu_to_be32(rg->rg_free); 713 str->rg_free = cpu_to_be32(rgd->rd_free);
718 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); 714 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
719 str->__pad = cpu_to_be32(0); 715 str->__pad = cpu_to_be32(0);
720 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); 716 str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
721 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); 717 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
722} 718}
723 719
@@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
776 } 772 }
777 773
778 spin_lock(&sdp->sd_rindex_spin); 774 spin_lock(&sdp->sd_rindex_spin);
779 rgd->rd_free_clone = rgd->rd_rg.rg_free; 775 rgd->rd_free_clone = rgd->rd_free;
780 rgd->rd_bh_count++; 776 rgd->rd_bh_count++;
781 spin_unlock(&sdp->sd_rindex_spin); 777 spin_unlock(&sdp->sd_rindex_spin);
782 778
@@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
850 } 846 }
851 847
852 spin_lock(&sdp->sd_rindex_spin); 848 spin_lock(&sdp->sd_rindex_spin);
853 rgd->rd_free_clone = rgd->rd_rg.rg_free; 849 rgd->rd_free_clone = rgd->rd_free;
854 spin_unlock(&sdp->sd_rindex_spin); 850 spin_unlock(&sdp->sd_rindex_spin);
855} 851}
856 852
@@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1403 block = rgd->rd_data0 + blk; 1399 block = rgd->rd_data0 + blk;
1404 ip->i_goal = block; 1400 ip->i_goal = block;
1405 1401
1406 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); 1402 gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
1407 rgd->rd_rg.rg_free -= *n; 1403 rgd->rd_free -= *n;
1408 1404
1409 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1405 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1410 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1406 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1445 1441
1446 block = rgd->rd_data0 + blk; 1442 block = rgd->rd_data0 + blk;
1447 1443
1448 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); 1444 gfs2_assert_withdraw(sdp, rgd->rd_free);
1449 rgd->rd_rg.rg_free--; 1445 rgd->rd_free--;
1450 rgd->rd_rg.rg_dinodes++; 1446 rgd->rd_dinodes++;
1451 *generation = rgd->rd_rg.rg_igeneration++; 1447 *generation = rgd->rd_igeneration++;
1452 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1448 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1453 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1449 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1454 1450
@@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1481 if (!rgd) 1477 if (!rgd)
1482 return; 1478 return;
1483 1479
1484 rgd->rd_rg.rg_free += blen; 1480 rgd->rd_free += blen;
1485 1481
1486 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1482 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1487 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1483 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1509 if (!rgd) 1505 if (!rgd)
1510 return; 1506 return;
1511 1507
1512 rgd->rd_rg.rg_free += blen; 1508 rgd->rd_free += blen;
1513 1509
1514 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1510 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1515 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1511 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1546 return; 1542 return;
1547 gfs2_assert_withdraw(sdp, rgd == tmp_rgd); 1543 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1548 1544
1549 if (!rgd->rd_rg.rg_dinodes) 1545 if (!rgd->rd_dinodes)
1550 gfs2_consist_rgrpd(rgd); 1546 gfs2_consist_rgrpd(rgd);
1551 rgd->rd_rg.rg_dinodes--; 1547 rgd->rd_dinodes--;
1552 rgd->rd_rg.rg_free++; 1548 rgd->rd_free++;
1553 1549
1554 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1550 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1555 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1551 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
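Editor's note: much of the rgrp.c churn above comes from deleting the intermediate gfs2_rgrp_host copy; gfs2_rgrp_in()/gfs2_rgrp_out() now convert between the big-endian on-disk record and native-endian in-core fields directly. The underlying idiom, sketched with invented types:

#include <linux/types.h>
#include <asm/byteorder.h>

/* on-disk record: fixed layout, big-endian fields */
struct myfs_rgrp_ondisk {
	__be32 rg_free;
	__be32 rg_dinodes;
	__be64 rg_igeneration;
};

/* in-core copy in native byte order */
struct myfs_rgrp {
	u32 rd_free;
	u32 rd_dinodes;
	u64 rd_igeneration;
};

static void myfs_rgrp_in(struct myfs_rgrp *rgd, const void *buf)
{
	const struct myfs_rgrp_ondisk *str = buf;

	rgd->rd_free = be32_to_cpu(str->rg_free);
	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
}

static void myfs_rgrp_out(const struct myfs_rgrp *rgd, void *buf)
{
	struct myfs_rgrp_ondisk *str = buf;

	str->rg_free = cpu_to_be32(rgd->rd_free);
	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
}

Using __be32/__be64 types keeps sparse able to flag any direct assignment that skips the conversion helpers.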
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c3ba3d9d0aac..141b781f2fcc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -34,76 +34,6 @@
34#include "util.h" 34#include "util.h"
35 35
36/** 36/**
37 * gfs2_jindex_hold - Grab a lock on the jindex
38 * @sdp: The GFS2 superblock
39 * @ji_gh: the holder for the jindex glock
40 *
41 * This is very similar to the gfs2_rindex_hold() function, except that
42 * in general we hold the jindex lock for longer periods of time and
43 * we grab it far less frequently (in general) than the rgrp lock.
44 *
45 * Returns: errno
46 */
47
48int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
49{
50 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
51 struct qstr name;
52 char buf[20];
53 struct gfs2_jdesc *jd;
54 int error;
55
56 name.name = buf;
57
58 mutex_lock(&sdp->sd_jindex_mutex);
59
60 for (;;) {
61 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
62 if (error)
63 break;
64
65 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
66 name.hash = gfs2_disk_hash(name.name, name.len);
67
68 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
69 if (error == -ENOENT) {
70 error = 0;
71 break;
72 }
73
74 gfs2_glock_dq_uninit(ji_gh);
75
76 if (error)
77 break;
78
79 error = -ENOMEM;
80 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
81 if (!jd)
82 break;
83
84 INIT_LIST_HEAD(&jd->extent_list);
85 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
86 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
87 if (!jd->jd_inode)
88 error = -ENOENT;
89 else
90 error = PTR_ERR(jd->jd_inode);
91 kfree(jd);
92 break;
93 }
94
95 spin_lock(&sdp->sd_jindex_spin);
96 jd->jd_jid = sdp->sd_journals++;
97 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
98 spin_unlock(&sdp->sd_jindex_spin);
99 }
100
101 mutex_unlock(&sdp->sd_jindex_mutex);
102
103 return error;
104}
105
106/**
107 * gfs2_jindex_free - Clear all the journal index information 37 * gfs2_jindex_free - Clear all the journal index information
108 * @sdp: The GFS2 superblock 38 * @sdp: The GFS2 superblock
109 * 39 *
@@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
166 return jd; 96 return jd;
167} 97}
168 98
169void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
170{
171 struct gfs2_jdesc *jd;
172
173 spin_lock(&sdp->sd_jindex_spin);
174 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
175 if (jd)
176 jd->jd_dirty = 1;
177 spin_unlock(&sdp->sd_jindex_spin);
178}
179
180struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
181{
182 struct gfs2_jdesc *jd;
183 int found = 0;
184
185 spin_lock(&sdp->sd_jindex_spin);
186
187 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
188 if (jd->jd_dirty) {
189 jd->jd_dirty = 0;
190 found = 1;
191 break;
192 }
193 }
194 spin_unlock(&sdp->sd_jindex_spin);
195
196 if (!found)
197 jd = NULL;
198
199 return jd;
200}
201
202int gfs2_jdesc_check(struct gfs2_jdesc *jd) 99int gfs2_jdesc_check(struct gfs2_jdesc *jd)
203{ 100{
204 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 101 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
206 int ar; 103 int ar;
207 int error; 104 int error;
208 105
209 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || 106 if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
210 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { 107 (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
211 gfs2_consist_inode(ip); 108 gfs2_consist_inode(ip);
212 return -EIO; 109 return -EIO;
213 } 110 }
214 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; 111 jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
215 112
216 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); 113 error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
217 if (!error && ar) { 114 if (!error && ar) {
218 gfs2_consist_inode(ip); 115 gfs2_consist_inode(ip);
219 error = -EIO; 116 error = -EIO;
@@ -423,137 +320,6 @@ out:
423 return error; 320 return error;
424} 321}
425 322
426/**
427 * gfs2_statfs_i - Do a statfs
428 * @sdp: the filesystem
429 * @sg: the sg structure
430 *
431 * Returns: errno
432 */
433
434int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
435{
436 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
437 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
438
439 spin_lock(&sdp->sd_statfs_spin);
440
441 *sc = *m_sc;
442 sc->sc_total += l_sc->sc_total;
443 sc->sc_free += l_sc->sc_free;
444 sc->sc_dinodes += l_sc->sc_dinodes;
445
446 spin_unlock(&sdp->sd_statfs_spin);
447
448 if (sc->sc_free < 0)
449 sc->sc_free = 0;
450 if (sc->sc_free > sc->sc_total)
451 sc->sc_free = sc->sc_total;
452 if (sc->sc_dinodes < 0)
453 sc->sc_dinodes = 0;
454
455 return 0;
456}
457
458/**
459 * statfs_fill - fill in the sg for a given RG
460 * @rgd: the RG
461 * @sc: the sc structure
462 *
463 * Returns: 0 on success, -ESTALE if the LVB is invalid
464 */
465
466static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
467 struct gfs2_statfs_change_host *sc)
468{
469 gfs2_rgrp_verify(rgd);
470 sc->sc_total += rgd->rd_data;
471 sc->sc_free += rgd->rd_rg.rg_free;
472 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
473 return 0;
474}
475
476/**
477 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
478 * @sdp: the filesystem
479 * @sc: the sc info that will be returned
480 *
481 * Any error (other than a signal) will cause this routine to fall back
482 * to the synchronous version.
483 *
484 * FIXME: This really shouldn't busy wait like this.
485 *
486 * Returns: errno
487 */
488
489int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
490{
491 struct gfs2_holder ri_gh;
492 struct gfs2_rgrpd *rgd_next;
493 struct gfs2_holder *gha, *gh;
494 unsigned int slots = 64;
495 unsigned int x;
496 int done;
497 int error = 0, err;
498
499 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
500 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
501 if (!gha)
502 return -ENOMEM;
503
504 error = gfs2_rindex_hold(sdp, &ri_gh);
505 if (error)
506 goto out;
507
508 rgd_next = gfs2_rgrpd_get_first(sdp);
509
510 for (;;) {
511 done = 1;
512
513 for (x = 0; x < slots; x++) {
514 gh = gha + x;
515
516 if (gh->gh_gl && gfs2_glock_poll(gh)) {
517 err = gfs2_glock_wait(gh);
518 if (err) {
519 gfs2_holder_uninit(gh);
520 error = err;
521 } else {
522 if (!error)
523 error = statfs_slow_fill(
524 gh->gh_gl->gl_object, sc);
525 gfs2_glock_dq_uninit(gh);
526 }
527 }
528
529 if (gh->gh_gl)
530 done = 0;
531 else if (rgd_next && !error) {
532 error = gfs2_glock_nq_init(rgd_next->rd_gl,
533 LM_ST_SHARED,
534 GL_ASYNC,
535 gh);
536 rgd_next = gfs2_rgrpd_get_next(rgd_next);
537 done = 0;
538 }
539
540 if (signal_pending(current))
541 error = -ERESTARTSYS;
542 }
543
544 if (done)
545 break;
546
547 yield();
548 }
549
550 gfs2_glock_dq_uninit(&ri_gh);
551
552out:
553 kfree(gha);
554 return error;
555}
556
557struct lfcc { 323struct lfcc {
558 struct list_head list; 324 struct list_head list;
559 struct gfs2_holder gh; 325 struct gfs2_holder gh;
@@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
580 struct gfs2_log_header_host lh; 346 struct gfs2_log_header_host lh;
581 int error; 347 int error;
582 348
583 error = gfs2_jindex_hold(sdp, &ji_gh);
584 if (error)
585 return error;
586
587 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 349 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
588 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); 350 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
589 if (!lfcc) { 351 if (!lfcc) {
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 50a4c9b1215e..f6b8b00ad881 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -10,6 +10,8 @@
10#ifndef __SUPER_DOT_H__ 10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__ 11#define __SUPER_DOT_H__
12 12
13#include <linux/fs.h>
14#include <linux/dcache.h>
13#include "incore.h" 15#include "incore.h"
14 16
15void gfs2_lm_unmount(struct gfs2_sbd *sdp); 17void gfs2_lm_unmount(struct gfs2_sbd *sdp);
@@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
23 return x; 25 return x;
24} 26}
25 27
26int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
27void gfs2_jindex_free(struct gfs2_sbd *sdp); 28void gfs2_jindex_free(struct gfs2_sbd *sdp);
28 29
29struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); 30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
30void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
31struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
32int gfs2_jdesc_check(struct gfs2_jdesc *jd); 31int gfs2_jdesc_check(struct gfs2_jdesc *jd);
33 32
34int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, 33int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
@@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
40void gfs2_statfs_change(struct gfs2_sbd *sdp, 39void gfs2_statfs_change(struct gfs2_sbd *sdp,
41 s64 total, s64 free, s64 dinodes); 40 s64 total, s64 free, s64 dinodes);
42int gfs2_statfs_sync(struct gfs2_sbd *sdp); 41int gfs2_statfs_sync(struct gfs2_sbd *sdp);
43int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
44int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
45 42
46int gfs2_freeze_fs(struct gfs2_sbd *sdp); 43int gfs2_freeze_fs(struct gfs2_sbd *sdp);
47void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); 44void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
48 45
46extern struct file_system_type gfs2_fs_type;
47extern struct file_system_type gfs2meta_fs_type;
48extern const struct export_operations gfs2_export_ops;
49extern const struct super_operations gfs2_super_ops;
50extern struct dentry_operations gfs2_dops;
51
49#endif /* __SUPER_DOT_H__ */ 52#endif /* __SUPER_DOT_H__ */
50 53
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7e1879f1a02c..26c1fa777a95 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,9 +26,6 @@
26#include "quota.h" 26#include "quota.h"
27#include "util.h" 27#include "util.h"
28 28
29char *gfs2_sys_margs;
30spinlock_t gfs2_sys_margs_lock;
31
32static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
33{ 30{
34 return snprintf(buf, PAGE_SIZE, "%u:%u\n", 31 return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -263,7 +260,6 @@ ARGS_ATTR(localcaching, "%d\n");
263ARGS_ATTR(localflocks, "%d\n"); 260ARGS_ATTR(localflocks, "%d\n");
264ARGS_ATTR(debug, "%d\n"); 261ARGS_ATTR(debug, "%d\n");
265ARGS_ATTR(upgrade, "%d\n"); 262ARGS_ATTR(upgrade, "%d\n");
266ARGS_ATTR(num_glockd, "%u\n");
267ARGS_ATTR(posix_acl, "%d\n"); 263ARGS_ATTR(posix_acl, "%d\n");
268ARGS_ATTR(quota, "%u\n"); 264ARGS_ATTR(quota, "%u\n");
269ARGS_ATTR(suiddir, "%d\n"); 265ARGS_ATTR(suiddir, "%d\n");
@@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = {
279 &args_attr_localflocks.attr, 275 &args_attr_localflocks.attr,
280 &args_attr_debug.attr, 276 &args_attr_debug.attr,
281 &args_attr_upgrade.attr, 277 &args_attr_upgrade.attr,
282 &args_attr_num_glockd.attr,
283 &args_attr_posix_acl.attr, 278 &args_attr_posix_acl.attr,
284 &args_attr_quota.attr, 279 &args_attr_quota.attr,
285 &args_attr_suiddir.attr, 280 &args_attr_suiddir.attr,
@@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = {
288}; 283};
289 284
290/* 285/*
291 * display counters from superblock
292 */
293
294struct counters_attr {
295 struct attribute attr;
296 ssize_t (*show)(struct gfs2_sbd *, char *);
297};
298
299#define COUNTERS_ATTR(name, fmt) \
300static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
301{ \
302 return snprintf(buf, PAGE_SIZE, fmt, \
303 (unsigned int)atomic_read(&sdp->sd_##name)); \
304} \
305static struct counters_attr counters_attr_##name = __ATTR_RO(name)
306
307COUNTERS_ATTR(reclaimed, "%u\n");
308
309static struct attribute *counters_attrs[] = {
310 &counters_attr_reclaimed.attr,
311 NULL,
312};
313
314/*
315 * get and set struct gfs2_tune fields 286 * get and set struct gfs2_tune fields
316 */ 287 */
317 288
@@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
393} \ 364} \
394TUNE_ATTR_2(name, name##_store) 365TUNE_ATTR_2(name, name##_store)
395 366
396TUNE_ATTR(demote_secs, 0);
397TUNE_ATTR(incore_log_blocks, 0); 367TUNE_ATTR(incore_log_blocks, 0);
398TUNE_ATTR(log_flush_secs, 0); 368TUNE_ATTR(log_flush_secs, 0);
399TUNE_ATTR(quota_warn_period, 0); 369TUNE_ATTR(quota_warn_period, 0);
@@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1);
408TUNE_ATTR(statfs_quantum, 1); 378TUNE_ATTR(statfs_quantum, 1);
409TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 379TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
410TUNE_ATTR_DAEMON(logd_secs, logd_process); 380TUNE_ATTR_DAEMON(logd_secs, logd_process);
411TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
412TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 381TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
413 382
414static struct attribute *tune_attrs[] = { 383static struct attribute *tune_attrs[] = {
415 &tune_attr_demote_secs.attr,
416 &tune_attr_incore_log_blocks.attr, 384 &tune_attr_incore_log_blocks.attr,
417 &tune_attr_log_flush_secs.attr, 385 &tune_attr_log_flush_secs.attr,
418 &tune_attr_quota_warn_period.attr, 386 &tune_attr_quota_warn_period.attr,
@@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = {
426 &tune_attr_statfs_quantum.attr, 394 &tune_attr_statfs_quantum.attr,
427 &tune_attr_recoverd_secs.attr, 395 &tune_attr_recoverd_secs.attr,
428 &tune_attr_logd_secs.attr, 396 &tune_attr_logd_secs.attr,
429 &tune_attr_quotad_secs.attr,
430 &tune_attr_quota_scale.attr, 397 &tune_attr_quota_scale.attr,
431 &tune_attr_new_files_jdata.attr, 398 &tune_attr_new_files_jdata.attr,
432 NULL, 399 NULL,
@@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = {
437 .attrs = lockstruct_attrs, 404 .attrs = lockstruct_attrs,
438}; 405};
439 406
440static struct attribute_group counters_group = {
441 .name = "counters",
442 .attrs = counters_attrs,
443};
444
445static struct attribute_group args_group = { 407static struct attribute_group args_group = {
446 .name = "args", 408 .name = "args",
447 .attrs = args_attrs, 409 .attrs = args_attrs,
@@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
466 if (error) 428 if (error)
467 goto fail_reg; 429 goto fail_reg;
468 430
469 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
470 if (error)
471 goto fail_lockstruct;
472
473 error = sysfs_create_group(&sdp->sd_kobj, &args_group); 431 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
474 if (error) 432 if (error)
475 goto fail_counters; 433 goto fail_lockstruct;
476 434
477 error = sysfs_create_group(&sdp->sd_kobj, &tune_group); 435 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
478 if (error) 436 if (error)
@@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
483 441
484fail_args: 442fail_args:
485 sysfs_remove_group(&sdp->sd_kobj, &args_group); 443 sysfs_remove_group(&sdp->sd_kobj, &args_group);
486fail_counters:
487 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
488fail_lockstruct: 444fail_lockstruct:
489 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 445 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
490fail_reg: 446fail_reg:
@@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
498{ 454{
499 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 455 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
500 sysfs_remove_group(&sdp->sd_kobj, &args_group); 456 sysfs_remove_group(&sdp->sd_kobj, &args_group);
501 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
502 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 457 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
503 kobject_put(&sdp->sd_kobj); 458 kobject_put(&sdp->sd_kobj);
504} 459}
505 460
461static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
462 struct kobj_uevent_env *env)
463{
464 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
465 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
466 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
467 return 0;
468}
469
470static struct kset_uevent_ops gfs2_uevent_ops = {
471 .uevent = gfs2_uevent,
472};
473
474
506int gfs2_sys_init(void) 475int gfs2_sys_init(void)
507{ 476{
508 gfs2_sys_margs = NULL; 477 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
509 spin_lock_init(&gfs2_sys_margs_lock);
510 gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
511 if (!gfs2_kset) 478 if (!gfs2_kset)
512 return -ENOMEM; 479 return -ENOMEM;
513 return 0; 480 return 0;
@@ -515,7 +482,6 @@ int gfs2_sys_init(void)
515 482
516void gfs2_sys_uninit(void) 483void gfs2_sys_uninit(void)
517{ 484{
518 kfree(gfs2_sys_margs);
519 kset_unregister(gfs2_kset); 485 kset_unregister(gfs2_kset);
520} 486}
521 487
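Editor's note: the sys.c changes above do two things: drop the old counters group and mount-args globals, and register a kset-wide uevent callback, so every uevent emitted for a gfs2 superblock kobject carries LOCKTABLE and LOCKPROTO environment variables that userspace tooling can key on. The registration shape against the 2.6.29 kobject API, with hypothetical names:

#include <linux/errno.h>
#include <linux/kobject.h>
#include <linux/fs.h>	/* for fs_kobj */

static int myfs_uevent(struct kset *kset, struct kobject *kobj,
		       struct kobj_uevent_env *env)
{
	/* variables added here ride along with every event in the kset */
	add_uevent_var(env, "MYFS_FLAVOUR=%s", "example");
	return 0;
}

static struct kset_uevent_ops myfs_uevent_ops = {
	.uevent = myfs_uevent,
};

static struct kset *myfs_kset;

static int myfs_sys_init(void)
{
	myfs_kset = kset_create_and_add("myfs", &myfs_uevent_ops, fs_kobj);
	return myfs_kset ? 0 : -ENOMEM;
}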
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 1ca8cdac5304..e94560e836d7 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -13,10 +13,6 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14struct gfs2_sbd; 14struct gfs2_sbd;
15 15
16/* Allow args to be passed to GFS2 when using an initial ram disk */
17extern char *gfs2_sys_margs;
18extern spinlock_t gfs2_sys_margs_lock;
19
20int gfs2_sys_fs_add(struct gfs2_sbd *sdp); 16int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
21void gfs2_sys_fs_del(struct gfs2_sbd *sdp); 17void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
22 18
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d31e355c61fb..374f50e95496 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly;
25struct kmem_cache *gfs2_inode_cachep __read_mostly; 25struct kmem_cache *gfs2_inode_cachep __read_mostly;
26struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 26struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
28struct kmem_cache *gfs2_quotad_cachep __read_mostly;
28 29
29void gfs2_assert_i(struct gfs2_sbd *sdp) 30void gfs2_assert_i(struct gfs2_sbd *sdp)
30{ 31{
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 7f48576289c9..33e96b0ce9ab 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 148extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 149extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 150extern struct kmem_cache *gfs2_rgrpd_cachep;
151extern struct kmem_cache *gfs2_quotad_cachep;
151 152
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, 153static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p) 154 unsigned int *p)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac170..5c538e0ec14b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
501{ 501{
502 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 502 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
503 503
504 *pagep = __grab_cache_page(mapping, index); 504 *pagep = grab_cache_page_write_begin(mapping, index, flags);
505 if (!*pagep) 505 if (!*pagep)
506 return -ENOMEM; 506 return -ENOMEM;
507 return 0; 507 return 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3aceb..6903d37af037 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
252 for (;;) { 252 for (;;) {
253 struct page *page; 253 struct page *page;
254 unsigned long nr, ret; 254 unsigned long nr, ret;
255 int ra;
255 256
256 /* nr is the maximum number of bytes to copy from this page */ 257 /* nr is the maximum number of bytes to copy from this page */
257 nr = huge_page_size(h); 258 nr = huge_page_size(h);
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
274 */ 275 */
275 ret = len < nr ? len : nr; 276 ret = len < nr ? len : nr;
276 if (clear_user(buf, ret)) 277 if (clear_user(buf, ret))
277 ret = -EFAULT; 278 ra = -EFAULT;
279 else
280 ra = 0;
278 } else { 281 } else {
279 /* 282 /*
280 * We have the page, copy it to user space buffer. 283 * We have the page, copy it to user space buffer.
281 */ 284 */
282 ret = hugetlbfs_read_actor(page, offset, buf, len, nr); 285 ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
286 ret = ra;
283 } 287 }
284 if (ret < 0) { 288 if (ra < 0) {
285 if (retval == 0) 289 if (retval == 0)
286 retval = ret; 290 retval = ra;
287 if (page) 291 if (page)
288 page_cache_release(page); 292 page_cache_release(page);
289 goto out; 293 goto out;
@@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
506 inode->i_mode = mode; 510 inode->i_mode = mode;
507 inode->i_uid = uid; 511 inode->i_uid = uid;
508 inode->i_gid = gid; 512 inode->i_gid = gid;
509 inode->i_blocks = 0;
510 inode->i_mapping->a_ops = &hugetlbfs_aops; 513 inode->i_mapping->a_ops = &hugetlbfs_aops;
511 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 514 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
512 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 515 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
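Editor's note: the hugetlbfs_read() hunk above is a signedness fix. `ret` is unsigned long, so the old `if (ret < 0)` could never be true and an -EFAULT from clear_user() was silently treated as a byte count; the patch routes the error through a signed int (`ra`) instead. The bug in miniature, as a plain userspace C sketch (copy_helper is a stand-in):

#include <errno.h>
#include <stdio.h>

static int copy_helper(void)
{
	return -EFAULT;		/* stand-in for a failing clear_user() */
}

int main(void)
{
	unsigned long ret;	/* byte count: unsigned, like the original */
	int ra;			/* signed carrier for the error code */

	ra = copy_helper();
	ret = ra;		/* -EFAULT wraps to a huge positive value */

	if (ret < 0)		/* the original bug: always false
				   (compilers typically warn here) */
		puts("unreachable");
	if (ra < 0)		/* the fix: test the signed variable */
		printf("error %d\n", ra);
	return 0;
}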
diff --git a/fs/inode.c b/fs/inode.c
index 0487ddba1397..913ab2d9a5d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/inotify.h> 23#include <linux/inotify.h>
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/async.h>
25 26
26/* 27/*
27 * This is needed for the following functions: 28 * This is needed for the following functions:
@@ -108,84 +109,102 @@ static void wake_up_inode(struct inode *inode)
108 wake_up_bit(&inode->i_state, __I_LOCK); 109 wake_up_bit(&inode->i_state, __I_LOCK);
109} 110}
110 111
111static struct inode *alloc_inode(struct super_block *sb) 112/**
113 * inode_init_always - perform inode structure initialisation
114 * @sb: superblock inode belongs to
115 * @inode: inode to initialise
116 *
117 * These are initializations that need to be done on every inode
118 * allocation as the fields are not initialised by slab allocation.
119 */
120struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
112{ 121{
113 static const struct address_space_operations empty_aops; 122 static const struct address_space_operations empty_aops;
114 static struct inode_operations empty_iops; 123 static struct inode_operations empty_iops;
115 static const struct file_operations empty_fops; 124 static const struct file_operations empty_fops;
116 struct inode *inode;
117 125
118 if (sb->s_op->alloc_inode) 126 struct address_space * const mapping = &inode->i_data;
119 inode = sb->s_op->alloc_inode(sb); 127
120 else 128 inode->i_sb = sb;
121 inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); 129 inode->i_blkbits = sb->s_blocksize_bits;
122 130 inode->i_flags = 0;
123 if (inode) { 131 atomic_set(&inode->i_count, 1);
124 struct address_space * const mapping = &inode->i_data; 132 inode->i_op = &empty_iops;
125 133 inode->i_fop = &empty_fops;
126 inode->i_sb = sb; 134 inode->i_nlink = 1;
127 inode->i_blkbits = sb->s_blocksize_bits; 135 inode->i_uid = 0;
128 inode->i_flags = 0; 136 inode->i_gid = 0;
129 atomic_set(&inode->i_count, 1); 137 atomic_set(&inode->i_writecount, 0);
130 inode->i_op = &empty_iops; 138 inode->i_size = 0;
131 inode->i_fop = &empty_fops; 139 inode->i_blocks = 0;
132 inode->i_nlink = 1; 140 inode->i_bytes = 0;
133 atomic_set(&inode->i_writecount, 0); 141 inode->i_generation = 0;
134 inode->i_size = 0;
135 inode->i_blocks = 0;
136 inode->i_bytes = 0;
137 inode->i_generation = 0;
138#ifdef CONFIG_QUOTA 142#ifdef CONFIG_QUOTA
139 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); 143 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
140#endif 144#endif
141 inode->i_pipe = NULL; 145 inode->i_pipe = NULL;
142 inode->i_bdev = NULL; 146 inode->i_bdev = NULL;
143 inode->i_cdev = NULL; 147 inode->i_cdev = NULL;
144 inode->i_rdev = 0; 148 inode->i_rdev = 0;
145 inode->dirtied_when = 0; 149 inode->dirtied_when = 0;
146 if (security_inode_alloc(inode)) { 150 if (security_inode_alloc(inode)) {
147 if (inode->i_sb->s_op->destroy_inode) 151 if (inode->i_sb->s_op->destroy_inode)
148 inode->i_sb->s_op->destroy_inode(inode); 152 inode->i_sb->s_op->destroy_inode(inode);
149 else 153 else
150 kmem_cache_free(inode_cachep, (inode)); 154 kmem_cache_free(inode_cachep, (inode));
151 return NULL; 155 return NULL;
152 } 156 }
153 157
154 spin_lock_init(&inode->i_lock); 158 spin_lock_init(&inode->i_lock);
155 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 159 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
156 160
157 mutex_init(&inode->i_mutex); 161 mutex_init(&inode->i_mutex);
158 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 162 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
159 163
160 init_rwsem(&inode->i_alloc_sem); 164 init_rwsem(&inode->i_alloc_sem);
161 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); 165 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
162 166
163 mapping->a_ops = &empty_aops; 167 mapping->a_ops = &empty_aops;
164 mapping->host = inode; 168 mapping->host = inode;
165 mapping->flags = 0; 169 mapping->flags = 0;
166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 170 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
167 mapping->assoc_mapping = NULL; 171 mapping->assoc_mapping = NULL;
168 mapping->backing_dev_info = &default_backing_dev_info; 172 mapping->backing_dev_info = &default_backing_dev_info;
169 mapping->writeback_index = 0; 173 mapping->writeback_index = 0;
170 174
171 /* 175 /*
172 * If the block_device provides a backing_dev_info for client 176 * inodes then use that. Otherwise the inode shares the bdev's
173 * inodes then use that. Otherwise the inode shares the bdev's 177 * backing_dev_info.
174 * backing_dev_info. 178 * backing_dev_info.
175 */ 179 */
176 if (sb->s_bdev) { 180 if (sb->s_bdev) {
177 struct backing_dev_info *bdi; 181 struct backing_dev_info *bdi;
178 182
179 bdi = sb->s_bdev->bd_inode_backing_dev_info; 183 bdi = sb->s_bdev->bd_inode_backing_dev_info;
180 if (!bdi) 184 if (!bdi)
181 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 185 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
182 mapping->backing_dev_info = bdi; 186 mapping->backing_dev_info = bdi;
183 }
184 inode->i_private = NULL;
185 inode->i_mapping = mapping;
186 } 187 }
188 inode->i_private = NULL;
189 inode->i_mapping = mapping;
190
187 return inode; 191 return inode;
188} 192}
193EXPORT_SYMBOL(inode_init_always);
194
195static struct inode *alloc_inode(struct super_block *sb)
196{
197 struct inode *inode;
198
199 if (sb->s_op->alloc_inode)
200 inode = sb->s_op->alloc_inode(sb);
201 else
202 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
203
204 if (inode)
205 return inode_init_always(sb, inode);
206 return NULL;
207}
189 208
190void destroy_inode(struct inode *inode) 209void destroy_inode(struct inode *inode)
191{ 210{
@@ -196,6 +215,7 @@ void destroy_inode(struct inode *inode)
196 else 215 else
197 kmem_cache_free(inode_cachep, (inode)); 216 kmem_cache_free(inode_cachep, (inode));
198} 217}
218EXPORT_SYMBOL(destroy_inode);
199 219
200 220
201/* 221/*
@@ -534,12 +554,55 @@ repeat:
534 return node ? inode : NULL; 554 return node ? inode : NULL;
535} 555}
536 556
557static unsigned long hash(struct super_block *sb, unsigned long hashval)
558{
559 unsigned long tmp;
560
561 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
562 L1_CACHE_BYTES;
563 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
564 return tmp & I_HASHMASK;
565}
566
567static inline void
568__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
569 struct inode *inode)
570{
571 inodes_stat.nr_inodes++;
572 list_add(&inode->i_list, &inode_in_use);
573 list_add(&inode->i_sb_list, &sb->s_inodes);
574 if (head)
575 hlist_add_head(&inode->i_hash, head);
576}
577
578/**
579 * inode_add_to_lists - add a new inode to relevant lists
580 * @sb: superblock inode belongs to
581 * @inode: inode to mark in use
582 *
583 * When an inode is allocated it needs to be accounted for, added to the in use
584 * list, the owning superblock and the inode hash. This needs to be done under
585 * the inode_lock, so export a function to do this rather than exporting the
586 * inode lock itself. The hash list is calculated internally, which requires
587 * the caller to have already set up the inode number in the inode being
588 * added.
589 */
590void inode_add_to_lists(struct super_block *sb, struct inode *inode)
591{
592 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
593
594 spin_lock(&inode_lock);
595 __inode_add_to_lists(sb, head, inode);
596 spin_unlock(&inode_lock);
597}
598EXPORT_SYMBOL_GPL(inode_add_to_lists);
599
537/** 600/**
538 * new_inode - obtain an inode 601 * new_inode - obtain an inode
539 * @sb: superblock 602 * @sb: superblock
540 * 603 *
541 * Allocates a new inode for given superblock. The default gfp_mask 604 * Allocates a new inode for given superblock. The default gfp_mask
542 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. 605 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
543 * If HIGHMEM pages are unsuitable or it is known that pages allocated 606 * If HIGHMEM pages are unsuitable or it is known that pages allocated
544 * for the page cache are not reclaimable or migratable, 607 * for the page cache are not reclaimable or migratable,
545 * mapping_set_gfp_mask() must be called with suitable flags on the 608 * mapping_set_gfp_mask() must be called with suitable flags on the
@@ -561,9 +624,7 @@ struct inode *new_inode(struct super_block *sb)
561 inode = alloc_inode(sb); 624 inode = alloc_inode(sb);
562 if (inode) { 625 if (inode) {
563 spin_lock(&inode_lock); 626 spin_lock(&inode_lock);
564 inodes_stat.nr_inodes++; 627 __inode_add_to_lists(sb, NULL, inode);
565 list_add(&inode->i_list, &inode_in_use);
566 list_add(&inode->i_sb_list, &sb->s_inodes);
567 inode->i_ino = ++last_ino; 628 inode->i_ino = ++last_ino;
568 inode->i_state = 0; 629 inode->i_state = 0;
569 spin_unlock(&inode_lock); 630 spin_unlock(&inode_lock);
@@ -622,10 +683,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h
622 if (set(inode, data)) 683 if (set(inode, data))
623 goto set_failed; 684 goto set_failed;
624 685
625 inodes_stat.nr_inodes++; 686 __inode_add_to_lists(sb, head, inode);
626 list_add(&inode->i_list, &inode_in_use);
627 list_add(&inode->i_sb_list, &sb->s_inodes);
628 hlist_add_head(&inode->i_hash, head);
629 inode->i_state = I_LOCK|I_NEW; 687 inode->i_state = I_LOCK|I_NEW;
630 spin_unlock(&inode_lock); 688 spin_unlock(&inode_lock);
631 689
@@ -671,10 +729,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
671 old = find_inode_fast(sb, head, ino); 729 old = find_inode_fast(sb, head, ino);
672 if (!old) { 730 if (!old) {
673 inode->i_ino = ino; 731 inode->i_ino = ino;
674 inodes_stat.nr_inodes++; 732 __inode_add_to_lists(sb, head, inode);
675 list_add(&inode->i_list, &inode_in_use);
676 list_add(&inode->i_sb_list, &sb->s_inodes);
677 hlist_add_head(&inode->i_hash, head);
678 inode->i_state = I_LOCK|I_NEW; 733 inode->i_state = I_LOCK|I_NEW;
679 spin_unlock(&inode_lock); 734 spin_unlock(&inode_lock);
680 735
@@ -698,16 +753,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
698 return inode; 753 return inode;
699} 754}
700 755
701static unsigned long hash(struct super_block *sb, unsigned long hashval)
702{
703 unsigned long tmp;
704
705 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
706 L1_CACHE_BYTES;
707 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
708 return tmp & I_HASHMASK;
709}
710
711/** 756/**
712 * iunique - get a unique inode number 757 * iunique - get a unique inode number
713 * @sb: superblock 758 * @sb: superblock
@@ -990,6 +1035,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
990 1035
991EXPORT_SYMBOL(iget_locked); 1036EXPORT_SYMBOL(iget_locked);
992 1037
1038int insert_inode_locked(struct inode *inode)
1039{
1040 struct super_block *sb = inode->i_sb;
1041 ino_t ino = inode->i_ino;
1042 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1043 struct inode *old;
1044
1045 inode->i_state |= I_LOCK|I_NEW;
1046 while (1) {
1047 spin_lock(&inode_lock);
1048 old = find_inode_fast(sb, head, ino);
1049 if (likely(!old)) {
1050 hlist_add_head(&inode->i_hash, head);
1051 spin_unlock(&inode_lock);
1052 return 0;
1053 }
1054 __iget(old);
1055 spin_unlock(&inode_lock);
1056 wait_on_inode(old);
1057 if (unlikely(!hlist_unhashed(&old->i_hash))) {
1058 iput(old);
1059 return -EBUSY;
1060 }
1061 iput(old);
1062 }
1063}
1064
1065EXPORT_SYMBOL(insert_inode_locked);
1066
1067int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1068 int (*test)(struct inode *, void *), void *data)
1069{
1070 struct super_block *sb = inode->i_sb;
1071 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1072 struct inode *old;
1073
1074 inode->i_state |= I_LOCK|I_NEW;
1075
1076 while (1) {
1077 spin_lock(&inode_lock);
1078 old = find_inode(sb, head, test, data);
1079 if (likely(!old)) {
1080 hlist_add_head(&inode->i_hash, head);
1081 spin_unlock(&inode_lock);
1082 return 0;
1083 }
1084 __iget(old);
1085 spin_unlock(&inode_lock);
1086 wait_on_inode(old);
1087 if (unlikely(!hlist_unhashed(&old->i_hash))) {
1088 iput(old);
1089 return -EBUSY;
1090 }
1091 iput(old);
1092 }
1093}
1094
1095EXPORT_SYMBOL(insert_inode_locked4);
1096
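The intended calling pattern for a create path, sketched (the surrounding allocation and error handling are illustrative, not from this patch): allocate the inode, pick the inumber, and let insert_inode_locked() either hash it or report that a live inode already owns that number.

	inode = new_inode(sb);
	if (!inode)
		return -ENOMEM;
	inode->i_ino = ino;			/* fs-chosen inode number */
	if (insert_inode_locked(inode) < 0) {
		/* a live inode with this number beat us to the hash */
		iput(inode);
		return -EBUSY;
	}
	/* ... write the on-disk inode ... */
	unlock_new_inode(inode);	/* clears I_LOCK|I_NEW, wakes waiters */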
993/** 1097/**
994 * __insert_inode_hash - hash an inode 1098 * __insert_inode_hash - hash an inode
995 * @inode: unhashed inode 1099 * @inode: unhashed inode
@@ -1292,6 +1396,7 @@ int inode_wait(void *word)
1292 schedule(); 1396 schedule();
1293 return 0; 1397 return 0;
1294} 1398}
1399EXPORT_SYMBOL(inode_wait);
1295 1400
1296/* 1401/*
1297 * If we try to find an inode in the inode hash while it is being 1402 * If we try to find an inode in the inode hash while it is being
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 43e8b2c0664b..20b0a8a24c6b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
233 233
234/* 234/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
235 * @inode - the inode to map 236 * @inode - the inode to map
236 * @arg - the pointer to userspace where we copy everything to 237 * @arg - the pointer to userspace where we copy everything to
237 * @get_block - the fs's get_block function 238 * @get_block - the fs's get_block function
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
242 * 243 *
243 * If it is possible to have data blocks beyond a hole past @inode->i_size, then 244 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
244 * please do not use this function, it will stop at the first unmapped block 245 * please do not use this function, it will stop at the first unmapped block
245 * beyond i_size 246 * beyond i_size.
247 *
248 * If you use this function directly, you need to do your own locking. Use
249 * generic_block_fiemap if you want the locking done for you.
246 */ 250 */
247int generic_block_fiemap(struct inode *inode, 251
248 struct fiemap_extent_info *fieinfo, u64 start, 252int __generic_block_fiemap(struct inode *inode,
249 u64 len, get_block_t *get_block) 253 struct fiemap_extent_info *fieinfo, u64 start,
254 u64 len, get_block_t *get_block)
250{ 255{
251 struct buffer_head tmp; 256 struct buffer_head tmp;
252 unsigned int start_blk; 257 unsigned int start_blk;
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode,
260 265
261 start_blk = logical_to_blk(inode, start); 266 start_blk = logical_to_blk(inode, start);
262 267
263 /* guard against change */
264 mutex_lock(&inode->i_mutex);
265
266 length = (long long)min_t(u64, len, i_size_read(inode)); 268 length = (long long)min_t(u64, len, i_size_read(inode));
267 map_len = length; 269 map_len = length;
268 270
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode,
334 cond_resched(); 336 cond_resched();
335 } while (1); 337 } while (1);
336 338
337 mutex_unlock(&inode->i_mutex);
338
339 /* if ret is 1 then we just hit the end of the extent array */ 339 /* if ret is 1 then we just hit the end of the extent array */
340 if (ret == 1) 340 if (ret == 1)
341 ret = 0; 341 ret = 0;
342 342
343 return ret; 343 return ret;
344} 344}
345EXPORT_SYMBOL(__generic_block_fiemap);
346
347/**
348 * generic_block_fiemap - FIEMAP for block based inodes
349 * @inode: The inode to map
350 * @fieinfo: The mapping information
351 * @start: The initial block to map
 352 * @len: The length of the extent to attempt to map
353 * @get_block: The block mapping function for the fs
354 *
355 * Calls __generic_block_fiemap to map the inode, after taking
356 * the inode's mutex lock.
357 */
358
359int generic_block_fiemap(struct inode *inode,
360 struct fiemap_extent_info *fieinfo, u64 start,
361 u64 len, get_block_t *get_block)
362{
363 int ret;
364 mutex_lock(&inode->i_mutex);
365 ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
366 mutex_unlock(&inode->i_mutex);
367 return ret;
368}
345EXPORT_SYMBOL(generic_block_fiemap); 369EXPORT_SYMBOL(generic_block_fiemap);
346 370
347#endif /* CONFIG_BLOCK */ 371#endif /* CONFIG_BLOCK */
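For most block-based filesystems the locked wrapper is all that is needed; per the new kerneldoc, only callers that do their own locking should drop to __generic_block_fiemap. A sketch of the common wiring, with examplefs_get_block standing in for the filesystem's real get_block_t:

static int examplefs_fiemap(struct inode *inode,
			    struct fiemap_extent_info *fieinfo,
			    u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len,
				    examplefs_get_block);
}

static const struct inode_operations examplefs_file_iops = {
	.fiemap		= examplefs_fiemap,
	/* ... */
};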
@@ -415,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
415 return error; 439 return error;
416} 440}
417 441
442static int ioctl_fsfreeze(struct file *filp)
443{
444 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
445
446 if (!capable(CAP_SYS_ADMIN))
447 return -EPERM;
448
 449 /* If the filesystem doesn't support the freeze feature, return. */
450 if (sb->s_op->freeze_fs == NULL)
451 return -EOPNOTSUPP;
452
 453 /* If the filesystem isn't backed by a block device, return. */
454 if (sb->s_bdev == NULL)
455 return -EINVAL;
456
457 /* Freeze */
458 sb = freeze_bdev(sb->s_bdev);
459 if (IS_ERR(sb))
460 return PTR_ERR(sb);
461 return 0;
462}
463
464static int ioctl_fsthaw(struct file *filp)
465{
466 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
467
468 if (!capable(CAP_SYS_ADMIN))
469 return -EPERM;
470
 471 /* If the filesystem isn't backed by a block device, return EINVAL. */
472 if (sb->s_bdev == NULL)
473 return -EINVAL;
474
475 /* Thaw */
476 return thaw_bdev(sb->s_bdev, sb);
477}
478
418/* 479/*
419 * When you add any new common ioctls to the switches above and below 480 * When you add any new common ioctls to the switches above and below
420 * please update compat_sys_ioctl() too. 481 * please update compat_sys_ioctl() too.
@@ -462,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
462 } else 523 } else
463 error = -ENOTTY; 524 error = -ENOTTY;
464 break; 525 break;
526
527 case FIFREEZE:
528 error = ioctl_fsfreeze(filp);
529 break;
530
531 case FITHAW:
532 error = ioctl_fsthaw(filp);
533 break;
534
465 default: 535 default:
466 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 536 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
467 error = file_ioctl(filp, cmd, arg); 537 error = file_ioctl(filp, cmd, arg);
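From userspace the two new ioctls can be issued against any open file or directory on the filesystem, and the caller needs CAP_SYS_ADMIN. A minimal sketch of a freeze/thaw wrapper around a snapshot, assuming the matching FIFREEZE/FITHAW definitions from linux/fs.h (the ioctl argument is ignored by this implementation):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY);	/* any file on the fs */

	if (fd < 0 || ioctl(fd, FIFREEZE, 0) < 0) {
		perror("freeze");
		return 1;
	}
	/* ... take the block-level snapshot here ... */
	if (ioctl(fd, FITHAW, 0) < 0) {
		perror("thaw");
		return 1;
	}
	return close(fd);
}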
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a2..1a39ac370942 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29 29
30static int set_task_ioprio(struct task_struct *task, int ioprio) 30int set_task_ioprio(struct task_struct *task, int ioprio)
31{ 31{
32 int err; 32 int err;
33 struct io_context *ioc; 33 struct io_context *ioc;
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
70 task_unlock(task); 70 task_unlock(task);
71 return err; 71 return err;
72} 72}
73EXPORT_SYMBOL_GPL(set_task_ioprio);
73 74
74asmlinkage long sys_ioprio_set(int which, int who, int ioprio) 75asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
75{ 76{
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505b..6147ec3643a0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
855 } 855 }
856 sbi->s_joliet_level = joliet_level; 856 sbi->s_joliet_level = joliet_level;
857 857
858 /* check the root inode */
859 if (!inode->i_op)
860 goto out_bad_root;
861
862 /* Make sure the root inode is a directory */ 858 /* Make sure the root inode is a directory */
863 if (!S_ISDIR(inode->i_mode)) { 859 if (!S_ISDIR(inode->i_mode)) {
864 printk(KERN_WARNING 860 printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
886 /* 882 /*
887 * Display error messages and free resources. 883 * Display error messages and free resources.
888 */ 884 */
889out_bad_root:
890 printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
891out_iput: 885out_iput:
892 iput(inode); 886 iput(inode);
893 goto out_no_inode; 887 goto out_no_inode;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
306 int flags; 306 int flags;
307 int err; 307 int err;
308 unsigned long blocknr; 308 unsigned long blocknr;
309 ktime_t start_time;
310 u64 commit_time;
309 char *tagp = NULL; 311 char *tagp = NULL;
310 journal_header_t *header; 312 journal_header_t *header;
311 journal_block_tag_t *tag = NULL; 313 journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
418 commit_transaction->t_state = T_FLUSH; 420 commit_transaction->t_state = T_FLUSH;
419 journal->j_committing_transaction = commit_transaction; 421 journal->j_committing_transaction = commit_transaction;
420 journal->j_running_transaction = NULL; 422 journal->j_running_transaction = NULL;
423 start_time = ktime_get();
421 commit_transaction->t_log_start = journal->j_head; 424 commit_transaction->t_log_start = journal->j_head;
422 wake_up(&journal->j_wait_transaction_locked); 425 wake_up(&journal->j_wait_transaction_locked);
423 spin_unlock(&journal->j_state_lock); 426 spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
913 J_ASSERT(commit_transaction == journal->j_committing_transaction); 916 J_ASSERT(commit_transaction == journal->j_committing_transaction);
914 journal->j_commit_sequence = commit_transaction->t_tid; 917 journal->j_commit_sequence = commit_transaction->t_tid;
915 journal->j_committing_transaction = NULL; 918 journal->j_committing_transaction = NULL;
919 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
920
921 /*
922 * weight the commit time higher than the average time so we don't
923 * react too strongly to vast changes in commit time
924 */
925 if (likely(journal->j_average_commit_time))
926 journal->j_average_commit_time = (commit_time*3 +
927 journal->j_average_commit_time) / 4;
928 else
929 journal->j_average_commit_time = commit_time;
930
916 spin_unlock(&journal->j_state_lock); 931 spin_unlock(&journal->j_state_lock);
917 932
918 if (commit_transaction->t_checkpoint_list == NULL && 933 if (commit_transaction->t_checkpoint_list == NULL &&
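The average is an exponentially weighted moving average in nanoseconds, seeded with the first sample and thereafter weighting the newest commit 3:1 (note the jbd2 copy of this change, further below, weights the old average 3:1 instead). The same update step, factored out for illustration:

/* One update of the running average used above (jbd weighting). */
static inline u64 example_avg_commit_time(u64 avg, u64 sample_ns)
{
	return avg ? (sample_ns * 3 + avg) / 4 : sample_ns;
}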
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..e6a117431277 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __journal_temp_unlink_buffer(struct journal_head *jh); 30static void __journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
49{ 50{
50 transaction->t_journal = journal; 51 transaction->t_journal = journal;
51 transaction->t_state = T_RUNNING; 52 transaction->t_state = T_RUNNING;
53 transaction->t_start_time = ktime_get();
52 transaction->t_tid = journal->j_transaction_sequence++; 54 transaction->t_tid = journal->j_transaction_sequence++;
53 transaction->t_expires = jiffies + journal->j_commit_interval; 55 transaction->t_expires = jiffies + journal->j_commit_interval;
54 spin_lock_init(&transaction->t_handle_lock); 56 spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
752 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. 754 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
753 * @handle: transaction to add buffer modifications to 755 * @handle: transaction to add buffer modifications to
754 * @bh: bh to be used for metadata writes 756 * @bh: bh to be used for metadata writes
755 * @credits: variable that will receive credits for the buffer
756 * 757 *
757 * Returns an error code or 0 on success. 758 * Returns an error code or 0 on success.
758 * 759 *
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
1370{ 1371{
1371 transaction_t *transaction = handle->h_transaction; 1372 transaction_t *transaction = handle->h_transaction;
1372 journal_t *journal = transaction->t_journal; 1373 journal_t *journal = transaction->t_journal;
1373 int old_handle_count, err; 1374 int err;
1374 pid_t pid; 1375 pid_t pid;
1375 1376
1376 J_ASSERT(journal_current_handle() == handle); 1377 J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
1399 * on IO anyway. Speeds up many-threaded, many-dir operations 1400 * on IO anyway. Speeds up many-threaded, many-dir operations
1400 * by 30x or more... 1401 * by 30x or more...
1401 * 1402 *
1403 * We try and optimize the sleep time against what the underlying disk
 1404 * can do, instead of having a static sleep time. This is useful for
1405 * the case where our storage is so fast that it is more optimal to go
1406 * ahead and force a flush and wait for the transaction to be committed
1407 * than it is to wait for an arbitrary amount of time for new writers to
 1408 * join the transaction. We achieve this by measuring how long it takes
1409 * to commit a transaction, and compare it with how long this
1410 * transaction has been running, and if run time < commit time then we
1411 * sleep for the delta and commit. This greatly helps super fast disks
1412 * that would see slowdowns as more threads started doing fsyncs.
1413 *
1402 * But don't do this if this process was the most recent one to 1414 * But don't do this if this process was the most recent one to
1403 * perform a synchronous write. We do this to detect the case where a 1415 * perform a synchronous write. We do this to detect the case where a
1404 * single process is doing a stream of sync writes. No point in waiting 1416 * single process is doing a stream of sync writes. No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
1406 */ 1418 */
1407 pid = current->pid; 1419 pid = current->pid;
1408 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1420 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1421 u64 commit_time, trans_time;
1422
1409 journal->j_last_sync_writer = pid; 1423 journal->j_last_sync_writer = pid;
1410 do { 1424
1411 old_handle_count = transaction->t_handle_count; 1425 spin_lock(&journal->j_state_lock);
1412 schedule_timeout_uninterruptible(1); 1426 commit_time = journal->j_average_commit_time;
1413 } while (old_handle_count != transaction->t_handle_count); 1427 spin_unlock(&journal->j_state_lock);
1428
1429 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1430 transaction->t_start_time));
1431
1432 commit_time = min_t(u64, commit_time,
1433 1000*jiffies_to_usecs(1));
1434
1435 if (trans_time < commit_time) {
1436 ktime_t expires = ktime_add_ns(ktime_get(),
1437 commit_time);
1438 set_current_state(TASK_UNINTERRUPTIBLE);
1439 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1440 }
1414 } 1441 }
1415 1442
1416 current->journal_info = NULL; 1443 current->journal_info = NULL;
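The old loop slept a whole jiffy at a time for as long as new handles kept arriving; the new code sleeps at most once, for the time a typical commit still needs, capped at one jiffy. For example, with j_average_commit_time at 200us and a transaction that has run for 50us, the handle sleeps roughly 150us instead of 1-10ms. The sleep itself is the standard hrtimer pattern, sketched standalone:

#include <linux/hrtimer.h>
#include <linux/sched.h>

/* Sleep for roughly delta_ns with hrtimer resolution, uninterruptibly. */
static void example_sleep_ns(u64 delta_ns)
{
	ktime_t expires = ktime_add_ns(ktime_get(), delta_ns);

	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
}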
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe920..17159cacbd9e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
249 return ret; 249 return ret;
250} 250}
251 251
252#define NR_BATCH 64
253
254static void 252static void
255__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 253__flush_batch(journal_t *journal, int *batch_count)
256{ 254{
257 int i; 255 int i;
258 256
259 ll_rw_block(SWRITE, *batch_count, bhs); 257 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
260 for (i = 0; i < *batch_count; i++) { 258 for (i = 0; i < *batch_count; i++) {
261 struct buffer_head *bh = bhs[i]; 259 struct buffer_head *bh = journal->j_chkpt_bhs[i];
262 clear_buffer_jwrite(bh); 260 clear_buffer_jwrite(bh);
263 BUFFER_TRACE(bh, "brelse"); 261 BUFFER_TRACE(bh, "brelse");
264 __brelse(bh); 262 __brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
277 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 275 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
278 */ 276 */
279static int __process_buffer(journal_t *journal, struct journal_head *jh, 277static int __process_buffer(journal_t *journal, struct journal_head *jh,
280 struct buffer_head **bhs, int *batch_count, 278 int *batch_count, transaction_t *transaction)
281 transaction_t *transaction)
282{ 279{
283 struct buffer_head *bh = jh2bh(jh); 280 struct buffer_head *bh = jh2bh(jh);
284 int ret = 0; 281 int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
325 get_bh(bh); 322 get_bh(bh);
326 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 323 J_ASSERT_BH(bh, !buffer_jwrite(bh));
327 set_buffer_jwrite(bh); 324 set_buffer_jwrite(bh);
328 bhs[*batch_count] = bh; 325 journal->j_chkpt_bhs[*batch_count] = bh;
329 __buffer_relink_io(jh); 326 __buffer_relink_io(jh);
330 jbd_unlock_bh_state(bh); 327 jbd_unlock_bh_state(bh);
331 transaction->t_chp_stats.cs_written++; 328 transaction->t_chp_stats.cs_written++;
332 (*batch_count)++; 329 (*batch_count)++;
333 if (*batch_count == NR_BATCH) { 330 if (*batch_count == JBD2_NR_BATCH) {
334 spin_unlock(&journal->j_list_lock); 331 spin_unlock(&journal->j_list_lock);
335 __flush_batch(journal, bhs, batch_count); 332 __flush_batch(journal, batch_count);
336 ret = 1; 333 ret = 1;
337 } 334 }
338 } 335 }
@@ -388,7 +385,6 @@ restart:
388 if (journal->j_checkpoint_transactions == transaction && 385 if (journal->j_checkpoint_transactions == transaction &&
389 transaction->t_tid == this_tid) { 386 transaction->t_tid == this_tid) {
390 int batch_count = 0; 387 int batch_count = 0;
391 struct buffer_head *bhs[NR_BATCH];
392 struct journal_head *jh; 388 struct journal_head *jh;
393 int retry = 0, err; 389 int retry = 0, err;
394 390
@@ -402,7 +398,7 @@ restart:
402 retry = 1; 398 retry = 1;
403 break; 399 break;
404 } 400 }
405 retry = __process_buffer(journal, jh, bhs, &batch_count, 401 retry = __process_buffer(journal, jh, &batch_count,
406 transaction); 402 transaction);
407 if (retry < 0 && !result) 403 if (retry < 0 && !result)
408 result = retry; 404 result = retry;
@@ -419,7 +415,7 @@ restart:
419 spin_unlock(&journal->j_list_lock); 415 spin_unlock(&journal->j_list_lock);
420 retry = 1; 416 retry = 1;
421 } 417 }
422 __flush_batch(journal, bhs, &batch_count); 418 __flush_batch(journal, &batch_count);
423 } 419 }
424 420
425 if (retry) { 421 if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
686 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
687 683
688 __jbd2_journal_drop_transaction(journal, transaction); 684 __jbd2_journal_drop_transaction(journal, transaction);
685 kfree(transaction);
689 686
690 /* Just in case anybody was waiting for more transactions to be 687 /* Just in case anybody was waiting for more transactions to be
691 checkpointed... */ 688 checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
760 J_ASSERT(journal->j_running_transaction != transaction); 757 J_ASSERT(journal->j_running_transaction != transaction);
761 758
762 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 759 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
763 kfree(transaction);
764} 760}
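Moving the batch array off the checkpoint stack frame into journal_t (j_chkpt_bhs, JBD2_NR_BATCH entries) drops roughly half a kilobyte of stack on 64-bit. It is safe only because checkpointing is single-threaded per journal, which is exactly what the j_checkpoint_mutex hunk in jbd2_journal_destroy (below) restores for the one previously unlocked caller. The invariant, sketched:

	/* j_chkpt_bhs is per-journal scratch: only ever touched with
	 * j_checkpoint_mutex held, so one batch is in flight at a time. */
	mutex_lock(&journal->j_checkpoint_mutex);
	jbd2_log_do_checkpoint(journal);	/* fills/flushes j_chkpt_bhs */
	mutex_unlock(&journal->j_checkpoint_mutex);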
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a8..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/crc32.h> 25#include <linux/crc32.h>
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bio.h>
28 29
29/* 30/*
30 * Default IO end handler for temporary BJ_IO buffer_heads. 31 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
137 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
138 barrier_done = 1; 139 barrier_done = 1;
139 } 140 }
140 ret = submit_bh(WRITE, bh); 141 ret = submit_bh(WRITE_SYNC, bh);
141 if (barrier_done) 142 if (barrier_done)
142 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
143 144
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
158 lock_buffer(bh); 159 lock_buffer(bh);
159 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
160 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
161 ret = submit_bh(WRITE, bh); 162 ret = submit_bh(WRITE_SYNC, bh);
162 } 163 }
163 *cbh = bh; 164 *cbh = bh;
164 return ret; 165 return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
168 * This function along with journal_submit_commit_record 169 * This function along with journal_submit_commit_record
169 * allows to write the commit record asynchronously. 170 * allows to write the commit record asynchronously.
170 */ 171 */
171static int journal_wait_on_commit_record(struct buffer_head *bh) 172static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
172{ 174{
173 int ret = 0; 175 int ret = 0;
174 176
177retry:
175 clear_buffer_dirty(bh); 178 clear_buffer_dirty(bh);
176 wait_on_buffer(bh); 179 wait_on_buffer(bh);
180 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
181 printk(KERN_WARNING
182 "JBD2: wait_on_commit_record: sync failed on %s - "
183 "disabling barriers\n", journal->j_devname);
184 spin_lock(&journal->j_state_lock);
185 journal->j_flags &= ~JBD2_BARRIER;
186 spin_unlock(&journal->j_state_lock);
187
188 lock_buffer(bh);
189 clear_buffer_dirty(bh);
190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync;
192
193 ret = submit_bh(WRITE_SYNC, bh);
194 if (ret) {
195 unlock_buffer(bh);
196 return ret;
197 }
198 goto retry;
199 }
177 200
178 if (unlikely(!buffer_uptodate(bh))) 201 if (unlikely(!buffer_uptodate(bh)))
179 ret = -EIO; 202 ret = -EIO;
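Some devices only report missing barrier support at IO completion, so the wait path rather than the submit path has to handle it. Condensed, the recovery is: drop JBD2_BARRIER so later commits never retry this, then reissue the very same commit block as a plain synchronous write:

	wait_on_buffer(bh);
	if (buffer_eopnotsupp(bh)) {
		journal->j_flags &= ~JBD2_BARRIER;	/* under j_state_lock */
		lock_buffer(bh);
		clear_buffer_dirty(bh);
		set_buffer_uptodate(bh);
		bh->b_end_io = journal_end_buffer_io_sync;
		if (!submit_bh(WRITE_SYNC, bh))
			goto retry;	/* wait again, barrier-free */
		/* on failure: unlock_buffer(bh) and return the error */
	}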
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
332 int flags; 355 int flags;
333 int err; 356 int err;
334 unsigned long long blocknr; 357 unsigned long long blocknr;
358 ktime_t start_time;
359 u64 commit_time;
335 char *tagp = NULL; 360 char *tagp = NULL;
336 journal_header_t *header; 361 journal_header_t *header;
337 journal_block_tag_t *tag = NULL; 362 journal_block_tag_t *tag = NULL;
338 int space_left = 0; 363 int space_left = 0;
339 int first_tag = 0; 364 int first_tag = 0;
340 int tag_flag; 365 int tag_flag;
341 int i; 366 int i, to_free = 0;
342 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
343 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
344 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
458 commit_transaction->t_state = T_FLUSH; 483 commit_transaction->t_state = T_FLUSH;
459 journal->j_committing_transaction = commit_transaction; 484 journal->j_committing_transaction = commit_transaction;
460 journal->j_running_transaction = NULL; 485 journal->j_running_transaction = NULL;
486 start_time = ktime_get();
461 commit_transaction->t_log_start = journal->j_head; 487 commit_transaction->t_log_start = journal->j_head;
462 wake_up(&journal->j_wait_transaction_locked); 488 wake_up(&journal->j_wait_transaction_locked);
463 spin_unlock(&journal->j_state_lock); 489 spin_unlock(&journal->j_state_lock);
@@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
509 if (is_journal_aborted(journal)) { 535 if (is_journal_aborted(journal)) {
510 clear_buffer_jbddirty(jh2bh(jh)); 536 clear_buffer_jbddirty(jh2bh(jh));
511 JBUFFER_TRACE(jh, "journal is aborting: refile"); 537 JBUFFER_TRACE(jh, "journal is aborting: refile");
538 jbd2_buffer_abort_trigger(jh,
539 jh->b_frozen_data ?
540 jh->b_frozen_triggers :
541 jh->b_triggers);
512 jbd2_journal_refile_buffer(journal, jh); 542 jbd2_journal_refile_buffer(journal, jh);
513 /* If that was the last one, we need to clean up 543 /* If that was the last one, we need to clean up
514 * any descriptor buffers which may have been 544 * any descriptor buffers which may have been
@@ -799,7 +829,7 @@ wait_for_iobuf:
799 __jbd2_journal_abort_hard(journal); 829 __jbd2_journal_abort_hard(journal);
800 } 830 }
801 if (!err && !is_journal_aborted(journal)) 831 if (!err && !is_journal_aborted(journal))
802 err = journal_wait_on_commit_record(cbh); 832 err = journal_wait_on_commit_record(journal, cbh);
803 833
804 if (err) 834 if (err)
805 jbd2_journal_abort(journal, err); 835 jbd2_journal_abort(journal, err);
@@ -844,6 +874,9 @@ restart_loop:
844 * data. 874 * data.
845 * 875 *
846 * Otherwise, we can just throw away the frozen data now. 876 * Otherwise, we can just throw away the frozen data now.
877 *
878 * We also know that the frozen data has already fired
879 * its triggers if they exist, so we can clear that too.
847 */ 880 */
848 if (jh->b_committed_data) { 881 if (jh->b_committed_data) {
849 jbd2_free(jh->b_committed_data, bh->b_size); 882 jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +884,12 @@ restart_loop:
851 if (jh->b_frozen_data) { 884 if (jh->b_frozen_data) {
852 jh->b_committed_data = jh->b_frozen_data; 885 jh->b_committed_data = jh->b_frozen_data;
853 jh->b_frozen_data = NULL; 886 jh->b_frozen_data = NULL;
887 jh->b_frozen_triggers = NULL;
854 } 888 }
855 } else if (jh->b_frozen_data) { 889 } else if (jh->b_frozen_data) {
856 jbd2_free(jh->b_frozen_data, bh->b_size); 890 jbd2_free(jh->b_frozen_data, bh->b_size);
857 jh->b_frozen_data = NULL; 891 jh->b_frozen_data = NULL;
892 jh->b_frozen_triggers = NULL;
858 } 893 }
859 894
860 spin_lock(&journal->j_list_lock); 895 spin_lock(&journal->j_list_lock);
@@ -972,14 +1007,23 @@ restart_loop:
972 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1007 J_ASSERT(commit_transaction == journal->j_committing_transaction);
973 journal->j_commit_sequence = commit_transaction->t_tid; 1008 journal->j_commit_sequence = commit_transaction->t_tid;
974 journal->j_committing_transaction = NULL; 1009 journal->j_committing_transaction = NULL;
975 spin_unlock(&journal->j_state_lock); 1010 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
976 1011
977 if (journal->j_commit_callback) 1012 /*
978 journal->j_commit_callback(journal, commit_transaction); 1013 * weight the commit time higher than the average time so we don't
1014 * react too strongly to vast changes in the commit time
1015 */
1016 if (likely(journal->j_average_commit_time))
1017 journal->j_average_commit_time = (commit_time +
1018 journal->j_average_commit_time*3) / 4;
1019 else
1020 journal->j_average_commit_time = commit_time;
1021 spin_unlock(&journal->j_state_lock);
979 1022
980 if (commit_transaction->t_checkpoint_list == NULL && 1023 if (commit_transaction->t_checkpoint_list == NULL &&
981 commit_transaction->t_checkpoint_io_list == NULL) { 1024 commit_transaction->t_checkpoint_io_list == NULL) {
982 __jbd2_journal_drop_transaction(journal, commit_transaction); 1025 __jbd2_journal_drop_transaction(journal, commit_transaction);
1026 to_free = 1;
983 } else { 1027 } else {
984 if (journal->j_checkpoint_transactions == NULL) { 1028 if (journal->j_checkpoint_transactions == NULL) {
985 journal->j_checkpoint_transactions = commit_transaction; 1029 journal->j_checkpoint_transactions = commit_transaction;
@@ -998,11 +1042,16 @@ restart_loop:
998 } 1042 }
999 spin_unlock(&journal->j_list_lock); 1043 spin_unlock(&journal->j_list_lock);
1000 1044
1045 if (journal->j_commit_callback)
1046 journal->j_commit_callback(journal, commit_transaction);
1047
1001 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1048 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1002 journal->j_devname, journal->j_commit_sequence, 1049 journal->j_devname, commit_transaction->t_tid,
1003 journal->j_tail_sequence); 1050 journal->j_tail_sequence);
1004 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1051 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1005 journal->j_commit_sequence, journal->j_tail_sequence); 1052 journal->j_commit_sequence, journal->j_tail_sequence);
1053 if (to_free)
1054 kfree(commit_transaction);
1006 1055
1007 wake_up(&journal->j_wait_done_commit); 1056 wake_up(&journal->j_wait_done_commit);
1008} 1057}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f8..56675306ed81 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -40,6 +40,7 @@
40 40
41#include <asm/uaccess.h> 41#include <asm/uaccess.h>
42#include <asm/page.h> 42#include <asm/page.h>
43#include <asm/div64.h>
43 44
44EXPORT_SYMBOL(jbd2_journal_start); 45EXPORT_SYMBOL(jbd2_journal_start);
45EXPORT_SYMBOL(jbd2_journal_restart); 46EXPORT_SYMBOL(jbd2_journal_restart);
@@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
50EXPORT_SYMBOL(jbd2_journal_get_write_access); 51EXPORT_SYMBOL(jbd2_journal_get_write_access);
51EXPORT_SYMBOL(jbd2_journal_get_create_access); 52EXPORT_SYMBOL(jbd2_journal_get_create_access);
52EXPORT_SYMBOL(jbd2_journal_get_undo_access); 53EXPORT_SYMBOL(jbd2_journal_get_undo_access);
54EXPORT_SYMBOL(jbd2_journal_set_triggers);
53EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 55EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
54EXPORT_SYMBOL(jbd2_journal_release_buffer); 56EXPORT_SYMBOL(jbd2_journal_release_buffer);
55EXPORT_SYMBOL(jbd2_journal_forget); 57EXPORT_SYMBOL(jbd2_journal_forget);
@@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
65EXPORT_SYMBOL(jbd2_journal_check_used_features); 67EXPORT_SYMBOL(jbd2_journal_check_used_features);
66EXPORT_SYMBOL(jbd2_journal_check_available_features); 68EXPORT_SYMBOL(jbd2_journal_check_available_features);
67EXPORT_SYMBOL(jbd2_journal_set_features); 69EXPORT_SYMBOL(jbd2_journal_set_features);
68EXPORT_SYMBOL(jbd2_journal_create);
69EXPORT_SYMBOL(jbd2_journal_load); 70EXPORT_SYMBOL(jbd2_journal_load);
70EXPORT_SYMBOL(jbd2_journal_destroy); 71EXPORT_SYMBOL(jbd2_journal_destroy);
71EXPORT_SYMBOL(jbd2_journal_abort); 72EXPORT_SYMBOL(jbd2_journal_abort);
@@ -131,8 +132,9 @@ static int kjournald2(void *arg)
131 journal->j_task = current; 132 journal->j_task = current;
132 wake_up(&journal->j_wait_done_commit); 133 wake_up(&journal->j_wait_done_commit);
133 134
134 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", 135 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
135 journal->j_commit_interval / HZ); 136 "commit interval %ld seconds\n", current->pid,
137 journal->j_devname, journal->j_commit_interval / HZ);
136 138
137 /* 139 /*
138 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
@@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
290 struct page *new_page; 292 struct page *new_page;
291 unsigned int new_offset; 293 unsigned int new_offset;
292 struct buffer_head *bh_in = jh2bh(jh_in); 294 struct buffer_head *bh_in = jh2bh(jh_in);
295 struct jbd2_buffer_trigger_type *triggers;
293 296
294 /* 297 /*
295 * The buffer really shouldn't be locked: only the current committing 298 * The buffer really shouldn't be locked: only the current committing
@@ -314,13 +317,23 @@ repeat:
314 done_copy_out = 1; 317 done_copy_out = 1;
315 new_page = virt_to_page(jh_in->b_frozen_data); 318 new_page = virt_to_page(jh_in->b_frozen_data);
316 new_offset = offset_in_page(jh_in->b_frozen_data); 319 new_offset = offset_in_page(jh_in->b_frozen_data);
320 triggers = jh_in->b_frozen_triggers;
317 } else { 321 } else {
318 new_page = jh2bh(jh_in)->b_page; 322 new_page = jh2bh(jh_in)->b_page;
319 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 323 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
324 triggers = jh_in->b_triggers;
320 } 325 }
321 326
322 mapped_data = kmap_atomic(new_page, KM_USER0); 327 mapped_data = kmap_atomic(new_page, KM_USER0);
323 /* 328 /*
329 * Fire any commit trigger. Do this before checking for escaping,
330 * as the trigger may modify the magic offset. If a copy-out
331 * happens afterwards, it will have the correct data in the buffer.
332 */
333 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
334 triggers);
335
336 /*
324 * Check for escaping 337 * Check for escaping
325 */ 338 */
326 if (*((__be32 *)(mapped_data + new_offset)) == 339 if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +365,13 @@ repeat:
352 new_page = virt_to_page(tmp); 365 new_page = virt_to_page(tmp);
353 new_offset = offset_in_page(tmp); 366 new_offset = offset_in_page(tmp);
354 done_copy_out = 1; 367 done_copy_out = 1;
368
369 /*
370 * This isn't strictly necessary, as we're using frozen
371 * data for the escaping, but it keeps consistency with
372 * b_frozen_data usage.
373 */
374 jh_in->b_frozen_triggers = jh_in->b_triggers;
355 } 375 }
356 376
357 /* 377 /*
@@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
631 return NULL; 651 return NULL;
632 652
633 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 653 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
654 if (!bh)
655 return NULL;
634 lock_buffer(bh); 656 lock_buffer(bh);
635 memset(bh->b_data, 0, journal->j_blocksize); 657 memset(bh->b_data, 0, journal->j_blocksize);
636 set_buffer_uptodate(bh); 658 set_buffer_uptodate(bh);
@@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
824 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
825 seq_printf(seq, " %ums logging transaction\n", 847 seq_printf(seq, " %ums logging transaction\n",
826 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
849 seq_printf(seq, " %luus average transaction commit time\n",
850 do_div(s->journal->j_average_commit_time, 1000));
827 seq_printf(seq, " %lu handles per transaction\n", 851 seq_printf(seq, " %lu handles per transaction\n",
828 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 852 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
829 seq_printf(seq, " %lu blocks per transaction\n", 853 seq_printf(seq, " %lu blocks per transaction\n",
@@ -961,6 +985,8 @@ static journal_t * journal_init_common (void)
961 spin_lock_init(&journal->j_state_lock); 985 spin_lock_init(&journal->j_state_lock);
962 986
963 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 987 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
988 journal->j_min_batch_time = 0;
989 journal->j_max_batch_time = 15000; /* 15ms */
964 990
965 /* The journal is marked for error until we succeed with recovery! */ 991 /* The journal is marked for error until we succeed with recovery! */
966 journal->j_flags = JBD2_ABORT; 992 journal->j_flags = JBD2_ABORT;
@@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1016 1042
1017 /* journal descriptor can store up to n blocks -bzzz */ 1043 /* journal descriptor can store up to n blocks -bzzz */
1018 journal->j_blocksize = blocksize; 1044 journal->j_blocksize = blocksize;
1045 jbd2_stats_proc_init(journal);
1019 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1046 n = journal->j_blocksize / sizeof(journal_block_tag_t);
1020 journal->j_wbufsize = n; 1047 journal->j_wbufsize = n;
1021 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 1048 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
1022 if (!journal->j_wbuf) { 1049 if (!journal->j_wbuf) {
1023 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1050 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1024 __func__); 1051 __func__);
1025 kfree(journal); 1052 goto out_err;
1026 journal = NULL;
1027 goto out;
1028 } 1053 }
1029 journal->j_dev = bdev; 1054 journal->j_dev = bdev;
1030 journal->j_fs_dev = fs_dev; 1055 journal->j_fs_dev = fs_dev;
@@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1034 p = journal->j_devname; 1059 p = journal->j_devname;
1035 while ((p = strchr(p, '/'))) 1060 while ((p = strchr(p, '/')))
1036 *p = '!'; 1061 *p = '!';
1037 jbd2_stats_proc_init(journal);
1038 1062
1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1063 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
1040 J_ASSERT(bh != NULL); 1064 if (!bh) {
1065 printk(KERN_ERR
1066 "%s: Cannot get buffer for journal superblock\n",
1067 __func__);
1068 goto out_err;
1069 }
1041 journal->j_sb_buffer = bh; 1070 journal->j_sb_buffer = bh;
1042 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1071 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1043out: 1072
1044 return journal; 1073 return journal;
1074out_err:
1075 jbd2_stats_proc_exit(journal);
1076 kfree(journal);
1077 return NULL;
1045} 1078}
1046 1079
1047/** 1080/**
@@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1089 if (!journal->j_wbuf) { 1122 if (!journal->j_wbuf) {
1090 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1123 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1091 __func__); 1124 __func__);
1092 jbd2_stats_proc_exit(journal); 1125 goto out_err;
1093 kfree(journal);
1094 return NULL;
1095 } 1126 }
1096 1127
1097 err = jbd2_journal_bmap(journal, 0, &blocknr); 1128 err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1099 if (err) { 1130 if (err) {
1100 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 1131 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
1101 __func__); 1132 __func__);
1102 jbd2_stats_proc_exit(journal); 1133 goto out_err;
1103 kfree(journal);
1104 return NULL;
1105 } 1134 }
1106 1135
1107 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 1136 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1108 J_ASSERT(bh != NULL); 1137 if (!bh) {
1138 printk(KERN_ERR
1139 "%s: Cannot get buffer for journal superblock\n",
1140 __func__);
1141 goto out_err;
1142 }
1109 journal->j_sb_buffer = bh; 1143 journal->j_sb_buffer = bh;
1110 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1144 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1111 1145
1112 return journal; 1146 return journal;
1147out_err:
1148 jbd2_stats_proc_exit(journal);
1149 kfree(journal);
1150 return NULL;
1113} 1151}
1114 1152
1115/* 1153/*
@@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal)
1158} 1196}
1159 1197
1160/** 1198/**
1161 * int jbd2_journal_create() - Initialise the new journal file
1162 * @journal: Journal to create. This structure must have been initialised
1163 *
1164 * Given a journal_t structure which tells us which disk blocks we can
1165 * use, create a new journal superblock and initialise all of the
1166 * journal fields from scratch.
1167 **/
1168int jbd2_journal_create(journal_t *journal)
1169{
1170 unsigned long long blocknr;
1171 struct buffer_head *bh;
1172 journal_superblock_t *sb;
1173 int i, err;
1174
1175 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
1176 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
1177 journal->j_maxlen);
1178 journal_fail_superblock(journal);
1179 return -EINVAL;
1180 }
1181
1182 if (journal->j_inode == NULL) {
1183 /*
1184 * We don't know what block to start at!
1185 */
1186 printk(KERN_EMERG
1187 "%s: creation of journal on external device!\n",
1188 __func__);
1189 BUG();
1190 }
1191
1192 /* Zero out the entire journal on disk. We cannot afford to
1193 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
1194 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
1195 for (i = 0; i < journal->j_maxlen; i++) {
1196 err = jbd2_journal_bmap(journal, i, &blocknr);
1197 if (err)
1198 return err;
1199 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1200 lock_buffer(bh);
1201 memset (bh->b_data, 0, journal->j_blocksize);
1202 BUFFER_TRACE(bh, "marking dirty");
1203 mark_buffer_dirty(bh);
1204 BUFFER_TRACE(bh, "marking uptodate");
1205 set_buffer_uptodate(bh);
1206 unlock_buffer(bh);
1207 __brelse(bh);
1208 }
1209
1210 sync_blockdev(journal->j_dev);
1211 jbd_debug(1, "JBD: journal cleared.\n");
1212
1213 /* OK, fill in the initial static fields in the new superblock */
1214 sb = journal->j_superblock;
1215
1216 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
1217 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1218
1219 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1220 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1221 sb->s_first = cpu_to_be32(1);
1222
1223 journal->j_transaction_sequence = 1;
1224
1225 journal->j_flags &= ~JBD2_ABORT;
1226 journal->j_format_version = 2;
1227
1228 return journal_reset(journal);
1229}
1230
1231/**
1232 * void jbd2_journal_update_superblock() - Update journal sb on disk. 1199 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1233 * @journal: The journal to update. 1200 * @journal: The journal to update.
1234 * @wait: Set to '0' if you don't want to wait for IO completion. 1201 * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
1472 spin_lock(&journal->j_list_lock); 1439 spin_lock(&journal->j_list_lock);
1473 while (journal->j_checkpoint_transactions != NULL) { 1440 while (journal->j_checkpoint_transactions != NULL) {
1474 spin_unlock(&journal->j_list_lock); 1441 spin_unlock(&journal->j_list_lock);
1442 mutex_lock(&journal->j_checkpoint_mutex);
1475 jbd2_log_do_checkpoint(journal); 1443 jbd2_log_do_checkpoint(journal);
1444 mutex_unlock(&journal->j_checkpoint_mutex);
1476 spin_lock(&journal->j_list_lock); 1445 spin_lock(&journal->j_list_lock);
1477 } 1446 }
1478 1447
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599a..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{ 49{
49 transaction->t_journal = journal; 50 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING; 51 transaction->t_state = T_RUNNING;
52 transaction->t_start_time = ktime_get();
51 transaction->t_tid = journal->j_transaction_sequence++; 53 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval; 54 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 55 spin_lock_init(&transaction->t_handle_lock);
@@ -741,6 +743,12 @@ done:
741 source = kmap_atomic(page, KM_USER0); 743 source = kmap_atomic(page, KM_USER0);
742 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 744 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
743 kunmap_atomic(source, KM_USER0); 745 kunmap_atomic(source, KM_USER0);
746
747 /*
748 * Now that the frozen data is saved off, we need to store
749 * any matching triggers.
750 */
751 jh->b_frozen_triggers = jh->b_triggers;
744 } 752 }
745 jbd_unlock_bh_state(bh); 753 jbd_unlock_bh_state(bh);
746 754
@@ -944,6 +952,47 @@ out:
944} 952}
945 953
946/** 954/**
955 * void jbd2_journal_set_triggers() - Add triggers for commit writeout
956 * @bh: buffer to trigger on
957 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
958 *
959 * Set any triggers on this journal_head. This is always safe, because
960 * triggers for a committing buffer will be saved off, and triggers for
961 * a running transaction will match the buffer in that transaction.
962 *
963 * Call with NULL to clear the triggers.
964 */
965void jbd2_journal_set_triggers(struct buffer_head *bh,
966 struct jbd2_buffer_trigger_type *type)
967{
968 struct journal_head *jh = bh2jh(bh);
969
970 jh->b_triggers = type;
971}
972
973void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
974 struct jbd2_buffer_trigger_type *triggers)
975{
976 struct buffer_head *bh = jh2bh(jh);
977
978 if (!triggers || !triggers->t_commit)
979 return;
980
981 triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
982}
983
984void jbd2_buffer_abort_trigger(struct journal_head *jh,
985 struct jbd2_buffer_trigger_type *triggers)
986{
987 if (!triggers || !triggers->t_abort)
988 return;
989
990 triggers->t_abort(triggers, jh2bh(jh));
991}
992
993
994
995/**
947 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 996 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
948 * @handle: transaction to add buffer to. 997 * @handle: transaction to add buffer to.
949 * @bh: buffer to mark 998 * @bh: buffer to mark
@@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
1193{ 1242{
1194 transaction_t *transaction = handle->h_transaction; 1243 transaction_t *transaction = handle->h_transaction;
1195 journal_t *journal = transaction->t_journal; 1244 journal_t *journal = transaction->t_journal;
1196 int old_handle_count, err; 1245 int err;
1197 pid_t pid; 1246 pid_t pid;
1198 1247
1199 J_ASSERT(journal_current_handle() == handle); 1248 J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
1216 /* 1265 /*
1217 * Implement synchronous transaction batching. If the handle 1266 * Implement synchronous transaction batching. If the handle
1218 * was synchronous, don't force a commit immediately. Let's 1267 * was synchronous, don't force a commit immediately. Let's
1219 * yield and let another thread piggyback onto this transaction. 1268 * yield and let another thread piggyback onto this
1220 * Keep doing that while new threads continue to arrive. 1269 * transaction. Keep doing that while new threads continue to
1221 * It doesn't cost much - we're about to run a commit and sleep 1270 * arrive. It doesn't cost much - we're about to run a commit
1222 * on IO anyway. Speeds up many-threaded, many-dir operations 1271 * and sleep on IO anyway. Speeds up many-threaded, many-dir
1223 * by 30x or more... 1272 * operations by 30x or more...
1224 * 1273 *
1225 * But don't do this if this process was the most recent one to 1274 * We try and optimize the sleep time against what the
1226 * perform a synchronous write. We do this to detect the case where a 1275 * underlying disk can do, instead of having a static sleep
1227 * single process is doing a stream of sync writes. No point in waiting 1276 * time. This is useful for the case where our storage is so
1228 * for joiners in that case. 1277 * fast that it is more optimal to go ahead and force a flush
1278 * and wait for the transaction to be committed than it is to
1279 * wait for an arbitrary amount of time for new writers to
1280 * join the transaction. We achieve this by measuring how
1281 * long it takes to commit a transaction, and compare it with
1282 * how long this transaction has been running, and if run time
1283 * < commit time then we sleep for the delta and commit. This
1284 * greatly helps super fast disks that would see slowdowns as
1285 * more threads started doing fsyncs.
1286 *
1287 * But don't do this if this process was the most recent one
1288 * to perform a synchronous write. We do this to detect the
1289 * case where a single process is doing a stream of sync
1290 * writes. No point in waiting for joiners in that case.
1229 */ 1291 */
1230 pid = current->pid; 1292 pid = current->pid;
1231 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1293 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1294 u64 commit_time, trans_time;
1295
1232 journal->j_last_sync_writer = pid; 1296 journal->j_last_sync_writer = pid;
1233 do { 1297
1234 old_handle_count = transaction->t_handle_count; 1298 spin_lock(&journal->j_state_lock);
1235 schedule_timeout_uninterruptible(1); 1299 commit_time = journal->j_average_commit_time;
1236 } while (old_handle_count != transaction->t_handle_count); 1300 spin_unlock(&journal->j_state_lock);
1301
1302 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1303 transaction->t_start_time));
1304
1305 commit_time = max_t(u64, commit_time,
1306 1000*journal->j_min_batch_time);
1307 commit_time = min_t(u64, commit_time,
1308 1000*journal->j_max_batch_time);
1309
1310 if (trans_time < commit_time) {
1311 ktime_t expires = ktime_add_ns(ktime_get(),
1312 commit_time);
1313 set_current_state(TASK_UNINTERRUPTIBLE);
1314 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1315 }
1237 } 1316 }
1238 1317
1239 current->journal_info = NULL; 1318 current->journal_info = NULL;
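Unlike jbd, which caps the wait at one jiffy, jbd2 clamps it between two new per-journal tunables in microseconds: j_min_batch_time (default 0) and j_max_batch_time (default 15000, i.e. 15ms, both set in journal_init_common above). So an average commit of 200us yields a ~200us wait, while a 40ms average is capped at 15ms. A sketch of how a client filesystem might tune the window (ext4 later exposes these as mount options):

	/* Illustrative tuning: never force a wait, cap batching at 10ms. */
	journal->j_min_batch_time = 0;
	journal->j_max_batch_time = 10000;	/* microseconds */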
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index c73fa89b5f8a..170d289ac785 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -22,9 +22,7 @@
22 22
23 23
24#define BIT_DIVIDER_MIPS 1043 24#define BIT_DIVIDER_MIPS 1043
25static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ 25static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241};
26
27#include <linux/errno.h>
28 26
29struct pushpull { 27struct pushpull {
30 unsigned char *buf; 28 unsigned char *buf;
@@ -43,7 +41,9 @@ struct rubin_state {
43 int bits[8]; 41 int bits[8];
44}; 42};
45 43
46static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) 44static inline void init_pushpull(struct pushpull *pp, char *buf,
45 unsigned buflen, unsigned ofs,
46 unsigned reserve)
47{ 47{
48 pp->buf = buf; 48 pp->buf = buf;
49 pp->buflen = buflen; 49 pp->buflen = buflen;
@@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen
53 53
54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) 54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
55{ 55{
56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { 56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve))
57 return -ENOSPC; 57 return -ENOSPC;
58 }
59 58
60 if (bit) { 59 if (bit)
61 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); 60 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7)));
62 } 61 else
63 else { 62 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7)));
64 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); 63
65 }
66 pp->ofs++; 64 pp->ofs++;
67 65
68 return 0; 66 return 0;
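pushbit() packs bits most-significant-first into a byte buffer; buflen and reserve are measured in bits, and the reserved tail can only be consumed once use_reserved is set, which lets the encoder guarantee space to flush its final register. Usage, sketched with illustrative sizes:

	struct pushpull pp;
	unsigned char out[32];

	init_pushpull(&pp, (char *)out, sizeof(out) * 8, 0, 32);
	if (pushbit(&pp, 1, 0) == -ENOSPC) {
		/* only the 32 reserved bits remain; flush the coder
		 * state with use_reserved = 1 */
	}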
@@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits)
97 rs->p = (long) (2 * UPPER_BIT_RUBIN); 95 rs->p = (long) (2 * UPPER_BIT_RUBIN);
98 rs->bit_number = (long) 0; 96 rs->bit_number = (long) 0;
99 rs->bit_divider = div; 97 rs->bit_divider = div;
98
100 for (c=0; c<8; c++) 99 for (c=0; c<8; c++)
101 rs->bits[c] = bits[c]; 100 rs->bits[c] = bits[c];
102} 101}
@@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
108 long i0, i1; 107 long i0, i1;
109 int ret; 108 int ret;
110 109
111 while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { 110 while ((rs->q >= UPPER_BIT_RUBIN) ||
111 ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
112 rs->bit_number++; 112 rs->bit_number++;
113 113
114 ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); 114 ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0);
@@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
119 rs->p <<= 1; 119 rs->p <<= 1;
120 } 120 }
121 i0 = A * rs->p / (A + B); 121 i0 = A * rs->p / (A + B);
122 if (i0 <= 0) { 122 if (i0 <= 0)
123 i0 = 1; 123 i0 = 1;
124 } 124
125 if (i0 >= rs->p) { 125 if (i0 >= rs->p)
126 i0 = rs->p - 1; 126 i0 = rs->p - 1;
127 } 127
128 i1 = rs->p - i0; 128 i1 = rs->p - i0;
129 129
130 if (symbol == 0) 130 if (symbol == 0)
@@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits)
157 /* behalve lower */ 157 /* behalve lower */
158 rs->rec_q = 0; 158 rs->rec_q = 0;
159 159
160 for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) 160 for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE;
161 rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
161 ; 162 ;
162} 163}
163 164
164static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) 165static void __do_decode(struct rubin_state *rs, unsigned long p,
166 unsigned long q)
165{ 167{
166 register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; 168 register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN;
167 unsigned long rec_q; 169 unsigned long rec_q;
@@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B)
207 __do_decode(rs, p, q); 209 __do_decode(rs, p, q);
208 210
209 i0 = A * rs->p / (A + B); 211 i0 = A * rs->p / (A + B);
210 if (i0 <= 0) { 212 if (i0 <= 0)
211 i0 = 1; 213 i0 = 1;
212 } 214
213 if (i0 >= rs->p) { 215 if (i0 >= rs->p)
214 i0 = rs->p - 1; 216 i0 = rs->p - 1;
215 }
216 217
217 threshold = rs->q + i0; 218 threshold = rs->q + i0;
218 symbol = rs->rec_q >= threshold; 219 symbol = rs->rec_q >= threshold;
@@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte)
234 struct rubin_state rs_copy; 235 struct rubin_state rs_copy;
235 rs_copy = *rs; 236 rs_copy = *rs;
236 237
237 for (i=0;i<8;i++) { 238 for (i=0; i<8; i++) {
238 ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); 239 ret = encode(rs, rs->bit_divider-rs->bits[i],
240 rs->bits[i], byte & 1);
239 if (ret) { 241 if (ret) {
240 /* Failed. Restore old state */ 242 /* Failed. Restore old state */
241 *rs = rs_copy; 243 *rs = rs_copy;
242 return ret; 244 return ret;
243 } 245 }
 244 byte=byte>>1; 246 byte >>= 1;
245 } 247 }
246 return 0; 248 return 0;
247} 249}
@@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs)
251 int i, result = 0, bit_divider = rs->bit_divider; 253 int i, result = 0, bit_divider = rs->bit_divider;
252 254
253 for (i = 0; i < 8; i++) 255 for (i = 0; i < 8; i++)
254 result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; 256 result |= decode(rs, bit_divider - rs->bits[i],
257 rs->bits[i]) << i;
255 258
256 return result; 259 return result;
257} 260}
@@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs)
259 262
260 263
261static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, 264static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
262 unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) 265 unsigned char *cpage_out, uint32_t *sourcelen,
266 uint32_t *dstlen)
263 { 267 {
264 int outpos = 0; 268 int outpos = 0;
265 int pos=0; 269 int pos=0;
@@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
295int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, 299int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
296 uint32_t *sourcelen, uint32_t *dstlen, void *model) 300 uint32_t *sourcelen, uint32_t *dstlen, void *model)
297{ 301{
298 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); 302 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
303 cpage_out, sourcelen, dstlen);
299} 304}
300#endif 305#endif
301static int jffs2_dynrubin_compress(unsigned char *data_in, 306static int jffs2_dynrubin_compress(unsigned char *data_in,
@@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
316 return -1; 321 return -1;
317 322
318 memset(histo, 0, 256); 323 memset(histo, 0, 256);
319 for (i=0; i<mysrclen; i++) { 324 for (i=0; i<mysrclen; i++)
320 histo[data_in[i]]++; 325 histo[data_in[i]]++;
321 }
322 memset(bits, 0, sizeof(int)*8); 326 memset(bits, 0, sizeof(int)*8);
323 for (i=0; i<256; i++) { 327 for (i=0; i<256; i++) {
324 if (i&128) 328 if (i&128)
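The loop above builds a byte-value histogram that jffs2_dynrubin_compress then folds into eight per-bit-position counts via the unrolled if (i&128) ... chain that follows in the full source. A compact sketch of the same derivation, with invented names and a generic bit-to-index mapping (the kernel's actual ordering may differ):

static void count_bit_ones(const unsigned char *data, int len, int bits[8])
{
	int histo[256] = { 0 };
	int i, k;

	for (k = 0; k < 8; k++)
		bits[k] = 0;
	for (i = 0; i < len; i++)
		histo[data[i]]++;		/* byte frequency table */
	for (i = 0; i < 256; i++)
		for (k = 0; k < 8; k++)
			if (i & (1 << k))
				bits[k] += histo[i];	/* how often bit k is set */
}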
@@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
346 cpage_out[i] = bits[i]; 350 cpage_out[i] = bits[i];
347 } 351 }
348 352
349 ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); 353 ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen,
354 &mydstlen);
350 if (ret) 355 if (ret)
351 return ret; 356 return ret;
352 357
@@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
363 return 0; 368 return 0;
364} 369}
365 370
366static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, 371static void rubin_do_decompress(int bit_divider, int *bits,
367 unsigned char *page_out, uint32_t srclen, uint32_t destlen) 372 unsigned char *cdata_in,
373 unsigned char *page_out, uint32_t srclen,
374 uint32_t destlen)
368{ 375{
369 int outpos = 0; 376 int outpos = 0;
370 struct rubin_state rs; 377 struct rubin_state rs;
@@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata
372 init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); 379 init_pushpull(&rs.pp, cdata_in, srclen, 0, 0);
373 init_decode(&rs, bit_divider, bits); 380 init_decode(&rs, bit_divider, bits);
374 381
375 while (outpos < destlen) { 382 while (outpos < destlen)
376 page_out[outpos++] = in_byte(&rs); 383 page_out[outpos++] = in_byte(&rs);
377 }
378} 384}
379 385
380 386
@@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
383 uint32_t sourcelen, uint32_t dstlen, 389 uint32_t sourcelen, uint32_t dstlen,
384 void *model) 390 void *model)
385{ 391{
386 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); 392 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
393 cpage_out, sourcelen, dstlen);
387 return 0; 394 return 0;
388} 395}
389 396
@@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in,
398 for (c=0; c<8; c++) 405 for (c=0; c<8; c++)
399 bits[c] = data_in[c]; 406 bits[c] = data_in[c];
400 407
401 rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); 408 rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8,
409 dstlen);
402 return 0; 410 return 0;
403} 411}
404 412
405static struct jffs2_compressor jffs2_rubinmips_comp = { 413static struct jffs2_compressor jffs2_rubinmips_comp = {
406 .priority = JFFS2_RUBINMIPS_PRIORITY, 414 .priority = JFFS2_RUBINMIPS_PRIORITY,
407 .name = "rubinmips", 415 .name = "rubinmips",
408 .compr = JFFS2_COMPR_DYNRUBIN, 416 .compr = JFFS2_COMPR_DYNRUBIN,
409 .compress = NULL, /*&jffs2_rubinmips_compress,*/ 417 .compress = NULL, /*&jffs2_rubinmips_compress,*/
410 .decompress = &jffs2_rubinmips_decompress, 418 .decompress = &jffs2_rubinmips_decompress,
411#ifdef JFFS2_RUBINMIPS_DISABLED 419#ifdef JFFS2_RUBINMIPS_DISABLED
412 .disabled = 1, 420 .disabled = 1,
413#else 421#else
414 .disabled = 0, 422 .disabled = 0,
415#endif 423#endif
416}; 424};
417 425
418int jffs2_rubinmips_init(void) 426int jffs2_rubinmips_init(void)
419{ 427{
420 return jffs2_register_compressor(&jffs2_rubinmips_comp); 428 return jffs2_register_compressor(&jffs2_rubinmips_comp);
421} 429}
422 430
423void jffs2_rubinmips_exit(void) 431void jffs2_rubinmips_exit(void)
424{ 432{
425 jffs2_unregister_compressor(&jffs2_rubinmips_comp); 433 jffs2_unregister_compressor(&jffs2_rubinmips_comp);
426} 434}
427 435
428static struct jffs2_compressor jffs2_dynrubin_comp = { 436static struct jffs2_compressor jffs2_dynrubin_comp = {
429 .priority = JFFS2_DYNRUBIN_PRIORITY, 437 .priority = JFFS2_DYNRUBIN_PRIORITY,
430 .name = "dynrubin", 438 .name = "dynrubin",
431 .compr = JFFS2_COMPR_RUBINMIPS, 439 .compr = JFFS2_COMPR_RUBINMIPS,
432 .compress = jffs2_dynrubin_compress, 440 .compress = jffs2_dynrubin_compress,
433 .decompress = &jffs2_dynrubin_decompress, 441 .decompress = &jffs2_dynrubin_decompress,
434#ifdef JFFS2_DYNRUBIN_DISABLED 442#ifdef JFFS2_DYNRUBIN_DISABLED
435 .disabled = 1, 443 .disabled = 1,
436#else 444#else
437 .disabled = 0, 445 .disabled = 0,
438#endif 446#endif
439}; 447};
440 448
441int jffs2_dynrubin_init(void) 449int jffs2_dynrubin_init(void)
442{ 450{
443 return jffs2_register_compressor(&jffs2_dynrubin_comp); 451 return jffs2_register_compressor(&jffs2_dynrubin_comp);
444} 452}
445 453
446void jffs2_dynrubin_exit(void) 454void jffs2_dynrubin_exit(void)
447{ 455{
448 jffs2_unregister_compressor(&jffs2_dynrubin_comp); 456 jffs2_unregister_compressor(&jffs2_dynrubin_comp);
449} 457}
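For context, both structs above follow JFFS2's compressor-plugin pattern: a module fills in one jffs2_compressor with its priority, name, on-flash format id, and callbacks, then registers it with the core, which selects among enabled compressors by priority. A sketch with invented names; the callback signatures mirror the ones in this file, and the .compr id is reused purely for illustration:

static int example_compress(unsigned char *data_in, unsigned char *cpage_out,
			    uint32_t *sourcelen, uint32_t *dstlen, void *model);
static int example_decompress(unsigned char *data_in, unsigned char *cpage_out,
			      uint32_t sourcelen, uint32_t dstlen, void *model);

static struct jffs2_compressor example_comp = {
	.priority   = 10,		/* illustrative value */
	.name       = "example",
	.compr      = JFFS2_COMPR_DYNRUBIN,	/* on-flash id (illustration only) */
	.compress   = example_compress,
	.decompress = example_decompress,
};

int example_init(void)
{
	return jffs2_register_compressor(&example_comp);
}

void example_exit(void)
{
	jffs2_unregister_compressor(&example_comp);
}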
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 259461b910af..c32b4a1ad6cf 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
175{ 175{
176 /* For NAND, if the failure did not occur at the device level for a 176 /* For NAND, if the failure did not occur at the device level for a
177 specific physical page, don't bother updating the bad block table. */ 177 specific physical page, don't bother updating the bad block table. */
178 if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { 178 if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) {
179 /* We had a device-level failure to erase. Let's see if we've 179 /* We had a device-level failure to erase. Let's see if we've
180 failed too many times. */ 180 failed too many times. */
181 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { 181 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
@@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr)
209 struct erase_priv_struct *priv = (void *)instr->priv; 209 struct erase_priv_struct *priv = (void *)instr->priv;
210 210
211 if(instr->state != MTD_ERASE_DONE) { 211 if(instr->state != MTD_ERASE_DONE) {
212 printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); 212 printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
213 (unsigned long long)instr->addr, instr->state);
213 jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); 214 jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
214 } else { 215 } else {
215 jffs2_erase_succeeded(priv->c, priv->jeb); 216 jffs2_erase_succeeded(priv->c, priv->jeb);
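The first erase.c hunk adds a cast because MTD fail addresses grew to 64 bits in this release; the second fixes the matching printk format. The portable idiom, isolated in a small sketch (assuming the 2.6.29-era struct erase_info):

static void report_erase_state(struct erase_info *instr)
{
	/* the cast keeps %llx correct whether addr is 32 or 64 bits wide */
	printk(KERN_WARNING "Erase at 0x%08llx finished in state 0x%x\n",
	       (unsigned long long)instr->addr, instr->state);
}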
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c853..5edc2bf20581 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
132 uint32_t pageofs = index << PAGE_CACHE_SHIFT; 132 uint32_t pageofs = index << PAGE_CACHE_SHIFT;
133 int ret = 0; 133 int ret = 0;
134 134
135 pg = __grab_cache_page(mapping, index); 135 pg = grab_cache_page_write_begin(mapping, index, flags);
136 if (!pg) 136 if (!pg)
137 return -ENOMEM; 137 return -ENOMEM;
138 *pagep = pg; 138 *pagep = pg;
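This one-line change tracks a VFS API rename: __grab_cache_page() became grab_cache_page_write_begin(), which additionally takes the write_begin flags so callers can pass hints such as AOP_FLAG_NOFS down to the page allocator. A sketch of the updated idiom in a hypothetical write_begin (names invented, signature as used in this tree):

static int example_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *pg = grab_cache_page_write_begin(mapping, index, flags);

	if (!pg)
		return -ENOMEM;
	*pagep = pg;
	return 0;	/* a real implementation would also prepare the page here */
}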
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1750445556c3..507ed6ec1847 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c);
366void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); 366void jffs2_free_raw_node_refs(struct jffs2_sb_info *c);
367struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); 367struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset);
368void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); 368void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
369struct rb_node *rb_next(struct rb_node *);
370struct rb_node *rb_prev(struct rb_node *);
371void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
372int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); 369int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
373uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); 370uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
374struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, 371struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 210339784b56..b00ee9f05a06 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -59,8 +59,14 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
59 if (inode->i_size >= IDATASIZE) { 59 if (inode->i_size >= IDATASIZE) {
60 inode->i_op = &page_symlink_inode_operations; 60 inode->i_op = &page_symlink_inode_operations;
61 inode->i_mapping->a_ops = &jfs_aops; 61 inode->i_mapping->a_ops = &jfs_aops;
62 } else 62 } else {
63 inode->i_op = &jfs_symlink_inode_operations; 63 inode->i_op = &jfs_symlink_inode_operations;
64 /*
65 * The inline data should be null-terminated, but
66 * don't let on-disk corruption crash the kernel
67 */
68 JFS_IP(inode)->i_inline[inode->i_size] = '\0';
69 }
64 } else { 70 } else {
65 inode->i_op = &jfs_file_inode_operations; 71 inode->i_op = &jfs_file_inode_operations;
66 init_special_inode(inode, inode->i_mode, inode->i_rdev); 72 init_special_inode(inode, inode->i_mode, inode->i_rdev);
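The new else-branch comment states the rule being enforced: never trust on-disk data to be NUL-terminated; terminate it yourself before any string consumer walks it. The same defensive idiom in isolation (illustrative names; dst must have room for size + 1 bytes):

static void copy_inline_symlink(char *dst, const char *disk, size_t size)
{
	memcpy(dst, disk, size);
	dst[size] = '\0';	/* corrupt media must not run readers off the end */
}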
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d0..0f94381ca6d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
58 58
59/* 59/*
60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want 60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want
61 * special inodes in the fileset inode space, we hash them to a dummy head 61 * special inodes in the fileset inode space, we make them appear hashed,
62 * but do not put on any lists.
62 */ 63 */
63static HLIST_HEAD(aggregate_hash);
64 64
65/* 65/*
66 * imap locks 66 * imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
496 /* release the page */ 496 /* release the page */
497 release_metapage(mp); 497 release_metapage(mp);
498 498
499 hlist_add_head(&ip->i_hash, &aggregate_hash); 499 /*
500 * that will look hashed, but won't be on any list; hlist_del()
501 * will work fine and require no locking.
502 */
503 ip->i_hash.pprev = &ip->i_hash.next;
500 504
501 return (ip); 505 return (ip);
502} 506}
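The replacement for the dummy hash head is a self-pointing hlist_node: with pprev aimed at its own next field, the inode passes "is hashed" checks yet sits on no chain, so a later hlist_del() touches only the node itself and needs no locking. A minimal sketch of the trick as a helper (name invented):

static inline void hlist_fake_hash(struct hlist_node *n)
{
	n->next = NULL;
	n->pprev = &n->next;	/* looks hashed, but belongs to no list */
}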
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 70022fd1c539..d4d142c2edd4 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
79 inode = new_inode(sb); 79 inode = new_inode(sb);
80 if (!inode) { 80 if (!inode) {
81 jfs_warn("ialloc: new_inode returned NULL!"); 81 jfs_warn("ialloc: new_inode returned NULL!");
82 return ERR_PTR(-ENOMEM); 82 rc = -ENOMEM;
83 goto fail;
83 } 84 }
84 85
85 jfs_inode = JFS_IP(inode); 86 jfs_inode = JFS_IP(inode);
@@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
89 jfs_warn("ialloc: diAlloc returned %d!", rc); 90 jfs_warn("ialloc: diAlloc returned %d!", rc);
90 if (rc == -EIO) 91 if (rc == -EIO)
91 make_bad_inode(inode); 92 make_bad_inode(inode);
92 iput(inode); 93 goto fail_put;
93 return ERR_PTR(rc); 94 }
95
96 if (insert_inode_locked(inode) < 0) {
97 rc = -EINVAL;
98 goto fail_unlock;
94 } 99 }
95 100
96 inode->i_uid = current_fsuid(); 101 inode->i_uid = current_fsuid();
@@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
112 * Allocate inode to quota. 117 * Allocate inode to quota.
113 */ 118 */
114 if (DQUOT_ALLOC_INODE(inode)) { 119 if (DQUOT_ALLOC_INODE(inode)) {
115 DQUOT_DROP(inode); 120 rc = -EDQUOT;
116 inode->i_flags |= S_NOQUOTA; 121 goto fail_drop;
117 inode->i_nlink = 0;
118 iput(inode);
119 return ERR_PTR(-EDQUOT);
120 } 122 }
121 123
122 inode->i_mode = mode; 124 inode->i_mode = mode;
@@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
158 jfs_info("ialloc returns inode = 0x%p\n", inode); 160 jfs_info("ialloc returns inode = 0x%p\n", inode);
159 161
160 return inode; 162 return inode;
163
164fail_drop:
165 DQUOT_DROP(inode);
166 inode->i_flags |= S_NOQUOTA;
167fail_unlock:
168 inode->i_nlink = 0;
169 unlock_new_inode(inode);
170fail_put:
171 iput(inode);
172fail:
173 return ERR_PTR(rc);
161} 174}
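The rewritten ialloc() is a textbook layered-goto unwind: each failure jumps to the label that undoes exactly the steps completed so far, so every cleanup action appears once, in reverse order of acquisition. The skeleton of the pattern, with invented helper names:

struct inode *example_ialloc(struct super_block *sb)
{
	struct inode *inode;
	int rc;

	inode = new_inode(sb);
	if (!inode) {
		rc = -ENOMEM;
		goto fail;
	}

	rc = example_reserve_disk_inode(inode);	/* assumed helper */
	if (rc)
		goto fail_put;

	if (insert_inode_locked(inode) < 0) {
		rc = -EINVAL;
		goto fail_release;
	}

	return inode;

fail_release:
	example_release_disk_inode(inode);	/* assumed helper */
fail_put:
	inode->i_nlink = 0;	/* tell iput() to discard the half-built inode */
	iput(inode);
fail:
	return ERR_PTR(rc);
}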
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index cc3cedffbfa1..b4de56b851e4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
155 ip->i_fop = &jfs_file_operations; 155 ip->i_fop = &jfs_file_operations;
156 ip->i_mapping->a_ops = &jfs_aops; 156 ip->i_mapping->a_ops = &jfs_aops;
157 157
158 insert_inode_hash(ip);
159 mark_inode_dirty(ip); 158 mark_inode_dirty(ip);
160 159
161 dip->i_ctime = dip->i_mtime = CURRENT_TIME; 160 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
171 if (rc) { 170 if (rc) {
172 free_ea_wmap(ip); 171 free_ea_wmap(ip);
173 ip->i_nlink = 0; 172 ip->i_nlink = 0;
173 unlock_new_inode(ip);
174 iput(ip); 174 iput(ip);
175 } else 175 } else {
176 d_instantiate(dentry, ip); 176 d_instantiate(dentry, ip);
177 unlock_new_inode(ip);
178 }
177 179
178 out2: 180 out2:
179 free_UCSname(&dname); 181 free_UCSname(&dname);
@@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
289 ip->i_op = &jfs_dir_inode_operations; 291 ip->i_op = &jfs_dir_inode_operations;
290 ip->i_fop = &jfs_dir_operations; 292 ip->i_fop = &jfs_dir_operations;
291 293
292 insert_inode_hash(ip);
293 mark_inode_dirty(ip); 294 mark_inode_dirty(ip);
294 295
295 /* update parent directory inode */ 296 /* update parent directory inode */
@@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
306 if (rc) { 307 if (rc) {
307 free_ea_wmap(ip); 308 free_ea_wmap(ip);
308 ip->i_nlink = 0; 309 ip->i_nlink = 0;
310 unlock_new_inode(ip);
309 iput(ip); 311 iput(ip);
310 } else 312 } else {
311 d_instantiate(dentry, ip); 313 d_instantiate(dentry, ip);
314 unlock_new_inode(ip);
315 }
312 316
313 out2: 317 out2:
314 free_UCSname(&dname); 318 free_UCSname(&dname);
@@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1019 goto out3; 1023 goto out3;
1020 } 1024 }
1021 1025
1022 insert_inode_hash(ip);
1023 mark_inode_dirty(ip); 1026 mark_inode_dirty(ip);
1024 1027
1025 dip->i_ctime = dip->i_mtime = CURRENT_TIME; 1028 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1039 if (rc) { 1042 if (rc) {
1040 free_ea_wmap(ip); 1043 free_ea_wmap(ip);
1041 ip->i_nlink = 0; 1044 ip->i_nlink = 0;
1045 unlock_new_inode(ip);
1042 iput(ip); 1046 iput(ip);
1043 } else 1047 } else {
1044 d_instantiate(dentry, ip); 1048 d_instantiate(dentry, ip);
1049 unlock_new_inode(ip);
1050 }
1045 1051
1046 out2: 1052 out2:
1047 free_UCSname(&dname); 1053 free_UCSname(&dname);
@@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1399 jfs_ip->dev = new_encode_dev(rdev); 1405 jfs_ip->dev = new_encode_dev(rdev);
1400 init_special_inode(ip, ip->i_mode, rdev); 1406 init_special_inode(ip, ip->i_mode, rdev);
1401 1407
1402 insert_inode_hash(ip);
1403 mark_inode_dirty(ip); 1408 mark_inode_dirty(ip);
1404 1409
1405 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1410 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1417 if (rc) { 1422 if (rc) {
1418 free_ea_wmap(ip); 1423 free_ea_wmap(ip);
1419 ip->i_nlink = 0; 1424 ip->i_nlink = 0;
1425 unlock_new_inode(ip);
1420 iput(ip); 1426 iput(ip);
1421 } else 1427 } else {
1422 d_instantiate(dentry, ip); 1428 d_instantiate(dentry, ip);
1429 unlock_new_inode(ip);
1430 }
1423 1431
1424 out1: 1432 out1:
1425 free_UCSname(&dname); 1433 free_UCSname(&dname);
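All four namei.c hunks apply the same exit discipline: an inode obtained through insert_inode_locked() carries I_NEW and must be unlocked on every path out, after d_instantiate() on success and before the final iput() on failure. Consolidated, the shape is:

	if (rc) {
		free_ea_wmap(ip);
		ip->i_nlink = 0;	/* discard the half-built inode */
		unlock_new_inode(ip);
		iput(ip);
	} else {
		d_instantiate(dentry, ip);
		unlock_new_inode(ip);	/* wakes any I_NEW waiters */
	}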
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0dae345e481b..b37d1f78b854 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -543,7 +543,7 @@ out_kfree:
543 return ret; 543 return ret;
544} 544}
545 545
546static void jfs_write_super_lockfs(struct super_block *sb) 546static int jfs_freeze(struct super_block *sb)
547{ 547{
548 struct jfs_sb_info *sbi = JFS_SBI(sb); 548 struct jfs_sb_info *sbi = JFS_SBI(sb);
549 struct jfs_log *log = sbi->log; 549 struct jfs_log *log = sbi->log;
@@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb)
553 lmLogShutdown(log); 553 lmLogShutdown(log);
554 updateSuper(sb, FM_CLEAN); 554 updateSuper(sb, FM_CLEAN);
555 } 555 }
556 return 0;
556} 557}
557 558
558static void jfs_unlockfs(struct super_block *sb) 559static int jfs_unfreeze(struct super_block *sb)
559{ 560{
560 struct jfs_sb_info *sbi = JFS_SBI(sb); 561 struct jfs_sb_info *sbi = JFS_SBI(sb);
561 struct jfs_log *log = sbi->log; 562 struct jfs_log *log = sbi->log;
@@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb)
568 else 569 else
569 txResume(sb); 570 txResume(sb);
570 } 571 }
572 return 0;
571} 573}
572 574
573static int jfs_get_sb(struct file_system_type *fs_type, 575static int jfs_get_sb(struct file_system_type *fs_type,
@@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = {
735 .delete_inode = jfs_delete_inode, 737 .delete_inode = jfs_delete_inode,
736 .put_super = jfs_put_super, 738 .put_super = jfs_put_super,
737 .sync_fs = jfs_sync_fs, 739 .sync_fs = jfs_sync_fs,
738 .write_super_lockfs = jfs_write_super_lockfs, 740 .freeze_fs = jfs_freeze,
739 .unlockfs = jfs_unlockfs, 741 .unfreeze_fs = jfs_unfreeze,
740 .statfs = jfs_statfs, 742 .statfs = jfs_statfs,
741 .remount_fs = jfs_remount, 743 .remount_fs = jfs_remount,
742 .show_options = jfs_show_options, 744 .show_options = jfs_show_options,
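The super.c hunks track a VFS interface change in this release: write_super_lockfs/unlockfs became freeze_fs/unfreeze_fs, and both now return an int so a filesystem can fail a freeze. A skeleton of the wiring, with invented names:

static int example_freeze(struct super_block *sb)
{
	/* quiesce transactions and flush the journal here */
	return 0;	/* a nonzero return would fail the freeze */
}

static int example_unfreeze(struct super_block *sb)
{
	/* restart the journal and resume transactions here */
	return 0;
}

static const struct super_operations example_sops = {
	.freeze_fs   = example_freeze,
	.unfreeze_fs = example_unfreeze,
};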
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a8321902..49b44099dabb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
231 */ 231 */
232 root->i_ino = 1; 232 root->i_ino = 1;
233 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; 233 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
234 root->i_uid = root->i_gid = 0;
235 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; 234 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
236 dentry = d_alloc(NULL, &d_name); 235 dentry = d_alloc(NULL, &d_name);
237 if (!dentry) { 236 if (!dentry) {
@@ -360,7 +359,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
360 index = pos >> PAGE_CACHE_SHIFT; 359 index = pos >> PAGE_CACHE_SHIFT;
361 from = pos & (PAGE_CACHE_SIZE - 1); 360 from = pos & (PAGE_CACHE_SIZE - 1);
362 361
363 page = __grab_cache_page(mapping, index); 362 page = grab_cache_page_write_begin(mapping, index, flags);
364 if (!page) 363 if (!page)
365 return -ENOMEM; 364 return -ENOMEM;
366 365
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
436 */ 435 */
437 inode->i_ino = 1; 436 inode->i_ino = 1;
438 inode->i_mode = S_IFDIR | 0755; 437 inode->i_mode = S_IFDIR | 0755;
439 inode->i_uid = inode->i_gid = 0;
440 inode->i_blocks = 0;
441 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 438 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
442 inode->i_op = &simple_dir_inode_operations; 439 inode->i_op = &simple_dir_inode_operations;
443 inode->i_fop = &simple_dir_operations; 440 inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
464 if (!inode) 461 if (!inode)
465 goto out; 462 goto out;
466 inode->i_mode = S_IFREG | files->mode; 463 inode->i_mode = S_IFREG | files->mode;
467 inode->i_uid = inode->i_gid = 0;
468 inode->i_blocks = 0;
469 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 464 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
470 inode->i_fop = files->ops; 465 inode->i_fop = files->ops;
471 inode->i_ino = i; 466 inode->i_ino = i;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8307dd64bf46..1f3b0fc0d351 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,6 +14,7 @@
14#include <linux/sunrpc/svc.h> 14#include <linux/sunrpc/svc.h>
15#include <linux/lockd/lockd.h> 15#include <linux/lockd/lockd.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/kthread.h>
17 18
18#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
19 20
@@ -60,7 +61,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
60 61
61 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, 62 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
62 nlm_init->protocol, nlm_version, 63 nlm_init->protocol, nlm_version,
63 nlm_init->hostname); 64 nlm_init->hostname, nlm_init->noresvport);
64 if (host == NULL) { 65 if (host == NULL) {
65 lockd_down(); 66 lockd_down();
66 return ERR_PTR(-ENOLCK); 67 return ERR_PTR(-ENOLCK);
@@ -191,11 +192,15 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
191void 192void
192nlmclnt_recovery(struct nlm_host *host) 193nlmclnt_recovery(struct nlm_host *host)
193{ 194{
195 struct task_struct *task;
196
194 if (!host->h_reclaiming++) { 197 if (!host->h_reclaiming++) {
195 nlm_get_host(host); 198 nlm_get_host(host);
196 __module_get(THIS_MODULE); 199 task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name);
197 if (kernel_thread(reclaimer, host, CLONE_FS | CLONE_FILES) < 0) 200 if (IS_ERR(task))
198 module_put(THIS_MODULE); 201 printk(KERN_ERR "lockd: unable to spawn reclaimer "
202 "thread. Locks for %s won't be reclaimed! "
203 "(%ld)\n", host->h_name, PTR_ERR(task));
199 } 204 }
200} 205}
201 206
@@ -207,7 +212,6 @@ reclaimer(void *ptr)
207 struct file_lock *fl, *next; 212 struct file_lock *fl, *next;
208 u32 nsmstate; 213 u32 nsmstate;
209 214
210 daemonize("%s-reclaim", host->h_name);
211 allow_signal(SIGKILL); 215 allow_signal(SIGKILL);
212 216
213 down_write(&host->h_rwsem); 217 down_write(&host->h_rwsem);
@@ -233,7 +237,12 @@ restart:
233 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { 237 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
234 list_del_init(&fl->fl_u.nfs_fl.list); 238 list_del_init(&fl->fl_u.nfs_fl.list);
235 239
236 /* Why are we leaking memory here? --okir */ 240 /*
241 * sending this thread a SIGKILL will result in any unreclaimed
242 * locks being removed from the h_granted list. This means that
243 * the kernel will not attempt to reclaim them again if a new
244 * reclaimer thread is spawned for this host.
245 */
237 if (signalled()) 246 if (signalled())
238 continue; 247 continue;
239 if (nlmclnt_reclaim(host, fl) != 0) 248 if (nlmclnt_reclaim(host, fl) != 0)
@@ -261,5 +270,5 @@ restart:
261 nlm_release_host(host); 270 nlm_release_host(host);
262 lockd_down(); 271 lockd_down();
263 unlock_kernel(); 272 unlock_kernel();
264 module_put_and_exit(0); 273 return 0;
265} 274}
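The clntlock.c change converts a raw kernel_thread()/daemonize() pair to the kthread API: kthread_run() returns a fully set-up task_struct (or an ERR_PTR), no daemonize() call is needed, and the thread function simply returns instead of calling module_put_and_exit(). The call site, reduced to a sketch:

#include <linux/kthread.h>

static int reclaimer(void *ptr);	/* thread body, as in the file above */

static void spawn_reclaimer_sketch(struct nlm_host *host)
{
	struct task_struct *task;

	task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name);
	if (IS_ERR(task))
		printk(KERN_ERR "lockd: cannot spawn reclaimer for %s (%ld)\n",
		       host->h_name, PTR_ERR(task));
}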
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 31668b690e03..dd7957064a8c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -16,7 +16,6 @@
16#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
17#include <linux/sunrpc/svc.h> 17#include <linux/sunrpc/svc.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/sm_inter.h>
20 19
21#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
22#define NLMCLNT_GRACE_WAIT (5*HZ) 21#define NLMCLNT_GRACE_WAIT (5*HZ)
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
518 unsigned char fl_type; 517 unsigned char fl_type;
519 int status = -ENOLCK; 518 int status = -ENOLCK;
520 519
521 if (nsm_monitor(host) < 0) { 520 if (nsm_monitor(host) < 0)
522 printk(KERN_NOTICE "lockd: failed to monitor %s\n",
523 host->h_name);
524 goto out; 521 goto out;
525 } 522
526 fl->fl_flags |= FL_ACCESS; 523 fl->fl_flags |= FL_ACCESS;
527 status = do_vfs_lock(fl); 524 status = do_vfs_lock(fl);
528 fl->fl_flags = fl_flags; 525 fl->fl_flags = fl_flags;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index e05d04416037..99d737bd4325 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -15,7 +15,6 @@
15#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/sm_inter.h>
19#include <linux/mutex.h> 18#include <linux/mutex.h>
20 19
21#include <net/ipv6.h> 20#include <net/ipv6.h>
@@ -32,11 +31,6 @@ static int nrhosts;
32static DEFINE_MUTEX(nlm_host_mutex); 31static DEFINE_MUTEX(nlm_host_mutex);
33 32
34static void nlm_gc_hosts(void); 33static void nlm_gc_hosts(void);
35static struct nsm_handle *nsm_find(const struct sockaddr *sap,
36 const size_t salen,
37 const char *hostname,
38 const size_t hostname_len,
39 const int create);
40 34
41struct nlm_lookup_host_info { 35struct nlm_lookup_host_info {
42 const int server; /* search for server|client */ 36 const int server; /* search for server|client */
@@ -48,6 +42,7 @@ struct nlm_lookup_host_info {
48 const size_t hostname_len; /* its length */ 42 const size_t hostname_len; /* its length */
49 const struct sockaddr *src_sap; /* our address (optional) */ 43 const struct sockaddr *src_sap; /* our address (optional) */
50 const size_t src_len; /* its length */ 44 const size_t src_len; /* its length */
45 const int noresvport; /* use non-priv port */
51}; 46};
52 47
53/* 48/*
@@ -104,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap)
104 } 99 }
105} 100}
106 101
107static void nlm_display_address(const struct sockaddr *sap,
108 char *buf, const size_t len)
109{
110 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
111 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
112
113 switch (sap->sa_family) {
114 case AF_UNSPEC:
115 snprintf(buf, len, "unspecified");
116 break;
117 case AF_INET:
118 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
119 break;
120 case AF_INET6:
121 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
122 snprintf(buf, len, "%pI4",
123 &sin6->sin6_addr.s6_addr32[3]);
124 else
125 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
126 break;
127 default:
128 snprintf(buf, len, "unsupported address family");
129 break;
130 }
131}
132
133/* 102/*
134 * Common host lookup routine for server & client 103 * Common host lookup routine for server & client
135 */ 104 */
@@ -189,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
189 atomic_inc(&nsm->sm_count); 158 atomic_inc(&nsm->sm_count);
190 else { 159 else {
191 host = NULL; 160 host = NULL;
192 nsm = nsm_find(ni->sap, ni->salen, 161 nsm = nsm_get_handle(ni->sap, ni->salen,
193 ni->hostname, ni->hostname_len, 1); 162 ni->hostname, ni->hostname_len);
194 if (!nsm) { 163 if (!nsm) {
195 dprintk("lockd: nlm_lookup_host failed; " 164 dprintk("lockd: nlm_lookup_host failed; "
196 "no nsm handle\n"); 165 "no nsm handle\n");
@@ -205,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
205 goto out; 174 goto out;
206 } 175 }
207 host->h_name = nsm->sm_name; 176 host->h_name = nsm->sm_name;
177 host->h_addrbuf = nsm->sm_addrbuf;
208 memcpy(nlm_addr(host), ni->sap, ni->salen); 178 memcpy(nlm_addr(host), ni->sap, ni->salen);
209 host->h_addrlen = ni->salen; 179 host->h_addrlen = ni->salen;
210 nlm_clear_port(nlm_addr(host)); 180 nlm_clear_port(nlm_addr(host));
@@ -222,6 +192,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
222 host->h_nsmstate = 0; /* real NSM state */ 192 host->h_nsmstate = 0; /* real NSM state */
223 host->h_nsmhandle = nsm; 193 host->h_nsmhandle = nsm;
224 host->h_server = ni->server; 194 host->h_server = ni->server;
195 host->h_noresvport = ni->noresvport;
225 hlist_add_head(&host->h_hash, chain); 196 hlist_add_head(&host->h_hash, chain);
226 INIT_LIST_HEAD(&host->h_lockowners); 197 INIT_LIST_HEAD(&host->h_lockowners);
227 spin_lock_init(&host->h_lock); 198 spin_lock_init(&host->h_lock);
@@ -230,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
230 201
231 nrhosts++; 202 nrhosts++;
232 203
233 nlm_display_address((struct sockaddr *)&host->h_addr,
234 host->h_addrbuf, sizeof(host->h_addrbuf));
235 nlm_display_address((struct sockaddr *)&host->h_srcaddr,
236 host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
237
238 dprintk("lockd: nlm_lookup_host created host %s\n", 204 dprintk("lockd: nlm_lookup_host created host %s\n",
239 host->h_name); 205 host->h_name);
240 206
@@ -254,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host)
254 BUG_ON(!list_empty(&host->h_lockowners)); 220 BUG_ON(!list_empty(&host->h_lockowners));
255 BUG_ON(atomic_read(&host->h_count)); 221 BUG_ON(atomic_read(&host->h_count));
256 222
257 /*
258 * Release NSM handle and unmonitor host.
259 */
260 nsm_unmonitor(host); 223 nsm_unmonitor(host);
224 nsm_release(host->h_nsmhandle);
261 225
262 clnt = host->h_rpcclnt; 226 clnt = host->h_rpcclnt;
263 if (clnt != NULL) 227 if (clnt != NULL)
@@ -272,6 +236,7 @@ nlm_destroy_host(struct nlm_host *host)
272 * @protocol: transport protocol to use 236 * @protocol: transport protocol to use
273 * @version: NLM protocol version 237 * @version: NLM protocol version
274 * @hostname: '\0'-terminated hostname of server 238 * @hostname: '\0'-terminated hostname of server
239 * @noresvport: 1 if non-privileged port should be used
275 * 240 *
276 * Returns an nlm_host structure that matches the passed-in 241 * Returns an nlm_host structure that matches the passed-in
277 * [server address, transport protocol, NLM version, server hostname]. 242 * [server address, transport protocol, NLM version, server hostname].
@@ -281,7 +246,9 @@ nlm_destroy_host(struct nlm_host *host)
281struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, 246struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
282 const size_t salen, 247 const size_t salen,
283 const unsigned short protocol, 248 const unsigned short protocol,
284 const u32 version, const char *hostname) 249 const u32 version,
250 const char *hostname,
251 int noresvport)
285{ 252{
286 const struct sockaddr source = { 253 const struct sockaddr source = {
287 .sa_family = AF_UNSPEC, 254 .sa_family = AF_UNSPEC,
@@ -296,6 +263,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
296 .hostname_len = strlen(hostname), 263 .hostname_len = strlen(hostname),
297 .src_sap = &source, 264 .src_sap = &source,
298 .src_len = sizeof(source), 265 .src_len = sizeof(source),
266 .noresvport = noresvport,
299 }; 267 };
300 268
301 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, 269 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
@@ -372,8 +340,8 @@ nlm_bind_host(struct nlm_host *host)
372{ 340{
373 struct rpc_clnt *clnt; 341 struct rpc_clnt *clnt;
374 342
375 dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", 343 dprintk("lockd: nlm_bind_host %s (%s)\n",
376 host->h_name, host->h_addrbuf, host->h_srcaddrbuf); 344 host->h_name, host->h_addrbuf);
377 345
378 /* Lock host handle */ 346 /* Lock host handle */
379 mutex_lock(&host->h_mutex); 347 mutex_lock(&host->h_mutex);
@@ -417,6 +385,8 @@ nlm_bind_host(struct nlm_host *host)
417 */ 385 */
418 if (!host->h_server) 386 if (!host->h_server)
419 args.flags |= RPC_CLNT_CREATE_HARDRTRY; 387 args.flags |= RPC_CLNT_CREATE_HARDRTRY;
388 if (host->h_noresvport)
389 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
420 390
421 clnt = rpc_create(&args); 391 clnt = rpc_create(&args);
422 if (!IS_ERR(clnt)) 392 if (!IS_ERR(clnt))
@@ -473,35 +443,23 @@ void nlm_release_host(struct nlm_host *host)
473 } 443 }
474} 444}
475 445
476/* 446/**
477 * We were notified that the host indicated by address &sin 447 * nlm_host_rebooted - Release all resources held by rebooted host
478 * has rebooted. 448 * @info: pointer to decoded results of NLM_SM_NOTIFY call
479 * Release all resources held by that peer. 449 *
450 * We were notified that the specified host has rebooted. Release
451 * all resources held by that peer.
480 */ 452 */
481void nlm_host_rebooted(const struct sockaddr_in *sin, 453void nlm_host_rebooted(const struct nlm_reboot *info)
482 const char *hostname,
483 unsigned int hostname_len,
484 u32 new_state)
485{ 454{
486 struct hlist_head *chain; 455 struct hlist_head *chain;
487 struct hlist_node *pos; 456 struct hlist_node *pos;
488 struct nsm_handle *nsm; 457 struct nsm_handle *nsm;
489 struct nlm_host *host; 458 struct nlm_host *host;
490 459
491 nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), 460 nsm = nsm_reboot_lookup(info);
492 hostname, hostname_len, 0); 461 if (unlikely(nsm == NULL))
493 if (nsm == NULL) {
494 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
495 hostname_len, hostname);
496 return; 462 return;
497 }
498
499 dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
500 hostname_len, hostname, nsm->sm_addrbuf);
501
502 /* When reclaiming locks on this peer, make sure that
503 * we set up a new notification */
504 nsm->sm_monitored = 0;
505 463
506 /* Mark all hosts tied to this NSM state as having rebooted. 464 /* Mark all hosts tied to this NSM state as having rebooted.
507 * We run the loop repeatedly, because we drop the host table 465 * We run the loop repeatedly, because we drop the host table
@@ -512,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex);
512 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 470 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
513 hlist_for_each_entry(host, pos, chain, h_hash) { 471 hlist_for_each_entry(host, pos, chain, h_hash) {
514 if (host->h_nsmhandle == nsm 472 if (host->h_nsmhandle == nsm
515 && host->h_nsmstate != new_state) { 473 && host->h_nsmstate != info->state) {
516 host->h_nsmstate = new_state; 474 host->h_nsmstate = info->state;
517 host->h_state++; 475 host->h_state++;
518 476
519 nlm_get_host(host); 477 nlm_get_host(host);
@@ -621,89 +579,3 @@ nlm_gc_hosts(void)
621 579
622 next_gc = jiffies + NLM_HOST_COLLECT; 580 next_gc = jiffies + NLM_HOST_COLLECT;
623} 581}
624
625
626/*
627 * Manage NSM handles
628 */
629static LIST_HEAD(nsm_handles);
630static DEFINE_SPINLOCK(nsm_lock);
631
632static struct nsm_handle *nsm_find(const struct sockaddr *sap,
633 const size_t salen,
634 const char *hostname,
635 const size_t hostname_len,
636 const int create)
637{
638 struct nsm_handle *nsm = NULL;
639 struct nsm_handle *pos;
640
641 if (!sap)
642 return NULL;
643
644 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
645 if (printk_ratelimit()) {
646 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
647 "in NFS lock request\n",
648 (int)hostname_len, hostname);
649 }
650 return NULL;
651 }
652
653retry:
654 spin_lock(&nsm_lock);
655 list_for_each_entry(pos, &nsm_handles, sm_link) {
656
657 if (hostname && nsm_use_hostnames) {
658 if (strlen(pos->sm_name) != hostname_len
659 || memcmp(pos->sm_name, hostname, hostname_len))
660 continue;
661 } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
662 continue;
663 atomic_inc(&pos->sm_count);
664 kfree(nsm);
665 nsm = pos;
666 goto found;
667 }
668 if (nsm) {
669 list_add(&nsm->sm_link, &nsm_handles);
670 goto found;
671 }
672 spin_unlock(&nsm_lock);
673
674 if (!create)
675 return NULL;
676
677 nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
678 if (nsm == NULL)
679 return NULL;
680
681 memcpy(nsm_addr(nsm), sap, salen);
682 nsm->sm_addrlen = salen;
683 nsm->sm_name = (char *) (nsm + 1);
684 memcpy(nsm->sm_name, hostname, hostname_len);
685 nsm->sm_name[hostname_len] = '\0';
686 nlm_display_address((struct sockaddr *)&nsm->sm_addr,
687 nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
688 atomic_set(&nsm->sm_count, 1);
689 goto retry;
690
691found:
692 spin_unlock(&nsm_lock);
693 return nsm;
694}
695
696/*
697 * Release an NSM handle
698 */
699void
700nsm_release(struct nsm_handle *nsm)
701{
702 if (!nsm)
703 return;
704 if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
705 list_del(&nsm->sm_link);
706 spin_unlock(&nsm_lock);
707 kfree(nsm);
708 }
709}
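Besides moving the NSM handle cache out of host.c, these hunks plumb the new noresvport option through nlm_lookup_host_info into the nlm_host and, ultimately, into the RPC transport. The final step is a flag selection in nlm_bind_host(), sketched here with an invented wrapper name:

static struct rpc_clnt *bind_host_sketch(struct nlm_host *host,
					 struct rpc_create_args *args)
{
	if (!host->h_server)
		args->flags |= RPC_CLNT_CREATE_HARDRTRY;
	if (host->h_noresvport)
		args->flags |= RPC_CLNT_CREATE_NONPRIVPORT;	/* non-privileged source port */
	return rpc_create(args);
}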
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index ffd3461f75ef..5e2c4d5ac827 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -9,35 +9,123 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h>
13
12#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
13#include <linux/sunrpc/xprtsock.h> 15#include <linux/sunrpc/xprtsock.h>
14#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
15#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
16#include <linux/lockd/sm_inter.h>
17
18 18
19#define NLMDBG_FACILITY NLMDBG_MONITOR 19#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024
21#define NSM_VERSION 1
22
23enum {
24 NSMPROC_NULL,
25 NSMPROC_STAT,
26 NSMPROC_MON,
27 NSMPROC_UNMON,
28 NSMPROC_UNMON_ALL,
29 NSMPROC_SIMU_CRASH,
30 NSMPROC_NOTIFY,
31};
32
33struct nsm_args {
34 struct nsm_private *priv;
35 u32 prog; /* RPC callback info */
36 u32 vers;
37 u32 proc;
20 38
21#define XDR_ADDRBUF_LEN (20) 39 char *mon_name;
40};
22 41
23static struct rpc_clnt * nsm_create(void); 42struct nsm_res {
43 u32 status;
44 u32 state;
45};
24 46
25static struct rpc_program nsm_program; 47static struct rpc_program nsm_program;
48static LIST_HEAD(nsm_handles);
49static DEFINE_SPINLOCK(nsm_lock);
26 50
27/* 51/*
28 * Local NSM state 52 * Local NSM state
29 */ 53 */
30int nsm_local_state; 54int __read_mostly nsm_local_state;
55int __read_mostly nsm_use_hostnames;
31 56
32/* 57static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
33 * Common procedure for SM_MON/SM_UNMON calls 58{
34 */ 59 return (struct sockaddr *)&nsm->sm_addr;
35static int 60}
36nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) 61
62static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
63 const size_t len)
64{
65 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
66 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
67}
68
69static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
70 const size_t len)
71{
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
73
74 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
75 snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
76 else if (sin6->sin6_scope_id != 0)
77 snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
78 sin6->sin6_scope_id);
79 else
80 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
81}
82
83static void nsm_display_address(const struct sockaddr *sap,
84 char *buf, const size_t len)
85{
86 switch (sap->sa_family) {
87 case AF_INET:
88 nsm_display_ipv4_address(sap, buf, len);
89 break;
90 case AF_INET6:
91 nsm_display_ipv6_address(sap, buf, len);
92 break;
93 default:
94 snprintf(buf, len, "unsupported address family");
95 break;
96 }
97}
98
99static struct rpc_clnt *nsm_create(void)
100{
101 struct sockaddr_in sin = {
102 .sin_family = AF_INET,
103 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
104 };
105 struct rpc_create_args args = {
106 .protocol = XPRT_TRANSPORT_UDP,
107 .address = (struct sockaddr *)&sin,
108 .addrsize = sizeof(sin),
109 .servername = "rpc.statd",
110 .program = &nsm_program,
111 .version = NSM_VERSION,
112 .authflavor = RPC_AUTH_NULL,
113 };
114
115 return rpc_create(&args);
116}
117
118static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
37{ 119{
38 struct rpc_clnt *clnt; 120 struct rpc_clnt *clnt;
39 int status; 121 int status;
40 struct nsm_args args; 122 struct nsm_args args = {
123 .priv = &nsm->sm_priv,
124 .prog = NLM_PROGRAM,
125 .vers = 3,
126 .proc = NLMPROC_NSM_NOTIFY,
127 .mon_name = nsm->sm_mon_name,
128 };
41 struct rpc_message msg = { 129 struct rpc_message msg = {
42 .rpc_argp = &args, 130 .rpc_argp = &args,
43 .rpc_resp = res, 131 .rpc_resp = res,
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
46 clnt = nsm_create(); 134 clnt = nsm_create();
47 if (IS_ERR(clnt)) { 135 if (IS_ERR(clnt)) {
48 status = PTR_ERR(clnt); 136 status = PTR_ERR(clnt);
137 dprintk("lockd: failed to create NSM upcall transport, "
138 "status=%d\n", status);
49 goto out; 139 goto out;
50 } 140 }
51 141
52 memset(&args, 0, sizeof(args));
53 args.mon_name = nsm->sm_name;
54 args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
55 args.prog = NLM_PROGRAM;
56 args.vers = 3;
57 args.proc = NLMPROC_NSM_NOTIFY;
58 memset(res, 0, sizeof(*res)); 142 memset(res, 0, sizeof(*res));
59 143
60 msg.rpc_proc = &clnt->cl_procinfo[proc]; 144 msg.rpc_proc = &clnt->cl_procinfo[proc];
61 status = rpc_call_sync(clnt, &msg, 0); 145 status = rpc_call_sync(clnt, &msg, 0);
62 if (status < 0) 146 if (status < 0)
63 printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", 147 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
64 status); 148 status);
65 else 149 else
66 status = 0; 150 status = 0;
67 rpc_shutdown_client(clnt); 151 rpc_shutdown_client(clnt);
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
69 return status; 153 return status;
70} 154}
71 155
72/* 156/**
73 * Set up monitoring of a remote host 157 * nsm_monitor - Notify a peer in case we reboot
158 * @host: pointer to nlm_host of peer to notify
159 *
160 * If this peer is not already monitored, this function sends an
161 * upcall to the local rpc.statd to record the name/address of
162 * the peer to notify in case we reboot.
163 *
164 * Returns zero if the peer is monitored by the local rpc.statd;
165 * otherwise a negative errno value is returned.
74 */ 166 */
75int 167int nsm_monitor(const struct nlm_host *host)
76nsm_monitor(struct nlm_host *host)
77{ 168{
78 struct nsm_handle *nsm = host->h_nsmhandle; 169 struct nsm_handle *nsm = host->h_nsmhandle;
79 struct nsm_res res; 170 struct nsm_res res;
80 int status; 171 int status;
81 172
82 dprintk("lockd: nsm_monitor(%s)\n", host->h_name); 173 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
83 BUG_ON(nsm == NULL);
84 174
85 if (nsm->sm_monitored) 175 if (nsm->sm_monitored)
86 return 0; 176 return 0;
87 177
88 status = nsm_mon_unmon(nsm, SM_MON, &res); 178 /*
179 * Choose whether to record the caller_name or IP address of
180 * this peer in the local rpc.statd's database.
181 */
182 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
89 183
90 if (status < 0 || res.status != 0) 184 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
91 printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); 185 if (res.status != 0)
186 status = -EIO;
187 if (status < 0)
188 printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
92 else 189 else
93 nsm->sm_monitored = 1; 190 nsm->sm_monitored = 1;
94 return status; 191 return status;
95} 192}
96 193
97/* 194/**
98 * Cease to monitor remote host 195 * nsm_unmonitor - Unregister peer notification
196 * @host: pointer to nlm_host of peer to stop monitoring
197 *
198 * If this peer is monitored, this function sends an upcall to
199 * tell the local rpc.statd not to send this peer a notification
200 * when we reboot.
99 */ 201 */
100int 202void nsm_unmonitor(const struct nlm_host *host)
101nsm_unmonitor(struct nlm_host *host)
102{ 203{
103 struct nsm_handle *nsm = host->h_nsmhandle; 204 struct nsm_handle *nsm = host->h_nsmhandle;
104 struct nsm_res res; 205 struct nsm_res res;
105 int status = 0; 206 int status;
106
107 if (nsm == NULL)
108 return 0;
109 host->h_nsmhandle = NULL;
110 207
111 if (atomic_read(&nsm->sm_count) == 1 208 if (atomic_read(&nsm->sm_count) == 1
112 && nsm->sm_monitored && !nsm->sm_sticky) { 209 && nsm->sm_monitored && !nsm->sm_sticky) {
113 dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); 210 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
114 211
115 status = nsm_mon_unmon(nsm, SM_UNMON, &res); 212 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
213 if (res.status != 0)
214 status = -EIO;
116 if (status < 0) 215 if (status < 0)
117 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", 216 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
118 host->h_name); 217 nsm->sm_name);
119 else 218 else
120 nsm->sm_monitored = 0; 219 nsm->sm_monitored = 0;
121 } 220 }
122 nsm_release(nsm); 221}
123 return status; 222
223static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
224 const size_t len)
225{
226 struct nsm_handle *nsm;
227
228 list_for_each_entry(nsm, &nsm_handles, sm_link)
229 if (strlen(nsm->sm_name) == len &&
230 memcmp(nsm->sm_name, hostname, len) == 0)
231 return nsm;
232 return NULL;
233}
234
235static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
236{
237 struct nsm_handle *nsm;
238
239 list_for_each_entry(nsm, &nsm_handles, sm_link)
240 if (nlm_cmp_addr(nsm_addr(nsm), sap))
241 return nsm;
242 return NULL;
243}
244
245static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
246{
247 struct nsm_handle *nsm;
248
249 list_for_each_entry(nsm, &nsm_handles, sm_link)
250 if (memcmp(nsm->sm_priv.data, priv->data,
251 sizeof(priv->data)) == 0)
252 return nsm;
253 return NULL;
124} 254}
125 255
126/* 256/*
127 * Create NSM client for the local host 257 * Construct a unique cookie to match this nsm_handle to this monitored
258 * host. It is passed to the local rpc.statd via NSMPROC_MON, and
259 * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
260 * requests.
261 *
262 * The NSM protocol requires that these cookies be unique while the
263 * system is running. We prefer a stronger requirement of making them
264 * unique across reboots. If user space bugs cause a stale cookie to
265 * be sent to the kernel, it could cause the wrong host to lose its
266 * lock state if cookies were not unique across reboots.
267 *
268 * The cookies are exposed only to local user space via loopback. They
269 * do not appear on the physical network. If we want greater security
270 * for some reason, nsm_init_private() could perform a one-way hash to
271 * obscure the contents of the cookie.
128 */ 272 */
129static struct rpc_clnt * 273static void nsm_init_private(struct nsm_handle *nsm)
130nsm_create(void)
131{ 274{
132 struct sockaddr_in sin = { 275 u64 *p = (u64 *)&nsm->sm_priv.data;
133 .sin_family = AF_INET, 276 struct timespec ts;
134 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
135 .sin_port = 0,
136 };
137 struct rpc_create_args args = {
138 .protocol = XPRT_TRANSPORT_UDP,
139 .address = (struct sockaddr *)&sin,
140 .addrsize = sizeof(sin),
141 .servername = "localhost",
142 .program = &nsm_program,
143 .version = SM_VERSION,
144 .authflavor = RPC_AUTH_NULL,
145 };
146 277
147 return rpc_create(&args); 278 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm;
281}
282
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
284 const size_t salen,
285 const char *hostname,
286 const size_t hostname_len)
287{
288 struct nsm_handle *new;
289
290 new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
291 if (unlikely(new == NULL))
292 return NULL;
293
294 atomic_set(&new->sm_count, 1);
295 new->sm_name = (char *)(new + 1);
296 memcpy(nsm_addr(new), sap, salen);
297 new->sm_addrlen = salen;
298 nsm_init_private(new);
299 nsm_display_address((const struct sockaddr *)&new->sm_addr,
300 new->sm_addrbuf, sizeof(new->sm_addrbuf));
301 memcpy(new->sm_name, hostname, hostname_len);
302 new->sm_name[hostname_len] = '\0';
303
304 return new;
305}
306
307/**
308 * nsm_get_handle - Find or create a cached nsm_handle
309 * @sap: pointer to socket address of handle to find
310 * @salen: length of socket address
311 * @hostname: pointer to C string containing hostname to find
312 * @hostname_len: length of C string
313 *
314 * Behavior is modulated by the global nsm_use_hostnames variable.
315 *
316 * Returns a cached nsm_handle after bumping its ref count, or
317 * returns a fresh nsm_handle if a handle that matches @sap and/or
318 * @hostname cannot be found in the handle cache. Returns NULL if
319 * an error occurs.
320 */
321struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
322 const size_t salen, const char *hostname,
323 const size_t hostname_len)
324{
325 struct nsm_handle *cached, *new = NULL;
326
327 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
328 if (printk_ratelimit()) {
329 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
330 "in NFS lock request\n",
331 (int)hostname_len, hostname);
332 }
333 return NULL;
334 }
335
336retry:
337 spin_lock(&nsm_lock);
338
339 if (nsm_use_hostnames && hostname != NULL)
340 cached = nsm_lookup_hostname(hostname, hostname_len);
341 else
342 cached = nsm_lookup_addr(sap);
343
344 if (cached != NULL) {
345 atomic_inc(&cached->sm_count);
346 spin_unlock(&nsm_lock);
347 kfree(new);
348 dprintk("lockd: found nsm_handle for %s (%s), "
349 "cnt %d\n", cached->sm_name,
350 cached->sm_addrbuf,
351 atomic_read(&cached->sm_count));
352 return cached;
353 }
354
355 if (new != NULL) {
356 list_add(&new->sm_link, &nsm_handles);
357 spin_unlock(&nsm_lock);
358 dprintk("lockd: created nsm_handle for %s (%s)\n",
359 new->sm_name, new->sm_addrbuf);
360 return new;
361 }
362
363 spin_unlock(&nsm_lock);
364
365 new = nsm_create_handle(sap, salen, hostname, hostname_len);
366 if (unlikely(new == NULL))
367 return NULL;
368 goto retry;
369}
370
371/**
372 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
373 * @info: pointer to NLMPROC_SM_NOTIFY arguments
374 *
375 * Returns a matching nsm_handle if found in the nsm cache; the returned
376 * nsm_handle's reference count is bumped and sm_monitored is cleared.
377 * Otherwise returns NULL if some error occurred.
378 */
379struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
380{
381 struct nsm_handle *cached;
382
383 spin_lock(&nsm_lock);
384
385 cached = nsm_lookup_priv(&info->priv);
386 if (unlikely(cached == NULL)) {
387 spin_unlock(&nsm_lock);
388 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
389 info->len, info->mon);
390 return cached;
391 }
392
393 atomic_inc(&cached->sm_count);
394 spin_unlock(&nsm_lock);
395
396 /*
397 * During subsequent lock activity, force a fresh
398 * notification to be set up for this host.
399 */
400 cached->sm_monitored = 0;
401
402 dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
403 cached->sm_name, cached->sm_addrbuf,
404 atomic_read(&cached->sm_count));
405 return cached;
406}
407
408/**
409 * nsm_release - Release an NSM handle
410 * @nsm: pointer to handle to be released
411 *
412 */
413void nsm_release(struct nsm_handle *nsm)
414{
415 if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
416 list_del(&nsm->sm_link);
417 spin_unlock(&nsm_lock);
418 dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
419 nsm->sm_name, nsm->sm_addrbuf);
420 kfree(nsm);
421 }
148} 422}
149 423
150/* 424/*
@@ -154,127 +428,132 @@ nsm_create(void)
154 * Status Monitor wire protocol. 428 * Status Monitor wire protocol.
155 */ 429 */
156 430
157static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) 431static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
158{ 432{
159 size_t len = strlen(string); 433 const u32 len = strlen(string);
160 434 __be32 *p;
161 if (len > SM_MAXSTRLEN) 435
162 len = SM_MAXSTRLEN; 436 if (unlikely(len > SM_MAXSTRLEN))
163 return xdr_encode_opaque(p, string, len); 437 return -EIO;
438 p = xdr_reserve_space(xdr, sizeof(u32) + len);
439 if (unlikely(p == NULL))
440 return -EIO;
441 xdr_encode_opaque(p, string, len);
442 return 0;
164} 443}
165 444
166/* 445/*
167 * "mon_name" specifies the host to be monitored. 446 * "mon_name" specifies the host to be monitored.
168 *
169 * Linux uses a text version of the IP address of the remote
170 * host as the host identifier (the "mon_name" argument).
171 *
172 * Linux statd always looks up the canonical hostname first for
173 * whatever remote hostname it receives, so this works alright.
174 */ 447 */
175static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) 448static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
176{ 449{
177 char buffer[XDR_ADDRBUF_LEN + 1]; 450 return encode_nsm_string(xdr, argp->mon_name);
178 char *name = argp->mon_name;
179
180 if (!nsm_use_hostnames) {
181 snprintf(buffer, XDR_ADDRBUF_LEN,
182 "%pI4", &argp->addr);
183 name = buffer;
184 }
185
186 return xdr_encode_nsm_string(p, name);
187} 451}
188 452
189/* 453/*
190 * The "my_id" argument specifies the hostname and RPC procedure 454 * The "my_id" argument specifies the hostname and RPC procedure
191 * to be called when the status manager receives notification 455 * to be called when the status manager receives notification
192 * (via the SM_NOTIFY call) that the state of host "mon_name" 456 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
193 * has changed. 457 * has changed.
194 */ 458 */
195static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) 459static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
196{ 460{
197 p = xdr_encode_nsm_string(p, utsname()->nodename); 461 int status;
198 if (!p) 462 __be32 *p;
199 return ERR_PTR(-EIO); 463
200 464 status = encode_nsm_string(xdr, utsname()->nodename);
465 if (unlikely(status != 0))
466 return status;
467 p = xdr_reserve_space(xdr, 3 * sizeof(u32));
468 if (unlikely(p == NULL))
469 return -EIO;
201 *p++ = htonl(argp->prog); 470 *p++ = htonl(argp->prog);
202 *p++ = htonl(argp->vers); 471 *p++ = htonl(argp->vers);
203 *p++ = htonl(argp->proc); 472 *p++ = htonl(argp->proc);
204 473 return 0;
205 return p;
206} 474}
207 475
208/* 476/*
209 * The "mon_id" argument specifies the non-private arguments 477 * The "mon_id" argument specifies the non-private arguments
210 * of an SM_MON or SM_UNMON call. 478 * of an NSMPROC_MON or NSMPROC_UNMON call.
211 */ 479 */
212static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) 480static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
213{ 481{
214 p = xdr_encode_mon_name(p, argp); 482 int status;
215 if (!p)
216 return ERR_PTR(-EIO);
217 483
218 return xdr_encode_my_id(p, argp); 484 status = encode_mon_name(xdr, argp);
485 if (unlikely(status != 0))
486 return status;
487 return encode_my_id(xdr, argp);
219} 488}
220 489
221/* 490/*
222 * The "priv" argument may contain private information required 491 * The "priv" argument may contain private information required
223 * by the SM_MON call. This information will be supplied in the 492 * by the NSMPROC_MON call. This information will be supplied in the
224 * SM_NOTIFY call. 493 * NLMPROC_SM_NOTIFY call.
225 *
226 * Linux provides the raw IP address of the monitored host,
227 * left in network byte order.
228 */ 494 */
229static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) 495static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
230{ 496{
231 *p++ = argp->addr; 497 __be32 *p;
232 *p++ = 0;
233 *p++ = 0;
234 *p++ = 0;
235 498
236 return p; 499 p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
500 if (unlikely(p == NULL))
501 return -EIO;
502 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
503 return 0;
237} 504}
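
Unlike mon_name, the priv cookie is a fixed-size opaque (SM_PRIV_SIZE, 16 bytes in the NSM protocol), so no length word goes on the wire: xdr_reserve_space() claims exactly SM_PRIV_SIZE bytes and xdr_encode_opaque_fixed() copies the cookie verbatim. A rough user-space equivalent of the fixed-opaque copy, assuming that 16-byte size:

#include <stdint.h>
#include <string.h>

#define SM_PRIV_SIZE 16   /* fixed NSM cookie size, already 4-byte aligned */

/* Copy a fixed-size opaque into the output buffer; the peer knows the
 * length from the protocol, so no length word precedes the data. */
static int put_priv(uint8_t *out, size_t avail,
                    const uint8_t priv[SM_PRIV_SIZE])
{
        if (avail < SM_PRIV_SIZE)
                return -1;            /* reserve failed: -EIO in the kernel */
        memcpy(out, priv, SM_PRIV_SIZE);
        return SM_PRIV_SIZE;
}

int main(void)
{
        uint8_t out[32], cookie[SM_PRIV_SIZE] = { 0xde, 0xad };

        return put_priv(out, sizeof(out), cookie) == SM_PRIV_SIZE ? 0 : 1;
}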
238 505
239static int 506static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
240xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) 507 const struct nsm_args *argp)
241{ 508{
242 p = xdr_encode_mon_id(p, argp); 509 struct xdr_stream xdr;
243 if (IS_ERR(p)) 510 int status;
244 return PTR_ERR(p); 511
245 512 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
246 p = xdr_encode_priv(p, argp); 513 status = encode_mon_id(&xdr, argp);
247 if (IS_ERR(p)) 514 if (unlikely(status))
248 return PTR_ERR(p); 515 return status;
249 516 return encode_priv(&xdr, argp);
250 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
251 return 0;
252} 517}
253 518
254static int 519static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
255xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) 520 const struct nsm_args *argp)
256{ 521{
257 p = xdr_encode_mon_id(p, argp); 522 struct xdr_stream xdr;
258 if (IS_ERR(p)) 523
259 return PTR_ERR(p); 524 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
260 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); 525 return encode_mon_id(&xdr, argp);
261 return 0;
262} 526}
263 527
264static int 528static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
265xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) 529 struct nsm_res *resp)
266{ 530{
531 struct xdr_stream xdr;
532
533 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
534 p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
535 if (unlikely(p == NULL))
536 return -EIO;
267 resp->status = ntohl(*p++); 537 resp->status = ntohl(*p++);
268 resp->state = ntohl(*p++); 538 resp->state = ntohl(*p);
269 dprintk("nsm: xdr_decode_stat_res status %d state %d\n", 539
540 dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
270 resp->status, resp->state); 541 resp->status, resp->state);
271 return 0; 542 return 0;
272} 543}
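
On the decode side, xdr_inline_decode() hands back a pointer only when the full 2 * sizeof(u32) is actually present in the receive buffer, which is what makes the subsequent dereferences safe. The same bounds-check-then-read discipline in plain C:

#include <arpa/inet.h>   /* ntohl() */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pull two big-endian 32-bit words (status, state) out of a reply,
 * refusing to read past the end of the buffer. */
static int dec_stat_res(const uint8_t *buf, size_t len,
                        uint32_t *status, uint32_t *state)
{
        uint32_t w[2];

        if (len < sizeof(w))
                return -1;            /* short reply: -EIO in the kernel */
        memcpy(w, buf, sizeof(w));    /* avoids an unaligned dereference */
        *status = ntohl(w[0]);
        *state  = ntohl(w[1]);
        return 0;
}

int main(void)
{
        uint8_t reply[8] = { 0, 0, 0, 0, 0, 0, 0, 3 };  /* status 0, state 3 */
        uint32_t status, state;

        if (dec_stat_res(reply, sizeof(reply), &status, &state) == 0)
                printf("status %u state %u\n", status, state);
        return 0;
}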
273 544
274static int 545static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
275xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) 546 struct nsm_res *resp)
276{ 547{
277 resp->state = ntohl(*p++); 548 struct xdr_stream xdr;
549
550 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
551 p = xdr_inline_decode(&xdr, sizeof(u32));
552 if (unlikely(p == NULL))
553 return -EIO;
554 resp->state = ntohl(*p);
555
556 dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
278 return 0; 557 return 0;
279} 558}
280 559
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
288#define SM_unmonres_sz 1 567#define SM_unmonres_sz 1
289 568
290static struct rpc_procinfo nsm_procedures[] = { 569static struct rpc_procinfo nsm_procedures[] = {
291[SM_MON] = { 570[NSMPROC_MON] = {
292 .p_proc = SM_MON, 571 .p_proc = NSMPROC_MON,
293 .p_encode = (kxdrproc_t) xdr_encode_mon, 572 .p_encode = (kxdrproc_t)xdr_enc_mon,
294 .p_decode = (kxdrproc_t) xdr_decode_stat_res, 573 .p_decode = (kxdrproc_t)xdr_dec_stat_res,
295 .p_arglen = SM_mon_sz, 574 .p_arglen = SM_mon_sz,
296 .p_replen = SM_monres_sz, 575 .p_replen = SM_monres_sz,
297 .p_statidx = SM_MON, 576 .p_statidx = NSMPROC_MON,
298 .p_name = "MONITOR", 577 .p_name = "MONITOR",
299 }, 578 },
300[SM_UNMON] = { 579[NSMPROC_UNMON] = {
301 .p_proc = SM_UNMON, 580 .p_proc = NSMPROC_UNMON,
302 .p_encode = (kxdrproc_t) xdr_encode_unmon, 581 .p_encode = (kxdrproc_t)xdr_enc_unmon,
303 .p_decode = (kxdrproc_t) xdr_decode_stat, 582 .p_decode = (kxdrproc_t)xdr_dec_stat,
304 .p_arglen = SM_mon_id_sz, 583 .p_arglen = SM_mon_id_sz,
305 .p_replen = SM_unmonres_sz, 584 .p_replen = SM_unmonres_sz,
306 .p_statidx = SM_UNMON, 585 .p_statidx = NSMPROC_UNMON,
307 .p_name = "UNMONITOR", 586 .p_name = "UNMONITOR",
308 }, 587 },
309}; 588};
@@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats;
322 601
323static struct rpc_program nsm_program = { 602static struct rpc_program nsm_program = {
324 .name = "statd", 603 .name = "statd",
325 .number = SM_PROGRAM, 604 .number = NSM_PROGRAM,
326 .nrvers = ARRAY_SIZE(nsm_version), 605 .nrvers = ARRAY_SIZE(nsm_version),
327 .version = nsm_version, 606 .version = nsm_version,
328 .stats = &nsm_stats 607 .stats = &nsm_stats
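
nsm_procedures[] is the usual sunrpc pattern: the procedure number indexes an array whose entries carry the encode/decode callbacks and wire-size hints for that call. A stripped-down user-space model of the same table-driven dispatch (the names and stub bodies here are illustrative, not the kernel's):

#include <stdio.h>

typedef int (*encode_fn)(const void *args);

struct proc_info {
        unsigned int proc;        /* procedure number on the wire */
        encode_fn    encode;      /* argument marshaller */
        const char  *name;        /* for stats/debug output */
};

static int enc_mon(const void *args)   { (void)args; return 0; }
static int enc_unmon(const void *args) { (void)args; return 0; }

enum { PROC_MON = 2, PROC_UNMON = 3 }; /* NSM procedure numbers */

static const struct proc_info procs[] = {
        [PROC_MON]   = { PROC_MON,   enc_mon,   "MONITOR"   },
        [PROC_UNMON] = { PROC_UNMON, enc_unmon, "UNMONITOR" },
};

int main(void)
{
        const struct proc_info *p = &procs[PROC_MON];

        printf("calling %s (proc %u)\n", p->name, p->proc);
        return p->encode(NULL);
}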
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 56b076736b56..64f1c31b5853 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,7 +35,6 @@
35#include <linux/sunrpc/svcsock.h> 35#include <linux/sunrpc/svcsock.h>
36#include <net/ip.h> 36#include <net/ip.h>
37#include <linux/lockd/lockd.h> 37#include <linux/lockd/lockd.h>
38#include <linux/lockd/sm_inter.h>
39#include <linux/nfs.h> 38#include <linux/nfs.h>
40 39
41#define NLMDBG_FACILITY NLMDBG_SVC 40#define NLMDBG_FACILITY NLMDBG_SVC
@@ -45,7 +44,7 @@
45static struct svc_program nlmsvc_program; 44static struct svc_program nlmsvc_program;
46 45
47struct nlmsvc_binding * nlmsvc_ops; 46struct nlmsvc_binding * nlmsvc_ops;
48EXPORT_SYMBOL(nlmsvc_ops); 47EXPORT_SYMBOL_GPL(nlmsvc_ops);
49 48
50static DEFINE_MUTEX(nlmsvc_mutex); 49static DEFINE_MUTEX(nlmsvc_mutex);
51static unsigned int nlmsvc_users; 50static unsigned int nlmsvc_users;
@@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst;
54unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
55 54
56/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
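
Declaring the service family as AF_INET6 when IPv6 is available gives one listener for both protocols, because IPv4 peers appear as v4-mapped IPv6 addresses on an AF_INET6 socket (as long as IPV6_V6ONLY is off). A user-space illustration of that dual-stack choice:

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int off = 0;
        struct sockaddr_in6 sa;
        int fd = socket(AF_INET6, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;
        /* Explicitly allow v4-mapped peers; some systems default to v6-only. */
        setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off));

        memset(&sa, 0, sizeof(sa));
        sa.sin6_family = AF_INET6;
        sa.sin6_addr   = in6addr_any;  /* wildcard: both AF_INET and AF_INET6 */
        sa.sin6_port   = 0;            /* any port, for the demo */

        if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
                puts("one AF_INET6 listener serves both address families");
        close(fd);
        return 0;
}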
65
66/*
57 * These can be set at insmod time (useful for NFS as root filesystem), 67 * These can be set at insmod time (useful for NFS as root filesystem),
58 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
59 */ 69 */
60static unsigned long nlm_grace_period; 70static unsigned long nlm_grace_period;
61static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; 71static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
62static int nlm_udpport, nlm_tcpport; 72static int nlm_udpport, nlm_tcpport;
63int nsm_use_hostnames = 0; 73
74/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
75static unsigned int nlm_max_connections = 1024;
64 76
65/* 77/*
66 * Constants needed for the sysctl interface. 78 * Constants needed for the sysctl interface.
@@ -143,6 +155,9 @@ lockd(void *vrqstp)
143 long timeout = MAX_SCHEDULE_TIMEOUT; 155 long timeout = MAX_SCHEDULE_TIMEOUT;
144 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 156 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
145 157
158 /* update sv_maxconn if it has changed */
159 rqstp->rq_server->sv_maxconn = nlm_max_connections;
160
146 if (signalled()) { 161 if (signalled()) {
147 flush_signals(current); 162 flush_signals(current);
148 if (nlmsvc_ops) { 163 if (nlmsvc_ops) {
@@ -189,6 +204,19 @@ lockd(void *vrqstp)
189 return 0; 204 return 0;
190} 205}
191 206
207static int create_lockd_listener(struct svc_serv *serv, char *name,
208 unsigned short port)
209{
210 struct svc_xprt *xprt;
211
212 xprt = svc_find_xprt(serv, name, 0, 0);
213 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
215
216 svc_xprt_put(xprt);
217 return 0;
218}
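
create_lockd_listener() factors out an idempotent "look up, else create" step: if a transport of that name already exists, the reference taken by svc_find_xprt() is simply dropped and nothing is created twice. The shape of that pattern, modelled in user space with a tiny refcounted registry (purely illustrative):

#include <stdio.h>
#include <string.h>

struct listener { const char *name; int refs; };

static struct listener table[4];

static struct listener *find(const char *name)
{
        for (int i = 0; i < 4; i++)
                if (table[i].name && strcmp(table[i].name, name) == 0) {
                        table[i].refs++;          /* like svc_find_xprt() */
                        return &table[i];
                }
        return NULL;
}

/* Create the listener only if it does not exist yet; either way the
 * caller ends up holding no extra reference. */
static int ensure_listener(const char *name)
{
        struct listener *l = find(name);

        if (l == NULL) {
                table[0] = (struct listener){ name, 1 };  /* "create" */
                return 0;
        }
        l->refs--;                    /* like svc_xprt_put() */
        return 0;
}

int main(void)
{
        ensure_listener("udp");
        ensure_listener("udp");       /* second call is a no-op */
        printf("udp refs: %d\n", table[0].refs);
        return 0;
}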
219
192/* 220/*
193 * Ensure there are active UDP and TCP listeners for lockd. 221 * Ensure there are active UDP and TCP listeners for lockd.
194 * 222 *
@@ -202,29 +230,23 @@ lockd(void *vrqstp)
202static int make_socks(struct svc_serv *serv) 230static int make_socks(struct svc_serv *serv)
203{ 231{
204 static int warned; 232 static int warned;
205 struct svc_xprt *xprt; 233 int err;
206 int err = 0;
207 234
208 xprt = svc_find_xprt(serv, "udp", 0, 0); 235 err = create_lockd_listener(serv, "udp", nlm_udpport);
209 if (!xprt) 236 if (err < 0)
210 err = svc_create_xprt(serv, "udp", nlm_udpport, 237 goto out_err;
211 SVC_SOCK_DEFAULTS); 238
212 else 239 err = create_lockd_listener(serv, "tcp", nlm_tcpport);
213 svc_xprt_put(xprt); 240 if (err < 0)
214 if (err >= 0) { 241 goto out_err;
215 xprt = svc_find_xprt(serv, "tcp", 0, 0); 242
216 if (!xprt) 243 warned = 0;
217 err = svc_create_xprt(serv, "tcp", nlm_tcpport, 244 return 0;
218 SVC_SOCK_DEFAULTS); 245
219 else 246out_err:
220 svc_xprt_put(xprt); 247 if (warned++ == 0)
221 }
222 if (err >= 0) {
223 warned = 0;
224 err = 0;
225 } else if (warned++ == 0)
226 printk(KERN_WARNING 248 printk(KERN_WARNING
227 "lockd_up: makesock failed, error=%d\n", err); 249 "lockd_up: makesock failed, error=%d\n", err);
228 return err; 250 return err;
229} 251}
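
The rewritten make_socks() collapses the nested error handling into one goto label and keeps the classic "warn once" ratelimit: the static counter is reset on every success, so only the first failure after a success prints. In miniature:

#include <stdio.h>

static int try_listen(int fail) { return fail ? -5 : 0; }

static int make_socks_demo(int fail)
{
        static int warned;
        int err;

        err = try_listen(fail);
        if (err < 0)
                goto out_err;
        warned = 0;                   /* success re-arms the warning */
        return 0;

out_err:
        if (warned++ == 0)            /* print only the first failure */
                fprintf(stderr, "makesock failed, error=%d\n", err);
        return err;
}

int main(void)
{
        make_socks_demo(1);   /* warns */
        make_socks_demo(1);   /* silent */
        make_socks_demo(0);   /* succeeds, re-arms */
        make_socks_demo(1);   /* warns again */
        return 0;
}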
230 252
@@ -252,7 +274,7 @@ int lockd_up(void)
252 "lockd_up: no pid, %d users??\n", nlmsvc_users); 274 "lockd_up: no pid, %d users??\n", nlmsvc_users);
253 275
254 error = -ENOMEM; 276 error = -ENOMEM;
255 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); 277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
256 if (!serv) { 278 if (!serv) {
257 printk(KERN_WARNING "lockd_up: create service failed\n"); 279 printk(KERN_WARNING "lockd_up: create service failed\n");
258 goto out; 280 goto out;
@@ -276,6 +298,7 @@ int lockd_up(void)
276 } 298 }
277 299
278 svc_sock_update_bufs(serv); 300 svc_sock_update_bufs(serv);
301 serv->sv_maxconn = nlm_max_connections;
279 302
280 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); 303 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
281 if (IS_ERR(nlmsvc_task)) { 304 if (IS_ERR(nlmsvc_task)) {
@@ -300,7 +323,7 @@ out:
300 mutex_unlock(&nlmsvc_mutex); 323 mutex_unlock(&nlmsvc_mutex);
301 return error; 324 return error;
302} 325}
303EXPORT_SYMBOL(lockd_up); 326EXPORT_SYMBOL_GPL(lockd_up);
304 327
305/* 328/*
306 * Decrement the user count and bring down lockd if we're the last. 329 * Decrement the user count and bring down lockd if we're the last.
@@ -329,7 +352,7 @@ lockd_down(void)
329out: 352out:
330 mutex_unlock(&nlmsvc_mutex); 353 mutex_unlock(&nlmsvc_mutex);
331} 354}
332EXPORT_SYMBOL(lockd_down); 355EXPORT_SYMBOL_GPL(lockd_down);
333 356
334#ifdef CONFIG_SYSCTL 357#ifdef CONFIG_SYSCTL
335 358
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
485module_param_call(nlm_tcpport, param_set_port, param_get_int, 508module_param_call(nlm_tcpport, param_set_port, param_get_int,
486 &nlm_tcpport, 0644); 509 &nlm_tcpport, 0644);
487module_param(nsm_use_hostnames, bool, 0644); 510module_param(nsm_use_hostnames, bool, 0644);
511module_param(nlm_max_connections, uint, 0644);
488 512
489/* 513/*
490 * Initialising and terminating the module. 514 * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4dfdcbc6bf68..1725037374c5 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -16,8 +16,6 @@
16#include <linux/nfsd/nfsd.h> 16#include <linux/nfsd/nfsd.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/share.h> 18#include <linux/lockd/share.h>
19#include <linux/lockd/sm_inter.h>
20
21 19
22#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
23 21
@@ -419,8 +417,6 @@ static __be32
419nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 417nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
420 void *resp) 418 void *resp)
421{ 419{
422 struct sockaddr_in saddr;
423
424 dprintk("lockd: SM_NOTIFY called\n"); 420 dprintk("lockd: SM_NOTIFY called\n");
425 421
426 if (!nlm_privileged_requester(rqstp)) { 422 if (!nlm_privileged_requester(rqstp)) {
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
430 return rpc_system_err; 426 return rpc_system_err;
431 } 427 }
432 428
433 /* Obtain the host pointer for this NFS server and try to 429 nlm_host_rebooted(argp);
434 * reclaim all locks we hold on this server.
435 */
436 memset(&saddr, 0, sizeof(saddr));
437 saddr.sin_family = AF_INET;
438 saddr.sin_addr.s_addr = argp->addr;
439 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
440
441 return rpc_success; 430 return rpc_success;
442} 431}
443 432
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3ca89e2a9381..3688e55901fc 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -16,8 +16,6 @@
16#include <linux/nfsd/nfsd.h> 16#include <linux/nfsd/nfsd.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/share.h> 18#include <linux/lockd/share.h>
19#include <linux/lockd/sm_inter.h>
20
21 19
22#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
23 21
@@ -451,8 +449,6 @@ static __be32
451nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 449nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
452 void *resp) 450 void *resp)
453{ 451{
454 struct sockaddr_in saddr;
455
456 dprintk("lockd: SM_NOTIFY called\n"); 452 dprintk("lockd: SM_NOTIFY called\n");
457 453
458 if (!nlm_privileged_requester(rqstp)) { 454 if (!nlm_privileged_requester(rqstp)) {
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
462 return rpc_system_err; 458 return rpc_system_err;
463 } 459 }
464 460
465 /* Obtain the host pointer for this NFS server and try to 461 nlm_host_rebooted(argp);
466 * reclaim all locks we hold on this server.
467 */
468 memset(&saddr, 0, sizeof(saddr));
469 saddr.sin_family = AF_INET;
470 saddr.sin_addr.s_addr = argp->addr;
471 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
472
473 return rpc_success; 462 return rpc_success;
474} 463}
475 464
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 34c2766e27c7..9e4d6aab611b 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -17,7 +17,6 @@
17#include <linux/nfsd/export.h> 17#include <linux/nfsd/export.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/share.h> 19#include <linux/lockd/share.h>
20#include <linux/lockd/sm_inter.h>
21#include <linux/module.h> 20#include <linux/module.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
23 22
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1f226290c67c..0336f2beacde 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,7 +16,6 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/sunrpc/stats.h> 17#include <linux/sunrpc/stats.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/sm_inter.h>
20 19
21#define NLMDBG_FACILITY NLMDBG_XDR 20#define NLMDBG_FACILITY NLMDBG_XDR
22 21
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
349 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 348 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
350 return 0; 349 return 0;
351 argp->state = ntohl(*p++); 350 argp->state = ntohl(*p++);
352 /* Preserve the address in network byte order */ 351 memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
353 argp->addr = *p++; 352 p += XDR_QUADLEN(SM_PRIV_SIZE);
354 return xdr_argsize_check(rqstp, p); 353 return xdr_argsize_check(rqstp, p);
355} 354}
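
The reboot decoder now copies the whole opaque cookie instead of just the first word, then advances by XDR_QUADLEN(SM_PRIV_SIZE) 32-bit words. XDR_QUADLEN rounds a byte count up to whole four-byte words; with a 16-byte cookie that is four words. A sketch of the arithmetic:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SM_PRIV_SIZE    16                       /* NSM cookie size */
#define XDR_QUADLEN(l)  (((l) + 3) >> 2)         /* bytes -> u32 words */

int main(void)
{
        uint32_t stream[8] = { 0 };              /* pretend receive buffer */
        uint8_t priv[SM_PRIV_SIZE];
        const uint32_t *p = stream;

        memcpy(priv, p, sizeof(priv));           /* whole cookie, not *p++ */
        p += XDR_QUADLEN(SM_PRIV_SIZE);          /* skip 4 words */
        printf("advanced %td words\n", p - stream);
        return 0;
}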
356 355
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 50c493a8ad8e..e1d528653192 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -17,7 +17,6 @@
17#include <linux/sunrpc/svc.h> 17#include <linux/sunrpc/svc.h>
18#include <linux/sunrpc/stats.h> 18#include <linux/sunrpc/stats.h>
19#include <linux/lockd/lockd.h> 19#include <linux/lockd/lockd.h>
20#include <linux/lockd/sm_inter.h>
21 20
22#define NLMDBG_FACILITY NLMDBG_XDR 21#define NLMDBG_FACILITY NLMDBG_XDR
23 22
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
356 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 355 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
357 return 0; 356 return 0;
358 argp->state = ntohl(*p++); 357 argp->state = ntohl(*p++);
359 /* Preserve the address in network byte order */ 358 memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
360 argp->addr = *p++; 359 p += XDR_QUADLEN(SM_PRIV_SIZE);
361 return xdr_argsize_check(rqstp, p); 360 return xdr_argsize_check(rqstp, p);
362} 361}
363 362
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index f70433816a38..d4946c4c90e2 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
280 return -EINVAL; 280 return -EINVAL;
281 281
282got_it: 282got_it:
283 pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); 283 pos = page_offset(page) + p - (char *)page_address(page);
284 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, 284 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
285 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 285 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
286 if (err) 286 if (err)
diff --git a/fs/mpage.c b/fs/mpage.c
index 552b80b3facc..16c3ef37eae3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
241 first_hole = page_block; 241 first_hole = page_block;
242 page_block++; 242 page_block++;
243 block_in_file++; 243 block_in_file++;
244 clear_buffer_mapped(map_bh);
245 continue; 244 continue;
246 } 245 }
247 246
@@ -308,7 +307,10 @@ alloc_new:
308 goto alloc_new; 307 goto alloc_new;
309 } 308 }
310 309
311 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) 310 relative_block = block_in_file - *first_logical_block;
311 nblocks = map_bh->b_size >> blkbits;
312 if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
313 (first_hole != blocks_per_page))
312 bio = mpage_bio_submit(READ, bio); 314 bio = mpage_bio_submit(READ, bio);
313 else 315 else
314 *last_block_in_bio = blocks[blocks_per_page - 1]; 316 *last_block_in_bio = blocks[blocks_per_page - 1];
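
The extra condition fixes over-eager BIO submission: a buffer_boundary() extent should force a submit only once the page has consumed the extent's last block, so the code recomputes which block of the mapping it just used (relative_block) and compares it against the extent length in blocks (nblocks). The arithmetic, roughly, with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned blkbits = 12;                 /* 4096-byte blocks, say */
        unsigned long long b_size = 4 << 12;   /* extent maps 4 blocks */
        unsigned long first_logical_block = 100;
        unsigned long block_in_file = 104;     /* one past the last block placed */
        int boundary = 1;                      /* buffer_boundary() said so */

        unsigned long relative_block = block_in_file - first_logical_block;
        unsigned long nblocks = (unsigned long)(b_size >> blkbits);

        /* Submit only at the true end of the boundary extent. */
        if (boundary && relative_block == nblocks)
                puts("submit bio");
        else
                puts("keep accumulating");
        return 0;
}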
diff --git a/fs/namei.c b/fs/namei.c
index af3783fff1de..f05bed242422 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask,
226 return -EACCES; 226 return -EACCES;
227} 227}
228 228
229/**
230 * inode_permission - check for access rights to a given inode
231 * @inode: inode to check permission on
232 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
233 *
234 * Used to check for read/write/execute permissions on an inode.
235 * We use "fsuid" for this, letting us set arbitrary permissions
236 * for filesystem access without changing the "normal" uids which
237 * are used for other things.
238 */
229int inode_permission(struct inode *inode, int mask) 239int inode_permission(struct inode *inode, int mask)
230{ 240{
231 int retval; 241 int retval;
@@ -247,8 +257,7 @@ int inode_permission(struct inode *inode, int mask)
247 return -EACCES; 257 return -EACCES;
248 } 258 }
249 259
250 /* Ordinary permission routines do not understand MAY_APPEND. */ 260 if (inode->i_op->permission)
251 if (inode->i_op && inode->i_op->permission)
252 retval = inode->i_op->permission(inode, mask); 261 retval = inode->i_op->permission(inode, mask);
253 else 262 else
254 retval = generic_permission(inode, mask, NULL); 263 retval = generic_permission(inode, mask, NULL);
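
With every inode now guaranteed a non-NULL i_op (a prerequisite this series establishes by dropping the "!inode->i_op" checks below), permission checking reduces to the common "use the override if the object provides one, else the generic default" dispatch. The pattern, with hypothetical names:

#include <stdio.h>

struct ops  { int (*permission)(int mask); };
struct node { const struct ops *ops; };

static int generic_permission(int mask) { (void)mask; return 0; }

/* ops itself is assumed non-NULL; only the method slot is optional. */
static int node_permission(const struct node *n, int mask)
{
        if (n->ops->permission)
                return n->ops->permission(mask);
        return generic_permission(mask);
}

int main(void)
{
        static const struct ops plain = { 0 };  /* no override installed */
        struct node n = { &plain };

        printf("result: %d\n", node_permission(&n, 4));
        return 0;
}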
@@ -265,21 +274,6 @@ int inode_permission(struct inode *inode, int mask)
265} 274}
266 275
267/** 276/**
268 * vfs_permission - check for access rights to a given path
269 * @nd: lookup result that describes the path
270 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
271 *
272 * Used to check for read/write/execute permissions on a path.
273 * We use "fsuid" for this, letting us set arbitrary permissions
274 * for filesystem access without changing the "normal" uids which
275 * are used for other things.
276 */
277int vfs_permission(struct nameidata *nd, int mask)
278{
279 return inode_permission(nd->path.dentry->d_inode, mask);
280}
281
282/**
283 * file_permission - check for additional access rights to a given file 277 * file_permission - check for additional access rights to a given file
284 * @file: file to check access rights for 278 * @file: file to check access rights for
285 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 279 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
@@ -289,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask)
289 * 283 *
290 * Note: 284 * Note:
291 * Do not use this function in new code. All access checks should 285 * Do not use this function in new code. All access checks should
292 * be done using vfs_permission(). 286 * be done using inode_permission().
293 */ 287 */
294int file_permission(struct file *file, int mask) 288int file_permission(struct file *file, int mask)
295{ 289{
@@ -438,7 +432,7 @@ static int exec_permission_lite(struct inode *inode)
438{ 432{
439 umode_t mode = inode->i_mode; 433 umode_t mode = inode->i_mode;
440 434
441 if (inode->i_op && inode->i_op->permission) 435 if (inode->i_op->permission)
442 return -EAGAIN; 436 return -EAGAIN;
443 437
444 if (current_fsuid() == inode->i_uid) 438 if (current_fsuid() == inode->i_uid)
@@ -527,18 +521,6 @@ out_unlock:
527 return result; 521 return result;
528} 522}
529 523
530/* SMP-safe */
531static __always_inline void
532walk_init_root(const char *name, struct nameidata *nd)
533{
534 struct fs_struct *fs = current->fs;
535
536 read_lock(&fs->lock);
537 nd->path = fs->root;
538 path_get(&fs->root);
539 read_unlock(&fs->lock);
540}
541
542/* 524/*
543 * Wrapper to retry pathname resolution whenever the underlying 525 * Wrapper to retry pathname resolution whenever the underlying
544 * file system returns an ESTALE. 526 * file system returns an ESTALE.
@@ -576,9 +558,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
576 goto fail; 558 goto fail;
577 559
578 if (*link == '/') { 560 if (*link == '/') {
561 struct fs_struct *fs = current->fs;
562
579 path_put(&nd->path); 563 path_put(&nd->path);
580 walk_init_root(link, nd); 564
565 read_lock(&fs->lock);
566 nd->path = fs->root;
567 path_get(&fs->root);
568 read_unlock(&fs->lock);
581 } 569 }
570
582 res = link_path_walk(link, nd); 571 res = link_path_walk(link, nd);
583 if (nd->depth || res || nd->last_type!=LAST_NORM) 572 if (nd->depth || res || nd->last_type!=LAST_NORM)
584 return res; 573 return res;
@@ -859,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
859 nd->flags |= LOOKUP_CONTINUE; 848 nd->flags |= LOOKUP_CONTINUE;
860 err = exec_permission_lite(inode); 849 err = exec_permission_lite(inode);
861 if (err == -EAGAIN) 850 if (err == -EAGAIN)
862 err = vfs_permission(nd, MAY_EXEC); 851 err = inode_permission(nd->path.dentry->d_inode,
852 MAY_EXEC);
863 if (err) 853 if (err)
864 break; 854 break;
865 855
@@ -918,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
918 inode = next.dentry->d_inode; 908 inode = next.dentry->d_inode;
919 if (!inode) 909 if (!inode)
920 goto out_dput; 910 goto out_dput;
921 err = -ENOTDIR;
922 if (!inode->i_op)
923 goto out_dput;
924 911
925 if (inode->i_op->follow_link) { 912 if (inode->i_op->follow_link) {
926 err = do_follow_link(&next, nd); 913 err = do_follow_link(&next, nd);
@@ -930,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
930 inode = nd->path.dentry->d_inode; 917 inode = nd->path.dentry->d_inode;
931 if (!inode) 918 if (!inode)
932 break; 919 break;
933 err = -ENOTDIR;
934 if (!inode->i_op)
935 break;
936 } else 920 } else
937 path_to_nameidata(&next, nd); 921 path_to_nameidata(&next, nd);
938 err = -ENOTDIR; 922 err = -ENOTDIR;
@@ -971,7 +955,7 @@ last_component:
971 break; 955 break;
972 inode = next.dentry->d_inode; 956 inode = next.dentry->d_inode;
973 if ((lookup_flags & LOOKUP_FOLLOW) 957 if ((lookup_flags & LOOKUP_FOLLOW)
974 && inode && inode->i_op && inode->i_op->follow_link) { 958 && inode && inode->i_op->follow_link) {
975 err = do_follow_link(&next, nd); 959 err = do_follow_link(&next, nd);
976 if (err) 960 if (err)
977 goto return_err; 961 goto return_err;
@@ -983,7 +967,7 @@ last_component:
983 break; 967 break;
984 if (lookup_flags & LOOKUP_DIRECTORY) { 968 if (lookup_flags & LOOKUP_DIRECTORY) {
985 err = -ENOTDIR; 969 err = -ENOTDIR;
986 if (!inode->i_op || !inode->i_op->lookup) 970 if (!inode->i_op->lookup)
987 break; 971 break;
988 } 972 }
989 goto return_base; 973 goto return_base;
@@ -1479,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1479 if (error) 1463 if (error)
1480 return error; 1464 return error;
1481 1465
1482 if (!dir->i_op || !dir->i_op->create) 1466 if (!dir->i_op->create)
1483 return -EACCES; /* shouldn't it be ENOSYS? */ 1467 return -EACCES; /* shouldn't it be ENOSYS? */
1484 mode &= S_IALLUGO; 1468 mode &= S_IALLUGO;
1485 mode |= S_IFREG; 1469 mode |= S_IFREG;
@@ -1493,9 +1477,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1493 return error; 1477 return error;
1494} 1478}
1495 1479
1496int may_open(struct nameidata *nd, int acc_mode, int flag) 1480int may_open(struct path *path, int acc_mode, int flag)
1497{ 1481{
1498 struct dentry *dentry = nd->path.dentry; 1482 struct dentry *dentry = path->dentry;
1499 struct inode *inode = dentry->d_inode; 1483 struct inode *inode = dentry->d_inode;
1500 int error; 1484 int error;
1501 1485
@@ -1516,13 +1500,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1516 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 1500 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1517 flag &= ~O_TRUNC; 1501 flag &= ~O_TRUNC;
1518 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { 1502 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1519 if (nd->path.mnt->mnt_flags & MNT_NODEV) 1503 if (path->mnt->mnt_flags & MNT_NODEV)
1520 return -EACCES; 1504 return -EACCES;
1521 1505
1522 flag &= ~O_TRUNC; 1506 flag &= ~O_TRUNC;
1523 } 1507 }
1524 1508
1525 error = vfs_permission(nd, acc_mode); 1509 error = inode_permission(inode, acc_mode);
1526 if (error) 1510 if (error)
1527 return error; 1511 return error;
1528 /* 1512 /*
@@ -1556,6 +1540,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1556 * Refuse to truncate files with mandatory locks held on them. 1540 * Refuse to truncate files with mandatory locks held on them.
1557 */ 1541 */
1558 error = locks_verify_locked(inode); 1542 error = locks_verify_locked(inode);
1543 if (!error)
1544 error = security_path_truncate(path, 0,
1545 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1559 if (!error) { 1546 if (!error) {
1560 DQUOT_INIT(inode); 1547 DQUOT_INIT(inode);
1561 1548
@@ -1586,14 +1573,18 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1586 1573
1587 if (!IS_POSIXACL(dir->d_inode)) 1574 if (!IS_POSIXACL(dir->d_inode))
1588 mode &= ~current->fs->umask; 1575 mode &= ~current->fs->umask;
1576 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1577 if (error)
1578 goto out_unlock;
1589 error = vfs_create(dir->d_inode, path->dentry, mode, nd); 1579 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1580out_unlock:
1590 mutex_unlock(&dir->d_inode->i_mutex); 1581 mutex_unlock(&dir->d_inode->i_mutex);
1591 dput(nd->path.dentry); 1582 dput(nd->path.dentry);
1592 nd->path.dentry = path->dentry; 1583 nd->path.dentry = path->dentry;
1593 if (error) 1584 if (error)
1594 return error; 1585 return error;
1595 /* Don't check for write permission, don't truncate */ 1586 /* Don't check for write permission, don't truncate */
1596 return may_open(nd, 0, flag & ~O_TRUNC); 1587 return may_open(&nd->path, 0, flag & ~O_TRUNC);
1597} 1588}
1598 1589
1599/* 1590/*
@@ -1755,7 +1746,7 @@ do_last:
1755 error = -ENOENT; 1746 error = -ENOENT;
1756 if (!path.dentry->d_inode) 1747 if (!path.dentry->d_inode)
1757 goto exit_dput; 1748 goto exit_dput;
1758 if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) 1749 if (path.dentry->d_inode->i_op->follow_link)
1759 goto do_link; 1750 goto do_link;
1760 1751
1761 path_to_nameidata(&path, &nd); 1752 path_to_nameidata(&path, &nd);
@@ -1779,7 +1770,7 @@ ok:
1779 if (error) 1770 if (error)
1780 goto exit; 1771 goto exit;
1781 } 1772 }
1782 error = may_open(&nd, acc_mode, flag); 1773 error = may_open(&nd.path, acc_mode, flag);
1783 if (error) { 1774 if (error) {
1784 if (will_write) 1775 if (will_write)
1785 mnt_drop_write(nd.path.mnt); 1776 mnt_drop_write(nd.path.mnt);
@@ -1936,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1936 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 1927 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1937 return -EPERM; 1928 return -EPERM;
1938 1929
1939 if (!dir->i_op || !dir->i_op->mknod) 1930 if (!dir->i_op->mknod)
1940 return -EPERM; 1931 return -EPERM;
1941 1932
1942 error = devcgroup_inode_mknod(mode, dev); 1933 error = devcgroup_inode_mknod(mode, dev);
@@ -1999,6 +1990,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1999 error = mnt_want_write(nd.path.mnt); 1990 error = mnt_want_write(nd.path.mnt);
2000 if (error) 1991 if (error)
2001 goto out_dput; 1992 goto out_dput;
1993 error = security_path_mknod(&nd.path, dentry, mode, dev);
1994 if (error)
1995 goto out_drop_write;
2002 switch (mode & S_IFMT) { 1996 switch (mode & S_IFMT) {
2003 case 0: case S_IFREG: 1997 case 0: case S_IFREG:
2004 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); 1998 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2011,6 +2005,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
2011 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); 2005 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
2012 break; 2006 break;
2013 } 2007 }
2008out_drop_write:
2014 mnt_drop_write(nd.path.mnt); 2009 mnt_drop_write(nd.path.mnt);
2015out_dput: 2010out_dput:
2016 dput(dentry); 2011 dput(dentry);
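
This sys_mknodat() hunk shows the shape that every one of the following namei hunks repeats (mkdir, rmdir, unlink, symlink, link, rename): take the write reference, ask the LSM path hook, only then call the VFS operation, and unwind through labels in reverse order. A skeleton of that ordering, with stubbed-out helpers:

#include <stdio.h>

static int  mnt_want_write(void)     { return 0; }
static void mnt_drop_write(void)     { }
static int  security_path_hook(void) { return 0; }  /* LSM veto point */
static int  vfs_operation(void)      { return 0; }

static int do_namei_op(void)
{
        int error = mnt_want_write();

        if (error)
                goto out;
        error = security_path_hook();   /* may deny before any change */
        if (error)
                goto out_drop_write;
        error = vfs_operation();        /* the actual filesystem change */
out_drop_write:
        mnt_drop_write();               /* always paired with _want_ */
out:
        return error;
}

int main(void) { return do_namei_op(); }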
@@ -2034,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2034 if (error) 2029 if (error)
2035 return error; 2030 return error;
2036 2031
2037 if (!dir->i_op || !dir->i_op->mkdir) 2032 if (!dir->i_op->mkdir)
2038 return -EPERM; 2033 return -EPERM;
2039 2034
2040 mode &= (S_IRWXUGO|S_ISVTX); 2035 mode &= (S_IRWXUGO|S_ISVTX);
@@ -2070,7 +2065,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
2070 error = mnt_want_write(nd.path.mnt); 2065 error = mnt_want_write(nd.path.mnt);
2071 if (error) 2066 if (error)
2072 goto out_dput; 2067 goto out_dput;
2068 error = security_path_mkdir(&nd.path, dentry, mode);
2069 if (error)
2070 goto out_drop_write;
2073 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); 2071 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2072out_drop_write:
2074 mnt_drop_write(nd.path.mnt); 2073 mnt_drop_write(nd.path.mnt);
2075out_dput: 2074out_dput:
2076 dput(dentry); 2075 dput(dentry);
@@ -2121,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2121 if (error) 2120 if (error)
2122 return error; 2121 return error;
2123 2122
2124 if (!dir->i_op || !dir->i_op->rmdir) 2123 if (!dir->i_op->rmdir)
2125 return -EPERM; 2124 return -EPERM;
2126 2125
2127 DQUOT_INIT(dir); 2126 DQUOT_INIT(dir);
@@ -2180,7 +2179,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
2180 error = mnt_want_write(nd.path.mnt); 2179 error = mnt_want_write(nd.path.mnt);
2181 if (error) 2180 if (error)
2182 goto exit3; 2181 goto exit3;
2182 error = security_path_rmdir(&nd.path, dentry);
2183 if (error)
2184 goto exit4;
2183 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 2185 error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2186exit4:
2184 mnt_drop_write(nd.path.mnt); 2187 mnt_drop_write(nd.path.mnt);
2185exit3: 2188exit3:
2186 dput(dentry); 2189 dput(dentry);
@@ -2204,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2204 if (error) 2207 if (error)
2205 return error; 2208 return error;
2206 2209
2207 if (!dir->i_op || !dir->i_op->unlink) 2210 if (!dir->i_op->unlink)
2208 return -EPERM; 2211 return -EPERM;
2209 2212
2210 DQUOT_INIT(dir); 2213 DQUOT_INIT(dir);
@@ -2265,7 +2268,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2265 error = mnt_want_write(nd.path.mnt); 2268 error = mnt_want_write(nd.path.mnt);
2266 if (error) 2269 if (error)
2267 goto exit2; 2270 goto exit2;
2271 error = security_path_unlink(&nd.path, dentry);
2272 if (error)
2273 goto exit3;
2268 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 2274 error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2275exit3:
2269 mnt_drop_write(nd.path.mnt); 2276 mnt_drop_write(nd.path.mnt);
2270 exit2: 2277 exit2:
2271 dput(dentry); 2278 dput(dentry);
@@ -2307,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2307 if (error) 2314 if (error)
2308 return error; 2315 return error;
2309 2316
2310 if (!dir->i_op || !dir->i_op->symlink) 2317 if (!dir->i_op->symlink)
2311 return -EPERM; 2318 return -EPERM;
2312 2319
2313 error = security_inode_symlink(dir, dentry, oldname); 2320 error = security_inode_symlink(dir, dentry, oldname);
@@ -2346,7 +2353,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
2346 error = mnt_want_write(nd.path.mnt); 2353 error = mnt_want_write(nd.path.mnt);
2347 if (error) 2354 if (error)
2348 goto out_dput; 2355 goto out_dput;
2356 error = security_path_symlink(&nd.path, dentry, from);
2357 if (error)
2358 goto out_drop_write;
2349 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); 2359 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
2360out_drop_write:
2350 mnt_drop_write(nd.path.mnt); 2361 mnt_drop_write(nd.path.mnt);
2351out_dput: 2362out_dput:
2352 dput(dentry); 2363 dput(dentry);
@@ -2384,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2384 */ 2395 */
2385 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2396 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2386 return -EPERM; 2397 return -EPERM;
2387 if (!dir->i_op || !dir->i_op->link) 2398 if (!dir->i_op->link)
2388 return -EPERM; 2399 return -EPERM;
2389 if (S_ISDIR(inode->i_mode)) 2400 if (S_ISDIR(inode->i_mode))
2390 return -EPERM; 2401 return -EPERM;
@@ -2443,7 +2454,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2443 error = mnt_want_write(nd.path.mnt); 2454 error = mnt_want_write(nd.path.mnt);
2444 if (error) 2455 if (error)
2445 goto out_dput; 2456 goto out_dput;
2457 error = security_path_link(old_path.dentry, &nd.path, new_dentry);
2458 if (error)
2459 goto out_drop_write;
2446 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); 2460 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
2461out_drop_write:
2447 mnt_drop_write(nd.path.mnt); 2462 mnt_drop_write(nd.path.mnt);
2448out_dput: 2463out_dput:
2449 dput(new_dentry); 2464 dput(new_dentry);
@@ -2587,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2587 if (error) 2602 if (error)
2588 return error; 2603 return error;
2589 2604
2590 if (!old_dir->i_op || !old_dir->i_op->rename) 2605 if (!old_dir->i_op->rename)
2591 return -EPERM; 2606 return -EPERM;
2592 2607
2593 DQUOT_INIT(old_dir); 2608 DQUOT_INIT(old_dir);
@@ -2679,8 +2694,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
2679 error = mnt_want_write(oldnd.path.mnt); 2694 error = mnt_want_write(oldnd.path.mnt);
2680 if (error) 2695 if (error)
2681 goto exit5; 2696 goto exit5;
2697 error = security_path_rename(&oldnd.path, old_dentry,
2698 &newnd.path, new_dentry);
2699 if (error)
2700 goto exit6;
2682 error = vfs_rename(old_dir->d_inode, old_dentry, 2701 error = vfs_rename(old_dir->d_inode, old_dentry,
2683 new_dir->d_inode, new_dentry); 2702 new_dir->d_inode, new_dentry);
2703exit6:
2684 mnt_drop_write(oldnd.path.mnt); 2704 mnt_drop_write(oldnd.path.mnt);
2685exit5: 2705exit5:
2686 dput(new_dentry); 2706 dput(new_dentry);
@@ -2750,13 +2770,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link)
2750/* get the link contents into pagecache */ 2770/* get the link contents into pagecache */
2751static char *page_getlink(struct dentry * dentry, struct page **ppage) 2771static char *page_getlink(struct dentry * dentry, struct page **ppage)
2752{ 2772{
2753 struct page * page; 2773 char *kaddr;
2774 struct page *page;
2754 struct address_space *mapping = dentry->d_inode->i_mapping; 2775 struct address_space *mapping = dentry->d_inode->i_mapping;
2755 page = read_mapping_page(mapping, 0, NULL); 2776 page = read_mapping_page(mapping, 0, NULL);
2756 if (IS_ERR(page)) 2777 if (IS_ERR(page))
2757 return (char*)page; 2778 return (char*)page;
2758 *ppage = page; 2779 *ppage = page;
2759 return kmap(page); 2780 kaddr = kmap(page);
2781 nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
2782 return kaddr;
2760} 2783}
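
page_getlink() now NUL-terminates the pagecache copy of the symlink body before handing it to string-based callers; nd_terminate_link() writes the terminator at min(link length, buffer max), so a corrupted on-disk length can never send strlen() off the end of the page. Its effect, in plain C:

#include <stdio.h>
#include <string.h>

/* Terminate at whichever comes first: the recorded link length or
 * the last byte of the buffer (PAGE_SIZE - 1 in the kernel). */
static void terminate_link(char *buf, size_t len, size_t maxlen)
{
        buf[len < maxlen ? len : maxlen] = '\0';
}

int main(void)
{
        char page[16];

        memset(page, 'A', sizeof(page));            /* no terminator "on disk" */
        terminate_link(page, 64, sizeof(page) - 1); /* bogus length: clamped */
        printf("safe strlen: %zu\n", strlen(page));
        return 0;
}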
2761 2784
2762int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 2785int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
@@ -2788,18 +2811,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2788 } 2811 }
2789} 2812}
2790 2813
2791int __page_symlink(struct inode *inode, const char *symname, int len, 2814/*
2792 gfp_t gfp_mask) 2815 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
2816 */
2817int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
2793{ 2818{
2794 struct address_space *mapping = inode->i_mapping; 2819 struct address_space *mapping = inode->i_mapping;
2795 struct page *page; 2820 struct page *page;
2796 void *fsdata; 2821 void *fsdata;
2797 int err; 2822 int err;
2798 char *kaddr; 2823 char *kaddr;
2824 unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
2825 if (nofs)
2826 flags |= AOP_FLAG_NOFS;
2799 2827
2800retry: 2828retry:
2801 err = pagecache_write_begin(NULL, mapping, 0, len-1, 2829 err = pagecache_write_begin(NULL, mapping, 0, len-1,
2802 AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 2830 flags, &page, &fsdata);
2803 if (err) 2831 if (err)
2804 goto fail; 2832 goto fail;
2805 2833
@@ -2823,7 +2851,7 @@ fail:
2823int page_symlink(struct inode *inode, const char *symname, int len) 2851int page_symlink(struct inode *inode, const char *symname, int len)
2824{ 2852{
2825 return __page_symlink(inode, symname, len, 2853 return __page_symlink(inode, symname, len,
2826 mapping_gfp_mask(inode->i_mapping)); 2854 !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
2827} 2855}
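
page_symlink() derives the new nofs argument from the mapping's allocation mask: if __GFP_FS is already masked out (a filesystem avoiding re-entry into itself during reclaim), pagecache_write_begin() must be told to allocate with AOP_FLAG_NOFS. The derivation is just a bit test; the flag values below are illustrative, not the kernel's:

#include <stdio.h>

#define GFP_FS         0x80u   /* illustrative bit, not the kernel's value */
#define AOP_FLAG_NOFS  0x2u

static unsigned int symlink_flags(unsigned int gfp_mask)
{
        unsigned int flags = 0;

        if (!(gfp_mask & GFP_FS))   /* FS re-entry forbidden -> NOFS I/O */
                flags |= AOP_FLAG_NOFS;
        return flags;
}

int main(void)
{
        printf("%#x %#x\n", symlink_flags(GFP_FS), symlink_flags(0));
        return 0;
}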
2828 2856
2829const struct inode_operations page_symlink_inode_operations = { 2857const struct inode_operations page_symlink_inode_operations = {
@@ -2849,7 +2877,6 @@ EXPORT_SYMBOL(path_lookup);
2849EXPORT_SYMBOL(kern_path); 2877EXPORT_SYMBOL(kern_path);
2850EXPORT_SYMBOL(vfs_path_lookup); 2878EXPORT_SYMBOL(vfs_path_lookup);
2851EXPORT_SYMBOL(inode_permission); 2879EXPORT_SYMBOL(inode_permission);
2852EXPORT_SYMBOL(vfs_permission);
2853EXPORT_SYMBOL(file_permission); 2880EXPORT_SYMBOL(file_permission);
2854EXPORT_SYMBOL(unlock_rename); 2881EXPORT_SYMBOL(unlock_rename);
2855EXPORT_SYMBOL(vfs_create); 2882EXPORT_SYMBOL(vfs_create);
@@ -2865,3 +2892,10 @@ EXPORT_SYMBOL(vfs_symlink);
2865EXPORT_SYMBOL(vfs_unlink); 2892EXPORT_SYMBOL(vfs_unlink);
2866EXPORT_SYMBOL(dentry_unhash); 2893EXPORT_SYMBOL(dentry_unhash);
2867EXPORT_SYMBOL(generic_readlink); 2894EXPORT_SYMBOL(generic_readlink);
2895
2896/* to be mentioned only in INIT_TASK */
2897struct fs_struct init_fs = {
2898 .count = ATOMIC_INIT(1),
2899 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2900 .umask = 0022,
2901};
diff --git a/fs/namespace.c b/fs/namespace.c
index 1c09cab8f7cf..a40685d800a8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1990 if (!new_ns->root) { 1990 if (!new_ns->root) {
1991 up_write(&namespace_sem); 1991 up_write(&namespace_sem);
1992 kfree(new_ns); 1992 kfree(new_ns);
1993 return ERR_PTR(-ENOMEM);; 1993 return ERR_PTR(-ENOMEM);
1994 } 1994 }
1995 spin_lock(&vfsmount_lock); 1995 spin_lock(&vfsmount_lock);
1996 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 1996 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 335b003dddf9..0af3349de851 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -16,7 +16,6 @@
16 * @opts: an array of &struct option entries controlling parser operations 16 * @opts: an array of &struct option entries controlling parser operations
17 * @optopt: output; will contain the current option 17 * @optopt: output; will contain the current option
18 * @optarg: output; will contain the value (if one exists) 18 * @optarg: output; will contain the value (if one exists)
19 * @flag: output; may be NULL; should point to a long for or'ing flags
20 * @value: output; may be NULL; will be overwritten with the integer value 19 * @value: output; may be NULL; will be overwritten with the integer value
21 * of the current argument. 20 * of the current argument.
22 * 21 *
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6d04e050c74e..f54360f50a9c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl
98{ 98{
99 s32 auth_type; 99 s32 auth_type;
100 u32 object_name_len; 100 u32 object_name_len;
101 compat_caddr_t object_name; /* an userspace data, in most cases user name */ 101 compat_caddr_t object_name; /* a userspace data, in most cases user name */
102}; 102};
103 103
104struct compat_ncp_fs_info_v2 { 104struct compat_ncp_fs_info_v2 {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c2e9cfd9e5a4..3e634f2a1083 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,6 +16,7 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h>
19 20
20#include <net/inet_sock.h> 21#include <net/inet_sock.h>
21 22
@@ -182,10 +183,34 @@ void nfs_callback_down(void)
182 mutex_unlock(&nfs_callback_mutex); 183 mutex_unlock(&nfs_callback_mutex);
183} 184}
184 185
186static int check_gss_callback_principal(struct nfs_client *clp,
187 struct svc_rqst *rqstp)
188{
189 struct rpc_clnt *r = clp->cl_rpcclient;
190 char *p = svc_gss_principal(rqstp);
191
192 /*
193 * It might just be a normal user principal, in which case
194 * userspace won't bother to tell us the name at all.
195 */
196 if (p == NULL)
197 return SVC_DENIED;
198
199 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
200
201 if (memcmp(p, "nfs@", 4) != 0)
202 return SVC_DENIED;
203 p += 4;
204 if (strcmp(p, r->cl_server) != 0)
205 return SVC_DENIED;
206 return SVC_OK;
207}
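
check_gss_callback_principal() accepts a callback only when the GSS principal is the host-based name "nfs@<server>" for the very server this client mounted. The string test itself is simple; a user-space version with the same two comparisons (strncmp stands in for the kernel's memcmp, and the host names are made up):

#include <stdio.h>
#include <string.h>

enum { SVC_OK = 0, SVC_DENIED = 1 };

static int check_principal(const char *principal, const char *cl_server)
{
        if (principal == NULL)             /* plain user principal: no name */
                return SVC_DENIED;
        if (strncmp(principal, "nfs@", 4) != 0)  /* host-based service name */
                return SVC_DENIED;
        if (strcmp(principal + 4, cl_server) != 0)  /* must be our server */
                return SVC_DENIED;
        return SVC_OK;
}

int main(void)
{
        printf("%d %d\n",
               check_principal("nfs@server.example.com", "server.example.com"),
               check_principal("nfs@evil.example.com", "server.example.com"));
        return 0;
}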
208
185static int nfs_callback_authenticate(struct svc_rqst *rqstp) 209static int nfs_callback_authenticate(struct svc_rqst *rqstp)
186{ 210{
187 struct nfs_client *clp; 211 struct nfs_client *clp;
188 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 212 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
213 int ret = SVC_OK;
189 214
190 /* Don't talk to strangers */ 215 /* Don't talk to strangers */
191 clp = nfs_find_client(svc_addr(rqstp), 4); 216 clp = nfs_find_client(svc_addr(rqstp), 4);
@@ -194,21 +219,22 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
194 219
195 dprintk("%s: %s NFSv4 callback!\n", __func__, 220 dprintk("%s: %s NFSv4 callback!\n", __func__,
196 svc_print_addr(rqstp, buf, sizeof(buf))); 221 svc_print_addr(rqstp, buf, sizeof(buf)));
197 nfs_put_client(clp);
198 222
199 switch (rqstp->rq_authop->flavour) { 223 switch (rqstp->rq_authop->flavour) {
200 case RPC_AUTH_NULL: 224 case RPC_AUTH_NULL:
201 if (rqstp->rq_proc != CB_NULL) 225 if (rqstp->rq_proc != CB_NULL)
202 return SVC_DENIED; 226 ret = SVC_DENIED;
203 break; 227 break;
204 case RPC_AUTH_UNIX: 228 case RPC_AUTH_UNIX:
205 break; 229 break;
206 case RPC_AUTH_GSS: 230 case RPC_AUTH_GSS:
207 /* FIXME: RPCSEC_GSS handling? */ 231 ret = check_gss_callback_principal(clp, rqstp);
232 break;
208 default: 233 default:
209 return SVC_DENIED; 234 ret = SVC_DENIED;
210 } 235 }
211 return SVC_OK; 236 nfs_put_client(clp);
237 return ret;
212} 238}
213 239
214/* 240/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 7547600b6174..9b728f3565a1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -143,7 +143,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
143 clp->cl_proto = cl_init->proto; 143 clp->cl_proto = cl_init->proto;
144 144
145#ifdef CONFIG_NFS_V4 145#ifdef CONFIG_NFS_V4
146 init_rwsem(&clp->cl_sem);
147 INIT_LIST_HEAD(&clp->cl_delegations); 146 INIT_LIST_HEAD(&clp->cl_delegations);
148 spin_lock_init(&clp->cl_lock); 147 spin_lock_init(&clp->cl_lock);
149 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 148 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
@@ -224,31 +223,54 @@ void nfs_put_client(struct nfs_client *clp)
224 } 223 }
225} 224}
226 225
227static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, 226#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
228 const struct sockaddr_in *sa2) 227static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
229{ 228{
230 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; 229 switch (sa->sa_family) {
230 default:
231 return NULL;
232 case AF_INET6:
233 return &((const struct sockaddr_in6 *)sa)->sin6_addr;
234 break;
235 case AF_INET:
236 ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
237 addr_mapped);
238 return addr_mapped;
239 }
231} 240}
232 241
233static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1, 242static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
234 const struct sockaddr_in6 *sa2) 243 const struct sockaddr *sa2)
244{
245 const struct in6_addr *addr1;
246 const struct in6_addr *addr2;
247 struct in6_addr addr1_mapped;
248 struct in6_addr addr2_mapped;
249
250 addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
251 if (likely(addr1 != NULL)) {
252 addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
253 if (likely(addr2 != NULL))
254 return ipv6_addr_equal(addr1, addr2);
255 }
256 return 0;
257}
258#else
259static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
260 const struct sockaddr_in *sa2)
235{ 261{
236 return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr); 262 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
237} 263}
238 264
239static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, 265static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
240 const struct sockaddr *sa2) 266 const struct sockaddr *sa2)
241{ 267{
242 switch (sa1->sa_family) { 268 if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
243 case AF_INET: 269 return 0;
244 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, 270 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
245 (const struct sockaddr_in *)sa2); 271 (const struct sockaddr_in *)sa2);
246 case AF_INET6:
247 return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
248 (const struct sockaddr_in6 *)sa2);
249 }
250 BUG();
251} 272}
273#endif
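
The IPv6 build of nfs_sockaddr_match_ipaddr() canonicalizes both addresses into IPv6 form first: a real AF_INET6 address passes through, while an AF_INET address is rewritten as the v4-mapped ::ffff:a.b.c.d equivalent, after which one ipv6_addr_equal() covers every mixed-family pairing. A user-space rendering with POSIX types:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

/* Map an IPv4 address into ::ffff:a.b.c.d form. */
static void map_v4(struct in_addr v4, struct in6_addr *out)
{
        memset(out, 0, sizeof(*out));
        out->s6_addr[10] = 0xff;
        out->s6_addr[11] = 0xff;
        memcpy(&out->s6_addr[12], &v4.s_addr, 4);
}

int main(void)
{
        struct in_addr v4;
        struct in6_addr a, b;

        inet_pton(AF_INET, "192.0.2.7", &v4);
        map_v4(v4, &a);                               /* AF_INET, canonicalized */
        inet_pton(AF_INET6, "::ffff:192.0.2.7", &b);  /* AF_INET6 peer */

        /* One comparison now works across families. */
        printf("match: %d\n", memcmp(&a, &b, sizeof(a)) == 0);
        return 0;
}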
252 274
253/* 275/*
254 * Find a client by IP address and protocol version 276 * Find a client by IP address and protocol version
@@ -270,8 +292,6 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
270 if (clp->rpc_ops->version != nfsversion) 292 if (clp->rpc_ops->version != nfsversion)
271 continue; 293 continue;
272 294
273 if (addr->sa_family != clap->sa_family)
274 continue;
275 /* Match only the IP address, not the port number */ 295 /* Match only the IP address, not the port number */
276 if (!nfs_sockaddr_match_ipaddr(addr, clap)) 296 if (!nfs_sockaddr_match_ipaddr(addr, clap))
277 continue; 297 continue;
@@ -305,8 +325,6 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
305 if (clp->rpc_ops->version != nfsvers) 325 if (clp->rpc_ops->version != nfsvers)
306 continue; 326 continue;
307 327
308 if (sap->sa_family != clap->sa_family)
309 continue;
310 /* Match only the IP address, not the port number */ 328 /* Match only the IP address, not the port number */
311 if (!nfs_sockaddr_match_ipaddr(sap, clap)) 329 if (!nfs_sockaddr_match_ipaddr(sap, clap))
312 continue; 330 continue;
@@ -470,7 +488,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
470static int nfs_create_rpc_client(struct nfs_client *clp, 488static int nfs_create_rpc_client(struct nfs_client *clp,
471 const struct rpc_timeout *timeparms, 489 const struct rpc_timeout *timeparms,
472 rpc_authflavor_t flavor, 490 rpc_authflavor_t flavor,
473 int flags) 491 int discrtry, int noresvport)
474{ 492{
475 struct rpc_clnt *clnt = NULL; 493 struct rpc_clnt *clnt = NULL;
476 struct rpc_create_args args = { 494 struct rpc_create_args args = {
@@ -482,9 +500,13 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
482 .program = &nfs_program, 500 .program = &nfs_program,
483 .version = clp->rpc_ops->version, 501 .version = clp->rpc_ops->version,
484 .authflavor = flavor, 502 .authflavor = flavor,
485 .flags = flags,
486 }; 503 };
487 504
505 if (discrtry)
506 args.flags |= RPC_CLNT_CREATE_DISCRTRY;
507 if (noresvport)
508 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
509
488 if (!IS_ERR(clp->cl_rpcclient)) 510 if (!IS_ERR(clp->cl_rpcclient))
489 return 0; 511 return 0;
490 512
@@ -522,6 +544,8 @@ static int nfs_start_lockd(struct nfs_server *server)
522 .protocol = server->flags & NFS_MOUNT_TCP ? 544 .protocol = server->flags & NFS_MOUNT_TCP ?
523 IPPROTO_TCP : IPPROTO_UDP, 545 IPPROTO_TCP : IPPROTO_UDP,
524 .nfs_version = clp->rpc_ops->version, 546 .nfs_version = clp->rpc_ops->version,
547 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
548 1 : 0,
525 }; 549 };
526 550
527 if (nlm_init.nfs_version > 3) 551 if (nlm_init.nfs_version > 3)
@@ -623,7 +647,8 @@ static int nfs_init_client(struct nfs_client *clp,
623 * Create a client RPC handle for doing FSSTAT with UNIX auth only 647 * Create a client RPC handle for doing FSSTAT with UNIX auth only
624 * - RFC 2623, sec 2.3.2 648 * - RFC 2623, sec 2.3.2
625 */ 649 */
626 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0); 650 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
651 0, data->flags & NFS_MOUNT_NORESVPORT);
627 if (error < 0) 652 if (error < 0)
628 goto error; 653 goto error;
629 nfs_mark_client_ready(clp, NFS_CS_READY); 654 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -965,7 +990,8 @@ error:
965static int nfs4_init_client(struct nfs_client *clp, 990static int nfs4_init_client(struct nfs_client *clp,
966 const struct rpc_timeout *timeparms, 991 const struct rpc_timeout *timeparms,
967 const char *ip_addr, 992 const char *ip_addr,
968 rpc_authflavor_t authflavour) 993 rpc_authflavor_t authflavour,
994 int flags)
969{ 995{
970 int error; 996 int error;
971 997
@@ -979,7 +1005,7 @@ static int nfs4_init_client(struct nfs_client *clp,
979 clp->rpc_ops = &nfs_v4_clientops; 1005 clp->rpc_ops = &nfs_v4_clientops;
980 1006
981 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1007 error = nfs_create_rpc_client(clp, timeparms, authflavour,
982 RPC_CLNT_CREATE_DISCRTRY); 1008 1, flags & NFS_MOUNT_NORESVPORT);
983 if (error < 0) 1009 if (error < 0)
984 goto error; 1010 goto error;
985 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1011 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1030,7 +1056,8 @@ static int nfs4_set_client(struct nfs_server *server,
1030 error = PTR_ERR(clp); 1056 error = PTR_ERR(clp);
1031 goto error; 1057 goto error;
1032 } 1058 }
1033 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour); 1059 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
1060 server->flags);
1034 if (error < 0) 1061 if (error < 0)
1035 goto error_put; 1062 goto error_put;
1036 1063
@@ -1059,6 +1086,10 @@ static int nfs4_init_server(struct nfs_server *server,
1059 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, 1086 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
1060 data->timeo, data->retrans); 1087 data->timeo, data->retrans);
1061 1088
1089 /* Initialise the client representation from the mount data */
1090 server->flags = data->flags;
1091 server->caps |= NFS_CAP_ATOMIC_OPEN;
1092
1062 /* Get a client record */ 1093 /* Get a client record */
1063 error = nfs4_set_client(server, 1094 error = nfs4_set_client(server,
1064 data->nfs_server.hostname, 1095 data->nfs_server.hostname,
@@ -1071,10 +1102,6 @@ static int nfs4_init_server(struct nfs_server *server,
1071 if (error < 0) 1102 if (error < 0)
1072 goto error; 1103 goto error;
1073 1104
1074 /* Initialise the client representation from the mount data */
1075 server->flags = data->flags;
1076 server->caps |= NFS_CAP_ATOMIC_OPEN;
1077
1078 if (data->rsize) 1105 if (data->rsize)
1079 server->rsize = nfs_block_size(data->rsize, NULL); 1106 server->rsize = nfs_block_size(data->rsize, NULL);
1080 if (data->wsize) 1107 if (data->wsize)
@@ -1177,6 +1204,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1177 parent_server = NFS_SB(data->sb); 1204 parent_server = NFS_SB(data->sb);
1178 parent_client = parent_server->nfs_client; 1205 parent_client = parent_server->nfs_client;
1179 1206
1207 /* Initialise the client representation from the parent server */
1208 nfs_server_copy_userdata(server, parent_server);
1209 server->caps |= NFS_CAP_ATOMIC_OPEN;
1210
1180 /* Get a client representation. 1211 /* Get a client representation.
1181 * Note: NFSv4 always uses TCP, */ 1212 * Note: NFSv4 always uses TCP, */
1182 error = nfs4_set_client(server, data->hostname, 1213 error = nfs4_set_client(server, data->hostname,
@@ -1189,10 +1220,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1189 if (error < 0) 1220 if (error < 0)
1190 goto error; 1221 goto error;
1191 1222
1192 /* Initialise the client representation from the parent server */
1193 nfs_server_copy_userdata(server, parent_server);
1194 server->caps |= NFS_CAP_ATOMIC_OPEN;
1195
1196 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); 1223 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
1197 if (error < 0) 1224 if (error < 0)
1198 goto error; 1225 goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cc563cfa6940..968225a88015 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -43,6 +43,27 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
43 put_rpccred(cred); 43 put_rpccred(cred);
44} 44}
45 45
46void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
47{
48 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
49}
50
51int nfs_have_delegation(struct inode *inode, fmode_t flags)
52{
53 struct nfs_delegation *delegation;
54 int ret = 0;
55
56 flags &= FMODE_READ|FMODE_WRITE;
57 rcu_read_lock();
58 delegation = rcu_dereference(NFS_I(inode)->delegation);
59 if (delegation != NULL && (delegation->type & flags) == flags) {
60 nfs_mark_delegation_referenced(delegation);
61 ret = 1;
62 }
63 rcu_read_unlock();
64 return ret;
65}
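
nfs_have_delegation() answers "does the cached delegation cover all of the requested open modes?" by masking the request down to FMODE_READ|FMODE_WRITE and testing that those bits form a subset of the delegation's type, all under rcu_read_lock(). The subset test on its own (flag values defined locally for the demo):

#include <stdio.h>

#define FMODE_READ   0x1u
#define FMODE_WRITE  0x2u

/* A delegation of type 'have' satisfies a request 'want' only if every
 * requested mode bit is also present in the delegation. */
static int covers(unsigned int have, unsigned int want)
{
        want &= FMODE_READ | FMODE_WRITE;
        return (have & want) == want;
}

int main(void)
{
        printf("%d %d\n",
               covers(FMODE_READ | FMODE_WRITE, FMODE_READ),   /* 1 */
               covers(FMODE_READ, FMODE_READ | FMODE_WRITE));  /* 0 */
        return 0;
}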
66
46static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) 67static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
47{ 68{
48 struct inode *inode = state->inode; 69 struct inode *inode = state->inode;
@@ -119,7 +140,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
119 delegation->maxsize = res->maxsize; 140 delegation->maxsize = res->maxsize;
120 oldcred = delegation->cred; 141 oldcred = delegation->cred;
121 delegation->cred = get_rpccred(cred); 142 delegation->cred = get_rpccred(cred);
122 delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; 143 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
123 NFS_I(inode)->delegation_state = delegation->type; 144 NFS_I(inode)->delegation_state = delegation->type;
124 smp_wmb(); 145 smp_wmb();
125 put_rpccred(oldcred); 146 put_rpccred(oldcred);
@@ -134,19 +155,35 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
 	return res;
 }
 
+static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
+{
+	struct inode *inode = NULL;
+
+	spin_lock(&delegation->lock);
+	if (delegation->inode != NULL)
+		inode = igrab(delegation->inode);
+	spin_unlock(&delegation->lock);
+	return inode;
+}
+
 static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
 {
 	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
 
 	if (delegation == NULL)
 		goto nomatch;
+	spin_lock(&delegation->lock);
 	if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
 				sizeof(delegation->stateid.data)) != 0)
-		goto nomatch;
+		goto nomatch_unlock;
 	list_del_rcu(&delegation->super_list);
+	delegation->inode = NULL;
 	nfsi->delegation_state = 0;
 	rcu_assign_pointer(nfsi->delegation, NULL);
+	spin_unlock(&delegation->lock);
 	return delegation;
+nomatch_unlock:
+	spin_unlock(&delegation->lock);
 nomatch:
 	return NULL;
 }
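The nfs_delegation_grab_inode() helper added above is a weak-backpointer idiom: delegation->inode can be cleared (by nfs_detach_delegation_locked()) under delegation->lock, so readers must take that same lock and promote the pointer to a counted reference with igrab() before using it. A compilable userspace sketch of the pattern, with a pthread mutex standing in for the spinlock and a hypothetical refcounted object standing in for the inode:

#include <pthread.h>
#include <stddef.h>

struct obj {				/* stand-in for struct inode */
	int refs;
};

struct link {				/* stand-in for struct nfs_delegation */
	pthread_mutex_t lock;		/* stand-in for delegation->lock */
	struct obj *target;		/* weak pointer; detach sets it to NULL */
};

/* igrab() analogue; refcounting kept trivially non-atomic for the sketch */
static struct obj *obj_get(struct obj *o)
{
	o->refs++;
	return o;
}

/* nfs_delegation_grab_inode() analogue: the weak pointer may only be
 * read, and promoted to a counted reference, while holding the lock
 * that the detach path also takes before clearing it. */
static struct obj *link_grab(struct link *l)
{
	struct obj *o = NULL;

	pthread_mutex_lock(&l->lock);
	if (l->target != NULL)
		o = obj_get(l->target);
	pthread_mutex_unlock(&l->lock);
	return o;
}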
@@ -172,6 +209,8 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	delegation->change_attr = nfsi->change_attr;
 	delegation->cred = get_rpccred(cred);
 	delegation->inode = inode;
+	delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+	spin_lock_init(&delegation->lock);
 
 	spin_lock(&clp->cl_lock);
 	if (rcu_dereference(nfsi->delegation) != NULL) {
@@ -226,22 +265,47 @@ static void nfs_msync_inode(struct inode *inode)
  */
 static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	nfs_msync_inode(inode);
-	down_read(&clp->cl_sem);
 	/* Guard against new delegated open calls */
 	down_write(&nfsi->rwsem);
 	nfs_delegation_claim_opens(inode, &delegation->stateid);
 	up_write(&nfsi->rwsem);
-	up_read(&clp->cl_sem);
 	nfs_msync_inode(inode);
 
 	return nfs_do_return_delegation(inode, delegation, 1);
 }
 
 /*
+ * Return all delegations that have been marked for return
+ */
+void nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+	struct nfs_delegation *delegation;
+	struct inode *inode;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+		if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+			continue;
+		inode = nfs_delegation_grab_inode(delegation);
+		if (inode == NULL)
+			continue;
+		spin_lock(&clp->cl_lock);
+		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
+		spin_unlock(&clp->cl_lock);
+		rcu_read_unlock();
+		if (delegation != NULL)
+			__nfs_inode_return_delegation(inode, delegation);
+		iput(inode);
+		goto restart;
+	}
+	rcu_read_unlock();
+}
+
+/*
  * This function returns the delegation without reclaiming opens
  * or protecting against delegation reclaims.
  * It is therefore really only safe to be called from
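nfs_client_return_marked_delegations() in the hunk above is a textbook restart-scan over an RCU list: entries are examined locklessly, but as soon as one needs blocking work (the DELEGRETURN RPC) the reader drops the RCU lock, does the work, and rescans from the head because the list may have changed in the meantime. A self-contained userspace sketch of the same control flow, with a mutex standing in for rcu_read_lock() (the kernel read side is lock-free):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct item {
	struct item *next;
	bool marked;		/* analogue of NFS_DELEGATION_RETURN */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;

static void do_blocking_work(struct item *it)
{
	(void)it;		/* e.g. an RPC; must not hold list_lock */
}

/* Scan under the lock, but drop it for the blocking call and restart
 * from the head, since the list may have changed while it was released.
 * Items are assumed never freed in this sketch; the kernel pins the
 * inode with igrab()/iput() instead. */
static void return_marked(void)
{
restart:
	pthread_mutex_lock(&list_lock);
	for (struct item *it = head; it != NULL; it = it->next) {
		if (!it->marked)
			continue;
		it->marked = false;
		pthread_mutex_unlock(&list_lock);
		do_blocking_work(it);
		goto restart;
	}
	pthread_mutex_unlock(&list_lock);
}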
@@ -279,83 +343,55 @@ int nfs_inode_return_delegation(struct inode *inode)
 	return err;
 }
 
+static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation)
+{
+	set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+}
+
 /*
  * Return all delegations associated to a super block
  */
-void nfs_return_all_delegations(struct super_block *sb)
+void nfs_super_return_all_delegations(struct super_block *sb)
 {
 	struct nfs_client *clp = NFS_SB(sb)->nfs_client;
 	struct nfs_delegation *delegation;
-	struct inode *inode;
 
 	if (clp == NULL)
 		return;
-restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		if (delegation->inode->i_sb != sb)
-			continue;
-		inode = igrab(delegation->inode);
-		if (inode == NULL)
-			continue;
-		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
-		spin_unlock(&clp->cl_lock);
-		rcu_read_unlock();
-		if (delegation != NULL)
-			__nfs_inode_return_delegation(inode, delegation);
-		iput(inode);
-		goto restart;
+		spin_lock(&delegation->lock);
+		if (delegation->inode != NULL && delegation->inode->i_sb == sb)
+			set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+		spin_unlock(&delegation->lock);
 	}
 	rcu_read_unlock();
+	nfs_client_return_marked_delegations(clp);
 }
 
-static int nfs_do_expire_all_delegations(void *ptr)
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
 {
-	struct nfs_client *clp = ptr;
 	struct nfs_delegation *delegation;
-	struct inode *inode;
 
-	allow_signal(SIGKILL);
-restart:
-	if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0)
-		goto out;
-	if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0)
-		goto out;
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		inode = igrab(delegation->inode);
-		if (inode == NULL)
-			continue;
-		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
-		spin_unlock(&clp->cl_lock);
-		rcu_read_unlock();
-		if (delegation)
-			__nfs_inode_return_delegation(inode, delegation);
-		iput(inode);
-		goto restart;
+		set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+		set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
 	}
 	rcu_read_unlock();
-out:
-	nfs_put_client(clp);
-	module_put_and_exit(0);
+}
+
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+		nfs4_schedule_state_manager(clp);
 }
 
 void nfs_expire_all_delegations(struct nfs_client *clp)
 {
-	struct task_struct *task;
-
-	__module_get(THIS_MODULE);
-	atomic_inc(&clp->cl_count);
-	task = kthread_run(nfs_do_expire_all_delegations, clp,
-			"%s-delegreturn",
-			rpc_peeraddr2str(clp->cl_rpcclient,
-					RPC_DISPLAY_ADDR));
-	if (!IS_ERR(task))
-		return;
-	nfs_put_client(clp);
-	module_put(THIS_MODULE);
+	nfs_client_mark_return_all_delegations(clp);
+	nfs_delegation_run_state_manager(clp);
 }
 
 /*
@@ -363,68 +399,29 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
  */
 void nfs_handle_cb_pathdown(struct nfs_client *clp)
 {
-	struct nfs_delegation *delegation;
-	struct inode *inode;
-
 	if (clp == NULL)
 		return;
-restart:
+	nfs_client_mark_return_all_delegations(clp);
+}
+
+static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp)
+{
+	struct nfs_delegation *delegation;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		inode = igrab(delegation->inode);
-		if (inode == NULL)
+		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
 			continue;
-		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
-		spin_unlock(&clp->cl_lock);
-		rcu_read_unlock();
-		if (delegation != NULL)
-			__nfs_inode_return_delegation(inode, delegation);
-		iput(inode);
-		goto restart;
+		set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+		set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
 	}
 	rcu_read_unlock();
 }
 
-struct recall_threadargs {
-	struct inode *inode;
-	struct nfs_client *clp;
-	const nfs4_stateid *stateid;
-
-	struct completion started;
-	int result;
-};
-
-static int recall_thread(void *data)
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 {
-	struct recall_threadargs *args = (struct recall_threadargs *)data;
-	struct inode *inode = igrab(args->inode);
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_delegation *delegation;
-
-	daemonize("nfsv4-delegreturn");
-
-	nfs_msync_inode(inode);
-	down_read(&clp->cl_sem);
-	down_write(&nfsi->rwsem);
-	spin_lock(&clp->cl_lock);
-	delegation = nfs_detach_delegation_locked(nfsi, args->stateid);
-	if (delegation != NULL)
-		args->result = 0;
-	else
-		args->result = -ENOENT;
-	spin_unlock(&clp->cl_lock);
-	complete(&args->started);
-	nfs_delegation_claim_opens(inode, args->stateid);
-	up_write(&nfsi->rwsem);
-	up_read(&clp->cl_sem);
-	nfs_msync_inode(inode);
-
-	if (delegation != NULL)
-		nfs_do_return_delegation(inode, delegation, 1);
-	iput(inode);
-	module_put_and_exit(0);
+	nfs_client_mark_return_unreferenced_delegations(clp);
+	nfs_delegation_run_state_manager(clp);
 }
 
 /*
@@ -432,22 +429,20 @@ static int recall_thread(void *data)
  */
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
 {
-	struct recall_threadargs data = {
-		.inode = inode,
-		.stateid = stateid,
-	};
-	int status;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_delegation *delegation;
 
-	init_completion(&data.started);
-	__module_get(THIS_MODULE);
-	status = kernel_thread(recall_thread, &data, CLONE_KERNEL);
-	if (status < 0)
-		goto out_module_put;
-	wait_for_completion(&data.started);
-	return data.result;
-out_module_put:
-	module_put(THIS_MODULE);
-	return status;
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
+				sizeof(delegation->stateid.data)) != 0) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+	nfs_mark_return_delegation(clp, delegation);
+	rcu_read_unlock();
+	nfs_delegation_run_state_manager(clp);
+	return 0;
 }
 
 /*
@@ -459,10 +454,14 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
 	struct inode *res = NULL;
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
+		spin_lock(&delegation->lock);
+		if (delegation->inode != NULL &&
+		    nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
 			res = igrab(delegation->inode);
-			break;
 		}
+		spin_unlock(&delegation->lock);
+		if (res != NULL)
+			break;
 	}
 	rcu_read_unlock();
 	return res;
@@ -476,7 +475,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 	struct nfs_delegation *delegation;
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
-		delegation->flags |= NFS_DELEGATION_NEED_RECLAIM;
+		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
 	rcu_read_unlock();
 }
 
@@ -486,17 +485,22 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
 {
 	struct nfs_delegation *delegation;
+	struct inode *inode;
 restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0)
+		if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
+			continue;
+		inode = nfs_delegation_grab_inode(delegation);
+		if (inode == NULL)
 			continue;
 		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL);
+		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
 		spin_unlock(&clp->cl_lock);
 		rcu_read_unlock();
 		if (delegation != NULL)
 			nfs_free_delegation(delegation);
+		iput(inode);
 		goto restart;
 	}
 	rcu_read_unlock();
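A theme throughout this file is the conversion of a plain `long flags` updated with `|=` and `&=` into an `unsigned long` updated with set_bit()/clear_bit()/test_and_clear_bit(). The plain operators compile to non-atomic read-modify-write sequences, so two CPUs flipping different flags can silently lose one update; the bitops are atomic per bit. An illustrative userspace analogue using the GCC/Clang __atomic builtins (the kernel's own bitops are arch-specific):

static unsigned long flags;

/* Non-atomic: two threads doing lost_update(A) and lost_update(B)
 * can each read the same old value, and one bit silently vanishes. */
static void lost_update(int bit)
{
	flags |= 1UL << bit;		/* load, or, store: racy */
}

/* Atomic analogue of the kernel's set_bit() */
static void atomic_set_bit(int bit, unsigned long *addr)
{
	__atomic_fetch_or(addr, 1UL << bit, __ATOMIC_RELAXED);
}

/* Atomic analogue of the kernel's test_and_clear_bit() */
static int atomic_test_and_clear_bit(int bit, unsigned long *addr)
{
	unsigned long mask = 1UL << bit;

	return (__atomic_fetch_and(addr, ~mask, __ATOMIC_RELAXED) & mask) != 0;
}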
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index f1c5e2a5d88e..09f383795174 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -17,14 +17,20 @@ struct nfs_delegation {
 	struct rpc_cred *cred;
 	struct inode *inode;
 	nfs4_stateid stateid;
-	int type;
-#define NFS_DELEGATION_NEED_RECLAIM 1
-	long flags;
+	fmode_t type;
 	loff_t maxsize;
 	__u64 change_attr;
+	unsigned long flags;
+	spinlock_t lock;
 	struct rcu_head rcu;
 };
 
+enum {
+	NFS_DELEGATION_NEED_RECLAIM = 0,
+	NFS_DELEGATION_RETURN,
+	NFS_DELEGATION_REFERENCED,
+};
+
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
@@ -32,9 +38,11 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
 void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
-void nfs_return_all_delegations(struct super_block *sb);
+void nfs_super_return_all_delegations(struct super_block *sb);
 void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 void nfs_handle_cb_pathdown(struct nfs_client *clp);
+void nfs_client_return_marked_delegations(struct nfs_client *clp);
 
 void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -45,22 +53,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
 
-static inline int nfs_have_delegation(struct inode *inode, int flags)
-{
-	struct nfs_delegation *delegation;
-	int ret = 0;
-
-	flags &= FMODE_READ|FMODE_WRITE;
-	rcu_read_lock();
-	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL && (delegation->type & flags) == flags)
-		ret = 1;
-	rcu_read_unlock();
-	return ret;
-}
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
+int nfs_have_delegation(struct inode *inode, fmode_t flags);
 
 #else
-static inline int nfs_have_delegation(struct inode *inode, int flags)
+static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
 {
 	return 0;
 }
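The NFS_DELEGATION_REFERENCED bit defined above drives a second-chance (clock-style) expiry policy: every successful use of a delegation sets the bit, and the expiry pass clears it with test_and_clear_bit(), marking for return only the delegations whose bit was already clear, i.e. unused since the previous pass. A small userspace sketch of the policy (single-threaded, so plain bools stand in for the atomic bitops):

#include <stdbool.h>

struct entry {
	bool referenced;	/* set on every use */
	bool marked_return;
};

/* nfs_mark_delegation_referenced() analogue */
static void on_use(struct entry *e)
{
	e->referenced = true;
}

/* One expiry pass: entries used since the last pass get a second
 * chance; idle ones are marked for return. */
static void expire_unreferenced(struct entry *table, int n)
{
	for (int i = 0; i < n; i++) {
		if (table[i].referenced) {
			table[i].referenced = false;	/* test_and_clear */
			continue;
		}
		table[i].marked_return = true;
	}
}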
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3e64b98f3a93..e35c8199f82f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -799,6 +799,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 		goto out_bad;
 	}
 
+	if (nfs_have_delegation(inode, FMODE_READ))
+		goto out_set_verifier;
+
 	/* Force a full look up iff the parent directory has changed */
 	if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
 		if (nfs_lookup_verify_inode(inode, nd))
@@ -817,6 +820,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
 		goto out_bad;
 
+out_set_verifier:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
 	dput(parent);
@@ -973,7 +977,7 @@ struct dentry_operations nfs4_dentry_operations = {
  * Use intent information to determine whether we need to substitute
  * the NFSv4-style stateful OPEN for the LOOKUP call
  */
-static int is_atomic_open(struct inode *dir, struct nameidata *nd)
+static int is_atomic_open(struct nameidata *nd)
 {
 	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0)
 		return 0;
@@ -996,7 +1000,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
 
 	/* Check that we are indeed trying to open this file */
-	if (!is_atomic_open(dir, nd))
+	if (!is_atomic_open(nd))
 		goto no_open;
 
 	if (dentry->d_name.len > NFS_SERVER(dir)->namelen) {
@@ -1047,10 +1051,10 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct inode *dir;
 	int openflags, ret = 0;
 
+	if (!is_atomic_open(nd))
+		goto no_open;
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
-	if (!is_atomic_open(dir, nd))
-		goto no_open;
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
@@ -1062,11 +1066,11 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 	/* NFS only supports OPEN on regular files */
 	if (!S_ISREG(inode->i_mode))
-		goto no_open;
+		goto no_open_dput;
 	openflags = nd->intent.open.flags;
 	/* We cannot do exclusive creation on a positive dentry */
 	if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
-		goto no_open;
+		goto no_open_dput;
 	/* We can't create new files, or truncate existing ones here */
 	openflags &= ~(O_CREAT|O_TRUNC);
 
@@ -1081,10 +1085,9 @@ out:
 	if (!ret)
 		d_drop(dentry);
 	return ret;
-no_open:
+no_open_dput:
 	dput(parent);
-	if (inode != NULL && nfs_have_delegation(inode, FMODE_READ))
-		return 1;
+no_open:
 	return nfs_lookup_revalidate(dentry, nd);
 }
 #endif /* CONFIG_NFSV4 */
@@ -1794,7 +1797,8 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 	cache = nfs_access_search_rbtree(inode, cred);
 	if (cache == NULL)
 		goto out;
-	if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+	if (!nfs_have_delegation(inode, FMODE_READ) &&
+	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
 		goto out_stale;
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f06..90f292b520d2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d22eb383e1cf..0c381686171e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -592,7 +592,7 @@ static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context
 /*
  * Given an inode, search for an open context with the desired characteristics
  */
-struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode)
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_open_context *pos, *ctx = NULL;
@@ -712,14 +712,7 @@ int nfs_attribute_timeout(struct inode *inode)
 
 	if (nfs_have_delegation(inode, FMODE_READ))
 		return 0;
-	/*
-	 * Special case: if the attribute timeout is set to 0, then always
-	 * treat the cache as having expired (unless holding
-	 * a delegation).
-	 */
-	if (nfsi->attrtimeo == 0)
-		return 1;
-	return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+	return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
 /**
@@ -1182,7 +1175,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->attrtimeo_timestamp = now;
 		nfsi->attr_gencount = nfs_inc_attr_generation_counter();
 	} else {
-		if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+		if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
 			if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
 				nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
 			nfsi->attrtimeo_timestamp = now;
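The switch from time_in_range() to time_in_range_open() in these inode.c hunks is also what allows deleting the attrtimeo == 0 special case above: the closed interval [t, t + 0] still contains jiffies == t, so a zero timeout never looked expired at the instant it was stamped, while the half-open interval [t, t) is empty and expires immediately. A sketch of the two checks with jiffies-style wraparound-safe comparisons, assuming the kernel's definitions in include/linux/jiffies.h:

/* Wraparound-safe comparisons, as in include/linux/jiffies.h */
#define time_after_eq(a, b)	((long)((a) - (b)) >= 0)
#define time_before(a, b)	((long)((b) - (a)) > 0)
#define time_before_eq(a, b)	((long)((b) - (a)) >= 0)

/* Closed interval: b <= a <= c */
#define time_in_range(a, b, c) \
	(time_after_eq(a, b) && time_before_eq(a, c))

/* Half-open interval: b <= a < c. With c == b the range is empty,
 * so a timeout of 0 is always treated as already expired. */
#define time_in_range_open(a, b, c) \
	(time_after_eq(a, b) && time_before(a, c))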
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d212ee41caf2..340ede8f608f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,20 @@ struct nfs_parsed_mount_data {
 	struct security_mnt_opts lsm_opts;
 };
 
+/* mount_clnt.c */
+struct nfs_mount_request {
+	struct sockaddr		*sap;
+	size_t			salen;
+	char			*hostname;
+	char			*dirpath;
+	u32			version;
+	unsigned short		protocol;
+	struct nfs_fh		*fh;
+	int			noresvport;
+};
+
+extern int nfs_mount(struct nfs_mount_request *info);
+
 /* client.c */
 extern struct rpc_program nfs_program;
 
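The new struct nfs_mount_request replaces nfs_mount()'s seven positional parameters with a single argument block, which lets callers name what they pass and lets options such as noresvport be added without touching every call site. A minimal caller-side sketch (types simplified for userspace, u32 becomes unsigned int, and the hostname/path values are purely illustrative):

#include <stddef.h>

struct sockaddr;
struct nfs_fh;

struct nfs_mount_request {
	struct sockaddr	*sap;
	size_t		salen;
	char		*hostname;
	char		*dirpath;
	unsigned int	version;	/* u32 in the kernel */
	unsigned short	protocol;
	struct nfs_fh	*fh;
	int		noresvport;
};

int nfs_mount(struct nfs_mount_request *info);

int example(struct sockaddr *sap, size_t salen, struct nfs_fh *fh)
{
	/* Designated initializers: any option left out defaults to zero */
	struct nfs_mount_request request = {
		.sap		= sap,
		.salen		= salen,
		.hostname	= "server.example.com",
		.dirpath	= "/export",
		.version	= 3,		/* NFS_MNT3_VERSION */
		.fh		= fh,
	};

	return nfs_mount(&request);
}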
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 086a6830d785..ca905a5bb1ba 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -29,47 +29,43 @@ struct mnt_fhstatus {
 
 /**
  * nfs_mount - Obtain an NFS file handle for the given host and path
- * @addr: pointer to server's address
- * @len: size of server's address
- * @hostname: name of server host, or NULL
- * @path: pointer to string containing export path to mount
- * @version: mount version to use for this request
- * @protocol: transport protocol to use for the request
- * @fh: pointer to location to place returned file handle
+ * @info: pointer to mount request arguments
  *
  * Uses default timeout parameters specified by underlying transport.
  */
-int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path,
-	      int version, int protocol, struct nfs_fh *fh)
+int nfs_mount(struct nfs_mount_request *info)
 {
 	struct mnt_fhstatus	result = {
-		.fh		= fh
+		.fh		= info->fh
 	};
 	struct rpc_message msg	= {
-		.rpc_argp	= path,
+		.rpc_argp	= info->dirpath,
 		.rpc_resp	= &result,
 	};
 	struct rpc_create_args args = {
-		.protocol	= protocol,
-		.address	= addr,
-		.addrsize	= len,
-		.servername	= hostname,
+		.protocol	= info->protocol,
+		.address	= info->sap,
+		.addrsize	= info->salen,
+		.servername	= info->hostname,
 		.program	= &mnt_program,
-		.version	= version,
+		.version	= info->version,
 		.authflavor	= RPC_AUTH_UNIX,
-		.flags		= 0,
 	};
 	struct rpc_clnt		*mnt_clnt;
 	int			status;
 
 	dprintk("NFS: sending MNT request for %s:%s\n",
-		(hostname ? hostname : "server"), path);
+		(info->hostname ? info->hostname : "server"),
+		info->dirpath);
+
+	if (info->noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 
 	mnt_clnt = rpc_create(&args);
 	if (IS_ERR(mnt_clnt))
 		goto out_clnt_err;
 
-	if (version == NFS_MNT3_VERSION)
+	if (info->version == NFS_MNT3_VERSION)
 		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
 	else
 		msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea790645fda6..4e4d33204376 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -38,8 +38,12 @@ struct idmap;
 		((err) != NFSERR_NOFILEHANDLE))
 
 enum nfs4_client_state {
-	NFS4CLNT_STATE_RECOVER  = 0,
+	NFS4CLNT_MANAGER_RUNNING  = 0,
+	NFS4CLNT_CHECK_LEASE,
 	NFS4CLNT_LEASE_EXPIRED,
+	NFS4CLNT_RECLAIM_REBOOT,
+	NFS4CLNT_RECLAIM_NOGRACE,
+	NFS4CLNT_DELEGRETURN,
 };
 
 /*
@@ -90,12 +94,18 @@ struct nfs4_state_owner {
 
 	spinlock_t	     so_lock;
 	atomic_t	     so_count;
+	unsigned long	     so_flags;
 	struct list_head     so_states;
 	struct list_head     so_delegations;
 	struct nfs_seqid_counter so_seqid;
 	struct rpc_sequence  so_sequence;
 };
 
+enum {
+	NFS_OWNER_RECLAIM_REBOOT,
+	NFS_OWNER_RECLAIM_NOGRACE
+};
+
 /*
  * struct nfs4_state maintains the client-side state for a given
  * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
@@ -128,6 +138,8 @@ enum {
 	NFS_O_RDONLY_STATE,		/* OPEN stateid has read-only state */
 	NFS_O_WRONLY_STATE,		/* OPEN stateid has write-only state */
 	NFS_O_RDWR_STATE,		/* OPEN stateid has read/write state */
+	NFS_STATE_RECLAIM_REBOOT,	/* OPEN stateid server rebooted */
+	NFS_STATE_RECLAIM_NOGRACE,	/* OPEN stateid needs to recover state */
 };
 
 struct nfs4_state {
@@ -149,7 +161,7 @@ struct nfs4_state {
 	unsigned int n_rdonly;		/* Number of read-only references */
 	unsigned int n_wronly;		/* Number of write-only references */
 	unsigned int n_rdwr;		/* Number of read/write references */
-	int state;			/* State on the server (R,W, or RW) */
+	fmode_t state;			/* State on the server (R,W, or RW) */
 	atomic_t count;
 };
 
@@ -157,9 +169,12 @@ struct nfs4_state {
 struct nfs4_exception {
 	long timeout;
 	int retry;
+	struct nfs4_state *state;
 };
 
 struct nfs4_state_recovery_ops {
+	int owner_flag_bit;
+	int state_flag_bit;
 	int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
 };
@@ -174,7 +189,6 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 
 
 /* nfs4proc.c */
-extern int nfs4_map_errors(int err);
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
@@ -187,7 +201,7 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
-extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
+extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops;
 
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
@@ -202,16 +216,18 @@ extern void nfs4_kill_renewd(struct nfs_client *);
 extern void nfs4_renew_state(struct work_struct *);
 
 /* nfs4state.c */
-struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
 
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
 extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
 extern void nfs4_put_open_state(struct nfs4_state *);
-extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t);
-extern void nfs4_close_sync(struct path *, struct nfs4_state *, mode_t);
-extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t);
+extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
+extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
+extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_state_recovery(struct nfs_client *);
+extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
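The prototype churn from int/mode_t to fmode_t across these headers is about type checking: in the kernel, fmode_t is a sparse "bitwise" typedef, so mixing FMODE_* values with plain integers, or with the unrelated mode_t permission bits, is flagged by static analysis instead of compiling silently. A hedged sketch of the mechanism (the exact macro spellings in the kernel headers may differ):

/* Under sparse (__CHECKER__), __bitwise makes a typedef a distinct
 * type: implicit conversion to or from plain integers is reported.
 * Ordinary compilers see plain unsigned int. */
#ifdef __CHECKER__
#define __bitwise	__attribute__((bitwise))
#define __force		__attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef unsigned int __bitwise fmode_t;

#define FMODE_READ	((__force fmode_t)1)
#define FMODE_WRITE	((__force fmode_t)2)

static int can_read(fmode_t fmode)
{
	return (fmode & FMODE_READ) != 0;	/* fmode & fmode: allowed */
}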
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 83e700a2b0c0..8dde84b988d9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -62,14 +62,12 @@
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
-static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 
 /* Prevent leaks of NFSv4 errors into userland */
-int nfs4_map_errors(int err)
+static int nfs4_map_errors(int err)
 {
 	if (err < -1000) {
 		dprintk("%s could not handle NFSv4 error %d\n",
@@ -195,6 +193,83 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start, KM_USER0);
 }
 
+static int nfs4_wait_bit_killable(void *word)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
+static int nfs4_wait_clnt_recover(struct nfs_client *clp)
+{
+	int res;
+
+	might_sleep();
+
+	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
+			nfs4_wait_bit_killable, TASK_KILLABLE);
+	return res;
+}
+
+static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
+{
+	int res = 0;
+
+	might_sleep();
+
+	if (*timeout <= 0)
+		*timeout = NFS4_POLL_RETRY_MIN;
+	if (*timeout > NFS4_POLL_RETRY_MAX)
+		*timeout = NFS4_POLL_RETRY_MAX;
+	schedule_timeout_killable(*timeout);
+	if (fatal_signal_pending(current))
+		res = -ERESTARTSYS;
+	*timeout <<= 1;
+	return res;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state *state = exception->state;
+	int ret = errorcode;
+
+	exception->retry = 0;
+	switch(errorcode) {
+		case 0:
+			return 0;
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OPENMODE:
+			if (state == NULL)
+				break;
+			nfs4_state_mark_reclaim_nograce(clp, state);
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_EXPIRED:
+			nfs4_schedule_state_recovery(clp);
+			ret = nfs4_wait_clnt_recover(clp);
+			if (ret == 0)
+				exception->retry = 1;
+			break;
+		case -NFS4ERR_FILE_OPEN:
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+			ret = nfs4_delay(server->client, &exception->timeout);
+			if (ret != 0)
+				break;
+		case -NFS4ERR_OLD_STATEID:
+			exception->retry = 1;
+	}
+	/* We failed to handle the error */
+	return nfs4_map_errors(ret);
+}
+
+
 static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
 {
 	struct nfs_client *clp = server->nfs_client;
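nfs4_delay() above is a capped exponential backoff: the first NFS4ERR_DELAY waits NFS4_POLL_RETRY_MIN, every retry doubles the wait, and NFS4_POLL_RETRY_MAX caps it. The same policy as a compilable userspace function (the millisecond constants are illustrative; the kernel works in jiffies and sleeps killably):

#include <unistd.h>

#define POLL_RETRY_MIN_MS	100
#define POLL_RETRY_MAX_MS	15000

/* Sleep for *timeout milliseconds, then double it for the next call,
 * clamped to [POLL_RETRY_MIN_MS, POLL_RETRY_MAX_MS]. Mirrors the
 * control flow of nfs4_delay(). */
static void backoff_delay(long *timeout)
{
	if (*timeout <= 0)
		*timeout = POLL_RETRY_MIN_MS;
	if (*timeout > POLL_RETRY_MAX_MS)
		*timeout = POLL_RETRY_MAX_MS;
	usleep((unsigned int)*timeout * 1000);
	*timeout <<= 1;
}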
@@ -248,7 +323,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 }
 
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
-		struct nfs4_state_owner *sp, int flags,
+		struct nfs4_state_owner *sp, fmode_t fmode, int flags,
 		const struct iattr *attrs)
 {
 	struct dentry *parent = dget_parent(path->dentry);
@@ -268,7 +343,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->owner = sp;
 	atomic_inc(&sp->so_count);
 	p->o_arg.fh = NFS_FH(dir);
-	p->o_arg.open_flags = flags,
+	p->o_arg.open_flags = flags;
+	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
 	p->o_arg.id = sp->so_owner_id.id;
 	p->o_arg.name = &p->path.dentry->d_name;
@@ -324,10 +400,13 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
 	return ret;
 }
 
-static int can_open_cached(struct nfs4_state *state, int mode)
+static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
 {
 	int ret = 0;
-	switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) {
+
+	if (open_mode & O_EXCL)
+		goto out;
+	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 		case FMODE_READ:
 			ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
 			break;
@@ -337,21 +416,23 @@ static int can_open_cached(struct nfs4_state *state, int mode)
 		case FMODE_READ|FMODE_WRITE:
 			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
 	}
+out:
 	return ret;
 }
 
-static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
 {
-	if ((delegation->type & open_flags) != open_flags)
+	if ((delegation->type & fmode) != fmode)
 		return 0;
-	if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM)
+	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
 		return 0;
+	nfs_mark_delegation_referenced(delegation);
 	return 1;
 }
 
-static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
+static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
 {
-	switch (open_flags) {
+	switch (fmode) {
 		case FMODE_WRITE:
 			state->n_wronly++;
 			break;
@@ -361,15 +442,15 @@ static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
 		case FMODE_READ|FMODE_WRITE:
 			state->n_rdwr++;
 	}
-	nfs4_state_set_mode_locked(state, state->state | open_flags);
+	nfs4_state_set_mode_locked(state, state->state | fmode);
 }
 
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
+static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
 {
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
 	memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
-	switch (open_flags) {
+	switch (fmode) {
 		case FMODE_READ:
 			set_bit(NFS_O_RDONLY_STATE, &state->flags);
 			break;
@@ -381,16 +462,15 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
 	}
 }
 
-static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
+static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
 {
 	write_seqlock(&state->seqlock);
-	nfs_set_open_stateid_locked(state, stateid, open_flags);
+	nfs_set_open_stateid_locked(state, stateid, fmode);
 	write_sequnlock(&state->seqlock);
 }
 
-static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags)
+static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
 {
-	open_flags &= (FMODE_READ|FMODE_WRITE);
 	/*
 	 * Protect the call to nfs4_state_set_mode_locked and
 	 * serialise the stateid update
@@ -401,20 +481,60 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_sta
 		set_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 	if (open_stateid != NULL)
-		nfs_set_open_stateid_locked(state, open_stateid, open_flags);
+		nfs_set_open_stateid_locked(state, open_stateid, fmode);
 	write_sequnlock(&state->seqlock);
 	spin_lock(&state->owner->so_lock);
-	update_open_stateflags(state, open_flags);
+	update_open_stateflags(state, fmode);
 	spin_unlock(&state->owner->so_lock);
 }
 
-static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags)
+static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+{
+	struct nfs_inode *nfsi = NFS_I(state->inode);
+	struct nfs_delegation *deleg_cur;
+	int ret = 0;
+
+	fmode &= (FMODE_READ|FMODE_WRITE);
+
+	rcu_read_lock();
+	deleg_cur = rcu_dereference(nfsi->delegation);
+	if (deleg_cur == NULL)
+		goto no_delegation;
+
+	spin_lock(&deleg_cur->lock);
+	if (nfsi->delegation != deleg_cur ||
+	    (deleg_cur->type & fmode) != fmode)
+		goto no_delegation_unlock;
+
+	if (delegation == NULL)
+		delegation = &deleg_cur->stateid;
+	else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0)
+		goto no_delegation_unlock;
+
+	nfs_mark_delegation_referenced(deleg_cur);
+	__update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+	ret = 1;
+no_delegation_unlock:
+	spin_unlock(&deleg_cur->lock);
+no_delegation:
+	rcu_read_unlock();
+
+	if (!ret && open_stateid != NULL) {
+		__update_open_stateid(state, open_stateid, NULL, fmode);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+
+static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
 {
 	struct nfs_delegation *delegation;
 
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation == NULL || (delegation->type & open_flags) == open_flags) {
+	if (delegation == NULL || (delegation->type & fmode) == fmode) {
 		rcu_read_unlock();
 		return;
 	}
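The new update_open_stateid() above demonstrates how to promote an RCU lookup into a stable reference: load the pointer under rcu_read_lock(), take the object's own spinlock, then re-check that the object is still the published one (nfsi->delegation != deleg_cur) before trusting its fields. A userspace analogue with C11 atomics and a mutex; names are illustrative, and object lifetime is assumed managed elsewhere (in the kernel, RCU guarantees it for the read-side critical section):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct deleg {
	pthread_mutex_t lock;	/* assumed initialized at allocation */
	int type;
};

/* Published pointer; writers swap it, readers load it locklessly. */
static _Atomic(struct deleg *) current_deleg;

static bool try_use(int want_type)
{
	struct deleg *d = atomic_load(&current_deleg);	/* rcu_dereference() analogue */
	bool ok = false;

	if (d == NULL)
		return false;
	pthread_mutex_lock(&d->lock);
	/* Re-check: the published pointer may have been replaced between
	 * the lockless load and taking the object's lock. */
	if (atomic_load(&current_deleg) == d && (d->type & want_type) == want_type)
		ok = true;
	pthread_mutex_unlock(&d->lock);
	return ok;
}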
@@ -427,27 +547,28 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs4_state *state = opendata->state;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *delegation;
-	int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL);
+	int open_mode = opendata->o_arg.open_flags & O_EXCL;
+	fmode_t fmode = opendata->o_arg.fmode;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
 
-	rcu_read_lock();
-	delegation = rcu_dereference(nfsi->delegation);
 	for (;;) {
-		if (can_open_cached(state, open_mode)) {
+		if (can_open_cached(state, fmode, open_mode)) {
 			spin_lock(&state->owner->so_lock);
-			if (can_open_cached(state, open_mode)) {
-				update_open_stateflags(state, open_mode);
+			if (can_open_cached(state, fmode, open_mode)) {
+				update_open_stateflags(state, fmode);
 				spin_unlock(&state->owner->so_lock);
-				rcu_read_unlock();
 				goto out_return_state;
 			}
 			spin_unlock(&state->owner->so_lock);
 		}
-		if (delegation == NULL)
-			break;
-		if (!can_open_delegated(delegation, open_mode))
+		rcu_read_lock();
+		delegation = rcu_dereference(nfsi->delegation);
+		if (delegation == NULL ||
+		    !can_open_delegated(delegation, fmode)) {
+			rcu_read_unlock();
 			break;
+		}
 		/* Save the delegation */
 		memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
 		rcu_read_unlock();
@@ -455,19 +576,11 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		if (ret != 0)
 			goto out;
 		ret = -EAGAIN;
-		rcu_read_lock();
-		delegation = rcu_dereference(nfsi->delegation);
-		/* If no delegation, try a cached open */
-		if (delegation == NULL)
-			continue;
-		/* Is the delegation still valid? */
-		if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0)
-			continue;
-		rcu_read_unlock();
-		update_open_stateid(state, NULL, &stateid, open_mode);
-		goto out_return_state;
+
+		/* Try to update the stateid using the delegation */
+		if (update_open_stateid(state, NULL, &stateid, fmode))
+			goto out_return_state;
 	}
-	rcu_read_unlock();
 out:
 	return ERR_PTR(ret);
 out_return_state:
@@ -480,7 +593,6 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 	struct inode *inode;
 	struct nfs4_state *state = NULL;
 	struct nfs_delegation *delegation;
-	nfs4_stateid *deleg_stateid = NULL;
 	int ret;
 
 	if (!data->rpc_done) {
@@ -507,7 +619,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 		if (delegation)
 			delegation_flags = delegation->flags;
 		rcu_read_unlock();
-		if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM))
+		if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
 			nfs_inode_set_delegation(state->inode,
 					data->owner->so_cred,
 					&data->o_res);
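Note that the test above works on a snapshot of the flags word taken under RCU, so it builds the mask by hand rather than calling test_bit(): with the delegation.h change, NFS_DELEGATION_NEED_RECLAIM is now a bit number (0), not a mask value, hence the explicit `1UL<<NFS_DELEGATION_NEED_RECLAIM`. In brief:

enum { NEED_RECLAIM = 0, RETURN, REFERENCED };	/* bit numbers, not masks */

static int snapshot_has(unsigned long snapshot, int bit)
{
	return (snapshot & (1UL << bit)) != 0;	/* build the mask explicitly */
}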
@@ -516,12 +628,9 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 					data->owner->so_cred,
 					&data->o_res);
 	}
-	rcu_read_lock();
-	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL)
-		deleg_stateid = &delegation->stateid;
-	update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags);
-	rcu_read_unlock();
+
+	update_open_stateid(state, &data->o_res.stateid, NULL,
+			data->o_arg.fmode);
 	iput(inode);
 out:
 	return state;
@@ -552,7 +661,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 {
 	struct nfs4_opendata *opendata;
 
-	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL);
+	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL);
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
@@ -560,12 +669,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 	return opendata;
 }
 
-static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res)
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
 {
 	struct nfs4_state *newstate;
 	int ret;
 
-	opendata->o_arg.open_flags = openflags;
+	opendata->o_arg.open_flags = 0;
+	opendata->o_arg.fmode = fmode;
 	memset(&opendata->o_res, 0, sizeof(opendata->o_res));
 	memset(&opendata->c_res, 0, sizeof(opendata->c_res));
 	nfs4_init_opendata_res(opendata);
@@ -575,7 +685,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openf
 	newstate = nfs4_opendata_to_nfs4_state(opendata);
 	if (IS_ERR(newstate))
 		return PTR_ERR(newstate);
-	nfs4_close_state(&opendata->path, newstate, openflags);
+	nfs4_close_state(&opendata->path, newstate, fmode);
 	*res = newstate;
 	return 0;
 }
@@ -631,7 +741,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 {
 	struct nfs_delegation *delegation;
 	struct nfs4_opendata *opendata;
-	int delegation_type = 0;
+	fmode_t delegation_type = 0;
 	int status;
 
 	opendata = nfs4_open_recoverdata_alloc(ctx, state);
@@ -641,7 +751,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	opendata->o_arg.fh = NFS_FH(state->inode);
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
-	if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0)
+	if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
 		delegation_type = delegation->type;
 	rcu_read_unlock();
 	opendata->o_arg.u.delegation_type = delegation_type;
@@ -744,7 +854,7 @@ static void nfs4_open_confirm_release(void *calldata)
 		goto out_free;
 	state = nfs4_opendata_to_nfs4_state(data);
 	if (!IS_ERR(state))
-		nfs4_close_state(&data->path, state, data->o_arg.open_flags);
+		nfs4_close_state(&data->path, state, data->o_arg.fmode);
 out_free:
 	nfs4_opendata_put(data);
 }
@@ -808,12 +918,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
808 if (data->state != NULL) { 918 if (data->state != NULL) {
809 struct nfs_delegation *delegation; 919 struct nfs_delegation *delegation;
810 920
811 if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL))) 921 if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
812 goto out_no_action; 922 goto out_no_action;
813 rcu_read_lock(); 923 rcu_read_lock();
814 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); 924 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
815 if (delegation != NULL && 925 if (delegation != NULL &&
816 (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) { 926 test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) {
817 rcu_read_unlock(); 927 rcu_read_unlock();
818 goto out_no_action; 928 goto out_no_action;
819 } 929 }
@@ -877,7 +987,7 @@ static void nfs4_open_release(void *calldata)
877 goto out_free; 987 goto out_free;
878 state = nfs4_opendata_to_nfs4_state(data); 988 state = nfs4_opendata_to_nfs4_state(data);
879 if (!IS_ERR(state)) 989 if (!IS_ERR(state))
880 nfs4_close_state(&data->path, state, data->o_arg.open_flags); 990 nfs4_close_state(&data->path, state, data->o_arg.fmode);
881out_free: 991out_free:
882 nfs4_opendata_put(data); 992 nfs4_opendata_put(data);
883} 993}
@@ -955,10 +1065,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
955 int ret; 1065 int ret;
956 1066
957 for (;;) { 1067 for (;;) {
958 ret = nfs4_wait_clnt_recover(server->client, clp); 1068 ret = nfs4_wait_clnt_recover(clp);
959 if (ret != 0) 1069 if (ret != 0)
960 return ret; 1070 return ret;
961 if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) 1071 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1072 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
962 break; 1073 break;
963 nfs4_schedule_state_recovery(clp); 1074 nfs4_schedule_state_recovery(clp);
964 } 1075 }
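
Note the inversion in nfs4_recover_expired_lease(): the caller no longer clears NFS4CLNT_LEASE_EXPIRED itself, it only kicks the recovery machinery and loops until both the LEASE_EXPIRED and the new CHECK_LEASE bits have been cleared on its behalf. A userspace model of that wait loop, with schedule_recovery() standing in for the asynchronous state manager:

    #include <stdatomic.h>
    #include <sched.h>
    #include <stdio.h>

    enum { LEASE_EXPIRED = 1 << 0, CHECK_LEASE = 1 << 1 };

    static atomic_uint cl_state;

    /* Stand-in for nfs4_schedule_state_recovery(): a manager thread would
     * clear these bits once the lease is re-established. */
    static void schedule_recovery(void)
    {
            atomic_fetch_and(&cl_state, ~(unsigned)(LEASE_EXPIRED | CHECK_LEASE));
    }

    static void recover_expired_lease(void)
    {
            for (;;) {
                    /* nfs4_wait_clnt_recover() would block here */
                    if (!(atomic_load(&cl_state) & (LEASE_EXPIRED | CHECK_LEASE)))
                            break;
                    schedule_recovery();
                    sched_yield();
            }
    }

    int main(void)
    {
            atomic_store(&cl_state, LEASE_EXPIRED);
            recover_expired_lease();
            puts("lease valid again");
            return 0;
    }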
@@ -993,8 +1104,9 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4
 
 	do {
 		err = _nfs4_open_expired(ctx, state);
-		if (err == -NFS4ERR_DELAY)
-			nfs4_handle_exception(server, err, &exception);
+		if (err != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
 	return err;
 }
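
The retry loop is tightened here: any result other than NFS4ERR_DELAY now terminates the loop immediately, and only DELAY goes through the exception handler, which sleeps with a doubling timeout before retrying. A minimal sketch of that backoff shape, with do_op() as a hypothetical operation that succeeds on the third attempt:

    #include <stdio.h>
    #include <unistd.h>

    #define ERR_DELAY (-1)	/* illustrative stand-in for -NFS4ERR_DELAY */

    static int do_op(int attempt) { return attempt < 3 ? ERR_DELAY : 0; }

    int main(void)
    {
            long timeout_ms = 100;	/* doubles per retry, as nfs4_delay() did */
            int attempt = 0, err;

            do {
                    err = do_op(++attempt);
                    if (err != ERR_DELAY)
                            break;		/* any other result is final */
                    usleep(timeout_ms * 1000);
                    timeout_ms <<= 1;
            } while (1);
            printf("finished after %d attempts, err=%d\n", attempt, err);
            return 0;
    }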
@@ -1031,12 +1143,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 /*
  * Returns a referenced nfs4_state
  */
-static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
 {
 	struct nfs4_state_owner *sp;
 	struct nfs4_state *state = NULL;
 	struct nfs_server *server = NFS_SERVER(dir);
-	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_opendata *opendata;
 	int status;
 
@@ -1050,12 +1161,11 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 	if (status != 0)
 		goto err_put_state_owner;
 	if (path->dentry->d_inode != NULL)
-		nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE));
-	down_read(&clp->cl_sem);
+		nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
 	status = -ENOMEM;
-	opendata = nfs4_opendata_alloc(path, sp, flags, sattr);
+	opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr);
 	if (opendata == NULL)
-		goto err_release_rwsem;
+		goto err_put_state_owner;
 
 	if (path->dentry->d_inode != NULL)
 		opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp);
@@ -1073,13 +1183,10 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 		goto err_opendata_put;
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
-	up_read(&clp->cl_sem);
 	*res = state;
 	return 0;
err_opendata_put:
 	nfs4_opendata_put(opendata);
-err_release_rwsem:
-	up_read(&clp->cl_sem);
err_put_state_owner:
 	nfs4_put_state_owner(sp);
out_err:
@@ -1088,14 +1195,14 @@ out_err:
 }
 
 
-static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred)
+static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
 {
 	struct nfs4_exception exception = { };
 	struct nfs4_state *res;
 	int status;
 
 	do {
-		status = _nfs4_do_open(dir, path, flags, sattr, cred, &res);
+		status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res);
 		if (status == 0)
 			break;
 		/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1230,10 +1337,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			renew_lease(server, calldata->timestamp);
 			break;
 		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_OLD_STATEID:
+		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_EXPIRED:
-			break;
+			if (calldata->arg.fmode == 0)
+				break;
 		default:
-			if (nfs4_async_handle_error(task, server) == -EAGAIN) {
+			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
 				rpc_restart_call(task);
 				return;
 			}
@@ -1272,10 +1382,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	nfs_fattr_init(calldata->res.fattr);
 	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.open_flags = FMODE_READ;
+		calldata->arg.fmode = FMODE_READ;
 	} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.open_flags = FMODE_WRITE;
+		calldata->arg.fmode = FMODE_WRITE;
 	}
 	calldata->timestamp = jiffies;
 	rpc_call_start(task);
@@ -1328,6 +1438,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
 	if (calldata->arg.seqid == NULL)
 		goto out_free_calldata;
+	calldata->arg.fmode = 0;
 	calldata->arg.bitmask = server->attr_bitmask;
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
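
The close path now encodes its intent in arg.fmode: nfs4_do_close() initializes it to 0 (close everything), and nfs4_close_prepare() raises it to FMODE_READ or FMODE_WRITE when some open state must survive, switching the RPC to OPEN_DOWNGRADE. A small model of that decision, assuming only the two access bits matter:

    #include <stdio.h>

    #define FMODE_READ  0x1u
    #define FMODE_WRITE 0x2u

    /* Userspace model: choose CLOSE or OPEN_DOWNGRADE from what remains open. */
    static const char *close_op(unsigned int remaining, unsigned int *fmode)
    {
            *fmode = 0;				/* default: full CLOSE */
            if (remaining == FMODE_READ || remaining == FMODE_WRITE) {
                    *fmode = remaining;		/* downgrade to the survivor */
                    return "OPEN_DOWNGRADE";
            }
            return remaining ? "keep open" : "CLOSE";
    }

    int main(void)
    {
            unsigned int fmode;
            const char *op = close_op(FMODE_READ, &fmode);

            printf("%s (fmode=%#x)\n", op, fmode);
            return 0;
    }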
@@ -1354,13 +1465,13 @@ out:
 	return status;
 }
 
-static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state)
+static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
 	struct file *filp;
 	int ret;
 
 	/* If the open_intent is for execute, we have an extra check to make */
-	if (nd->intent.open.flags & FMODE_EXEC) {
+	if (fmode & FMODE_EXEC) {
 		ret = nfs_may_open(state->inode,
 				state->owner->so_cred,
 				nd->intent.open.flags);
@@ -1376,7 +1487,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
 	}
 	ret = PTR_ERR(filp);
out_close:
-	nfs4_close_sync(path, state, nd->intent.open.flags);
+	nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
 	return ret;
 }
 
@@ -1392,6 +1503,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
 	struct dentry *res;
+	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 
 	if (nd->flags & LOOKUP_CREATE) {
 		attr.ia_mode = nd->intent.open.create_mode;
@@ -1409,7 +1521,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
+	state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
 		if (PTR_ERR(state) == -ENOENT) {
@@ -1424,7 +1536,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	path.dentry = res;
 	nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
 	nfs_unblock_sillyrename(parent);
-	nfs4_intent_set_file(nd, &path, state);
+	nfs4_intent_set_file(nd, &path, state, fmode);
 	return res;
 }
 
@@ -1437,11 +1549,12 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 	};
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
+	fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
 
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	state = nfs4_do_open(dir, &path, openflags, NULL, cred);
+	state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
 		switch (PTR_ERR(state)) {
@@ -1458,10 +1571,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 	}
 	if (state->inode == dentry->d_inode) {
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		nfs4_intent_set_file(nd, &path, state);
+		nfs4_intent_set_file(nd, &path, state, fmode);
 		return 1;
 	}
-	nfs4_close_sync(&path, state, openflags);
+	nfs4_close_sync(&path, state, fmode);
out_drop:
 	d_drop(dentry);
 	return 0;
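
With the fmode computed once at the top of each entry point, the earlier open_prepare() hunk can ask can_open_cached() a cleaner question: does cached open state already cover this access mode, given these open flags? A hypothetical model of such a check (the kernel's real predicate also considers delegation state):

    #include <stdbool.h>
    #include <stdio.h>

    #define FMODE_READ  0x1u
    #define FMODE_WRITE 0x2u
    #define MY_O_EXCL   0x4		/* illustrative flag value, not the kernel's O_EXCL */

    /* The cached state must cover the requested mode; O_EXCL opens always
     * go to the server so it can apply exclusive-create semantics. */
    static bool can_open_cached(unsigned int cached_mode, unsigned int fmode, int open_flags)
    {
            if (open_flags & MY_O_EXCL)
                    return false;
            return (cached_mode & fmode) == fmode;
    }

    int main(void)
    {
            printf("%d\n", can_open_cached(FMODE_READ | FMODE_WRITE, FMODE_READ, 0)); /* 1 */
            printf("%d\n", can_open_cached(FMODE_READ, FMODE_WRITE, 0));              /* 0 */
            return 0;
    }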
@@ -1887,6 +2000,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	};
 	struct nfs4_state *state;
 	struct rpc_cred *cred;
+	fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
 	int status = 0;
 
 	cred = rpc_lookup_cred();
@@ -1894,7 +2008,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		status = PTR_ERR(cred);
 		goto out;
 	}
-	state = nfs4_do_open(dir, &path, flags, sattr, cred);
+	state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
 	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
@@ -1910,9 +2024,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		nfs_post_op_update_inode(state->inode, &fattr);
 	}
 	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
-		status = nfs4_intent_set_file(nd, &path, state);
+		status = nfs4_intent_set_file(nd, &path, state, fmode);
 	else
-		nfs4_close_sync(&path, state, flags);
+		nfs4_close_sync(&path, state, fmode);
out_putcred:
 	put_rpccred(cred);
out:
@@ -1974,7 +2088,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
 	struct nfs_removeres *res = task->tk_msg.rpc_resp;
 
-	if (nfs4_async_handle_error(task, res->server) == -EAGAIN)
+	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
 	nfs_post_op_update_inode(dir, &res->dir_attr);
@@ -2402,7 +2516,7 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	if (nfs4_async_handle_error(task, server) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
@@ -2423,7 +2537,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
@@ -2449,7 +2563,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
@@ -2742,19 +2856,25 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 }
 
 static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
 	struct nfs_client *clp = server->nfs_client;
 
 	if (!clp || task->tk_status >= 0)
 		return 0;
 	switch(task->tk_status) {
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OPENMODE:
+			if (state == NULL)
+				break;
+			nfs4_state_mark_reclaim_nograce(clp, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_EXPIRED:
 			rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 			nfs4_schedule_state_recovery(clp);
-			if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
+			if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 				rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
 			task->tk_status = 0;
 			return -EAGAIN;
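
The extra state argument lets the async error handler treat per-stateid revocations (ADMIN_REVOKED, BAD_STATEID, OPENMODE) like the lease-wide errors: the state is first marked for no-grace reclaim, then control deliberately falls through into the shared "park the task and wake the state manager" path. A compact model of that fallthrough, with made-up error names:

    #include <stdio.h>

    enum err { E_ADMIN_REVOKED, E_BAD_STATEID, E_STALE_CLIENTID, E_OTHER };

    static const char *handle(enum err e, int have_state)
    {
            switch (e) {
            case E_ADMIN_REVOKED:
            case E_BAD_STATEID:
                    if (!have_state)
                            break;
                    puts("mark state for nograce reclaim");
                    /* fall through into the shared recovery path */
            case E_STALE_CLIENTID:
                    puts("schedule recovery, requeue task");
                    return "-EAGAIN";
            default:
                    break;
            }
            return "0";
    }

    int main(void)
    {
            printf("result: %s\n", handle(E_BAD_STATEID, 1));
            return 0;
    }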
@@ -2772,79 +2892,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
 	return 0;
 }
 
-static int nfs4_wait_bit_killable(void *word)
-{
-	if (fatal_signal_pending(current))
-		return -ERESTARTSYS;
-	schedule();
-	return 0;
-}
-
-static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
-{
-	int res;
-
-	might_sleep();
-
-	rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_);
-
-	res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
-			nfs4_wait_bit_killable, TASK_KILLABLE);
-
-	rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_);
-	return res;
-}
-
-static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
-{
-	int res = 0;
-
-	might_sleep();
-
-	if (*timeout <= 0)
-		*timeout = NFS4_POLL_RETRY_MIN;
-	if (*timeout > NFS4_POLL_RETRY_MAX)
-		*timeout = NFS4_POLL_RETRY_MAX;
-	schedule_timeout_killable(*timeout);
-	if (fatal_signal_pending(current))
-		res = -ERESTARTSYS;
-	*timeout <<= 1;
-	return res;
-}
-
-/* This is the error handling routine for processes that are allowed
- * to sleep.
- */
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
-{
-	struct nfs_client *clp = server->nfs_client;
-	int ret = errorcode;
-
-	exception->retry = 0;
-	switch(errorcode) {
-		case 0:
-			return 0;
-		case -NFS4ERR_STALE_CLIENTID:
-		case -NFS4ERR_STALE_STATEID:
-		case -NFS4ERR_EXPIRED:
-			nfs4_schedule_state_recovery(clp);
-			ret = nfs4_wait_clnt_recover(server->client, clp);
-			if (ret == 0)
-				exception->retry = 1;
-			break;
-		case -NFS4ERR_FILE_OPEN:
-		case -NFS4ERR_GRACE:
-		case -NFS4ERR_DELAY:
-			ret = nfs4_delay(server->client, &exception->timeout);
-			if (ret != 0)
-				break;
-		case -NFS4ERR_OLD_STATEID:
-			exception->retry = 1;
-	}
-	/* We failed to handle the error */
-	return nfs4_map_errors(ret);
-}
-
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
 {
 	nfs4_verifier sc_verifier;
@@ -2916,7 +2963,6 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
 		spin_lock(&clp->cl_lock);
 		clp->cl_lease_time = fsinfo.lease_time * HZ;
 		clp->cl_last_renewal = now;
-		clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 		spin_unlock(&clp->cl_lock);
 	}
 	return status;
@@ -3074,7 +3120,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	struct nfs4_lock_state *lsp;
 	int status;
 
-	down_read(&clp->cl_sem);
 	arg.lock_owner.clientid = clp->cl_clientid;
 	status = nfs4_set_lock_state(state, request);
 	if (status != 0)
@@ -3091,7 +3136,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	}
 	request->fl_ops->fl_release_private(request);
out:
-	up_read(&clp->cl_sem);
 	return status;
 }
 
@@ -3181,11 +3225,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 					sizeof(calldata->lsp->ls_stateid.data));
 			renew_lease(calldata->server, calldata->timestamp);
 			break;
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_EXPIRED:
 			break;
 		default:
-			if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN)
+			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
 				rpc_restart_call(task);
 	}
 }
@@ -3248,6 +3294,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 
 static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
+	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_seqid *seqid;
 	struct nfs4_lock_state *lsp;
 	struct rpc_task *task;
@@ -3257,8 +3304,12 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	status = nfs4_set_lock_state(state, request);
 	/* Unlock _before_ we do the RPC call */
 	request->fl_flags |= FL_EXISTS;
-	if (do_vfs_lock(request->fl_file, request) == -ENOENT)
+	down_read(&nfsi->rwsem);
+	if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
+		up_read(&nfsi->rwsem);
 		goto out;
+	}
+	up_read(&nfsi->rwsem);
 	if (status != 0)
 		goto out;
 	/* Is this a delegated lock? */
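
Locking traffic now serializes against delegation recall on the per-inode rwsem instead of the client-wide cl_sem: the lock and unlock paths take it for reading, while recall (and the reclaim path below) takes it for writing. A userspace model of that shape using POSIX rwlocks, with do_vfs_unlock() as a hypothetical stand-in:

    #include <pthread.h>
    #include <stdio.h>

    /* One rwlock per file, not one per client: readers are lock/unlock
     * requests, the writer would be delegation recall. */
    static pthread_rwlock_t inode_rwsem = PTHREAD_RWLOCK_INITIALIZER;

    static int do_vfs_unlock(void)
    {
            return 0;	/* stand-in for do_vfs_lock() on the unlock path */
    }

    int main(void)
    {
            pthread_rwlock_rdlock(&inode_rwsem);
            int status = do_vfs_unlock();
            pthread_rwlock_unlock(&inode_rwsem);
            printf("unlock status %d\n", status);
            return 0;
    }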
@@ -3484,7 +3535,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs_client *clp = state->owner->so_client;
+	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
 	int status;
 
@@ -3496,19 +3547,13 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	status = do_vfs_lock(request->fl_file, request);
 	if (status < 0)
 		goto out;
-	down_read(&clp->cl_sem);
+	down_read(&nfsi->rwsem);
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
-		struct nfs_inode *nfsi = NFS_I(state->inode);
 		/* Yes: cache locks! */
-		down_read(&nfsi->rwsem);
 		/* ...but avoid races with delegation recall... */
-		if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
-			request->fl_flags = fl_flags & ~FL_SLEEP;
-			status = do_vfs_lock(request->fl_file, request);
-			up_read(&nfsi->rwsem);
-			goto out_unlock;
-		}
-		up_read(&nfsi->rwsem);
+		request->fl_flags = fl_flags & ~FL_SLEEP;
+		status = do_vfs_lock(request->fl_file, request);
+		goto out_unlock;
 	}
 	status = _nfs4_do_setlk(state, cmd, request, 0);
 	if (status != 0)
@@ -3518,7 +3563,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	if (do_vfs_lock(request->fl_file, request) < 0)
 		printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
out_unlock:
-	up_read(&clp->cl_sem);
+	up_read(&nfsi->rwsem);
out:
 	request->fl_flags = fl_flags;
 	return status;
@@ -3664,11 +3709,15 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 }
 
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
+	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,
 	.recover_open	= nfs4_open_reclaim,
 	.recover_lock	= nfs4_lock_reclaim,
 };
 
-struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = {
+struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = {
+	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,
 	.recover_open	= nfs4_open_expired,
 	.recover_lock	= nfs4_lock_expired,
 };
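
Each recovery-ops table now also names the owner and state flag bits that select its work, so a single generic walker can serve both the reboot and the no-grace ("network partition") variants. A minimal sketch of an ops table driving such a walker; all names here are illustrative:

    #include <stdio.h>

    struct recovery_ops {
            unsigned int state_flag_bit;
            int (*recover_open)(unsigned int *state_flags);
    };

    static int reclaim_open(unsigned int *flags) { (void)flags; return puts("reclaim"); }
    static int expire_open(unsigned int *flags) { (void)flags; return puts("expired"); }

    static const struct recovery_ops reboot_ops  = { 1u << 0, reclaim_open };
    static const struct recovery_ops nograce_ops = { 1u << 1, expire_open };

    /* Generic walker: only visit states carrying this variant's flag bit. */
    static void do_reclaim(unsigned int *state_flags, const struct recovery_ops *ops)
    {
            if (*state_flags & ops->state_flag_bit) {
                    *state_flags &= ~ops->state_flag_bit;
                    ops->recover_open(state_flags);
            }
    }

    int main(void)
    {
            unsigned int flags = (1u << 0) | (1u << 1);

            do_reclaim(&flags, &reboot_ops);	/* prints "reclaim" */
            do_reclaim(&flags, &nograce_ops);	/* prints "expired" */
            return 0;
    }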
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 3305acbbe2ae..f524e932ff7b 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -65,7 +65,6 @@ nfs4_renew_state(struct work_struct *work)
 	long lease, timeout;
 	unsigned long last, now;
 
-	down_read(&clp->cl_sem);
 	dprintk("%s: start\n", __func__);
 	/* Are there any active superblocks? */
 	if (list_empty(&clp->cl_superblocks))
@@ -77,17 +76,19 @@ nfs4_renew_state(struct work_struct *work)
 	timeout = (2 * lease) / 3 + (long)last - (long)now;
 	/* Are we close to a lease timeout? */
 	if (time_after(now, last + lease/3)) {
-		cred = nfs4_get_renew_cred(clp);
+		cred = nfs4_get_renew_cred_locked(clp);
+		spin_unlock(&clp->cl_lock);
 		if (cred == NULL) {
-			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-			spin_unlock(&clp->cl_lock);
+			if (list_empty(&clp->cl_delegations)) {
+				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+				goto out;
+			}
 			nfs_expire_all_delegations(clp);
-			goto out;
+		} else {
+			/* Queue an asynchronous RENEW. */
+			nfs4_proc_async_renew(clp, cred);
+			put_rpccred(cred);
 		}
-		spin_unlock(&clp->cl_lock);
-		/* Queue an asynchronous RENEW. */
-		nfs4_proc_async_renew(clp, cred);
-		put_rpccred(cred);
 		timeout = (2 * lease) / 3;
 		spin_lock(&clp->cl_lock);
 	} else
@@ -100,12 +101,11 @@ nfs4_renew_state(struct work_struct *work)
 	cancel_delayed_work(&clp->cl_renewd);
 	schedule_delayed_work(&clp->cl_renewd, timeout);
 	spin_unlock(&clp->cl_lock);
+	nfs_expire_unreferenced_delegations(clp);
out:
-	up_read(&clp->cl_sem);
 	dprintk("%s: done\n", __func__);
 }
 
-/* Must be called with clp->cl_sem locked for writes */
 void
nfs4_schedule_state_renewal(struct nfs_client *clp)
 {
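
The renewal heuristic above fires once more than a third of the lease has elapsed since the last renewal, and reschedules itself two thirds of a lease later. A small standalone illustration of the arithmetic, with made-up timestamps:

    #include <stdio.h>

    int main(void)
    {
            long lease = 90;		/* seconds, e.g. from FSINFO */
            long last = 100, now = 145;	/* timestamps */
            long timeout = (2 * lease) / 3 + last - now;

            if (now > last + lease / 3)
                    printf("renew now, recheck in %ld s\n", (2 * lease) / 3);
            else
                    printf("recheck in %ld s\n", timeout);
            return 0;
    }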
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 401ef8b28f97..2022fe47966f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -71,14 +71,12 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 	return status;
 }
 
-static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp)
+static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
 {
 	struct rpc_cred *cred = NULL;
 
-	spin_lock(&clp->cl_lock);
 	if (clp->cl_machine_cred != NULL)
 		cred = get_rpccred(clp->cl_machine_cred);
-	spin_unlock(&clp->cl_lock);
 	return cred;
 }
 
@@ -94,7 +92,7 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
 	put_rpccred(cred);
 }
 
-struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
@@ -110,13 +108,24 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
 	return cred;
 }
 
+static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+
+	spin_lock(&clp->cl_lock);
+	cred = nfs4_get_renew_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	return cred;
+}
+
 static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct rpc_cred *cred;
 
-	cred = nfs4_get_machine_cred(clp);
+	spin_lock(&clp->cl_lock);
+	cred = nfs4_get_machine_cred_locked(clp);
 	if (cred != NULL)
 		goto out;
 	pos = rb_first(&clp->cl_state_owners);
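
The credential helpers follow the common kernel _locked convention: the _locked variant assumes the caller already holds cl_lock, and a thin wrapper takes the lock for standalone callers such as the renew daemon. A minimal model of the split:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
    static int cred_store = 42;

    /* Caller must hold cl_lock. */
    static int get_cred_locked(void)
    {
            return cred_store;
    }

    /* Wrapper for callers that do not already hold the lock. */
    static int get_cred(void)
    {
            pthread_mutex_lock(&cl_lock);
            int cred = get_cred_locked();
            pthread_mutex_unlock(&cl_lock);
            return cred;
    }

    int main(void)
    {
            printf("cred %d\n", get_cred());
            return 0;
    }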
@@ -125,6 +134,7 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 		cred = get_rpccred(sp->so_cred);
 	}
out:
+	spin_unlock(&clp->cl_lock);
 	return cred;
 }
 
@@ -295,10 +305,6 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 	}
 }
 
-/*
- * Note: must be called with clp->cl_sem held in order to prevent races
- * with reboot recovery!
- */
 struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 {
 	struct nfs_client *clp = server->nfs_client;
@@ -327,10 +333,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	return sp;
 }
 
-/*
- * Must be called with clp->cl_sem held in order to avoid races
- * with state recovery...
- */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
 	struct nfs_client *clp = sp->so_client;
@@ -361,18 +363,18 @@ nfs4_alloc_open_state(void)
 }
 
 void
-nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode)
+nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
 {
-	if (state->state == mode)
+	if (state->state == fmode)
 		return;
 	/* NB! List reordering - see the reclaim code for why. */
-	if ((mode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
-		if (mode & FMODE_WRITE)
+	if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
+		if (fmode & FMODE_WRITE)
 			list_move(&state->open_states, &state->owner->so_states);
 		else
 			list_move_tail(&state->open_states, &state->owner->so_states);
 	}
-	state->state = mode;
+	state->state = fmode;
 }
 
 static struct nfs4_state *
@@ -432,10 +434,6 @@ out:
 	return state;
 }
 
-/*
- * Beware! Caller must be holding exactly one
- * reference to clp->cl_sem!
- */
 void nfs4_put_open_state(struct nfs4_state *state)
 {
 	struct inode *inode = state->inode;
@@ -456,16 +454,16 @@ void nfs4_put_open_state(struct nfs4_state *state)
 /*
  * Close the current file.
  */
-static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mode, int wait)
+static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
 {
 	struct nfs4_state_owner *owner = state->owner;
 	int call_close = 0;
-	int newstate;
+	fmode_t newstate;
 
 	atomic_inc(&owner->so_count);
 	/* Protect against nfs4_find_state() */
 	spin_lock(&owner->so_lock);
-	switch (mode & (FMODE_READ | FMODE_WRITE)) {
+	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
 		case FMODE_READ:
 			state->n_rdonly--;
 			break;
@@ -500,14 +498,14 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mod
 	nfs4_do_close(path, state, wait);
 }
 
-void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode)
+void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, mode, 0);
+	__nfs4_close(path, state, fmode, 0);
 }
 
-void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode)
+void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, mode, 1);
+	__nfs4_close(path, state, fmode, 1);
 }
 
 /*
@@ -568,7 +566,6 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
  *
- * The caller must be holding clp->cl_sem
  */
static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
 {
@@ -770,32 +767,34 @@ unlock:
 	return status;
 }
 
-static int reclaimer(void *);
+static int nfs4_run_state_manager(void *);
 
-static inline void nfs4_clear_recover_bit(struct nfs_client *clp)
+static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
 {
 	smp_mb__before_clear_bit();
-	clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state);
+	clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
 	smp_mb__after_clear_bit();
-	wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER);
+	wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
 	rpc_wake_up(&clp->cl_rpcwaitq);
 }
 
 /*
- * State recovery routine
+ * Schedule the nfs_client asynchronous state management routine
 */
-static void nfs4_recover_state(struct nfs_client *clp)
+void nfs4_schedule_state_manager(struct nfs_client *clp)
 {
 	struct task_struct *task;
 
+	if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+		return;
 	__module_get(THIS_MODULE);
 	atomic_inc(&clp->cl_count);
-	task = kthread_run(reclaimer, clp, "%s-reclaim",
+	task = kthread_run(nfs4_run_state_manager, clp, "%s-manager",
 				rpc_peeraddr2str(clp->cl_rpcclient,
 							RPC_DISPLAY_ADDR));
 	if (!IS_ERR(task))
 		return;
-	nfs4_clear_recover_bit(clp);
+	nfs4_clear_state_manager_bit(clp);
 	nfs_put_client(clp);
 	module_put(THIS_MODULE);
 }
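
The scheduling contract here is the classic singleton-worker pattern: whoever wins the test-and-set on MANAGER_RUNNING spawns the one manager thread; everyone else merely sets work bits and returns, trusting the running manager to pick them up. A compact userspace model:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_flag manager_running = ATOMIC_FLAG_INIT;

    static void schedule_state_manager(void)
    {
            if (atomic_flag_test_and_set(&manager_running))
                    return;		/* a manager is already running */
            puts("spawning state manager thread");
            /* the kthread_run(...) equivalent would go here */
    }

    int main(void)
    {
            schedule_state_manager();	/* spawns */
            schedule_state_manager();	/* no-op */
            return 0;
    }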
@@ -807,16 +806,42 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
 {
 	if (!clp)
 		return;
-	if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
-		nfs4_recover_state(clp);
+	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
 }
 
-static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+{
+
+	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+	/* Don't recover state that expired before the reboot */
+	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
+		clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+		return 0;
+	}
+	set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags);
+	set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+	return 1;
+}
+
+int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+{
+	set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+	clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+	set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
+	set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+	return 1;
+}
+
+static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
 {
 	struct inode *inode = state->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
 	struct file_lock *fl;
 	int status = 0;
 
+	down_write(&nfsi->rwsem);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
@@ -858,28 +885,34 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
 	 * recovering after a network partition or a reboot from a
 	 * server that doesn't support a grace period.
 	 */
+restart:
+	spin_lock(&sp->so_lock);
 	list_for_each_entry(state, &sp->so_states, open_states) {
+		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
+			continue;
 		if (state->state == 0)
 			continue;
+		atomic_inc(&state->count);
+		spin_unlock(&sp->so_lock);
 		status = ops->recover_open(sp, state);
 		if (status >= 0) {
-			status = nfs4_reclaim_locks(ops, state);
-			if (status < 0)
-				goto out_err;
-			list_for_each_entry(lock, &state->lock_states, ls_locks) {
-				if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
-					printk("%s: Lock reclaim failed!\n",
+			status = nfs4_reclaim_locks(state, ops);
+			if (status >= 0) {
+				list_for_each_entry(lock, &state->lock_states, ls_locks) {
+					if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
+						printk("%s: Lock reclaim failed!\n",
 							__func__);
+				}
+				nfs4_put_open_state(state);
+				goto restart;
 			}
-			continue;
 		}
 		switch (status) {
 			default:
 				printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
 						__func__, status);
 			case -ENOENT:
-			case -NFS4ERR_RECLAIM_BAD:
-			case -NFS4ERR_RECLAIM_CONFLICT:
+			case -ESTALE:
 				/*
 				 * Open state on this file cannot be recovered
 				 * All we can do is revert to using the zero stateid.
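
The reclaim walker now pins each state with a reference, drops the owner lock for the slow recovery RPCs, and then rescans the list from the top, since the list may have changed while the lock was released; the per-state flag bit guarantees each entry is visited once. A small model of that restart pattern:

    #include <stdio.h>

    #define NEEDS_WORK 0x1u

    int main(void)
    {
            unsigned int items[] = { NEEDS_WORK, 0, NEEDS_WORK };
            int n = 3, i;

    restart:
            /* the spin lock would be taken here */
            for (i = 0; i < n; i++) {
                    if (!(items[i] & NEEDS_WORK))
                            continue;
                    items[i] &= ~NEEDS_WORK;
                    /* unlock, do the slow recovery step, then rescan */
                    printf("recovered item %d\n", i);
                    goto restart;
            }
            /* unlock; nothing flagged remains */
            return 0;
    }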
@@ -889,84 +922,176 @@
 				/* Mark the file as being 'closed' */
 				state->state = 0;
 				break;
+			case -NFS4ERR_RECLAIM_BAD:
+			case -NFS4ERR_RECLAIM_CONFLICT:
+				nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+				break;
 			case -NFS4ERR_EXPIRED:
 			case -NFS4ERR_NO_GRACE:
+				nfs4_state_mark_reclaim_nograce(sp->so_client, state);
 			case -NFS4ERR_STALE_CLIENTID:
 				goto out_err;
 		}
+		nfs4_put_open_state(state);
+		goto restart;
 	}
+	spin_unlock(&sp->so_lock);
 	return 0;
out_err:
+	nfs4_put_open_state(state);
 	return status;
 }
 
-static void nfs4_state_mark_reclaim(struct nfs_client *clp)
+static void nfs4_clear_open_state(struct nfs4_state *state)
+{
+	struct nfs4_lock_state *lock;
+
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+	clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+	clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	list_for_each_entry(lock, &state->lock_states, ls_locks) {
+		lock->ls_seqid.flags = 0;
+		lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
+	}
+}
+
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct nfs4_state *state;
-	struct nfs4_lock_state *lock;
 
 	/* Reset all sequence ids to zero */
 	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
-		sp->so_seqid.counter = 0;
 		sp->so_seqid.flags = 0;
 		spin_lock(&sp->so_lock);
 		list_for_each_entry(state, &sp->so_states, open_states) {
-			clear_bit(NFS_DELEGATED_STATE, &state->flags);
-			clear_bit(NFS_O_RDONLY_STATE, &state->flags);
-			clear_bit(NFS_O_WRONLY_STATE, &state->flags);
-			clear_bit(NFS_O_RDWR_STATE, &state->flags);
-			list_for_each_entry(lock, &state->lock_states, ls_locks) {
-				lock->ls_seqid.counter = 0;
-				lock->ls_seqid.flags = 0;
-				lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
-			}
+			if (mark_reclaim(clp, state))
+				nfs4_clear_open_state(state);
 		}
 		spin_unlock(&sp->so_lock);
 	}
 }
 
-static int reclaimer(void *ptr)
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
+{
+	/* Mark all delegations for reclaim */
+	nfs_delegation_mark_reclaim(clp);
+	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 {
-	struct nfs_client *clp = ptr;
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
-	struct nfs4_state_recovery_ops *ops;
-	struct rpc_cred *cred;
+	struct nfs4_state *state;
+
+	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+		return;
+
+	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+		spin_lock(&sp->so_lock);
+		list_for_each_entry(state, &sp->so_states, open_states) {
+			if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags))
+				continue;
+			nfs4_state_mark_reclaim_nograce(clp, state);
+		}
+		spin_unlock(&sp->so_lock);
+	}
+
+	nfs_delegation_reap_unclaimed(clp);
+}
+
+static void nfs_delegation_clear_all(struct nfs_client *clp)
+{
+	nfs_delegation_mark_reclaim(clp);
+	nfs_delegation_reap_unclaimed(clp);
+}
+
+static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
+{
+	nfs_delegation_clear_all(clp);
+	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+}
+
+static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp)
+{
+	clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+}
+
+static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
+{
+	switch (error) {
+		case -NFS4ERR_CB_PATH_DOWN:
+			nfs_handle_cb_pathdown(clp);
+			break;
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_LEASE_MOVED:
+			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+			nfs4_state_start_reclaim_reboot(clp);
+			break;
+		case -NFS4ERR_EXPIRED:
+			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+			nfs4_state_start_reclaim_nograce(clp);
+	}
+}
+
+static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
+{
+	struct rb_node *pos;
 	int status = 0;
 
-	allow_signal(SIGKILL);
+restart:
+	spin_lock(&clp->cl_lock);
+	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+		struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+		if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags))
+			continue;
+		atomic_inc(&sp->so_count);
+		spin_unlock(&clp->cl_lock);
+		status = nfs4_reclaim_open_state(sp, ops);
+		if (status < 0) {
+			set_bit(ops->owner_flag_bit, &sp->so_flags);
+			nfs4_put_state_owner(sp);
+			nfs4_recovery_handle_error(clp, status);
+			return status;
+		}
+		nfs4_put_state_owner(sp);
+		goto restart;
+	}
+	spin_unlock(&clp->cl_lock);
+	return status;
+}
 
-	/* Ensure exclusive access to NFSv4 state */
-	down_write(&clp->cl_sem);
-	/* Are there any NFS mounts out there? */
-	if (list_empty(&clp->cl_superblocks))
-		goto out;
-restart_loop:
-	ops = &nfs4_network_partition_recovery_ops;
-	/* Are there any open files on this volume? */
+static int nfs4_check_lease(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int status = -NFS4ERR_EXPIRED;
+
+	/* Is the client already known to have an expired lease? */
+	if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+		return 0;
 	cred = nfs4_get_renew_cred(clp);
-	if (cred != NULL) {
-		/* Yes there are: try to renew the old lease */
-		status = nfs4_proc_renew(clp, cred);
-		put_rpccred(cred);
-		switch (status) {
-			case 0:
-			case -NFS4ERR_CB_PATH_DOWN:
-				goto out;
-			case -NFS4ERR_STALE_CLIENTID:
-			case -NFS4ERR_LEASE_MOVED:
-				ops = &nfs4_reboot_recovery_ops;
-		}
-	} else {
-		/* "reboot" to ensure we clear all state on the server */
-		clp->cl_boot_time = CURRENT_TIME;
+	if (cred == NULL) {
+		cred = nfs4_get_setclientid_cred(clp);
+		if (cred == NULL)
+			goto out;
 	}
-	/* We're going to have to re-establish a clientid */
-	nfs4_state_mark_reclaim(clp);
-	status = -ENOENT;
+	status = nfs4_proc_renew(clp, cred);
+	put_rpccred(cred);
out:
+	nfs4_recovery_handle_error(clp, status);
+	return status;
+}
+
+static int nfs4_reclaim_lease(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int status = -ENOENT;
+
 	cred = nfs4_get_setclientid_cred(clp);
 	if (cred != NULL) {
 		status = nfs4_init_client(clp, cred);
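
nfs4_check_lease() replaces the old inline renewal probe: it issues a RENEW-like call with whatever credential it can find and funnels the result through one shared handler that translates protocol errors into recovery work bits. A minimal model of that probe-and-dispatch shape; proc_renew() is a stand-in, while NFS4ERR_STALE_CLIENTID's value is the real protocol constant used illustratively:

    #include <stdio.h>

    #define NFS4ERR_STALE_CLIENTID 10022

    static int proc_renew(void) { return -NFS4ERR_STALE_CLIENTID; }

    static void recovery_handle_error(int error)
    {
            if (error == -NFS4ERR_STALE_CLIENTID)
                    puts("lease lost: set LEASE_EXPIRED, start reboot reclaim");
    }

    int main(void)
    {
            int status = proc_renew();
            recovery_handle_error(status);
            return status ? 1 : 0;
    }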
@@ -974,42 +1099,90 @@ restart_loop:
 		/* Handle case where the user hasn't set up machine creds */
 		if (status == -EACCES && cred == clp->cl_machine_cred) {
 			nfs4_clear_machine_cred(clp);
-			goto restart_loop;
+			status = -EAGAIN;
 		}
 	}
-	if (status)
-		goto out_error;
-	/* Mark all delegations for reclaim */
-	nfs_delegation_mark_reclaim(clp);
-	/* Note: list is protected by exclusive lock on cl->cl_sem */
-	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
-		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
-		status = nfs4_reclaim_open_state(ops, sp);
-		if (status < 0) {
-			if (status == -NFS4ERR_NO_GRACE) {
-				ops = &nfs4_network_partition_recovery_ops;
-				status = nfs4_reclaim_open_state(ops, sp);
+	return status;
+}
+
+static void nfs4_state_manager(struct nfs_client *clp)
+{
+	int status = 0;
+
+	/* Ensure exclusive access to NFSv4 state */
+	for(;;) {
+		if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
+			/* We're going to have to re-establish a clientid */
+			status = nfs4_reclaim_lease(clp);
+			if (status) {
+				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+				if (status == -EAGAIN)
+					continue;
+				goto out_error;
 			}
+			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
+			status = nfs4_check_lease(clp);
+			if (status != 0)
+				continue;
+		}
+
+		/* First recover reboot state... */
+		if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
+			status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops);
 			if (status == -NFS4ERR_STALE_CLIENTID)
-				goto restart_loop;
-			if (status == -NFS4ERR_EXPIRED)
-				goto restart_loop;
+				continue;
+			nfs4_state_end_reclaim_reboot(clp);
+			continue;
+		}
+
+		/* Now recover expired state... */
+		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
+			status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops);
+			if (status < 0) {
+				set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+				if (status == -NFS4ERR_STALE_CLIENTID)
+					continue;
+				if (status == -NFS4ERR_EXPIRED)
+					continue;
+				goto out_error;
+			} else
+				nfs4_state_end_reclaim_nograce(clp);
+			continue;
 		}
+
+		if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
+			nfs_client_return_marked_delegations(clp);
+			continue;
+		}
+
+		nfs4_clear_state_manager_bit(clp);
+		/* Did we race with an attempt to give us more work? */
+		if (clp->cl_state == 0)
+			break;
+		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+			break;
 	}
-	nfs_delegation_reap_unclaimed(clp);
-out:
-	up_write(&clp->cl_sem);
-	if (status == -NFS4ERR_CB_PATH_DOWN)
-		nfs_handle_cb_pathdown(clp);
-	nfs4_clear_recover_bit(clp);
+	return;
+out_error:
+	printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
+			" with error %d\n", clp->cl_hostname, -status);
+	if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+		nfs4_state_end_reclaim_reboot(clp);
+	nfs4_clear_state_manager_bit(clp);
+}
+
+static int nfs4_run_state_manager(void *ptr)
+{
+	struct nfs_client *clp = ptr;
+
+	allow_signal(SIGKILL);
+	nfs4_state_manager(clp);
 	nfs_put_client(clp);
 	module_put_and_exit(0);
 	return 0;
-out_error:
-	printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
-			" with error %d\n", clp->cl_hostname, -status);
-	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-	goto out;
 }
 
 /*
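
The monolithic reclaimer thus becomes a state manager loop: each pass consumes at most one work bit, restarts from the top after every step (so lease recovery always precedes reboot reclaim, which precedes no-grace reclaim), and exits only when no bits remain, rechecking once after clearing its running bit to close the race with late work. A userspace model of the dispatch order:

    #include <stdio.h>

    enum {
            LEASE_EXPIRED   = 1 << 0,
            CHECK_LEASE     = 1 << 1,
            RECLAIM_REBOOT  = 1 << 2,
            RECLAIM_NOGRACE = 1 << 3,
    };

    static unsigned int cl_state = LEASE_EXPIRED | RECLAIM_NOGRACE;

    int main(void)
    {
            for (;;) {
                    if (cl_state & LEASE_EXPIRED) {
                            cl_state &= ~(LEASE_EXPIRED | CHECK_LEASE);
                            puts("re-establish clientid; queue reboot reclaim");
                            cl_state |= RECLAIM_REBOOT;
                            continue;
                    }
                    if (cl_state & RECLAIM_REBOOT) {
                            cl_state &= ~RECLAIM_REBOOT;
                            puts("reclaim reboot state");
                            continue;
                    }
                    if (cl_state & RECLAIM_NOGRACE) {
                            cl_state &= ~RECLAIM_NOGRACE;
                            puts("reclaim expired state");
                            continue;
                    }
                    if (cl_state == 0)
                            break;	/* raced-in work would loop again */
            }
            return 0;
    }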
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b916297d2334..d1e4c8f8a0a9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -8,7 +8,7 @@
  *
  *  Kendrick Smith <kmsmith@umich.edu>
  *  Andy Adamson   <andros@umich.edu>
  *
  *  Redistribution and use in source and binary forms, with or without
  *  modification, are permitted provided that the following conditions
  *  are met:
@@ -67,7 +67,7 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_MAXTAGLEN		0
 #endif
 
 /* lock,open owner id:
  * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
  */
 #define open_owner_id_maxsz	(1 + 4)
@@ -541,6 +541,7 @@ static struct {
 struct compound_hdr {
 	int32_t		status;
 	uint32_t	nops;
+	__be32 *	nops_p;
 	uint32_t	taglen;
 	char *		tag;
 };
@@ -578,7 +579,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
578 xdr_encode_opaque(p, str, len); 579 xdr_encode_opaque(p, str, len);
579} 580}
580 581
581static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 582static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
582{ 583{
583 __be32 *p; 584 __be32 *p;
584 585
@@ -588,8 +589,13 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
588 WRITE32(hdr->taglen); 589 WRITE32(hdr->taglen);
589 WRITEMEM(hdr->tag, hdr->taglen); 590 WRITEMEM(hdr->tag, hdr->taglen);
590 WRITE32(NFS4_MINOR_VERSION); 591 WRITE32(NFS4_MINOR_VERSION);
592 hdr->nops_p = p;
591 WRITE32(hdr->nops); 593 WRITE32(hdr->nops);
592 return 0; 594}
595
596static void encode_nops(struct compound_hdr *hdr)
597{
598 *hdr->nops_p = htonl(hdr->nops);
593} 599}
594 600
595static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) 601static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
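The new nops_p field exists so the COMPOUND header can be emitted before the operation count is known: encode_compound_hdr() writes a placeholder and remembers its address, each encode_* helper now bumps hdr->nops as it emits an operation, and encode_nops() backfills the real count at the end. A standalone toy model of that backfill (the flat buffer here is invented for illustration; the real code writes through an xdr_stream):

#include <arpa/inet.h>	/* htonl(), ntohl() */
#include <stdint.h>
#include <stdio.h>

struct toy_hdr {
	uint32_t nops;		/* running operation count */
	uint32_t *nops_p;	/* where the placeholder word lives */
};

static uint32_t buf[16];
static uint32_t *next_word = buf;

static void encode_hdr(struct toy_hdr *hdr)
{
	hdr->nops_p = next_word;		/* remember the slot...   */
	*next_word++ = htonl(hdr->nops);	/* ...and write 0 for now */
}

static void encode_op(struct toy_hdr *hdr, uint32_t opcode)
{
	*next_word++ = htonl(opcode);
	hdr->nops++;				/* count as we emit */
}

static void encode_nops(struct toy_hdr *hdr)
{
	*hdr->nops_p = htonl(hdr->nops);	/* backfill the real count */
}

int main(void)
{
	struct toy_hdr hdr = { .nops = 0 };

	encode_hdr(&hdr);
	encode_op(&hdr, 22);	/* OP_PUTFH */
	encode_op(&hdr, 9);	/* OP_GETATTR */
	encode_nops(&hdr);
	printf("nops on the wire: %u\n", ntohl(buf[0]));	/* prints 2 */
	return 0;
}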
@@ -601,7 +607,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
601 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); 607 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE);
602} 608}
603 609
604static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) 610static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
605{ 611{
606 char owner_name[IDMAP_NAMESZ]; 612 char owner_name[IDMAP_NAMESZ];
607 char owner_group[IDMAP_NAMESZ]; 613 char owner_group[IDMAP_NAMESZ];
@@ -612,7 +618,6 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
612 int len; 618 int len;
613 uint32_t bmval0 = 0; 619 uint32_t bmval0 = 0;
614 uint32_t bmval1 = 0; 620 uint32_t bmval1 = 0;
615 int status;
616 621
617 /* 622 /*
618 * We reserve enough space to write the entire attribute buffer at once. 623 * We reserve enough space to write the entire attribute buffer at once.
@@ -709,7 +714,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
709 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 714 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
710 WRITE32(NFS4_SET_TO_SERVER_TIME); 715 WRITE32(NFS4_SET_TO_SERVER_TIME);
711 } 716 }
712 717
713 /* 718 /*
714 * Now we backfill the bitmap and the attribute buffer length. 719 * Now we backfill the bitmap and the attribute buffer length.
715 */ 720 */
@@ -723,23 +728,20 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
723 *q++ = htonl(bmval1); 728 *q++ = htonl(bmval1);
724 *q++ = htonl(len); 729 *q++ = htonl(len);
725 730
726 status = 0;
727/* out: */ 731/* out: */
728 return status;
729} 732}
730 733
731static int encode_access(struct xdr_stream *xdr, u32 access) 734static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
732{ 735{
733 __be32 *p; 736 __be32 *p;
734 737
735 RESERVE_SPACE(8); 738 RESERVE_SPACE(8);
736 WRITE32(OP_ACCESS); 739 WRITE32(OP_ACCESS);
737 WRITE32(access); 740 WRITE32(access);
738 741 hdr->nops++;
739 return 0;
740} 742}
741 743
742static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 744static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
743{ 745{
744 __be32 *p; 746 __be32 *p;
745 747
@@ -747,26 +749,24 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
747 WRITE32(OP_CLOSE); 749 WRITE32(OP_CLOSE);
748 WRITE32(arg->seqid->sequence->counter); 750 WRITE32(arg->seqid->sequence->counter);
749 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 751 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
750 752 hdr->nops++;
751 return 0;
752} 753}
753 754
754static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args) 755static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
755{ 756{
756 __be32 *p; 757 __be32 *p;
 757 758
 758 RESERVE_SPACE(16); 759 RESERVE_SPACE(16);
 759 WRITE32(OP_COMMIT); 760 WRITE32(OP_COMMIT);
 760 WRITE64(args->offset); 761 WRITE64(args->offset);
 761 WRITE32(args->count); 762 WRITE32(args->count);
 762
 763 return 0; 763 hdr->nops++;
764} 764}
765 765
766static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create) 766static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
767{ 767{
768 __be32 *p; 768 __be32 *p;
769 769
770 RESERVE_SPACE(8); 770 RESERVE_SPACE(8);
771 WRITE32(OP_CREATE); 771 WRITE32(OP_CREATE);
772 WRITE32(create->ftype); 772 WRITE32(create->ftype);
@@ -791,64 +791,62 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
791 RESERVE_SPACE(4 + create->name->len); 791 RESERVE_SPACE(4 + create->name->len);
792 WRITE32(create->name->len); 792 WRITE32(create->name->len);
793 WRITEMEM(create->name->name, create->name->len); 793 WRITEMEM(create->name->name, create->name->len);
794 hdr->nops++;
794 795
795 return encode_attrs(xdr, create->attrs, create->server); 796 encode_attrs(xdr, create->attrs, create->server);
796} 797}
797 798
798static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap) 799static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
799{ 800{
800 __be32 *p; 801 __be32 *p;
801 802
802 RESERVE_SPACE(12); 803 RESERVE_SPACE(12);
803 WRITE32(OP_GETATTR); 804 WRITE32(OP_GETATTR);
804 WRITE32(1); 805 WRITE32(1);
805 WRITE32(bitmap); 806 WRITE32(bitmap);
806 return 0; 807 hdr->nops++;
807} 808}
808 809
809static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1) 810static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
810{ 811{
811 __be32 *p; 812 __be32 *p;
812 813
813 RESERVE_SPACE(16); 814 RESERVE_SPACE(16);
814 WRITE32(OP_GETATTR); 815 WRITE32(OP_GETATTR);
815 WRITE32(2); 816 WRITE32(2);
816 WRITE32(bm0); 817 WRITE32(bm0);
817 WRITE32(bm1); 818 WRITE32(bm1);
818 return 0; 819 hdr->nops++;
819} 820}
820 821
821static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask) 822static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
822{ 823{
823 return encode_getattr_two(xdr, 824 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
824 bitmask[0] & nfs4_fattr_bitmap[0], 825 bitmask[1] & nfs4_fattr_bitmap[1], hdr);
825 bitmask[1] & nfs4_fattr_bitmap[1]);
826} 826}
827 827
828static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask) 828static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
829{ 829{
830 return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 830 encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
831 bitmask[1] & nfs4_fsinfo_bitmap[1]); 831 bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
832} 832}
833 833
834static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask) 834static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
835{ 835{
836 return encode_getattr_two(xdr, 836 encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0],
837 bitmask[0] & nfs4_fs_locations_bitmap[0], 837 bitmask[1] & nfs4_fs_locations_bitmap[1], hdr);
838 bitmask[1] & nfs4_fs_locations_bitmap[1]);
839} 838}
840 839
841static int encode_getfh(struct xdr_stream *xdr) 840static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
842{ 841{
843 __be32 *p; 842 __be32 *p;
844 843
845 RESERVE_SPACE(4); 844 RESERVE_SPACE(4);
846 WRITE32(OP_GETFH); 845 WRITE32(OP_GETFH);
847 846 hdr->nops++;
848 return 0;
849} 847}
850 848
851static int encode_link(struct xdr_stream *xdr, const struct qstr *name) 849static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
852{ 850{
853 __be32 *p; 851 __be32 *p;
854 852
@@ -856,8 +854,7 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
856 WRITE32(OP_LINK); 854 WRITE32(OP_LINK);
857 WRITE32(name->len); 855 WRITE32(name->len);
858 WRITEMEM(name->name, name->len); 856 WRITEMEM(name->name, name->len);
859 857 hdr->nops++;
860 return 0;
861} 858}
862 859
863static inline int nfs4_lock_type(struct file_lock *fl, int block) 860static inline int nfs4_lock_type(struct file_lock *fl, int block)
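encode_getfattr(), encode_fsinfo() and encode_fs_locations() above are thin wrappers that intersect the caller's bitmask with a per-request-type supported set before delegating to encode_getattr_two(). A hedged sketch of that masking (the bitmap values are made up; the real nfs4_fattr_bitmap[] constants differ):

#include <stdint.h>
#include <stdio.h>

/* Invented attribute masks; the real FATTR4_WORD* bits differ. */
static const uint32_t fattr_bitmap[2] = { 0x0018091au, 0x00b0a23au };

/* What encode_getattr_two() ultimately emits: a two-word bitmap. */
static void getattr_two(uint32_t bm0, uint32_t bm1)
{
	printf("GETATTR bitmap: %08x:%08x\n", bm0, bm1);
}

/* The wrapper pattern: request only attributes that are both asked
 * for by the caller and meaningful for this request type. */
static void getfattr(const uint32_t *bitmask)
{
	getattr_two(bitmask[0] & fattr_bitmap[0],
		    bitmask[1] & fattr_bitmap[1]);
}

int main(void)
{
	const uint32_t ask_everything[2] = { 0xffffffffu, 0xffffffffu };

	getfattr(ask_everything);	/* only supported bits survive */
	return 0;
}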
@@ -878,7 +875,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
878 * opcode,type,reclaim,offset,length,new_lock_owner = 32 875 * opcode,type,reclaim,offset,length,new_lock_owner = 32
879 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 876 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
880 */ 877 */
881static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) 878static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr)
882{ 879{
883 __be32 *p; 880 __be32 *p;
884 881
@@ -904,11 +901,10 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
904 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); 901 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
905 WRITE32(args->lock_seqid->sequence->counter); 902 WRITE32(args->lock_seqid->sequence->counter);
906 } 903 }
907 904 hdr->nops++;
908 return 0;
909} 905}
910 906
911static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) 907static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
912{ 908{
913 __be32 *p; 909 __be32 *p;
914 910
@@ -921,11 +917,10 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
921 WRITE32(16); 917 WRITE32(16);
922 WRITEMEM("lock id:", 8); 918 WRITEMEM("lock id:", 8);
923 WRITE64(args->lock_owner.id); 919 WRITE64(args->lock_owner.id);
924 920 hdr->nops++;
925 return 0;
926} 921}
927 922
928static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) 923static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
929{ 924{
930 __be32 *p; 925 __be32 *p;
931 926
@@ -936,11 +931,10 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
936 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); 931 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
937 WRITE64(args->fl->fl_start); 932 WRITE64(args->fl->fl_start);
938 WRITE64(nfs4_lock_length(args->fl)); 933 WRITE64(nfs4_lock_length(args->fl));
939 934 hdr->nops++;
940 return 0;
941} 935}
942 936
943static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) 937static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
944{ 938{
945 int len = name->len; 939 int len = name->len;
946 __be32 *p; 940 __be32 *p;
@@ -949,27 +943,26 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
949 WRITE32(OP_LOOKUP); 943 WRITE32(OP_LOOKUP);
950 WRITE32(len); 944 WRITE32(len);
951 WRITEMEM(name->name, len); 945 WRITEMEM(name->name, len);
952 946 hdr->nops++;
953 return 0;
954} 947}
955 948
956static void encode_share_access(struct xdr_stream *xdr, int open_flags) 949static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
957{ 950{
958 __be32 *p; 951 __be32 *p;
959 952
960 RESERVE_SPACE(8); 953 RESERVE_SPACE(8);
961 switch (open_flags & (FMODE_READ|FMODE_WRITE)) { 954 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
962 case FMODE_READ: 955 case FMODE_READ:
963 WRITE32(NFS4_SHARE_ACCESS_READ); 956 WRITE32(NFS4_SHARE_ACCESS_READ);
964 break; 957 break;
965 case FMODE_WRITE: 958 case FMODE_WRITE:
966 WRITE32(NFS4_SHARE_ACCESS_WRITE); 959 WRITE32(NFS4_SHARE_ACCESS_WRITE);
967 break; 960 break;
968 case FMODE_READ|FMODE_WRITE: 961 case FMODE_READ|FMODE_WRITE:
969 WRITE32(NFS4_SHARE_ACCESS_BOTH); 962 WRITE32(NFS4_SHARE_ACCESS_BOTH);
970 break; 963 break;
971 default: 964 default:
972 BUG(); 965 WRITE32(0);
973 } 966 }
974 WRITE32(0); /* for linux, share_deny = 0 always */ 967 WRITE32(0); /* for linux, share_deny = 0 always */
975} 968}
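Two things change in encode_share_access() above: it now takes an fmode_t, and an impossible mode encodes 0 instead of taking down the client with BUG(). A userspace rendering of the same mapping (the NFS4_SHARE_ACCESS_* values are RFC 3530's share_access constants; the FMODE_* values mirror Linux's fmode_t bits):

#include <stdint.h>
#include <stdio.h>

#define FMODE_READ		0x1u	/* mirrors Linux's fmode_t bits */
#define FMODE_WRITE		0x2u
#define NFS4_SHARE_ACCESS_READ	1u	/* RFC 3530 share_access values */
#define NFS4_SHARE_ACCESS_WRITE	2u
#define NFS4_SHARE_ACCESS_BOTH	3u

static uint32_t share_access(uint32_t fmode)
{
	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
		return NFS4_SHARE_ACCESS_READ;
	case FMODE_WRITE:
		return NFS4_SHARE_ACCESS_WRITE;
	case FMODE_READ | FMODE_WRITE:
		return NFS4_SHARE_ACCESS_BOTH;
	default:
		return 0;	/* as in the diff: encode 0, don't BUG() */
	}
}

int main(void)
{
	printf("%u %u %u %u\n",
	       share_access(FMODE_READ),
	       share_access(FMODE_WRITE),
	       share_access(FMODE_READ | FMODE_WRITE),
	       share_access(0));	/* 1 2 3 0 */
	return 0;
}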
@@ -984,7 +977,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
984 RESERVE_SPACE(8); 977 RESERVE_SPACE(8);
985 WRITE32(OP_OPEN); 978 WRITE32(OP_OPEN);
986 WRITE32(arg->seqid->sequence->counter); 979 WRITE32(arg->seqid->sequence->counter);
987 encode_share_access(xdr, arg->open_flags); 980 encode_share_access(xdr, arg->fmode);
988 RESERVE_SPACE(28); 981 RESERVE_SPACE(28);
989 WRITE64(arg->clientid); 982 WRITE64(arg->clientid);
990 WRITE32(16); 983 WRITE32(16);
@@ -998,13 +991,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
998 991
999 RESERVE_SPACE(4); 992 RESERVE_SPACE(4);
1000 switch(arg->open_flags & O_EXCL) { 993 switch(arg->open_flags & O_EXCL) {
1001 case 0: 994 case 0:
1002 WRITE32(NFS4_CREATE_UNCHECKED); 995 WRITE32(NFS4_CREATE_UNCHECKED);
1003 encode_attrs(xdr, arg->u.attrs, arg->server); 996 encode_attrs(xdr, arg->u.attrs, arg->server);
1004 break; 997 break;
1005 default: 998 default:
1006 WRITE32(NFS4_CREATE_EXCLUSIVE); 999 WRITE32(NFS4_CREATE_EXCLUSIVE);
1007 encode_nfs4_verifier(xdr, &arg->u.verifier); 1000 encode_nfs4_verifier(xdr, &arg->u.verifier);
1008 } 1001 }
1009} 1002}
1010 1003
@@ -1014,33 +1007,33 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1014 1007
1015 RESERVE_SPACE(4); 1008 RESERVE_SPACE(4);
1016 switch (arg->open_flags & O_CREAT) { 1009 switch (arg->open_flags & O_CREAT) {
1017 case 0: 1010 case 0:
1018 WRITE32(NFS4_OPEN_NOCREATE); 1011 WRITE32(NFS4_OPEN_NOCREATE);
1019 break; 1012 break;
1020 default: 1013 default:
1021 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); 1014 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1022 WRITE32(NFS4_OPEN_CREATE); 1015 WRITE32(NFS4_OPEN_CREATE);
1023 encode_createmode(xdr, arg); 1016 encode_createmode(xdr, arg);
1024 } 1017 }
1025} 1018}
1026 1019
1027static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) 1020static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
1028{ 1021{
1029 __be32 *p; 1022 __be32 *p;
1030 1023
1031 RESERVE_SPACE(4); 1024 RESERVE_SPACE(4);
1032 switch (delegation_type) { 1025 switch (delegation_type) {
1033 case 0: 1026 case 0:
1034 WRITE32(NFS4_OPEN_DELEGATE_NONE); 1027 WRITE32(NFS4_OPEN_DELEGATE_NONE);
1035 break; 1028 break;
1036 case FMODE_READ: 1029 case FMODE_READ:
1037 WRITE32(NFS4_OPEN_DELEGATE_READ); 1030 WRITE32(NFS4_OPEN_DELEGATE_READ);
1038 break; 1031 break;
1039 case FMODE_WRITE|FMODE_READ: 1032 case FMODE_WRITE|FMODE_READ:
1040 WRITE32(NFS4_OPEN_DELEGATE_WRITE); 1033 WRITE32(NFS4_OPEN_DELEGATE_WRITE);
1041 break; 1034 break;
1042 default: 1035 default:
1043 BUG(); 1036 BUG();
1044 } 1037 }
1045} 1038}
1046 1039
@@ -1053,7 +1046,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
1053 encode_string(xdr, name->len, name->name); 1046 encode_string(xdr, name->len, name->name);
1054} 1047}
1055 1048
1056static inline void encode_claim_previous(struct xdr_stream *xdr, int type) 1049static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
1057{ 1050{
1058 __be32 *p; 1051 __be32 *p;
1059 1052
@@ -1072,27 +1065,27 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1072 encode_string(xdr, name->len, name->name); 1065 encode_string(xdr, name->len, name->name);
1073} 1066}
1074 1067
1075static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1068static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
1076{ 1069{
1077 encode_openhdr(xdr, arg); 1070 encode_openhdr(xdr, arg);
1078 encode_opentype(xdr, arg); 1071 encode_opentype(xdr, arg);
1079 switch (arg->claim) { 1072 switch (arg->claim) {
1080 case NFS4_OPEN_CLAIM_NULL: 1073 case NFS4_OPEN_CLAIM_NULL:
1081 encode_claim_null(xdr, arg->name); 1074 encode_claim_null(xdr, arg->name);
1082 break; 1075 break;
1083 case NFS4_OPEN_CLAIM_PREVIOUS: 1076 case NFS4_OPEN_CLAIM_PREVIOUS:
1084 encode_claim_previous(xdr, arg->u.delegation_type); 1077 encode_claim_previous(xdr, arg->u.delegation_type);
1085 break; 1078 break;
1086 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 1079 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
1087 encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); 1080 encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
1088 break; 1081 break;
1089 default: 1082 default:
1090 BUG(); 1083 BUG();
1091 } 1084 }
1092 return 0; 1085 hdr->nops++;
1093} 1086}
1094 1087
1095static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) 1088static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
1096{ 1089{
1097 __be32 *p; 1090 __be32 *p;
1098 1091
@@ -1100,11 +1093,10 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
1100 WRITE32(OP_OPEN_CONFIRM); 1093 WRITE32(OP_OPEN_CONFIRM);
1101 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1094 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1102 WRITE32(arg->seqid->sequence->counter); 1095 WRITE32(arg->seqid->sequence->counter);
1103 1096 hdr->nops++;
1104 return 0;
1105} 1097}
1106 1098
1107static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 1099static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1108{ 1100{
1109 __be32 *p; 1101 __be32 *p;
1110 1102
@@ -1112,12 +1104,12 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
1112 WRITE32(OP_OPEN_DOWNGRADE); 1104 WRITE32(OP_OPEN_DOWNGRADE);
1113 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1105 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1114 WRITE32(arg->seqid->sequence->counter); 1106 WRITE32(arg->seqid->sequence->counter);
1115 encode_share_access(xdr, arg->open_flags); 1107 encode_share_access(xdr, arg->fmode);
1116 return 0; 1108 hdr->nops++;
1117} 1109}
1118 1110
1119static int 1111static void
1120encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) 1112encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
1121{ 1113{
1122 int len = fh->size; 1114 int len = fh->size;
1123 __be32 *p; 1115 __be32 *p;
@@ -1126,18 +1118,16 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
1126 WRITE32(OP_PUTFH); 1118 WRITE32(OP_PUTFH);
1127 WRITE32(len); 1119 WRITE32(len);
1128 WRITEMEM(fh->data, len); 1120 WRITEMEM(fh->data, len);
1129 1121 hdr->nops++;
1130 return 0;
1131} 1122}
1132 1123
1133static int encode_putrootfh(struct xdr_stream *xdr) 1124static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1134{ 1125{
1135 __be32 *p; 1126 __be32 *p;
1136 1127
1137 RESERVE_SPACE(4); 1128 RESERVE_SPACE(4);
1138 WRITE32(OP_PUTROOTFH); 1129 WRITE32(OP_PUTROOTFH);
1139
1140 return 0; 1130 hdr->nops++;
1141} 1131}
1142 1132
1143static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1133static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
@@ -1153,7 +1143,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1153 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1143 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
1154} 1144}
1155 1145
1156static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) 1146static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1157{ 1147{
1158 __be32 *p; 1148 __be32 *p;
1159 1149
@@ -1165,11 +1155,10 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
1165 RESERVE_SPACE(12); 1155 RESERVE_SPACE(12);
1166 WRITE64(args->offset); 1156 WRITE64(args->offset);
1167 WRITE32(args->count); 1157 WRITE32(args->count);
1168 1158 hdr->nops++;
1169 return 0;
1170} 1159}
1171 1160
1172static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) 1161static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1173{ 1162{
1174 uint32_t attrs[2] = { 1163 uint32_t attrs[2] = {
1175 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, 1164 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
@@ -1191,6 +1180,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1191 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 1180 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1192 WRITE32(attrs[0] & readdir->bitmask[0]); 1181 WRITE32(attrs[0] & readdir->bitmask[0]);
1193 WRITE32(attrs[1] & readdir->bitmask[1]); 1182 WRITE32(attrs[1] & readdir->bitmask[1]);
1183 hdr->nops++;
1194 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1184 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
1195 __func__, 1185 __func__,
1196 (unsigned long long)readdir->cookie, 1186 (unsigned long long)readdir->cookie,
@@ -1198,21 +1188,18 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1198 ((u32 *)readdir->verifier.data)[1], 1188 ((u32 *)readdir->verifier.data)[1],
1199 attrs[0] & readdir->bitmask[0], 1189 attrs[0] & readdir->bitmask[0],
1200 attrs[1] & readdir->bitmask[1]); 1190 attrs[1] & readdir->bitmask[1]);
1201
1202 return 0;
1203} 1191}
1204 1192
1205static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req) 1193static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
1206{ 1194{
1207 __be32 *p; 1195 __be32 *p;
1208 1196
1209 RESERVE_SPACE(4); 1197 RESERVE_SPACE(4);
1210 WRITE32(OP_READLINK); 1198 WRITE32(OP_READLINK);
1211 1199 hdr->nops++;
1212 return 0;
1213} 1200}
1214 1201
1215static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) 1202static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1216{ 1203{
1217 __be32 *p; 1204 __be32 *p;
1218 1205
@@ -1220,11 +1207,10 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
1220 WRITE32(OP_REMOVE); 1207 WRITE32(OP_REMOVE);
1221 WRITE32(name->len); 1208 WRITE32(name->len);
1222 WRITEMEM(name->name, name->len); 1209 WRITEMEM(name->name, name->len);
1223 1210 hdr->nops++;
1224 return 0;
1225} 1211}
1226 1212
1227static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname) 1213static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
1228{ 1214{
1229 __be32 *p; 1215 __be32 *p;
1230 1216
@@ -1232,38 +1218,35 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
1232 WRITE32(OP_RENAME); 1218 WRITE32(OP_RENAME);
1233 WRITE32(oldname->len); 1219 WRITE32(oldname->len);
1234 WRITEMEM(oldname->name, oldname->len); 1220 WRITEMEM(oldname->name, oldname->len);
1235 1221
1236 RESERVE_SPACE(4 + newname->len); 1222 RESERVE_SPACE(4 + newname->len);
1237 WRITE32(newname->len); 1223 WRITE32(newname->len);
1238 WRITEMEM(newname->name, newname->len); 1224 WRITEMEM(newname->name, newname->len);
1239 1225 hdr->nops++;
1240 return 0;
1241} 1226}
1242 1227
1243static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) 1228static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
1244{ 1229{
1245 __be32 *p; 1230 __be32 *p;
1246 1231
1247 RESERVE_SPACE(12); 1232 RESERVE_SPACE(12);
1248 WRITE32(OP_RENEW); 1233 WRITE32(OP_RENEW);
1249 WRITE64(client_stateid->cl_clientid); 1234 WRITE64(client_stateid->cl_clientid);
1250 1235 hdr->nops++;
1251 return 0;
1252} 1236}
1253 1237
1254static int 1238static void
1255encode_restorefh(struct xdr_stream *xdr) 1239encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1256{ 1240{
1257 __be32 *p; 1241 __be32 *p;
1258 1242
1259 RESERVE_SPACE(4); 1243 RESERVE_SPACE(4);
1260 WRITE32(OP_RESTOREFH); 1244 WRITE32(OP_RESTOREFH);
1261 1245 hdr->nops++;
1262 return 0;
1263} 1246}
1264 1247
1265static int 1248static int
1266encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) 1249encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1267{ 1250{
1268 __be32 *p; 1251 __be32 *p;
1269 1252
@@ -1278,36 +1261,32 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1278 RESERVE_SPACE(4); 1261 RESERVE_SPACE(4);
1279 WRITE32(arg->acl_len); 1262 WRITE32(arg->acl_len);
1280 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1263 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1264 hdr->nops++;
1281 return 0; 1265 return 0;
1282} 1266}
1283 1267
1284static int 1268static void
1285encode_savefh(struct xdr_stream *xdr) 1269encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1286{ 1270{
1287 __be32 *p; 1271 __be32 *p;
1288 1272
1289 RESERVE_SPACE(4); 1273 RESERVE_SPACE(4);
1290 WRITE32(OP_SAVEFH); 1274 WRITE32(OP_SAVEFH);
1291 1275 hdr->nops++;
1292 return 0;
1293} 1276}
1294 1277
1295static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server) 1278static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
1296{ 1279{
1297 int status;
1298 __be32 *p; 1280 __be32 *p;
1299 1281
1300 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1282 RESERVE_SPACE(4+NFS4_STATEID_SIZE);
1301 WRITE32(OP_SETATTR); 1283 WRITE32(OP_SETATTR);
1302 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); 1284 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
1303 1285 hdr->nops++;
1304 if ((status = encode_attrs(xdr, arg->iap, server))) 1286 encode_attrs(xdr, arg->iap, server);
1305 return status;
1306
1307 return 0;
1308} 1287}
1309 1288
1310static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) 1289static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
1311{ 1290{
1312 __be32 *p; 1291 __be32 *p;
1313 1292
@@ -1322,23 +1301,21 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
1322 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1301 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1323 RESERVE_SPACE(4); 1302 RESERVE_SPACE(4);
1324 WRITE32(setclientid->sc_cb_ident); 1303 WRITE32(setclientid->sc_cb_ident);
1325 1304 hdr->nops++;
1326 return 0;
1327} 1305}
1328 1306
1329static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) 1307static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
1330{ 1308{
1331 __be32 *p; 1309 __be32 *p;
1332 1310
1333 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); 1311 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
1334 WRITE32(OP_SETCLIENTID_CONFIRM); 1312 WRITE32(OP_SETCLIENTID_CONFIRM);
1335 WRITE64(client_state->cl_clientid); 1313 WRITE64(client_state->cl_clientid);
1336 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1314 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1337
1338 return 0; 1315 hdr->nops++;
1339} 1316}
1340 1317
1341static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args) 1318static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
1342{ 1319{
1343 __be32 *p; 1320 __be32 *p;
1344 1321
@@ -1353,11 +1330,10 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
1353 WRITE32(args->count); 1330 WRITE32(args->count);
1354 1331
1355 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1332 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1356 1333 hdr->nops++;
1357 return 0;
1358} 1334}
1359 1335
1360static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) 1336static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
1361{ 1337{
1362 __be32 *p; 1338 __be32 *p;
1363 1339
@@ -1365,8 +1341,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
1365 1341
1366 WRITE32(OP_DELEGRETURN); 1342 WRITE32(OP_DELEGRETURN);
1367 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1343 WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
1368 return 0; 1344 hdr->nops++;
1369
1370} 1345}
1371/* 1346/*
1372 * END OF "GENERIC" ENCODE ROUTINES. 1347 * END OF "GENERIC" ENCODE ROUTINES.
@@ -1379,21 +1354,16 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
1379{ 1354{
1380 struct xdr_stream xdr; 1355 struct xdr_stream xdr;
1381 struct compound_hdr hdr = { 1356 struct compound_hdr hdr = {
1382 .nops = 3, 1357 .nops = 0,
1383 }; 1358 };
1384 int status;
1385 1359
1386 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1360 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1387 encode_compound_hdr(&xdr, &hdr); 1361 encode_compound_hdr(&xdr, &hdr);
1388 status = encode_putfh(&xdr, args->fh); 1362 encode_putfh(&xdr, args->fh, &hdr);
1389 if (status != 0) 1363 encode_access(&xdr, args->access, &hdr);
1390 goto out; 1364 encode_getfattr(&xdr, args->bitmask, &hdr);
1391 status = encode_access(&xdr, args->access); 1365 encode_nops(&hdr);
1392 if (status != 0) 1366 return 0;
1393 goto out;
1394 status = encode_getfattr(&xdr, args->bitmask);
1395out:
1396 return status;
1397} 1367}
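Every request encoder from here on is converted to this same straight-line shape: hdr.nops starts at 0, the void encode_* helpers count operations as a side effect, encode_nops() backfills the count, and the function returns 0 unconditionally. Schematically, reusing the toy encode_hdr()/encode_op()/encode_nops() helpers sketched earlier (opcode numbers per RFC 3530):

/* Shape of a converted encoder (ACCESS = PUTFH + ACCESS + GETATTR),
 * in terms of the toy helpers above. */
static int toy_enc_access(void)
{
	struct toy_hdr hdr = { .nops = 0 };

	encode_hdr(&hdr);	/* writes the placeholder count */
	encode_op(&hdr, 22);	/* OP_PUTFH   */
	encode_op(&hdr, 3);	/* OP_ACCESS  */
	encode_op(&hdr, 9);	/* OP_GETATTR */
	encode_nops(&hdr);	/* backfills 3 over the placeholder */
	return 0;		/* encoding can no longer fail midway */
}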
1398 1368
1399/* 1369/*
@@ -1403,21 +1373,17 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs
1403{ 1373{
1404 struct xdr_stream xdr; 1374 struct xdr_stream xdr;
1405 struct compound_hdr hdr = { 1375 struct compound_hdr hdr = {
1406 .nops = 4, 1376 .nops = 0,
1407 }; 1377 };
1408 int status;
1409 1378
1410 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1379 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1411 encode_compound_hdr(&xdr, &hdr); 1380 encode_compound_hdr(&xdr, &hdr);
1412 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1381 encode_putfh(&xdr, args->dir_fh, &hdr);
1413 goto out; 1382 encode_lookup(&xdr, args->name, &hdr);
1414 if ((status = encode_lookup(&xdr, args->name)) != 0) 1383 encode_getfh(&xdr, &hdr);
1415 goto out; 1384 encode_getfattr(&xdr, args->bitmask, &hdr);
1416 if ((status = encode_getfh(&xdr)) != 0) 1385 encode_nops(&hdr);
1417 goto out; 1386 return 0;
1418 status = encode_getfattr(&xdr, args->bitmask);
1419out:
1420 return status;
1421} 1387}
1422 1388
1423/* 1389/*
@@ -1427,18 +1393,16 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc
1427{ 1393{
1428 struct xdr_stream xdr; 1394 struct xdr_stream xdr;
1429 struct compound_hdr hdr = { 1395 struct compound_hdr hdr = {
1430 .nops = 3, 1396 .nops = 0,
1431 }; 1397 };
1432 int status;
1433 1398
1434 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1399 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1435 encode_compound_hdr(&xdr, &hdr); 1400 encode_compound_hdr(&xdr, &hdr);
1436 if ((status = encode_putrootfh(&xdr)) != 0) 1401 encode_putrootfh(&xdr, &hdr);
1437 goto out; 1402 encode_getfh(&xdr, &hdr);
1438 if ((status = encode_getfh(&xdr)) == 0) 1403 encode_getfattr(&xdr, args->bitmask, &hdr);
1439 status = encode_getfattr(&xdr, args->bitmask); 1404 encode_nops(&hdr);
1440out: 1405 return 0;
1441 return status;
1442} 1406}
1443 1407
1444/* 1408/*
@@ -1448,19 +1412,16 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
1448{ 1412{
1449 struct xdr_stream xdr; 1413 struct xdr_stream xdr;
1450 struct compound_hdr hdr = { 1414 struct compound_hdr hdr = {
1451 .nops = 3, 1415 .nops = 0,
1452 }; 1416 };
1453 int status;
1454 1417
1455 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1418 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1456 encode_compound_hdr(&xdr, &hdr); 1419 encode_compound_hdr(&xdr, &hdr);
1457 if ((status = encode_putfh(&xdr, args->fh)) != 0) 1420 encode_putfh(&xdr, args->fh, &hdr);
1458 goto out; 1421 encode_remove(&xdr, &args->name, &hdr);
1459 if ((status = encode_remove(&xdr, &args->name)) != 0) 1422 encode_getfattr(&xdr, args->bitmask, &hdr);
1460 goto out; 1423 encode_nops(&hdr);
1461 status = encode_getfattr(&xdr, args->bitmask); 1424 return 0;
1462out:
1463 return status;
1464} 1425}
1465 1426
1466/* 1427/*
@@ -1470,27 +1431,20 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs
1470{ 1431{
1471 struct xdr_stream xdr; 1432 struct xdr_stream xdr;
1472 struct compound_hdr hdr = { 1433 struct compound_hdr hdr = {
1473 .nops = 7, 1434 .nops = 0,
1474 }; 1435 };
1475 int status;
1476 1436
1477 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1437 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1478 encode_compound_hdr(&xdr, &hdr); 1438 encode_compound_hdr(&xdr, &hdr);
1479 if ((status = encode_putfh(&xdr, args->old_dir)) != 0) 1439 encode_putfh(&xdr, args->old_dir, &hdr);
1480 goto out; 1440 encode_savefh(&xdr, &hdr);
1481 if ((status = encode_savefh(&xdr)) != 0) 1441 encode_putfh(&xdr, args->new_dir, &hdr);
1482 goto out; 1442 encode_rename(&xdr, args->old_name, args->new_name, &hdr);
1483 if ((status = encode_putfh(&xdr, args->new_dir)) != 0) 1443 encode_getfattr(&xdr, args->bitmask, &hdr);
1484 goto out; 1444 encode_restorefh(&xdr, &hdr);
1485 if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0) 1445 encode_getfattr(&xdr, args->bitmask, &hdr);
1486 goto out; 1446 encode_nops(&hdr);
1487 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1447 return 0;
1488 goto out;
1489 if ((status = encode_restorefh(&xdr)) != 0)
1490 goto out;
1491 status = encode_getfattr(&xdr, args->bitmask);
1492out:
1493 return status;
1494} 1448}
1495 1449
1496/* 1450/*
@@ -1500,27 +1454,20 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_
1500{ 1454{
1501 struct xdr_stream xdr; 1455 struct xdr_stream xdr;
1502 struct compound_hdr hdr = { 1456 struct compound_hdr hdr = {
1503 .nops = 7, 1457 .nops = 0,
1504 }; 1458 };
1505 int status;
1506 1459
1507 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1460 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1508 encode_compound_hdr(&xdr, &hdr); 1461 encode_compound_hdr(&xdr, &hdr);
1509 if ((status = encode_putfh(&xdr, args->fh)) != 0) 1462 encode_putfh(&xdr, args->fh, &hdr);
1510 goto out; 1463 encode_savefh(&xdr, &hdr);
1511 if ((status = encode_savefh(&xdr)) != 0) 1464 encode_putfh(&xdr, args->dir_fh, &hdr);
1512 goto out; 1465 encode_link(&xdr, args->name, &hdr);
1513 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1466 encode_getfattr(&xdr, args->bitmask, &hdr);
1514 goto out; 1467 encode_restorefh(&xdr, &hdr);
1515 if ((status = encode_link(&xdr, args->name)) != 0) 1468 encode_getfattr(&xdr, args->bitmask, &hdr);
1516 goto out; 1469 encode_nops(&hdr);
1517 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1470 return 0;
1518 goto out;
1519 if ((status = encode_restorefh(&xdr)) != 0)
1520 goto out;
1521 status = encode_getfattr(&xdr, args->bitmask);
1522out:
1523 return status;
1524} 1471}
1525 1472
1526/* 1473/*
@@ -1530,27 +1477,20 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs
1530{ 1477{
1531 struct xdr_stream xdr; 1478 struct xdr_stream xdr;
1532 struct compound_hdr hdr = { 1479 struct compound_hdr hdr = {
1533 .nops = 7, 1480 .nops = 0,
1534 }; 1481 };
1535 int status;
1536 1482
1537 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1483 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1538 encode_compound_hdr(&xdr, &hdr); 1484 encode_compound_hdr(&xdr, &hdr);
1539 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1485 encode_putfh(&xdr, args->dir_fh, &hdr);
1540 goto out; 1486 encode_savefh(&xdr, &hdr);
1541 if ((status = encode_savefh(&xdr)) != 0) 1487 encode_create(&xdr, args, &hdr);
1542 goto out; 1488 encode_getfh(&xdr, &hdr);
1543 if ((status = encode_create(&xdr, args)) != 0) 1489 encode_getfattr(&xdr, args->bitmask, &hdr);
1544 goto out; 1490 encode_restorefh(&xdr, &hdr);
1545 if ((status = encode_getfh(&xdr)) != 0) 1491 encode_getfattr(&xdr, args->bitmask, &hdr);
1546 goto out; 1492 encode_nops(&hdr);
1547 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1493 return 0;
1548 goto out;
1549 if ((status = encode_restorefh(&xdr)) != 0)
1550 goto out;
1551 status = encode_getfattr(&xdr, args->bitmask);
1552out:
1553 return status;
1554} 1494}
1555 1495
1556/* 1496/*
@@ -1568,15 +1508,15 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1568{ 1508{
1569 struct xdr_stream xdr; 1509 struct xdr_stream xdr;
1570 struct compound_hdr hdr = { 1510 struct compound_hdr hdr = {
1571 .nops = 2, 1511 .nops = 0,
1572 }; 1512 };
1573 int status;
1574 1513
1575 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1514 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1576 encode_compound_hdr(&xdr, &hdr); 1515 encode_compound_hdr(&xdr, &hdr);
1577 if ((status = encode_putfh(&xdr, args->fh)) == 0) 1516 encode_putfh(&xdr, args->fh, &hdr);
1578 status = encode_getfattr(&xdr, args->bitmask); 1517 encode_getfattr(&xdr, args->bitmask, &hdr);
1579 return status; 1518 encode_nops(&hdr);
1519 return 0;
1580} 1520}
1581 1521
1582/* 1522/*
@@ -1584,23 +1524,18 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1584 */ 1524 */
1585static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 1525static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
1586{ 1526{
1587 struct xdr_stream xdr; 1527 struct xdr_stream xdr;
1588 struct compound_hdr hdr = { 1528 struct compound_hdr hdr = {
1589 .nops = 3, 1529 .nops = 0,
1590 }; 1530 };
1591 int status; 1531
1592 1532 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1593 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1533 encode_compound_hdr(&xdr, &hdr);
1594 encode_compound_hdr(&xdr, &hdr); 1534 encode_putfh(&xdr, args->fh, &hdr);
1595 status = encode_putfh(&xdr, args->fh); 1535 encode_close(&xdr, args, &hdr);
1596 if(status) 1536 encode_getfattr(&xdr, args->bitmask, &hdr);
1597 goto out; 1537 encode_nops(&hdr);
1598 status = encode_close(&xdr, args); 1538 return 0;
1599 if (status != 0)
1600 goto out;
1601 status = encode_getfattr(&xdr, args->bitmask);
1602out:
1603 return status;
1604} 1539}
1605 1540
1606/* 1541/*
@@ -1610,33 +1545,20 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg
1610{ 1545{
1611 struct xdr_stream xdr; 1546 struct xdr_stream xdr;
1612 struct compound_hdr hdr = { 1547 struct compound_hdr hdr = {
1613 .nops = 7, 1548 .nops = 0,
1614 }; 1549 };
1615 int status;
1616 1550
1617 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1551 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1618 encode_compound_hdr(&xdr, &hdr); 1552 encode_compound_hdr(&xdr, &hdr);
1619 status = encode_putfh(&xdr, args->fh); 1553 encode_putfh(&xdr, args->fh, &hdr);
1620 if (status) 1554 encode_savefh(&xdr, &hdr);
1621 goto out; 1555 encode_open(&xdr, args, &hdr);
1622 status = encode_savefh(&xdr); 1556 encode_getfh(&xdr, &hdr);
1623 if (status) 1557 encode_getfattr(&xdr, args->bitmask, &hdr);
1624 goto out; 1558 encode_restorefh(&xdr, &hdr);
1625 status = encode_open(&xdr, args); 1559 encode_getfattr(&xdr, args->bitmask, &hdr);
1626 if (status) 1560 encode_nops(&hdr);
1627 goto out; 1561 return 0;
1628 status = encode_getfh(&xdr);
1629 if (status)
1630 goto out;
1631 status = encode_getfattr(&xdr, args->bitmask);
1632 if (status)
1633 goto out;
1634 status = encode_restorefh(&xdr);
1635 if (status)
1636 goto out;
1637 status = encode_getfattr(&xdr, args->bitmask);
1638out:
1639 return status;
1640} 1562}
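The OPEN compound keeps its seven-operation layout; only the hand-maintained .nops = 7 disappears. The sequence is worth spelling out, since the two GETATTRs are not redundant: one describes the newly opened file, the other refreshes the parent directory after RESTOREFH (keeping the client's cached directory attributes coherent; that rationale is an inference from the code, not stated in the patch). A short annotated listing, operation names per RFC 3530:

#include <stdio.h>

int main(void)
{
	static const char *const open_ops[] = {
		"PUTFH(dir)",	/* current fh = parent directory      */
		"SAVEFH",	/* stash the directory fh             */
		"OPEN",		/* current fh becomes the opened file */
		"GETFH",	/* hand the new file handle back      */
		"GETATTR",	/* attributes of the opened file      */
		"RESTOREFH",	/* current fh = the directory again   */
		"GETATTR",	/* pick up the directory's new change attribute */
	};
	for (unsigned int i = 0; i < sizeof(open_ops) / sizeof(open_ops[0]); i++)
		printf("%u: %s\n", i + 1, open_ops[i]);
	return 0;
}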
1641 1563
1642/* 1564/*
@@ -1646,18 +1568,15 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs
1646{ 1568{
1647 struct xdr_stream xdr; 1569 struct xdr_stream xdr;
1648 struct compound_hdr hdr = { 1570 struct compound_hdr hdr = {
1649 .nops = 2, 1571 .nops = 0,
1650 }; 1572 };
1651 int status;
1652 1573
1653 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1574 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1654 encode_compound_hdr(&xdr, &hdr); 1575 encode_compound_hdr(&xdr, &hdr);
1655 status = encode_putfh(&xdr, args->fh); 1576 encode_putfh(&xdr, args->fh, &hdr);
1656 if(status) 1577 encode_open_confirm(&xdr, args, &hdr);
1657 goto out; 1578 encode_nops(&hdr);
1658 status = encode_open_confirm(&xdr, args); 1579 return 0;
1659out:
1660 return status;
1661} 1580}
1662 1581
1663/* 1582/*
@@ -1667,21 +1586,16 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_
1667{ 1586{
1668 struct xdr_stream xdr; 1587 struct xdr_stream xdr;
1669 struct compound_hdr hdr = { 1588 struct compound_hdr hdr = {
1670 .nops = 3, 1589 .nops = 0,
1671 }; 1590 };
1672 int status;
1673 1591
1674 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1592 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1675 encode_compound_hdr(&xdr, &hdr); 1593 encode_compound_hdr(&xdr, &hdr);
1676 status = encode_putfh(&xdr, args->fh); 1594 encode_putfh(&xdr, args->fh, &hdr);
1677 if (status) 1595 encode_open(&xdr, args, &hdr);
1678 goto out; 1596 encode_getfattr(&xdr, args->bitmask, &hdr);
1679 status = encode_open(&xdr, args); 1597 encode_nops(&hdr);
1680 if (status) 1598 return 0;
1681 goto out;
1682 status = encode_getfattr(&xdr, args->bitmask);
1683out:
1684 return status;
1685} 1599}
1686 1600
1687/* 1601/*
@@ -1691,21 +1605,16 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n
1691{ 1605{
1692 struct xdr_stream xdr; 1606 struct xdr_stream xdr;
1693 struct compound_hdr hdr = { 1607 struct compound_hdr hdr = {
1694 .nops = 3, 1608 .nops = 0,
1695 }; 1609 };
1696 int status;
1697 1610
1698 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1611 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1699 encode_compound_hdr(&xdr, &hdr); 1612 encode_compound_hdr(&xdr, &hdr);
1700 status = encode_putfh(&xdr, args->fh); 1613 encode_putfh(&xdr, args->fh, &hdr);
1701 if (status) 1614 encode_open_downgrade(&xdr, args, &hdr);
1702 goto out; 1615 encode_getfattr(&xdr, args->bitmask, &hdr);
1703 status = encode_open_downgrade(&xdr, args); 1616 encode_nops(&hdr);
1704 if (status != 0) 1617 return 0;
1705 goto out;
1706 status = encode_getfattr(&xdr, args->bitmask);
1707out:
1708 return status;
1709} 1618}
1710 1619
1711/* 1620/*
@@ -1715,18 +1624,15 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar
1715{ 1624{
1716 struct xdr_stream xdr; 1625 struct xdr_stream xdr;
1717 struct compound_hdr hdr = { 1626 struct compound_hdr hdr = {
1718 .nops = 2, 1627 .nops = 0,
1719 }; 1628 };
1720 int status;
1721 1629
1722 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1630 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1723 encode_compound_hdr(&xdr, &hdr); 1631 encode_compound_hdr(&xdr, &hdr);
1724 status = encode_putfh(&xdr, args->fh); 1632 encode_putfh(&xdr, args->fh, &hdr);
1725 if(status) 1633 encode_lock(&xdr, args, &hdr);
1726 goto out; 1634 encode_nops(&hdr);
1727 status = encode_lock(&xdr, args); 1635 return 0;
1728out:
1729 return status;
1730} 1636}
1731 1637
1732/* 1638/*
@@ -1736,18 +1642,15 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_
1736{ 1642{
1737 struct xdr_stream xdr; 1643 struct xdr_stream xdr;
1738 struct compound_hdr hdr = { 1644 struct compound_hdr hdr = {
1739 .nops = 2, 1645 .nops = 0,
1740 }; 1646 };
1741 int status;
1742 1647
1743 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1648 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1744 encode_compound_hdr(&xdr, &hdr); 1649 encode_compound_hdr(&xdr, &hdr);
1745 status = encode_putfh(&xdr, args->fh); 1650 encode_putfh(&xdr, args->fh, &hdr);
1746 if(status) 1651 encode_lockt(&xdr, args, &hdr);
1747 goto out; 1652 encode_nops(&hdr);
1748 status = encode_lockt(&xdr, args); 1653 return 0;
1749out:
1750 return status;
1751} 1654}
1752 1655
1753/* 1656/*
@@ -1757,18 +1660,15 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
1757{ 1660{
1758 struct xdr_stream xdr; 1661 struct xdr_stream xdr;
1759 struct compound_hdr hdr = { 1662 struct compound_hdr hdr = {
1760 .nops = 2, 1663 .nops = 0,
1761 }; 1664 };
1762 int status;
1763 1665
1764 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1666 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1765 encode_compound_hdr(&xdr, &hdr); 1667 encode_compound_hdr(&xdr, &hdr);
1766 status = encode_putfh(&xdr, args->fh); 1668 encode_putfh(&xdr, args->fh, &hdr);
1767 if(status) 1669 encode_locku(&xdr, args, &hdr);
1768 goto out; 1670 encode_nops(&hdr);
1769 status = encode_locku(&xdr, args); 1671 return 0;
1770out:
1771 return status;
1772} 1672}
1773 1673
1774/* 1674/*
@@ -1778,18 +1678,15 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1778{ 1678{
1779 struct xdr_stream xdr; 1679 struct xdr_stream xdr;
1780 struct compound_hdr hdr = { 1680 struct compound_hdr hdr = {
1781 .nops = 2, 1681 .nops = 0,
1782 }; 1682 };
1783 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1683 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1784 unsigned int replen; 1684 unsigned int replen;
1785 int status;
1786 1685
1787 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1686 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1788 encode_compound_hdr(&xdr, &hdr); 1687 encode_compound_hdr(&xdr, &hdr);
1789 status = encode_putfh(&xdr, args->fh); 1688 encode_putfh(&xdr, args->fh, &hdr);
1790 if(status) 1689 encode_readlink(&xdr, args, req, &hdr);
1791 goto out;
1792 status = encode_readlink(&xdr, args, req);
1793 1690
1794 /* set up reply kvec 1691 /* set up reply kvec
1795 * toplevel_status + taglen + rescount + OP_PUTFH + status 1692 * toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1798,9 +1695,8 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1798 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2; 1695 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2;
1799 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 1696 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
1800 args->pgbase, args->pglen); 1697 args->pgbase, args->pglen);
1801 1698 encode_nops(&hdr);
1802out: 1699 return 0;
1803 return status;
1804} 1700}
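Like the readdir, read and getacl encoders below, this one also pre-sizes the receive buffer: replen counts the fixed part of the reply in 32-bit XDR words and shifts left by 2 to convert to bytes, after which xdr_inline_pages() aims everything past that offset (the link target itself) into the caller's pages. A worked example of the arithmetic (the word counts are invented for illustration):

#include <stdio.h>

int main(void)
{
	/* Invented word counts; the kernel derives these from macros. */
	unsigned int rpc_rephdrsize = 28;	/* RPC reply header, words  */
	unsigned int au_rslack = 2;		/* auth-flavor slack, words */
	unsigned int dec_readlink_sz = 7;	/* fixed reply part, words  */

	/* Words -> bytes: one XDR word is 4 bytes, hence the << 2. */
	unsigned int replen = (rpc_rephdrsize + au_rslack + dec_readlink_sz) << 2;

	/* xdr_inline_pages() would land everything past this offset
	 * straight in the caller-supplied pages. */
	printf("page data starts at byte %u of the reply\n", replen);
	return 0;
}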
1805 1701
1806/* 1702/*
@@ -1810,18 +1706,15 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1810{ 1706{
1811 struct xdr_stream xdr; 1707 struct xdr_stream xdr;
1812 struct compound_hdr hdr = { 1708 struct compound_hdr hdr = {
1813 .nops = 2, 1709 .nops = 0,
1814 }; 1710 };
1815 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1711 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1816 int replen; 1712 int replen;
1817 int status;
1818 1713
1819 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1714 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1820 encode_compound_hdr(&xdr, &hdr); 1715 encode_compound_hdr(&xdr, &hdr);
1821 status = encode_putfh(&xdr, args->fh); 1716 encode_putfh(&xdr, args->fh, &hdr);
1822 if(status) 1717 encode_readdir(&xdr, args, req, &hdr);
1823 goto out;
1824 status = encode_readdir(&xdr, args, req);
1825 1718
1826 /* set up reply kvec 1719 /* set up reply kvec
1827 * toplevel_status + taglen + rescount + OP_PUTFH + status 1720 * toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1833,9 +1726,8 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1833 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", 1726 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
1834 __func__, replen, args->pages, 1727 __func__, replen, args->pages,
1835 args->pgbase, args->count); 1728 args->pgbase, args->count);
1836 1729 encode_nops(&hdr);
1837out: 1730 return 0;
1838 return status;
1839} 1731}
1840 1732
1841/* 1733/*
@@ -1846,18 +1738,14 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1846 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1738 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1847 struct xdr_stream xdr; 1739 struct xdr_stream xdr;
1848 struct compound_hdr hdr = { 1740 struct compound_hdr hdr = {
1849 .nops = 2, 1741 .nops = 0,
1850 }; 1742 };
1851 int replen, status; 1743 int replen;
1852 1744
1853 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1745 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1854 encode_compound_hdr(&xdr, &hdr); 1746 encode_compound_hdr(&xdr, &hdr);
1855 status = encode_putfh(&xdr, args->fh); 1747 encode_putfh(&xdr, args->fh, &hdr);
1856 if (status) 1748 encode_read(&xdr, args, &hdr);
1857 goto out;
1858 status = encode_read(&xdr, args);
1859 if (status)
1860 goto out;
1861 1749
1862 /* set up reply kvec 1750 /* set up reply kvec
1863 * toplevel status + taglen=0 + rescount + OP_PUTFH + status 1751 * toplevel status + taglen=0 + rescount + OP_PUTFH + status
@@ -1867,33 +1755,27 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1867 xdr_inline_pages(&req->rq_rcv_buf, replen, 1755 xdr_inline_pages(&req->rq_rcv_buf, replen,
1868 args->pages, args->pgbase, args->count); 1756 args->pages, args->pgbase, args->count);
1869 req->rq_rcv_buf.flags |= XDRBUF_READ; 1757 req->rq_rcv_buf.flags |= XDRBUF_READ;
1870out: 1758 encode_nops(&hdr);
1871 return status; 1759 return 0;
1872} 1760}
1873 1761
1874/* 1762/*
1875 * Encode a SETATTR request 1763 * Encode a SETATTR request
1876 */ 1764 */
1877static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) 1765static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
1878
1879{ 1766{
1880 struct xdr_stream xdr; 1767 struct xdr_stream xdr;
1881 struct compound_hdr hdr = { 1768 struct compound_hdr hdr = {
1882 .nops = 3, 1769 .nops = 0,
1883 }; 1770 };
1884 int status; 1771
1885 1772 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1886 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1773 encode_compound_hdr(&xdr, &hdr);
1887 encode_compound_hdr(&xdr, &hdr); 1774 encode_putfh(&xdr, args->fh, &hdr);
1888 status = encode_putfh(&xdr, args->fh); 1775 encode_setattr(&xdr, args, args->server, &hdr);
1889 if(status) 1776 encode_getfattr(&xdr, args->bitmask, &hdr);
1890 goto out; 1777 encode_nops(&hdr);
1891 status = encode_setattr(&xdr, args, args->server); 1778 return 0;
1892 if(status)
1893 goto out;
1894 status = encode_getfattr(&xdr, args->bitmask);
1895out:
1896 return status;
1897} 1779}
1898 1780
1899/* 1781/*
@@ -1906,22 +1788,21 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
1906 struct xdr_stream xdr; 1788 struct xdr_stream xdr;
1907 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1789 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1908 struct compound_hdr hdr = { 1790 struct compound_hdr hdr = {
1909 .nops = 2, 1791 .nops = 0,
1910 }; 1792 };
1911 int replen, status; 1793 int replen;
1912 1794
1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1795 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1914 encode_compound_hdr(&xdr, &hdr); 1796 encode_compound_hdr(&xdr, &hdr);
1915 status = encode_putfh(&xdr, args->fh); 1797 encode_putfh(&xdr, args->fh, &hdr);
1916 if (status) 1798 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
1917 goto out; 1799
1918 status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
1919 /* set up reply buffer: */ 1800 /* set up reply buffer: */
1920 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; 1801 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
1921 xdr_inline_pages(&req->rq_rcv_buf, replen, 1802 xdr_inline_pages(&req->rq_rcv_buf, replen,
1922 args->acl_pages, args->acl_pgbase, args->acl_len); 1803 args->acl_pages, args->acl_pgbase, args->acl_len);
1923out: 1804 encode_nops(&hdr);
1924 return status; 1805 return 0;
1925} 1806}
1926 1807
1927/* 1808/*
@@ -1931,22 +1812,17 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
1931{ 1812{
1932 struct xdr_stream xdr; 1813 struct xdr_stream xdr;
1933 struct compound_hdr hdr = { 1814 struct compound_hdr hdr = {
1934 .nops = 3, 1815 .nops = 0,
1935 }; 1816 };
1936 int status;
1937 1817
1938 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1818 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1939 encode_compound_hdr(&xdr, &hdr); 1819 encode_compound_hdr(&xdr, &hdr);
1940 status = encode_putfh(&xdr, args->fh); 1820 encode_putfh(&xdr, args->fh, &hdr);
1941 if (status) 1821 encode_write(&xdr, args, &hdr);
1942 goto out;
1943 status = encode_write(&xdr, args);
1944 if (status)
1945 goto out;
1946 req->rq_snd_buf.flags |= XDRBUF_WRITE; 1822 req->rq_snd_buf.flags |= XDRBUF_WRITE;
1947 status = encode_getfattr(&xdr, args->bitmask); 1823 encode_getfattr(&xdr, args->bitmask, &hdr);
1948out: 1824 encode_nops(&hdr);
1949 return status; 1825 return 0;
1950} 1826}
1951 1827
1952/* 1828/*
@@ -1956,21 +1832,16 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write
1956{ 1832{
1957 struct xdr_stream xdr; 1833 struct xdr_stream xdr;
1958 struct compound_hdr hdr = { 1834 struct compound_hdr hdr = {
1959 .nops = 3, 1835 .nops = 0,
1960 }; 1836 };
1961 int status;
1962 1837
1963 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1838 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1964 encode_compound_hdr(&xdr, &hdr); 1839 encode_compound_hdr(&xdr, &hdr);
1965 status = encode_putfh(&xdr, args->fh); 1840 encode_putfh(&xdr, args->fh, &hdr);
1966 if (status) 1841 encode_commit(&xdr, args, &hdr);
1967 goto out; 1842 encode_getfattr(&xdr, args->bitmask, &hdr);
1968 status = encode_commit(&xdr, args); 1843 encode_nops(&hdr);
1969 if (status) 1844 return 0;
1970 goto out;
1971 status = encode_getfattr(&xdr, args->bitmask);
1972out:
1973 return status;
1974} 1845}
1975 1846
1976/* 1847/*
@@ -1980,16 +1851,15 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin
1980{ 1851{
1981 struct xdr_stream xdr; 1852 struct xdr_stream xdr;
1982 struct compound_hdr hdr = { 1853 struct compound_hdr hdr = {
1983 .nops = 2, 1854 .nops = 0,
1984 }; 1855 };
1985 int status;
1986 1856
1987 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1857 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1988 encode_compound_hdr(&xdr, &hdr); 1858 encode_compound_hdr(&xdr, &hdr);
1989 status = encode_putfh(&xdr, args->fh); 1859 encode_putfh(&xdr, args->fh, &hdr);
1990 if (!status) 1860 encode_fsinfo(&xdr, args->bitmask, &hdr);
1991 status = encode_fsinfo(&xdr, args->bitmask); 1861 encode_nops(&hdr);
1992 return status; 1862 return 0;
1993} 1863}
1994 1864
1995/* 1865/*
@@ -1999,17 +1869,16 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n
1999{ 1869{
2000 struct xdr_stream xdr; 1870 struct xdr_stream xdr;
2001 struct compound_hdr hdr = { 1871 struct compound_hdr hdr = {
2002 .nops = 2, 1872 .nops = 0,
2003 }; 1873 };
2004 int status;
2005 1874
2006 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1875 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2007 encode_compound_hdr(&xdr, &hdr); 1876 encode_compound_hdr(&xdr, &hdr);
2008 status = encode_putfh(&xdr, args->fh); 1877 encode_putfh(&xdr, args->fh, &hdr);
2009 if (!status) 1878 encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2010 status = encode_getattr_one(&xdr, 1879 &hdr);
2011 args->bitmask[0] & nfs4_pathconf_bitmap[0]); 1880 encode_nops(&hdr);
2012 return status; 1881 return 0;
2013} 1882}
2014 1883
2015/* 1884/*
@@ -2019,18 +1888,16 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
2019{ 1888{
2020 struct xdr_stream xdr; 1889 struct xdr_stream xdr;
2021 struct compound_hdr hdr = { 1890 struct compound_hdr hdr = {
2022 .nops = 2, 1891 .nops = 0,
2023 }; 1892 };
2024 int status;
2025 1893
2026 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1894 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2027 encode_compound_hdr(&xdr, &hdr); 1895 encode_compound_hdr(&xdr, &hdr);
2028 status = encode_putfh(&xdr, args->fh); 1896 encode_putfh(&xdr, args->fh, &hdr);
2029 if (status == 0) 1897 encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2030 status = encode_getattr_two(&xdr, 1898 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
2031 args->bitmask[0] & nfs4_statfs_bitmap[0], 1899 encode_nops(&hdr);
2032 args->bitmask[1] & nfs4_statfs_bitmap[1]); 1900 return 0;
2033 return status;
2034} 1901}
2035 1902
2036/* 1903/*
@@ -2040,19 +1907,18 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struc
2040{ 1907{
2041 struct xdr_stream xdr; 1908 struct xdr_stream xdr;
2042 struct compound_hdr hdr = { 1909 struct compound_hdr hdr = {
2043 .nops = 2, 1910 .nops = 0,
2044 }; 1911 };
2045 int status;
2046 1912
2047 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2048 encode_compound_hdr(&xdr, &hdr); 1914 encode_compound_hdr(&xdr, &hdr);
2049 status = encode_putfh(&xdr, fhandle); 1915 encode_putfh(&xdr, fhandle, &hdr);
2050 if (status == 0) 1916 encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2051 status = encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| 1917 FATTR4_WORD0_LINK_SUPPORT|
2052 FATTR4_WORD0_LINK_SUPPORT| 1918 FATTR4_WORD0_SYMLINK_SUPPORT|
2053 FATTR4_WORD0_SYMLINK_SUPPORT| 1919 FATTR4_WORD0_ACLSUPPORT, &hdr);
2054 FATTR4_WORD0_ACLSUPPORT); 1920 encode_nops(&hdr);
2055 return status; 1921 return 0;
2056} 1922}
2057 1923
2058/* 1924/*
@@ -2062,12 +1928,14 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client
2062{ 1928{
2063 struct xdr_stream xdr; 1929 struct xdr_stream xdr;
2064 struct compound_hdr hdr = { 1930 struct compound_hdr hdr = {
2065 .nops = 1, 1931 .nops = 0,
2066 }; 1932 };
2067 1933
2068 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1934 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2069 encode_compound_hdr(&xdr, &hdr); 1935 encode_compound_hdr(&xdr, &hdr);
2070 return encode_renew(&xdr, clp); 1936 encode_renew(&xdr, clp, &hdr);
1937 encode_nops(&hdr);
1938 return 0;
2071} 1939}
2072 1940
2073/* 1941/*
@@ -2077,12 +1945,14 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
2077{ 1945{
2078 struct xdr_stream xdr; 1946 struct xdr_stream xdr;
2079 struct compound_hdr hdr = { 1947 struct compound_hdr hdr = {
2080 .nops = 1, 1948 .nops = 0,
2081 }; 1949 };
2082 1950
2083 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1951 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2084 encode_compound_hdr(&xdr, &hdr); 1952 encode_compound_hdr(&xdr, &hdr);
2085 return encode_setclientid(&xdr, sc); 1953 encode_setclientid(&xdr, sc, &hdr);
1954 encode_nops(&hdr);
1955 return 0;
2086} 1956}
2087 1957
2088/* 1958/*
@@ -2092,19 +1962,17 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
2092{ 1962{
2093 struct xdr_stream xdr; 1963 struct xdr_stream xdr;
2094 struct compound_hdr hdr = { 1964 struct compound_hdr hdr = {
2095 .nops = 3, 1965 .nops = 0,
2096 }; 1966 };
2097 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 1967 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2098 int status;
2099 1968
2100 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1969 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2101 encode_compound_hdr(&xdr, &hdr); 1970 encode_compound_hdr(&xdr, &hdr);
2102 status = encode_setclientid_confirm(&xdr, clp); 1971 encode_setclientid_confirm(&xdr, clp, &hdr);
2103 if (!status) 1972 encode_putrootfh(&xdr, &hdr);
2104 status = encode_putrootfh(&xdr); 1973 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2105 if (!status) 1974 encode_nops(&hdr);
2106 status = encode_fsinfo(&xdr, lease_bitmap); 1975 return 0;
2107 return status;
2108} 1976}
2109 1977
2110/* 1978/*
@@ -2114,21 +1982,16 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc
2114{ 1982{
2115 struct xdr_stream xdr; 1983 struct xdr_stream xdr;
2116 struct compound_hdr hdr = { 1984 struct compound_hdr hdr = {
2117 .nops = 3, 1985 .nops = 0,
2118 }; 1986 };
2119 int status;
2120 1987
2121 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1988 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2122 encode_compound_hdr(&xdr, &hdr); 1989 encode_compound_hdr(&xdr, &hdr);
2123 status = encode_putfh(&xdr, args->fhandle); 1990 encode_putfh(&xdr, args->fhandle, &hdr);
2124 if (status != 0) 1991 encode_delegreturn(&xdr, args->stateid, &hdr);
2125 goto out; 1992 encode_getfattr(&xdr, args->bitmask, &hdr);
2126 status = encode_delegreturn(&xdr, args->stateid); 1993 encode_nops(&hdr);
2127 if (status != 0) 1994 return 0;
2128 goto out;
2129 status = encode_getfattr(&xdr, args->bitmask);
2130out:
2131 return status;
2132} 1995}
2133 1996
2134/* 1997/*
@@ -2138,20 +2001,17 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2138{ 2001{
2139 struct xdr_stream xdr; 2002 struct xdr_stream xdr;
2140 struct compound_hdr hdr = { 2003 struct compound_hdr hdr = {
2141 .nops = 3, 2004 .nops = 0,
2142 }; 2005 };
2143 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 2006 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
2144 int replen; 2007 int replen;
2145 int status;
2146 2008
2147 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2009 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2148 encode_compound_hdr(&xdr, &hdr); 2010 encode_compound_hdr(&xdr, &hdr);
2149 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 2011 encode_putfh(&xdr, args->dir_fh, &hdr);
2150 goto out; 2012 encode_lookup(&xdr, args->name, &hdr);
2151 if ((status = encode_lookup(&xdr, args->name)) != 0) 2013 encode_fs_locations(&xdr, args->bitmask, &hdr);
2152 goto out; 2014
2153 if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
2154 goto out;
2155 /* set up reply 2015 /* set up reply
2156 * toplevel_status + OP_PUTFH + status 2016 * toplevel_status + OP_PUTFH + status
2157 * + OP_LOOKUP + status + OP_GETATTR + status = 7 2017 * + OP_LOOKUP + status + OP_GETATTR + status = 7
@@ -2159,8 +2019,8 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2159 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2; 2019 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
2160 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page, 2020 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
2161 0, PAGE_SIZE); 2021 0, PAGE_SIZE);
2162out: 2022 encode_nops(&hdr);
2163 return status; 2023 return 0;
2164} 2024}
2165 2025
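The common thread in the encode hunks above: compound_hdr.nops now starts at zero, each encode helper bumps it as a side effect instead of returning a status (the setacl encoder further down is the lone holdout), and a final encode_nops() call patches the real operation count into the word that encode_compound_hdr() reserved in the send buffer. A minimal sketch of that mechanism, assuming a hypothetical nops_p pointer that remembers where the count lives; the actual struct layout may differ:

    /* sketch only; nops_p is an assumed field name, not quoted from the patch */
    struct compound_hdr {
            u32     nops;           /* operations encoded so far */
            __be32  *nops_p;        /* where the count word was reserved */
    };

    static void encode_compound_hdr(struct xdr_stream *xdr,
                                    struct compound_hdr *hdr)
    {
            __be32 *p = xdr_reserve_space(xdr, 8);

            *p++ = cpu_to_be32(0);          /* zero-length tag */
            hdr->nops_p = p;                /* remember the count slot */
            *p = cpu_to_be32(hdr->nops);    /* provisional: still zero */
    }

    static void encode_nops(struct compound_hdr *hdr)
    {
            *hdr->nops_p = cpu_to_be32(hdr->nops);  /* backpatch final count */
    }

Deferring the count is what lets every goto-out ladder above collapse into straight-line void calls.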
2166/* 2026/*
@@ -2217,11 +2077,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2217 READ_BUF(8); 2077 READ_BUF(8);
2218 READ32(hdr->status); 2078 READ32(hdr->status);
2219 READ32(hdr->taglen); 2079 READ32(hdr->taglen);
2220 2080
2221 READ_BUF(hdr->taglen + 4); 2081 READ_BUF(hdr->taglen + 4);
2222 hdr->tag = (char *)p; 2082 hdr->tag = (char *)p;
2223 p += XDR_QUADLEN(hdr->taglen); 2083 p += XDR_QUADLEN(hdr->taglen);
2224 READ32(hdr->nops); 2084 READ32(hdr->nops);
2085 if (unlikely(hdr->nops < 1))
2086 return nfs4_stat_to_errno(hdr->status);
2225 return 0; 2087 return 0;
2226} 2088}
2227 2089
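The guard added here refuses any reply whose compound header claims fewer than one operation: with no per-op results to decode, the compound-level status is translated to a local errno on the spot. That is also why later hunks (dec_fsinfo, dec_setclientid, dec_setclientid_confirm) can drop their trailing "if (!status) status = nfs4_stat_to_errno(hdr.status);" lines. Schematically, the translation the guard leans on looks like this; the real function walks a much larger table:

    /* illustrative subset of the NFSv4-status-to-errno mapping */
    static int nfs4_stat_to_errno(int stat)
    {
            switch (stat) {
            case NFS4_OK:           return 0;
            case NFS4ERR_NOENT:     return -ENOENT;
            case NFS4ERR_ACCESS:    return -EACCES;
            default:                return -EIO;    /* abridged fallback */
            }
    }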
@@ -3047,8 +2909,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3047static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 2909static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
3048{ 2910{
3049 __be32 *savep; 2911 __be32 *savep;
3050 uint32_t attrlen, 2912 uint32_t attrlen, bitmap[2] = {0};
3051 bitmap[2] = {0};
3052 int status; 2913 int status;
3053 2914
3054 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2915 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3070,14 +2931,13 @@ xdr_error:
3070 dprintk("%s: xdr returned %d!\n", __func__, -status); 2931 dprintk("%s: xdr returned %d!\n", __func__, -status);
3071 return status; 2932 return status;
3072} 2933}
3073 2934
3074static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 2935static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
3075{ 2936{
3076 __be32 *savep; 2937 __be32 *savep;
3077 uint32_t attrlen, 2938 uint32_t attrlen, bitmap[2] = {0};
3078 bitmap[2] = {0};
3079 int status; 2939 int status;
3080 2940
3081 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2941 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
3082 goto xdr_error; 2942 goto xdr_error;
3083 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 2943 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3107,10 +2967,9 @@ xdr_error:
3107static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 2967static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
3108{ 2968{
3109 __be32 *savep; 2969 __be32 *savep;
3110 uint32_t attrlen, 2970 uint32_t attrlen, bitmap[2] = {0};
3111 bitmap[2] = {0};
3112 int status; 2971 int status;
3113 2972
3114 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2973 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
3115 goto xdr_error; 2974 goto xdr_error;
3116 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 2975 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3256,7 +3115,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
3256static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 3115static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3257{ 3116{
3258 int status; 3117 int status;
3259 3118
3260 status = decode_op_hdr(xdr, OP_LINK); 3119 status = decode_op_hdr(xdr, OP_LINK);
3261 if (status) 3120 if (status)
3262 return status; 3121 return status;
@@ -3344,27 +3203,27 @@ static int decode_lookup(struct xdr_stream *xdr)
3344/* This is too sick! */ 3203/* This is too sick! */
3345static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) 3204static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
3346{ 3205{
3347 __be32 *p; 3206 __be32 *p;
3348 uint32_t limit_type, nblocks, blocksize; 3207 uint32_t limit_type, nblocks, blocksize;
3349 3208
3350 READ_BUF(12); 3209 READ_BUF(12);
3351 READ32(limit_type); 3210 READ32(limit_type);
3352 switch (limit_type) { 3211 switch (limit_type) {
3353 case 1: 3212 case 1:
3354 READ64(*maxsize); 3213 READ64(*maxsize);
3355 break; 3214 break;
3356 case 2: 3215 case 2:
3357 READ32(nblocks); 3216 READ32(nblocks);
3358 READ32(blocksize); 3217 READ32(blocksize);
3359 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; 3218 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
3360 } 3219 }
3361 return 0; 3220 return 0;
3362} 3221}
3363 3222
3364static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 3223static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3365{ 3224{
3366 __be32 *p; 3225 __be32 *p;
3367 uint32_t delegation_type; 3226 uint32_t delegation_type;
3368 3227
3369 READ_BUF(4); 3228 READ_BUF(4);
3370 READ32(delegation_type); 3229 READ32(delegation_type);
@@ -3375,13 +3234,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3375 READ_BUF(NFS4_STATEID_SIZE+4); 3234 READ_BUF(NFS4_STATEID_SIZE+4);
3376 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); 3235 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
3377 READ32(res->do_recall); 3236 READ32(res->do_recall);
3237
3378 switch (delegation_type) { 3238 switch (delegation_type) {
3379 case NFS4_OPEN_DELEGATE_READ: 3239 case NFS4_OPEN_DELEGATE_READ:
3380 res->delegation_type = FMODE_READ; 3240 res->delegation_type = FMODE_READ;
3381 break; 3241 break;
3382 case NFS4_OPEN_DELEGATE_WRITE: 3242 case NFS4_OPEN_DELEGATE_WRITE:
3383 res->delegation_type = FMODE_WRITE|FMODE_READ; 3243 res->delegation_type = FMODE_WRITE|FMODE_READ;
3384 if (decode_space_limit(xdr, &res->maxsize) < 0) 3244 if (decode_space_limit(xdr, &res->maxsize) < 0)
3385 return -EIO; 3245 return -EIO;
3386 } 3246 }
3387 return decode_ace(xdr, NULL, res->server->nfs_client); 3247 return decode_ace(xdr, NULL, res->server->nfs_client);
@@ -3389,27 +3249,27 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3389 3249
3390static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 3250static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3391{ 3251{
3392 __be32 *p; 3252 __be32 *p;
3393 uint32_t savewords, bmlen, i; 3253 uint32_t savewords, bmlen, i;
3394 int status; 3254 int status;
3395 3255
3396 status = decode_op_hdr(xdr, OP_OPEN); 3256 status = decode_op_hdr(xdr, OP_OPEN);
3397 if (status != -EIO) 3257 if (status != -EIO)
3398 nfs_increment_open_seqid(status, res->seqid); 3258 nfs_increment_open_seqid(status, res->seqid);
3399 if (status) 3259 if (status)
3400 return status; 3260 return status;
3401 READ_BUF(NFS4_STATEID_SIZE); 3261 READ_BUF(NFS4_STATEID_SIZE);
3402 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3262 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3403 3263
3404 decode_change_info(xdr, &res->cinfo); 3264 decode_change_info(xdr, &res->cinfo);
3405 3265
3406 READ_BUF(8); 3266 READ_BUF(8);
3407 READ32(res->rflags); 3267 READ32(res->rflags);
3408 READ32(bmlen); 3268 READ32(bmlen);
3409 if (bmlen > 10) 3269 if (bmlen > 10)
3410 goto xdr_error; 3270 goto xdr_error;
3411 3271
3412 READ_BUF(bmlen << 2); 3272 READ_BUF(bmlen << 2);
3413 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); 3273 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
3414 for (i = 0; i < savewords; ++i) 3274 for (i = 0; i < savewords; ++i)
3415 READ32(res->attrset[i]); 3275 READ32(res->attrset[i]);
@@ -3424,17 +3284,17 @@ xdr_error:
3424 3284
3425static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) 3285static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
3426{ 3286{
3427 __be32 *p; 3287 __be32 *p;
3428 int status; 3288 int status;
3429 3289
3430 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); 3290 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
3431 if (status != -EIO) 3291 if (status != -EIO)
3432 nfs_increment_open_seqid(status, res->seqid); 3292 nfs_increment_open_seqid(status, res->seqid);
3433 if (status) 3293 if (status)
3434 return status; 3294 return status;
3435 READ_BUF(NFS4_STATEID_SIZE); 3295 READ_BUF(NFS4_STATEID_SIZE);
3436 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3296 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3437 return 0; 3297 return 0;
3438} 3298}
3439 3299
3440static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) 3300static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
@@ -3562,7 +3422,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3562 dprintk("NFS: readdir reply truncated!\n"); 3422 dprintk("NFS: readdir reply truncated!\n");
3563 entry[1] = 1; 3423 entry[1] = 1;
3564 } 3424 }
3565out: 3425out:
3566 kunmap_atomic(kaddr, KM_USER0); 3426 kunmap_atomic(kaddr, KM_USER0);
3567 return 0; 3427 return 0;
3568short_pkt: 3428short_pkt:
@@ -3718,7 +3578,6 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
3718 uint32_t bmlen; 3578 uint32_t bmlen;
3719 int status; 3579 int status;
3720 3580
3721
3722 status = decode_op_hdr(xdr, OP_SETATTR); 3581 status = decode_op_hdr(xdr, OP_SETATTR);
3723 if (status) 3582 if (status)
3724 return status; 3583 return status;
@@ -3738,7 +3597,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
3738 READ32(opnum); 3597 READ32(opnum);
3739 if (opnum != OP_SETCLIENTID) { 3598 if (opnum != OP_SETCLIENTID) {
3740 dprintk("nfs: decode_setclientid: Server returned operation" 3599 dprintk("nfs: decode_setclientid: Server returned operation"
3741 " %d\n", opnum); 3600 " %d\n", opnum);
3742 return -EIO; 3601 return -EIO;
3743 } 3602 }
3744 READ32(nfserr); 3603 READ32(nfserr);
@@ -3792,34 +3651,34 @@ static int decode_delegreturn(struct xdr_stream *xdr)
3792} 3651}
3793 3652
3794/* 3653/*
3654 * END OF "GENERIC" DECODE ROUTINES.
3655 */
3656
3657/*
3795 * Decode OPEN_DOWNGRADE response 3658 * Decode OPEN_DOWNGRADE response
3796 */ 3659 */
3797static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 3660static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
3798{ 3661{
3799 struct xdr_stream xdr; 3662 struct xdr_stream xdr;
3800 struct compound_hdr hdr; 3663 struct compound_hdr hdr;
3801 int status; 3664 int status;
3802 3665
3803 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3666 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3804 status = decode_compound_hdr(&xdr, &hdr); 3667 status = decode_compound_hdr(&xdr, &hdr);
3805 if (status) 3668 if (status)
3806 goto out; 3669 goto out;
3807 status = decode_putfh(&xdr); 3670 status = decode_putfh(&xdr);
3808 if (status) 3671 if (status)
3809 goto out; 3672 goto out;
3810 status = decode_open_downgrade(&xdr, res); 3673 status = decode_open_downgrade(&xdr, res);
3811 if (status != 0) 3674 if (status != 0)
3812 goto out; 3675 goto out;
3813 decode_getfattr(&xdr, res->fattr, res->server); 3676 decode_getfattr(&xdr, res->fattr, res->server);
3814out: 3677out:
3815 return status; 3678 return status;
3816} 3679}
3817 3680
3818/* 3681/*
3819 * END OF "GENERIC" DECODE ROUTINES.
3820 */
3821
3822/*
3823 * Decode ACCESS response 3682 * Decode ACCESS response
3824 */ 3683 */
3825static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) 3684static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
@@ -3827,7 +3686,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
3827 struct xdr_stream xdr; 3686 struct xdr_stream xdr;
3828 struct compound_hdr hdr; 3687 struct compound_hdr hdr;
3829 int status; 3688 int status;
3830 3689
3831 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3690 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3832 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3691 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3833 goto out; 3692 goto out;
@@ -3850,7 +3709,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
3850 struct xdr_stream xdr; 3709 struct xdr_stream xdr;
3851 struct compound_hdr hdr; 3710 struct compound_hdr hdr;
3852 int status; 3711 int status;
3853 3712
3854 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3713 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3855 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3714 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3856 goto out; 3715 goto out;
@@ -3873,7 +3732,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
3873 struct xdr_stream xdr; 3732 struct xdr_stream xdr;
3874 struct compound_hdr hdr; 3733 struct compound_hdr hdr;
3875 int status; 3734 int status;
3876 3735
3877 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3736 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3878 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3737 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3879 goto out; 3738 goto out;
@@ -3893,7 +3752,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
3893 struct xdr_stream xdr; 3752 struct xdr_stream xdr;
3894 struct compound_hdr hdr; 3753 struct compound_hdr hdr;
3895 int status; 3754 int status;
3896 3755
3897 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3756 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3898 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3757 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3899 goto out; 3758 goto out;
@@ -3914,7 +3773,7 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
3914 struct xdr_stream xdr; 3773 struct xdr_stream xdr;
3915 struct compound_hdr hdr; 3774 struct compound_hdr hdr;
3916 int status; 3775 int status;
3917 3776
3918 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3777 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3919 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3778 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3920 goto out; 3779 goto out;
@@ -3944,7 +3803,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
3944 struct xdr_stream xdr; 3803 struct xdr_stream xdr;
3945 struct compound_hdr hdr; 3804 struct compound_hdr hdr;
3946 int status; 3805 int status;
3947 3806
3948 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3807 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3949 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3808 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3950 goto out; 3809 goto out;
@@ -3977,7 +3836,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
3977 struct xdr_stream xdr; 3836 struct xdr_stream xdr;
3978 struct compound_hdr hdr; 3837 struct compound_hdr hdr;
3979 int status; 3838 int status;
3980 3839
3981 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3840 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3982 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3841 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3983 goto out; 3842 goto out;
@@ -4014,7 +3873,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4014 struct xdr_stream xdr; 3873 struct xdr_stream xdr;
4015 struct compound_hdr hdr; 3874 struct compound_hdr hdr;
4016 int status; 3875 int status;
4017 3876
4018 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3877 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4019 status = decode_compound_hdr(&xdr, &hdr); 3878 status = decode_compound_hdr(&xdr, &hdr);
4020 if (status) 3879 if (status)
@@ -4025,7 +3884,6 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4025 status = decode_getfattr(&xdr, res->fattr, res->server); 3884 status = decode_getfattr(&xdr, res->fattr, res->server);
4026out: 3885out:
4027 return status; 3886 return status;
4028
4029} 3887}
4030 3888
4031/* 3889/*
@@ -4034,21 +3892,20 @@ out:
4034static int 3892static int
4035nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) 3893nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
4036{ 3894{
4037 struct xdr_stream xdr; 3895 struct xdr_stream xdr;
4038 struct compound_hdr hdr = { 3896 struct compound_hdr hdr = {
4039 .nops = 2, 3897 .nops = 0,
4040 }; 3898 };
4041 int status; 3899 int status;
4042 3900
4043 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 3901 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
4044 encode_compound_hdr(&xdr, &hdr); 3902 encode_compound_hdr(&xdr, &hdr);
4045 status = encode_putfh(&xdr, args->fh); 3903 encode_putfh(&xdr, args->fh, &hdr);
4046 if (status) 3904 status = encode_setacl(&xdr, args, &hdr);
4047 goto out; 3905 encode_nops(&hdr);
4048 status = encode_setacl(&xdr, args); 3906 return status;
4049out:
4050 return status;
4051} 3907}
3908
4052/* 3909/*
4053 * Decode SETACL response 3910 * Decode SETACL response
4054 */ 3911 */
@@ -4099,18 +3956,18 @@ out:
4099 */ 3956 */
4100static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 3957static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
4101{ 3958{
4102 struct xdr_stream xdr; 3959 struct xdr_stream xdr;
4103 struct compound_hdr hdr; 3960 struct compound_hdr hdr;
4104 int status; 3961 int status;
4105 3962
4106 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3963 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4107 status = decode_compound_hdr(&xdr, &hdr); 3964 status = decode_compound_hdr(&xdr, &hdr);
4108 if (status) 3965 if (status)
4109 goto out; 3966 goto out;
4110 status = decode_putfh(&xdr); 3967 status = decode_putfh(&xdr);
4111 if (status) 3968 if (status)
4112 goto out; 3969 goto out;
4113 status = decode_close(&xdr, res); 3970 status = decode_close(&xdr, res);
4114 if (status != 0) 3971 if (status != 0)
4115 goto out; 3972 goto out;
4116 /* 3973 /*
@@ -4121,7 +3978,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
4121 */ 3978 */
4122 decode_getfattr(&xdr, res->fattr, res->server); 3979 decode_getfattr(&xdr, res->fattr, res->server);
4123out: 3980out:
4124 return status; 3981 return status;
4125} 3982}
4126 3983
4127/* 3984/*
@@ -4129,23 +3986,23 @@ out:
4129 */ 3986 */
4130static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 3987static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
4131{ 3988{
4132 struct xdr_stream xdr; 3989 struct xdr_stream xdr;
4133 struct compound_hdr hdr; 3990 struct compound_hdr hdr;
4134 int status; 3991 int status;
4135 3992
4136 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3993 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4137 status = decode_compound_hdr(&xdr, &hdr); 3994 status = decode_compound_hdr(&xdr, &hdr);
4138 if (status) 3995 if (status)
4139 goto out; 3996 goto out;
4140 status = decode_putfh(&xdr); 3997 status = decode_putfh(&xdr);
4141 if (status) 3998 if (status)
4142 goto out; 3999 goto out;
4143 status = decode_savefh(&xdr); 4000 status = decode_savefh(&xdr);
4001 if (status)
4002 goto out;
4003 status = decode_open(&xdr, res);
4144 if (status) 4004 if (status)
4145 goto out; 4005 goto out;
4146 status = decode_open(&xdr, res);
4147 if (status)
4148 goto out;
4149 if (decode_getfh(&xdr, &res->fh) != 0) 4006 if (decode_getfh(&xdr, &res->fh) != 0)
4150 goto out; 4007 goto out;
4151 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) 4008 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
@@ -4154,7 +4011,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
4154 goto out; 4011 goto out;
4155 decode_getfattr(&xdr, res->dir_attr, res->server); 4012 decode_getfattr(&xdr, res->dir_attr, res->server);
4156out: 4013out:
4157 return status; 4014 return status;
4158} 4015}
4159 4016
4160/* 4017/*
@@ -4162,20 +4019,20 @@ out:
4162 */ 4019 */
4163static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) 4020static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
4164{ 4021{
4165 struct xdr_stream xdr; 4022 struct xdr_stream xdr;
4166 struct compound_hdr hdr; 4023 struct compound_hdr hdr;
4167 int status; 4024 int status;
4168 4025
4169 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4026 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4170 status = decode_compound_hdr(&xdr, &hdr); 4027 status = decode_compound_hdr(&xdr, &hdr);
4171 if (status) 4028 if (status)
4172 goto out; 4029 goto out;
4173 status = decode_putfh(&xdr); 4030 status = decode_putfh(&xdr);
4174 if (status) 4031 if (status)
4175 goto out; 4032 goto out;
4176 status = decode_open_confirm(&xdr, res); 4033 status = decode_open_confirm(&xdr, res);
4177out: 4034out:
4178 return status; 4035 return status;
4179} 4036}
4180 4037
4181/* 4038/*
@@ -4183,23 +4040,23 @@ out:
4183 */ 4040 */
4184static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 4041static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
4185{ 4042{
4186 struct xdr_stream xdr; 4043 struct xdr_stream xdr;
4187 struct compound_hdr hdr; 4044 struct compound_hdr hdr;
4188 int status; 4045 int status;
4189 4046
4190 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4047 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4191 status = decode_compound_hdr(&xdr, &hdr); 4048 status = decode_compound_hdr(&xdr, &hdr);
4192 if (status) 4049 if (status)
4193 goto out; 4050 goto out;
4194 status = decode_putfh(&xdr); 4051 status = decode_putfh(&xdr);
4195 if (status) 4052 if (status)
4196 goto out; 4053 goto out;
4197 status = decode_open(&xdr, res); 4054 status = decode_open(&xdr, res);
4198 if (status) 4055 if (status)
4199 goto out; 4056 goto out;
4200 decode_getfattr(&xdr, res->f_attr, res->server); 4057 decode_getfattr(&xdr, res->f_attr, res->server);
4201out: 4058out:
4202 return status; 4059 return status;
4203} 4060}
4204 4061
4205/* 4062/*
@@ -4207,25 +4064,25 @@ out:
4207 */ 4064 */
4208static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) 4065static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
4209{ 4066{
4210 struct xdr_stream xdr; 4067 struct xdr_stream xdr;
4211 struct compound_hdr hdr; 4068 struct compound_hdr hdr;
4212 int status; 4069 int status;
4213 4070
4214 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4071 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4215 status = decode_compound_hdr(&xdr, &hdr); 4072 status = decode_compound_hdr(&xdr, &hdr);
4216 if (status) 4073 if (status)
4217 goto out; 4074 goto out;
4218 status = decode_putfh(&xdr); 4075 status = decode_putfh(&xdr);
4219 if (status) 4076 if (status)
4220 goto out; 4077 goto out;
4221 status = decode_setattr(&xdr, res); 4078 status = decode_setattr(&xdr, res);
4222 if (status) 4079 if (status)
4223 goto out; 4080 goto out;
4224 status = decode_getfattr(&xdr, res->fattr, res->server); 4081 status = decode_getfattr(&xdr, res->fattr, res->server);
4225 if (status == NFS4ERR_DELAY) 4082 if (status == NFS4ERR_DELAY)
4226 status = 0; 4083 status = 0;
4227out: 4084out:
4228 return status; 4085 return status;
4229} 4086}
4230 4087
4231/* 4088/*
@@ -4421,8 +4278,6 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
4421 status = decode_putfh(&xdr); 4278 status = decode_putfh(&xdr);
4422 if (!status) 4279 if (!status)
4423 status = decode_fsinfo(&xdr, fsinfo); 4280 status = decode_fsinfo(&xdr, fsinfo);
4424 if (!status)
4425 status = nfs4_stat_to_errno(hdr.status);
4426 return status; 4281 return status;
4427} 4282}
4428 4283
@@ -4511,8 +4366,6 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
4511 status = decode_compound_hdr(&xdr, &hdr); 4366 status = decode_compound_hdr(&xdr, &hdr);
4512 if (!status) 4367 if (!status)
4513 status = decode_setclientid(&xdr, clp); 4368 status = decode_setclientid(&xdr, clp);
4514 if (!status)
4515 status = nfs4_stat_to_errno(hdr.status);
4516 return status; 4369 return status;
4517} 4370}
4518 4371
@@ -4533,8 +4386,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
4533 status = decode_putrootfh(&xdr); 4386 status = decode_putrootfh(&xdr);
4534 if (!status) 4387 if (!status)
4535 status = decode_fsinfo(&xdr, fsinfo); 4388 status = decode_fsinfo(&xdr, fsinfo);
4536 if (!status)
4537 status = nfs4_stat_to_errno(hdr.status);
4538 return status; 4389 return status;
4539} 4390}
4540 4391
@@ -4715,7 +4566,7 @@ nfs4_stat_to_errno(int stat)
4715 .p_replen = NFS4_##restype##_sz, \ 4566 .p_replen = NFS4_##restype##_sz, \
4716 .p_statidx = NFSPROC4_CLNT_##proc, \ 4567 .p_statidx = NFSPROC4_CLNT_##proc, \
4717 .p_name = #proc, \ 4568 .p_name = #proc, \
4718 } 4569}
4719 4570
4720struct rpc_procinfo nfs4_procedures[] = { 4571struct rpc_procinfo nfs4_procedures[] = {
4721 PROC(READ, enc_read, dec_read), 4572 PROC(READ, enc_read, dec_read),
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index d74d16ce0d49..d9ef602fbc5a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,6 +86,8 @@
86#include <net/ipconfig.h> 86#include <net/ipconfig.h>
87#include <linux/parser.h> 87#include <linux/parser.h>
88 88
89#include "internal.h"
90
89/* Define this to allow debugging output */ 91/* Define this to allow debugging output */
90#undef NFSROOT_DEBUG 92#undef NFSROOT_DEBUG
91#define NFSDBG_FACILITY NFSDBG_ROOT 93#define NFSDBG_FACILITY NFSDBG_ROOT
@@ -100,7 +102,7 @@ static char nfs_root_name[256] __initdata = "";
100static __be32 servaddr __initdata = 0; 102static __be32 servaddr __initdata = 0;
101 103
102/* Name of directory to mount */ 104/* Name of directory to mount */
103static char nfs_path[NFS_MAXPATHLEN] __initdata = { 0, }; 105static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, };
104 106
105/* NFS-related data */ 107/* NFS-related data */
106static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ 108static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -312,7 +314,7 @@ static int __init root_nfs_name(char *name)
312 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); 314 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
313 return -1; 315 return -1;
314 } 316 }
315 sprintf(nfs_path, buf, cp); 317 sprintf(nfs_export_path, buf, cp);
316 318
317 return 1; 319 return 1;
318} 320}
@@ -340,7 +342,7 @@ static int __init root_nfs_addr(void)
340static void __init root_nfs_print(void) 342static void __init root_nfs_print(void)
341{ 343{
342 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 344 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
343 nfs_path, nfs_data.hostname); 345 nfs_export_path, nfs_data.hostname);
344 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n", 346 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
345 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans); 347 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
346 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n", 348 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
@@ -485,18 +487,23 @@ static int __init root_nfs_get_handle(void)
485{ 487{
486 struct nfs_fh fh; 488 struct nfs_fh fh;
487 struct sockaddr_in sin; 489 struct sockaddr_in sin;
490 struct nfs_mount_request request = {
491 .sap = (struct sockaddr *)&sin,
492 .salen = sizeof(sin),
493 .dirpath = nfs_export_path,
494 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
495 NFS_MNT3_VERSION : NFS_MNT_VERSION,
496 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
497 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
498 .fh = &fh,
499 };
488 int status; 500 int status;
489 int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
490 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
491 int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
492 NFS_MNT3_VERSION : NFS_MNT_VERSION;
493 501
494 set_sockaddr(&sin, servaddr, htons(mount_port)); 502 set_sockaddr(&sin, servaddr, htons(mount_port));
495 status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, 503 status = nfs_mount(&request);
496 nfs_path, version, protocol, &fh);
497 if (status < 0) 504 if (status < 0)
498 printk(KERN_ERR "Root-NFS: Server returned error %d " 505 printk(KERN_ERR "Root-NFS: Server returned error %d "
499 "while mounting %s\n", status, nfs_path); 506 "while mounting %s\n", status, nfs_export_path);
500 else { 507 else {
501 nfs_data.root.size = fh.size; 508 nfs_data.root.size = fh.size;
502 memcpy(nfs_data.root.data, fh.data, fh.size); 509 memcpy(nfs_data.root.data, fh.data, fh.size);
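nfs_mount() now takes a single struct nfs_mount_request rather than seven positional arguments. A hedged caller sketch; the export path and address setup here are illustrative, not from the patch:

    struct nfs_fh fh;
    struct sockaddr_in sin;                 /* assume filled in elsewhere */
    struct nfs_mount_request request = {
            .sap      = (struct sockaddr *)&sin,
            .salen    = sizeof(sin),
            .dirpath  = "/export/root",     /* hypothetical path */
            .version  = NFS_MNT3_VERSION,
            .protocol = XPRT_TRANSPORT_TCP,
            .fh       = &fh,
    };
    int status = nfs_mount(&request);

Designated initializers zero every member left unset, so optional fields such as .hostname, and the .noresvport flag added in the super.c hunks below, can grow the struct without touching existing callers.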
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 40d17987d0e8..f856004bb7fa 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -533,12 +533,6 @@ readpage_async_filler(void *data, struct page *page)
533 unsigned int len; 533 unsigned int len;
534 int error; 534 int error;
535 535
536 error = nfs_wb_page(inode, page);
537 if (error)
538 goto out_unlock;
539 if (PageUptodate(page))
540 goto out_unlock;
541
542 len = nfs_page_length(page); 536 len = nfs_page_length(page);
543 if (len == 0) 537 if (len == 0)
544 return nfs_return_empty_page(page); 538 return nfs_return_empty_page(page);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bb0313ac9e1f..d6686f4786dc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -75,6 +75,7 @@ enum {
75 Opt_acl, Opt_noacl, 75 Opt_acl, Opt_noacl,
76 Opt_rdirplus, Opt_nordirplus, 76 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 77 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport,
78 79
79 /* Mount options that take integer arguments */ 80 /* Mount options that take integer arguments */
80 Opt_port, 81 Opt_port,
@@ -129,6 +130,8 @@ static const match_table_t nfs_mount_option_tokens = {
129 { Opt_nordirplus, "nordirplus" }, 130 { Opt_nordirplus, "nordirplus" },
130 { Opt_sharecache, "sharecache" }, 131 { Opt_sharecache, "sharecache" },
131 { Opt_nosharecache, "nosharecache" }, 132 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" },
132 135
133 { Opt_port, "port=%u" }, 136 { Opt_port, "port=%u" },
134 { Opt_rsize, "rsize=%u" }, 137 { Opt_rsize, "rsize=%u" },
@@ -512,7 +515,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
512 { NFS_MOUNT_NONLM, ",nolock", "" }, 515 { NFS_MOUNT_NONLM, ",nolock", "" },
513 { NFS_MOUNT_NOACL, ",noacl", "" }, 516 { NFS_MOUNT_NOACL, ",noacl", "" },
514 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, 517 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
515 { NFS_MOUNT_UNSHARED, ",nosharecache", ""}, 518 { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
519 { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
516 { 0, NULL, NULL } 520 { 0, NULL, NULL }
517 }; 521 };
518 const struct proc_nfs_info *nfs_infop; 522 const struct proc_nfs_info *nfs_infop;
@@ -1033,6 +1037,12 @@ static int nfs_parse_mount_options(char *raw,
1033 case Opt_nosharecache: 1037 case Opt_nosharecache:
1034 mnt->flags |= NFS_MOUNT_UNSHARED; 1038 mnt->flags |= NFS_MOUNT_UNSHARED;
1035 break; 1039 break;
1040 case Opt_resvport:
1041 mnt->flags &= ~NFS_MOUNT_NORESVPORT;
1042 break;
1043 case Opt_noresvport:
1044 mnt->flags |= NFS_MOUNT_NORESVPORT;
1045 break;
1036 1046
1037 /* 1047 /*
1038 * options that take numeric values 1048 * options that take numeric values
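The new resvport/noresvport pair follows the usual match_table_t pattern for boolean mount options: the token table maps the strings, and the parser flips NFS_MOUNT_NORESVPORT in mnt->flags. A condensed sketch of that plumbing; the enum values and surrounding parse loop are omitted:

    static const match_table_t tokens = {
            { Opt_resvport,   "resvport" },
            { Opt_noresvport, "noresvport" },
            { Opt_err,        NULL }
    };

    switch (match_token(p, tokens, args)) {
    case Opt_resvport:
            mnt->flags &= ~NFS_MOUNT_NORESVPORT;    /* default: reserved source port */
            break;
    case Opt_noresvport:
            mnt->flags |= NFS_MOUNT_NORESVPORT;     /* permit a non-privileged port */
            break;
    }

From userspace this surfaces as, for example, "mount -o noresvport server:/export /mnt", and the nfs_show_mount_options hunk above makes the flag visible again in /proc/mounts.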
@@ -1327,8 +1337,14 @@ out_security_failure:
1327static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1337static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1328 struct nfs_fh *root_fh) 1338 struct nfs_fh *root_fh)
1329{ 1339{
1330 struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address; 1340 struct nfs_mount_request request = {
1331 char *hostname; 1341 .sap = (struct sockaddr *)
1342 &args->mount_server.address,
1343 .dirpath = args->nfs_server.export_path,
1344 .protocol = args->mount_server.protocol,
1345 .fh = root_fh,
1346 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1347 };
1332 int status; 1348 int status;
1333 1349
1334 if (args->mount_server.version == 0) { 1350 if (args->mount_server.version == 0) {
@@ -1337,42 +1353,38 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1337 else 1353 else
1338 args->mount_server.version = NFS_MNT_VERSION; 1354 args->mount_server.version = NFS_MNT_VERSION;
1339 } 1355 }
1356 request.version = args->mount_server.version;
1340 1357
1341 if (args->mount_server.hostname) 1358 if (args->mount_server.hostname)
1342 hostname = args->mount_server.hostname; 1359 request.hostname = args->mount_server.hostname;
1343 else 1360 else
1344 hostname = args->nfs_server.hostname; 1361 request.hostname = args->nfs_server.hostname;
1345 1362
1346 /* 1363 /*
1347 * Construct the mount server's address. 1364 * Construct the mount server's address.
1348 */ 1365 */
1349 if (args->mount_server.address.ss_family == AF_UNSPEC) { 1366 if (args->mount_server.address.ss_family == AF_UNSPEC) {
1350 memcpy(sap, &args->nfs_server.address, 1367 memcpy(request.sap, &args->nfs_server.address,
1351 args->nfs_server.addrlen); 1368 args->nfs_server.addrlen);
1352 args->mount_server.addrlen = args->nfs_server.addrlen; 1369 args->mount_server.addrlen = args->nfs_server.addrlen;
1353 } 1370 }
1371 request.salen = args->mount_server.addrlen;
1354 1372
1355 /* 1373 /*
1356 * autobind will be used if mount_server.port == 0 1374 * autobind will be used if mount_server.port == 0
1357 */ 1375 */
1358 nfs_set_port(sap, args->mount_server.port); 1376 nfs_set_port(request.sap, args->mount_server.port);
1359 1377
1360 /* 1378 /*
1361 * Now ask the mount server to map our export path 1379 * Now ask the mount server to map our export path
1362 * to a file handle. 1380 * to a file handle.
1363 */ 1381 */
1364 status = nfs_mount(sap, 1382 status = nfs_mount(&request);
1365 args->mount_server.addrlen,
1366 hostname,
1367 args->nfs_server.export_path,
1368 args->mount_server.version,
1369 args->mount_server.protocol,
1370 root_fh);
1371 if (status == 0) 1383 if (status == 0)
1372 return 0; 1384 return 0;
1373 1385
1374 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", 1386 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1375 hostname, status); 1387 request.hostname, status);
1376 return status; 1388 return status;
1377} 1389}
1378 1390
@@ -2419,7 +2431,7 @@ static void nfs4_kill_super(struct super_block *sb)
2419{ 2431{
2420 struct nfs_server *server = NFS_SB(sb); 2432 struct nfs_server *server = NFS_SB(sb);
2421 2433
2422 nfs_return_all_delegations(sb); 2434 nfs_super_return_all_delegations(sb);
2423 kill_anon_super(sb); 2435 kill_anon_super(sb);
2424 2436
2425 nfs4_renewd_prepare_shutdown(server); 2437 nfs4_renewd_prepare_shutdown(server);
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index c11f5375d7c1..04133aacb1e5 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -29,8 +29,8 @@
29 29
30MODULE_LICENSE("GPL"); 30MODULE_LICENSE("GPL");
31 31
32EXPORT_SYMBOL(nfsacl_encode); 32EXPORT_SYMBOL_GPL(nfsacl_encode);
33EXPORT_SYMBOL(nfsacl_decode); 33EXPORT_SYMBOL_GPL(nfsacl_decode);
34 34
35struct nfsacl_encode_desc { 35struct nfsacl_encode_desc {
36 struct xdr_array2_desc desc; 36 struct xdr_array2_desc desc;
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b1acbd6ab6fb..b27451909dff 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags)
38 return ERR_PTR(error); 38 return ERR_PTR(error);
39 39
40 if (flags == O_RDWR) 40 if (flags == O_RDWR)
41 error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE); 41 error = may_open(&nd.path, MAY_READ|MAY_WRITE,
42 FMODE_READ|FMODE_WRITE);
42 else 43 else
43 error = may_open(&nd, MAY_WRITE, FMODE_WRITE); 44 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
44 45
45 if (!error) 46 if (!error)
46 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 47 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 0184fe9b514c..c903e04aa217 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
76 76
77 ret = set_groups(new, gi); 77 ret = set_groups(new, gi);
78 put_group_info(gi); 78 put_group_info(gi);
79 if (!ret) 79 if (ret < 0)
80 goto error; 80 goto error;
81 81
82 if (new->uid) 82 if (new->fsuid)
83 new->cap_effective = cap_drop_nfsd_set(new->cap_effective); 83 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
84 else 84 else
85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
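Two fixes in one hunk. First, set_groups() returns 0 on success and a negative errno on failure, so the old test "if (!ret) goto error;" bailed out precisely when the call had succeeded. Second, the capability adjustment should key off the filesystem uid this function has just remapped, not the untouched real uid. The corrected convention, annotated:

    ret = set_groups(new, gi);      /* 0 on success, -errno on failure */
    put_group_info(gi);
    if (ret < 0)
            goto error;

    if (new->fsuid)                 /* non-root fsuid: drop the nfsd caps */
            new->cap_effective = cap_drop_nfsd_set(new->cap_effective);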
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 094747a1227c..c464181b5994 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,9 +53,6 @@
53#define NFSPROC4_CB_NULL 0 53#define NFSPROC4_CB_NULL 0
54#define NFSPROC4_CB_COMPOUND 1 54#define NFSPROC4_CB_COMPOUND 1
55 55
56/* declarations */
57static const struct rpc_call_ops nfs4_cb_null_ops;
58
59/* Index of predefined Linux callback client operations */ 56/* Index of predefined Linux callback client operations */
60 57
61enum { 58enum {
@@ -358,6 +355,7 @@ static struct rpc_program cb_program = {
358 .nrvers = ARRAY_SIZE(nfs_cb_version), 355 .nrvers = ARRAY_SIZE(nfs_cb_version),
359 .version = nfs_cb_version, 356 .version = nfs_cb_version,
360 .stats = &cb_stats, 357 .stats = &cb_stats,
358 .pipe_dir_name = "/nfsd4_cb",
361}; 359};
362 360
363/* Reference counting, callback cleanup, etc., all look racy as heck. 361/* Reference counting, callback cleanup, etc., all look racy as heck.
@@ -382,8 +380,9 @@ static int do_probe_callback(void *data)
382 .program = &cb_program, 380 .program = &cb_program,
383 .prognumber = cb->cb_prog, 381 .prognumber = cb->cb_prog,
384 .version = nfs_cb_version[1]->number, 382 .version = nfs_cb_version[1]->number,
385 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 383 .authflavor = clp->cl_flavor,
386 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 .client_name = clp->cl_principal,
387 }; 386 };
388 struct rpc_message msg = { 387 struct rpc_message msg = {
389 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 388 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
@@ -392,6 +391,11 @@ static int do_probe_callback(void *data)
392 struct rpc_clnt *client; 391 struct rpc_clnt *client;
393 int status; 392 int status;
394 393
394 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
395 status = nfserr_cb_path_down;
396 goto out_err;
397 }
398
395 /* Initialize address */ 399 /* Initialize address */
396 memset(&addr, 0, sizeof(addr)); 400 memset(&addr, 0, sizeof(addr));
397 addr.sin_family = AF_INET; 401 addr.sin_family = AF_INET;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 669461e291ae..9fa60a3ad48c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -946,6 +946,11 @@ encode_op:
946 nfsd4_encode_operation(resp, op); 946 nfsd4_encode_operation(resp, op);
947 status = op->status; 947 status = op->status;
948 } 948 }
949
950 dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
951 args->ops, args->opcnt, resp->opcnt, op->opnum,
952 be32_to_cpu(status));
953
949 if (cstate->replay_owner) { 954 if (cstate->replay_owner) {
950 nfs4_put_stateowner(cstate->replay_owner); 955 nfs4_put_stateowner(cstate->replay_owner);
951 cstate->replay_owner = NULL; 956 cstate->replay_owner = NULL;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 0f9d6efaa62b..74f7b67567fd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
116 116
117 md5_to_hex(dname, cksum.data); 117 md5_to_hex(dname, cksum.data);
118 118
119 kfree(cksum.data);
120 status = nfs_ok; 119 status = nfs_ok;
121out: 120out:
121 kfree(cksum.data);
122 crypto_free_hash(desc.tfm); 122 crypto_free_hash(desc.tfm);
123out_no_tfm: 123out_no_tfm:
124 return status; 124 return status;
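The nfs4recover.c change is a one-line leak fix: cksum.data was previously freed only on the success path, so any failure that jumped to the out label leaked the digest buffer. Moving the kfree() below the label restores the standard goto-out cleanup idiom, schematically:

    status = some_step();           /* any helper that can fail */
    if (status)
            goto out;               /* error path no longer leaks */
    status = nfs_ok;
    out:
            kfree(cksum.data);      /* now runs on success and error alike */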
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bf4cd46a5a11..88db7d3ec120 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -54,6 +54,7 @@
54#include <linux/mutex.h> 54#include <linux/mutex.h>
55#include <linux/lockd/bind.h> 55#include <linux/lockd/bind.h>
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h>
57 58
58#define NFSDDBG_FACILITY NFSDDBG_PROC 59#define NFSDDBG_FACILITY NFSDDBG_PROC
59 60
@@ -377,6 +378,7 @@ free_client(struct nfs4_client *clp)
377 shutdown_callback_client(clp); 378 shutdown_callback_client(clp);
378 if (clp->cl_cred.cr_group_info) 379 if (clp->cl_cred.cr_group_info)
379 put_group_info(clp->cl_cred.cr_group_info); 380 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal);
380 kfree(clp->cl_name.data); 382 kfree(clp->cl_name.data);
381 kfree(clp); 383 kfree(clp);
382} 384}
@@ -696,6 +698,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
696 unsigned int strhashval; 698 unsigned int strhashval;
697 struct nfs4_client *conf, *unconf, *new; 699 struct nfs4_client *conf, *unconf, *new;
698 __be32 status; 700 __be32 status;
701 char *princ;
699 char dname[HEXDIR_LEN]; 702 char dname[HEXDIR_LEN];
700 703
701 if (!check_name(clname)) 704 if (!check_name(clname))
@@ -783,6 +786,15 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
783 } 786 }
784 copy_verf(new, &clverifier); 787 copy_verf(new, &clverifier);
785 new->cl_addr = sin->sin_addr.s_addr; 788 new->cl_addr = sin->sin_addr.s_addr;
789 new->cl_flavor = rqstp->rq_flavor;
790 princ = svc_gss_principal(rqstp);
791 if (princ) {
792 new->cl_principal = kstrdup(princ, GFP_KERNEL);
793 if (new->cl_principal == NULL) {
794 free_client(new);
795 goto out;
796 }
797 }
786 copy_cred(&new->cl_cred, &rqstp->rq_cred); 798 copy_cred(&new->cl_cred, &rqstp->rq_cred);
787 gen_confirm(new); 799 gen_confirm(new);
788 gen_callback(new, setclid); 800 gen_callback(new, setclid);
@@ -2404,6 +2416,26 @@ out:
2404#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) 2416#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
2405#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) 2417#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
2406 2418
2419static inline u64
2420end_offset(u64 start, u64 len)
2421{
2422 u64 end;
2423
2424 end = start + len;
2425 return end >= start ? end: NFS4_MAX_UINT64;
2426}
2427
2428/* last octet in a range */
2429static inline u64
2430last_byte_offset(u64 start, u64 len)
2431{
2432 u64 end;
2433
2434 BUG_ON(!len);
2435 end = start + len;
2436 return end > start ? end - 1: NFS4_MAX_UINT64;
2437}
2438
2407#define lockownerid_hashval(id) \ 2439#define lockownerid_hashval(id) \
2408 ((id) & LOCK_HASH_MASK) 2440 ((id) & LOCK_HASH_MASK)
2409 2441
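NFSv4 expresses a lock range as an (offset, length) pair where a length of all ones means "to end of file", so naive offset + length arithmetic can wrap around. end_offset() clamps a wrapped sum to NFS4_MAX_UINT64, and last_byte_offset() does the same for the inclusive end that struct file_lock wants; the lock, lockt and locku hunks below then set fl_end in a single line. Worked values, illustrative only:

    end_offset(100, 10);                    /* == 110: first byte past the range */
    last_byte_offset(100, 10);              /* == 109: bytes 100..109 inclusive */
    last_byte_offset(8, NFS4_MAX_UINT64);   /* sum wraps, so clamped to
                                               NFS4_MAX_UINT64: lock runs to EOF */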
@@ -2423,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
2423static struct nfs4_stateid * 2455static struct nfs4_stateid *
2424find_stateid(stateid_t *stid, int flags) 2456find_stateid(stateid_t *stid, int flags)
2425{ 2457{
2426 struct nfs4_stateid *local = NULL; 2458 struct nfs4_stateid *local;
2427 u32 st_id = stid->si_stateownerid; 2459 u32 st_id = stid->si_stateownerid;
2428 u32 f_id = stid->si_fileid; 2460 u32 f_id = stid->si_fileid;
2429 unsigned int hashval; 2461 unsigned int hashval;
2430 2462
2431 dprintk("NFSD: find_stateid flags 0x%x\n",flags); 2463 dprintk("NFSD: find_stateid flags 0x%x\n",flags);
2432 if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { 2464 if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
2433 hashval = stateid_hashval(st_id, f_id); 2465 hashval = stateid_hashval(st_id, f_id);
2434 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { 2466 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
2435 if ((local->st_stateid.si_stateownerid == st_id) && 2467 if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2437,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags)
2437 return local; 2469 return local;
2438 } 2470 }
2439 } 2471 }
2440 if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { 2472
2473 if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
2441 hashval = stateid_hashval(st_id, f_id); 2474 hashval = stateid_hashval(st_id, f_id);
2442 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { 2475 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
2443 if ((local->st_stateid.si_stateownerid == st_id) && 2476 if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2506,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
2506 deny->ld_clientid.cl_id = 0; 2539 deny->ld_clientid.cl_id = 0;
2507 } 2540 }
2508 deny->ld_start = fl->fl_start; 2541 deny->ld_start = fl->fl_start;
2509 deny->ld_length = ~(u64)0; 2542 deny->ld_length = NFS4_MAX_UINT64;
2510 if (fl->fl_end != ~(u64)0) 2543 if (fl->fl_end != NFS4_MAX_UINT64)
2511 deny->ld_length = fl->fl_end - fl->fl_start + 1; 2544 deny->ld_length = fl->fl_end - fl->fl_start + 1;
2512 deny->ld_type = NFS4_READ_LT; 2545 deny->ld_type = NFS4_READ_LT;
2513 if (fl->fl_type != F_RDLCK) 2546 if (fl->fl_type != F_RDLCK)
@@ -2604,7 +2637,7 @@ out:
2604static int 2637static int
2605check_lock_length(u64 offset, u64 length) 2638check_lock_length(u64 offset, u64 length)
2606{ 2639{
2607 return ((length == 0) || ((length != ~(u64)0) && 2640 return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
2608 LOFF_OVERFLOW(offset, length))); 2641 LOFF_OVERFLOW(offset, length)));
2609} 2642}
2610 2643
@@ -2724,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2724 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2757 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2725 2758
2726 file_lock.fl_start = lock->lk_offset; 2759 file_lock.fl_start = lock->lk_offset;
2727 if ((lock->lk_length == ~(u64)0) || 2760 file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
2728 LOFF_OVERFLOW(lock->lk_offset, lock->lk_length))
2729 file_lock.fl_end = ~(u64)0;
2730 else
2731 file_lock.fl_end = lock->lk_offset + lock->lk_length - 1;
2732 nfs4_transform_lock_offset(&file_lock); 2761 nfs4_transform_lock_offset(&file_lock);
2733 2762
2734 /* 2763 /*
@@ -2769,6 +2798,25 @@ out:
2769} 2798}
2770 2799
2771/* 2800/*
2801 * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
2802 * so we do a temporary open here just to get an open file to pass to
2803 * vfs_test_lock. (Arguably perhaps test_lock should be done with an
2804 * inode operation.)
2805 */
2806static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
2807{
2808 struct file *file;
2809 int err;
2810
2811 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
2812 if (err)
2813 return err;
2814 err = vfs_test_lock(file, lock);
2815 nfsd_close(file);
2816 return err;
2817}
2818
2819/*
2772 * LOCKT operation 2820 * LOCKT operation
2773 */ 2821 */
2774__be32 2822__be32
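A plausible caller flow for the new helper (assumed here; the conflict handling is not part of this hunk): vfs_test_lock() leaves fl_type set to F_UNLCK when nothing conflicts, so LOCKT can turn a surviving lock description straight into a denial:

    error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
    if (error)
            return nfserrno(error);
    if (file_lock.fl_type != F_UNLCK)       /* a conflicting lock exists */
            nfs4_set_lock_denied(&file_lock, &lockt->lt_denied);

The temporary nfsd_open()/nfsd_close() pair exists because vfs_test_lock() needs a genuine struct file, and a LOCKT client may never have sent an OPEN; the old trick of passing a zeroed struct file with only the dentry set is what the next hunk deletes.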
@@ -2776,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2776 struct nfsd4_lockt *lockt) 2824 struct nfsd4_lockt *lockt)
2777{ 2825{
2778 struct inode *inode; 2826 struct inode *inode;
2779 struct file file;
2780 struct file_lock file_lock; 2827 struct file_lock file_lock;
2781 int error; 2828 int error;
2782 __be32 status; 2829 __be32 status;
@@ -2827,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2827 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2874 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2828 2875
2829 file_lock.fl_start = lockt->lt_offset; 2876 file_lock.fl_start = lockt->lt_offset;
2830 if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) 2877 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
2831 file_lock.fl_end = ~(u64)0;
2832 else
2833 file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1;
2834 2878
2835 nfs4_transform_lock_offset(&file_lock); 2879 nfs4_transform_lock_offset(&file_lock);
2836 2880
2837 /* vfs_test_lock uses the struct file _only_ to resolve the inode.
2838 * since LOCKT doesn't require an OPEN, and therefore a struct
2839 * file may not exist, pass vfs_test_lock a struct file with
2840 * only the dentry:inode set.
2841 */
2842 memset(&file, 0, sizeof (struct file));
2843 file.f_path.dentry = cstate->current_fh.fh_dentry;
2844
2845 status = nfs_ok; 2881 status = nfs_ok;
2846 error = vfs_test_lock(&file, &file_lock); 2882 error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
2847 if (error) { 2883 if (error) {
2848 status = nfserrno(error); 2884 status = nfserrno(error);
2849 goto out; 2885 goto out;
@@ -2894,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2894 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2930 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2895 file_lock.fl_start = locku->lu_offset; 2931 file_lock.fl_start = locku->lu_offset;
2896 2932
2897 if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) 2933 file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length);
2898 file_lock.fl_end = ~(u64)0;
2899 else
2900 file_lock.fl_end = locku->lu_offset + locku->lu_length - 1;
2901 nfs4_transform_lock_offset(&file_lock); 2934 nfs4_transform_lock_offset(&file_lock);
2902 2935
2903 /* 2936 /*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index afcdf4b76843..f65953be39c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfs/nfs4xdr.c
3 *
4 * Server-side XDR for NFSv4 2 * Server-side XDR for NFSv4
5 * 3 *
6 * Copyright (c) 2002 The Regents of the University of Michigan. 4 * Copyright (c) 2002 The Regents of the University of Michigan.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77d7b8c531a6..3d93b2064ce5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size);
84static ssize_t write_getfd(struct file *file, char *buf, size_t size); 84static ssize_t write_getfd(struct file *file, char *buf, size_t size);
85static ssize_t write_getfs(struct file *file, char *buf, size_t size); 85static ssize_t write_getfs(struct file *file, char *buf, size_t size);
86static ssize_t write_filehandle(struct file *file, char *buf, size_t size); 86static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
87static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
88static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
87static ssize_t write_threads(struct file *file, char *buf, size_t size); 89static ssize_t write_threads(struct file *file, char *buf, size_t size);
88static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); 90static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
89static ssize_t write_versions(struct file *file, char *buf, size_t size); 91static ssize_t write_versions(struct file *file, char *buf, size_t size);
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
-
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Svc] = write_svc,
 	[NFSD_Add] = write_add,
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Getfd] = write_getfd,
 	[NFSD_Getfs] = write_getfs,
 	[NFSD_Fh] = write_filehandle,
-	[NFSD_FO_UnlockIP] = failover_unlock_ip,
-	[NFSD_FO_UnlockFS] = failover_unlock_fs,
+	[NFSD_FO_UnlockIP] = write_unlock_ip,
+	[NFSD_FO_UnlockFS] = write_unlock_fs,
 	[NFSD_Threads] = write_threads,
 	[NFSD_Pool_Threads] = write_pool_threads,
 	[NFSD_Versions] = write_versions,
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = {
 /*----------------------------------------------------------------------------*/
 /*
  * payload - write methods
- * If the method has a response, the response should be put in buf,
- * and the length returned.  Otherwise return 0 or and -error.
  */
 
+/**
+ * write_svc - Start kernel's NFSD server
+ *
+ * Deprecated.  /proc/fs/nfsd/threads is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_svc
+ *					svc_port:	port number of this
+ *							server's listener
+ *					svc_nthreads:	number of threads to start
+ *			size:		size in bytes of passed in nfsctl_svc
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_svc(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_svc *data;
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size)
 	return nfsd_svc(data->svc_port, data->svc_nthreads);
 }
 
+/**
+ * write_add - Add or modify client entry in auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_client
+ *					cl_ident:	'\0'-terminated C string
+ *							containing domain name
+ *							of client
+ *					cl_naddr:	no. of items in cl_addrlist
+ *					cl_addrlist:	array of client addresses
+ *					cl_fhkeytype:	ignored
+ *					cl_fhkeylen:	ignored
+ *					cl_fhkey:	ignored
+ *			size:		size in bytes of passed in nfsctl_client
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_add(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_client *data;
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size)
 	return exp_addclient(data);
 }
 
+/**
+ * write_del - Remove client from auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_client
+ *					cl_ident:	'\0'-terminated C string
+ *							containing domain name
+ *							of client
+ *					cl_naddr:	ignored
+ *					cl_addrlist:	ignored
+ *					cl_fhkeytype:	ignored
+ *					cl_fhkeylen:	ignored
+ *					cl_fhkey:	ignored
+ *			size:		size in bytes of passed in nfsctl_client
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_del(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_client *data;
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size)
 	return exp_delclient(data);
 }
 
+/**
+ * write_export - Export part or all of a local file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_export
+ *					ex_client:	'\0'-terminated C string
+ *							containing domain name
+ *							of client allowed to access
+ *							this export
+ *					ex_path:	'\0'-terminated C string
+ *							containing pathname of
+ *							directory in local file system
+ *					ex_dev:		fsid to use for this export
+ *					ex_ino:		ignored
+ *					ex_flags:	export flags for this export
+ *					ex_anon_uid:	UID to use for anonymous
+ *							requests
+ *					ex_anon_gid:	GID to use for anonymous
+ *							requests
+ *			size:		size in bytes of passed in nfsctl_export
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_export(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_export *data;
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size)
 	return exp_export(data);
 }
 
+/**
+ * write_unexport - Unexport a previously exported file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_export
+ *					ex_client:	'\0'-terminated C string
+ *							containing domain name
+ *							of client no longer allowed
+ *							to access this export
+ *					ex_path:	'\0'-terminated C string
+ *							containing pathname of
+ *							directory in local file system
+ *					ex_dev:		ignored
+ *					ex_ino:		ignored
+ *					ex_flags:	ignored
+ *					ex_anon_uid:	ignored
+ *					ex_anon_gid:	ignored
+ *			size:		size in bytes of passed in nfsctl_export
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_export *data;
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 	return exp_unexport(data);
 }
 
+/**
+ * write_getfs - Get a variable-length NFS file handle by path
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_fsparm
+ *					gd_addr:	socket address of client
+ *					gd_path:	'\0'-terminated C string
+ *							containing pathname of
+ *							directory in local file system
+ *					gd_maxlen:	maximum size of returned file
+ *							handle
+ *			size:		size in bytes of passed in nfsctl_fsparm
+ * Output:
+ *	On success:	passed-in buffer filled with a knfsd_fh structure
+ *			(a variable-length raw NFS file handle);
+ *			return code is the size in bytes of the file handle
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_fsparm *data;
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 	return err;
 }
 
+/**
+ * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_fdparm
+ *					gd_addr:	socket address of client
+ *					gd_path:	'\0'-terminated C string
+ *							containing pathname of
+ *							directory in local file system
+ *					gd_version:	fdparm structure version
+ *			size:		size in bytes of passed in nfsctl_fdparm
+ * Output:
+ *	On success:	passed-in buffer filled with nfsctl_res
+ *			(a fixed-length raw NFS file handle);
+ *			return code is the size in bytes of the file handle
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_fdparm *data;
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 	return err;
 }
 
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_ip - Release all locks used by a client
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:		'\n'-terminated C string containing a
+ *					presentation format IPv4 address
+ *			size:		length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in
+ */
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
 	struct sockaddr_in sin = {
 		.sin_family = AF_INET,
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
 	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
 }
 
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_fs - Release all locks on a local file system
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:		'\n'-terminated C string containing the
+ *					absolute pathname of a local file system
+ *			size:		length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ */
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 {
 	struct path path;
 	char *fo_path;
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
 	if (error)
 		return error;
 
+	/*
+	 * XXX: Needs better sanity checking.  Otherwise we could end up
+	 * releasing locks on the wrong file system.
+	 *
+	 * For example:
+	 * 1.  Does the path refer to a directory?
+	 * 2.  Is that directory a mount point, or
+	 * 3.  Is that directory the root of an exported file system?
+	 */
 	error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
 
 	path_put(&path);
 	return error;
 }
 
+/**
+ * write_filehandle - Get a variable-length NFS file handle by path
+ *
+ * On input, the buffer contains a '\n'-terminated C string comprised of
+ * three alphanumeric words separated by whitespace.  The string may
+ * contain escape sequences.
+ *
+ * Input:
+ *			buf:
+ *					domain:		client domain name
+ *					path:		export pathname
+ *					maxsize:	numeric maximum size of
+ *							@buf
+ *			size:		length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing a ASCII hex text version
+ *			of the NFS file handle;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 {
-	/* request is:
-	 *   domain path maxsize
-	 * response is
-	 *   filehandle
-	 *
-	 * qword quoting is used, so filehandle will be \x....
-	 */
 	char *dname, *path;
 	int uninitialized_var(maxsize);
 	char *mesg = buf;
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 
 	dname = mesg;
 	len = qword_get(&mesg, dname, size);
-	if (len <= 0) return -EINVAL;
+	if (len <= 0)
+		return -EINVAL;
 
 	path = dname+len+1;
 	len = qword_get(&mesg, path, size);
-	if (len <= 0) return -EINVAL;
+	if (len <= 0)
+		return -EINVAL;
 
 	len = get_int(&mesg, &maxsize);
 	if (len)
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 	if (len)
 		return len;
 
-	mesg = buf; len = SIMPLE_TRANSACTION_LIMIT;
+	mesg = buf;
+	len = SIMPLE_TRANSACTION_LIMIT;
 	qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
 	mesg[-1] = '\n';
 	return mesg - buf;
 }
 
+/**
+ * write_threads - Start NFSD, or report the current number of running threads
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the
+ *					number of NFSD threads to start
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_threads(struct file *file, char *buf, size_t size)
 {
-	/* if size > 0, look for a number of threads and call nfsd_svc
-	 * then write out number of threads as reply
-	 */
 	char *mesg = buf;
 	int rv;
 	if (size > 0) {
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 		rv = get_int(&mesg, &newthreads);
 		if (rv)
 			return rv;
-		if (newthreads <0)
+		if (newthreads < 0)
 			return -EINVAL;
-		rv = nfsd_svc(2049, newthreads);
+		rv = nfsd_svc(NFS_PORT, newthreads);
 		if (rv)
 			return rv;
 	}
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_pool_threads - Set or report the current number of threads per pool
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing whitespace-
+ *					separated unsigned integer values
+ *					representing the number of NFSD
+ *					threads to start in each pool
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing integer values representing the
+ *			number of NFSD threads in each pool;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 {
 	/* if size > 0, look for an array of number of threads per node
@@ -517,10 +780,6 @@ out_free:
 
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
-	/*
-	 * Format:
-	 *	[-/+]vers [-/+]vers ...
-	 */
 	char *mesg = buf;
 	char *vers, sign;
 	int len, num;
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 	return len;
 }
 
+/**
+ * write_versions - Set or report the available NFS protocol versions
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing positive or negative integer
+ *			values representing the current status of each
+ *			protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing whitespace-
+ *					separated positive or negative
+ *					integer values representing NFS
+ *					protocol versions to enable ("+n")
+ *					or disable ("-n")
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	status of zero or more protocol versions has
+ *			been updated; passed-in buffer filled with
+ *			'\n'-terminated C string containing positive
+ *			or negative integer values representing the
+ *			current status of each protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_versions(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 	return -EINVAL;
 }
 
+/**
+ * write_ports - Pass a socket file descriptor or transport name to listen on
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with a '\n'-terminated C
+ *			string containing a whitespace-separated list of
+ *			named NFSD listeners;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing a bound
+ *					but unconnected socket that is to be
+ *					used as an NFSD listener
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique alphanumeric name of
+ *			the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a "-" followed
+ *					by an integer value representing a
+ *					previously passed in socket file
+ *					descriptor
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service no longer listens on that socket;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique name of the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a transport
+ *					name and an unsigned integer value
+ *					representing the port to listen on,
+ *					separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service is started
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a "-" followed
+ *					by a transport name and an unsigned
+ *					integer value representing the port
+ *					to listen on, separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service no longer listens
+ *			on that transport
+ *	On error:	return code is a negative errno value
+ */
 static ssize_t write_ports(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 
 int nfsd_max_blksize;
 
+/**
+ * write_maxblksize - Set or report the current NFS blksize
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the new
+ *					NFS blksize
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of the current NFS blksize
+ *			setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_leasetime - Set or report the current NFSv4 lease time
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the new
+ *					NFSv4 lease expiry time
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing unsigned integer value of the
+ *			current lease expiry time;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_recoverydir - Set or report the pathname of the recovery directory
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing the pathname
+ *					of the directory on a local file
+ *					system containing permanent NFSv4
+ *					recovery data
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing the current recovery pathname setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
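
All of the kernel-doc comments added above describe the same transaction-style interface: userspace writes a request into a file under /proc/fs/nfsd and reads the reply back from the same descriptor. A hedged userspace sketch of that pattern against the threads file (assumes root privileges and a mounted nfsd filesystem; error handling kept minimal):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char reply[128];
		ssize_t n;
		int fd = open("/proc/fs/nfsd/threads", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, "8\n", 2) < 0)	/* ask for 8 nfsd threads */
			perror("write");
		n = read(fd, reply, sizeof(reply) - 1);	/* reply: thread count */
		if (n > 0) {
			reply[n] = '\0';
			printf("running threads: %s", reply);
		}
		close(fd);
		return 0;
	}
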
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f0da7d9c3a92..9f1ca17293d3 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -258,14 +258,32 @@ out:
 	return error;
 }
 
-/*
- * Perform sanity checks on the dentry in a client's file handle.
+/**
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
  *
- * Note that the file handle dentry may need to be freed even after
- * an error return.
+ * fh_verify() may be called multiple times on a given filehandle, for
+ * example, when processing an NFSv4 compound.  The first call will look
+ * up a dentry using the on-the-wire filehandle.  Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
  *
- * This is only called at the start of an nfsproc call, so fhp points to
- * a svc_fh which is all 0 except for the over-the-wire file handle.
+ * @type specifies the type of object expected using one of the S_IF*
+ * constants defined in include/linux/stat.h.  The caller may use zero
+ * to indicate that it doesn't care, or a negative integer to indicate
+ * that it expects something not of the given type.
+ *
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * include/linux/nfsd/nfsd.h.
  */
 __be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 			goto retry;
 			break;
 		}
+	} else if (exp->ex_flags & NFSEXP_FSID) {
+		fsid_type = FSID_NUM;
 	} else if (exp->ex_uuid) {
 		if (fhp->fh_maxsize >= 64) {
 			if (root_export)
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 			else
 				fsid_type = FSID_UUID4_INUM;
 		}
-	} else if (exp->ex_flags & NFSEXP_FSID)
-		fsid_type = FSID_NUM;
-	else if (!old_valid_dev(ex_dev))
+	} else if (!old_valid_dev(ex_dev))
 		/* for newer device numbers, we must use a newer fsid format */
 		fsid_type = FSID_ENCODE_DEV;
 	else
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cffeca7acef..6f7f26351227 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -622,6 +622,7 @@ nfserrno (int errno)
 	{ nfserr_badname, -ESRCH },
 	{ nfserr_io, -ETXTBSY },
 	{ nfserr_notsupp, -EOPNOTSUPP },
+	{ nfserr_toosmall, -ETOOSMALL },
 	};
 	int	i;
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b365..6e50aaa56ca2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
 
 	return err;
 }
-
 
 static int
 nfsd_sync(struct file *filp)
@@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	dirp = dentry->d_inode;
 
 	err = nfserr_notdir;
-	if(!dirp->i_op || !dirp->i_op->lookup)
+	if (!dirp->i_op->lookup)
 		goto out;
 	/*
 	 * Check whether the response file handle has been verified yet.
@@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	/* Get all the sanity checks out of the way before
 	 * we lock the parent. */
 	err = nfserr_notdir;
-	if(!dirp->i_op || !dirp->i_op->lookup)
+	if (!dirp->i_op->lookup)
 		goto out;
 	fh_lock_nested(fhp, I_MUTEX_PARENT);
 
@@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
 	inode = dentry->d_inode;
 
 	err = nfserr_inval;
-	if (!inode->i_op || !inode->i_op->readlink)
+	if (!inode->i_op->readlink)
 		goto out;
 
 	touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 	size_t size;
 	int error;
 
-	if (!IS_POSIXACL(inode) || !inode->i_op ||
+	if (!IS_POSIXACL(inode) ||
 	    !inode->i_op->setxattr || !inode->i_op->removexattr)
 		return -EOPNOTSUPP;
 	switch(type) {
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 000000000000..50914d7303c6
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,2 @@
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 000000000000..5a95b6010ce7
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,2 @@
+obj-y += dnotify/
+obj-y += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 000000000000..26adf5dfa646
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,10 @@
+config DNOTIFY
+	bool "Dnotify support"
+	default y
+	help
+	  Dnotify is a directory-based per-fd file change notification system
+	  that uses signals to communicate events to user-space.  There exist
+	  superior alternatives, but some applications may still rely on
+	  dnotify.
+
+	  If unsure, say Y.
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
new file mode 100644
index 000000000000..f145251dcadb
--- /dev/null
+++ b/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DNOTIFY) += dnotify.o
diff --git a/fs/dnotify.c b/fs/notify/dnotify/dnotify.c
index 676073b8dda5..b0aa2cde80bd 100644
--- a/fs/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	dn->dn_next = inode->i_dnotify;
 	inode->i_dnotify = dn;
 	spin_unlock(&inode->i_lock);
-
-	if (filp->f_op && filp->f_op->dir_notify)
-		return filp->f_op->dir_notify(filp, arg);
 	return 0;
 
 out_free:
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
new file mode 100644
index 000000000000..446792841023
--- /dev/null
+++ b/fs/notify/inotify/Kconfig
@@ -0,0 +1,27 @@
+config INOTIFY
+	bool "Inotify file change notification support"
+	default y
+	---help---
+	  Say Y here to enable inotify support.  Inotify is a file change
+	  notification system and a replacement for dnotify.  Inotify fixes
+	  numerous shortcomings in dnotify and introduces several new features
+	  including multiple file events, one-shot support, and unmount
+	  notification.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
+
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	depends on INOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
new file mode 100644
index 000000000000..e290f3bb9d8d
--- /dev/null
+++ b/fs/notify/inotify/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
diff --git a/fs/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..dae3f28f30d4 100644
--- a/fs/inotify.c
+++ b/fs/notify/inotify/inotify.c
diff --git a/fs/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e2425bbd871f..81b8644b0136 100644
--- a/fs/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -76,10 +76,10 @@ struct inotify_device {
 	struct mutex		ev_mutex;	/* protects event queue */
 	struct mutex		up_mutex;	/* synchronizes watch updates */
 	struct list_head	events;		/* list of queued events */
-	atomic_t		count;		/* reference count */
 	struct user_struct	*user;		/* user who opened this dev */
 	struct inotify_handle	*ih;		/* inotify handle */
 	struct fasync_struct	*fa;		/* async notification */
+	atomic_t		count;		/* reference count */
 	unsigned int		queue_size;	/* size of the queue (bytes) */
 	unsigned int		event_count;	/* number of pending events */
 	unsigned int		max_events;	/* maximum number of events */
@@ -704,7 +704,7 @@ fput_and_out:
 	return ret;
 }
 
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
 {
 	struct file *filp;
 	struct inotify_device *dev;
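
For context, inotify_rm_watch() is the userspace-facing wrapper for the syscall whose wd argument becomes signed here. A minimal sketch of the add/remove flow using only the long-standing libc wrappers:

	#include <stdio.h>
	#include <sys/inotify.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = inotify_init();
		int wd;

		if (fd < 0) {
			perror("inotify_init");
			return 1;
		}
		wd = inotify_add_watch(fd, "/tmp", IN_CREATE);
		if (wd < 0) {
			perror("inotify_add_watch");
			return 1;
		}
		/* ... read() struct inotify_event records from fd here ... */
		if (inotify_rm_watch(fd, wd) < 0)	/* wd is a signed value */
			perror("inotify_rm_watch");
		close(fd);
		return 0;
	}
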
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e2772..86bef156cf0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 		ni->allocated_size = sle64_to_cpu(
 				a->data.non_resident.allocated_size);
 	}
-	/* Setup the operations for this attribute inode. */
-	vi->i_op = NULL;
-	vi->i_fop = NULL;
 	if (NInoMstProtected(ni))
 		vi->i_mapping->a_ops = &ntfs_mst_aops;
 	else
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3c..01596079dd63 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
 	alloc.o			\
 	aops.o			\
+	blockcheck.o		\
 	buffer_head_io.o	\
 	dcache.o		\
 	dir.o			\
@@ -35,8 +36,14 @@ ocfs2-objs := \
 	sysfile.o		\
 	uptodate.o		\
 	ver.o			\
+	quota_local.o		\
+	quota_global.o		\
 	xattr.o
 
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
+
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 000000000000..12dfb44c22e5
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct posix_acl_entry))
+		return ERR_PTR(-EINVAL);
+
+	count = size / sizeof(struct posix_acl_entry);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		struct ocfs2_acl_entry *entry =
+			(struct ocfs2_acl_entry *)value;
+
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+		value += sizeof(struct posix_acl_entry);
+
+	}
+	return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+	struct ocfs2_acl_entry *entry = NULL;
+	char *ocfs2_acl;
+	size_t n;
+
+	*size = acl->a_count * sizeof(struct posix_acl_entry);
+
+	ocfs2_acl = kmalloc(*size, GFP_NOFS);
+	if (!ocfs2_acl)
+		return ERR_PTR(-ENOMEM);
+
+	entry = (struct ocfs2_acl_entry *)ocfs2_acl;
+	for (n = 0; n < acl->a_count; n++, entry++) {
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		entry->e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+	}
+	return ocfs2_acl;
+}
+
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+					      int type,
+					      struct buffer_head *di_bh)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+						"", value, retval);
+	}
+
+	if (retval > 0)
+		acl = ocfs2_acl_from_xattr(value, retval);
+	else if (retval == -ENODATA || retval == 0)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+
+	kfree(value);
+
+	return acl;
+}
+
+
+/*
+ * Get posix acl.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		acl = ERR_PTR(ret);
+		return acl;
+	}
+
+	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+			 struct inode *inode,
+			 struct buffer_head *di_bh,
+			 int type,
+			 struct posix_acl *acl,
+			 struct ocfs2_alloc_context *meta_ac,
+			 struct ocfs2_alloc_context *data_ac)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			ret = posix_acl_equiv_mode(acl, &mode);
+			if (ret < 0)
+				return ret;
+			else {
+				inode->i_mode = mode;
+				if (ret == 0)
+					acl = NULL;
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = ocfs2_acl_to_xattr(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	if (handle)
+		ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+					     "", value, size, 0,
+					     meta_ac, data_ac);
+	else
+		ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+	kfree(value);
+
+	return ret;
+}
+
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int ret = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return ret;
+	}
+
+	return -EAGAIN;
+}
+
+int ocfs2_acl_chmod(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl, *clone;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	ret = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!ret)
+		ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+				    clone, NULL, NULL);
+	posix_acl_release(clone);
+	return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+		   struct inode *inode,
+		   struct inode *dir,
+		   struct buffer_head *di_bh,
+		   struct buffer_head *dir_bh,
+		   struct ocfs2_alloc_context *meta_ac,
+		   struct ocfs2_alloc_context *data_ac)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl = NULL;
+	int ret = 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+						   dir_bh);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			ret = ocfs2_set_acl(handle, inode, di_bh,
+					    ACL_TYPE_DEFAULT, acl,
+					    meta_ac, data_ac);
+			if (ret)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		ret = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		mode = inode->i_mode;
+		ret = posix_acl_create_masq(clone, &mode);
+		if (ret >= 0) {
+			inode->i_mode = mode;
+			if (ret > 0) {
+				ret = ocfs2_set_acl(handle, inode,
+						    di_bh, ACL_TYPE_ACCESS,
+						    clone, meta_ac, data_ac);
+			}
+		}
+		posix_acl_release(clone);
+	}
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+					  char *list,
+					  size_t list_len,
+					  const char *name,
+					  size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+	return size;
+}
+
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+					   char *list,
+					   size_t list_len,
+					   const char *name,
+					   size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+	return size;
+}
+
+static int ocfs2_xattr_get_acl(struct inode *inode,
+			       int type,
+			       void *buffer,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ocfs2_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	ret = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+				      const char *name,
+				      void *buffer,
+				      size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+				       const char *name,
+				       void *buffer,
+				       size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int ocfs2_xattr_set_acl(struct inode *inode,
+			       int type,
+			       const void *value,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret = 0;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			ret = posix_acl_valid(acl);
+			if (ret)
+				goto cleanup;
+		}
+	} else
+		acl = NULL;
+
+	ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+				      const char *name,
+				      const void *value,
+				      size_t size,
+				      int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+				       const char *name,
+				       const void *value,
+				       size_t size,
+				       int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= ocfs2_xattr_list_acl_access,
+	.get	= ocfs2_xattr_get_acl_access,
+	.set	= ocfs2_xattr_set_acl_access,
+};
+
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= ocfs2_xattr_list_acl_default,
+	.get	= ocfs2_xattr_get_acl_default,
+	.set	= ocfs2_xattr_set_acl_default,
+};
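
The converters at the top of this file pin the on-disk byte order of each ACL entry with cpu_to_le*()/le*_to_cpu(). A standalone userspace sketch of the same round-trip, substituting the glibc <endian.h> helpers for the kernel macros (the struct mirrors ocfs2_acl_entry and is illustrative only):

	#include <assert.h>
	#include <endian.h>
	#include <stdint.h>

	/* Little-endian on-disk layout, modeled on struct ocfs2_acl_entry. */
	struct disk_acl_entry {
		uint16_t e_tag;
		uint16_t e_perm;
		uint32_t e_id;
	};

	int main(void)
	{
		struct disk_acl_entry d;
		uint16_t tag = 2, perm = 6;	/* illustrative values */
		uint32_t id = 1000;

		d.e_tag = htole16(tag);		/* cpu_to_le16() on write */
		d.e_perm = htole16(perm);
		d.e_id = htole32(id);

		/* le16_to_cpu()/le32_to_cpu() on read restore the values
		 * regardless of host byte order. */
		assert(le16toh(d.e_tag) == tag);
		assert(le16toh(d.e_perm) == perm);
		assert(le32toh(d.e_id) == id);
		return 0;
	}
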
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 000000000000..8f6389ed4da5
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+struct ocfs2_acl_entry {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+};
+
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+			  struct buffer_head *, struct buffer_head *,
+			  struct ocfs2_alloc_context *,
+			  struct ocfs2_alloc_context *);
+
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#define ocfs2_check_acl NULL
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+static inline int ocfs2_init_acl(handle_t *handle,
+				 struct inode *inode,
+				 struct inode *dir,
+				 struct buffer_head *di_bh,
+				 struct buffer_head *dir_bh,
+				 struct ocfs2_alloc_context *meta_ac,
+				 struct ocfs2_alloc_context *data_ac)
+{
+	return 0;
+}
+
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394c..d861096c9d81 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -36,6 +37,7 @@
 
 #include "alloc.h"
 #include "aops.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -46,6 +48,7 @@
 #include "file.h"
 #include "super.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 static int ocfs2_dinode_sanity_check(struct inode *inode,
 				     struct ocfs2_extent_tree *et)
 {
-	int ret = 0;
-	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *di = et->et_object;
 
 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
 
-	di = et->et_object;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has invalid path root",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	}
-
-	return ret;
+	return 0;
 }
 
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 
 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_value_root *xv = et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	et->et_root_el = &xv->xr_list;
+	et->et_root_el = &vb->vb_xv->xr_list;
 }
 
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					      u64 blkno)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	xv->xr_last_eb_blk = cpu_to_le64(blkno);
+	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
 
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *) et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	return le64_to_cpu(xv->xr_last_eb_blk);
+	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
 }
 
 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
 					      struct ocfs2_extent_tree *et,
 					      u32 clusters)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	le32_add_cpu(&xv->xr_clusters, clusters);
+	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 }
 
 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct inode *inode,
 				     struct buffer_head *bh,
+				     ocfs2_journal_access_func access,
 				     void *obj,
 				     struct ocfs2_extent_tree_operations *ops)
 {
 	et->et_ops = ops;
 	et->et_root_bh = bh;
+	et->et_root_journal_access = access;
 	if (!obj)
 		obj = (void *)bh->b_data;
 	et->et_object = obj;
324 struct inode *inode, 318 struct inode *inode,
325 struct buffer_head *bh) 319 struct buffer_head *bh)
326{ 320{
327 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); 321 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
322 NULL, &ocfs2_dinode_et_ops);
328} 323}
329 324
330void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 325void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
331 struct inode *inode, 326 struct inode *inode,
332 struct buffer_head *bh) 327 struct buffer_head *bh)
333{ 328{
334 __ocfs2_init_extent_tree(et, inode, bh, NULL, 329 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
335 &ocfs2_xattr_tree_et_ops); 330 NULL, &ocfs2_xattr_tree_et_ops);
336} 331}
337 332
338void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 333void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 struct inode *inode, 334 struct inode *inode,
340 struct buffer_head *bh, 335 struct ocfs2_xattr_value_buf *vb)
341 struct ocfs2_xattr_value_root *xv)
342{ 336{
343 __ocfs2_init_extent_tree(et, inode, bh, xv, 337 __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
344 &ocfs2_xattr_value_et_ops); 338 &ocfs2_xattr_value_et_ops);
345} 339}
346 340
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
362 et->et_ops->eo_update_clusters(inode, et, clusters); 356 et->et_ops->eo_update_clusters(inode, et, clusters);
363} 357}
364 358
359static inline int ocfs2_et_root_journal_access(handle_t *handle,
360 struct inode *inode,
361 struct ocfs2_extent_tree *et,
362 int type)
363{
364 return et->et_root_journal_access(handle, inode, et->et_root_bh,
365 type);
366}
367
365static inline int ocfs2_et_insert_check(struct inode *inode, 368static inline int ocfs2_et_insert_check(struct inode *inode,
366 struct ocfs2_extent_tree *et, 369 struct ocfs2_extent_tree *et,
367 struct ocfs2_extent_rec *rec) 370 struct ocfs2_extent_rec *rec)
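
Taken together, the new et_root_journal_access field and this helper replace direct ocfs2_journal_access() calls on the root buffer. A minimal sketch of the resulting caller pattern (example_dirty_root is hypothetical; the helper and flag names come from the hunks above):

static int example_dirty_root(handle_t *handle, struct inode *inode,
                              struct ocfs2_extent_tree *et)
{
        int ret;

        /* Dispatches to the access function stored at init time,
         * e.g. ocfs2_journal_access_di for a dinode-rooted tree. */
        ret = ocfs2_et_root_journal_access(handle, inode, et,
                                           OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret)
                mlog_errno(ret);
        return ret;
}
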
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
402#define OCFS2_MAX_PATH_DEPTH 5 405#define OCFS2_MAX_PATH_DEPTH 5
403 406
404struct ocfs2_path { 407struct ocfs2_path {
405 int p_tree_depth; 408 int p_tree_depth;
406 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 409 ocfs2_journal_access_func p_root_access;
410 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
407}; 411};
408 412
409#define path_root_bh(_path) ((_path)->p_node[0].bh) 413#define path_root_bh(_path) ((_path)->p_node[0].bh)
410#define path_root_el(_path) ((_path)->p_node[0].el) 414#define path_root_el(_path) ((_path)->p_node[0].el)
415#define path_root_access(_path)((_path)->p_root_access)
411#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 416#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
412#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 417#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
413#define path_num_items(_path) ((_path)->p_tree_depth + 1) 418#define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
440 */ 445 */
441 if (keep_root) 446 if (keep_root)
442 depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 447 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
448 else
449 path_root_access(path) = NULL;
443 450
444 path->p_tree_depth = depth; 451 path->p_tree_depth = depth;
445} 452}
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
465 472
466 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 473 BUG_ON(path_root_bh(dest) != path_root_bh(src));
467 BUG_ON(path_root_el(dest) != path_root_el(src)); 474 BUG_ON(path_root_el(dest) != path_root_el(src));
475 BUG_ON(path_root_access(dest) != path_root_access(src));
468 476
469 ocfs2_reinit_path(dest, 1); 477 ocfs2_reinit_path(dest, 1);
470 478
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
486 int i; 494 int i;
487 495
488 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 496 BUG_ON(path_root_bh(dest) != path_root_bh(src));
497 BUG_ON(path_root_access(dest) != path_root_access(src));
489 498
490 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 499 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
491 brelse(dest->p_node[i].bh); 500 brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
521} 530}
522 531
523static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 532static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
524 struct ocfs2_extent_list *root_el) 533 struct ocfs2_extent_list *root_el,
534 ocfs2_journal_access_func access)
525{ 535{
526 struct ocfs2_path *path; 536 struct ocfs2_path *path;
527 537
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
533 get_bh(root_bh); 543 get_bh(root_bh);
534 path_root_bh(path) = root_bh; 544 path_root_bh(path) = root_bh;
535 path_root_el(path) = root_el; 545 path_root_el(path) = root_el;
546 path_root_access(path) = access;
536 } 547 }
537 548
538 return path; 549 return path;
539} 550}
540 551
552static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
553{
554 return ocfs2_new_path(path_root_bh(path), path_root_el(path),
555 path_root_access(path));
556}
557
558static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
559{
560 return ocfs2_new_path(et->et_root_bh, et->et_root_el,
561 et->et_root_journal_access);
562}
563
564/*
565 * Journal the buffer at depth idx. Buffers at idx>0 are extent
566 * blocks; the buffer at idx 0 is journaled via the root_access function.
567 *
568 * I don't like the way this function's name looks next to
569 * ocfs2_journal_access_path(), but I don't have a better one.
570 */
571static int ocfs2_path_bh_journal_access(handle_t *handle,
572 struct inode *inode,
573 struct ocfs2_path *path,
574 int idx)
575{
576 ocfs2_journal_access_func access = path_root_access(path);
577
578 if (!access)
579 access = ocfs2_journal_access;
580
581 if (idx)
582 access = ocfs2_journal_access_eb;
583
584 return access(handle, inode, path->p_node[idx].bh,
585 OCFS2_JOURNAL_ACCESS_WRITE);
586}
587
541/* 588/*
542 * Convenience function to journal all components in a path. 589 * Convenience function to journal all components in a path.
543 */ 590 */
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
550 goto out; 597 goto out;
551 598
552 for(i = 0; i < path_num_items(path); i++) { 599 for(i = 0; i < path_num_items(path); i++) {
553 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 600 ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
554 OCFS2_JOURNAL_ACCESS_WRITE);
555 if (ret < 0) { 601 if (ret < 0) {
556 mlog_errno(ret); 602 mlog_errno(ret);
557 goto out; 603 goto out;
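
For orientation, a hedged sketch of how a caller now builds and journals a path; ocfs2_find_path() and ocfs2_free_path() are assumed to be the existing alloc.c helpers of those names, not part of this patch:

static int example_journal_whole_path(struct inode *inode, handle_t *handle,
                                      struct ocfs2_extent_tree *et, u32 cpos)
{
        int ret;
        struct ocfs2_path *path = ocfs2_new_path_from_et(et);

        if (!path)
                return -ENOMEM;

        ret = ocfs2_find_path(inode, path, cpos);       /* assumed helper */
        if (!ret)
                /* The root is journaled with the inherited p_root_access;
                 * deeper nodes with ocfs2_journal_access_eb(). */
                ret = ocfs2_journal_access_path(inode, handle, path);

        ocfs2_free_path(path);                          /* assumed helper */
        return ret;
}
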
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
686 int c_split_covers_rec; 732 int c_split_covers_rec;
687}; 733};
688 734
735static int ocfs2_validate_extent_block(struct super_block *sb,
736 struct buffer_head *bh)
737{
738 int rc;
739 struct ocfs2_extent_block *eb =
740 (struct ocfs2_extent_block *)bh->b_data;
741
742 mlog(0, "Validating extent block %llu\n",
743 (unsigned long long)bh->b_blocknr);
744
745 BUG_ON(!buffer_uptodate(bh));
746
747 /*
748 * If the ECC check fails, we return the error but otherwise
749 * leave the filesystem running. We know any error is
750 * local to this block.
751 */
752 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
753 if (rc) {
754 mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
755 (unsigned long long)bh->b_blocknr);
756 return rc;
757 }
758
759 /*
760 * Errors after here are fatal.
761 */
762
763 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
764 ocfs2_error(sb,
765 "Extent block #%llu has bad signature %.*s",
766 (unsigned long long)bh->b_blocknr, 7,
767 eb->h_signature);
768 return -EINVAL;
769 }
770
771 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
772 ocfs2_error(sb,
773 "Extent block #%llu has an invalid h_blkno "
774 "of %llu",
775 (unsigned long long)bh->b_blocknr,
776 (unsigned long long)le64_to_cpu(eb->h_blkno));
777 return -EINVAL;
778 }
779
780 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
781 ocfs2_error(sb,
782 "Extent block #%llu has an invalid "
783 "h_fs_generation of #%u",
784 (unsigned long long)bh->b_blocknr,
785 le32_to_cpu(eb->h_fs_generation));
786 return -EINVAL;
787 }
788
789 return 0;
790}
791
792int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
793 struct buffer_head **bh)
794{
795 int rc;
796 struct buffer_head *tmp = *bh;
797
798 rc = ocfs2_read_block(inode, eb_blkno, &tmp,
799 ocfs2_validate_extent_block);
800
801 /* If ocfs2_read_block() got us a new bh, pass it up. */
802 if (!rc && !*bh)
803 *bh = tmp;
804
805 return rc;
806}
807
808
689/* 809/*
690 * How many free extents have we got before we need more meta data? 810 * How many free extents have we got before we need more meta data?
691 */ 811 */
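
With validation folded into the read path, callers no longer hand-check the signature. A minimal sketch of the new pattern (example_read_eb is hypothetical; the read call is quoted from the hunk above):

static int example_read_eb(struct inode *inode, u64 blkno)
{
        struct buffer_head *bh = NULL;  /* NULL: let the read allocate it */
        int ret;

        ret = ocfs2_read_extent_block(inode, blkno, &bh);
        if (ret) {
                mlog_errno(ret);
                return ret;
        }

        /* bh->b_data now holds a signature-, blkno-, generation-, and
         * ECC-checked struct ocfs2_extent_block. */
        brelse(bh);
        return 0;
}
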
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
705 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 825 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
706 826
707 if (last_eb_blk) { 827 if (last_eb_blk) {
708 retval = ocfs2_read_block(inode, last_eb_blk, 828 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
709 &eb_bh);
710 if (retval < 0) { 829 if (retval < 0) {
711 mlog_errno(retval); 830 mlog_errno(retval);
712 goto bail; 831 goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
768 } 887 }
769 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 888 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
770 889
771 status = ocfs2_journal_access(handle, inode, bhs[i], 890 status = ocfs2_journal_access_eb(handle, inode, bhs[i],
772 OCFS2_JOURNAL_ACCESS_CREATE); 891 OCFS2_JOURNAL_ACCESS_CREATE);
773 if (status < 0) { 892 if (status < 0) {
774 mlog_errno(status); 893 mlog_errno(status);
775 goto bail; 894 goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
908 for(i = 0; i < new_blocks; i++) { 1027 for(i = 0; i < new_blocks; i++) {
909 bh = new_eb_bhs[i]; 1028 bh = new_eb_bhs[i];
910 eb = (struct ocfs2_extent_block *) bh->b_data; 1029 eb = (struct ocfs2_extent_block *) bh->b_data;
911 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1030 /* ocfs2_create_new_meta_bhs() should create it right! */
912 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1031 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
913 status = -EIO;
914 goto bail;
915 }
916 eb_el = &eb->h_list; 1032 eb_el = &eb->h_list;
917 1033
918 status = ocfs2_journal_access(handle, inode, bh, 1034 status = ocfs2_journal_access_eb(handle, inode, bh,
919 OCFS2_JOURNAL_ACCESS_CREATE); 1035 OCFS2_JOURNAL_ACCESS_CREATE);
920 if (status < 0) { 1036 if (status < 0) {
921 mlog_errno(status); 1037 mlog_errno(status);
922 goto bail; 1038 goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
955 * journal_dirty erroring as it won't unless we've aborted the 1071 * journal_dirty erroring as it won't unless we've aborted the
956 * handle (in which case we would never be here) so reserving 1072 * handle (in which case we would never be here) so reserving
957 * the write with journal_access is all we need to do. */ 1073 * the write with journal_access is all we need to do. */
958 status = ocfs2_journal_access(handle, inode, *last_eb_bh, 1074 status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
959 OCFS2_JOURNAL_ACCESS_WRITE); 1075 OCFS2_JOURNAL_ACCESS_WRITE);
960 if (status < 0) { 1076 if (status < 0) {
961 mlog_errno(status); 1077 mlog_errno(status);
962 goto bail; 1078 goto bail;
963 } 1079 }
964 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1080 status = ocfs2_et_root_journal_access(handle, inode, et,
965 OCFS2_JOURNAL_ACCESS_WRITE); 1081 OCFS2_JOURNAL_ACCESS_WRITE);
966 if (status < 0) { 1082 if (status < 0) {
967 mlog_errno(status); 1083 mlog_errno(status);
968 goto bail; 1084 goto bail;
969 } 1085 }
970 if (eb_bh) { 1086 if (eb_bh) {
971 status = ocfs2_journal_access(handle, inode, eb_bh, 1087 status = ocfs2_journal_access_eb(handle, inode, eb_bh,
972 OCFS2_JOURNAL_ACCESS_WRITE); 1088 OCFS2_JOURNAL_ACCESS_WRITE);
973 if (status < 0) { 1089 if (status < 0) {
974 mlog_errno(status); 1090 mlog_errno(status);
975 goto bail; 1091 goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1052 } 1168 }
1053 1169
1054 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 1170 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1055 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1171 /* ocfs2_create_new_meta_bhs() should create it right! */
1056 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1172 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1057 status = -EIO;
1058 goto bail;
1059 }
1060 1173
1061 eb_el = &eb->h_list; 1174 eb_el = &eb->h_list;
1062 root_el = et->et_root_el; 1175 root_el = et->et_root_el;
1063 1176
1064 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1177 status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
1065 OCFS2_JOURNAL_ACCESS_CREATE); 1178 OCFS2_JOURNAL_ACCESS_CREATE);
1066 if (status < 0) { 1179 if (status < 0) {
1067 mlog_errno(status); 1180 mlog_errno(status);
1068 goto bail; 1181 goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1080 goto bail; 1193 goto bail;
1081 } 1194 }
1082 1195
1083 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1196 status = ocfs2_et_root_journal_access(handle, inode, et,
1084 OCFS2_JOURNAL_ACCESS_WRITE); 1197 OCFS2_JOURNAL_ACCESS_WRITE);
1085 if (status < 0) { 1198 if (status < 0) {
1086 mlog_errno(status); 1199 mlog_errno(status);
1087 goto bail; 1200 goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1176 brelse(bh); 1289 brelse(bh);
1177 bh = NULL; 1290 bh = NULL;
1178 1291
1179 status = ocfs2_read_block(inode, blkno, &bh); 1292 status = ocfs2_read_extent_block(inode, blkno, &bh);
1180 if (status < 0) { 1293 if (status < 0) {
1181 mlog_errno(status); 1294 mlog_errno(status);
1182 goto bail; 1295 goto bail;
1183 } 1296 }
1184 1297
1185 eb = (struct ocfs2_extent_block *) bh->b_data; 1298 eb = (struct ocfs2_extent_block *) bh->b_data;
1186 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1187 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1188 status = -EIO;
1189 goto bail;
1190 }
1191 el = &eb->h_list; 1299 el = &eb->h_list;
1192 1300
1193 if (le16_to_cpu(el->l_next_free_rec) < 1301 if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
1540 1648
1541 brelse(bh); 1649 brelse(bh);
1542 bh = NULL; 1650 bh = NULL;
1543 ret = ocfs2_read_block(inode, blkno, &bh); 1651 ret = ocfs2_read_extent_block(inode, blkno, &bh);
1544 if (ret) { 1652 if (ret) {
1545 mlog_errno(ret); 1653 mlog_errno(ret);
1546 goto out; 1654 goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
1548 1656
1549 eb = (struct ocfs2_extent_block *) bh->b_data; 1657 eb = (struct ocfs2_extent_block *) bh->b_data;
1550 el = &eb->h_list; 1658 el = &eb->h_list;
1551 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1552 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1553 ret = -EIO;
1554 goto out;
1555 }
1556 1659
1557 if (le16_to_cpu(el->l_next_free_rec) > 1660 if (le16_to_cpu(el->l_next_free_rec) >
1558 le16_to_cpu(el->l_count)) { 1661 le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
1860 root_bh = left_path->p_node[subtree_index].bh; 1963 root_bh = left_path->p_node[subtree_index].bh;
1861 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1964 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1862 1965
1863 ret = ocfs2_journal_access(handle, inode, root_bh, 1966 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
1864 OCFS2_JOURNAL_ACCESS_WRITE); 1967 subtree_index);
1865 if (ret) { 1968 if (ret) {
1866 mlog_errno(ret); 1969 mlog_errno(ret);
1867 goto out; 1970 goto out;
1868 } 1971 }
1869 1972
1870 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1973 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1871 ret = ocfs2_journal_access(handle, inode, 1974 ret = ocfs2_path_bh_journal_access(handle, inode,
1872 right_path->p_node[i].bh, 1975 right_path, i);
1873 OCFS2_JOURNAL_ACCESS_WRITE);
1874 if (ret) { 1976 if (ret) {
1875 mlog_errno(ret); 1977 mlog_errno(ret);
1876 goto out; 1978 goto out;
1877 } 1979 }
1878 1980
1879 ret = ocfs2_journal_access(handle, inode, 1981 ret = ocfs2_path_bh_journal_access(handle, inode,
1880 left_path->p_node[i].bh, 1982 left_path, i);
1881 OCFS2_JOURNAL_ACCESS_WRITE);
1882 if (ret) { 1983 if (ret) {
1883 mlog_errno(ret); 1984 mlog_errno(ret);
1884 goto out; 1985 goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2102 2203
2103 *ret_left_path = NULL; 2204 *ret_left_path = NULL;
2104 2205
2105 left_path = ocfs2_new_path(path_root_bh(right_path), 2206 left_path = ocfs2_new_path_from_path(right_path);
2106 path_root_el(right_path));
2107 if (!left_path) { 2207 if (!left_path) {
2108 ret = -ENOMEM; 2208 ret = -ENOMEM;
2109 mlog_errno(ret); 2209 mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2398 return -EAGAIN; 2498 return -EAGAIN;
2399 2499
2400 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2500 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2401 ret = ocfs2_journal_access(handle, inode, 2501 ret = ocfs2_journal_access_eb(handle, inode,
2402 path_leaf_bh(right_path), 2502 path_leaf_bh(right_path),
2403 OCFS2_JOURNAL_ACCESS_WRITE); 2503 OCFS2_JOURNAL_ACCESS_WRITE);
2404 if (ret) { 2504 if (ret) {
2405 mlog_errno(ret); 2505 mlog_errno(ret);
2406 goto out; 2506 goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2417 * We have to update i_last_eb_blk during the meta 2517 * We have to update i_last_eb_blk during the meta
2418 * data delete. 2518 * data delete.
2419 */ 2519 */
2420 ret = ocfs2_journal_access(handle, inode, et_root_bh, 2520 ret = ocfs2_et_root_journal_access(handle, inode, et,
2421 OCFS2_JOURNAL_ACCESS_WRITE); 2521 OCFS2_JOURNAL_ACCESS_WRITE);
2422 if (ret) { 2522 if (ret) {
2423 mlog_errno(ret); 2523 mlog_errno(ret);
2424 goto out; 2524 goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2433 */ 2533 */
2434 BUG_ON(right_has_empty && !del_right_subtree); 2534 BUG_ON(right_has_empty && !del_right_subtree);
2435 2535
2436 ret = ocfs2_journal_access(handle, inode, root_bh, 2536 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
2437 OCFS2_JOURNAL_ACCESS_WRITE); 2537 subtree_index);
2438 if (ret) { 2538 if (ret) {
2439 mlog_errno(ret); 2539 mlog_errno(ret);
2440 goto out; 2540 goto out;
2441 } 2541 }
2442 2542
2443 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2543 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2444 ret = ocfs2_journal_access(handle, inode, 2544 ret = ocfs2_path_bh_journal_access(handle, inode,
2445 right_path->p_node[i].bh, 2545 right_path, i);
2446 OCFS2_JOURNAL_ACCESS_WRITE);
2447 if (ret) { 2546 if (ret) {
2448 mlog_errno(ret); 2547 mlog_errno(ret);
2449 goto out; 2548 goto out;
2450 } 2549 }
2451 2550
2452 ret = ocfs2_journal_access(handle, inode, 2551 ret = ocfs2_path_bh_journal_access(handle, inode,
2453 left_path->p_node[i].bh, 2552 left_path, i);
2454 OCFS2_JOURNAL_ACCESS_WRITE);
2455 if (ret) { 2553 if (ret) {
2456 mlog_errno(ret); 2554 mlog_errno(ret);
2457 goto out; 2555 goto out;
@@ -2596,16 +2694,17 @@ out:
2596 2694
2597static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2695static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2598 handle_t *handle, 2696 handle_t *handle,
2599 struct buffer_head *bh, 2697 struct ocfs2_path *path)
2600 struct ocfs2_extent_list *el)
2601{ 2698{
2602 int ret; 2699 int ret;
2700 struct buffer_head *bh = path_leaf_bh(path);
2701 struct ocfs2_extent_list *el = path_leaf_el(path);
2603 2702
2604 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2703 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2605 return 0; 2704 return 0;
2606 2705
2607 ret = ocfs2_journal_access(handle, inode, bh, 2706 ret = ocfs2_path_bh_journal_access(handle, inode, path,
2608 OCFS2_JOURNAL_ACCESS_WRITE); 2707 path_num_items(path) - 1);
2609 if (ret) { 2708 if (ret) {
2610 mlog_errno(ret); 2709 mlog_errno(ret);
2611 goto out; 2710 goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2644 goto out; 2743 goto out;
2645 } 2744 }
2646 2745
2647 left_path = ocfs2_new_path(path_root_bh(path), 2746 left_path = ocfs2_new_path_from_path(path);
2648 path_root_el(path));
2649 if (!left_path) { 2747 if (!left_path) {
2650 ret = -ENOMEM; 2748 ret = -ENOMEM;
2651 mlog_errno(ret); 2749 mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2654 2752
2655 ocfs2_cp_path(left_path, path); 2753 ocfs2_cp_path(left_path, path);
2656 2754
2657 right_path = ocfs2_new_path(path_root_bh(path), 2755 right_path = ocfs2_new_path_from_path(path);
2658 path_root_el(path));
2659 if (!right_path) { 2756 if (!right_path) {
2660 ret = -ENOMEM; 2757 ret = -ENOMEM;
2661 mlog_errno(ret); 2758 mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2689 * Caller might still want to make changes to the 2786 * Caller might still want to make changes to the
2690 * tree root, so re-add it to the journal here. 2787 * tree root, so re-add it to the journal here.
2691 */ 2788 */
2692 ret = ocfs2_journal_access(handle, inode, 2789 ret = ocfs2_path_bh_journal_access(handle, inode,
2693 path_root_bh(left_path), 2790 left_path, 0);
2694 OCFS2_JOURNAL_ACCESS_WRITE);
2695 if (ret) { 2791 if (ret) {
2696 mlog_errno(ret); 2792 mlog_errno(ret);
2697 goto out; 2793 goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2785 * We have a path to the left of this one - it needs 2881 * We have a path to the left of this one - it needs
2786 * an update too. 2882 * an update too.
2787 */ 2883 */
2788 left_path = ocfs2_new_path(path_root_bh(path), 2884 left_path = ocfs2_new_path_from_path(path);
2789 path_root_el(path));
2790 if (!left_path) { 2885 if (!left_path) {
2791 ret = -ENOMEM; 2886 ret = -ENOMEM;
2792 mlog_errno(ret); 2887 mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
2875 * it up front. 2970 * it up front.
2876 */ 2971 */
2877 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2972 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2878 path_leaf_bh(path), 2973 path);
2879 path_leaf_el(path));
2880 if (ret) 2974 if (ret)
2881 mlog_errno(ret); 2975 mlog_errno(ret);
2882 goto out; 2976 goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
3027 /* This function shouldn't be called for the rightmost leaf. */ 3121 /* This function shouldn't be called for the rightmost leaf. */
3028 BUG_ON(right_cpos == 0); 3122 BUG_ON(right_cpos == 0);
3029 3123
3030 right_path = ocfs2_new_path(path_root_bh(left_path), 3124 right_path = ocfs2_new_path_from_path(left_path);
3031 path_root_el(left_path));
3032 if (!right_path) { 3125 if (!right_path) {
3033 ret = -ENOMEM; 3126 ret = -ENOMEM;
3034 mlog_errno(ret); 3127 mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3111 root_bh = left_path->p_node[subtree_index].bh; 3204 root_bh = left_path->p_node[subtree_index].bh;
3112 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3205 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3113 3206
3114 ret = ocfs2_journal_access(handle, inode, root_bh, 3207 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3115 OCFS2_JOURNAL_ACCESS_WRITE); 3208 subtree_index);
3116 if (ret) { 3209 if (ret) {
3117 mlog_errno(ret); 3210 mlog_errno(ret);
3118 goto out; 3211 goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3120 3213
3121 for (i = subtree_index + 1; 3214 for (i = subtree_index + 1;
3122 i < path_num_items(right_path); i++) { 3215 i < path_num_items(right_path); i++) {
3123 ret = ocfs2_journal_access(handle, inode, 3216 ret = ocfs2_path_bh_journal_access(handle, inode,
3124 right_path->p_node[i].bh, 3217 right_path, i);
3125 OCFS2_JOURNAL_ACCESS_WRITE);
3126 if (ret) { 3218 if (ret) {
3127 mlog_errno(ret); 3219 mlog_errno(ret);
3128 goto out; 3220 goto out;
3129 } 3221 }
3130 3222
3131 ret = ocfs2_journal_access(handle, inode, 3223 ret = ocfs2_path_bh_journal_access(handle, inode,
3132 left_path->p_node[i].bh, 3224 left_path, i);
3133 OCFS2_JOURNAL_ACCESS_WRITE);
3134 if (ret) { 3225 if (ret) {
3135 mlog_errno(ret); 3226 mlog_errno(ret);
3136 goto out; 3227 goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3142 right_rec = &el->l_recs[index + 1]; 3233 right_rec = &el->l_recs[index + 1];
3143 } 3234 }
3144 3235
3145 ret = ocfs2_journal_access(handle, inode, bh, 3236 ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
3146 OCFS2_JOURNAL_ACCESS_WRITE); 3237 path_num_items(left_path) - 1);
3147 if (ret) { 3238 if (ret) {
3148 mlog_errno(ret); 3239 mlog_errno(ret);
3149 goto out; 3240 goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
3199 /* This function shouldn't be called for the leftmost leaf. */ 3290 /* This function shouldn't be called for the leftmost leaf. */
3200 BUG_ON(left_cpos == 0); 3291 BUG_ON(left_cpos == 0);
3201 3292
3202 left_path = ocfs2_new_path(path_root_bh(right_path), 3293 left_path = ocfs2_new_path_from_path(right_path);
3203 path_root_el(right_path));
3204 if (!left_path) { 3294 if (!left_path) {
3205 ret = -ENOMEM; 3295 ret = -ENOMEM;
3206 mlog_errno(ret); 3296 mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3283 root_bh = left_path->p_node[subtree_index].bh; 3373 root_bh = left_path->p_node[subtree_index].bh;
3284 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3374 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3285 3375
3286 ret = ocfs2_journal_access(handle, inode, root_bh, 3376 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3287 OCFS2_JOURNAL_ACCESS_WRITE); 3377 subtree_index);
3288 if (ret) { 3378 if (ret) {
3289 mlog_errno(ret); 3379 mlog_errno(ret);
3290 goto out; 3380 goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3292 3382
3293 for (i = subtree_index + 1; 3383 for (i = subtree_index + 1;
3294 i < path_num_items(right_path); i++) { 3384 i < path_num_items(right_path); i++) {
3295 ret = ocfs2_journal_access(handle, inode, 3385 ret = ocfs2_path_bh_journal_access(handle, inode,
3296 right_path->p_node[i].bh, 3386 right_path, i);
3297 OCFS2_JOURNAL_ACCESS_WRITE);
3298 if (ret) { 3387 if (ret) {
3299 mlog_errno(ret); 3388 mlog_errno(ret);
3300 goto out; 3389 goto out;
3301 } 3390 }
3302 3391
3303 ret = ocfs2_journal_access(handle, inode, 3392 ret = ocfs2_path_bh_journal_access(handle, inode,
3304 left_path->p_node[i].bh, 3393 left_path, i);
3305 OCFS2_JOURNAL_ACCESS_WRITE);
3306 if (ret) { 3394 if (ret) {
3307 mlog_errno(ret); 3395 mlog_errno(ret);
3308 goto out; 3396 goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3314 has_empty_extent = 1; 3402 has_empty_extent = 1;
3315 } 3403 }
3316 3404
3317 ret = ocfs2_journal_access(handle, inode, bh, 3405 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3318 OCFS2_JOURNAL_ACCESS_WRITE); 3406 path_num_items(right_path) - 1);
3319 if (ret) { 3407 if (ret) {
3320 mlog_errno(ret); 3408 mlog_errno(ret);
3321 goto out; 3409 goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3732 * leftmost leaf. 3820 * leftmost leaf.
3733 */ 3821 */
3734 if (left_cpos) { 3822 if (left_cpos) {
3735 left_path = ocfs2_new_path(path_root_bh(right_path), 3823 left_path = ocfs2_new_path_from_path(right_path);
3736 path_root_el(right_path));
3737 if (!left_path) { 3824 if (!left_path) {
3738 ret = -ENOMEM; 3825 ret = -ENOMEM;
3739 mlog_errno(ret); 3826 mlog_errno(ret);
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
3781 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 3868 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3782 struct ocfs2_extent_rec *rec, *tmprec; 3869 struct ocfs2_extent_rec *rec, *tmprec;
3783 3870
3784 right_el = path_leaf_el(right_path);; 3871 right_el = path_leaf_el(right_path);
3785 if (left_path) 3872 if (left_path)
3786 left_el = path_leaf_el(left_path); 3873 left_el = path_leaf_el(left_path);
3787 3874
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3958 4045
3959 el = et->et_root_el; 4046 el = et->et_root_el;
3960 4047
3961 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4048 ret = ocfs2_et_root_journal_access(handle, inode, et,
3962 OCFS2_JOURNAL_ACCESS_WRITE); 4049 OCFS2_JOURNAL_ACCESS_WRITE);
3963 if (ret) { 4050 if (ret) {
3964 mlog_errno(ret); 4051 mlog_errno(ret);
3965 goto out; 4052 goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3970 goto out_update_clusters; 4057 goto out_update_clusters;
3971 } 4058 }
3972 4059
3973 right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4060 right_path = ocfs2_new_path_from_et(et);
3974 if (!right_path) { 4061 if (!right_path) {
3975 ret = -ENOMEM; 4062 ret = -ENOMEM;
3976 mlog_errno(ret); 4063 mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4020 * ocfs2_rotate_tree_right() might have extended the 4107 * ocfs2_rotate_tree_right() might have extended the
4021 * transaction without re-journaling our tree root. 4108 * transaction without re-journaling our tree root.
4022 */ 4109 */
4023 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4110 ret = ocfs2_et_root_journal_access(handle, inode, et,
4024 OCFS2_JOURNAL_ACCESS_WRITE); 4111 OCFS2_JOURNAL_ACCESS_WRITE);
4025 if (ret) { 4112 if (ret) {
4026 mlog_errno(ret); 4113 mlog_errno(ret);
4027 goto out; 4114 goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4082 goto out; 4169 goto out;
4083 4170
4084 if (left_cpos != 0) { 4171 if (left_cpos != 0) {
4085 left_path = ocfs2_new_path(path_root_bh(path), 4172 left_path = ocfs2_new_path_from_path(path);
4086 path_root_el(path));
4087 if (!left_path) 4173 if (!left_path)
4088 goto out; 4174 goto out;
4089 4175
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4097 le16_to_cpu(new_el->l_count)) { 4183 le16_to_cpu(new_el->l_count)) {
4098 bh = path_leaf_bh(left_path); 4184 bh = path_leaf_bh(left_path);
4099 eb = (struct ocfs2_extent_block *)bh->b_data; 4185 eb = (struct ocfs2_extent_block *)bh->b_data;
4100 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4186 ocfs2_error(inode->i_sb,
4101 eb); 4187 "Extent block #%llu has an "
4188 "invalid l_next_free_rec of "
4189 "%d. It should have "
4190 "matched the l_count of %d",
4191 (unsigned long long)le64_to_cpu(eb->h_blkno),
4192 le16_to_cpu(new_el->l_next_free_rec),
4193 le16_to_cpu(new_el->l_count));
4194 status = -EINVAL;
4102 goto out; 4195 goto out;
4103 } 4196 }
4104 rec = &new_el->l_recs[ 4197 rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4132 if (right_cpos == 0) 4225 if (right_cpos == 0)
4133 goto out; 4226 goto out;
4134 4227
4135 right_path = ocfs2_new_path(path_root_bh(path), 4228 right_path = ocfs2_new_path_from_path(path);
4136 path_root_el(path));
4137 if (!right_path) 4229 if (!right_path)
4138 goto out; 4230 goto out;
4139 4231
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4147 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4239 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4148 bh = path_leaf_bh(right_path); 4240 bh = path_leaf_bh(right_path);
4149 eb = (struct ocfs2_extent_block *)bh->b_data; 4241 eb = (struct ocfs2_extent_block *)bh->b_data;
4150 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4242 ocfs2_error(inode->i_sb,
4151 eb); 4243 "Extent block #%llu has an "
4244 "invalid l_next_free_rec of %d",
4245 (unsigned long long)le64_to_cpu(eb->h_blkno),
4246 le16_to_cpu(new_el->l_next_free_rec));
4247 status = -EINVAL;
4152 goto out; 4248 goto out;
4153 } 4249 }
4154 rec = &new_el->l_recs[1]; 4250 rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4294 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4390 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4295 * may want it later. 4391 * may want it later.
4296 */ 4392 */
4297 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); 4393 ret = ocfs2_read_extent_block(inode,
4394 ocfs2_et_get_last_eb_blk(et),
4395 &bh);
4298 if (ret) { 4396 if (ret) {
4299 mlog_exit(ret); 4397 mlog_exit(ret);
4300 goto out; 4398 goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4320 return 0; 4418 return 0;
4321 } 4419 }
4322 4420
4323 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4421 path = ocfs2_new_path_from_et(et);
4324 if (!path) { 4422 if (!path) {
4325 ret = -ENOMEM; 4423 ret = -ENOMEM;
4326 mlog_errno(ret); 4424 mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4531 4629
4532 BUG_ON(num_bits > clusters_to_add); 4630 BUG_ON(num_bits > clusters_to_add);
4533 4631
4534 /* reserve our write early -- insert_extent may update the inode */ 4632 /* reserve our write early -- insert_extent may update the tree root */
4535 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 4633 status = ocfs2_et_root_journal_access(handle, inode, et,
4536 OCFS2_JOURNAL_ACCESS_WRITE); 4634 OCFS2_JOURNAL_ACCESS_WRITE);
4537 if (status < 0) { 4635 if (status < 0) {
4538 mlog_errno(status); 4636 mlog_errno(status);
4539 goto leave; 4637 goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4760 if (path->p_tree_depth) { 4858 if (path->p_tree_depth) {
4761 struct ocfs2_extent_block *eb; 4859 struct ocfs2_extent_block *eb;
4762 4860
4763 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 4861 ret = ocfs2_read_extent_block(inode,
4764 &last_eb_bh); 4862 ocfs2_et_get_last_eb_blk(et),
4863 &last_eb_bh);
4765 if (ret) { 4864 if (ret) {
4766 mlog_exit(ret); 4865 mlog_exit(ret);
4767 goto out; 4866 goto out;
4768 } 4867 }
4769 4868
4770 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4869 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4771 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4772 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4773 ret = -EROFS;
4774 goto out;
4775 }
4776
4777 rightmost_el = &eb->h_list; 4870 rightmost_el = &eb->h_list;
4778 } else 4871 } else
4779 rightmost_el = path_root_el(path); 4872 rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
4854 if (et->et_ops == &ocfs2_dinode_et_ops) 4947 if (et->et_ops == &ocfs2_dinode_et_ops)
4855 ocfs2_extent_map_trunc(inode, 0); 4948 ocfs2_extent_map_trunc(inode, 0);
4856 4949
4857 left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4950 left_path = ocfs2_new_path_from_et(et);
4858 if (!left_path) { 4951 if (!left_path) {
4859 ret = -ENOMEM; 4952 ret = -ENOMEM;
4860 mlog_errno(ret); 4953 mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4918 5011
4919 depth = path->p_tree_depth; 5012 depth = path->p_tree_depth;
4920 if (depth > 0) { 5013 if (depth > 0) {
4921 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 5014 ret = ocfs2_read_extent_block(inode,
4922 &last_eb_bh); 5015 ocfs2_et_get_last_eb_blk(et),
5016 &last_eb_bh);
4923 if (ret < 0) { 5017 if (ret < 0) {
4924 mlog_errno(ret); 5018 mlog_errno(ret);
4925 goto out; 5019 goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5025 } 5119 }
5026 5120
5027 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { 5121 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5028 left_path = ocfs2_new_path(path_root_bh(path), 5122 left_path = ocfs2_new_path_from_path(path);
5029 path_root_el(path));
5030 if (!left_path) { 5123 if (!left_path) {
5031 ret = -ENOMEM; 5124 ret = -ENOMEM;
5032 mlog_errno(ret); 5125 mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
5135 5228
5136 ocfs2_extent_map_trunc(inode, 0); 5229 ocfs2_extent_map_trunc(inode, 0);
5137 5230
5138 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 5231 path = ocfs2_new_path_from_et(et);
5139 if (!path) { 5232 if (!path) {
5140 ret = -ENOMEM; 5233 ret = -ENOMEM;
5141 mlog_errno(ret); 5234 mlog_errno(ret);
@@ -5255,6 +5348,78 @@ out:
5255 return ret; 5348 return ret;
5256} 5349}
5257 5350
5351int ocfs2_remove_btree_range(struct inode *inode,
5352 struct ocfs2_extent_tree *et,
5353 u32 cpos, u32 phys_cpos, u32 len,
5354 struct ocfs2_cached_dealloc_ctxt *dealloc)
5355{
5356 int ret;
5357 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5358 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5359 struct inode *tl_inode = osb->osb_tl_inode;
5360 handle_t *handle;
5361 struct ocfs2_alloc_context *meta_ac = NULL;
5362
5363 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5364 if (ret) {
5365 mlog_errno(ret);
5366 return ret;
5367 }
5368
5369 mutex_lock(&tl_inode->i_mutex);
5370
5371 if (ocfs2_truncate_log_needs_flush(osb)) {
5372 ret = __ocfs2_flush_truncate_log(osb);
5373 if (ret < 0) {
5374 mlog_errno(ret);
5375 goto out;
5376 }
5377 }
5378
5379 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5380 if (IS_ERR(handle)) {
5381 ret = PTR_ERR(handle);
5382 mlog_errno(ret);
5383 goto out;
5384 }
5385
5386 ret = ocfs2_et_root_journal_access(handle, inode, et,
5387 OCFS2_JOURNAL_ACCESS_WRITE);
5388 if (ret) {
5389 mlog_errno(ret);
5390 goto out;
5391 }
5392
5393 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5394 dealloc);
5395 if (ret) {
5396 mlog_errno(ret);
5397 goto out_commit;
5398 }
5399
5400 ocfs2_et_update_clusters(inode, et, -len);
5401
5402 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5403 if (ret) {
5404 mlog_errno(ret);
5405 goto out_commit;
5406 }
5407
5408 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5409 if (ret)
5410 mlog_errno(ret);
5411
5412out_commit:
5413 ocfs2_commit_trans(osb, handle);
5414out:
5415 mutex_unlock(&tl_inode->i_mutex);
5416
5417 if (meta_ac)
5418 ocfs2_free_alloc_context(meta_ac);
5419
5420 return ret;
5421}
5422
5258int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 5423int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5259{ 5424{
5260 struct buffer_head *tl_bh = osb->osb_tl_bh; 5425 struct buffer_head *tl_bh = osb->osb_tl_bh;
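
A hedged sketch of a caller punching one extent out of a dinode-rooted tree with the new helper (example_punch is hypothetical; every function it calls appears in this patch):

static int example_punch(struct inode *inode, struct buffer_head *di_bh,
                         u32 cpos, u32 phys_cpos, u32 len)
{
        int ret, ret2;
        struct ocfs2_extent_tree et;
        struct ocfs2_cached_dealloc_ctxt dealloc;

        ocfs2_init_dealloc_ctxt(&dealloc);
        ocfs2_init_dinode_extent_tree(&et, inode, di_bh);

        /* Handles allocator locking, truncate-log flushing, the
         * transaction, and the truncate-log append internally. */
        ret = ocfs2_remove_btree_range(inode, &et, cpos, phys_cpos, len,
                                       &dealloc);
        if (ret)
                mlog_errno(ret);

        /* Frees whatever the remove queued. */
        ret2 = ocfs2_run_deallocs(OCFS2_SB(inode->i_sb), &dealloc);
        if (ret2)
                mlog_errno(ret2);

        return ret ? ret : ret2;
}
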
@@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5308 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5473 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5309 5474
5310 di = (struct ocfs2_dinode *) tl_bh->b_data; 5475 di = (struct ocfs2_dinode *) tl_bh->b_data;
5311 tl = &di->id2.i_dealloc;
5312 if (!OCFS2_IS_VALID_DINODE(di)) {
5313 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5314 status = -EIO;
5315 goto bail;
5316 }
5317 5476
5477 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
5478 * by the underlying call to ocfs2_read_inode_block(), so any
5479 * corruption is a code bug */
5480 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5481
5482 tl = &di->id2.i_dealloc;
5318 tl_count = le16_to_cpu(tl->tl_count); 5483 tl_count = le16_to_cpu(tl->tl_count);
5319 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 5484 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5320 tl_count == 0, 5485 tl_count == 0,
@@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5332 goto bail; 5497 goto bail;
5333 } 5498 }
5334 5499
5335 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5500 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5336 OCFS2_JOURNAL_ACCESS_WRITE); 5501 OCFS2_JOURNAL_ACCESS_WRITE);
5337 if (status < 0) { 5502 if (status < 0) {
5338 mlog_errno(status); 5503 mlog_errno(status);
5339 goto bail; 5504 goto bail;
@@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5394 while (i >= 0) { 5559 while (i >= 0) {
5395 /* Caller has given us at least enough credits to 5560 /* Caller has given us at least enough credits to
5396 * update the truncate log dinode */ 5561 * update the truncate log dinode */
5397 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5562 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5398 OCFS2_JOURNAL_ACCESS_WRITE); 5563 OCFS2_JOURNAL_ACCESS_WRITE);
5399 if (status < 0) { 5564 if (status < 0) {
5400 mlog_errno(status); 5565 mlog_errno(status);
5401 goto bail; 5566 goto bail;
@@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5464 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5629 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5465 5630
5466 di = (struct ocfs2_dinode *) tl_bh->b_data; 5631 di = (struct ocfs2_dinode *) tl_bh->b_data;
5467 tl = &di->id2.i_dealloc;
5468 if (!OCFS2_IS_VALID_DINODE(di)) {
5469 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5470 status = -EIO;
5471 goto out;
5472 }
5473 5632
5633 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
5634 * by the underlying call to ocfs2_read_inode_block(), so any
5635 * corruption is a code bug */
5636 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5637
5638 tl = &di->id2.i_dealloc;
5474 num_to_flush = le16_to_cpu(tl->tl_used); 5639 num_to_flush = le16_to_cpu(tl->tl_used);
5475 mlog(0, "Flush %u records from truncate log #%llu\n", 5640 mlog(0, "Flush %u records from truncate log #%llu\n",
5476 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5641 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5586 goto bail; 5751 goto bail;
5587 } 5752 }
5588 5753
5589 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 5754 status = ocfs2_read_inode_block(inode, &bh);
5590 if (status < 0) { 5755 if (status < 0) {
5591 iput(inode); 5756 iput(inode);
5592 mlog_errno(status); 5757 mlog_errno(status);
@@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5625 } 5790 }
5626 5791
5627 di = (struct ocfs2_dinode *) tl_bh->b_data; 5792 di = (struct ocfs2_dinode *) tl_bh->b_data;
5628 tl = &di->id2.i_dealloc;
5629 if (!OCFS2_IS_VALID_DINODE(di)) {
5630 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5631 status = -EIO;
5632 goto bail;
5633 }
5634 5793
5794 /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's
5795 * validated by the underlying call to ocfs2_read_inode_block(),
5796 * so any corruption is a code bug */
5797 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5798
5799 tl = &di->id2.i_dealloc;
5635 if (le16_to_cpu(tl->tl_used)) { 5800 if (le16_to_cpu(tl->tl_used)) {
5636 mlog(0, "We'll have %u logs to recover\n", 5801 mlog(0, "We'll have %u logs to recover\n",
5637 le16_to_cpu(tl->tl_used)); 5802 le16_to_cpu(tl->tl_used));
@@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5651 * tl_used. */ 5816 * tl_used. */
5652 tl->tl_used = 0; 5817 tl->tl_used = 0;
5653 5818
5819 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
5654 status = ocfs2_write_block(osb, tl_bh, tl_inode); 5820 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5655 if (status < 0) { 5821 if (status < 0) {
5656 mlog_errno(status); 5822 mlog_errno(status);
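
The one-line addition above is the price of bypassing the journal: ocfs2_write_block() goes straight to disk, so the block's ECC must be refreshed by hand first. A minimal sketch of the pattern (hypothetical caller; both calls are quoted from this hunk):

static int example_raw_dinode_write(struct ocfs2_super *osb,
                                    struct inode *inode,
                                    struct buffer_head *bh)
{
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

        /* Journaled writes recompute this in the commit path;
         * raw writes must do it themselves. */
        ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
        return ocfs2_write_block(osb, bh, inode);
}
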
@@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5800 */ 5966 */
5801 5967
5802/* 5968/*
5803 * Describes a single block free from a suballocator 5969 * Describes a single bit freed from a suballocator. For the block
5970 * suballocators, it represents one block. For the global cluster
5971 * allocator, it represents a run of clusters, and free_bit holds
5972 * the number of clusters in the run.
5804 */ 5973 */
5805struct ocfs2_cached_block_free { 5974struct ocfs2_cached_block_free {
5806 struct ocfs2_cached_block_free *free_next; 5975 struct ocfs2_cached_block_free *free_next;
@@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list {
5815 struct ocfs2_cached_block_free *f_first; 5984 struct ocfs2_cached_block_free *f_first;
5816}; 5985};
5817 5986
5818static int ocfs2_free_cached_items(struct ocfs2_super *osb, 5987static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5819 int sysfile_type, 5988 int sysfile_type,
5820 int slot, 5989 int slot,
5821 struct ocfs2_cached_block_free *head) 5990 struct ocfs2_cached_block_free *head)
5822{ 5991{
5823 int ret; 5992 int ret;
5824 u64 bg_blkno; 5993 u64 bg_blkno;
@@ -5893,6 +6062,82 @@ out:
5893 return ret; 6062 return ret;
5894} 6063}
5895 6064
6065int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6066 u64 blkno, unsigned int bit)
6067{
6068 int ret = 0;
6069 struct ocfs2_cached_block_free *item;
6070
6071 item = kmalloc(sizeof(*item), GFP_NOFS);
6072 if (item == NULL) {
6073 ret = -ENOMEM;
6074 mlog_errno(ret);
6075 return ret;
6076 }
6077
6078 mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6079 bit, (unsigned long long)blkno);
6080
6081 item->free_blk = blkno;
6082 item->free_bit = bit;
6083 item->free_next = ctxt->c_global_allocator;
6084
6085 ctxt->c_global_allocator = item;
6086 return ret;
6087}
6088
6089static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6090 struct ocfs2_cached_block_free *head)
6091{
6092 struct ocfs2_cached_block_free *tmp;
6093 struct inode *tl_inode = osb->osb_tl_inode;
6094 handle_t *handle;
6095 int ret = 0;
6096
6097 mutex_lock(&tl_inode->i_mutex);
6098
6099 while (head) {
6100 if (ocfs2_truncate_log_needs_flush(osb)) {
6101 ret = __ocfs2_flush_truncate_log(osb);
6102 if (ret < 0) {
6103 mlog_errno(ret);
6104 break;
6105 }
6106 }
6107
6108 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6109 if (IS_ERR(handle)) {
6110 ret = PTR_ERR(handle);
6111 mlog_errno(ret);
6112 break;
6113 }
6114
6115 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6116 head->free_bit);
6117
6118 ocfs2_commit_trans(osb, handle);
6119 tmp = head;
6120 head = head->free_next;
6121 kfree(tmp);
6122
6123 if (ret < 0) {
6124 mlog_errno(ret);
6125 break;
6126 }
6127 }
6128
6129 mutex_unlock(&tl_inode->i_mutex);
6130
6131 while (head) {
6132 /* Premature exit may have left some dangling items. */
6133 tmp = head;
6134 head = head->free_next;
6135 kfree(tmp);
6136 }
6137
6138 return ret;
6139}
6140
5896int ocfs2_run_deallocs(struct ocfs2_super *osb, 6141int ocfs2_run_deallocs(struct ocfs2_super *osb,
5897 struct ocfs2_cached_dealloc_ctxt *ctxt) 6142 struct ocfs2_cached_dealloc_ctxt *ctxt)
5898{ 6143{
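
A hedged sketch of the intended flow: queue cluster runs while a transaction is open, then drain them afterwards (example_defer_clusters is hypothetical; both helpers are added by this patch):

static int example_defer_clusters(struct ocfs2_super *osb,
                                  struct ocfs2_cached_dealloc_ctxt *ctxt,
                                  u64 start_blk, unsigned int num_clusters)
{
        int ret;

        /* Cheap and safe inside a transaction: just links an item
         * onto ctxt->c_global_allocator. */
        ret = ocfs2_cache_cluster_dealloc(ctxt, start_blk, num_clusters);
        if (ret) {
                mlog_errno(ret);
                return ret;
        }

        /* Later, with no transaction held, the queued runs are sent
         * through the truncate log. */
        return ocfs2_run_deallocs(osb, ctxt);
}
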
@@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
5908 if (fl->f_first) { 6153 if (fl->f_first) {
5909 mlog(0, "Free items: (type %u, slot %d)\n", 6154 mlog(0, "Free items: (type %u, slot %d)\n",
5910 fl->f_inode_type, fl->f_slot); 6155 fl->f_inode_type, fl->f_slot);
5911 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, 6156 ret2 = ocfs2_free_cached_blocks(osb,
5912 fl->f_slot, fl->f_first); 6157 fl->f_inode_type,
6158 fl->f_slot,
6159 fl->f_first);
5913 if (ret2) 6160 if (ret2)
5914 mlog_errno(ret2); 6161 mlog_errno(ret2);
5915 if (!ret) 6162 if (!ret)
@@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
5920 kfree(fl); 6167 kfree(fl);
5921 } 6168 }
5922 6169
6170 if (ctxt->c_global_allocator) {
6171 ret2 = ocfs2_free_cached_clusters(osb,
6172 ctxt->c_global_allocator);
6173 if (ret2)
6174 mlog_errno(ret2);
6175 if (!ret)
6176 ret = ret2;
6177
6178 ctxt->c_global_allocator = NULL;
6179 }
6180
5923 return ret; 6181 return ret;
5924} 6182}
5925 6183
@@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6075 6333
6076 eb = (struct ocfs2_extent_block *) bh->b_data; 6334 eb = (struct ocfs2_extent_block *) bh->b_data;
6077 el = &eb->h_list; 6335 el = &eb->h_list;
6078 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 6336
6079 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 6337 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6080 ret = -EROFS; 6338 * Any corruption is a code bug. */
6081 goto out; 6339 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6082 }
6083 6340
6084 *new_last_eb = bh; 6341 *new_last_eb = bh;
6085 get_bh(*new_last_eb); 6342 get_bh(*new_last_eb);
@@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6326 } 6583 }
6327 6584
6328 if (last_eb_bh) { 6585 if (last_eb_bh) {
6329 status = ocfs2_journal_access(handle, inode, last_eb_bh, 6586 status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
6330 OCFS2_JOURNAL_ACCESS_WRITE); 6587 OCFS2_JOURNAL_ACCESS_WRITE);
6331 if (status < 0) { 6588 if (status < 0) {
6332 mlog_errno(status); 6589 mlog_errno(status);
6333 goto bail; 6590 goto bail;
@@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6350 goto bail; 6607 goto bail;
6351 } 6608 }
6352 6609
6610 vfs_dq_free_space_nodirty(inode,
6611 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6353 spin_lock(&OCFS2_I(inode)->ip_lock); 6612 spin_lock(&OCFS2_I(inode)->ip_lock);
6354 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6613 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6355 clusters_to_del; 6614 clusters_to_del;
@@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6436 mlog_errno(ret); 6695 mlog_errno(ret);
6437 else if (ocfs2_should_order_data(inode)) { 6696 else if (ocfs2_should_order_data(inode)) {
6438 ret = ocfs2_jbd2_file_inode(handle, inode); 6697 ret = ocfs2_jbd2_file_inode(handle, inode);
6439#ifdef CONFIG_OCFS2_COMPAT_JBD
6440 ret = walk_page_buffers(handle, page_buffers(page),
6441 from, to, &partial,
6442 ocfs2_journal_dirty_data);
6443#endif
6444 if (ret < 0) 6698 if (ret < 0)
6445 mlog_errno(ret); 6699 mlog_errno(ret);
6446 } 6700 }
@@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6663 struct page **pages = NULL; 6917 struct page **pages = NULL;
6664 loff_t end = osb->s_clustersize; 6918 loff_t end = osb->s_clustersize;
6665 struct ocfs2_extent_tree et; 6919 struct ocfs2_extent_tree et;
6920 int did_quota = 0;
6666 6921
6667 has_data = i_size_read(inode) ? 1 : 0; 6922 has_data = i_size_read(inode) ? 1 : 0;
6668 6923
@@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6682 } 6937 }
6683 } 6938 }
6684 6939
6685 handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); 6940 handle = ocfs2_start_trans(osb,
6941 ocfs2_inline_to_extents_credits(osb->sb));
6686 if (IS_ERR(handle)) { 6942 if (IS_ERR(handle)) {
6687 ret = PTR_ERR(handle); 6943 ret = PTR_ERR(handle);
6688 mlog_errno(ret); 6944 mlog_errno(ret);
6689 goto out_unlock; 6945 goto out_unlock;
6690 } 6946 }
6691 6947
6692 ret = ocfs2_journal_access(handle, inode, di_bh, 6948 ret = ocfs2_journal_access_di(handle, inode, di_bh,
6693 OCFS2_JOURNAL_ACCESS_WRITE); 6949 OCFS2_JOURNAL_ACCESS_WRITE);
6694 if (ret) { 6950 if (ret) {
6695 mlog_errno(ret); 6951 mlog_errno(ret);
6696 goto out_commit; 6952 goto out_commit;
@@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6701 unsigned int page_end; 6957 unsigned int page_end;
6702 u64 phys; 6958 u64 phys;
6703 6959
6960 if (vfs_dq_alloc_space_nodirty(inode,
6961 ocfs2_clusters_to_bytes(osb->sb, 1))) {
6962 ret = -EDQUOT;
6963 goto out_commit;
6964 }
6965 did_quota = 1;
6966
6704 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6967 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6705 &num); 6968 &num);
6706 if (ret) { 6969 if (ret) {
@@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6774 } 7037 }
6775 7038
6776out_commit: 7039out_commit:
7040 if (ret < 0 && did_quota)
7041 vfs_dq_free_space_nodirty(inode,
7042 ocfs2_clusters_to_bytes(osb->sb, 1));
7043
6777 ocfs2_commit_trans(osb, handle); 7044 ocfs2_commit_trans(osb, handle);
6778 7045
6779out_unlock: 7046out_unlock:
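
The two quota hunks above follow one pattern: charge quota before claiming each cluster, remember that a charge was made, and refund it on the error path. A condensed, hypothetical sketch of that pattern (the vfs_dq_* calls and byte conversion are quoted from the hunks):

static int example_quota_charged_alloc(struct inode *inode,
                                       struct ocfs2_super *osb)
{
        int ret, did_quota = 0;

        if (vfs_dq_alloc_space_nodirty(inode,
                                       ocfs2_clusters_to_bytes(osb->sb, 1))) {
                ret = -EDQUOT;
                goto out;
        }
        did_quota = 1;

        ret = 0;        /* ... claim and fill the cluster here ... */

out:
        if (ret < 0 && did_quota)
                vfs_dq_free_space_nodirty(inode,
                                ocfs2_clusters_to_bytes(osb->sb, 1));
        return ret;
}
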
@@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
6813 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7080 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6814 i_size_read(inode)); 7081 i_size_read(inode));
6815 7082
6816 path = ocfs2_new_path(fe_bh, &di->id2.i_list); 7083 path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7084 ocfs2_journal_access_di);
6817 if (!path) { 7085 if (!path) {
6818 status = -ENOMEM; 7086 status = -ENOMEM;
6819 mlog_errno(status); 7087 mlog_errno(status);
@@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6984 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 7252 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6985 7253
6986 if (fe->id2.i_list.l_tree_depth) { 7254 if (fe->id2.i_list.l_tree_depth) {
6987 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), 7255 status = ocfs2_read_extent_block(inode,
6988 &last_eb_bh); 7256 le64_to_cpu(fe->i_last_eb_blk),
7257 &last_eb_bh);
6989 if (status < 0) { 7258 if (status < 0) {
6990 mlog_errno(status); 7259 mlog_errno(status);
6991 goto bail; 7260 goto bail;
6992 } 7261 }
6993 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 7262 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6994 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6995 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6996
6997 brelse(last_eb_bh);
6998 status = -EIO;
6999 goto bail;
7000 }
7001 } 7263 }
7002 7264
7003 (*tc)->tc_last_eb_bh = last_eb_bh; 7265 (*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7052 goto out; 7314 goto out;
7053 } 7315 }
7054 7316
7055 ret = ocfs2_journal_access(handle, inode, di_bh, 7317 ret = ocfs2_journal_access_di(handle, inode, di_bh,
7056 OCFS2_JOURNAL_ACCESS_WRITE); 7318 OCFS2_JOURNAL_ACCESS_WRITE);
7057 if (ret) { 7319 if (ret) {
7058 mlog_errno(ret); 7320 mlog_errno(ret);
7059 goto out_commit; 7321 goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfbe..cceff5c37f47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
45 * 45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a 46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree 47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
48 * functions. 48 * functions. With metadata ECC, we now call a different journal_access
49 * function for each type of metadata, so the tree root must also carry
50 * its root_journal_access function.
49 * ocfs2_extent_tree_operations abstract the normal operations we do for 51 * ocfs2_extent_tree_operations abstract the normal operations we do for
50 * the root of extent b-tree. 52 * the root of extent b-tree.
51 */ 53 */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
54 struct ocfs2_extent_tree_operations *et_ops; 56 struct ocfs2_extent_tree_operations *et_ops;
55 struct buffer_head *et_root_bh; 57 struct buffer_head *et_root_bh;
56 struct ocfs2_extent_list *et_root_el; 58 struct ocfs2_extent_list *et_root_el;
59 ocfs2_journal_access_func et_root_journal_access;
57 void *et_object; 60 void *et_object;
58 unsigned int et_max_leaf_clusters; 61 unsigned int et_max_leaf_clusters;
59}; 62};
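
For reference, the function-pointer type stored in et_root_journal_access; this is assumed to match the typedef in fs/ocfs2/journal.h rather than being quoted from this patch, though it agrees with how ocfs2_et_root_journal_access() invokes it above:

typedef int (*ocfs2_journal_access_func)(handle_t *handle,
                                         struct inode *inode,
                                         struct buffer_head *bh,
                                         int type);
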
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
68void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 71void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode, 72 struct inode *inode,
70 struct buffer_head *bh); 73 struct buffer_head *bh);
74struct ocfs2_xattr_value_buf;
71void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode, 76 struct inode *inode,
73 struct buffer_head *bh, 77 struct ocfs2_xattr_value_buf *vb);
74 struct ocfs2_xattr_value_root *xv); 78
79/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be
81 * allocated. This is a cached read. The extent block will be validated
82 * with ocfs2_validate_extent_block().
83 */
84int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
85 struct buffer_head **bh);
75 86
76struct ocfs2_alloc_context; 87struct ocfs2_alloc_context;
77int ocfs2_insert_extent(struct ocfs2_super *osb, 88int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
110 u32 cpos, u32 len, handle_t *handle, 121 u32 cpos, u32 len, handle_t *handle,
111 struct ocfs2_alloc_context *meta_ac, 122 struct ocfs2_alloc_context *meta_ac,
112 struct ocfs2_cached_dealloc_ctxt *dealloc); 123 struct ocfs2_cached_dealloc_ctxt *dealloc);
124int ocfs2_remove_btree_range(struct inode *inode,
125 struct ocfs2_extent_tree *et,
126 u32 cpos, u32 phys_cpos, u32 len,
127 struct ocfs2_cached_dealloc_ctxt *dealloc);
128
113int ocfs2_num_free_extents(struct ocfs2_super *osb, 129int ocfs2_num_free_extents(struct ocfs2_super *osb,
114 struct inode *inode, 130 struct inode *inode,
115 struct ocfs2_extent_tree *et); 131 struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
  */
 struct ocfs2_cached_dealloc_ctxt {
         struct ocfs2_per_slot_free_list *c_first_suballocator;
+        struct ocfs2_cached_block_free *c_global_allocator;
 };
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 {
         c->c_first_suballocator = NULL;
+        c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+        return c->c_global_allocator != NULL;
 }
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
                        struct ocfs2_cached_dealloc_ctxt *ctxt);
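The new c_global_allocator field lets a single ctxt queue frees against the global bitmap as well as the per-slot suballocators. A minimal sketch of the intended lifecycle, using only the declarations above (the wrapper function itself is invented for illustration, and error handling is pared down):

static int example_remove_and_dealloc(struct ocfs2_super *osb,
                                      struct inode *inode,
                                      struct ocfs2_extent_tree *et,
                                      u32 cpos, u32 phys_cpos, u32 len)
{
        int ret;
        struct ocfs2_cached_dealloc_ctxt dealloc;

        ocfs2_init_dealloc_ctxt(&dealloc);

        /* Queue the freed clusters instead of freeing them inline. */
        ret = ocfs2_remove_btree_range(inode, et, cpos, phys_cpos, len,
                                       &dealloc);
        if (ret)
                return ret;

        /* Run the queued frees once it is safe to take allocator locks. */
        return ocfs2_run_deallocs(osb, &dealloc);
}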
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b33420..a067a6cffb01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
                 goto bail;
         }
 
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+        status = ocfs2_read_inode_block(inode, &bh);
         if (status < 0) {
                 mlog_errno(status);
                 goto bail;
         }
         fe = (struct ocfs2_dinode *) bh->b_data;
 
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-                     fe->i_signature);
-                goto bail;
-        }
-
         if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
                                                     le32_to_cpu(fe->i_clusters))) {
                 mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
         BUG_ON(!PageLocked(page));
         BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 
         if (ocfs2_should_order_data(inode)) {
                 ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                ret = walk_page_buffers(handle,
-                                        page_buffers(page),
-                                        from, to, NULL,
-                                        ocfs2_journal_dirty_data);
-#endif
                 if (ret < 0)
                         mlog_errno(ret);
         }
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
                 tmppage = wc->w_pages[i];
 
                 if (page_has_buffers(tmppage)) {
-                        if (ocfs2_should_order_data(inode)) {
+                        if (ocfs2_should_order_data(inode))
                                 ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                                walk_page_buffers(wc->w_handle,
-                                                  page_buffers(tmppage),
-                                                  from, to, NULL,
-                                                  ocfs2_journal_dirty_data);
-#endif
-                        }
 
                         block_commit_write(tmppage, from, to);
                 }
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
                 goto out;
         }
 
-        ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 ocfs2_commit_trans(osb, handle);
 
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
         wc->w_handle = handle;
 
+        if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
         /*
          * We don't want this to fail in ocfs2_write_end(), so do it
          * here.
          */
-        ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
         }
 
         /*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                                          mmap_page);
         if (ret) {
                 mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
         }
 
         ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
                                           len);
         if (ret) {
                 mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
         }
 
         if (data_ac)
@@ -1790,6 +1776,10 @@ success:
         *pagep = wc->w_target_page;
         *fsdata = wc;
         return 0;
+out_quota:
+        if (clusters_to_alloc)
+                vfs_dq_free_space(inode,
+                                  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
         ocfs2_commit_trans(osb, handle);
 
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                 }
 
                 if (page_has_buffers(tmppage)) {
-                        if (ocfs2_should_order_data(inode)) {
+                        if (ocfs2_should_order_data(inode))
                                 ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                                walk_page_buffers(wc->w_handle,
-                                                  page_buffers(tmppage),
-                                                  from, to, NULL,
-                                                  ocfs2_journal_dirty_data);
-#endif
-                        }
                         block_commit_write(tmppage, from, to);
                 }
         }
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000000..2a947c44e594
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * blockcheck.c
5 *
6 * Checksum and ECC codes for the OCFS2 userspace library.
7 *
8 * Copyright (C) 2006, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/kernel.h>
21#include <linux/types.h>
22#include <linux/crc32.h>
23#include <linux/buffer_head.h>
24#include <linux/bitops.h>
25#include <asm/byteorder.h>
26
27#include <cluster/masklog.h>
28
29#include "ocfs2.h"
30
31#include "blockcheck.h"
32
33
34/*
35 * We use the following conventions:
36 *
37 * d = # data bits
38 * p = # parity bits
39 * c = # total code bits (d + p)
40 */
41
42
43/*
44 * Calculate the bit offset in the hamming code buffer based on the bit's
45 * offset in the data buffer. Since the hamming code reserves all
46 * power-of-two bits for parity, the data bit number and the code bit
47 * number are offset by all the parity bits beforehand.
48 *
49 * Recall that bit numbers in hamming code are 1-based. This function
50 * takes the 0-based data bit from the caller.
51 *
52 * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0),
53 * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit.
54 * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3
55 * in the code buffer.
56 *
57 * The caller can pass in *p if it wants to keep track of the most recent
58 * number of parity bits added. This allows the function to start the
59 * calculation at the last place.
60 */
61static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
62{
63 unsigned int b, p = 0;
64
65 /*
66 * Data bits are 0-based, but we're talking code bits, which
67 * are 1-based.
68 */
69 b = i + 1;
70
71 /* Use the cache if it is there */
72 if (p_cache)
73 p = *p_cache;
74 b += p;
75
76 /*
77 * For every power of two below our bit number, bump our bit.
78 *
79 * We compare with (b + 1) because we have to compare with what b
80 * would be _if_ it were bumped up by the parity bit. Capice?
81 *
82 * p is set above.
83 */
84 for (; (1 << p) < (b + 1); p++)
85 b++;
86
87 if (p_cache)
88 *p_cache = p;
89
90 return b;
91}
92
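/*
 * Illustrative only, under the definitions above: with the parity
 * cache threaded through, data bits 0..7 land on code bits 3, 5, 6,
 * 7, 9, 10, 11 and 12 -- every power-of-two code position (1, 2, 4,
 * 8, ...) is skipped because it is reserved for a parity bit.
 */
static void __maybe_unused calc_code_bit_example(void)
{
	unsigned int i, p = 0;

	for (i = 0; i < 8; i++)
		printk(KERN_DEBUG "data bit %u -> code bit %u\n",
		       i, calc_code_bit(i, &p));
}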
93/*
94 * This is the low level encoder function. It can be called across
95 * multiple hunks just like the crc32 code. 'd' is the number of bits
96 * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
97 * two 512B buffers, you would do it like so:
98 *
99 * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
100 * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
101 *
102 * If you just have one buffer, use ocfs2_hamming_encode_block().
103 */
104u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
105{
106 unsigned int i, b, p = 0;
107
108 BUG_ON(!d);
109
110 /*
111 * b is the hamming code bit number. Hamming code specifies a
112 * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is
113 * for the algorithm.
114 *
115 * The i++ in the for loop is so that the start offset passed
116 * to ocfs2_find_next_bit_set() is one greater than the previously
117 * found bit.
118 */
119 for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
120 {
121 /*
122 * i is the offset in this hunk, nr + i is the total bit
123 * offset.
124 */
125 b = calc_code_bit(nr + i, &p);
126
127 /*
128 * Data bits in the resultant code are checked by
129 * parity bits that are part of the bit number
130 * representation. Huh?
131 *
132 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
133 * In other words, the parity bit at position 2^k
134 * checks bits in positions having bit k set in
135 * their binary representation. Conversely, for
136 * instance, bit 13, i.e. 1101(2), is checked by
137 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
138 * </wikipedia>
139 *
140 * Note that 'k' is the _code_ bit number. 'b' in
141 * our loop.
142 */
143 parity ^= b;
144 }
145
146 /* While the data buffer was treated as little endian, the
147 * return value is in host endian. */
148 return parity;
149}
150
151u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
152{
153 return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
154}
155
156/*
157 * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
158 * offset of the current hunk.  If the bit to be fixed is not part of the
159 * current hunk, this does nothing.
160 *
161 * If you only have one hunk, use ocfs2_hamming_fix_block().
162 */
163void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
164 unsigned int fix)
165{
166 unsigned int i, b;
167
168 BUG_ON(!d);
169
170 /*
171 * If the bit to fix has an hweight of 1, it's a parity bit. One
172 * busted parity bit is its own error. Nothing to do here.
173 */
174 if (hweight32(fix) == 1)
175 return;
176
177 /*
178 * nr + d is the bit right past the data hunk we're looking at.
179 * If the fix is after that, there is nothing to do.
180 */
181 if (fix >= calc_code_bit(nr + d, NULL))
182 return;
183
184 /*
185 * nr is the offset in the data hunk we're starting at. Let's
186 * start b at the offset in the code buffer. See hamming_encode()
187 * for a more detailed description of 'b'.
188 */
189 b = calc_code_bit(nr, NULL);
190 /* If the fix is before this hunk, nothing to do */
191 if (fix < b)
192 return;
193
194 for (i = 0; i < d; i++, b++)
195 {
196 /* Skip past parity bits */
197 while (hweight32(b) == 1)
198 b++;
199
200 /*
201 * i is the offset in this data hunk.
202 * nr + i is the offset in the total data buffer.
203 * b is the offset in the total code buffer.
204 *
205 * Thus, when b == fix, bit i in the current hunk needs
206 * fixing.
207 */
208 if (b == fix)
209 {
210 if (ocfs2_test_bit(i, data))
211 ocfs2_clear_bit(i, data);
212 else
213 ocfs2_set_bit(i, data);
214 break;
215 }
216 }
217}
218
219void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
220 unsigned int fix)
221{
222 ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
223}
224
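/*
 * Illustrative only: a self-check of the fix path above.  Flipping a
 * single data bit changes the block parity by exactly that bit's code
 * position, so stored-parity ^ recomputed-parity is the 'fix' value
 * that ocfs2_hamming_fix_block() needs to undo the damage.
 */
static void __maybe_unused hamming_fix_example(void *data,
					       unsigned int blocksize)
{
	u32 good = ocfs2_hamming_encode_block(data, blocksize);
	u32 bad;

	/* Inject a one-bit error at data bit 0. */
	if (ocfs2_test_bit(0, data))
		ocfs2_clear_bit(0, data);
	else
		ocfs2_set_bit(0, data);

	bad = ocfs2_hamming_encode_block(data, blocksize);
	ocfs2_hamming_fix_block(data, blocksize, good ^ bad);
	BUG_ON(ocfs2_hamming_encode_block(data, blocksize) != good);
}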
225/*
226 * This function generates check information for a block.
227 * data is the block to be checked. bc is a pointer to the
228 * ocfs2_block_check structure describing the crc32 and the ecc.
229 *
230 * bc should be a pointer inside data, as the function will
231 * take care of zeroing it before calculating the check information. If
232 * bc does not point inside data, the caller must make sure any inline
233 * ocfs2_block_check structures are zeroed.
234 *
235 * The data buffer must be in on-disk endian (little endian for ocfs2).
236 * bc will be filled with little-endian values and will be ready to go to
237 * disk.
238 */
239void ocfs2_block_check_compute(void *data, size_t blocksize,
240 struct ocfs2_block_check *bc)
241{
242 u32 crc;
243 u32 ecc;
244
245 memset(bc, 0, sizeof(struct ocfs2_block_check));
246
247 crc = crc32_le(~0, data, blocksize);
248 ecc = ocfs2_hamming_encode_block(data, blocksize);
249
250 /*
251 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
252 * larger than 16 bits.
253 */
254 BUG_ON(ecc > USHORT_MAX);
255
256 bc->bc_crc32e = cpu_to_le32(crc);
257 bc->bc_ecc = cpu_to_le16((u16)ecc);
258}
259
260/*
261 * This function validates existing check information. Like _compute,
262 * the function will take care of zeroing bc before calculating check codes.
263 * If bc is not a pointer inside data, the caller must have zeroed any
264 * inline ocfs2_block_check structures.
265 *
266 * Again, the data passed in should be the on-disk endian.
267 */
268int ocfs2_block_check_validate(void *data, size_t blocksize,
269 struct ocfs2_block_check *bc)
270{
271 int rc = 0;
272 struct ocfs2_block_check check;
273 u32 crc, ecc;
274
275 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
276 check.bc_ecc = le16_to_cpu(bc->bc_ecc);
277
278 memset(bc, 0, sizeof(struct ocfs2_block_check));
279
280 /* Fast path - if the crc32 validates, we're good to go */
281 crc = crc32_le(~0, data, blocksize);
282 if (crc == check.bc_crc32e)
283 goto out;
284
285 mlog(ML_ERROR,
286 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
287 (unsigned int)check.bc_crc32e, (unsigned int)crc);
288
289 /* Ok, try ECC fixups */
290 ecc = ocfs2_hamming_encode_block(data, blocksize);
291 ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
292
293 /* And check the crc32 again */
294 crc = crc32_le(~0, data, blocksize);
295 if (crc == check.bc_crc32e)
296 goto out;
297
298 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
299 (unsigned int)check.bc_crc32e, (unsigned int)crc);
300
301 rc = -EIO;
302
303out:
304 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
305 bc->bc_ecc = cpu_to_le16(check.bc_ecc);
306
307 return rc;
308}
309
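/*
 * For reference, the structure filled by the routines above is a
 * little-endian crc32 followed by 16 bits of hamming parity.  The
 * authoritative definition lives in ocfs2_fs.h; this sketch only
 * mirrors how the fields are used here:
 *
 *	struct ocfs2_block_check {
 *		__le32 bc_crc32e;	(802.3 Ethernet II CRC32)
 *		__le16 bc_ecc;		(single-error-correcting parity)
 *		__le16 bc_reserved1;
 *	};
 */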
310/*
311 * This function generates check information for a list of buffer_heads.
312 * bhs is the blocks to be checked. bc is a pointer to the
313 * ocfs2_block_check structure describing the crc32 and the ecc.
314 *
315 * bc should be a pointer inside data, as the function will
316 * take care of zeroing it before calculating the check information. If
317 * bc does not point inside data, the caller must make sure any inline
318 * ocfs2_block_check structures are zeroed.
319 *
320 * The data buffer must be in on-disk endian (little endian for ocfs2).
321 * bc will be filled with little-endian values and will be ready to go to
322 * disk.
323 */
324void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
325 struct ocfs2_block_check *bc)
326{
327 int i;
328 u32 crc, ecc;
329
330 BUG_ON(nr < 0);
331
332 if (!nr)
333 return;
334
335 memset(bc, 0, sizeof(struct ocfs2_block_check));
336
337 for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
338 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
339 /*
340 * The number of bits in a buffer is obviously b_size*8.
341 * The offset of this buffer is b_size*i, so the bit offset
342 * of this buffer is b_size*8*i.
343 */
344 ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
345 bhs[i]->b_size * 8,
346 bhs[i]->b_size * 8 * i);
347 }
348
349 /*
350 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
351 * larger than 16 bits.
352 */
353 BUG_ON(ecc > USHORT_MAX);
354
355 bc->bc_crc32e = cpu_to_le32(crc);
356 bc->bc_ecc = cpu_to_le16((u16)ecc);
357}
358
359/*
360 * This function validates existing check information on a list of
361 * buffer_heads. Like _compute_bhs, the function will take care of
362 * zeroing bc before calculating check codes. If bc is not a pointer
363 * inside data, the caller must have zeroed any inline
364 * ocfs2_block_check structures.
365 *
366 * Again, the data passed in should be the on-disk endian.
367 */
368int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
369 struct ocfs2_block_check *bc)
370{
371 int i, rc = 0;
372 struct ocfs2_block_check check;
373 u32 crc, ecc, fix;
374
375 BUG_ON(nr < 0);
376
377 if (!nr)
378 return 0;
379
380 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
381 check.bc_ecc = le16_to_cpu(bc->bc_ecc);
382
383 memset(bc, 0, sizeof(struct ocfs2_block_check));
384
385 /* Fast path - if the crc32 validates, we're good to go */
386 for (i = 0, crc = ~0; i < nr; i++)
387 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
388 if (crc == check.bc_crc32e)
389 goto out;
390
391 mlog(ML_ERROR,
392 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
393 (unsigned int)check.bc_crc32e, (unsigned int)crc);
394
395 /* Ok, try ECC fixups */
396 for (i = 0, ecc = 0; i < nr; i++) {
397 /*
398 * The number of bits in a buffer is obviously b_size*8.
399 * The offset of this buffer is b_size*i, so the bit offset
400 * of this buffer is b_size*8*i.
401 */
402 ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
403 bhs[i]->b_size * 8,
404 bhs[i]->b_size * 8 * i);
405 }
406 fix = ecc ^ check.bc_ecc;
407 for (i = 0; i < nr; i++) {
408 /*
409 * Try the fix against each buffer. It will only affect
410 * one of them.
411 */
412 ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
413 bhs[i]->b_size * 8 * i, fix);
414 }
415
416 /* And check the crc32 again */
417 for (i = 0, crc = ~0; i < nr; i++)
418 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
419 if (crc == check.bc_crc32e)
420 goto out;
421
422 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
423 (unsigned int)check.bc_crc32e, (unsigned int)crc);
424
425 rc = -EIO;
426
427out:
428 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
429 bc->bc_ecc = cpu_to_le16(check.bc_ecc);
430
431 return rc;
432}
433
434/*
435 * These are the main API. They check the superblock flag before
436 * calling the underlying operations.
437 *
438 * They expect the buffer(s) to be in disk format.
439 */
440void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
441 struct ocfs2_block_check *bc)
442{
443 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
444 ocfs2_block_check_compute(data, sb->s_blocksize, bc);
445}
446
447int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
448 struct ocfs2_block_check *bc)
449{
450 int rc = 0;
451
452 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
453 rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
454
455 return rc;
456}
457
458void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
459 struct buffer_head **bhs, int nr,
460 struct ocfs2_block_check *bc)
461{
462 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
463 ocfs2_block_check_compute_bhs(bhs, nr, bc);
464}
465
466int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
467 struct buffer_head **bhs, int nr,
468 struct ocfs2_block_check *bc)
469{
470 int rc = 0;
471
472 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
473 rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
474
475 return rc;
476}
477
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000000..70ec3feda32f
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * blockcheck.h
5 *
6 * Checksum and ECC codes for the OCFS2 userspace library.
7 *
8 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_BLOCKCHECK_H
21#define OCFS2_BLOCKCHECK_H
22
23
24/* High level block API */
25void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
26 struct ocfs2_block_check *bc);
27int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
28 struct ocfs2_block_check *bc);
29void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
30 struct buffer_head **bhs, int nr,
31 struct ocfs2_block_check *bc);
32int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
33 struct buffer_head **bhs, int nr,
34 struct ocfs2_block_check *bc);
35
36/* Lower level API */
37void ocfs2_block_check_compute(void *data, size_t blocksize,
38 struct ocfs2_block_check *bc);
39int ocfs2_block_check_validate(void *data, size_t blocksize,
40 struct ocfs2_block_check *bc);
41void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
42 struct ocfs2_block_check *bc);
43int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
44 struct ocfs2_block_check *bc);
45
46/*
47 * Hamming code functions
48 */
49
50/*
51 * Encoding hamming code parity bits for a buffer.
52 *
53 * This is the low level encoder function. It can be called across
54 * multiple hunks just like the crc32 code. 'd' is the number of bits
55 * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
56 * two 512B buffers, you would do it like so:
57 *
58 * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
59 * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
60 *
61 * If you just have one buffer, use ocfs2_hamming_encode_block().
62 */
63u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
64 unsigned int nr);
65/*
66 * Fix a buffer with a bit error. The 'fix' is the original parity
67 * xor'd with the parity calculated now.
68 *
69 * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
70 * offset of the current hunk.  If the bit to be fixed is not part of the
71 * current hunk, this does nothing.
72 *
73 * If you only have one buffer, use ocfs2_hamming_fix_block().
74 */
75void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
76 unsigned int fix);
77
78/* Convenience wrappers for a single buffer of data */
79extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
80extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
81 unsigned int fix);
82#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7c..15c8e6deee2e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
 
 #include "buffer_head_io.h"
 
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+        BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                       struct inode *inode)
 {
@@ -166,7 +178,9 @@ bail:
 }
 
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
-                      struct buffer_head *bhs[], int flags)
+                      struct buffer_head *bhs[], int flags,
+                      int (*validate)(struct super_block *sb,
+                                      struct buffer_head *bh))
 {
         int status = 0;
         int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 
                         clear_buffer_uptodate(bh);
                         get_bh(bh); /* for end_buffer_read_sync() */
+                        if (validate)
+                                set_buffer_needs_validate(bh);
                         bh->b_end_io = end_buffer_read_sync;
                         submit_bh(READ, bh);
                         continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
                         bhs[i] = NULL;
                         continue;
                 }
+
+                if (buffer_needs_validate(bh)) {
+                        /* We never set NeedsValidate if the
+                         * buffer was held by the journal, so
+                         * that better not have changed */
+                        BUG_ON(buffer_jbd(bh));
+                        clear_buffer_needs_validate(bh);
+                        status = validate(inode->i_sb, bh);
+                        if (status) {
+                                put_bh(bh);
+                                bhs[i] = NULL;
+                                continue;
+                        }
+                }
         }
 
         /* Always set the buffer in the cache, even if it was
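For readers who have not met the buffer_head helpers: the BUFFER_FNS(NeedsValidate, needs_validate) invocation above generates the accessors used in this hunk. Per include/linux/buffer_head.h it expands to roughly the following (a sketch of the expansion, not new code in the patch):

static inline void set_buffer_needs_validate(struct buffer_head *bh)
{
        set_bit(BH_NeedsValidate, &bh->b_state);
}

static inline void clear_buffer_needs_validate(struct buffer_head *bh)
{
        clear_bit(BH_NeedsValidate, &bh->b_state);
}

static inline int buffer_needs_validate(const struct buffer_head *bh)
{
        return test_bit(BH_NeedsValidate, &bh->b_state);
}

That lets the submit path mark a buffer before I/O and the completion path test-and-clear it afterwards, with no state beyond the existing b_state word.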
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade7..c75d682dadd8 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
                               int uptodate);
 
-static inline int ocfs2_read_block(struct inode *inode,
-                                   u64 off,
-                                   struct buffer_head **bh);
-
 int ocfs2_write_block(struct ocfs2_super *osb,
                       struct buffer_head *bh,
                       struct inode *inode);
-int ocfs2_read_blocks(struct inode *inode,
-                      u64 block,
-                      int nr,
-                      struct buffer_head *bhs[],
-                      int flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
                            unsigned int nr, struct buffer_head *bhs[]);
 
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+                      struct buffer_head *bhs[], int flags,
+                      int (*validate)(struct super_block *sb,
+                                      struct buffer_head *bh));
+
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
                                 struct buffer_head *bh);
 
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD 8
 
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-                                   struct buffer_head **bh)
+                                   struct buffer_head **bh,
+                                   int (*validate)(struct super_block *sb,
+                                                   struct buffer_head *bh))
 {
         int status = 0;
 
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
                 goto bail;
         }
 
-        status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+        status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
 
 bail:
         return status;
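The validate hook slots in as shown below; ocfs2_validate_dir_block() in the fs/ocfs2/dir.c hunks later in this patch is a real instance. A minimal sketch, assuming a hypothetical metadata type that keeps its ocfs2_block_check in a member named check (both the type and the member are invented for illustration):

static int example_validate(struct super_block *sb, struct buffer_head *bh)
{
        /* 'struct example_block' and its 'check' member are invented;
         * every real ocfs2 metadata type has its own layout. */
        struct example_block *blk = (struct example_block *)bh->b_data;

        return ocfs2_validate_meta_ecc(sb, bh->b_data, &blk->check);
}

static int example_read_block(struct inode *inode, u64 blkno,
                              struct buffer_head **bh)
{
        /* Validation runs only when the read misses the uptodate cache. */
        return ocfs2_read_block(inode, blkno, bh, example_validate);
}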
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c03..04697ba7f73e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
 
         while (!kthread_should_stop() && !reg->hr_unclean_stop) {
                 /* We track the time spent inside
-                 * o2hb_do_disk_heartbeat so that we avoid more then
+                 * o2hb_do_disk_heartbeat so that we avoid more than
                  * hr_timeout_ms between disk writes. On busy systems
                  * this should result in a heartbeat which is less
                  * likely to time itself out. */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef6..96df5416993e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
         define_mask(QUORUM),
         define_mask(EXPORT),
         define_mask(XATTR),
+        define_mask(QUOTA),
         define_mask(ERROR),
         define_mask(NOTICE),
         define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c680471..7e72a81bc2d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
 #define ML_QUORUM       0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT       0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR        0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA        0x0000000040000000ULL /* ocfs2 quota operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR        0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE       0x0000000200000000ULL /* sent to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb85187..f2c4098cf337 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -47,6 +48,7 @@
47#include "ocfs2.h" 48#include "ocfs2.h"
48 49
49#include "alloc.h" 50#include "alloc.h"
51#include "blockcheck.h"
50#include "dir.h" 52#include "dir.h"
51#include "dlmglue.h" 53#include "dlmglue.h"
52#include "extent_map.h" 54#include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                                struct ocfs2_alloc_context *meta_ac,
                                struct buffer_head **new_bh);
 
-static struct buffer_head *ocfs2_bread(struct inode *inode,
-                                       int block, int *err, int reada)
-{
-        struct buffer_head *bh = NULL;
-        int tmperr;
-        u64 p_blkno;
-        int readflags = 0;
-
-        if (reada)
-                readflags |= OCFS2_BH_READAHEAD;
-
-        if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-            i_size_read(inode)) {
-                BUG_ON(!reada);
-                return NULL;
-        }
-
-        down_read(&OCFS2_I(inode)->ip_alloc_sem);
-        tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-                                             NULL);
-        up_read(&OCFS2_I(inode)->ip_alloc_sem);
-        if (tmperr < 0) {
-                mlog_errno(tmperr);
-                goto fail;
-        }
-
-        tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
-        if (tmperr < 0)
-                goto fail;
-
-        tmperr = 0;
-
-        *err = 0;
-        return bh;
-
-fail:
-        brelse(bh);
-        bh = NULL;
-
-        *err = -EIO;
-        return NULL;
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
+{
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                return 0;
+
+        return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+}
+
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+{
+        return ocfs2_meta_ecc(osb);
+}
+
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+        return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
+
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
+
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+                                                            void *data)
+{
+        char *p = data;
+
+        p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+        return (struct ocfs2_dir_block_trailer *)p;
+}
+
+/*
+ * XXX: This is executed once on every dirent.  We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+                                  struct ocfs2_dir_entry *de,
+                                  unsigned long offset,
+                                  unsigned long blklen)
+{
+        unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
+
+        if (!ocfs2_dir_has_trailer(dir))
+                return 0;
+
+        if (offset != toff)
+                return 0;
+
+        return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+                                   struct buffer_head *bh)
+{
+        struct ocfs2_dir_block_trailer *trailer;
+
+        trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+        strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+        trailer->db_compat_rec_len =
+                        cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+        trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+        trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
 }
 
 /*
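The trailer initialized above is deliberately shaped like an empty dirent: db_compat_rec_len covers the trailer bytes, so kernels that predate trailers walk over it harmlessly. A small sketch of the resulting block layout, using only the helpers introduced in this hunk (the debug function itself is invented):

static void __maybe_unused show_trailer_layout(struct super_block *sb,
                                               struct buffer_head *bh)
{
        /* Dirents may use [0, off); the trailer owns [off, blocksize)
         * and masquerades as an unusable empty dirent via its compat
         * fields, so pre-trailer dirent walkers simply skip it. */
        unsigned int off = ocfs2_dir_trailer_blk_off(sb);
        struct ocfs2_dir_block_trailer *trailer =
                ocfs2_trailer_from_bh(bh, sb);

        mlog(0, "trailer at %u of %lu, compat rec_len %u\n",
             off, sb->s_blocksize,
             le16_to_cpu(trailer->db_compat_rec_len));
}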
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
         struct ocfs2_dinode *di;
         struct ocfs2_inline_data *data;
 
-        ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(dir, &di_bh);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -250,6 +277,108 @@ out:
         return NULL;
 }
 
+static int ocfs2_validate_dir_block(struct super_block *sb,
+                                    struct buffer_head *bh)
+{
+        int rc;
+        struct ocfs2_dir_block_trailer *trailer =
+                ocfs2_trailer_from_bh(bh, sb);
+
+
+        /*
+         * We don't validate dirents here, that's handled
+         * in-place when the code walks them.
+         */
+        mlog(0, "Validating dirblock %llu\n",
+             (unsigned long long)bh->b_blocknr);
+
+        BUG_ON(!buffer_uptodate(bh));
+
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         *
+         * Note that we are safe to call this even if the directory
+         * doesn't have a trailer.  Filesystems without metaecc will do
+         * nothing, and filesystems with it will have one.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+        if (rc)
+                mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+
+        return rc;
+}
+
+/*
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+                                struct buffer_head **bh, int flags)
+{
+        int rc = 0;
+        struct buffer_head *tmp = *bh;
+        struct ocfs2_dir_block_trailer *trailer;
+
+        rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+                                    ocfs2_validate_dir_block);
+        if (rc) {
+                mlog_errno(rc);
+                goto out;
+        }
+
+        /*
+         * We check the trailer here rather than in
+         * ocfs2_validate_dir_block() because that function doesn't have
+         * the inode to test.
+         */
+        if (!(flags & OCFS2_BH_READAHEAD) &&
+            ocfs2_dir_has_trailer(inode)) {
+                trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+                if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Invalid dirblock #%llu: "
+                                    "signature = %.*s\n",
+                                    (unsigned long long)tmp->b_blocknr, 7,
+                                    trailer->db_signature);
+                        goto out;
+                }
+                if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Directory block #%llu has an invalid "
+                                    "db_blkno of %llu",
+                                    (unsigned long long)tmp->b_blocknr,
+                                    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+                        goto out;
+                }
+                if (le64_to_cpu(trailer->db_parent_dinode) !=
+                    OCFS2_I(inode)->ip_blkno) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Directory block #%llu on dinode "
+                                    "#%llu has an invalid parent_dinode "
+                                    "of %llu",
+                                    (unsigned long long)tmp->b_blocknr,
+                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+                        goto out;
+                }
+        }
+
+        /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+        if (!*bh)
+                *bh = tmp;
+
+out:
+        return rc ? -EIO : 0;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
                                                struct inode *dir,
                                                struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
                         }
                         num++;
 
-                        bh = ocfs2_bread(dir, b++, &err, 1);
+                        bh = NULL;
+                        err = ocfs2_read_dir_block(dir, b++, &bh,
+                                                   OCFS2_BH_READAHEAD);
                         bh_use[ra_max] = bh;
                 }
         }
         if ((bh = bh_use[ra_ptr++]) == NULL)
                 goto next;
-        if (ocfs2_read_block(dir, block, &bh)) {
+        if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
                 /* read error, skip block & hope for the best.
-                 * ocfs2_read_block() has released the bh. */
+                 * ocfs2_read_dir_block() has released the bh. */
                 ocfs2_error(dir->i_sb, "reading directory %llu, "
                             "offset %lu\n",
                             (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
                        struct inode *new_entry_inode)
 {
         int ret;
+        ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
         /*
          * The same code works fine for both inline-data and extent
-         * based directories, so no need to split this up.
+         * based directories, so no need to split this up.  The only
+         * difference is the journal_access function.
          */
 
-        ret = ocfs2_journal_access(handle, dir, de_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                access = ocfs2_journal_access_di;
+
+        ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 {
         struct ocfs2_dir_entry *de, *pde;
         int i, status = -ENOENT;
+        ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
         mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
 
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                access = ocfs2_journal_access_di;
+
         i = 0;
         pde = NULL;
         de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                         goto bail;
                 }
                 if (de == de_del) {
-                        status = ocfs2_journal_access(handle, dir, bh,
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                        status = access(handle, dir, bh,
+                                        OCFS2_JOURNAL_ACCESS_WRITE);
                         if (status < 0) {
                                 status = -EIO;
                                 mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
         struct ocfs2_dinode *di;
         struct ocfs2_inline_data *data;
 
-        ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(dir, &di_bh);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
                 goto bail;
         }
 
+        /* We're guaranteed that we should have space, so we
+         * can't possibly have hit the trailer...right? */
+        mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+                        "Hit dir trailer trying to insert %.*s "
+                        "(namelen %d) into directory %llu.  "
+                        "offset is %lu, trailer offset is %d\n",
+                        namelen, name, namelen,
+                        (unsigned long long)parent_fe_bh->b_blocknr,
+                        offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
         if (ocfs2_dirent_would_fit(de, rec_len)) {
                 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
                 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
                         goto bail;
                 }
 
-                status = ocfs2_journal_access(handle, dir, insert_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                if (insert_bh == parent_fe_bh)
+                        status = ocfs2_journal_access_di(handle, dir,
+                                                         insert_bh,
+                                                         OCFS2_JOURNAL_ACCESS_WRITE);
+                else
+                        status = ocfs2_journal_access_db(handle, dir,
+                                                         insert_bh,
+                                                         OCFS2_JOURNAL_ACCESS_WRITE);
                 /* By now the buffer is marked for journaling */
                 offset += le16_to_cpu(de->rec_len);
                 if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
                         retval = 0;
                         goto bail;
                 }
+
                 offset += le16_to_cpu(de->rec_len);
                 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
         }
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
         struct ocfs2_inline_data *data;
         struct ocfs2_dir_entry *de;
 
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
         if (ret) {
                 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
                      (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
         int i, stored;
         struct buffer_head * bh, * tmp;
         struct ocfs2_dir_entry * de;
-        int err;
         struct super_block * sb = inode->i_sb;
         unsigned int ra_sectors = 16;
 
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 
         while (!error && !stored && *f_pos < i_size_read(inode)) {
                 blk = (*f_pos) >> sb->s_blocksize_bits;
-                bh = ocfs2_bread(inode, blk, &err, 0);
-                if (!bh) {
-                        mlog(ML_ERROR,
-                             "directory #%llu contains a hole at offset %lld\n",
-                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                             *f_pos);
+                if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+                        /* Skip the corrupt dirblock and keep trying */
                         *f_pos += sb->s_blocksize - offset;
                         continue;
                 }
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
                     || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
                         for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
                              i > 0; i--) {
-                                tmp = ocfs2_bread(inode, ++blk, &err, 1);
-                                brelse(tmp);
+                                tmp = NULL;
+                                if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+                                                          OCFS2_BH_READAHEAD))
+                                        brelse(tmp);
                         }
                         last_ra_blk = blk;
                         ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
                 }
                 offset = 0;
                 brelse(bh);
+                bh = NULL;
         }
 
         stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
         return !priv.seen_other;
 }
 
-static void ocfs2_fill_initial_dirents(struct inode *inode,
-                                       struct inode *parent,
-                                       char *start, unsigned int size)
+/*
+ * Fills "." and ".." dirents in a new directory block.  Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header.  It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+                                                          struct inode *parent,
+                                                          char *start,
+                                                          unsigned int size)
 {
         struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
 
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
         de->name_len = 2;
         strcpy(de->name, "..");
         ocfs2_set_de_type(de, S_IFDIR);
+
+        return de;
 }
 
 /*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
         struct ocfs2_inline_data *data = &di->id2.i_data;
         unsigned int size = le16_to_cpu(data->id_count);
 
-        ret = ocfs2_journal_access(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
                                  struct ocfs2_alloc_context *data_ac)
 {
         int status;
+        unsigned int size = osb->sb->s_blocksize;
         struct buffer_head *new_bh = NULL;
+        struct ocfs2_dir_entry *de;
 
         mlog_entry_void();
 
+        if (ocfs2_supports_dir_trailer(osb))
+                size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
         status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
                                      data_ac, NULL, &new_bh);
         if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
         ocfs2_set_new_buffer_uptodate(inode, new_bh);
 
-        status = ocfs2_journal_access(handle, inode, new_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+        status = ocfs2_journal_access_db(handle, inode, new_bh,
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
         if (status < 0) {
                 mlog_errno(status);
                 goto bail;
         }
         memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
-        ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
-                                   osb->sb->s_blocksize);
+        de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+        if (ocfs2_supports_dir_trailer(osb))
+                ocfs2_init_dir_trailer(inode, new_bh);
 
         status = ocfs2_journal_dirty(handle, new_bh);
         if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                                  data_ac);
 }
 
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-                                     unsigned int new_size)
+                                     struct super_block *sb)
 {
         struct ocfs2_dir_entry *de;
         struct ocfs2_dir_entry *prev_de;
         char *de_buf, *limit;
-        unsigned int bytes = new_size - old_size;
+        unsigned int new_size = sb->s_blocksize;
+        unsigned int bytes;
+
+        if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+                new_size = ocfs2_dir_trailer_blk_off(sb);
+
+        bytes = new_size - old_size;
 
         limit = start + old_size;
         de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                                   unsigned int blocks_wanted,
                                   struct buffer_head **first_block_bh)
 {
-        int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
         u32 alloc, bit_off, len;
         struct super_block *sb = dir->i_sb;
+        int ret, credits = ocfs2_inline_to_extents_credits(sb);
         u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
         struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
         struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
         handle_t *handle;
         struct ocfs2_extent_tree et;
+        int did_quota = 0;
 
         ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                 goto out_sem;
         }
 
+        if (vfs_dq_alloc_space_nodirty(dir,
+                                ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
+        did_quota = 1;
         /*
          * Try to claim as many clusters as the bitmap can give though
          * if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
         ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
 
-        ret = ocfs2_journal_access(handle, dir, dirdata_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
+        ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
         if (ret) {
                 mlog_errno(ret);
                 goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
         memset(dirdata_bh->b_data + i_size_read(dir), 0,
                sb->s_blocksize - i_size_read(dir));
-        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
-                                 sb->s_blocksize);
+        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
+        if (ocfs2_supports_dir_trailer(osb))
+                ocfs2_init_dir_trailer(dir, dirdata_bh);
 
         ret = ocfs2_journal_dirty(handle, dirdata_bh);
         if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
          * We let the later dirent insert modify c/mtime - to the user
          * the data hasn't changed.
          */
-        ret = ocfs2_journal_access(handle, dir, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
+        ret = ocfs2_journal_access_di(handle, dir, di_bh,
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
         if (ret) {
                 mlog_errno(ret);
                 goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         dirdata_bh = NULL;
 
 out_commit:
+        if (ret < 0 && did_quota)
+                vfs_dq_free_space_nodirty(dir,
+                        ocfs2_clusters_to_bytes(osb->sb, 2));
         ocfs2_commit_trans(osb, handle);
 
 out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                                struct buffer_head **new_bh)
 {
         int status;
-        int extend;
+        int extend, did_quota = 0;
         u64 p_blkno, v_blkno;
 
         spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
         if (extend) {
                 u32 offset = OCFS2_I(dir)->ip_clusters;
 
+                if (vfs_dq_alloc_space_nodirty(dir,
+                                        ocfs2_clusters_to_bytes(sb, 1))) {
+                        status = -EDQUOT;
+                        goto bail;
+                }
+                did_quota = 1;
+
                 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
                                               1, 0, parent_fe_bh, handle,
                                               data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1445 } 1645 }
1446 status = 0; 1646 status = 0;
1447bail: 1647bail:
1648 if (did_quota && status < 0)
1649 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
1448 mlog_exit(status); 1650 mlog_exit(status);
1449 return status; 1651 return status;
1450} 1652}
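
The quota hunks above all follow the same reserve-then-roll-back shape: claim the quota space before any allocation work that can fail, record the claim in a did_quota flag, and hand the space back on the shared error path. Below is a minimal userspace sketch of that shape; quota_alloc()/quota_free() and the byte budget are hypothetical stand-ins, not the vfs_dq_* API.

/*
 * Reserve-then-roll-back pattern, sketched with invented helpers.
 */
#include <stdio.h>

static long quota_remaining = 4096;   /* pretend per-user byte budget */

static int quota_alloc(long bytes)
{
	if (bytes > quota_remaining)
		return -1;            /* would be -EDQUOT in the kernel */
	quota_remaining -= bytes;
	return 0;
}

static void quota_free(long bytes)
{
	quota_remaining += bytes;
}

static int do_work_that_may_fail(void) { return -1; /* simulate failure */ }

static int extend_dir(long bytes)
{
	int ret, did_quota = 0;

	ret = quota_alloc(bytes);
	if (ret)
		goto out;             /* nothing to undo yet */
	did_quota = 1;

	ret = do_work_that_may_fail();
	if (ret)
		goto out;             /* reservation must be returned */
out:
	if (ret < 0 && did_quota)
		quota_free(bytes);    /* mirror of the alloc above */
	return ret;
}

int main(void)
{
	extend_dir(1024);
	printf("remaining after failed extend: %ld\n", quota_remaining);
	return 0;
}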
@@ -1569,16 +1771,22 @@ do_extend:
1569 1771
1570 ocfs2_set_new_buffer_uptodate(dir, new_bh); 1772 ocfs2_set_new_buffer_uptodate(dir, new_bh);
1571 1773
1572 status = ocfs2_journal_access(handle, dir, new_bh, 1774 status = ocfs2_journal_access_db(handle, dir, new_bh,
1573 OCFS2_JOURNAL_ACCESS_CREATE); 1775 OCFS2_JOURNAL_ACCESS_CREATE);
1574 if (status < 0) { 1776 if (status < 0) {
1575 mlog_errno(status); 1777 mlog_errno(status);
1576 goto bail; 1778 goto bail;
1577 } 1779 }
1578 memset(new_bh->b_data, 0, sb->s_blocksize); 1780 memset(new_bh->b_data, 0, sb->s_blocksize);
1781
1579 de = (struct ocfs2_dir_entry *) new_bh->b_data; 1782 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1580 de->inode = 0; 1783 de->inode = 0;
1581 de->rec_len = cpu_to_le16(sb->s_blocksize); 1784 if (ocfs2_dir_has_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh);
1787 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 }
1582 status = ocfs2_journal_dirty(handle, new_bh); 1790 status = ocfs2_journal_dirty(handle, new_bh);
1583 if (status < 0) { 1791 if (status < 0) {
1584 mlog_errno(status); 1792 mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1620 unsigned int *blocks_wanted) 1828 unsigned int *blocks_wanted)
1621{ 1829{
1622 int ret; 1830 int ret;
1831 struct super_block *sb = dir->i_sb;
1623 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1832 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1624 struct ocfs2_dir_entry *de, *last_de = NULL; 1833 struct ocfs2_dir_entry *de, *last_de = NULL;
1625 char *de_buf, *limit; 1834 char *de_buf, *limit;
1626 unsigned long offset = 0; 1835 unsigned long offset = 0;
1627 unsigned int rec_len, new_rec_len; 1836 unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
1837
1838 /*
1839 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree.
1841 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
1628 1846
1629 de_buf = di->id2.i_data.id_data; 1847 de_buf = di->id2.i_data.id_data;
1630 limit = de_buf + i_size_read(dir); 1848 limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1641 ret = -EEXIST; 1859 ret = -EEXIST;
1642 goto out; 1860 goto out;
1643 } 1861 }
1862 /*
1863 * No need to check for a trailing dirent record here as
1864 * they're not used for inline dirs.
1865 */
1866
1644 if (ocfs2_dirent_would_fit(de, rec_len)) { 1867 if (ocfs2_dirent_would_fit(de, rec_len)) {
1645 /* Ok, we found a spot. Return this bh and let 1868 /* Ok, we found a spot. Return this bh and let
1646 * the caller actually fill it in. */ 1869 * the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1661 * dirent can be found. 1884 * dirent can be found.
1662 */ 1885 */
1663 *blocks_wanted = 1; 1886 *blocks_wanted = 1;
1664 new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir)); 1887 new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
1665 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) 1888 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
1666 *blocks_wanted = 2; 1889 *blocks_wanted = 2;
1667 1890
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1679 struct ocfs2_dir_entry *de; 1902 struct ocfs2_dir_entry *de;
1680 struct super_block *sb = dir->i_sb; 1903 struct super_block *sb = dir->i_sb;
1681 int status; 1904 int status;
1905 int blocksize = dir->i_sb->s_blocksize;
1682 1906
1683 bh = ocfs2_bread(dir, 0, &status, 0); 1907 status = ocfs2_read_dir_block(dir, 0, &bh, 0);
1684 if (!bh) { 1908 if (status) {
1685 mlog_errno(status); 1909 mlog_errno(status);
1686 goto bail; 1910 goto bail;
1687 } 1911 }
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1702 status = -ENOSPC; 1926 status = -ENOSPC;
1703 goto bail; 1927 goto bail;
1704 } 1928 }
1705 bh = ocfs2_bread(dir, 1929 status = ocfs2_read_dir_block(dir,
1706 offset >> sb->s_blocksize_bits, 1930 offset >> sb->s_blocksize_bits,
1707 &status, 1931 &bh, 0);
1708 0); 1932 if (status) {
1709 if (!bh) {
1710 mlog_errno(status); 1933 mlog_errno(status);
1711 goto bail; 1934 goto bail;
1712 } 1935 }
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1721 status = -EEXIST; 1944 status = -EEXIST;
1722 goto bail; 1945 goto bail;
1723 } 1946 }
1947
1948 if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
1949 blocksize))
1950 goto next;
1951
1724 if (ocfs2_dirent_would_fit(de, rec_len)) { 1952 if (ocfs2_dirent_would_fit(de, rec_len)) {
1725 /* Ok, we found a spot. Return this bh and let 1953 /* Ok, we found a spot. Return this bh and let
1726 * the caller actually fill it in. */ 1954 * the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1729 status = 0; 1957 status = 0;
1730 goto bail; 1958 goto bail;
1731 } 1959 }
1960next:
1732 offset += le16_to_cpu(de->rec_len); 1961 offset += le16_to_cpu(de->rec_len);
1733 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); 1962 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
1734 } 1963 }
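
The dir.c hunks above thread one idea through several functions: when a directory block carries a trailer at its tail, the last usable byte moves up by the trailer size, so free-space math in block zero shrinks accordingly and dirent scans must step over the record that covers the trailer. A small sketch under an invented layout, not the on-disk format:

#include <stdio.h>
#include <stdint.h>

#define BLOCKSIZE 4096

struct dir_trailer {          /* stand-in for ocfs2_dir_block_trailer */
	uint64_t blkno;
	uint16_t rec_len;     /* covers the trailer so scans skip it */
};

static unsigned trailer_blk_off(void)
{
	return BLOCKSIZE - sizeof(struct dir_trailer);
}

/* free bytes in block 0 once an inline dir is pushed to an extent */
static unsigned free_space(unsigned i_size, int has_trailer)
{
	return (has_trailer ? trailer_blk_off() : BLOCKSIZE) - i_size;
}

int main(void)
{
	printf("trailer starts at %u\n", trailer_blk_off());
	printf("free space with trailer: %u\n", free_space(200, 1));
	printf("free space without trailer: %u\n", free_space(200, 0));
	return 0;
}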
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d87..c511e2e18e9f 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
83 struct buffer_head *fe_bh, 83 struct buffer_head *fe_bh,
84 struct ocfs2_alloc_context *data_ac); 84 struct ocfs2_alloc_context *data_ac);
85 85
86struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
87 void *data);
86#endif /* OCFS2_DIR_H */ 88#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8ba..d07ddbe4b283 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
275 struct list_head *iter, *head=NULL; 275 struct list_head *iter, *head=NULL;
276 u64 cookie; 276 u64 cookie;
277 u32 flags; 277 u32 flags;
278 u8 node;
278 279
279 if (!dlm_grab(dlm)) { 280 if (!dlm_grab(dlm)) {
280 dlm_error(DLM_REJECTED); 281 dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
286 287
287 name = past->name; 288 name = past->name;
288 locklen = past->namelen; 289 locklen = past->namelen;
289 cookie = be64_to_cpu(past->cookie); 290 cookie = past->cookie;
290 flags = be32_to_cpu(past->flags); 291 flags = be32_to_cpu(past->flags);
292 node = past->node_idx;
291 293
292 if (locklen > DLM_LOCKID_NAME_MAX) { 294 if (locklen > DLM_LOCKID_NAME_MAX) {
293 ret = DLM_IVBUFLEN; 295 ret = DLM_IVBUFLEN;
294 mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); 296 mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
297 "handler!\n", locklen);
295 goto leave; 298 goto leave;
296 } 299 }
297 300
298 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == 301 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
299 (LKM_PUT_LVB|LKM_GET_LVB)) { 302 (LKM_PUT_LVB|LKM_GET_LVB)) {
300 mlog(ML_ERROR, "both PUT and GET lvb specified\n"); 303 mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
304 flags);
301 ret = DLM_BADARGS; 305 ret = DLM_BADARGS;
302 goto leave; 306 goto leave;
303 } 307 }
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
310 if (past->type != DLM_AST && 314 if (past->type != DLM_AST &&
311 past->type != DLM_BAST) { 315 past->type != DLM_BAST) {
312 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" 316 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
313 "name=%.*s\n", past->type, 317 "name=%.*s, node=%u\n", past->type,
314 dlm_get_lock_cookie_node(cookie), 318 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
315 dlm_get_lock_cookie_seq(cookie), 319 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
316 locklen, name); 320 locklen, name, node);
317 ret = DLM_IVLOCKID; 321 ret = DLM_IVLOCKID;
318 goto leave; 322 goto leave;
319 } 323 }
320 324
321 res = dlm_lookup_lockres(dlm, name, locklen); 325 res = dlm_lookup_lockres(dlm, name, locklen);
322 if (!res) { 326 if (!res) {
323 mlog(0, "got %sast for unknown lockres! " 327 mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
324 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 328 "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
325 past->type == DLM_AST ? "" : "b", 329 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
326 dlm_get_lock_cookie_node(cookie), 330 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
327 dlm_get_lock_cookie_seq(cookie), 331 locklen, name, node);
328 locklen, name, locklen);
329 ret = DLM_IVLOCKID; 332 ret = DLM_IVLOCKID;
330 goto leave; 333 goto leave;
331 } 334 }
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
337 340
338 spin_lock(&res->spinlock); 341 spin_lock(&res->spinlock);
339 if (res->state & DLM_LOCK_RES_RECOVERING) { 342 if (res->state & DLM_LOCK_RES_RECOVERING) {
340 mlog(0, "responding with DLM_RECOVERING!\n"); 343 mlog(0, "Responding with DLM_RECOVERING!\n");
341 ret = DLM_RECOVERING; 344 ret = DLM_RECOVERING;
342 goto unlock_out; 345 goto unlock_out;
343 } 346 }
344 if (res->state & DLM_LOCK_RES_MIGRATING) { 347 if (res->state & DLM_LOCK_RES_MIGRATING) {
345 mlog(0, "responding with DLM_MIGRATING!\n"); 348 mlog(0, "Responding with DLM_MIGRATING!\n");
346 ret = DLM_MIGRATING; 349 ret = DLM_MIGRATING;
347 goto unlock_out; 350 goto unlock_out;
348 } 351 }
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
351 lock = NULL; 354 lock = NULL;
352 list_for_each(iter, head) { 355 list_for_each(iter, head) {
353 lock = list_entry (iter, struct dlm_lock, list); 356 lock = list_entry (iter, struct dlm_lock, list);
354 if (be64_to_cpu(lock->ml.cookie) == cookie) 357 if (lock->ml.cookie == cookie)
355 goto do_ast; 358 goto do_ast;
356 } 359 }
357 360
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
363 366
364 list_for_each(iter, head) { 367 list_for_each(iter, head) {
365 lock = list_entry (iter, struct dlm_lock, list); 368 lock = list_entry (iter, struct dlm_lock, list);
366 if (be64_to_cpu(lock->ml.cookie) == cookie) 369 if (lock->ml.cookie == cookie)
367 goto do_ast; 370 goto do_ast;
368 } 371 }
369 372
370 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " 373 mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
371 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 374 "node=%u\n", past->type == DLM_AST ? "" : "b",
372 dlm_get_lock_cookie_node(cookie), 375 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
373 dlm_get_lock_cookie_seq(cookie), 376 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
374 locklen, name, locklen); 377 locklen, name, node);
375 378
376 ret = DLM_NORMAL; 379 ret = DLM_NORMAL;
377unlock_out: 380unlock_out:
@@ -383,8 +386,8 @@ do_ast:
383 if (past->type == DLM_AST) { 386 if (past->type == DLM_AST) {
384 /* do not alter lock refcount. switching lists. */ 387 /* do not alter lock refcount. switching lists. */
385 list_move_tail(&lock->list, &res->granted); 388 list_move_tail(&lock->list, &res->granted);
386 mlog(0, "ast: adding to granted list... type=%d, " 389 mlog(0, "ast: Adding to granted list... type=%d, "
387 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 390 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
388 if (lock->ml.convert_type != LKM_IVMODE) { 391 if (lock->ml.convert_type != LKM_IVMODE) {
389 lock->ml.type = lock->ml.convert_type; 392 lock->ml.type = lock->ml.convert_type;
390 lock->ml.convert_type = LKM_IVMODE; 393 lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
408 dlm_do_local_bast(dlm, res, lock, past->blocked_type); 411 dlm_do_local_bast(dlm, res, lock, past->blocked_type);
409 412
410leave: 413leave:
411
412 if (res) 414 if (res)
413 dlm_lockres_put(res); 415 dlm_lockres_put(res);
414 416
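
The dlmast.c change above keeps the lock cookie in wire (big-endian) byte order for the whole handler, so comparisons against the stored lock->ml.cookie need no byte swap per list element; conversion to host order happens only at the print sites. A sketch of that convention, assuming glibc's <endian.h> for htobe64()/be64toh():

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct lock { uint64_t cookie_be; };  /* stored big-endian, like ml.cookie */

static int cookie_matches(const struct lock *l, uint64_t wire_cookie_be)
{
	/* both sides keep wire byte order: plain equality, no swap */
	return l->cookie_be == wire_cookie_be;
}

int main(void)
{
	struct lock l = { .cookie_be = htobe64(0x0000000700000001ULL) };
	uint64_t from_msg = htobe64(0x0000000700000001ULL);

	if (cookie_matches(&l, from_msg))
		printf("cookie=%llu (host order, for display only)\n",
		       (unsigned long long)be64toh(from_msg));
	return 0;
}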
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a49..bb53714813ab 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
140 unsigned int purge_count; 140 unsigned int purge_count;
141 spinlock_t spinlock; 141 spinlock_t spinlock;
142 spinlock_t ast_lock; 142 spinlock_t ast_lock;
143 spinlock_t track_lock;
143 char *name; 144 char *name;
144 u8 node_num; 145 u8 node_num;
145 u32 key; 146 u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
316 * put on a list for the dlm thread to run. */ 317 * put on a list for the dlm thread to run. */
317 unsigned long last_used; 318 unsigned long last_used;
318 319
320 struct dlm_ctxt *dlm;
321
319 unsigned migration_pending:1; 322 unsigned migration_pending:1;
320 atomic_t asts_reserved; 323 atomic_t asts_reserved;
321 spinlock_t spinlock; 324 spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175d..b32f60a5acfb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
630{ 630{
631 struct debug_lockres *dl = m->private; 631 struct debug_lockres *dl = m->private;
632 struct dlm_ctxt *dlm = dl->dl_ctxt; 632 struct dlm_ctxt *dlm = dl->dl_ctxt;
633 struct dlm_lock_resource *oldres = dl->dl_res;
633 struct dlm_lock_resource *res = NULL; 634 struct dlm_lock_resource *res = NULL;
635 struct list_head *track_list;
634 636
635 spin_lock(&dlm->spinlock); 637 spin_lock(&dlm->track_lock);
638 if (oldres)
639 track_list = &oldres->tracking;
640 else
641 track_list = &dlm->tracking_list;
636 642
637 if (dl->dl_res) { 643 list_for_each_entry(res, track_list, tracking) {
638 list_for_each_entry(res, &dl->dl_res->tracking, tracking) { 644 if (&res->tracking == &dlm->tracking_list)
639 if (dl->dl_res) { 645 res = NULL;
640 dlm_lockres_put(dl->dl_res); 646 else
641 dl->dl_res = NULL;
642 }
643 if (&res->tracking == &dlm->tracking_list) {
644 mlog(0, "End of list found, %p\n", res);
645 dl = NULL;
646 break;
647 }
648 dlm_lockres_get(res); 647 dlm_lockres_get(res);
649 dl->dl_res = res; 648 break;
650 break;
651 }
652 } else {
653 if (!list_empty(&dlm->tracking_list)) {
654 list_for_each_entry(res, &dlm->tracking_list, tracking)
655 break;
656 dlm_lockres_get(res);
657 dl->dl_res = res;
658 } else
659 dl = NULL;
660 } 649 }
650 spin_unlock(&dlm->track_lock);
661 651
662 if (dl) { 652 if (oldres)
663 spin_lock(&dl->dl_res->spinlock); 653 dlm_lockres_put(oldres);
664 dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
665 spin_unlock(&dl->dl_res->spinlock);
666 }
667 654
668 spin_unlock(&dlm->spinlock); 655 dl->dl_res = res;
656
657 if (res) {
658 spin_lock(&res->spinlock);
659 dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
660 spin_unlock(&res->spinlock);
661 } else
662 dl = NULL;
669 663
664 /* passed to seq_show */
670 return dl; 665 return dl;
671} 666}
672 667
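
The lockres_seq_start() rework above boils down to one discipline: walk the tracking list under its own dedicated lock (the new track_lock), take a reference on the element you intend to return, then drop the lock before doing anything slow with it. A sketch with a pthread mutex and a hand-rolled refcount standing in for the spinlock and kref:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct res {
	int refs;
	struct res *next;         /* singly linked tracking list */
	const char *name;
};

static pthread_mutex_t track_lock = PTHREAD_MUTEX_INITIALIZER;
static struct res *tracking_list;

static void res_get(struct res *r) { r->refs++; }  /* under track_lock */
static void res_put(struct res *r) { if (--r->refs == 0) free(r); }

/* return the first tracked res with a reference held, or NULL */
static struct res *first_tracked(void)
{
	struct res *r;

	pthread_mutex_lock(&track_lock);
	r = tracking_list;
	if (r)
		res_get(r);        /* pin it before the lock is dropped */
	pthread_mutex_unlock(&track_lock);
	return r;
}

int main(void)
{
	struct res node = { .refs = 1, .next = NULL, .name = "lockres0" };

	tracking_list = &node;
	struct res *cur = first_tracked();
	if (cur) {
		printf("dumping %s\n", cur->name);
		res_put(cur);      /* refs 2 -> 1; the list still owns it */
	}
	return 0;
}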
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e8..d8d578f45613 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 spin_lock_init(&dlm->spinlock); 1550 spin_lock_init(&dlm->spinlock);
1551 spin_lock_init(&dlm->master_lock); 1551 spin_lock_init(&dlm->master_lock);
1552 spin_lock_init(&dlm->ast_lock); 1552 spin_lock_init(&dlm->ast_lock);
1553 spin_lock_init(&dlm->track_lock);
1553 INIT_LIST_HEAD(&dlm->list); 1554 INIT_LIST_HEAD(&dlm->list);
1554 INIT_LIST_HEAD(&dlm->dirty_list); 1555 INIT_LIST_HEAD(&dlm->dirty_list);
1555 INIT_LIST_HEAD(&dlm->reco.resources); 1556 INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d54020..1c9efb406a96 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
341 inode->i_mode = mode; 341 inode->i_mode = mode;
342 inode->i_uid = current_fsuid(); 342 inode->i_uid = current_fsuid();
343 inode->i_gid = current_fsgid(); 343 inode->i_gid = current_fsgid();
344 inode->i_blocks = 0;
345 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 344 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
346 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 345 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
347 inc_nlink(inode); 346 inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
367 inode->i_mode = mode; 366 inode->i_mode = mode;
368 inode->i_uid = current_fsuid(); 367 inode->i_uid = current_fsuid();
369 inode->i_gid = current_fsgid(); 368 inode->i_gid = current_fsgid();
370 inode->i_blocks = 0;
371 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 369 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
372 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 370 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
373 371
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf3683..54e182a27caf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
505static void dlm_lockres_release(struct kref *kref) 505static void dlm_lockres_release(struct kref *kref)
506{ 506{
507 struct dlm_lock_resource *res; 507 struct dlm_lock_resource *res;
508 struct dlm_ctxt *dlm;
508 509
509 res = container_of(kref, struct dlm_lock_resource, refs); 510 res = container_of(kref, struct dlm_lock_resource, refs);
511 dlm = res->dlm;
510 512
511 /* This should not happen -- all lockres' have a name 513 /* This should not happen -- all lockres' have a name
512 * associated with them at init time. */ 514 * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
515 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 517 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
516 res->lockname.name); 518 res->lockname.name);
517 519
520 spin_lock(&dlm->track_lock);
518 if (!list_empty(&res->tracking)) 521 if (!list_empty(&res->tracking))
519 list_del_init(&res->tracking); 522 list_del_init(&res->tracking);
520 else { 523 else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
522 res->lockname.len, res->lockname.name); 525 res->lockname.len, res->lockname.name);
523 dlm_print_one_lock_resource(res); 526 dlm_print_one_lock_resource(res);
524 } 527 }
528 spin_unlock(&dlm->track_lock);
529
530 dlm_put(dlm);
525 531
526 if (!hlist_unhashed(&res->hash_node) || 532 if (!hlist_unhashed(&res->hash_node) ||
527 !list_empty(&res->granted) || 533 !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
595 res->migration_pending = 0; 601 res->migration_pending = 0;
596 res->inflight_locks = 0; 602 res->inflight_locks = 0;
597 603
604 /* put in dlm_lockres_release */
605 dlm_grab(dlm);
606 res->dlm = dlm;
607
598 kref_init(&res->refs); 608 kref_init(&res->refs);
599 609
600 /* just for consistency */ 610 /* just for consistency */
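
The res->dlm back-pointer added above only works because each resource pins its parent context at init time (the dlm_grab() call) so the release path can still take dlm->track_lock to unlink itself, then drop the pin with dlm_put(). A sketch of that lifetime rule with plain counters in place of kref and the spinlock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ctxt {
	int refs;
	pthread_mutex_t track_lock;
};

struct res {
	int refs;
	struct ctxt *owner;       /* pinned for the lifetime of res */
};

static void ctxt_put(struct ctxt *c) { if (--c->refs == 0) free(c); }

static struct res *res_new(struct ctxt *c)
{
	struct res *r = malloc(sizeof(*r));

	r->refs = 1;
	c->refs++;                /* the "put in release" pin */
	r->owner = c;
	return r;
}

static void res_release(struct res *r)
{
	struct ctxt *c = r->owner;

	pthread_mutex_lock(&c->track_lock);
	/* unlink from c's tracking list here */
	pthread_mutex_unlock(&c->track_lock);

	free(r);
	ctxt_put(c);              /* drop the pin taken in res_new() */
}

static void res_put(struct res *r) { if (--r->refs == 0) res_release(r); }

int main(void)
{
	struct ctxt *c = malloc(sizeof(*c));

	c->refs = 1;
	pthread_mutex_init(&c->track_lock, NULL);

	struct res *r = res_new(c);
	res_put(r);               /* releases r, then drops its ctxt pin */
	ctxt_put(c);              /* caller's own reference */
	printf("done\n");
	return 0;
}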
@@ -722,14 +732,21 @@ lookup:
722 if (tmpres) { 732 if (tmpres) {
723 int dropping_ref = 0; 733 int dropping_ref = 0;
724 734
735 spin_unlock(&dlm->spinlock);
736
725 spin_lock(&tmpres->spinlock); 737 spin_lock(&tmpres->spinlock);
738 /* We wait for the other thread that is mastering the resource */
739 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
740 __dlm_wait_on_lockres(tmpres);
741 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
742 }
743
726 if (tmpres->owner == dlm->node_num) { 744 if (tmpres->owner == dlm->node_num) {
727 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 745 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
728 dlm_lockres_grab_inflight_ref(dlm, tmpres); 746 dlm_lockres_grab_inflight_ref(dlm, tmpres);
729 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 747 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
730 dropping_ref = 1; 748 dropping_ref = 1;
731 spin_unlock(&tmpres->spinlock); 749 spin_unlock(&tmpres->spinlock);
732 spin_unlock(&dlm->spinlock);
733 750
734 /* wait until done messaging the master, drop our ref to allow 751 /* wait until done messaging the master, drop our ref to allow
735 * the lockres to be purged, start over. */ 752 * the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2949 struct dlm_node_iter *iter) 2966 struct dlm_node_iter *iter)
2950{ 2967{
2951 struct dlm_migrate_request migrate; 2968 struct dlm_migrate_request migrate;
2952 int ret, status = 0; 2969 int ret, skip, status = 0;
2953 int nodenum; 2970 int nodenum;
2954 2971
2955 memset(&migrate, 0, sizeof(migrate)); 2972 memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2966 nodenum == new_master) 2983 nodenum == new_master)
2967 continue; 2984 continue;
2968 2985
2986 /* We could race exit domain. If exited, skip. */
2987 spin_lock(&dlm->spinlock);
2988 skip = (!test_bit(nodenum, dlm->domain_map));
2989 spin_unlock(&dlm->spinlock);
2990 if (skip) {
2991 clear_bit(nodenum, iter->node_map);
2992 continue;
2993 }
2994
2969 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, 2995 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2970 &migrate, sizeof(migrate), nodenum, 2996 &migrate, sizeof(migrate), nodenum,
2971 &status); 2997 &status);
2972 if (ret < 0) 2998 if (ret < 0) {
2973 mlog_errno(ret); 2999 mlog(0, "migrate_request returned %d!\n", ret);
2974 else if (status < 0) { 3000 if (!dlm_is_host_down(ret)) {
3001 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
3002 BUG();
3003 }
3004 clear_bit(nodenum, iter->node_map);
3005 ret = 0;
3006 } else if (status < 0) {
2975 mlog(0, "migrate request (node %u) returned %d!\n", 3007 mlog(0, "migrate request (node %u) returned %d!\n",
2976 nodenum, status); 3008 nodenum, status);
2977 ret = status; 3009 ret = status;
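
The dlm_do_migrate_request() hunk above closes a race with domain exit: before messaging a node, re-check the membership bitmap under the domain lock, and if the node already left (or the send fails with a host-down error), clear it from the iteration map and continue rather than failing the whole broadcast. A sketch with simplified bitmap helpers:

#include <pthread.h>
#include <stdio.h>

#define MAX_NODES 32

static pthread_mutex_t domain_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long domain_map = 0x5;   /* nodes 0 and 2 are alive */

static int node_in_domain(int node)
{
	int present;

	pthread_mutex_lock(&domain_lock);
	present = !!(domain_map & (1UL << node));
	pthread_mutex_unlock(&domain_lock);
	return present;
}

int main(void)
{
	unsigned long iter_map = 0x7;        /* nodes 0, 1, 2 to contact */

	for (int node = 0; node < MAX_NODES; node++) {
		if (!(iter_map & (1UL << node)))
			continue;
		if (!node_in_domain(node)) {
			iter_map &= ~(1UL << node);  /* raced exit: skip */
			printf("node %d left the domain, skipping\n", node);
			continue;
		}
		printf("sending migrate request to node %d\n", node);
	}
	return 0;
}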
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d1295203029f 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
181 181
182 spin_lock(&res->spinlock); 182 spin_lock(&res->spinlock);
183 /* This ensures that clear refmap is sent after the set */ 183 /* This ensures that clear refmap is sent after the set */
184 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 184 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
185 DLM_LOCK_RES_MIGRATING));
185 spin_unlock(&res->spinlock); 186 spin_unlock(&res->spinlock);
186 187
187 /* clear our bit from the master's refmap, ignore errors */ 188 /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f7..b0c4cadd4c45 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/time.h> 34#include <linux/time.h>
35#include <linux/quotaops.h>
35 36
36#define MLOG_MASK_PREFIX ML_DLM_GLUE 37#define MLOG_MASK_PREFIX ML_DLM_GLUE
37#include <cluster/masklog.h> 38#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
51#include "slot_map.h" 52#include "slot_map.h"
52#include "super.h" 53#include "super.h"
53#include "uptodate.h" 54#include "uptodate.h"
55#include "quota.h"
54 56
55#include "buffer_head_io.h" 57#include "buffer_head_io.h"
56 58
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
68static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
69static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
70static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); 72static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
73static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
71 74
72/* 75/*
73 * Return value from ->downconvert_worker functions. 76 * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
102static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 105static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
103 struct ocfs2_lock_res *lockres); 106 struct ocfs2_lock_res *lockres);
104 107
108static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
105 109
106#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) 110#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
107 111
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
111 unsigned int line, 115 unsigned int line,
112 struct ocfs2_lock_res *lockres) 116 struct ocfs2_lock_res *lockres)
113{ 117{
114 struct ocfs2_meta_lvb *lvb = 118 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
115 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
116 119
117 mlog(level, "LVB information for %s (called from %s:%u):\n", 120 mlog(level, "LVB information for %s (called from %s:%u):\n",
118 lockres->l_name, function, line); 121 lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
258 .flags = 0, 261 .flags = 0,
259}; 262};
260 263
264static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
265 .set_lvb = ocfs2_set_qinfo_lvb,
266 .get_osb = ocfs2_get_qinfo_osb,
267 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
268};
269
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 270static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 271{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 272 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
279 return (struct ocfs2_dentry_lock *)lockres->l_priv; 288 return (struct ocfs2_dentry_lock *)lockres->l_priv;
280} 289}
281 290
291static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
292{
293 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
294
295 return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
296}
297
282static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) 298static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
283{ 299{
284 if (lockres->l_ops->get_osb) 300 if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
507 return OCFS2_SB(inode->i_sb); 523 return OCFS2_SB(inode->i_sb);
508} 524}
509 525
526static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
527{
528 struct ocfs2_mem_dqinfo *info = lockres->l_priv;
529
530 return OCFS2_SB(info->dqi_gi.dqi_sb);
531}
532
510static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) 533static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
511{ 534{
512 struct ocfs2_file_private *fp = lockres->l_priv; 535 struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
609 lockres->l_flags |= OCFS2_LOCK_NOCACHE; 632 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
610} 633}
611 634
635void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
636 struct ocfs2_mem_dqinfo *info)
637{
638 ocfs2_lock_res_init_once(lockres);
639 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
640 0, lockres->l_name);
641 ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
642 OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
643 info);
644}
645
612void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 646void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
613{ 647{
614 mlog_entry_void(); 648 mlog_entry_void();
@@ -1290,7 +1324,7 @@ again:
1290 goto out; 1324 goto out;
1291 } 1325 }
1292 1326
1293 mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", 1327 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
1294 lockres->l_name); 1328 lockres->l_name);
1295 1329
1296 /* At this point we've gone inside the dlm and need to 1330 /* At this point we've gone inside the dlm and need to
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1829 1863
1830 mlog_entry_void(); 1864 mlog_entry_void();
1831 1865
1832 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); 1866 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1833 1867
1834 /* 1868 /*
1835 * Invalidate the LVB of a deleted inode - this way other 1869 * Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1881 1915
1882 mlog_meta_lvb(0, lockres); 1916 mlog_meta_lvb(0, lockres);
1883 1917
1884 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); 1918 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1885 1919
1886 /* We're safe here without the lockres lock... */ 1920 /* We're safe here without the lockres lock... */
1887 spin_lock(&oi->ip_lock); 1921 spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1916static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 1950static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1917 struct ocfs2_lock_res *lockres) 1951 struct ocfs2_lock_res *lockres)
1918{ 1952{
1919 struct ocfs2_meta_lvb *lvb = 1953 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1920 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1921 1954
1922 if (lvb->lvb_version == OCFS2_LVB_VERSION 1955 if (lvb->lvb_version == OCFS2_LVB_VERSION
1923 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 1956 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2024 } else { 2057 } else {
2025 /* Boo, we have to go to disk. */ 2058 /* Boo, we have to go to disk. */
2026 /* read bh, cast, ocfs2_refresh_inode */ 2059 /* read bh, cast, ocfs2_refresh_inode */
2027 status = ocfs2_read_block(inode, oi->ip_blkno, bh); 2060 status = ocfs2_read_inode_block(inode, bh);
2028 if (status < 0) { 2061 if (status < 0) {
2029 mlog_errno(status); 2062 mlog_errno(status);
2030 goto bail_refresh; 2063 goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2032 fe = (struct ocfs2_dinode *) (*bh)->b_data; 2065 fe = (struct ocfs2_dinode *) (*bh)->b_data;
2033 2066
2034 /* This is a good chance to make sure we're not 2067 /* This is a good chance to make sure we're not
2035 * locking an invalid object. 2068 * locking an invalid object. ocfs2_read_inode_block()
2069 * already checked that the inode block is sane.
2036 * 2070 *
2037 * We bug on a stale inode here because we checked 2071 * We bug on a stale inode here because we checked
2038 * above whether it was wiped from disk. The wiping 2072 * above whether it was wiped from disk. The wiping
2039 * node provides a guarantee that we receive that 2073 * node provides a guarantee that we receive that
2040 * message and can mark the inode before dropping any 2074 * message and can mark the inode before dropping any
2041 * locks associated with it. */ 2075 * locks associated with it. */
2042 if (!OCFS2_IS_VALID_DINODE(fe)) {
2043 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
2044 status = -EIO;
2045 goto bail_refresh;
2046 }
2047 mlog_bug_on_msg(inode->i_generation != 2076 mlog_bug_on_msg(inode->i_generation !=
2048 le32_to_cpu(fe->i_generation), 2077 le32_to_cpu(fe->i_generation),
2049 "Invalid dinode %llu disk generation: %u " 2078 "Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
2085 return 0; 2114 return 0;
2086 } 2115 }
2087 2116
2088 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); 2117 status = ocfs2_read_inode_block(inode, ret_bh);
2089 if (status < 0) 2118 if (status < 0)
2090 mlog_errno(status); 2119 mlog_errno(status);
2091 2120
@@ -2922,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2922 ocfs2_dlm_dump_lksb(&lockres->l_lksb); 2951 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
2923 BUG(); 2952 BUG();
2924 } 2953 }
2925 mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", 2954 mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
2926 lockres->l_name); 2955 lockres->l_name);
2927 2956
2928 ocfs2_wait_on_busy_lock(lockres); 2957 ocfs2_wait_on_busy_lock(lockres);
@@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3449 return UNBLOCK_CONTINUE_POST; 3478 return UNBLOCK_CONTINUE_POST;
3450} 3479}
3451 3480
3481static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3482{
3483 struct ocfs2_qinfo_lvb *lvb;
3484 struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
3485 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3486 oinfo->dqi_gi.dqi_type);
3487
3488 mlog_entry_void();
3489
3490 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3491 lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
3492 lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
3493 lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
3494 lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
3495 lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
3496 lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
3497 lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
3498
3499 mlog_exit_void();
3500}
3501
3502void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3503{
3504 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3505 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3506 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3507
3508 mlog_entry_void();
3509 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
3510 ocfs2_cluster_unlock(osb, lockres, level);
3511 mlog_exit_void();
3512}
3513
3514static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3515{
3516 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3517 oinfo->dqi_gi.dqi_type);
3518 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3519 struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3520 struct buffer_head *bh = NULL;
3521 struct ocfs2_global_disk_dqinfo *gdinfo;
3522 int status = 0;
3523
3524 if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
3525 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
3526 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
3527 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
3528 oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
3529 oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
3530 oinfo->dqi_gi.dqi_free_entry =
3531 be32_to_cpu(lvb->lvb_free_entry);
3532 } else {
3533 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
3534 if (status) {
3535 mlog_errno(status);
3536 goto bail;
3537 }
3538 gdinfo = (struct ocfs2_global_disk_dqinfo *)
3539 (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
3540 info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
3541 info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
3542 oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
3543 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
3544 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
3545 oinfo->dqi_gi.dqi_free_entry =
3546 le32_to_cpu(gdinfo->dqi_free_entry);
3547 brelse(bh);
3548 ocfs2_track_lock_refresh(lockres);
3549 }
3550
3551bail:
3552 return status;
3553}
3554
 3555/* Lock quota info. This function expects at least a shared lock on the quota
3556 * so that we can safely refresh quota info from disk. */
3557int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3558{
3559 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3560 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3561 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3562 int status = 0;
3563
3564 mlog_entry_void();
3565
3566 /* On RO devices, locking really isn't needed... */
3567 if (ocfs2_is_hard_readonly(osb)) {
3568 if (ex)
3569 status = -EROFS;
3570 goto bail;
3571 }
3572 if (ocfs2_mount_local(osb))
3573 goto bail;
3574
3575 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
3576 if (status < 0) {
3577 mlog_errno(status);
3578 goto bail;
3579 }
3580 if (!ocfs2_should_refresh_lock_res(lockres))
3581 goto bail;
3582 /* OK, we have the lock but we need to refresh the quota info */
3583 status = ocfs2_refresh_qinfo(oinfo);
3584 if (status)
3585 ocfs2_qinfo_unlock(oinfo, ex);
3586 ocfs2_complete_lock_res_refresh(lockres, status);
3587bail:
3588 mlog_exit(status);
3589 return status;
3590}
3591
3452/* 3592/*
3453 * This is the filesystem locking protocol. It provides the lock handling 3593 * This is the filesystem locking protocol. It provides the lock handling
3454 * hooks for the underlying DLM. It has a maximum version number. 3594 * hooks for the underlying DLM. It has a maximum version number.
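
The quota-info locking added above rides on the DLM's lock value block: a small fixed-size blob travels with the lock, and if its version byte matches, the cached grace times and allocation counters are trusted, otherwise the info is re-read from disk and the LVB repopulated (the struct itself is declared in the dlmglue.h hunk below). A sketch of that version-checked cache, mirroring the __be32 packing and assuming glibc's <endian.h>:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QINFO_LVB_VERSION 1

struct qinfo_lvb {
	uint8_t  version;
	uint8_t  reserved[3];
	uint32_t bgrace_be;
	uint32_t igrace_be;
};

struct qinfo { uint32_t bgrace, igrace; };

static void lvb_pack(struct qinfo_lvb *lvb, const struct qinfo *qi)
{
	memset(lvb, 0, sizeof(*lvb));
	lvb->version = QINFO_LVB_VERSION;
	lvb->bgrace_be = htobe32(qi->bgrace);
	lvb->igrace_be = htobe32(qi->igrace);
}

static void read_from_disk(struct qinfo *qi)
{
	qi->bgrace = 7 * 24 * 3600;          /* pretend on-disk values */
	qi->igrace = 7 * 24 * 3600;
}

static void refresh(struct qinfo *qi, struct qinfo_lvb *lvb)
{
	if (lvb->version == QINFO_LVB_VERSION) {
		qi->bgrace = be32toh(lvb->bgrace_be);   /* trust the cache */
		qi->igrace = be32toh(lvb->igrace_be);
	} else {
		read_from_disk(qi);                     /* slow path */
		lvb_pack(lvb, qi);                      /* repopulate */
	}
}

int main(void)
{
	struct qinfo qi = {0};
	struct qinfo_lvb lvb = {0};          /* stale: version 0 */

	refresh(&qi, &lvb);                  /* falls back to disk */
	printf("bgrace=%u via disk\n", qi.bgrace);
	refresh(&qi, &lvb);                  /* now served from the LVB */
	printf("bgrace=%u via lvb\n", qi.bgrace);
	return 0;
}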
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b1..3f8d9986b8e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52#define OCFS2_QINFO_LVB_VERSION 1
53
54struct ocfs2_qinfo_lvb {
55 __u8 lvb_version;
56 __u8 lvb_reserved[3];
57 __be32 lvb_bgrace;
58 __be32 lvb_igrace;
59 __be32 lvb_syncms;
60 __be32 lvb_blocks;
61 __be32 lvb_free_blk;
62 __be32 lvb_free_entry;
63};
64
52/* ocfs2_inode_lock_full() 'arg_flags' flags */ 65/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 66/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 67#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
69struct ocfs2_file_private; 82struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, 83void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp); 84 struct ocfs2_file_private *fp);
85struct ocfs2_mem_dqinfo;
86void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
87 struct ocfs2_mem_dqinfo *info);
72void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 88void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
73int ocfs2_create_new_inode_locks(struct inode *inode); 89int ocfs2_create_new_inode_locks(struct inode *inode);
74int ocfs2_drop_inode_locks(struct inode *inode); 90int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
103void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 119void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock); 120int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file); 121void ocfs2_file_unlock(struct file *file);
122int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
123void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
124
106 125
107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 126void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 127void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac58234..f2bb1a04d253 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
293 struct ocfs2_extent_block *eb; 293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el; 294 struct ocfs2_extent_list *el;
295 295
296 ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); 296 ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
297 if (ret) { 297 if (ret) {
298 mlog_errno(ret); 298 mlog_errno(ret);
299 goto out; 299 goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
302 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 302 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
303 el = &eb->h_list; 303 el = &eb->h_list;
304 304
305 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
306 ret = -EROFS;
307 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
308 goto out;
309 }
310
311 if (el->l_tree_depth) { 305 if (el->l_tree_depth) {
312 ocfs2_error(inode->i_sb, 306 ocfs2_error(inode->i_sb,
313 "Inode %lu has non zero tree depth in " 307 "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
381 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 375 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
382 goto no_more_extents; 376 goto no_more_extents;
383 377
384 ret = ocfs2_read_block(inode, 378 ret = ocfs2_read_extent_block(inode,
385 le64_to_cpu(eb->h_next_leaf_blk), 379 le64_to_cpu(eb->h_next_leaf_blk),
386 &next_eb_bh); 380 &next_eb_bh);
387 if (ret) { 381 if (ret) {
388 mlog_errno(ret); 382 mlog_errno(ret);
389 goto out; 383 goto out;
390 } 384 }
391 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
392
393 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
394 ret = -EROFS;
395 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
396 goto out;
397 }
398 385
386 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
399 el = &next_eb->h_list; 387 el = &next_eb->h_list;
400
401 i = ocfs2_search_for_hole_index(el, v_cluster); 388 i = ocfs2_search_for_hole_index(el, v_cluster);
402 } 389 }
403 390
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
630 if (ret == 0) 617 if (ret == 0)
631 goto out; 618 goto out;
632 619
633 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); 620 ret = ocfs2_read_inode_block(inode, &di_bh);
634 if (ret) { 621 if (ret) {
635 mlog_errno(ret); 622 mlog_errno(ret);
636 goto out; 623 goto out;
@@ -819,3 +806,74 @@ out:
819 806
820 return ret; 807 return ret;
821} 808}
809
810int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
811 struct buffer_head *bhs[], int flags,
812 int (*validate)(struct super_block *sb,
813 struct buffer_head *bh))
814{
815 int rc = 0;
816 u64 p_block, p_count;
817 int i, count, done = 0;
818
819 mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
820 "flags = %x, validate = %p)\n",
821 inode, (unsigned long long)v_block, nr, bhs, flags,
822 validate);
823
824 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
825 i_size_read(inode)) {
826 BUG_ON(!(flags & OCFS2_BH_READAHEAD));
827 goto out;
828 }
829
830 while (done < nr) {
831 down_read(&OCFS2_I(inode)->ip_alloc_sem);
832 rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
833 &p_block, &p_count, NULL);
834 up_read(&OCFS2_I(inode)->ip_alloc_sem);
835 if (rc) {
836 mlog_errno(rc);
837 break;
838 }
839
840 if (!p_block) {
841 rc = -EIO;
842 mlog(ML_ERROR,
843 "Inode #%llu contains a hole at offset %llu\n",
844 (unsigned long long)OCFS2_I(inode)->ip_blkno,
845 (unsigned long long)(v_block + done) <<
846 inode->i_sb->s_blocksize_bits);
847 break;
848 }
849
850 count = nr - done;
851 if (p_count < count)
852 count = p_count;
853
854 /*
855 * If the caller passed us bhs, they should have come
856 * from a previous readahead call to this function. Thus,
857 * they should have the right b_blocknr.
858 */
859 for (i = 0; i < count; i++) {
860 if (!bhs[done + i])
861 continue;
862 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
863 }
864
865 rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
866 flags, validate);
867 if (rc) {
868 mlog_errno(rc);
869 break;
870 }
871 done += count;
872 }
873
874out:
875 mlog_exit(rc);
876 return rc;
877}
878
879
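
The new ocfs2_read_virt_blocks() above has a simple loop shape: resolve the virtual block at the current cursor to a physical run, clamp the run to what is still wanted, read that chunk, advance. A sketch of that shape; the mapping function here is a toy where the real code walks the extent map:

#include <stdio.h>
#include <stdint.h>

/* toy mapping: virtual block v lives at physical 1000+v, in runs of 4 */
static int map_block(uint64_t v, uint64_t *p, uint64_t *count)
{
	*p = 1000 + v;
	*count = 4 - (v % 4);     /* contiguous blocks left in this run */
	return 0;
}

static int read_blocks(uint64_t p, int count)
{
	printf("reading %d block(s) at physical %llu\n",
	       count, (unsigned long long)p);
	return 0;
}

static int read_virt_blocks(uint64_t v_block, int nr)
{
	int done = 0, rc = 0;

	while (done < nr) {
		uint64_t p_block, p_count;

		rc = map_block(v_block + done, &p_block, &p_count);
		if (rc)
			break;

		int count = nr - done;
		if ((uint64_t)count > p_count)
			count = (int)p_count;   /* clamp to this run */

		rc = read_blocks(p_block, count);
		if (rc)
			break;
		done += count;
	}
	return rc;
}

int main(void)
{
	return read_virt_blocks(2, 7);   /* spans three physical runs */
}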
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f34..b7dd9731b462 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el); 58 struct ocfs2_extent_list *el);
59 59
60int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
61 struct buffer_head *bhs[], int flags,
62 int (*validate)(struct super_block *sb,
63 struct buffer_head *bh));
64static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
65 struct buffer_head **bh,
66 int (*validate)(struct super_block *sb,
67 struct buffer_head *bh))
68{
69 int status = 0;
70
71 if (bh == NULL) {
72 printk("ocfs2: bh == NULL\n");
73 status = -EINVAL;
74 goto bail;
75 }
76
77 status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
78
79bail:
80 return status;
81}
82
83
60#endif /* _EXTENT_MAP_H */ 84#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b2..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h>
38 39
39#define MLOG_MASK_PREFIX ML_INODE 40#define MLOG_MASK_PREFIX ML_INODE
40#include <cluster/masklog.h> 41#include <cluster/masklog.h>
@@ -56,6 +57,8 @@
56#include "suballoc.h" 57#include "suballoc.h"
57#include "super.h" 58#include "super.h"
58#include "xattr.h" 59#include "xattr.h"
60#include "acl.h"
61#include "quota.h"
59 62
60#include "buffer_head_io.h" 63#include "buffer_head_io.h"
61 64
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
253 goto out; 256 goto out;
254 } 257 }
255 258
256 ret = ocfs2_journal_access(handle, inode, bh, 259 ret = ocfs2_journal_access_di(handle, inode, bh,
257 OCFS2_JOURNAL_ACCESS_WRITE); 260 OCFS2_JOURNAL_ACCESS_WRITE);
258 if (ret) { 261 if (ret) {
259 mlog_errno(ret); 262 mlog_errno(ret);
260 goto out_commit; 263 goto out_commit;
@@ -303,9 +306,9 @@ bail:
303 return status; 306 return status;
304} 307}
305 308
306static int ocfs2_simple_size_update(struct inode *inode, 309int ocfs2_simple_size_update(struct inode *inode,
307 struct buffer_head *di_bh, 310 struct buffer_head *di_bh,
308 u64 new_i_size) 311 u64 new_i_size)
309{ 312{
310 int ret; 313 int ret;
311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
350 goto out; 353 goto out;
351 } 354 }
352 355
353 status = ocfs2_journal_access(handle, inode, fe_bh, 356 status = ocfs2_journal_access_di(handle, inode, fe_bh,
354 OCFS2_JOURNAL_ACCESS_WRITE); 357 OCFS2_JOURNAL_ACCESS_WRITE);
355 if (status < 0) { 358 if (status < 0) {
356 mlog_errno(status); 359 mlog_errno(status);
357 goto out_commit; 360 goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
401 (unsigned long long)OCFS2_I(inode)->ip_blkno, 404 (unsigned long long)OCFS2_I(inode)->ip_blkno,
402 (unsigned long long)new_i_size); 405 (unsigned long long)new_i_size);
403 406
407 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
408 * already validated it */
404 fe = (struct ocfs2_dinode *) di_bh->b_data; 409 fe = (struct ocfs2_dinode *) di_bh->b_data;
405 if (!OCFS2_IS_VALID_DINODE(fe)) {
406 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
407 status = -EIO;
408 goto bail;
409 }
410 410
411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
412 "Inode %llu, inode i_size = %lld != di " 412 "Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
536 enum ocfs2_alloc_restarted why; 536 enum ocfs2_alloc_restarted why;
537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
538 struct ocfs2_extent_tree et; 538 struct ocfs2_extent_tree et;
539 int did_quota = 0;
539 540
540 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 541 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
541 542
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
545 */ 546 */
546 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 547 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
547 548
548 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 549 status = ocfs2_read_inode_block(inode, &bh);
549 if (status < 0) { 550 if (status < 0) {
550 mlog_errno(status); 551 mlog_errno(status);
551 goto leave; 552 goto leave;
552 } 553 }
553
554 fe = (struct ocfs2_dinode *) bh->b_data; 554 fe = (struct ocfs2_dinode *) bh->b_data;
555 if (!OCFS2_IS_VALID_DINODE(fe)) {
556 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
557 status = -EIO;
558 goto leave;
559 }
560 555
561restart_all: 556restart_all:
562 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 557 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
585 } 580 }
586 581
587restarted_transaction: 582restarted_transaction:
583 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
584 clusters_to_add))) {
585 status = -EDQUOT;
586 goto leave;
587 }
588 did_quota = 1;
589
 588 /* reserve a write to the file entry early on - that way if we 590 /* reserve a write to the file entry early on - that way if we
589 * run out of credits in the allocation path, we can still 591 * run out of credits in the allocation path, we can still
590 * update i_size. */ 592 * update i_size. */
591 status = ocfs2_journal_access(handle, inode, bh, 593 status = ocfs2_journal_access_di(handle, inode, bh,
592 OCFS2_JOURNAL_ACCESS_WRITE); 594 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) { 595 if (status < 0) {
594 mlog_errno(status); 596 mlog_errno(status);
595 goto leave; 597 goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
622 spin_lock(&OCFS2_I(inode)->ip_lock); 624 spin_lock(&OCFS2_I(inode)->ip_lock);
623 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 625 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
624 spin_unlock(&OCFS2_I(inode)->ip_lock); 626 spin_unlock(&OCFS2_I(inode)->ip_lock);
627 /* Release unused quota reservation */
628 vfs_dq_free_space(inode,
629 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
630 did_quota = 0;
625 631
626 if (why != RESTART_NONE && clusters_to_add) { 632 if (why != RESTART_NONE && clusters_to_add) {
627 if (why == RESTART_META) { 633 if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
654 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); 660 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
655 661
656leave: 662leave:
663 if (status < 0 && did_quota)
664 vfs_dq_free_space(inode,
665 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
657 if (handle) { 666 if (handle) {
658 ocfs2_commit_trans(osb, handle); 667 ocfs2_commit_trans(osb, handle);
659 handle = NULL; 668 handle = NULL;
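
A second quota detail appears in __ocfs2_extend_allocation() above: the full request is reserved up front, the allocator may satisfy only part of it before restarting the transaction, and the un-granted remainder is handed back before the loop re-reserves. A sketch of that trimming, with invented helpers:

#include <stdio.h>

static long reserved;

static void quota_reserve(long n)   { reserved += n; }
static void quota_unreserve(long n) { reserved -= n; }

/* pretend the allocator can hand out at most 3 clusters per pass */
static long allocate_some(long want) { return want < 3 ? want : 3; }

int main(void)
{
	long to_add = 8;

	while (to_add) {
		quota_reserve(to_add);   /* reserve what is still wanted */
		long got = allocate_some(to_add);
		to_add -= got;
		/* granted clusters keep their reservation; the rest
		 * goes back before the next pass re-reserves it */
		quota_unreserve(to_add);
		printf("got %ld, want %ld more, %ld reserved\n",
		       got, to_add, reserved);
	}
	return 0;
}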
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
885 struct ocfs2_super *osb = OCFS2_SB(sb); 894 struct ocfs2_super *osb = OCFS2_SB(sb);
886 struct buffer_head *bh = NULL; 895 struct buffer_head *bh = NULL;
887 handle_t *handle = NULL; 896 handle_t *handle = NULL;
897 int locked[MAXQUOTAS] = {0, 0};
898 int credits, qtype;
899 struct ocfs2_mem_dqinfo *oinfo;
888 900
889 mlog_entry("(0x%p, '%.*s')\n", dentry, 901 mlog_entry("(0x%p, '%.*s')\n", dentry,
890 dentry->d_name.len, dentry->d_name.name); 902 dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
955 } 967 }
956 } 968 }
957 969
958 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 970 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
959 if (IS_ERR(handle)) { 971 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
960 status = PTR_ERR(handle); 972 credits = OCFS2_INODE_UPDATE_CREDITS;
961 mlog_errno(status); 973 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
962 goto bail_unlock; 974 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
975 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
976 oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
977 status = ocfs2_lock_global_qf(oinfo, 1);
978 if (status < 0)
979 goto bail_unlock;
980 credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
981 ocfs2_calc_qdel_credits(sb, USRQUOTA);
982 locked[USRQUOTA] = 1;
983 }
984 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
985 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
986 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
987 oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
988 status = ocfs2_lock_global_qf(oinfo, 1);
989 if (status < 0)
990 goto bail_unlock;
991 credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
992 ocfs2_calc_qdel_credits(sb, GRPQUOTA);
993 locked[GRPQUOTA] = 1;
994 }
995 handle = ocfs2_start_trans(osb, credits);
996 if (IS_ERR(handle)) {
997 status = PTR_ERR(handle);
998 mlog_errno(status);
999 goto bail_unlock;
1000 }
1001 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
1002 if (status < 0)
1003 goto bail_commit;
1004 } else {
1005 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1006 if (IS_ERR(handle)) {
1007 status = PTR_ERR(handle);
1008 mlog_errno(status);
1009 goto bail_unlock;
1010 }
963 } 1011 }
964 1012
965 /* 1013 /*
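
The setattr hunk above sizes the transaction by summing credits: start from the base inode-update cost and add the quota-init plus quota-delete cost for each quota type the ownership change will actually touch, then open the handle with the total. A sketch of that accounting; the credit constants are invented, not the ocfs2 values:

#include <stdio.h>

#define INODE_UPDATE_CREDITS 3
#define QINIT_CREDITS        8
#define QDEL_CREDITS         8

struct attr { int change_uid, change_gid; };

static int setattr_credits(const struct attr *a,
			   int usrquota_on, int grpquota_on)
{
	int credits = INODE_UPDATE_CREDITS;

	if (a->change_uid && usrquota_on)
		credits += QINIT_CREDITS + QDEL_CREDITS;
	if (a->change_gid && grpquota_on)
		credits += QINIT_CREDITS + QDEL_CREDITS;
	return credits;
}

int main(void)
{
	struct attr a = { .change_uid = 1, .change_gid = 1 };

	printf("credits: %d\n", setattr_credits(&a, 1, 0));
	return 0;
}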
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
982bail_commit: 1030bail_commit:
983 ocfs2_commit_trans(osb, handle); 1031 ocfs2_commit_trans(osb, handle);
984bail_unlock: 1032bail_unlock:
1033 for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1034 if (!locked[qtype])
1035 continue;
1036 oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
1037 ocfs2_unlock_global_qf(oinfo, 1);
1038 }
985 ocfs2_inode_unlock(inode, 1); 1039 ocfs2_inode_unlock(inode, 1);
986bail_unlock_rw: 1040bail_unlock_rw:
987 if (size_change) 1041 if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
989bail: 1043bail:
990 brelse(bh); 1044 brelse(bh);
991 1045
1046 if (!status && attr->ia_valid & ATTR_MODE) {
1047 status = ocfs2_acl_chmod(inode);
1048 if (status < 0)
1049 mlog_errno(status);
1050 }
1051
992 mlog_exit(status); 1052 mlog_exit(status);
993 return status; 1053 return status;
994} 1054}
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
1035 goto out; 1095 goto out;
1036 } 1096 }
1037 1097
1038 ret = generic_permission(inode, mask, NULL); 1098 ret = generic_permission(inode, mask, ocfs2_check_acl);
1039 1099
1040 ocfs2_inode_unlock(inode, 0); 1100 ocfs2_inode_unlock(inode, 0);
1041out: 1101out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1061 goto out; 1121 goto out;
1062 } 1122 }
1063 1123
1064 ret = ocfs2_journal_access(handle, inode, bh, 1124 ret = ocfs2_journal_access_di(handle, inode, bh,
1065 OCFS2_JOURNAL_ACCESS_WRITE); 1125 OCFS2_JOURNAL_ACCESS_WRITE);
1066 if (ret < 0) { 1126 if (ret < 0) {
1067 mlog_errno(ret); 1127 mlog_errno(ret);
1068 goto out_trans; 1128 goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1128{ 1188{
1129 int ret; 1189 int ret;
1130 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
1131 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1132 1191
1133 ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); 1192 ret = ocfs2_read_inode_block(inode, &bh);
1134 if (ret < 0) { 1193 if (ret < 0) {
1135 mlog_errno(ret); 1194 mlog_errno(ret);
1136 goto out; 1195 goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1156 struct buffer_head *di_bh = NULL; 1215 struct buffer_head *di_bh = NULL;
1157 1216
1158 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1217 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1159 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, 1218 ret = ocfs2_read_inode_block(inode, &di_bh);
1160 &di_bh);
1161 if (ret) { 1219 if (ret) {
1162 mlog_errno(ret); 1220 mlog_errno(ret);
1163 goto out; 1221 goto out;
@@ -1226,83 +1284,6 @@ out:
1226 return ret; 1284 return ret;
1227} 1285}
1228 1286
1229static int __ocfs2_remove_inode_range(struct inode *inode,
1230 struct buffer_head *di_bh,
1231 u32 cpos, u32 phys_cpos, u32 len,
1232 struct ocfs2_cached_dealloc_ctxt *dealloc)
1233{
1234 int ret;
1235 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
1236 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1237 struct inode *tl_inode = osb->osb_tl_inode;
1238 handle_t *handle;
1239 struct ocfs2_alloc_context *meta_ac = NULL;
1240 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1241 struct ocfs2_extent_tree et;
1242
1243 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1244
1245 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
1246 if (ret) {
1247 mlog_errno(ret);
1248 return ret;
1249 }
1250
1251 mutex_lock(&tl_inode->i_mutex);
1252
1253 if (ocfs2_truncate_log_needs_flush(osb)) {
1254 ret = __ocfs2_flush_truncate_log(osb);
1255 if (ret < 0) {
1256 mlog_errno(ret);
1257 goto out;
1258 }
1259 }
1260
1261 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
1262 if (IS_ERR(handle)) {
1263 ret = PTR_ERR(handle);
1264 mlog_errno(ret);
1265 goto out;
1266 }
1267
1268 ret = ocfs2_journal_access(handle, inode, di_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (ret) {
1271 mlog_errno(ret);
1272 goto out;
1273 }
1274
1275 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
1276 dealloc);
1277 if (ret) {
1278 mlog_errno(ret);
1279 goto out_commit;
1280 }
1281
1282 OCFS2_I(inode)->ip_clusters -= len;
1283 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1284
1285 ret = ocfs2_journal_dirty(handle, di_bh);
1286 if (ret) {
1287 mlog_errno(ret);
1288 goto out_commit;
1289 }
1290
1291 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
1292 if (ret)
1293 mlog_errno(ret);
1294
1295out_commit:
1296 ocfs2_commit_trans(osb, handle);
1297out:
1298 mutex_unlock(&tl_inode->i_mutex);
1299
1300 if (meta_ac)
1301 ocfs2_free_alloc_context(meta_ac);
1302
1303 return ret;
1304}
1305
1306/* 1287/*
1307 * Truncate a byte range, avoiding pages within partial clusters. This 1288 * Truncate a byte range, avoiding pages within partial clusters. This
1308 * preserves those pages for the zeroing code to write to. 1289 * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1402 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1383 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1403 struct ocfs2_cached_dealloc_ctxt dealloc; 1384 struct ocfs2_cached_dealloc_ctxt dealloc;
1404 struct address_space *mapping = inode->i_mapping; 1385 struct address_space *mapping = inode->i_mapping;
1386 struct ocfs2_extent_tree et;
1405 1387
1388 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1406 ocfs2_init_dealloc_ctxt(&dealloc); 1389 ocfs2_init_dealloc_ctxt(&dealloc);
1407 1390
1408 if (byte_len == 0) 1391 if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1458 1441
1459 /* Only do work for non-holes */ 1442 /* Only do work for non-holes */
1460 if (phys_cpos != 0) { 1443 if (phys_cpos != 0) {
1461 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, 1444 ret = ocfs2_remove_btree_range(inode, &et, cpos,
1462 phys_cpos, alloc_size, 1445 phys_cpos, alloc_size,
1463 &dealloc); 1446 &dealloc);
1464 if (ret) { 1447 if (ret) {
1465 mlog_errno(ret); 1448 mlog_errno(ret);
1466 goto out; 1449 goto out;
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1622 struct ocfs2_space_resv *sr) 1605 struct ocfs2_space_resv *sr)
1623{ 1606{
1624 struct inode *inode = file->f_path.dentry->d_inode; 1607 struct inode *inode = file->f_path.dentry->d_inode;
1625 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; 1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1626 1609
1627 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1628 !ocfs2_writes_unwritten_extents(osb)) 1611 !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5f..172f9fbc9fc7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
51 struct ocfs2_alloc_context *data_ac, 51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac, 52 struct ocfs2_alloc_context *meta_ac,
53 enum ocfs2_alloc_restarted *reason_ret); 53 enum ocfs2_alloc_restarted *reason_ret);
54int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh,
56 u64 new_i_size);
54int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 57int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
55 u64 zero_to); 58 u64 zero_to);
56int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 59int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d511874..229e707bc050 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/quotaops.h>
31 32
32#include <asm/byteorder.h> 33#include <asm/byteorder.h>
33 34
@@ -37,6 +38,7 @@
37#include "ocfs2.h" 38#include "ocfs2.h"
38 39
39#include "alloc.h" 40#include "alloc.h"
41#include "blockcheck.h"
40#include "dlmglue.h" 42#include "dlmglue.h"
41#include "extent_map.h" 43#include "extent_map.h"
42#include "file.h" 44#include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
214 return 0; 216 return 0;
215} 217}
216 218
217int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 219void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
218 int create_ino) 220 int create_ino)
219{ 221{
220 struct super_block *sb; 222 struct super_block *sb;
221 struct ocfs2_super *osb; 223 struct ocfs2_super *osb;
222 int status = -EINVAL;
223 int use_plocks = 1; 224 int use_plocks = 1;
224 225
225 mlog_entry("(0x%p, size:%llu)\n", inode, 226 mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
232 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) 233 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
233 use_plocks = 0; 234 use_plocks = 0;
234 235
235 /* this means that read_inode cannot create a superblock inode 236 /*
236 * today. change if needed. */ 237 * These have all been checked by ocfs2_read_inode_block() or set
237 if (!OCFS2_IS_VALID_DINODE(fe) || 238 * by ocfs2_mknod_locked(), so a failure is a code bug.
238 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 239 */
239 mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " 240 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode
240 "signature = %.*s, flags = 0x%x\n", 241 cannot create a superblock
241 inode->i_ino, 242 inode today. change if
242 (unsigned long long)le64_to_cpu(fe->i_blkno), 7, 243 that is needed. */
243 fe->i_signature, le32_to_cpu(fe->i_flags)); 244 BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
244 goto bail; 245 BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
245 }
246 246
247 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
248 mlog(ML_ERROR, "file entry generation does not match "
249 "superblock! osb->fs_generation=%x, "
250 "fe->i_fs_generation=%x\n",
251 osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
252 goto bail;
253 }
254 247
255 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 248 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
256 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 249 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
284 277
285 inode->i_nlink = le16_to_cpu(fe->i_links_count); 278 inode->i_nlink = le16_to_cpu(fe->i_links_count);
286 279
287 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
288 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
282 inode->i_flags |= S_NOQUOTA;
283 }
289 284
290 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 285 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
291 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 286 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
292 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); 287 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
293 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 288 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
294 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 289 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
290 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
291 inode->i_flags |= S_NOQUOTA;
295 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 292 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
296 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); 293 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
297 /* we can't actually hit this as read_inode can't 294 /* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
354 351
355 ocfs2_set_inode_flags(inode); 352 ocfs2_set_inode_flags(inode);
356 353
357 status = 0; 354 mlog_exit_void();
358bail:
359 mlog_exit(status);
360 return status;
361} 355}
362 356
363static int ocfs2_read_locked_inode(struct inode *inode, 357static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
460 } 454 }
461 } 455 }
462 456
463 if (can_lock) 457 if (can_lock) {
464 status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, 458 status = ocfs2_read_inode_block_full(inode, &bh,
465 OCFS2_BH_IGNORE_CACHE); 459 OCFS2_BH_IGNORE_CACHE);
466 else 460 } else {
467 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 461 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
462 if (!status)
463 status = ocfs2_validate_inode_block(osb->sb, bh);
464 }
468 if (status < 0) { 465 if (status < 0) {
469 mlog_errno(status); 466 mlog_errno(status);
470 goto bail; 467 goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
472 469
473 status = -EINVAL; 470 status = -EINVAL;
474 fe = (struct ocfs2_dinode *) bh->b_data; 471 fe = (struct ocfs2_dinode *) bh->b_data;
475 if (!OCFS2_IS_VALID_DINODE(fe)) {
476 mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
477 (unsigned long long)args->fi_blkno, 7,
478 fe->i_signature);
479 goto bail;
480 }
481 472
482 /* 473 /*
483 * This is a code bug. Right now the caller needs to 474 * This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
491 482
492 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 483 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
493 S_ISBLK(le16_to_cpu(fe->i_mode))) 484 S_ISBLK(le16_to_cpu(fe->i_mode)))
494 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 485 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
495 486
496 if (ocfs2_populate_inode(inode, fe, 0) < 0) 487 ocfs2_populate_inode(inode, fe, 0);
497 goto bail;
498 488
499 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 489 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
500 490
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
547 goto out; 537 goto out;
548 } 538 }
549 539
550 status = ocfs2_journal_access(handle, inode, fe_bh, 540 status = ocfs2_journal_access_di(handle, inode, fe_bh,
551 OCFS2_JOURNAL_ACCESS_WRITE); 541 OCFS2_JOURNAL_ACCESS_WRITE);
552 if (status < 0) { 542 if (status < 0) {
553 mlog_errno(status); 543 mlog_errno(status);
554 goto out; 544 goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
615 goto bail; 605 goto bail;
616 } 606 }
617 607
618 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); 608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb));
619 if (IS_ERR(handle)) { 610 if (IS_ERR(handle)) {
620 status = PTR_ERR(handle); 611 status = PTR_ERR(handle);
621 mlog_errno(status); 612 mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
630 } 621 }
631 622
632 /* set the inodes dtime */ 623 /* set the inodes dtime */
633 status = ocfs2_journal_access(handle, inode, di_bh, 624 status = ocfs2_journal_access_di(handle, inode, di_bh,
634 OCFS2_JOURNAL_ACCESS_WRITE); 625 OCFS2_JOURNAL_ACCESS_WRITE);
635 if (status < 0) { 626 if (status < 0) {
636 mlog_errno(status); 627 mlog_errno(status);
637 goto bail_commit; 628 goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
647 } 638 }
648 639
649 ocfs2_remove_from_cache(inode, di_bh); 640 ocfs2_remove_from_cache(inode, di_bh);
641 vfs_dq_free_inode(inode);
650 642
651 status = ocfs2_free_dinode(handle, inode_alloc_inode, 643 status = ocfs2_free_dinode(handle, inode_alloc_inode,
652 inode_alloc_bh, di); 644 inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
929 921
930 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 922 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
931 923
932 if (is_bad_inode(inode)) { 924 /* When we fail in read_inode() we mark the inode as bad. The second
925 * test catches the case when inode allocation fails before
926 * allocating a block for the inode. */
927 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
933 mlog(0, "Skipping delete of bad inode\n"); 928 mlog(0, "Skipping delete of bad inode\n");
934 goto bail; 929 goto bail;
935 } 930 }
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1195 mlog_entry("(inode %llu)\n", 1190 mlog_entry("(inode %llu)\n",
1196 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1191 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1197 1192
1198 status = ocfs2_journal_access(handle, inode, bh, 1193 status = ocfs2_journal_access_di(handle, inode, bh,
1199 OCFS2_JOURNAL_ACCESS_WRITE); 1194 OCFS2_JOURNAL_ACCESS_WRITE);
1200 if (status < 0) { 1195 if (status < 0) {
1201 mlog_errno(status); 1196 mlog_errno(status);
1202 goto leave; 1197 goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
1264 1259
1265 spin_unlock(&OCFS2_I(inode)->ip_lock); 1260 spin_unlock(&OCFS2_I(inode)->ip_lock);
1266} 1261}
1262
1263int ocfs2_validate_inode_block(struct super_block *sb,
1264 struct buffer_head *bh)
1265{
1266 int rc;
1267 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1268
1269 mlog(0, "Validating dinode %llu\n",
1270 (unsigned long long)bh->b_blocknr);
1271
1272 BUG_ON(!buffer_uptodate(bh));
1273
1274 /*
1275 * If the ecc fails, we return the error but otherwise
1276 * leave the filesystem running. We know any error is
1277 * local to this block.
1278 */
1279 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
1280 if (rc) {
1281 mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
1282 (unsigned long long)bh->b_blocknr);
1283 goto bail;
1284 }
1285
1286 /*
1287 * Errors after here are fatal.
1288 */
1289
1290 rc = -EINVAL;
1291
1292 if (!OCFS2_IS_VALID_DINODE(di)) {
1293 ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
1294 (unsigned long long)bh->b_blocknr, 7,
1295 di->i_signature);
1296 goto bail;
1297 }
1298
1299 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1300 ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
1301 (unsigned long long)bh->b_blocknr,
1302 (unsigned long long)le64_to_cpu(di->i_blkno));
1303 goto bail;
1304 }
1305
1306 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1307 ocfs2_error(sb,
1308 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
1309 (unsigned long long)bh->b_blocknr);
1310 goto bail;
1311 }
1312
1313 if (le32_to_cpu(di->i_fs_generation) !=
1314 OCFS2_SB(sb)->fs_generation) {
1315 ocfs2_error(sb,
1316 "Invalid dinode #%llu: fs_generation is %u\n",
1317 (unsigned long long)bh->b_blocknr,
1318 le32_to_cpu(di->i_fs_generation));
1319 goto bail;
1320 }
1321
1322 rc = 0;
1323
1324bail:
1325 return rc;
1326}
1327
1328int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
1329 int flags)
1330{
1331 int rc;
1332 struct buffer_head *tmp = *bh;
1333
1334 rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
1335 flags, ocfs2_validate_inode_block);
1336
1337 /* If ocfs2_read_blocks() got us a new bh, pass it up. */
1338 if (!rc && !*bh)
1339 *bh = tmp;
1340
1341 return rc;
1342}
1343
1344int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
1345{
1346 return ocfs2_read_inode_block_full(inode, bh, 0);
1347}
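Taken together, ocfs2_validate_inode_block() and the two read helpers replace the open-coded checks this diff deletes elsewhere: a read now either returns a buffer whose dinode has already passed the ecc, signature, i_blkno, VALID_FL and fs_generation tests, or it fails outright. Below is a standalone sketch of that validation chain with a toy dinode in place of struct ocfs2_dinode; the field names and the fixed "INODE01" signature are simplifications.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_dinode {
    char     sig[8];
    uint64_t blkno;
    uint32_t flags;
    uint32_t fs_gen;
};
#define VALID_FL 0x1u

/* Same fail-fast ordering as ocfs2_validate_inode_block(); in the real
 * function only the leading ecc failure is non-fatal. */
static int validate_dinode(const struct toy_dinode *di, uint64_t blocknr,
                           uint32_t fs_gen)
{
    if (strcmp(di->sig, "INODE01"))   /* OCFS2_IS_VALID_DINODE() */
        return -1;
    if (di->blkno != blocknr)         /* block claims another location */
        return -1;
    if (!(di->flags & VALID_FL))      /* never marked valid on disk */
        return -1;
    if (di->fs_gen != fs_gen)         /* stale block from an older mkfs */
        return -1;
    return 0;
}

int main(void)
{
    struct toy_dinode di = { "INODE01", 42, VALID_FL, 7 };

    printf("good dinode accepted : %d\n", validate_dinode(&di, 42, 7) == 0);
    printf("wrong generation seen: %d\n", validate_dinode(&di, 42, 8) != 0);
    return 0;
}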
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4a..eb3c302b38d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
128 int sysfile_type); 128 int sysfile_type);
129int ocfs2_inode_init_private(struct inode *inode); 129int ocfs2_inode_init_private(struct inode *inode);
130int ocfs2_inode_revalidate(struct dentry *dentry); 130int ocfs2_inode_revalidate(struct dentry *dentry);
131int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 131void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
132 int create_ino); 132 int create_ino);
133void ocfs2_read_inode(struct inode *inode); 133void ocfs2_read_inode(struct inode *inode);
134void ocfs2_read_inode2(struct inode *inode, void *opaque); 134void ocfs2_read_inode2(struct inode *inode, void *opaque);
135ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, 135ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
142 struct buffer_head *bh); 142 struct buffer_head *bh);
143int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); 143int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
144int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); 144int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
145struct buffer_head *ocfs2_bread(struct inode *inode,
146 int block, int *err, int reada);
145 147
146void ocfs2_set_inode_flags(struct inode *inode); 148void ocfs2_set_inode_flags(struct inode *inode);
147void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); 149void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
153 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); 155 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
154} 156}
155 157
158/* Validate that a bh contains a valid inode */
159int ocfs2_validate_inode_block(struct super_block *sb,
160 struct buffer_head *bh);
161/*
162 * Read an inode block into *bh. If *bh is NULL, a bh will be allocated.
163 * This is a cached read. The inode will be validated with
164 * ocfs2_validate_inode_block().
165 */
166int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
167/* The same, but can be passed OCFS2_BH_* flags */
168int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
169 int flags);
156#endif /* OCFS2_INODE_H */ 170#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3c..57d7d25a2b9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
35#include "ocfs2.h" 35#include "ocfs2.h"
36 36
37#include "alloc.h" 37#include "alloc.h"
38#include "blockcheck.h"
38#include "dir.h" 39#include "dir.h"
39#include "dlmglue.h" 40#include "dlmglue.h"
40#include "extent_map.h" 41#include "extent_map.h"
@@ -45,6 +46,7 @@
45#include "slot_map.h" 46#include "slot_map.h"
46#include "super.h" 47#include "super.h"
47#include "sysfile.h" 48#include "sysfile.h"
49#include "quota.h"
48 50
49#include "buffer_head_io.h" 51#include "buffer_head_io.h"
50 52
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
52 54
53static int ocfs2_force_read_journal(struct inode *inode); 55static int ocfs2_force_read_journal(struct inode *inode);
54static int ocfs2_recover_node(struct ocfs2_super *osb, 56static int ocfs2_recover_node(struct ocfs2_super *osb,
55 int node_num); 57 int node_num, int slot_num);
56static int __ocfs2_recovery_thread(void *arg); 58static int __ocfs2_recovery_thread(void *arg);
57static int ocfs2_commit_cache(struct ocfs2_super *osb); 59static int ocfs2_commit_cache(struct ocfs2_super *osb);
58static int ocfs2_wait_on_mount(struct ocfs2_super *osb); 60static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
59static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, 61static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
60 int dirty, int replayed); 62 int dirty, int replayed);
61static int ocfs2_trylock_journal(struct ocfs2_super *osb, 63static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
64 int slot); 66 int slot);
65static int ocfs2_commit_thread(void *arg); 67static int ocfs2_commit_thread(void *arg);
66 68
69static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
70{
71 return __ocfs2_wait_on_mount(osb, 0);
72}
73
74static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
75{
76 return __ocfs2_wait_on_mount(osb, 1);
77}
78
79
67 80
68/* 81/*
69 * The recovery_list is a simple linked list of node numbers to recover. 82 * The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
256 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); 269 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
257 BUG_ON(max_buffs <= 0); 270 BUG_ON(max_buffs <= 0);
258 271
259 /* JBD might support this, but our journalling code doesn't yet. */ 272 /* Nested transaction? Just return the handle... */
260 if (journal_current_handle()) { 273 if (journal_current_handle())
261 mlog(ML_ERROR, "Recursive transaction attempted!\n"); 274 return jbd2_journal_start(journal, max_buffs);
262 BUG();
263 }
264 275
265 down_read(&osb->journal->j_trans_barrier); 276 down_read(&osb->journal->j_trans_barrier);
266 277
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
285int ocfs2_commit_trans(struct ocfs2_super *osb, 296int ocfs2_commit_trans(struct ocfs2_super *osb,
286 handle_t *handle) 297 handle_t *handle)
287{ 298{
288 int ret; 299 int ret, nested;
289 struct ocfs2_journal *journal = osb->journal; 300 struct ocfs2_journal *journal = osb->journal;
290 301
291 BUG_ON(!handle); 302 BUG_ON(!handle);
292 303
304 nested = handle->h_ref > 1;
293 ret = jbd2_journal_stop(handle); 305 ret = jbd2_journal_stop(handle);
294 if (ret < 0) 306 if (ret < 0)
295 mlog_errno(ret); 307 mlog_errno(ret);
296 308
297 up_read(&journal->j_trans_barrier); 309 if (!nested)
310 up_read(&journal->j_trans_barrier);
298 311
299 return ret; 312 return ret;
300} 313}
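These two hunks change the transaction API contract: ocfs2_start_trans() now returns the already-running handle when one is open (jbd2_journal_start() bumps handle->h_ref in that case), and ocfs2_commit_trans() releases j_trans_barrier only when the outermost stop drops the last reference. A small userspace model of that refcounting, with a file-scope variable standing in for journal_current_handle():

#include <stdio.h>

struct handle { int h_ref; };

static struct handle *current_handle;  /* journal_current_handle() stand-in */
static int barrier_readers;            /* models j_trans_barrier readers */

static struct handle *start_trans(void)
{
    static struct handle h;

    if (current_handle) {           /* nested: reuse and bump the refcount */
        current_handle->h_ref++;
        return current_handle;
    }
    barrier_readers++;              /* down_read(&journal->j_trans_barrier) */
    h.h_ref = 1;
    current_handle = &h;
    return &h;
}

static void commit_trans(struct handle *h)
{
    int nested = h->h_ref > 1;      /* tested before jbd2_journal_stop() */

    h->h_ref--;
    if (!nested) {
        barrier_readers--;          /* up_read(&journal->j_trans_barrier) */
        current_handle = NULL;
    }
}

int main(void)
{
    struct handle *outer = start_trans();
    struct handle *inner = start_trans();   /* same handle, h_ref == 2 */

    printf("same handle: %d, barrier readers: %d\n",
           inner == outer, barrier_readers);
    commit_trans(inner);                    /* barrier still held */
    commit_trans(outer);                    /* now released */
    printf("barrier readers after outer commit: %d\n", barrier_readers);
    return 0;
}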
@@ -357,10 +370,137 @@ bail:
357 return status; 370 return status;
358} 371}
359 372
360int ocfs2_journal_access(handle_t *handle, 373struct ocfs2_triggers {
361 struct inode *inode, 374 struct jbd2_buffer_trigger_type ot_triggers;
362 struct buffer_head *bh, 375 int ot_offset;
363 int type) 376};
377
378static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
379{
380 return container_of(triggers, struct ocfs2_triggers, ot_triggers);
381}
382
383static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
384 struct buffer_head *bh,
385 void *data, size_t size)
386{
387 struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
388
389 /*
390 * We aren't guaranteed to have the superblock here, so we
391 * must unconditionally compute the ecc data.
392 * __ocfs2_journal_access() will only set the triggers if
393 * metaecc is enabled.
394 */
395 ocfs2_block_check_compute(data, size, data + ot->ot_offset);
396}
397
398/*
399 * Quota blocks have their own trigger because the struct ocfs2_block_check
400 * offset depends on the blocksize.
401 */
402static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
403 struct buffer_head *bh,
404 void *data, size_t size)
405{
406 struct ocfs2_disk_dqtrailer *dqt =
407 ocfs2_block_dqtrailer(size, data);
408
409 /*
410 * We aren't guaranteed to have the superblock here, so we
411 * must unconditionally compute the ecc data.
412 * __ocfs2_journal_access() will only set the triggers if
413 * metaecc is enabled.
414 */
415 ocfs2_block_check_compute(data, size, &dqt->dq_check);
416}
417
418/*
419 * Directory blocks also have their own trigger because the
420 * struct ocfs2_block_check offset depends on the blocksize.
421 */
422static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
423 struct buffer_head *bh,
424 void *data, size_t size)
425{
426 struct ocfs2_dir_block_trailer *trailer =
427 ocfs2_dir_trailer_from_size(size, data);
428
429 /*
430 * We aren't guaranteed to have the superblock here, so we
431 * must unconditionally compute the ecc data.
432 * __ocfs2_journal_access() will only set the triggers if
433 * metaecc is enabled.
434 */
435 ocfs2_block_check_compute(data, size, &trailer->db_check);
436}
437
438static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
439 struct buffer_head *bh)
440{
441 mlog(ML_ERROR,
442 "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
443 "bh->b_blocknr = %llu\n",
444 (unsigned long)bh,
445 (unsigned long long)bh->b_blocknr);
446
447 /* We aren't guaranteed to have the superblock here - but if we
448 * don't, it'll just crash. */
449 ocfs2_error(bh->b_assoc_map->host->i_sb,
450 "JBD2 has aborted our journal, ocfs2 cannot continue\n");
451}
452
453static struct ocfs2_triggers di_triggers = {
454 .ot_triggers = {
455 .t_commit = ocfs2_commit_trigger,
456 .t_abort = ocfs2_abort_trigger,
457 },
458 .ot_offset = offsetof(struct ocfs2_dinode, i_check),
459};
460
461static struct ocfs2_triggers eb_triggers = {
462 .ot_triggers = {
463 .t_commit = ocfs2_commit_trigger,
464 .t_abort = ocfs2_abort_trigger,
465 },
466 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
467};
468
469static struct ocfs2_triggers gd_triggers = {
470 .ot_triggers = {
471 .t_commit = ocfs2_commit_trigger,
472 .t_abort = ocfs2_abort_trigger,
473 },
474 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
475};
476
477static struct ocfs2_triggers db_triggers = {
478 .ot_triggers = {
479 .t_commit = ocfs2_db_commit_trigger,
480 .t_abort = ocfs2_abort_trigger,
481 },
482};
483
484static struct ocfs2_triggers xb_triggers = {
485 .ot_triggers = {
486 .t_commit = ocfs2_commit_trigger,
487 .t_abort = ocfs2_abort_trigger,
488 },
489 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
490};
491
492static struct ocfs2_triggers dq_triggers = {
493 .ot_triggers = {
494 .t_commit = ocfs2_dq_commit_trigger,
495 .t_abort = ocfs2_abort_trigger,
496 },
497};
498
499static int __ocfs2_journal_access(handle_t *handle,
500 struct inode *inode,
501 struct buffer_head *bh,
502 struct ocfs2_triggers *triggers,
503 int type)
364{ 504{
365 int status; 505 int status;
366 506
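All of the trigger tables funnel into one idea: each metadata type records where its struct ocfs2_block_check lives, and jbd2 invokes t_commit on the buffer so the checksum is recomputed just before the block reaches the log. The compilable toy below illustrates the offset-driven scheme; the byte-sum stands in for ocfs2_block_check_compute() and the structs are, of course, not the on-disk ocfs2 layouts.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct block_check { uint32_t crc; };
struct trigger     { size_t check_off; };  /* like ocfs2_triggers.ot_offset */

struct toy_dinode { char body[16]; struct block_check i_check; };
struct toy_xattr  { char body[32]; struct block_check xb_check; };

static const struct trigger di_trig = { offsetof(struct toy_dinode, i_check) };
static const struct trigger xb_trig = { offsetof(struct toy_xattr, xb_check) };

static uint32_t toy_sum(const void *data, size_t len)  /* ecc stand-in */
{
    const unsigned char *p = data;
    uint32_t c = 0;

    while (len--)
        c = c * 31 + *p++;
    return c;
}

/* What a t_commit trigger does: checksum the whole block and store the
 * result at the type-specific offset carried by the trigger. */
static void commit_trigger(void *block, size_t size, const struct trigger *t)
{
    struct block_check *bc =
        (struct block_check *)((char *)block + t->check_off);

    bc->crc = 0;                    /* checksum computed over a zeroed field */
    bc->crc = toy_sum(block, size);
}

int main(void)
{
    struct toy_dinode di = { { 0 }, { 0 } };
    struct toy_xattr  xb = { { 0 }, { 0 } };

    commit_trigger(&di, sizeof(di), &di_trig);
    commit_trigger(&xb, sizeof(xb), &xb_trig);
    printf("dinode check = %#x, xattr check = %#x\n",
           di.i_check.crc, xb.xb_check.crc);
    return 0;
}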
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
406 status = -EINVAL; 546 status = -EINVAL;
407 mlog(ML_ERROR, "Uknown access type!\n"); 547 mlog(ML_ERROR, "Uknown access type!\n");
408 } 548 }
549 if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
550 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
409 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 551 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
410 552
411 if (status < 0) 553 if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
416 return status; 558 return status;
417} 559}
418 560
561int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
562 struct buffer_head *bh, int type)
563{
564 return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
565 type);
566}
567
568int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
569 struct buffer_head *bh, int type)
570{
571 return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
572 type);
573}
574
575int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
576 struct buffer_head *bh, int type)
577{
578 return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
579 type);
580}
581
582int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
583 struct buffer_head *bh, int type)
584{
585 return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
586 type);
587}
588
589int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
590 struct buffer_head *bh, int type)
591{
592 return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
593 type);
594}
595
596int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
597 struct buffer_head *bh, int type)
598{
599 return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
600 type);
601}
602
603int ocfs2_journal_access(handle_t *handle, struct inode *inode,
604 struct buffer_head *bh, int type)
605{
606 return __ocfs2_journal_access(handle, inode, bh, NULL, type);
607}
608
419int ocfs2_journal_dirty(handle_t *handle, 609int ocfs2_journal_dirty(handle_t *handle,
420 struct buffer_head *bh) 610 struct buffer_head *bh)
421{ 611{
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
434 return status; 624 return status;
435} 625}
436 626
437#ifdef CONFIG_OCFS2_COMPAT_JBD
438int ocfs2_journal_dirty_data(handle_t *handle,
439 struct buffer_head *bh)
440{
441 int err = journal_dirty_data(handle, bh);
442 if (err)
443 mlog_errno(err);
444 /* TODO: When we can handle it, abort the handle and go RO on
445 * error here. */
446
447 return err;
448}
449#endif
450
451#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 627#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
452 628
453void ocfs2_set_journal_params(struct ocfs2_super *osb) 629void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
587 mlog_entry_void(); 763 mlog_entry_void();
588 764
589 fe = (struct ocfs2_dinode *)bh->b_data; 765 fe = (struct ocfs2_dinode *)bh->b_data;
590 if (!OCFS2_IS_VALID_DINODE(fe)) { 766
591 /* This is called from startup/shutdown which will 767 /* The journal bh on the osb always comes from ocfs2_journal_init()
592 * handle the errors in a specific manner, so no need 768 * and was validated there inside ocfs2_inode_lock_full(). It's a
593 * to call ocfs2_error() here. */ 769 * code bug if we mess it up. */
594 mlog(ML_ERROR, "Journal dinode %llu has invalid " 770 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
595 "signature: %.*s",
596 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
597 fe->i_signature);
598 status = -EIO;
599 goto out;
600 }
601 771
602 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 772 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
603 if (dirty) 773 if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
609 if (replayed) 779 if (replayed)
610 ocfs2_bump_recovery_generation(fe); 780 ocfs2_bump_recovery_generation(fe);
611 781
782 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
612 status = ocfs2_write_block(osb, bh, journal->j_inode); 783 status = ocfs2_write_block(osb, bh, journal->j_inode);
613 if (status < 0) 784 if (status < 0)
614 mlog_errno(status); 785 mlog_errno(status);
615 786
616out:
617 mlog_exit(status); 787 mlog_exit(status);
618 return status; 788 return status;
619} 789}
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
878 int lri_slot; 1048 int lri_slot;
879 struct ocfs2_dinode *lri_la_dinode; 1049 struct ocfs2_dinode *lri_la_dinode;
880 struct ocfs2_dinode *lri_tl_dinode; 1050 struct ocfs2_dinode *lri_tl_dinode;
1051 struct ocfs2_quota_recovery *lri_qrec;
881}; 1052};
882 1053
883/* Does the second half of the recovery process. By this point, the 1054/* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
898 struct ocfs2_super *osb = journal->j_osb; 1069 struct ocfs2_super *osb = journal->j_osb;
899 struct ocfs2_dinode *la_dinode, *tl_dinode; 1070 struct ocfs2_dinode *la_dinode, *tl_dinode;
900 struct ocfs2_la_recovery_item *item, *n; 1071 struct ocfs2_la_recovery_item *item, *n;
1072 struct ocfs2_quota_recovery *qrec;
901 LIST_HEAD(tmp_la_list); 1073 LIST_HEAD(tmp_la_list);
902 1074
903 mlog_entry_void(); 1075 mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
913 1085
914 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 1086 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
915 1087
1088 ocfs2_wait_on_quotas(osb);
1089
916 la_dinode = item->lri_la_dinode; 1090 la_dinode = item->lri_la_dinode;
917 if (la_dinode) { 1091 if (la_dinode) {
918 mlog(0, "Clean up local alloc %llu\n", 1092 mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
943 if (ret < 0) 1117 if (ret < 0)
944 mlog_errno(ret); 1118 mlog_errno(ret);
945 1119
1120 qrec = item->lri_qrec;
1121 if (qrec) {
1122 mlog(0, "Recovering quota files");
1123 ret = ocfs2_finish_quota_recovery(osb, qrec,
1124 item->lri_slot);
1125 if (ret < 0)
1126 mlog_errno(ret);
1127 /* Recovery info is already freed now */
1128 }
1129
946 kfree(item); 1130 kfree(item);
947 } 1131 }
948 1132
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
956static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, 1140static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
957 int slot_num, 1141 int slot_num,
958 struct ocfs2_dinode *la_dinode, 1142 struct ocfs2_dinode *la_dinode,
959 struct ocfs2_dinode *tl_dinode) 1143 struct ocfs2_dinode *tl_dinode,
1144 struct ocfs2_quota_recovery *qrec)
960{ 1145{
961 struct ocfs2_la_recovery_item *item; 1146 struct ocfs2_la_recovery_item *item;
962 1147
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
971 if (tl_dinode) 1156 if (tl_dinode)
972 kfree(tl_dinode); 1157 kfree(tl_dinode);
973 1158
1159 if (qrec)
1160 ocfs2_free_quota_recovery(qrec);
1161
974 mlog_errno(-ENOMEM); 1162 mlog_errno(-ENOMEM);
975 return; 1163 return;
976 } 1164 }
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
979 item->lri_la_dinode = la_dinode; 1167 item->lri_la_dinode = la_dinode;
980 item->lri_slot = slot_num; 1168 item->lri_slot = slot_num;
981 item->lri_tl_dinode = tl_dinode; 1169 item->lri_tl_dinode = tl_dinode;
1170 item->lri_qrec = qrec;
982 1171
983 spin_lock(&journal->j_lock); 1172 spin_lock(&journal->j_lock);
984 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1173 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
998 ocfs2_queue_recovery_completion(journal, 1187 ocfs2_queue_recovery_completion(journal,
999 osb->slot_num, 1188 osb->slot_num,
1000 osb->local_alloc_copy, 1189 osb->local_alloc_copy,
1190 NULL,
1001 NULL); 1191 NULL);
1002 ocfs2_schedule_truncate_log_flush(osb, 0); 1192 ocfs2_schedule_truncate_log_flush(osb, 0);
1003 1193
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1006 } 1196 }
1007} 1197}
1008 1198
1199void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
1200{
1201 if (osb->quota_rec) {
1202 ocfs2_queue_recovery_completion(osb->journal,
1203 osb->slot_num,
1204 NULL,
1205 NULL,
1206 osb->quota_rec);
1207 osb->quota_rec = NULL;
1208 }
1209}
1210
1009static int __ocfs2_recovery_thread(void *arg) 1211static int __ocfs2_recovery_thread(void *arg)
1010{ 1212{
1011 int status, node_num; 1213 int status, node_num, slot_num;
1012 struct ocfs2_super *osb = arg; 1214 struct ocfs2_super *osb = arg;
1013 struct ocfs2_recovery_map *rm = osb->recovery_map; 1215 struct ocfs2_recovery_map *rm = osb->recovery_map;
1216 int *rm_quota = NULL;
1217 int rm_quota_used = 0, i;
1218 struct ocfs2_quota_recovery *qrec;
1014 1219
1015 mlog_entry_void(); 1220 mlog_entry_void();
1016 1221
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
1019 goto bail; 1224 goto bail;
1020 } 1225 }
1021 1226
1227 rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
1228 if (!rm_quota) {
1229 status = -ENOMEM;
1230 goto bail;
1231 }
1022restart: 1232restart:
1023 status = ocfs2_super_lock(osb, 1); 1233 status = ocfs2_super_lock(osb, 1);
1024 if (status < 0) { 1234 if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
1032 * clear it until ocfs2_recover_node() has succeeded. */ 1242 * clear it until ocfs2_recover_node() has succeeded. */
1033 node_num = rm->rm_entries[0]; 1243 node_num = rm->rm_entries[0];
1034 spin_unlock(&osb->osb_lock); 1244 spin_unlock(&osb->osb_lock);
1035 1245 mlog(0, "checking node %d\n", node_num);
1036 status = ocfs2_recover_node(osb, node_num); 1246 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1247 if (slot_num == -ENOENT) {
1248 status = 0;
1249 mlog(0, "no slot for this node, so no recovery"
1250 "required.\n");
1251 goto skip_recovery;
1252 }
1253 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1254
1255 /* Quota recovery is a bit subtle. We cannot do it
1256 * immediately because we would have to take cluster locks on
1257 * the quota files, but we also don't want to just skip it,
1258 * because quota usage would then be out of sync until some
1259 * node takes over the slot. So we remember which slots need
1260 * quota recovery and, once everything else is done, recover them. */
1261 for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
1262 if (i == rm_quota_used)
1263 rm_quota[rm_quota_used++] = slot_num;
1264
1265 status = ocfs2_recover_node(osb, node_num, slot_num);
1266skip_recovery:
1037 if (!status) { 1267 if (!status) {
1038 ocfs2_recovery_map_clear(osb, node_num); 1268 ocfs2_recovery_map_clear(osb, node_num);
1039 } else { 1269 } else {
@@ -1055,13 +1285,27 @@ restart:
1055 if (status < 0) 1285 if (status < 0)
1056 mlog_errno(status); 1286 mlog_errno(status);
1057 1287
1288 /* Now is the right time to recover quotas... We have to do this under
1289 * the superblock lock so that no one can start using the slot (and crash)
1290 * before we recover it */
1291 for (i = 0; i < rm_quota_used; i++) {
1292 qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
1293 if (IS_ERR(qrec)) {
1294 status = PTR_ERR(qrec);
1295 mlog_errno(status);
1296 continue;
1297 }
1298 ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
1299 NULL, NULL, qrec);
1300 }
1301
1058 ocfs2_super_unlock(osb, 1); 1302 ocfs2_super_unlock(osb, 1);
1059 1303
1060 /* We always run recovery on our own orphan dir - the dead 1304 /* We always run recovery on our own orphan dir - the dead
1061 * node(s) may have disallowed a previous inode delete. Re-processing 1305 * node(s) may have disallowed a previous inode delete. Re-processing
1062 * is therefore required. */ 1306 * is therefore required. */
1063 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 1307 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1064 NULL); 1308 NULL, NULL);
1065 1309
1066bail: 1310bail:
1067 mutex_lock(&osb->recovery_lock); 1311 mutex_lock(&osb->recovery_lock);
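This hunk splits quota recovery into two phases, exactly as the comment above says: while journals are replayed the thread only records which slots will need quota recovery (deduplicated in rm_quota[]), and ocfs2_begin_quota_recovery() runs afterwards under the superblock lock. A minimal model of the collection loop and the deferred pass:

#include <stdio.h>

#define MAX_SLOTS 8

static int rm_quota[MAX_SLOTS];
static int rm_quota_used;

/* Same dedup scan as the recovery loop above: look for the slot, append
 * it only if it was not recorded yet. */
static void remember_slot(int slot_num)
{
    int i;

    for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++)
        ;
    if (i == rm_quota_used)
        rm_quota[rm_quota_used++] = slot_num;
}

int main(void)
{
    int i;

    /* phase 1: journal replay; the same slot may come up repeatedly */
    remember_slot(2);
    remember_slot(5);
    remember_slot(2);

    /* phase 2: with the super lock held, recover each slot exactly once */
    for (i = 0; i < rm_quota_used; i++)
        printf("begin quota recovery for slot %d\n", rm_quota[i]);
    return 0;
}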
@@ -1076,6 +1320,9 @@ bail:
1076 1320
1077 mutex_unlock(&osb->recovery_lock); 1321 mutex_unlock(&osb->recovery_lock);
1078 1322
1323 if (rm_quota)
1324 kfree(rm_quota);
1325
1079 mlog_exit(status); 1326 mlog_exit(status);
1080 /* no one is calling kthread_stop() for us so the kthread() api 1327 /* no one is calling kthread_stop() for us so the kthread() api
1081 * requires that we call do_exit(). And it isn't exported, but 1328 * requires that we call do_exit(). And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
1135 } 1382 }
1136 SET_INODE_JOURNAL(inode); 1383 SET_INODE_JOURNAL(inode);
1137 1384
1138 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, 1385 status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
1139 OCFS2_BH_IGNORE_CACHE);
1140 if (status < 0) { 1386 if (status < 0) {
1141 mlog_errno(status); 1387 mlog_errno(status);
1142 goto bail; 1388 goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1268 osb->slot_recovery_generations[slot_num] = 1514 osb->slot_recovery_generations[slot_num] =
1269 ocfs2_get_recovery_generation(fe); 1515 ocfs2_get_recovery_generation(fe);
1270 1516
1517 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
1271 status = ocfs2_write_block(osb, bh, inode); 1518 status = ocfs2_write_block(osb, bh, inode);
1272 if (status < 0) 1519 if (status < 0)
1273 mlog_errno(status); 1520 mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
1304 * far less concerning. 1551 * far less concerning.
1305 */ 1552 */
1306static int ocfs2_recover_node(struct ocfs2_super *osb, 1553static int ocfs2_recover_node(struct ocfs2_super *osb,
1307 int node_num) 1554 int node_num, int slot_num)
1308{ 1555{
1309 int status = 0; 1556 int status = 0;
1310 int slot_num;
1311 struct ocfs2_dinode *la_copy = NULL; 1557 struct ocfs2_dinode *la_copy = NULL;
1312 struct ocfs2_dinode *tl_copy = NULL; 1558 struct ocfs2_dinode *tl_copy = NULL;
1313 1559
1314 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1560 mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
1315 node_num, osb->node_num); 1561 node_num, slot_num, osb->node_num);
1316
1317 mlog(0, "checking node %d\n", node_num);
1318 1562
1319 /* Should not ever be called to recover ourselves -- in that 1563 /* Should not ever be called to recover ourselves -- in that
1320 * case we should've called ocfs2_journal_load instead. */ 1564 * case we should've called ocfs2_journal_load instead. */
1321 BUG_ON(osb->node_num == node_num); 1565 BUG_ON(osb->node_num == node_num);
1322 1566
1323 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1324 if (slot_num == -ENOENT) {
1325 status = 0;
1326 mlog(0, "no slot for this node, so no recovery required.\n");
1327 goto done;
1328 }
1329
1330 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1331
1332 status = ocfs2_replay_journal(osb, node_num, slot_num); 1567 status = ocfs2_replay_journal(osb, node_num, slot_num);
1333 if (status < 0) { 1568 if (status < 0) {
1334 if (status == -EBUSY) { 1569 if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1364 1599
1365 /* This will kfree the memory pointed to by la_copy and tl_copy */ 1600 /* This will kfree the memory pointed to by la_copy and tl_copy */
1366 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, 1601 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1367 tl_copy); 1602 tl_copy, NULL);
1368 1603
1369 status = 0; 1604 status = 0;
1370done: 1605done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1659 return ret; 1894 return ret;
1660} 1895}
1661 1896
1662static int ocfs2_wait_on_mount(struct ocfs2_super *osb) 1897static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
1663{ 1898{
1664 /* This check is good because ocfs2 will wait on our recovery 1899 /* This check is good because ocfs2 will wait on our recovery
1665 * thread before changing it to something other than MOUNTED 1900 * thread before changing it to something other than MOUNTED
1666 * or DISABLED. */ 1901 * or DISABLED. */
1667 wait_event(osb->osb_mount_event, 1902 wait_event(osb->osb_mount_event,
1668 atomic_read(&osb->vol_state) == VOLUME_MOUNTED || 1903 (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
1904 atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
1669 atomic_read(&osb->vol_state) == VOLUME_DISABLED); 1905 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1670 1906
1671 /* If there's an error on mount, then we may never get to the 1907 /* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3cea..3c3532e1307c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
27#define OCFS2_JOURNAL_H 27#define OCFS2_JOURNAL_H
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#ifndef CONFIG_OCFS2_COMPAT_JBD 30#include <linux/jbd2.h>
31# include <linux/jbd2.h>
32#else
33# include <linux/jbd.h>
34# include "ocfs2_jbd_compat.h"
35#endif
36 31
37enum ocfs2_journal_state { 32enum ocfs2_journal_state {
38 OCFS2_JOURNAL_FREE = 0, 33 OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb,
173 int node_num); 168 int node_num);
174int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); 169int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
175void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); 170void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
171void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
176 172
177static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) 173static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
178{ 174{
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
216 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may 212 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
217 * commit the handle to disk in the process, but will 213 * commit the handle to disk in the process, but will
218 * not release any locks taken during the transaction. 214 * not release any locks taken during the transaction.
219 * ocfs2_journal_access - Notify the handle that we want to journal this 215 * ocfs2_journal_access* - Notify the handle that we want to journal this
220 * buffer. Will have to call ocfs2_journal_dirty once 216 * buffer. Will have to call ocfs2_journal_dirty once
221 * we've actually dirtied it. Type is one of the OCFS2_JOURNAL_ACCESS_* values. 217 * we've actually dirtied it. Type is one of the OCFS2_JOURNAL_ACCESS_* values.
218 * Always call the specific flavor of
219 * ocfs2_journal_access_*() unless you intend to
220 * manage the checksum by hand.
222 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 221 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
223 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before 222 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
224 * the current handle commits. 223 * the current handle commits.
@@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
248#define OCFS2_JOURNAL_ACCESS_WRITE 1 247#define OCFS2_JOURNAL_ACCESS_WRITE 1
249#define OCFS2_JOURNAL_ACCESS_UNDO 2 248#define OCFS2_JOURNAL_ACCESS_UNDO 2
250 249
251int ocfs2_journal_access(handle_t *handle, 250
252 struct inode *inode, 251/* ocfs2_inode */
253 struct buffer_head *bh, 252int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
254 int type); 253 struct buffer_head *bh, int type);
254/* ocfs2_extent_block */
255int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
256 struct buffer_head *bh, int type);
257/* ocfs2_group_desc */
258int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
259 struct buffer_head *bh, int type);
260/* ocfs2_xattr_block */
261int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
262 struct buffer_head *bh, int type);
263/* quota blocks */
264int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
265 struct buffer_head *bh, int type);
266/* dirblock */
267int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
268 struct buffer_head *bh, int type);
269/* Anything that has no ecc */
270int ocfs2_journal_access(handle_t *handle, struct inode *inode,
271 struct buffer_head *bh, int type);
272
255/* 273/*
256 * A word about the journal_access/journal_dirty "dance". It is 274 * A word about the journal_access/journal_dirty "dance". It is
257 * entirely legal to journal_access a buffer more than once (as long 275 * entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle,
273 */ 291 */
274int ocfs2_journal_dirty(handle_t *handle, 292int ocfs2_journal_dirty(handle_t *handle,
275 struct buffer_head *bh); 293 struct buffer_head *bh);
276#ifdef CONFIG_OCFS2_COMPAT_JBD
277int ocfs2_journal_dirty_data(handle_t *handle,
278 struct buffer_head *bh);
279#endif
280 294
281/* 295/*
282 * Credit Macros: 296 * Credit Macros:
@@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle,
293/* extended attribute block update */ 307/* extended attribute block update */
294#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 308#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
295 309
310/* global quotafile inode update, data block */
311#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
312
313/*
314 * The two writes below can accidentally see the global quota info dirty
315 * due to a set_info() quotactl, so they must be prepared to write it.
316 */
317/* quota data block, global info */
318/* Write to local quota file */
319#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
320
321/* global quota data block, local quota data block, global quota inode,
322 * global quota info */
323#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
324
325static inline int ocfs2_quota_trans_credits(struct super_block *sb)
326{
327 int credits = 0;
328
329 if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
330 credits += OCFS2_QWRITE_CREDITS;
331 if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
332 credits += OCFS2_QWRITE_CREDITS;
333 return credits;
334}
335
336/* Number of credits needed for removing quota structure from file */
337int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
338/* Number of credits needed for initialization of new quota structure */
339int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
340
296/* group extend. inode update and last group update. */ 341/* group extend. inode update and last group update. */
297#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 342#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
298 343
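The credit arithmetic is easier to follow with numbers plugged in. Assuming OCFS2_INODE_UPDATE_CREDITS is 1 (its value in this tree), OCFS2_QINFO_WRITE_CREDITS works out to 2 and OCFS2_QWRITE_CREDITS to 3, so a link on a filesystem with both USRQUOTA and GRPQUOTA enabled now reserves 2*1 + 1 + 2*3 = 9 credits where the old OCFS2_LINK_CREDITS constant reserved 3. A sketch of the computation:

#include <stdio.h>

#define INODE_UPDATE_CREDITS 1                          /* assumed, see above */
#define QINFO_WRITE_CREDITS (INODE_UPDATE_CREDITS + 1)  /* qf inode + block */
#define QWRITE_CREDITS      (QINFO_WRITE_CREDITS + 1)   /* + local qf block */

static int quota_trans_credits(int usrquota, int grpquota)
{
    int credits = 0;

    if (usrquota)
        credits += QWRITE_CREDITS;
    if (grpquota)
        credits += QWRITE_CREDITS;
    return credits;
}

/* ocfs2_link_credits(): two inode updates + one dir entry block + quota */
static int link_credits(int usrquota, int grpquota)
{
    return 2 * INODE_UPDATE_CREDITS + 1 +
           quota_trans_credits(usrquota, grpquota);
}

int main(void)
{
    printf("no quota: %d\n", link_credits(0, 0));  /* 3, the old constant */
    printf("usr only: %d\n", link_credits(1, 0));  /* 6 */
    printf("usr+grp : %d\n", link_credits(1, 1));  /* 9 */
    return 0;
}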
@@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle,
303 * prev. group desc. if we relink. */ 348 * prev. group desc. if we relink. */
304#define OCFS2_SUBALLOC_ALLOC (3) 349#define OCFS2_SUBALLOC_ALLOC (3)
305 350
306#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \ 351static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
307 + OCFS2_INODE_UPDATE_CREDITS) 352{
353 return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
354 ocfs2_quota_trans_credits(sb);
355}
308 356
309/* dinode + group descriptor update. We don't relink on free yet. */ 357/* dinode + group descriptor update. We don't relink on free yet. */
310#define OCFS2_SUBALLOC_FREE (2) 358#define OCFS2_SUBALLOC_FREE (2)
@@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
313#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 361#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
314 + OCFS2_TRUNCATE_LOG_UPDATE) 362 + OCFS2_TRUNCATE_LOG_UPDATE)
315 363
316#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) 364static inline int ocfs2_remove_extent_credits(struct super_block *sb)
365{
366 return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
367 ocfs2_quota_trans_credits(sb);
368}
317 369
318/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 370/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
319 * bitmap block for the new bit) */ 371 * bitmap block for the new bit) */
320#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 372#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
321 373
322/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 374/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
323 * group descriptor + mkdir/symlink blocks */ 375 * group descriptor + mkdir/symlink blocks + quota update */
324#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ 376static inline int ocfs2_mknod_credits(struct super_block *sb)
325 + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) 377{
378 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
379 ocfs2_quota_trans_credits(sb);
380}
326 381
327/* local alloc metadata change + main bitmap updates */ 382/* local alloc metadata change + main bitmap updates */
328#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ 383#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
@@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle,
332 * for the dinode, one for the new block. */ 387 * for the dinode, one for the new block. */
333#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 388#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
334 389
335/* file update (nlink, etc) + directory mtime/ctime + dir entry block */ 390/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
336#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) 391 * update on dir */
392static inline int ocfs2_link_credits(struct super_block *sb)
393{
394 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
395 ocfs2_quota_trans_credits(sb);
396}
337 397
338/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 398/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
339 * dir inode link */ 399 * dir inode link */
340#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ 400static inline int ocfs2_unlink_credits(struct super_block *sb)
341 + OCFS2_LINK_CREDITS) 401{
402 /* The quota update from ocfs2_link_credits is unused here... */
403 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
404}
342 405
343/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 406/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
344 * inode alloc group descriptor */ 407 * inode alloc group descriptor */
@@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle,
347/* dinode update, old dir dinode update, new dir dinode update, old 410/* dinode update, old dir dinode update, new dir dinode update, old
348 * dir dir entry, new dir dir entry, dir entry update for renaming 411 * dir dir entry, new dir dir entry, dir entry update for renaming
349 * directory + target unlink */ 412 * directory + target unlink */
350#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ 413static inline int ocfs2_rename_credits(struct super_block *sb)
351 + OCFS2_UNLINK_CREDITS) 414{
415 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
416}
352 417
353/* global bitmap dinode, group desc., relinked group, 418/* global bitmap dinode, group desc., relinked group,
354 * suballocator dinode, group desc., relinked group, 419 * suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
386 * credit for the dinode there. */ 451 * credit for the dinode there. */
387 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); 452 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
388 453
389 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; 454 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
455 ocfs2_quota_trans_credits(sb);
390} 456}
391 457
392static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 458static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
393{ 459{
394 int blocks = OCFS2_MKNOD_CREDITS; 460 int blocks = ocfs2_mknod_credits(sb);
395 461
396 /* links can be longer than one block so we may update many 462 /* links can be longer than one block so we may update many
397 * within our single allocated extent. */ 463 * within our single allocated extent. */
398 blocks += ocfs2_clusters_to_blocks(sb, 1); 464 blocks += ocfs2_clusters_to_blocks(sb, 1);
399 465
400 return blocks; 466 return blocks + ocfs2_quota_trans_credits(sb);
401} 467}
402 468
403static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, 469static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
434 /* update to the truncate log. */ 500 /* update to the truncate log. */
435 credits += OCFS2_TRUNCATE_LOG_UPDATE; 501 credits += OCFS2_TRUNCATE_LOG_UPDATE;
436 502
503 credits += ocfs2_quota_trans_credits(sb);
504
437 return credits; 505 return credits;
438} 506}
439 507
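
The journal.h hunks above convert fixed credit macros (OCFS2_MKNOD_CREDITS, OCFS2_LINK_CREDITS, and friends) into inline functions of the superblock, because the quota cost of a transaction is only known once we know which quota types the mounted volume has active. A minimal userspace sketch of that composition follows; the credit constants, the fake_sb fields, and the function bodies are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

#define INODE_UPDATE_CREDITS 1
#define QUOTA_BLOCK_UPDATE_CREDITS 2    /* invented value */

struct fake_sb { int usrquota_on; int grpquota_on; };

/* returns 0 when no quota is active, so non-quota mounts pay nothing extra */
static int quota_trans_credits(const struct fake_sb *sb)
{
        int credits = 0;
        if (sb->usrquota_on)
                credits += QUOTA_BLOCK_UPDATE_CREDITS;
        if (sb->grpquota_on)
                credits += QUOTA_BLOCK_UPDATE_CREDITS;
        return credits;
}

/* mirrors the shape of ocfs2_link_credits(): base cost + quota cost */
static int link_credits(const struct fake_sb *sb)
{
        return 2 * INODE_UPDATE_CREDITS + 1 + quota_trans_credits(sb);
}

int main(void)
{
        struct fake_sb sb = { .usrquota_on = 1, .grpquota_on = 0 };
        printf("link credits: %d\n", link_credits(&sb));
        return 0;
}
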
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c32..ec70cdbe77fc 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
36#include "ocfs2.h" 36#include "ocfs2.h"
37 37
38#include "alloc.h" 38#include "alloc.h"
39#include "blockcheck.h"
39#include "dlmglue.h" 40#include "dlmglue.h"
40#include "inode.h" 41#include "inode.h"
41#include "journal.h" 42#include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
248 goto bail; 249 goto bail;
249 } 250 }
250 251
251 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, 252 status = ocfs2_read_inode_block_full(inode, &alloc_bh,
252 &alloc_bh, OCFS2_BH_IGNORE_CACHE); 253 OCFS2_BH_IGNORE_CACHE);
253 if (status < 0) { 254 if (status < 0) {
254 mlog_errno(status); 255 mlog_errno(status);
255 goto bail; 256 goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
382 } 383 }
383 memcpy(alloc_copy, alloc, bh->b_size); 384 memcpy(alloc_copy, alloc, bh->b_size);
384 385
385 status = ocfs2_journal_access(handle, local_alloc_inode, bh, 386 status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
386 OCFS2_JOURNAL_ACCESS_WRITE); 387 OCFS2_JOURNAL_ACCESS_WRITE);
387 if (status < 0) { 388 if (status < 0) {
388 mlog_errno(status); 389 mlog_errno(status);
389 goto out_commit; 390 goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
459 460
460 mutex_lock(&inode->i_mutex); 461 mutex_lock(&inode->i_mutex);
461 462
462 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, 463 status = ocfs2_read_inode_block_full(inode, &alloc_bh,
463 &alloc_bh, OCFS2_BH_IGNORE_CACHE); 464 OCFS2_BH_IGNORE_CACHE);
464 if (status < 0) { 465 if (status < 0) {
465 mlog_errno(status); 466 mlog_errno(status);
466 goto bail; 467 goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
476 alloc = (struct ocfs2_dinode *) alloc_bh->b_data; 477 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
477 ocfs2_clear_local_alloc(alloc); 478 ocfs2_clear_local_alloc(alloc);
478 479
480 ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
479 status = ocfs2_write_block(osb, alloc_bh, inode); 481 status = ocfs2_write_block(osb, alloc_bh, inode);
480 if (status < 0) 482 if (status < 0)
481 mlog_errno(status); 483 mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
762 * delete bits from it! */ 764 * delete bits from it! */
763 *num_bits = bits_wanted; 765 *num_bits = bits_wanted;
764 766
765 status = ocfs2_journal_access(handle, local_alloc_inode, 767 status = ocfs2_journal_access_di(handle, local_alloc_inode,
766 osb->local_alloc_bh, 768 osb->local_alloc_bh,
767 OCFS2_JOURNAL_ACCESS_WRITE); 769 OCFS2_JOURNAL_ACCESS_WRITE);
768 if (status < 0) { 770 if (status < 0) {
769 mlog_errno(status); 771 mlog_errno(status);
770 goto bail; 772 goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1240 } 1242 }
1241 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); 1243 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
1242 1244
1243 status = ocfs2_journal_access(handle, local_alloc_inode, 1245 status = ocfs2_journal_access_di(handle, local_alloc_inode,
1244 osb->local_alloc_bh, 1246 osb->local_alloc_bh,
1245 OCFS2_JOURNAL_ACCESS_WRITE); 1247 OCFS2_JOURNAL_ACCESS_WRITE);
1246 if (status < 0) { 1248 if (status < 0) {
1247 mlog_errno(status); 1249 mlog_errno(status);
1248 goto bail; 1250 goto bail;
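
The localalloc.c changes replace raw ocfs2_read_blocks() calls with ocfs2_read_inode_block_full() and the untyped ocfs2_journal_access() with ocfs2_journal_access_di(), funneling every inode read and write through helpers that know the block type. A freestanding sketch of the read-side wrapper idea, with invented types and a toy signature check standing in for the real validation:

#include <stdio.h>
#include <string.h>

struct fake_inode { unsigned long long ip_blkno; };
struct fake_bh    { char data[8]; };

/* low-level read, addressed by raw block number */
static int read_blocks(unsigned long long blkno, struct fake_bh *bh)
{
        (void)blkno;
        memcpy(bh->data, "OCFS2", 6);   /* pretend the disk held a dinode */
        return 0;
}

/* typed wrapper: derives the block number and validates in one place */
static int read_inode_block(struct fake_inode *inode, struct fake_bh *bh)
{
        int status = read_blocks(inode->ip_blkno, bh);
        if (status < 0)
                return status;
        /* single choke point for signature (and, later, ECC) checks */
        return strncmp(bh->data, "OCFS2", 5) ? -22 /* -EINVAL */ : 0;
}

int main(void)
{
        struct fake_inode ino = { .ip_blkno = 42 };
        struct fake_bh bh;
        printf("read status: %d\n", read_inode_block(&ino, &bh));
        return 0;
}
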
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402efe..084aba86c3b2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h>
43 44
44#define MLOG_MASK_PREFIX ML_NAMEI 45#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -61,17 +62,18 @@
61#include "sysfile.h" 62#include "sysfile.h"
62#include "uptodate.h" 63#include "uptodate.h"
63#include "xattr.h" 64#include "xattr.h"
65#include "acl.h"
64 66
65#include "buffer_head_io.h" 67#include "buffer_head_io.h"
66 68
67static int ocfs2_mknod_locked(struct ocfs2_super *osb, 69static int ocfs2_mknod_locked(struct ocfs2_super *osb,
68 struct inode *dir, 70 struct inode *dir,
69 struct dentry *dentry, int mode, 71 struct inode *inode,
72 struct dentry *dentry,
70 dev_t dev, 73 dev_t dev,
71 struct buffer_head **new_fe_bh, 74 struct buffer_head **new_fe_bh,
72 struct buffer_head *parent_fe_bh, 75 struct buffer_head *parent_fe_bh,
73 handle_t *handle, 76 handle_t *handle,
74 struct inode **ret_inode,
75 struct ocfs2_alloc_context *inode_ac); 77 struct ocfs2_alloc_context *inode_ac);
76 78
77static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 79static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
186 return ret; 188 return ret;
187} 189}
188 190
191static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
192{
193 struct inode *inode;
194
195 inode = new_inode(dir->i_sb);
196 if (!inode) {
197 mlog(ML_ERROR, "new_inode failed!\n");
198 return NULL;
199 }
200
201 /* populate as many fields early on as possible - many of
202 * these are used by the support functions here and in
203 * callers. */
204 if (S_ISDIR(mode))
205 inode->i_nlink = 2;
206 else
207 inode->i_nlink = 1;
208 inode->i_uid = current_fsuid();
209 if (dir->i_mode & S_ISGID) {
210 inode->i_gid = dir->i_gid;
211 if (S_ISDIR(mode))
212 mode |= S_ISGID;
213 } else
214 inode->i_gid = current_fsgid();
215 inode->i_mode = mode;
216 vfs_dq_init(inode);
217 return inode;
218}
219
189static int ocfs2_mknod(struct inode *dir, 220static int ocfs2_mknod(struct inode *dir,
190 struct dentry *dentry, 221 struct dentry *dentry,
191 int mode, 222 int mode,
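
The new ocfs2_get_init_inode() above centralizes owner and group initialization, including the classic setgid-directory rule. The same rule, restated as a runnable userspace function (fake_dir and inherit_mode are invented for illustration):

#include <stdio.h>
#include <sys/stat.h>

struct fake_dir { mode_t i_mode; unsigned i_gid; };

static mode_t inherit_mode(const struct fake_dir *dir, mode_t mode,
                           unsigned fsgid, unsigned *gid_out)
{
        if (dir->i_mode & S_ISGID) {
                *gid_out = dir->i_gid;   /* take the group from the parent */
                if (S_ISDIR(mode))
                        mode |= S_ISGID; /* new subdirectories stay setgid */
        } else {
                *gid_out = fsgid;        /* otherwise the caller's fs gid */
        }
        return mode;
}

int main(void)
{
        struct fake_dir d = { .i_mode = S_IFDIR | S_ISGID | 0775, .i_gid = 100 };
        unsigned gid;
        mode_t m = inherit_mode(&d, S_IFDIR | 0755, 1000, &gid);
        printf("mode=%o gid=%u\n", (unsigned)m, gid);
        return 0;
}
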
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
201 struct inode *inode = NULL; 232 struct inode *inode = NULL;
202 struct ocfs2_alloc_context *inode_ac = NULL; 233 struct ocfs2_alloc_context *inode_ac = NULL;
203 struct ocfs2_alloc_context *data_ac = NULL; 234 struct ocfs2_alloc_context *data_ac = NULL;
235 struct ocfs2_alloc_context *xattr_ac = NULL;
236 int want_clusters = 0;
237 int xattr_credits = 0;
238 struct ocfs2_security_xattr_info si = {
239 .enable = 1,
240 };
241 int did_quota_inode = 0;
204 242
205 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
206 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
250 goto leave; 288 goto leave;
251 } 289 }
252 290
253 /* Reserve a cluster if creating an extent based directory. */ 291 inode = ocfs2_get_init_inode(dir, mode);
254 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { 292 if (!inode) {
255 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 293 status = -ENOMEM;
256 if (status < 0) { 294 mlog_errno(status);
257 if (status != -ENOSPC) 295 goto leave;
258 mlog_errno(status); 296 }
297
298 /* get security xattr */
299 status = ocfs2_init_security_get(inode, dir, &si);
300 if (status) {
301 if (status == -EOPNOTSUPP)
302 si.enable = 0;
303 else {
304 mlog_errno(status);
259 goto leave; 305 goto leave;
260 } 306 }
261 } 307 }
262 308
263 handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS); 309 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters,
312 &xattr_credits, &xattr_ac);
313 if (status < 0) {
314 mlog_errno(status);
315 goto leave;
316 }
317
318 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
320 want_clusters += 1;
321
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) {
324 if (status != -ENOSPC)
325 mlog_errno(status);
326 goto leave;
327 }
328
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
330 xattr_credits);
264 if (IS_ERR(handle)) { 331 if (IS_ERR(handle)) {
265 status = PTR_ERR(handle); 332 status = PTR_ERR(handle);
266 handle = NULL; 333 handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
268 goto leave; 335 goto leave;
269 } 336 }
270 337
338 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
339 * to be called. */
340 if (sb_any_quota_active(osb->sb) &&
341 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
342 status = -EDQUOT;
343 goto leave;
344 }
345 did_quota_inode = 1;
346
271 /* do the real work now. */ 347 /* do the real work now. */
272 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, 348 status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
273 &new_fe_bh, parent_fe_bh, handle, 349 &new_fe_bh, parent_fe_bh, handle,
274 &inode, inode_ac); 350 inode_ac);
275 if (status < 0) { 351 if (status < 0) {
276 mlog_errno(status); 352 mlog_errno(status);
277 goto leave; 353 goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
285 goto leave; 361 goto leave;
286 } 362 }
287 363
288 status = ocfs2_journal_access(handle, dir, parent_fe_bh, 364 status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
289 OCFS2_JOURNAL_ACCESS_WRITE); 365 OCFS2_JOURNAL_ACCESS_WRITE);
290 if (status < 0) { 366 if (status < 0) {
291 mlog_errno(status); 367 mlog_errno(status);
292 goto leave; 368 goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
300 inc_nlink(dir); 376 inc_nlink(dir);
301 } 377 }
302 378
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac);
381 if (status < 0) {
382 mlog_errno(status);
383 goto leave;
384 }
385
386 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac);
389 if (status < 0) {
390 mlog_errno(status);
391 goto leave;
392 }
393 }
394
303 status = ocfs2_add_entry(handle, dentry, inode, 395 status = ocfs2_add_entry(handle, dentry, inode,
304 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 396 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
305 de_bh); 397 de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
320 d_instantiate(dentry, inode); 412 d_instantiate(dentry, inode);
321 status = 0; 413 status = 0;
322leave: 414leave:
415 if (status < 0 && did_quota_inode)
416 vfs_dq_free_inode(inode);
323 if (handle) 417 if (handle)
324 ocfs2_commit_trans(osb, handle); 418 ocfs2_commit_trans(osb, handle);
325 419
@@ -331,9 +425,13 @@ leave:
331 brelse(new_fe_bh); 425 brelse(new_fe_bh);
332 brelse(de_bh); 426 brelse(de_bh);
333 brelse(parent_fe_bh); 427 brelse(parent_fe_bh);
428 kfree(si.name);
429 kfree(si.value);
334 430
335 if ((status < 0) && inode) 431 if ((status < 0) && inode) {
432 clear_nlink(inode);
336 iput(inode); 433 iput(inode);
434 }
337 435
338 if (inode_ac) 436 if (inode_ac)
339 ocfs2_free_alloc_context(inode_ac); 437 ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
341 if (data_ac) 439 if (data_ac)
342 ocfs2_free_alloc_context(data_ac); 440 ocfs2_free_alloc_context(data_ac);
343 441
442 if (xattr_ac)
443 ocfs2_free_alloc_context(xattr_ac);
444
344 mlog_exit(status); 445 mlog_exit(status);
345 446
346 return status; 447 return status;
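
The reworked ocfs2_mknod() accumulates all cluster needs (security/ACL xattrs plus the first directory extent) into want_clusters, reserves them in a single call, and starts the transaction with the base mknod credits plus the xattr credits. A condensed, self-contained sketch of that flow; every function and constant below is a stand-in with an invented body:

#include <stdio.h>

static int calc_xattr_init(int *want_clusters, int *xattr_credits)
{
        *want_clusters += 1;    /* pretend the security xattr needs a cluster */
        *xattr_credits += 3;    /* and a few extra journal credits */
        return 0;
}

static int reserve_clusters(int n)
{
        printf("reserve %d clusters in one call\n", n);
        return 0;
}

static int start_trans(int credits)
{
        printf("start handle with %d credits\n", credits);
        return 0;
}

int main(void)
{
        int want_clusters = 0, xattr_credits = 0;
        const int is_dir = 1, inline_data = 0;
        const int base_mknod_credits = 10;      /* invented stand-in */

        if (calc_xattr_init(&want_clusters, &xattr_credits) < 0)
                return 1;
        if (is_dir && !inline_data)
                want_clusters += 1;     /* first extent of the new directory */
        if (reserve_clusters(want_clusters) < 0)
                return 1;
        return start_trans(base_mknod_credits + xattr_credits);
}
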
@@ -348,12 +449,12 @@ leave:
348 449
349static int ocfs2_mknod_locked(struct ocfs2_super *osb, 450static int ocfs2_mknod_locked(struct ocfs2_super *osb,
350 struct inode *dir, 451 struct inode *dir,
351 struct dentry *dentry, int mode, 452 struct inode *inode,
453 struct dentry *dentry,
352 dev_t dev, 454 dev_t dev,
353 struct buffer_head **new_fe_bh, 455 struct buffer_head **new_fe_bh,
354 struct buffer_head *parent_fe_bh, 456 struct buffer_head *parent_fe_bh,
355 handle_t *handle, 457 handle_t *handle,
356 struct inode **ret_inode,
357 struct ocfs2_alloc_context *inode_ac) 458 struct ocfs2_alloc_context *inode_ac)
358{ 459{
359 int status = 0; 460 int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
361 struct ocfs2_extent_list *fel; 462 struct ocfs2_extent_list *fel;
362 u64 fe_blkno = 0; 463 u64 fe_blkno = 0;
363 u16 suballoc_bit; 464 u16 suballoc_bit;
364 struct inode *inode = NULL;
365 465
366 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 466 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
367 (unsigned long)dev, dentry->d_name.len, 467 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
368 dentry->d_name.name); 468 dentry->d_name.name);
369 469
370 *new_fe_bh = NULL; 470 *new_fe_bh = NULL;
371 *ret_inode = NULL;
372 471
373 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 472 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
374 &fe_blkno); 473 &fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
377 goto leave; 476 goto leave;
378 } 477 }
379 478
380 inode = new_inode(dir->i_sb);
381 if (!inode) {
382 status = -ENOMEM;
383 mlog(ML_ERROR, "new_inode failed!\n");
384 goto leave;
385 }
386
387 /* populate as many fields early on as possible - many of 479 /* populate as many fields early on as possible - many of
388 * these are used by the support functions here and in 480 * these are used by the support functions here and in
389 * callers. */ 481 * callers. */
390 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); 482 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
391 OCFS2_I(inode)->ip_blkno = fe_blkno; 483 OCFS2_I(inode)->ip_blkno = fe_blkno;
392 if (S_ISDIR(mode))
393 inode->i_nlink = 2;
394 else
395 inode->i_nlink = 1;
396 inode->i_mode = mode;
397 spin_lock(&osb->osb_lock); 484 spin_lock(&osb->osb_lock);
398 inode->i_generation = osb->s_next_generation++; 485 inode->i_generation = osb->s_next_generation++;
399 spin_unlock(&osb->osb_lock); 486 spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
406 } 493 }
407 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); 494 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
408 495
409 status = ocfs2_journal_access(handle, inode, *new_fe_bh, 496 status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
410 OCFS2_JOURNAL_ACCESS_CREATE); 497 OCFS2_JOURNAL_ACCESS_CREATE);
411 if (status < 0) { 498 if (status < 0) {
412 mlog_errno(status); 499 mlog_errno(status);
413 goto leave; 500 goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
421 fe->i_blkno = cpu_to_le64(fe_blkno); 508 fe->i_blkno = cpu_to_le64(fe_blkno);
422 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 509 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
423 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 510 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
424 fe->i_uid = cpu_to_le32(current_fsuid()); 511 fe->i_uid = cpu_to_le32(inode->i_uid);
425 if (dir->i_mode & S_ISGID) { 512 fe->i_gid = cpu_to_le32(inode->i_gid);
426 fe->i_gid = cpu_to_le32(dir->i_gid); 513 fe->i_mode = cpu_to_le16(inode->i_mode);
427 if (S_ISDIR(mode)) 514 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
428 mode |= S_ISGID;
429 } else
430 fe->i_gid = cpu_to_le32(current_fsgid());
431 fe->i_mode = cpu_to_le16(mode);
432 if (S_ISCHR(mode) || S_ISBLK(mode))
433 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 515 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
434
435 fe->i_links_count = cpu_to_le16(inode->i_nlink); 516 fe->i_links_count = cpu_to_le16(inode->i_nlink);
436 517
437 fe->i_last_eb_blk = 0; 518 fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
446 /* 527 /*
447 * If supported, directories start with inline data. 528 * If supported, directories start with inline data.
448 */ 529 */
449 if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) { 530 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
450 u16 feat = le16_to_cpu(fe->i_dyn_features); 531 u16 feat = le16_to_cpu(fe->i_dyn_features);
451 532
452 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
465 goto leave; 546 goto leave;
466 } 547 }
467 548
468 if (ocfs2_populate_inode(inode, fe, 1) < 0) { 549 ocfs2_populate_inode(inode, fe, 1);
469 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
470 "i_blkno=%llu, i_ino=%lu\n",
471 (unsigned long long)(*new_fe_bh)->b_blocknr,
472 (unsigned long long)le64_to_cpu(fe->i_blkno),
473 inode->i_ino);
474 BUG();
475 }
476
477 ocfs2_inode_set_new(osb, inode); 550 ocfs2_inode_set_new(osb, inode);
478 if (!ocfs2_mount_local(osb)) { 551 if (!ocfs2_mount_local(osb)) {
479 status = ocfs2_create_new_inode_locks(inode); 552 status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
484 status = 0; /* error in ocfs2_create_new_inode_locks is not 557 status = 0; /* error in ocfs2_create_new_inode_locks is not
485 * critical */ 558 * critical */
486 559
487 *ret_inode = inode;
488leave: 560leave:
489 if (status < 0) { 561 if (status < 0) {
490 if (*new_fe_bh) { 562 if (*new_fe_bh) {
491 brelse(*new_fe_bh); 563 brelse(*new_fe_bh);
492 *new_fe_bh = NULL; 564 *new_fe_bh = NULL;
493 } 565 }
494 if (inode) {
495 clear_nlink(inode);
496 iput(inode);
497 }
498 } 566 }
499 567
500 mlog_exit(status); 568 mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
588 goto out_unlock_inode; 656 goto out_unlock_inode;
589 } 657 }
590 658
591 handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS); 659 handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
592 if (IS_ERR(handle)) { 660 if (IS_ERR(handle)) {
593 err = PTR_ERR(handle); 661 err = PTR_ERR(handle);
594 handle = NULL; 662 handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
596 goto out_unlock_inode; 664 goto out_unlock_inode;
597 } 665 }
598 666
599 err = ocfs2_journal_access(handle, inode, fe_bh, 667 err = ocfs2_journal_access_di(handle, inode, fe_bh,
600 OCFS2_JOURNAL_ACCESS_WRITE); 668 OCFS2_JOURNAL_ACCESS_WRITE);
601 if (err < 0) { 669 if (err < 0) {
602 mlog_errno(err); 670 mlog_errno(err);
603 goto out_commit; 671 goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
775 } 843 }
776 } 844 }
777 845
778 handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS); 846 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
779 if (IS_ERR(handle)) { 847 if (IS_ERR(handle)) {
780 status = PTR_ERR(handle); 848 status = PTR_ERR(handle);
781 handle = NULL; 849 handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
783 goto leave; 851 goto leave;
784 } 852 }
785 853
786 status = ocfs2_journal_access(handle, inode, fe_bh, 854 status = ocfs2_journal_access_di(handle, inode, fe_bh,
787 OCFS2_JOURNAL_ACCESS_WRITE); 855 OCFS2_JOURNAL_ACCESS_WRITE);
788 if (status < 0) { 856 if (status < 0) {
789 mlog_errno(status); 857 mlog_errno(status);
790 goto leave; 858 goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
1181 } 1249 }
1182 } 1250 }
1183 1251
1184 handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS); 1252 handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
1185 if (IS_ERR(handle)) { 1253 if (IS_ERR(handle)) {
1186 status = PTR_ERR(handle); 1254 status = PTR_ERR(handle);
1187 handle = NULL; 1255 handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
1197 goto bail; 1265 goto bail;
1198 } 1266 }
1199 } 1267 }
1200 status = ocfs2_journal_access(handle, new_inode, newfe_bh, 1268 status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
1201 OCFS2_JOURNAL_ACCESS_WRITE); 1269 OCFS2_JOURNAL_ACCESS_WRITE);
1202 if (status < 0) { 1270 if (status < 0) {
1203 mlog_errno(status); 1271 mlog_errno(status);
1204 goto bail; 1272 goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
1244 old_inode->i_ctime = CURRENT_TIME; 1312 old_inode->i_ctime = CURRENT_TIME;
1245 mark_inode_dirty(old_inode); 1313 mark_inode_dirty(old_inode);
1246 1314
1247 status = ocfs2_journal_access(handle, old_inode, old_inode_bh, 1315 status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
1248 OCFS2_JOURNAL_ACCESS_WRITE); 1316 OCFS2_JOURNAL_ACCESS_WRITE);
1249 if (status >= 0) { 1317 if (status >= 0) {
1250 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; 1318 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
1251 1319
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
1321 (int)old_dir_nlink, old_dir->i_nlink); 1389 (int)old_dir_nlink, old_dir->i_nlink);
1322 } else { 1390 } else {
1323 struct ocfs2_dinode *fe; 1391 struct ocfs2_dinode *fe;
1324 status = ocfs2_journal_access(handle, old_dir, 1392 status = ocfs2_journal_access_di(handle, old_dir,
1325 old_dir_bh, 1393 old_dir_bh,
1326 OCFS2_JOURNAL_ACCESS_WRITE); 1394 OCFS2_JOURNAL_ACCESS_WRITE);
1327 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1395 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1328 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1396 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1329 status = ocfs2_journal_dirty(handle, old_dir_bh); 1397 status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
1496 handle_t *handle = NULL; 1564 handle_t *handle = NULL;
1497 struct ocfs2_alloc_context *inode_ac = NULL; 1565 struct ocfs2_alloc_context *inode_ac = NULL;
1498 struct ocfs2_alloc_context *data_ac = NULL; 1566 struct ocfs2_alloc_context *data_ac = NULL;
1567 struct ocfs2_alloc_context *xattr_ac = NULL;
1568 int want_clusters = 0;
1569 int xattr_credits = 0;
1570 struct ocfs2_security_xattr_info si = {
1571 .enable = 1,
1572 };
1573 int did_quota = 0, did_quota_inode = 0;
1499 1574
1500 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1575 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1501 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1576 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
1542 goto bail; 1617 goto bail;
1543 } 1618 }
1544 1619
1545 /* don't reserve bitmap space for fast symlinks. */ 1620 inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
1546 if (l > ocfs2_fast_symlink_chars(sb)) { 1621 if (!inode) {
1547 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 1622 status = -ENOMEM;
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626
1627 /* get security xattr */
1628 status = ocfs2_init_security_get(inode, dir, &si);
1629 if (status) {
1630 if (status == -EOPNOTSUPP)
1631 si.enable = 0;
1632 else {
1633 mlog_errno(status);
1634 goto bail;
1635 }
1636 }
1637
1638 /* calculate meta data/clusters for setting security xattr */
1639 if (si.enable) {
1640 status = ocfs2_calc_security_init(dir, &si, &want_clusters,
1641 &xattr_credits, &xattr_ac);
1548 if (status < 0) { 1642 if (status < 0) {
1549 if (status != -ENOSPC) 1643 mlog_errno(status);
1550 mlog_errno(status);
1551 goto bail; 1644 goto bail;
1552 } 1645 }
1553 } 1646 }
1554 1647
1555 handle = ocfs2_start_trans(osb, credits); 1648 /* don't reserve bitmap space for fast symlinks. */
1649 if (l > ocfs2_fast_symlink_chars(sb))
1650 want_clusters += 1;
1651
1652 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
1653 if (status < 0) {
1654 if (status != -ENOSPC)
1655 mlog_errno(status);
1656 goto bail;
1657 }
1658
1659 handle = ocfs2_start_trans(osb, credits + xattr_credits);
1556 if (IS_ERR(handle)) { 1660 if (IS_ERR(handle)) {
1557 status = PTR_ERR(handle); 1661 status = PTR_ERR(handle);
1558 handle = NULL; 1662 handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
1560 goto bail; 1664 goto bail;
1561 } 1665 }
1562 1666
1563 status = ocfs2_mknod_locked(osb, dir, dentry, 1667 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
1564 S_IFLNK | S_IRWXUGO, 0, 1668 * to be called. */
1565 &new_fe_bh, parent_fe_bh, handle, 1669 if (sb_any_quota_active(osb->sb) &&
1566 &inode, inode_ac); 1670 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1671 status = -EDQUOT;
1672 goto bail;
1673 }
1674 did_quota_inode = 1;
1675
1676 status = ocfs2_mknod_locked(osb, dir, inode, dentry,
1677 0, &new_fe_bh, parent_fe_bh, handle,
1678 inode_ac);
1567 if (status < 0) { 1679 if (status < 0) {
1568 mlog_errno(status); 1680 mlog_errno(status);
1569 goto bail; 1681 goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
1576 u32 offset = 0; 1688 u32 offset = 0;
1577 1689
1578 inode->i_op = &ocfs2_symlink_inode_operations; 1690 inode->i_op = &ocfs2_symlink_inode_operations;
1691 if (vfs_dq_alloc_space_nodirty(inode,
1692 ocfs2_clusters_to_bytes(osb->sb, 1))) {
1693 status = -EDQUOT;
1694 goto bail;
1695 }
1696 did_quota = 1;
1579 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1697 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1580 new_fe_bh, 1698 new_fe_bh,
1581 handle, data_ac, NULL, 1699 handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
1614 } 1732 }
1615 } 1733 }
1616 1734
1735 if (si.enable) {
1736 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
1737 xattr_ac, data_ac);
1738 if (status < 0) {
1739 mlog_errno(status);
1740 goto bail;
1741 }
1742 }
1743
1617 status = ocfs2_add_entry(handle, dentry, inode, 1744 status = ocfs2_add_entry(handle, dentry, inode,
1618 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1745 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1619 de_bh); 1746 de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
1632 dentry->d_op = &ocfs2_dentry_ops; 1759 dentry->d_op = &ocfs2_dentry_ops;
1633 d_instantiate(dentry, inode); 1760 d_instantiate(dentry, inode);
1634bail: 1761bail:
1762 if (status < 0 && did_quota)
1763 vfs_dq_free_space_nodirty(inode,
1764 ocfs2_clusters_to_bytes(osb->sb, 1));
1765 if (status < 0 && did_quota_inode)
1766 vfs_dq_free_inode(inode);
1635 if (handle) 1767 if (handle)
1636 ocfs2_commit_trans(osb, handle); 1768 ocfs2_commit_trans(osb, handle);
1637 1769
@@ -1640,12 +1772,18 @@ bail:
1640 brelse(new_fe_bh); 1772 brelse(new_fe_bh);
1641 brelse(parent_fe_bh); 1773 brelse(parent_fe_bh);
1642 brelse(de_bh); 1774 brelse(de_bh);
1775 kfree(si.name);
1776 kfree(si.value);
1643 if (inode_ac) 1777 if (inode_ac)
1644 ocfs2_free_alloc_context(inode_ac); 1778 ocfs2_free_alloc_context(inode_ac);
1645 if (data_ac) 1779 if (data_ac)
1646 ocfs2_free_alloc_context(data_ac); 1780 ocfs2_free_alloc_context(data_ac);
1647 if ((status < 0) && inode) 1781 if (xattr_ac)
1782 ocfs2_free_alloc_context(xattr_ac);
1783 if ((status < 0) && inode) {
1784 clear_nlink(inode);
1648 iput(inode); 1785 iput(inode);
1786 }
1649 1787
1650 mlog_exit(status); 1788 mlog_exit(status);
1651 1789
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1754 1892
1755 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1893 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1756 1894
1757 status = ocfs2_read_block(orphan_dir_inode, 1895 status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
1758 OCFS2_I(orphan_dir_inode)->ip_blkno,
1759 &orphan_dir_bh);
1760 if (status < 0) { 1896 if (status < 0) {
1761 mlog_errno(status); 1897 mlog_errno(status);
1762 goto leave; 1898 goto leave;
1763 } 1899 }
1764 1900
1765 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, 1901 status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
1766 OCFS2_JOURNAL_ACCESS_WRITE); 1902 OCFS2_JOURNAL_ACCESS_WRITE);
1767 if (status < 0) { 1903 if (status < 0) {
1768 mlog_errno(status); 1904 mlog_errno(status);
1769 goto leave; 1905 goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1850 goto leave; 1986 goto leave;
1851 } 1987 }
1852 1988
1853 status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, 1989 status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh,
1854 OCFS2_JOURNAL_ACCESS_WRITE); 1990 OCFS2_JOURNAL_ACCESS_WRITE);
1855 if (status < 0) { 1991 if (status < 0) {
1856 mlog_errno(status); 1992 mlog_errno(status);
1857 goto leave; 1993 goto leave;
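
A pattern worth noting across the namei.c changes: the inode (and, for long symlinks, a cluster of space) is charged to quota before the real work, guarded by the did_quota_inode/did_quota flags, and refunded on any failure path before the handle is committed. A toy restatement of the charge-then-roll-back shape; the quota calls below are invented names, not the kernel API:

#include <stdio.h>

static int quota_alloc_inode(void)  { printf("charge 1 inode to quota\n"); return 0; }
static void quota_free_inode(void)  { printf("refund 1 inode to quota\n"); }
static int do_real_work(void)       { return -1; /* simulate a failure */ }

int main(void)
{
        int did_quota_inode = 0;
        int status;

        status = quota_alloc_inode();
        if (status < 0)
                return 1;               /* the kernel returns -EDQUOT here */
        did_quota_inode = 1;

        status = do_real_work();

        /* failure path: release the charge before tearing everything down */
        if (status < 0 && did_quota_inode)
                quota_free_inode();
        return status < 0;
}
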
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d8992..ad5c24a29edd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
161{ 161{
162 VOLUME_INIT = 0, 162 VOLUME_INIT = 0,
163 VOLUME_MOUNTED, 163 VOLUME_MOUNTED,
164 VOLUME_MOUNTED_QUOTAS,
164 VOLUME_DISMOUNTED, 165 VOLUME_DISMOUNTED,
165 VOLUME_DISABLED 166 VOLUME_DISABLED
166}; 167};
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
195 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 196 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
196 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ 197 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
197 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ 198 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
199 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
200 OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
201 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
198}; 202};
199 203
200#define OCFS2_OSB_SOFT_RO 0x0001 204#define OCFS2_OSB_SOFT_RO 0x0001
@@ -205,6 +209,7 @@ enum ocfs2_mount_options
205struct ocfs2_journal; 209struct ocfs2_journal;
206struct ocfs2_slot_info; 210struct ocfs2_slot_info;
207struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_quota_recovery;
208struct ocfs2_super 213struct ocfs2_super
209{ 214{
210 struct task_struct *commit_task; 215 struct task_struct *commit_task;
@@ -286,10 +291,11 @@ struct ocfs2_super
286 char *local_alloc_debug_buf; 291 char *local_alloc_debug_buf;
287#endif 292#endif
288 293
289 /* Next two fields are for local node slot recovery during 294 /* Next three fields are for local node slot recovery during
290 * mount. */ 295 * mount. */
291 int dirty; 296 int dirty;
292 struct ocfs2_dinode *local_alloc_copy; 297 struct ocfs2_dinode *local_alloc_copy;
298 struct ocfs2_quota_recovery *quota_rec;
293 299
294 struct ocfs2_alloc_stats alloc_stats; 300 struct ocfs2_alloc_stats alloc_stats;
295 char dev_str[20]; /* "major,minor" of the device */ 301 char dev_str[20]; /* "major,minor" of the device */
@@ -333,6 +339,10 @@ struct ocfs2_super
333 339
334#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 340#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
335 341
342/* Useful typedef for passing around journal access functions */
343typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
344 struct buffer_head *bh, int type);
345
336static inline int ocfs2_should_order_data(struct inode *inode) 346static inline int ocfs2_should_order_data(struct inode *inode)
337{ 347{
338 if (!S_ISREG(inode->i_mode)) 348 if (!S_ISREG(inode->i_mode))
@@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
376 return 0; 386 return 0;
377} 387}
378 388
389static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
390{
391 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
392 return 1;
393 return 0;
394}
395
379/* set / clear functions because cluster events can make these happen 396/* set / clear functions because cluster events can make these happen
380 * in parallel so we want the transitions to be atomic. this also 397 * in parallel so we want the transitions to be atomic. this also
381 * means that any future flags osb_flags must be protected by spinlock 398 * means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
443#define OCFS2_IS_VALID_DINODE(ptr) \ 460#define OCFS2_IS_VALID_DINODE(ptr) \
444 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) 461 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
445 462
446#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
447 typeof(__di) ____di = (__di); \
448 ocfs2_error((__sb), \
449 "Dinode # %llu has bad signature %.*s", \
450 (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \
451 (____di)->i_signature); \
452} while (0)
453
454#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ 463#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
455 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) 464 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
456 465
457#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
458 typeof(__eb) ____eb = (__eb); \
459 ocfs2_error((__sb), \
460 "Extent Block # %llu has bad signature %.*s", \
461 (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \
462 (____eb)->h_signature); \
463} while (0)
464
465#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ 466#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
466 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) 467 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
467 468
468#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
469 typeof(__gd) ____gd = (__gd); \
470 ocfs2_error((__sb), \
471 "Group Descriptor # %llu has bad signature %.*s", \
472 (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
473 (____gd)->bg_signature); \
474} while (0)
475 469
476#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ 470#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \
477 (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) 471 (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
478 472
473#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
474 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
475
479static inline unsigned long ino_from_blkno(struct super_block *sb, 476static inline unsigned long ino_from_blkno(struct super_block *sb,
480 u64 blkno) 477 u64 blkno)
481{ 478{
@@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
632#define ocfs2_clear_bit ext2_clear_bit 629#define ocfs2_clear_bit ext2_clear_bit
633#define ocfs2_test_bit ext2_test_bit 630#define ocfs2_test_bit ext2_test_bit
634#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 631#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
632#define ocfs2_find_next_bit ext2_find_next_bit
635#endif /* OCFS2_H */ 633#endif /* OCFS2_H */
636 634
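
The new ocfs2_journal_access_func typedef in ocfs2.h lets generic code hold per-block-type journal access functions behind one signature. A compilable sketch of that dispatch, with placeholder types and bodies:

#include <stdio.h>

typedef struct handle handle_t;         /* opaque, as in the kernel */
struct inode;
struct buffer_head;

typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
                                         struct buffer_head *bh, int type);

static int access_dinode(handle_t *h, struct inode *i,
                         struct buffer_head *b, int type)
{
        (void)h; (void)i; (void)b;
        printf("dinode access, type %d\n", type);
        return 0;
}

static int access_group_desc(handle_t *h, struct inode *i,
                             struct buffer_head *b, int type)
{
        (void)h; (void)i; (void)b;
        printf("group descriptor access, type %d\n", type);
        return 0;
}

int main(void)
{
        ocfs2_journal_access_func fns[] = { access_dinode, access_group_desc };
        for (int k = 0; k < 2; k++)
                fns[k](NULL, NULL, NULL, 1 /* e.g. a WRITE access type */);
        return 0;
}
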
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7d..c7ae45aaa36c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" 65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
68 69
69/* Compatibility flags */ 70/* Compatibility flags */
70#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -93,8 +94,11 @@
93 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ 94 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
94 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
95 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
96 | OCFS2_FEATURE_INCOMPAT_XATTR) 97 | OCFS2_FEATURE_INCOMPAT_XATTR \
97#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 98 | OCFS2_FEATURE_INCOMPAT_META_ECC)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
98 102
99/* 103/*
100 * Heartbeat-only devices are missing journals and other files. The 104 * Heartbeat-only devices are missing journals and other files. The
@@ -147,6 +151,9 @@
147/* Support for extended attributes */ 151/* Support for extended attributes */
148#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
149 153
154/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156
150/* 157/*
151 * backup superblock flag is used to indicate that this volume 158 * backup superblock flag is used to indicate that this volume
152 * has backup superblocks. 159 * has backup superblocks.
@@ -163,6 +170,12 @@
163 */ 170 */
164#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 171#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
165 172
173/*
174 * Maintain quota information for this filesystem
175 */
176#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
177#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
178
166/* The byte offset of the first backup block will be 1G. 179/* The byte offset of the first backup block will be 1G.
167 * The following will be 4G, 16G, 64G, 256G and 1T. 180 * The following will be 4G, 16G, 64G, 256G and 1T.
168 */ 181 */
@@ -192,6 +205,7 @@
192#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ 205#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
193#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 206#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
194#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 207#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
208#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
195 209
196/* 210/*
197 * Flags on ocfs2_dinode.i_dyn_features 211 * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
329#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE 343#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
330 HEARTBEAT_SYSTEM_INODE, 344 HEARTBEAT_SYSTEM_INODE,
331 GLOBAL_BITMAP_SYSTEM_INODE, 345 GLOBAL_BITMAP_SYSTEM_INODE,
332#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE 346 USER_QUOTA_SYSTEM_INODE,
347 GROUP_QUOTA_SYSTEM_INODE,
348#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
333 ORPHAN_DIR_SYSTEM_INODE, 349 ORPHAN_DIR_SYSTEM_INODE,
334 EXTENT_ALLOC_SYSTEM_INODE, 350 EXTENT_ALLOC_SYSTEM_INODE,
335 INODE_ALLOC_SYSTEM_INODE, 351 INODE_ALLOC_SYSTEM_INODE,
336 JOURNAL_SYSTEM_INODE, 352 JOURNAL_SYSTEM_INODE,
337 LOCAL_ALLOC_SYSTEM_INODE, 353 LOCAL_ALLOC_SYSTEM_INODE,
338 TRUNCATE_LOG_SYSTEM_INODE, 354 TRUNCATE_LOG_SYSTEM_INODE,
355 LOCAL_USER_QUOTA_SYSTEM_INODE,
356 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
339 NUM_SYSTEM_INODES 357 NUM_SYSTEM_INODES
340}; 358};
341 359
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
349 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, 367 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
350 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, 368 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
351 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, 369 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
370 [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
371 [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
352 372
353 /* Slot-specific system inodes (one copy per slot) */ 373 /* Slot-specific system inodes (one copy per slot) */
354 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, 374 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
356 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, 376 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
357 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, 377 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
358 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, 378 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
359 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } 379 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
380 [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
381 [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
360}; 382};
361 383
362/* Parameter passed from mount.ocfs2 to module */ 384/* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
410#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) 432#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
411 433
412/* 434/*
435 * Block checking structure. This is used in metadata to validate the
436 * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
437 * zeros.
438 */
439struct ocfs2_block_check {
440/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
441 __le16 bc_ecc; /* Single-error-correction parity vector.
442 This is a simple Hamming code dependant
443 on the blocksize. OCFS2's maximum
444 blocksize, 4K, requires 16 parity bits,
445 so we fit in __le16. */
446 __le16 bc_reserved1;
447/*08*/
448};
449
450/*
413 * On disk extent record for OCFS2 451 * On disk extent record for OCFS2
414 * It describes a range of clusters on disk. 452 * It describes a range of clusters on disk.
415 * 453 *
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
496struct ocfs2_extent_block 534struct ocfs2_extent_block
497{ 535{
498/*00*/ __u8 h_signature[8]; /* Signature for verification */ 536/*00*/ __u8 h_signature[8]; /* Signature for verification */
499 __le64 h_reserved1; 537 struct ocfs2_block_check h_check; /* Error checking */
500/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this 538/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
501 extent_header belongs to */ 539 extent_header belongs to */
502 __le16 h_suballoc_bit; /* Bit offset in suballocator 540 __le16 h_suballoc_bit; /* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
666 was set in i_flags */ 704 was set in i_flags */
667 __le16 i_dyn_features; 705 __le16 i_dyn_features;
668 __le64 i_xattr_loc; 706 __le64 i_xattr_loc;
669/*80*/ __le64 i_reserved2[7]; 707/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6];
670/*B8*/ union { 709/*B8*/ union {
671 __le64 i_pad1; /* Generic way to refer to this 710 __le64 i_pad1; /* Generic way to refer to this
672 64bit union */ 711 64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
715} __attribute__ ((packed)); 754} __attribute__ ((packed));
716 755
717/* 756/*
757 * Per-block record for the unindexed directory btree. This is carefully
758 * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
759 * mirrored. That way, the directory manipulation code needs a minimal amount
760 * of update.
761 *
762 * NOTE: Keep this structure aligned to a multiple of 4 bytes.
763 */
764struct ocfs2_dir_block_trailer {
765/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */
766
767 __le16 db_compat_rec_len; /* Backwards compatible with
768 * ocfs2_dir_entry. */
769 __u8 db_compat_name_len; /* Always zero. Was name_len */
770 __u8 db_reserved0;
771 __le16 db_reserved1;
772 __le16 db_free_rec_len; /* Size of largest empty hole
773 * in this block. (unused) */
774/*10*/ __u8 db_signature[8]; /* Signature for verification */
775 __le64 db_reserved2;
776 __le64 db_free_next; /* Next block in list (unused) */
777/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
778 __le64 db_parent_dinode; /* dinode which owns me, in
779 blocks */
780/*30*/ struct ocfs2_block_check db_check; /* Error checking */
781/*40*/
782};
783
784/*
718 * On disk allocator group structure for OCFS2 785 * On disk allocator group structure for OCFS2
719 */ 786 */
720struct ocfs2_group_desc 787struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
733/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in 800/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
734 blocks */ 801 blocks */
735 __le64 bg_blkno; /* Offset on disk, in blocks */ 802 __le64 bg_blkno; /* Offset on disk, in blocks */
736/*30*/ __le64 bg_reserved2[2]; 803/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
804 __le64 bg_reserved2;
737/*40*/ __u8 bg_bitmap[0]; 805/*40*/ __u8 bg_bitmap[0];
738}; 806};
739 807
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
776 in this extent record, 844 in this extent record,
777 only valid in the first 845 only valid in the first
778 bucket. */ 846 bucket. */
779 __le64 xh_csum; 847 struct ocfs2_block_check xh_check; /* Error checking
848 (Note, this is only
849 used for xattr
850 buckets. A block uses
851 xb_check and sets
852 this field to zero.) */
780 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ 853 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
781}; 854};
782 855
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
827 block group */ 900 block group */
828 __le32 xb_fs_generation; /* Must match super block */ 901 __le32 xb_fs_generation; /* Must match super block */
829/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ 902/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
830 __le64 xb_csum; 903 struct ocfs2_block_check xb_check; /* Error checking */
831/*20*/ __le16 xb_flags; /* Indicates whether this block contains 904/*20*/ __le16 xb_flags; /* Indicates whether this block contains
832 real xattr or a xattr tree. */ 905 real xattr or a xattr tree. */
833 __le16 xb_reserved0; 906 __le16 xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
868 return xe->xe_type & OCFS2_XATTR_TYPE_MASK; 941 return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
869} 942}
870 943
944/*
945 * On disk structures for global quota file
946 */
947
948/* Magic numbers and known versions for global quota files */
949#define OCFS2_GLOBAL_QMAGICS {\
950 0x0cf52470, /* USRQUOTA */ \
951 0x0cf52471 /* GRPQUOTA */ \
952}
953
954#define OCFS2_GLOBAL_QVERSIONS {\
955 0, \
956 0, \
957}
958
959
960/* Each block of each quota file has a certain fixed number of bytes reserved
961 * for OCFS2 internal use at its end. OCFS2 can use it for things like
962 * checksums, etc. */
963#define OCFS2_QBLK_RESERVED_SPACE 8
964
965/* Generic header of all quota files */
966struct ocfs2_disk_dqheader {
967 __le32 dqh_magic; /* Magic number identifying file */
968 __le32 dqh_version; /* Quota format version */
969};
970
971#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
972
973/* Information header of global quota file (immediately follows the generic
974 * header) */
975struct ocfs2_global_disk_dqinfo {
976/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */
977 __le32 dqi_igrace; /* Grace time for inode softlimit excess */
978 __le32 dqi_syncms; /* Time after which we sync local changes to
979 * global quota file */
980 __le32 dqi_blocks; /* Number of blocks in quota file */
981/*10*/ __le32 dqi_free_blk; /* First free block in quota file */
982 __le32 dqi_free_entry; /* First block with free dquot entry in quota
983 * file */
984};
985
986/* Structure with global user / group information. We reserve some space
987 * for future use. */
988struct ocfs2_global_disk_dqblk {
989/*00*/ __le32 dqb_id; /* ID the structure belongs to */
990 __le32 dqb_use_count; /* Number of nodes having reference to this structure */
991 __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
992/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */
993 __le64 dqb_curinodes; /* current # allocated inodes */
994/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */
995 __le64 dqb_bsoftlimit; /* preferred limit on disk space */
996/*30*/ __le64 dqb_curspace; /* current space occupied */
997 __le64 dqb_btime; /* time limit for excessive disk use */
998/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */
999 __le64 dqb_pad1;
1000/*50*/ __le64 dqb_pad2;
1001};
1002
1003/*
1004 * On-disk structures for local quota file
1005 */
1006
1007/* Magic numbers and known versions for local quota files */
1008#define OCFS2_LOCAL_QMAGICS {\
1009 0x0cf524c0, /* USRQUOTA */ \
1010 0x0cf524c1 /* GRPQUOTA */ \
1011}
1012
1013#define OCFS2_LOCAL_QVERSIONS {\
1014 0, \
1015 0, \
1016}
1017
1018/* Quota flags in dqinfo header */
1019#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\
1020 * quota has been cleanly turned off) */
1021
1022#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
1023
1024/* Information header of local quota file (immediately follows the generic
1025 * header) */
1026struct ocfs2_local_disk_dqinfo {
1027 __le32 dqi_flags; /* Flags for quota file */
1028 __le32 dqi_chunks; /* Number of chunks of quota structures
1029 * with a bitmap */
1030 __le32 dqi_blocks; /* Number of blocks allocated for quota file */
1031};
1032
1033/* Header of one chunk of a quota file */
1034struct ocfs2_local_disk_chunk {
1035 __le32 dqc_free; /* Number of free entries in the bitmap */
1036 u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
1037 * chunk of quota file */
1038};
1039
1040/* One entry in local quota file */
1041struct ocfs2_local_disk_dqblk {
1042/*00*/ __le64 dqb_id; /* id this quota applies to */
1043 __le64 dqb_spacemod; /* Change in the amount of used space */
1044/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */
1045};
1046
1047
1048/*
1049 * The quota trailer lives at the end of each quota block.
1050 */
1051
1052struct ocfs2_disk_dqtrailer {
1053/*00*/ struct ocfs2_block_check dq_check; /* Error checking */
1054/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
1055};
1056
1057static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
1058 void *buf)
1059{
1060 char *ptr = buf;
1061 ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
1062
1063 return (struct ocfs2_disk_dqtrailer *)ptr;
1064}
1065
871#ifdef __KERNEL__ 1066#ifdef __KERNEL__
872static inline int ocfs2_fast_symlink_chars(struct super_block *sb) 1067static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
873{ 1068{
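
ocfs2_block_dqtrailer() above is plain pointer arithmetic: the last OCFS2_QBLK_RESERVED_SPACE (8) bytes of every quota block hold the ocfs2_block_check trailer. A userspace restatement of the same computation on a 4K block, with the structures abbreviated:

#include <stdio.h>
#include <stdint.h>

#define QBLK_RESERVED_SPACE 8           /* OCFS2_QBLK_RESERVED_SPACE */

struct disk_dqtrailer {                 /* mirrors ocfs2_disk_dqtrailer */
        uint32_t crc32e;
        uint16_t ecc;
        uint16_t reserved;
};

static struct disk_dqtrailer *block_dqtrailer(int blocksize, void *buf)
{
        char *ptr = buf;
        ptr += blocksize - QBLK_RESERVED_SPACE; /* last 8 bytes of block */
        return (struct disk_dqtrailer *)ptr;
}

int main(void)
{
        static char block[4096];
        struct disk_dqtrailer *t = block_dqtrailer(sizeof(block), block);
        printf("trailer at offset %ld of %zu\n",
               (long)((char *)t - block), sizeof(block));
        return 0;
}
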
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f558..000000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_jbd_compat.h
5 *
6 * Compatibility defines for JBD.
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_JBD_COMPAT_H
21#define OCFS2_JBD_COMPAT_H
22
23#ifndef CONFIG_OCFS2_COMPAT_JBD
24# error Should not have been included
25#endif
26
27struct jbd2_inode {
28 unsigned int dummy;
29};
30
31#define JBD2_BARRIER JFS_BARRIER
32#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
33
34#define jbd2_journal_ack_err journal_ack_err
35#define jbd2_journal_clear_err journal_clear_err
36#define jbd2_journal_destroy journal_destroy
37#define jbd2_journal_dirty_metadata journal_dirty_metadata
38#define jbd2_journal_errno journal_errno
39#define jbd2_journal_extend journal_extend
40#define jbd2_journal_flush journal_flush
41#define jbd2_journal_force_commit journal_force_commit
42#define jbd2_journal_get_write_access journal_get_write_access
43#define jbd2_journal_get_undo_access journal_get_undo_access
44#define jbd2_journal_init_inode journal_init_inode
45#define jbd2_journal_invalidatepage journal_invalidatepage
46#define jbd2_journal_load journal_load
47#define jbd2_journal_lock_updates journal_lock_updates
48#define jbd2_journal_restart journal_restart
49#define jbd2_journal_start journal_start
50#define jbd2_journal_start_commit journal_start_commit
51#define jbd2_journal_stop journal_stop
52#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
53#define jbd2_journal_unlock_updates journal_unlock_updates
54#define jbd2_journal_wipe journal_wipe
55#define jbd2_log_wait_commit log_wait_commit
56
57static inline int jbd2_journal_file_inode(handle_t *handle,
58 struct jbd2_inode *inode)
59{
60 return 0;
61}
62
63static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
64 loff_t new_size)
65{
66 return 0;
67}
68
69static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
70 struct inode *inode)
71{
72 return;
73}
74
75static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
76 struct jbd2_inode *jinode)
77{
78 return;
79}
80
81
82#endif /* OCFS2_JBD_COMPAT_H */
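
The deleted ocfs2_jbd_compat.h worked by plain macro aliasing: jbd2-named calls compiled down to the older JBD functions, with empty inline stubs for the few APIs JBD lacked. With ocfs2 now requiring JBD2 outright, the shim goes away. The aliasing pattern itself, as a runnable two-function illustration (names invented):

#include <stdio.h>

static int journal_start_old(int nblocks)
{
        printf("old JBD-style API, %d credits\n", nblocks);
        return 0;
}

/* what the compat header did, name-for-name, for a few dozen functions */
#define journal_start_new journal_start_old

int main(void)
{
        return journal_start_new(8);
}
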
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f1..eb6f50c9ceca 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO,
49 OCFS2_NUM_LOCK_TYPES 50 OCFS2_NUM_LOCK_TYPES
50}; 51};
51 52
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
77 case OCFS2_LOCK_TYPE_FLOCK: 78 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F'; 79 c = 'F';
79 break; 80 break;
81 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q';
83 break;
80 default: 84 default:
81 c = '\0'; 85 c = '\0';
82 } 86 }
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 99 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
96 [OCFS2_LOCK_TYPE_OPEN] = "Open", 100 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock", 101 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
102 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
98}; 103};
99 104
100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 105static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
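
Adding OCFS2_LOCK_TYPE_QINFO touches three places that must stay in lockstep: the enum, the debug-character switch, and the string table. A compact sketch of keeping such parallel tables aligned (the array-based form is an alternative to the kernel's switch, shown for illustration):

#include <stdio.h>

enum lock_type { LT_META, LT_FLOCK, LT_QINFO, LT_NUM };

static const char lock_char[LT_NUM] = { 'M', 'F', 'Q' };
static const char *lock_name[LT_NUM] = { "Meta", "Flock", "Quota" };

int main(void)
{
        for (int t = 0; t < LT_NUM; t++)
                printf("%c = %s\n", lock_char[t], lock_name[t]);
        return 0;
}
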
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 000000000000..7365e2e08706
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
1/*
2 * quota.h for OCFS2
3 *
4 * On disk quota structures for local and global quota file, in-memory
5 * structures.
6 *
7 */
8
9#ifndef _OCFS2_QUOTA_H
10#define _OCFS2_QUOTA_H
11
12#include <linux/types.h>
13#include <linux/slab.h>
14#include <linux/quota.h>
15#include <linux/list.h>
16#include <linux/dqblk_qtree.h>
17
18#include "ocfs2.h"
19
20/* Common stuff */
21/* id number of quota format */
22#define QFMT_OCFS2 3
23
24/*
25 * In-memory structures
26 */
27struct ocfs2_dquot {
28 struct dquot dq_dquot; /* Generic VFS dquot */
29 loff_t dq_local_off; /* Offset in the local quota file */
30 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
31 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
32 s64 dq_origspace; /* Last globally synced space usage */
33 s64 dq_originodes; /* Last globally synced inode usage */
34};
35
36/* Description of one chunk to recover in memory */
37struct ocfs2_recovery_chunk {
38 struct list_head rc_list; /* List of chunks */
39 int rc_chunk; /* Chunk number */
40 unsigned long *rc_bitmap; /* Bitmap of entries to recover */
41};
42
43struct ocfs2_quota_recovery {
44 struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */
45};
46
47/* In-memory structure with quota header information */
48struct ocfs2_mem_dqinfo {
49 unsigned int dqi_type; /* Quota type this structure describes */
50 unsigned int dqi_chunks; /* Number of chunks in local quota file */
51 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
52 unsigned int dqi_syncms; /* How often should we sync with other nodes */
53 unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
54 struct list_head dqi_chunk; /* List of chunks */
55 struct inode *dqi_gqinode; /* Global quota file inode */
56 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
57 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
58 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
59 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
60 struct buffer_head *dqi_ibh; /* Buffer with information header */
61 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
62 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
63 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
64 * information, in case we
65 * enable quotas on file
66 * needing it */
67};
68
69static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
70{
71 return container_of(dquot, struct ocfs2_dquot, dq_dquot);
72}
73
74struct ocfs2_quota_chunk {
75 struct list_head qc_chunk; /* List of quotafile chunks */
76 int qc_num; /* Number of quota chunk */
77 struct buffer_head *qc_headerbh; /* Buffer head with chunk header */
78};
79
80extern struct kmem_cache *ocfs2_dquot_cachep;
81extern struct kmem_cache *ocfs2_qf_chunk_cachep;
82
83extern struct qtree_fmt_operations ocfs2_global_ops;
84
85struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
86 struct ocfs2_super *osb, int slot_num);
87int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
88 struct ocfs2_quota_recovery *rec,
89 int slot_num);
90void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
91ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
92 size_t len, loff_t off);
93ssize_t ocfs2_quota_write(struct super_block *sb, int type,
94 const char *data, size_t len, loff_t off);
95int ocfs2_global_read_info(struct super_block *sb, int type);
96int ocfs2_global_write_info(struct super_block *sb, int type);
97int ocfs2_global_read_dquot(struct dquot *dquot);
98int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
99static inline int ocfs2_sync_dquot(struct dquot *dquot)
100{
101 return __ocfs2_sync_dquot(dquot, 0);
102}
103static inline int ocfs2_global_release_dquot(struct dquot *dquot)
104{
105 return __ocfs2_sync_dquot(dquot, 1);
106}
107
108int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
109void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
110int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
111 struct buffer_head **bh);
112
113extern struct dquot_operations ocfs2_quota_operations;
114extern struct quota_format_type ocfs2_quota_format;
115
116int ocfs2_quota_setup(void);
117void ocfs2_quota_shutdown(void);
118
119#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000000..6aff8f2d3e49
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,1025 @@
1/*
2 * Implementation of operations over global quota file
3 */
4#include <linux/spinlock.h>
5#include <linux/fs.h>
6#include <linux/quota.h>
7#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h>
9#include <linux/jiffies.h>
10#include <linux/writeback.h>
11#include <linux/workqueue.h>
12
13#define MLOG_MASK_PREFIX ML_QUOTA
14#include <cluster/masklog.h>
15
16#include "ocfs2_fs.h"
17#include "ocfs2.h"
18#include "alloc.h"
19#include "blockcheck.h"
20#include "inode.h"
21#include "journal.h"
22#include "file.h"
23#include "sysfile.h"
24#include "dlmglue.h"
25#include "uptodate.h"
26#include "quota.h"
27
28static struct workqueue_struct *ocfs2_quota_wq = NULL;
29
30static void qsync_work_fn(struct work_struct *work);
31
32static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
33{
34 struct ocfs2_global_disk_dqblk *d = dp;
35 struct mem_dqblk *m = &dquot->dq_dqb;
36
37 /* Update from disk only entries not set by the admin */
38 if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
39 m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
40 m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
41 }
42 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
43 m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
44 if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
45 m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
46 m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
47 }
48 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
49 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
50 if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
51 m->dqb_btime = le64_to_cpu(d->dqb_btime);
52 if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
53 m->dqb_itime = le64_to_cpu(d->dqb_itime);
54 OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
55}
56
57static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
58{
59 struct ocfs2_global_disk_dqblk *d = dp;
60 struct mem_dqblk *m = &dquot->dq_dqb;
61
62 d->dqb_id = cpu_to_le32(dquot->dq_id);
63 d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
64 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
65 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
66 d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
67 d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
68 d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
69 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
70 d->dqb_btime = cpu_to_le64(m->dqb_btime);
71 d->dqb_itime = cpu_to_le64(m->dqb_itime);
72}
73
74static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
75{
76 struct ocfs2_global_disk_dqblk *d = dp;
77 struct ocfs2_mem_dqinfo *oinfo =
78 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
79
80 if (qtree_entry_unused(&oinfo->dqi_gi, dp))
81 return 0;
82 return le32_to_cpu(d->dqb_id) == dquot->dq_id;
83}
84
85struct qtree_fmt_operations ocfs2_global_ops = {
86 .mem2disk_dqblk = ocfs2_global_mem2diskdqb,
87 .disk2mem_dqblk = ocfs2_global_disk2memdqb,
88 .is_id = ocfs2_global_is_id,
89};
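/* A rough sketch of how this table is used (inferred from the calls visible
 * below, not a statement of the full quota-tree contract): the generic
 * quota-tree code stays format agnostic by calling back through these ops.
 * qtree_read_dquot() and qtree_write_dquot() walk the tree of blocks and use
 * disk2mem_dqblk/mem2disk_dqblk to convert between the little-endian on-disk
 * entry and struct mem_dqblk, while is_id tests whether a given on-disk
 * entry belongs to the dquot being looked up. */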
90
91static int ocfs2_validate_quota_block(struct super_block *sb,
92 struct buffer_head *bh)
93{
94 struct ocfs2_disk_dqtrailer *dqt =
95 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
96
97 mlog(0, "Validating quota block %llu\n",
98 (unsigned long long)bh->b_blocknr);
99
100 BUG_ON(!buffer_uptodate(bh));
101
102 /*
103 * If the ecc fails, we return the error but otherwise
104 * leave the filesystem running. We know any error is
105 * local to this block.
106 */
107 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
108}
109
110int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
111 struct buffer_head **bh)
112{
113 int rc = 0;
114 struct buffer_head *tmp = *bh;
115
116 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
117 ocfs2_validate_quota_block);
118 if (rc)
119 mlog_errno(rc);
120
121 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
122 if (!rc && !*bh)
123 *bh = tmp;
124
125 return rc;
126}
127
128static int ocfs2_get_quota_block(struct inode *inode, int block,
129 struct buffer_head **bh)
130{
131 u64 pblock, pcount;
132 int err;
133
134 down_read(&OCFS2_I(inode)->ip_alloc_sem);
135 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
136 up_read(&OCFS2_I(inode)->ip_alloc_sem);
137 if (err) {
138 mlog_errno(err);
139 return err;
140 }
141 *bh = sb_getblk(inode->i_sb, pblock);
142 if (!*bh) {
143 err = -EIO;
144 mlog_errno(err);
145 }
 146	return err;
147}
148
 149/* Read data from the global quota file - avoid the pagecache and such
 150 * because we cannot afford to acquire the locks... We use the quota
 151 * cluster lock to serialize operations; the caller must acquire it. */
152ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
153 size_t len, loff_t off)
154{
155 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
156 struct inode *gqinode = oinfo->dqi_gqinode;
157 loff_t i_size = i_size_read(gqinode);
158 int offset = off & (sb->s_blocksize - 1);
159 sector_t blk = off >> sb->s_blocksize_bits;
160 int err = 0;
161 struct buffer_head *bh;
162 size_t toread, tocopy;
163
164 if (off > i_size)
165 return 0;
166 if (off + len > i_size)
167 len = i_size - off;
168 toread = len;
169 while (toread > 0) {
170 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
171 bh = NULL;
172 err = ocfs2_read_quota_block(gqinode, blk, &bh);
173 if (err) {
174 mlog_errno(err);
175 return err;
176 }
177 memcpy(data, bh->b_data + offset, tocopy);
178 brelse(bh);
179 offset = 0;
180 toread -= tocopy;
181 data += tocopy;
182 blk++;
183 }
184 return len;
185}
186
 187/* Write to the quota file (we know the transaction is already started
 188 * and has enough credits) */
189ssize_t ocfs2_quota_write(struct super_block *sb, int type,
190 const char *data, size_t len, loff_t off)
191{
192 struct mem_dqinfo *info = sb_dqinfo(sb, type);
193 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
194 struct inode *gqinode = oinfo->dqi_gqinode;
195 int offset = off & (sb->s_blocksize - 1);
196 sector_t blk = off >> sb->s_blocksize_bits;
197 int err = 0, new = 0, ja_type;
198 struct buffer_head *bh = NULL;
199 handle_t *handle = journal_current_handle();
200
201 if (!handle) {
202 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
203 "because transaction was not started.\n",
204 (unsigned long long)off, (unsigned long long)len);
205 return -EIO;
206 }
207 if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
208 WARN_ON(1);
209 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
210 }
211
212 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
213 if (gqinode->i_size < off + len) {
214 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
215 err = ocfs2_extend_no_holes(gqinode, off + len, off);
216 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
217 if (err < 0)
218 goto out;
219 err = ocfs2_simple_size_update(gqinode,
220 oinfo->dqi_gqi_bh,
221 off + len);
222 if (err < 0)
223 goto out;
224 new = 1;
225 }
226 /* Not rewriting whole block? */
227 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
228 !new) {
229 err = ocfs2_read_quota_block(gqinode, blk, &bh);
230 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
231 } else {
232 err = ocfs2_get_quota_block(gqinode, blk, &bh);
233 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
234 }
235 if (err) {
236 mlog_errno(err);
 237		goto out;	/* must not leak gqinode->i_mutex on error */
238 }
239 lock_buffer(bh);
240 if (new)
241 memset(bh->b_data, 0, sb->s_blocksize);
242 memcpy(bh->b_data + offset, data, len);
243 flush_dcache_page(bh->b_page);
244 set_buffer_uptodate(bh);
245 unlock_buffer(bh);
246 ocfs2_set_buffer_uptodate(gqinode, bh);
247 err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
248 if (err < 0) {
249 brelse(bh);
250 goto out;
251 }
252 err = ocfs2_journal_dirty(handle, bh);
253 brelse(bh);
254 if (err < 0)
255 goto out;
256out:
257 if (err) {
258 mutex_unlock(&gqinode->i_mutex);
259 mlog_errno(err);
260 return err;
261 }
262 gqinode->i_version++;
263 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
264 mutex_unlock(&gqinode->i_mutex);
265 return len;
266}
267
268int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
269{
270 int status;
271 struct buffer_head *bh = NULL;
272
273 status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
274 if (status < 0)
275 return status;
276 spin_lock(&dq_data_lock);
277 if (!oinfo->dqi_gqi_count++)
278 oinfo->dqi_gqi_bh = bh;
279 else
280 WARN_ON(bh != oinfo->dqi_gqi_bh);
281 spin_unlock(&dq_data_lock);
282 return 0;
283}
284
285void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
286{
287 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
288 brelse(oinfo->dqi_gqi_bh);
289 spin_lock(&dq_data_lock);
290 if (!--oinfo->dqi_gqi_count)
291 oinfo->dqi_gqi_bh = NULL;
292 spin_unlock(&dq_data_lock);
293}
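/* dqi_gqi_count and dqi_gqi_bh together act as a small refcount: the first
 * ocfs2_lock_global_qf() caller caches the inode's buffer head, nested
 * callers only bump the count (and should observe the same bh, hence the
 * WARN_ON), and the final ocfs2_unlock_global_qf() clears the cached
 * pointer.  Each unlock may brelse() because every ocfs2_inode_lock() call
 * took its own reference on the buffer head. */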
294
295/* Read information header from global quota file */
296int ocfs2_global_read_info(struct super_block *sb, int type)
297{
298 struct inode *gqinode = NULL;
299 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
300 GROUP_QUOTA_SYSTEM_INODE };
301 struct ocfs2_global_disk_dqinfo dinfo;
302 struct mem_dqinfo *info = sb_dqinfo(sb, type);
303 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
304 int status;
305
306 mlog_entry_void();
307
308 /* Read global header */
309 gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
310 OCFS2_INVALID_SLOT);
311 if (!gqinode) {
312 mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
313 type);
314 status = -EINVAL;
315 goto out_err;
316 }
317 oinfo->dqi_gi.dqi_sb = sb;
318 oinfo->dqi_gi.dqi_type = type;
319 ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
320 oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
321 oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
322 oinfo->dqi_gqi_bh = NULL;
323 oinfo->dqi_gqi_count = 0;
324 oinfo->dqi_gqinode = gqinode;
325 status = ocfs2_lock_global_qf(oinfo, 0);
326 if (status < 0) {
327 mlog_errno(status);
328 goto out_err;
329 }
330 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
331 sizeof(struct ocfs2_global_disk_dqinfo),
332 OCFS2_GLOBAL_INFO_OFF);
333 ocfs2_unlock_global_qf(oinfo, 0);
334 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
335 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
336 status);
337 if (status >= 0)
338 status = -EIO;
339 mlog_errno(status);
340 goto out_err;
341 }
342 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
343 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
344 oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
345 oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
346 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
347 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
348 oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
349 oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
350 oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
351 OCFS2_QBLK_RESERVED_SPACE;
352 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
353 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
354 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
355 oinfo->dqi_syncjiff);
356
357out_err:
358 mlog_exit(status);
359 return status;
360}
361
 362/* Write information to global quota file. Expects exclusive lock on quota
363 * file inode and quota info */
364static int __ocfs2_global_write_info(struct super_block *sb, int type)
365{
366 struct mem_dqinfo *info = sb_dqinfo(sb, type);
367 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
368 struct ocfs2_global_disk_dqinfo dinfo;
369 ssize_t size;
370
371 spin_lock(&dq_data_lock);
372 info->dqi_flags &= ~DQF_INFO_DIRTY;
373 dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
374 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
375 spin_unlock(&dq_data_lock);
376 dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
377 dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
378 dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
379 dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
380 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
381 sizeof(struct ocfs2_global_disk_dqinfo),
382 OCFS2_GLOBAL_INFO_OFF);
383 if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
384 mlog(ML_ERROR, "Cannot write global quota info structure\n");
385 if (size >= 0)
386 size = -EIO;
387 return size;
388 }
389 return 0;
390}
391
392int ocfs2_global_write_info(struct super_block *sb, int type)
393{
394 int err;
395 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
396
397 err = ocfs2_qinfo_lock(info, 1);
398 if (err < 0)
399 return err;
400 err = __ocfs2_global_write_info(sb, type);
401 ocfs2_qinfo_unlock(info, 1);
402 return err;
403}
404
405/* Read in information from global quota file and acquire a reference to it.
406 * dquot_acquire() has already started the transaction and locked quota file */
407int ocfs2_global_read_dquot(struct dquot *dquot)
408{
409 int err, err2, ex = 0;
410 struct ocfs2_mem_dqinfo *info =
411 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
412
413 err = ocfs2_qinfo_lock(info, 0);
414 if (err < 0)
415 goto out;
416 err = qtree_read_dquot(&info->dqi_gi, dquot);
417 if (err < 0)
418 goto out_qlock;
419 OCFS2_DQUOT(dquot)->dq_use_count++;
420 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
421 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
422 if (!dquot->dq_off) { /* No real quota entry? */
423 /* Upgrade to exclusive lock for allocation */
424 err = ocfs2_qinfo_lock(info, 1);
425 if (err < 0)
426 goto out_qlock;
427 ex = 1;
428 }
429 err = qtree_write_dquot(&info->dqi_gi, dquot);
430 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
431 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
432 if (!err)
433 err = err2;
434 }
435out_qlock:
436 if (ex)
437 ocfs2_qinfo_unlock(info, 1);
438 ocfs2_qinfo_unlock(info, 0);
439out:
440 if (err < 0)
441 mlog_errno(err);
442 return err;
443}
444
445/* Sync local information about quota modifications with global quota file.
446 * Caller must have started the transaction and obtained exclusive lock for
447 * global quota file inode */
448int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
449{
450 int err, err2;
451 struct super_block *sb = dquot->dq_sb;
452 int type = dquot->dq_type;
453 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
454 struct ocfs2_global_disk_dqblk dqblk;
455 s64 spacechange, inodechange;
456 time_t olditime, oldbtime;
457
458 err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
459 sizeof(struct ocfs2_global_disk_dqblk),
460 dquot->dq_off);
461 if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
462 if (err >= 0) {
463 mlog(ML_ERROR, "Short read from global quota file "
464 "(%u read)\n", err);
465 err = -EIO;
466 }
467 goto out;
468 }
469
 470	/* Update space and inode usage. Also get other information from
 471	 * the global quota file so that we don't overwrite any changes
 472	 * made there. */
473 spin_lock(&dq_data_lock);
474 spacechange = dquot->dq_dqb.dqb_curspace -
475 OCFS2_DQUOT(dquot)->dq_origspace;
476 inodechange = dquot->dq_dqb.dqb_curinodes -
477 OCFS2_DQUOT(dquot)->dq_originodes;
478 olditime = dquot->dq_dqb.dqb_itime;
479 oldbtime = dquot->dq_dqb.dqb_btime;
480 ocfs2_global_disk2memdqb(dquot, &dqblk);
481 mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
482 dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
483 dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
484 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
485 dquot->dq_dqb.dqb_curspace += spacechange;
486 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
487 dquot->dq_dqb.dqb_curinodes += inodechange;
488 /* Set properly space grace time... */
489 if (dquot->dq_dqb.dqb_bsoftlimit &&
490 dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
491 if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
492 oldbtime > 0) {
493 if (dquot->dq_dqb.dqb_btime > 0)
494 dquot->dq_dqb.dqb_btime =
495 min(dquot->dq_dqb.dqb_btime, oldbtime);
496 else
497 dquot->dq_dqb.dqb_btime = oldbtime;
498 }
499 } else {
500 dquot->dq_dqb.dqb_btime = 0;
501 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
502 }
503 /* Set properly inode grace time... */
504 if (dquot->dq_dqb.dqb_isoftlimit &&
505 dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
506 if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
507 olditime > 0) {
508 if (dquot->dq_dqb.dqb_itime > 0)
509 dquot->dq_dqb.dqb_itime =
510 min(dquot->dq_dqb.dqb_itime, olditime);
511 else
512 dquot->dq_dqb.dqb_itime = olditime;
513 }
514 } else {
515 dquot->dq_dqb.dqb_itime = 0;
516 clear_bit(DQ_INODES_B, &dquot->dq_flags);
517 }
518 /* All information is properly updated, clear the flags */
519 __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
520 __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
521 __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
522 __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
523 __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
524 __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
525 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
526 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
527 spin_unlock(&dq_data_lock);
528 err = ocfs2_qinfo_lock(info, freeing);
529 if (err < 0) {
 530		mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
531 " (type=%d, id=%u)\n", dquot->dq_type,
532 (unsigned)dquot->dq_id);
533 goto out;
534 }
535 if (freeing)
536 OCFS2_DQUOT(dquot)->dq_use_count--;
537 err = qtree_write_dquot(&info->dqi_gi, dquot);
538 if (err < 0)
539 goto out_qlock;
540 if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
541 err = qtree_release_dquot(&info->dqi_gi, dquot);
542 if (info_dirty(sb_dqinfo(sb, type))) {
543 err2 = __ocfs2_global_write_info(sb, type);
544 if (!err)
545 err = err2;
546 }
547 }
548out_qlock:
549 ocfs2_qinfo_unlock(info, freeing);
550out:
551 if (err < 0)
552 mlog_errno(err);
553 return err;
554}
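/* Roughly, __ocfs2_sync_dquot() performs a three-way merge: the local usage
 * deltas (current usage minus the dq_origspace/dq_originodes snapshots) are
 * applied on top of the freshly re-read global entry, admin-set fields
 * flagged through DQ_LASTSET_B override the on-disk values, and grace
 * deadlines are merged by keeping the earlier non-zero one. */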
555
556/*
557 * Functions for periodic syncing of dquots with global file
558 */
559static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
560{
561 handle_t *handle;
562 struct super_block *sb = dquot->dq_sb;
563 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
564 struct ocfs2_super *osb = OCFS2_SB(sb);
565 int status = 0;
566
567 mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
568 dquot->dq_type, type, sb->s_id);
569 if (type != dquot->dq_type)
570 goto out;
571 status = ocfs2_lock_global_qf(oinfo, 1);
572 if (status < 0)
573 goto out;
574
575 handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
576 if (IS_ERR(handle)) {
577 status = PTR_ERR(handle);
578 mlog_errno(status);
579 goto out_ilock;
580 }
581 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
582 status = ocfs2_sync_dquot(dquot);
583 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
584 if (status < 0)
585 mlog_errno(status);
586 /* We have to write local structure as well... */
587 dquot_mark_dquot_dirty(dquot);
588 status = dquot_commit(dquot);
589 if (status < 0)
590 mlog_errno(status);
591 ocfs2_commit_trans(osb, handle);
592out_ilock:
593 ocfs2_unlock_global_qf(oinfo, 1);
594out:
595 mlog_exit(status);
596 return status;
597}
598
599static void qsync_work_fn(struct work_struct *work)
600{
601 struct ocfs2_mem_dqinfo *oinfo = container_of(work,
602 struct ocfs2_mem_dqinfo,
603 dqi_sync_work.work);
604 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
605
606 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
607 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
608 oinfo->dqi_syncjiff);
609}
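/* Note the self-rearming pattern above: each pass over the active dquots
 * ends by re-queuing the same delayed work, so a sync runs roughly every
 * dqi_syncjiff jiffies until ocfs2_local_free_info() stops it via
 * cancel_delayed_work_sync(). */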
610
611/*
612 * Wrappers for generic quota functions
613 */
614
615static int ocfs2_write_dquot(struct dquot *dquot)
616{
617 handle_t *handle;
618 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
619 int status = 0;
620
621 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
622
623 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
624 if (IS_ERR(handle)) {
625 status = PTR_ERR(handle);
626 mlog_errno(status);
627 goto out;
628 }
629 status = dquot_commit(dquot);
630 ocfs2_commit_trans(osb, handle);
631out:
632 mlog_exit(status);
633 return status;
634}
635
636int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
637{
638 struct ocfs2_mem_dqinfo *oinfo;
639 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
640 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
641
642 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
643 return 0;
644
645 oinfo = sb_dqinfo(sb, type)->dqi_priv;
646 /* We modify tree, leaf block, global info, local chunk header,
647 * global and local inode */
648 return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
649 2 * OCFS2_INODE_UPDATE_CREDITS;
650}
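/* A worked example with assumed numbers (tree depth and credit constants
 * vary by filesystem): for a global quota tree of depth 2 and an
 * OCFS2_INODE_UPDATE_CREDITS of 1, deleting a dquot reserves
 * 2 + 2 + 1 + 2 * 1 = 7 credits -- the tree path, the leaf block, the
 * global info block, the local chunk header, and one update for each of
 * the global and local inodes. */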
651
652static int ocfs2_release_dquot(struct dquot *dquot)
653{
654 handle_t *handle;
655 struct ocfs2_mem_dqinfo *oinfo =
656 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
657 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
658 int status = 0;
659
660 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
661
662 status = ocfs2_lock_global_qf(oinfo, 1);
663 if (status < 0)
664 goto out;
665 handle = ocfs2_start_trans(osb,
666 ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
667 if (IS_ERR(handle)) {
668 status = PTR_ERR(handle);
669 mlog_errno(status);
670 goto out_ilock;
671 }
672 status = dquot_release(dquot);
673 ocfs2_commit_trans(osb, handle);
674out_ilock:
675 ocfs2_unlock_global_qf(oinfo, 1);
676out:
677 mlog_exit(status);
678 return status;
679}
680
681int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
682{
683 struct ocfs2_mem_dqinfo *oinfo;
684 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
686 struct ocfs2_dinode *lfe, *gfe;
687
688 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
689 return 0;
690
691 oinfo = sb_dqinfo(sb, type)->dqi_priv;
692 gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
693 lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
694 /* We can extend local file + global file. In local file we
695 * can modify info, chunk header block and dquot block. In
696 * global file we can modify info, tree and leaf block */
697 return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
698 ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
699 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
700}
701
702static int ocfs2_acquire_dquot(struct dquot *dquot)
703{
704 handle_t *handle;
705 struct ocfs2_mem_dqinfo *oinfo =
706 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
707 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
708 int status = 0;
709
710 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
 711	/* We need an exclusive lock, because we're going to update the use
 712	 * count and possibly instantiate a new dquot structure */
713 status = ocfs2_lock_global_qf(oinfo, 1);
714 if (status < 0)
715 goto out;
716 handle = ocfs2_start_trans(osb,
717 ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
718 if (IS_ERR(handle)) {
719 status = PTR_ERR(handle);
720 mlog_errno(status);
721 goto out_ilock;
722 }
723 status = dquot_acquire(dquot);
724 ocfs2_commit_trans(osb, handle);
725out_ilock:
726 ocfs2_unlock_global_qf(oinfo, 1);
727out:
728 mlog_exit(status);
729 return status;
730}
731
732static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
733{
734 unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
735 (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
736 (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
737 (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
738 (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
739 (1 << (DQ_LASTSET_B + QIF_ITIME_B));
740 int sync = 0;
741 int status;
742 struct super_block *sb = dquot->dq_sb;
743 int type = dquot->dq_type;
744 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
745 handle_t *handle;
746 struct ocfs2_super *osb = OCFS2_SB(sb);
747
748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
749 dquot_mark_dquot_dirty(dquot);
750
751 /* In case user set some limits, sync dquot immediately to global
752 * quota file so that information propagates quicker */
753 spin_lock(&dq_data_lock);
754 if (dquot->dq_flags & mask)
755 sync = 1;
756 spin_unlock(&dq_data_lock);
757 if (!sync) {
758 status = ocfs2_write_dquot(dquot);
759 goto out;
760 }
761 status = ocfs2_lock_global_qf(oinfo, 1);
762 if (status < 0)
763 goto out;
764 handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
765 if (IS_ERR(handle)) {
766 status = PTR_ERR(handle);
767 mlog_errno(status);
768 goto out_ilock;
769 }
770 status = ocfs2_sync_dquot(dquot);
771 if (status < 0) {
772 mlog_errno(status);
773 goto out_trans;
774 }
775 /* Now write updated local dquot structure */
776 status = dquot_commit(dquot);
777out_trans:
778 ocfs2_commit_trans(osb, handle);
779out_ilock:
780 ocfs2_unlock_global_qf(oinfo, 1);
781out:
782 mlog_exit(status);
783 return status;
784}
785
786/* This should happen only after set_dqinfo(). */
787static int ocfs2_write_info(struct super_block *sb, int type)
788{
789 handle_t *handle;
790 int status = 0;
791 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
792
793 mlog_entry_void();
794
795 status = ocfs2_lock_global_qf(oinfo, 1);
796 if (status < 0)
797 goto out;
798 handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
799 if (IS_ERR(handle)) {
800 status = PTR_ERR(handle);
801 mlog_errno(status);
802 goto out_ilock;
803 }
804 status = dquot_commit_info(sb, type);
805 ocfs2_commit_trans(OCFS2_SB(sb), handle);
806out_ilock:
807 ocfs2_unlock_global_qf(oinfo, 1);
808out:
809 mlog_exit(status);
810 return status;
811}
812
 813/* This is difficult. We have to lock the quota inode and start a
 814 * transaction in this function, but we don't want to take the penalty
 815 * of an exclusive quota file lock when we are just going to use cached
 816 * structures. So we just take the read lock, check whether we have the
 817 * dquot cached, and if so, we don't have to take the write lock... */
818static int ocfs2_dquot_initialize(struct inode *inode, int type)
819{
820 handle_t *handle = NULL;
821 int status = 0;
822 struct super_block *sb = inode->i_sb;
823 struct ocfs2_mem_dqinfo *oinfo;
824 int exclusive = 0;
825 int cnt;
826 qid_t id;
827
828 mlog_entry_void();
829
830 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
831 if (type != -1 && cnt != type)
832 continue;
833 if (!sb_has_quota_active(sb, cnt))
834 continue;
835 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
836 status = ocfs2_lock_global_qf(oinfo, 0);
837 if (status < 0)
838 goto out;
 839		/* This is just a performance optimization, not a reliable test.
 840		 * Since we hold an inode lock, no one can actually release
841 * the structure until we are finished with initialization. */
842 if (inode->i_dquot[cnt] != NODQUOT) {
843 ocfs2_unlock_global_qf(oinfo, 0);
844 continue;
845 }
846 /* When we have inode lock, we know that no dquot_release() can
847 * run and thus we can safely check whether we need to
848 * read+modify global file to get quota information or whether
849 * our node already has it. */
850 if (cnt == USRQUOTA)
851 id = inode->i_uid;
852 else if (cnt == GRPQUOTA)
853 id = inode->i_gid;
854 else
855 BUG();
856 /* Obtain exclusion from quota off... */
857 down_write(&sb_dqopt(sb)->dqptr_sem);
858 exclusive = !dquot_is_cached(sb, id, cnt);
859 up_write(&sb_dqopt(sb)->dqptr_sem);
860 if (exclusive) {
861 status = ocfs2_lock_global_qf(oinfo, 1);
862 if (status < 0) {
863 exclusive = 0;
864 mlog_errno(status);
865 goto out_ilock;
866 }
867 handle = ocfs2_start_trans(OCFS2_SB(sb),
868 ocfs2_calc_qinit_credits(sb, cnt));
869 if (IS_ERR(handle)) {
870 status = PTR_ERR(handle);
871 mlog_errno(status);
872 goto out_ilock;
873 }
874 }
875 dquot_initialize(inode, cnt);
876 if (exclusive) {
877 ocfs2_commit_trans(OCFS2_SB(sb), handle);
878 ocfs2_unlock_global_qf(oinfo, 1);
879 }
880 ocfs2_unlock_global_qf(oinfo, 0);
881 }
882 mlog_exit(0);
883 return 0;
884out_ilock:
885 if (exclusive)
886 ocfs2_unlock_global_qf(oinfo, 1);
887 ocfs2_unlock_global_qf(oinfo, 0);
888out:
889 mlog_exit(status);
890 return status;
891}
892
893static int ocfs2_dquot_drop_slow(struct inode *inode)
894{
895 int status = 0;
896 int cnt;
897 int got_lock[MAXQUOTAS] = {0, 0};
898 handle_t *handle;
899 struct super_block *sb = inode->i_sb;
900 struct ocfs2_mem_dqinfo *oinfo;
901
902 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
903 if (!sb_has_quota_active(sb, cnt))
904 continue;
905 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
906 status = ocfs2_lock_global_qf(oinfo, 1);
907 if (status < 0)
908 goto out;
909 got_lock[cnt] = 1;
910 }
911 handle = ocfs2_start_trans(OCFS2_SB(sb),
912 ocfs2_calc_qinit_credits(sb, USRQUOTA) +
913 ocfs2_calc_qinit_credits(sb, GRPQUOTA));
914 if (IS_ERR(handle)) {
915 status = PTR_ERR(handle);
916 mlog_errno(status);
917 goto out;
918 }
919 dquot_drop(inode);
920 ocfs2_commit_trans(OCFS2_SB(sb), handle);
921out:
922 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
923 if (got_lock[cnt]) {
924 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
925 ocfs2_unlock_global_qf(oinfo, 1);
926 }
927 return status;
928}
929
930/* See the comment before ocfs2_dquot_initialize. */
931static int ocfs2_dquot_drop(struct inode *inode)
932{
933 int status = 0;
934 struct super_block *sb = inode->i_sb;
935 struct ocfs2_mem_dqinfo *oinfo;
936 int exclusive = 0;
937 int cnt;
938 int got_lock[MAXQUOTAS] = {0, 0};
939
940 mlog_entry_void();
941 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
942 if (!sb_has_quota_active(sb, cnt))
943 continue;
944 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
945 status = ocfs2_lock_global_qf(oinfo, 0);
946 if (status < 0)
947 goto out;
948 got_lock[cnt] = 1;
949 }
 950	/* Lock against anyone releasing references so that when we check
 951	 * we know we are not going to be the last ones to release the dquot */
952 down_write(&sb_dqopt(sb)->dqptr_sem);
953 /* Urgh, this is a terrible hack :( */
954 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
955 if (inode->i_dquot[cnt] != NODQUOT &&
956 atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
957 exclusive = 1;
958 break;
959 }
960 }
961 if (!exclusive)
962 dquot_drop_locked(inode);
963 up_write(&sb_dqopt(sb)->dqptr_sem);
964out:
965 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
966 if (got_lock[cnt]) {
967 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
968 ocfs2_unlock_global_qf(oinfo, 0);
969 }
 970	/* In case we bailed out because we had to do expensive locking,
 971	 * do it now... */
972 if (exclusive)
973 status = ocfs2_dquot_drop_slow(inode);
974 mlog_exit(status);
975 return status;
976}
977
978static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
979{
980 struct ocfs2_dquot *dquot =
981 kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
982
983 if (!dquot)
984 return NULL;
985 return &dquot->dq_dquot;
986}
987
988static void ocfs2_destroy_dquot(struct dquot *dquot)
989{
990 kmem_cache_free(ocfs2_dquot_cachep, dquot);
991}
992
993struct dquot_operations ocfs2_quota_operations = {
994 .initialize = ocfs2_dquot_initialize,
995 .drop = ocfs2_dquot_drop,
996 .alloc_space = dquot_alloc_space,
997 .alloc_inode = dquot_alloc_inode,
998 .free_space = dquot_free_space,
999 .free_inode = dquot_free_inode,
1000 .transfer = dquot_transfer,
1001 .write_dquot = ocfs2_write_dquot,
1002 .acquire_dquot = ocfs2_acquire_dquot,
1003 .release_dquot = ocfs2_release_dquot,
1004 .mark_dirty = ocfs2_mark_dquot_dirty,
1005 .write_info = ocfs2_write_info,
1006 .alloc_dquot = ocfs2_alloc_dquot,
1007 .destroy_dquot = ocfs2_destroy_dquot,
1008};
1009
1010int ocfs2_quota_setup(void)
1011{
1012 ocfs2_quota_wq = create_workqueue("o2quot");
1013 if (!ocfs2_quota_wq)
1014 return -ENOMEM;
1015 return 0;
1016}
1017
1018void ocfs2_quota_shutdown(void)
1019{
1020 if (ocfs2_quota_wq) {
1021 flush_workqueue(ocfs2_quota_wq);
1022 destroy_workqueue(ocfs2_quota_wq);
1023 ocfs2_quota_wq = NULL;
1024 }
1025}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000000..07deec5e9721
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
1/*
2 * Implementation of operations over local quota file
3 */
4
5#include <linux/fs.h>
6#include <linux/quota.h>
7#include <linux/quotaops.h>
8#include <linux/module.h>
9
10#define MLOG_MASK_PREFIX ML_QUOTA
11#include <cluster/masklog.h>
12
13#include "ocfs2_fs.h"
14#include "ocfs2.h"
15#include "inode.h"
16#include "alloc.h"
17#include "file.h"
18#include "buffer_head_io.h"
19#include "journal.h"
20#include "sysfile.h"
21#include "dlmglue.h"
22#include "quota.h"
23
24/* Number of local quota structures per block */
25static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
26{
27 return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
28 sizeof(struct ocfs2_local_disk_dqblk));
29}
30
31/* Number of blocks with entries in one chunk */
32static inline unsigned int ol_chunk_blocks(struct super_block *sb)
33{
34 return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
35 OCFS2_QBLK_RESERVED_SPACE) << 3) /
36 ol_quota_entries_per_block(sb);
37}
38
39/* Number of entries in a chunk bitmap */
40static unsigned int ol_chunk_entries(struct super_block *sb)
41{
42 return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb);
43}
44
45/* Offset of the chunk in quota file */
46static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
47{
48 /* 1 block for local quota file info, 1 block per chunk for chunk info */
49 return 1 + (ol_chunk_blocks(sb) + 1) * c;
50}
51
52static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
53{
54 int epb = ol_quota_entries_per_block(sb);
55
56 return ol_quota_chunk_block(sb, c) + 1 + off / epb;
57}
58
59static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
60{
61 int epb = ol_quota_entries_per_block(sb);
62
63 return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
64}
65
66/* Offset of the dquot structure in the quota file */
67static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
68{
69 return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
70 ol_dqblk_block_off(sb, c, off);
71}
72
73/* Compute block number from given offset */
74static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
75{
76 return off >> sb->s_blocksize_bits;
77}
78
79static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
80{
81 return off & ((1 << sb->s_blocksize_bits) - 1);
82}
83
84/* Compute offset in the chunk of a structure with the given offset */
85static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
86{
87 int epb = ol_quota_entries_per_block(sb);
88
89 return ((off >> sb->s_blocksize_bits) -
90 ol_quota_chunk_block(sb, c) - 1) * epb
91 + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
92 sizeof(struct ocfs2_local_disk_dqblk);
93}
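/* A worked example of the layout math above (illustrative sizes only; the
 * real values depend on sb->s_blocksize and the on-disk structure sizes):
 * with a 4096-byte block, a 24-byte struct ocfs2_local_disk_dqblk and an
 * OCFS2_QBLK_RESERVED_SPACE of 8, ol_quota_entries_per_block() gives
 * (4096 - 8) / 24 = 170 entries per block.  Chunk c then starts at file
 * block 1 + (ol_chunk_blocks(sb) + 1) * c: one info block at the front of
 * the file, plus a header block in front of each chunk's data blocks. */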
94
95/* Write bufferhead into the fs */
96static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
97 void (*modify)(struct buffer_head *, void *), void *private)
98{
99 struct super_block *sb = inode->i_sb;
100 handle_t *handle;
101 int status;
102
103 handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
104 if (IS_ERR(handle)) {
105 status = PTR_ERR(handle);
106 mlog_errno(status);
107 return status;
108 }
109 status = ocfs2_journal_access_dq(handle, inode, bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (status < 0) {
112 mlog_errno(status);
113 ocfs2_commit_trans(OCFS2_SB(sb), handle);
114 return status;
115 }
116 lock_buffer(bh);
117 modify(bh, private);
118 unlock_buffer(bh);
119 status = ocfs2_journal_dirty(handle, bh);
120 if (status < 0) {
121 mlog_errno(status);
122 ocfs2_commit_trans(OCFS2_SB(sb), handle);
123 return status;
124 }
125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
126 if (status < 0) {
127 mlog_errno(status);
128 return status;
129 }
130 return 0;
131}
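/* ocfs2_modify_bh() is the common "tiny update" helper used below: it wraps
 * a single journalled buffer modification in its own one-credit transaction,
 * taking journal access before the edit and dirtying the buffer afterwards,
 * so callers such as olq_update_info() only supply the in-memory edit. */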
132
133/* Check whether we understand format of quota files */
134static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
135{
136 unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
137 unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
138 unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
139 unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
140 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
141 GROUP_QUOTA_SYSTEM_INODE };
142 struct buffer_head *bh = NULL;
143 struct inode *linode = sb_dqopt(sb)->files[type];
144 struct inode *ginode = NULL;
145 struct ocfs2_disk_dqheader *dqhead;
146 int status, ret = 0;
147
148 /* First check whether we understand local quota file */
149 status = ocfs2_read_quota_block(linode, 0, &bh);
150 if (status) {
151 mlog_errno(status);
152 mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
153 type);
154 goto out_err;
155 }
156 dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
157 if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
158 mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
159 " type=%d\n", le32_to_cpu(dqhead->dqh_magic),
160 lmagics[type], type);
161 goto out_err;
162 }
163 if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
164 mlog(ML_ERROR, "quota file version does not match (%u != %u),"
165 " type=%d\n", le32_to_cpu(dqhead->dqh_version),
166 lversions[type], type);
167 goto out_err;
168 }
169 brelse(bh);
170 bh = NULL;
171
172 /* Next check whether we understand global quota file */
173 ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
174 OCFS2_INVALID_SLOT);
175 if (!ginode) {
176 mlog(ML_ERROR, "cannot get global quota file inode "
177 "(type=%d)\n", type);
178 goto out_err;
179 }
180 /* Since the header is read only, we don't care about locking */
181 status = ocfs2_read_quota_block(ginode, 0, &bh);
182 if (status) {
183 mlog_errno(status);
184 mlog(ML_ERROR, "failed to read global quota file header "
185 "(type=%d)\n", type);
186 goto out_err;
187 }
188 dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
189 if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
190 mlog(ML_ERROR, "global quota file magic does not match "
191 "(%u != %u), type=%d\n",
192 le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
193 goto out_err;
194 }
195 if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
196 mlog(ML_ERROR, "global quota file version does not match "
197 "(%u != %u), type=%d\n",
198 le32_to_cpu(dqhead->dqh_version), gversions[type],
199 type);
200 goto out_err;
201 }
202
203 ret = 1;
204out_err:
205 brelse(bh);
206 iput(ginode);
207 return ret;
208}
209
210/* Release given list of quota file chunks */
211static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
212{
213 struct ocfs2_quota_chunk *pos, *next;
214
215 list_for_each_entry_safe(pos, next, head, qc_chunk) {
216 list_del(&pos->qc_chunk);
217 brelse(pos->qc_headerbh);
218 kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
219 }
220}
221
222/* Load quota bitmaps into memory */
223static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
224 struct ocfs2_local_disk_dqinfo *ldinfo,
225 struct list_head *head)
226{
227 struct ocfs2_quota_chunk *newchunk;
228 int i, status;
229
230 INIT_LIST_HEAD(head);
231 for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
232 newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
233 if (!newchunk) {
234 ocfs2_release_local_quota_bitmaps(head);
235 return -ENOMEM;
236 }
237 newchunk->qc_num = i;
238 newchunk->qc_headerbh = NULL;
239 status = ocfs2_read_quota_block(inode,
240 ol_quota_chunk_block(inode->i_sb, i),
241 &newchunk->qc_headerbh);
242 if (status) {
243 mlog_errno(status);
244 kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
245 ocfs2_release_local_quota_bitmaps(head);
246 return status;
247 }
248 list_add_tail(&newchunk->qc_chunk, head);
249 }
250 return 0;
251}
252
253static void olq_update_info(struct buffer_head *bh, void *private)
254{
255 struct mem_dqinfo *info = private;
256 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
257 struct ocfs2_local_disk_dqinfo *ldinfo;
258
259 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
260 OCFS2_LOCAL_INFO_OFF);
261 spin_lock(&dq_data_lock);
262 ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
263 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
264 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
265 spin_unlock(&dq_data_lock);
266}
267
268static int ocfs2_add_recovery_chunk(struct super_block *sb,
269 struct ocfs2_local_disk_chunk *dchunk,
270 int chunk,
271 struct list_head *head)
272{
273 struct ocfs2_recovery_chunk *rc;
274
275 rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
276 if (!rc)
277 return -ENOMEM;
278 rc->rc_chunk = chunk;
279 rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
280 if (!rc->rc_bitmap) {
281 kfree(rc);
282 return -ENOMEM;
283 }
284 memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
285 (ol_chunk_entries(sb) + 7) >> 3);
286 list_add_tail(&rc->rc_list, head);
287 return 0;
288}
289
290static void free_recovery_list(struct list_head *head)
291{
292 struct ocfs2_recovery_chunk *next;
293 struct ocfs2_recovery_chunk *rchunk;
294
295 list_for_each_entry_safe(rchunk, next, head, rc_list) {
296 list_del(&rchunk->rc_list);
297 kfree(rchunk->rc_bitmap);
298 kfree(rchunk);
299 }
300}
301
302void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
303{
304 int type;
305
306 for (type = 0; type < MAXQUOTAS; type++)
307 free_recovery_list(&(rec->r_list[type]));
308 kfree(rec);
309}
310
 311/* Load the entries in our quota file that we have to recover */
312static int ocfs2_recovery_load_quota(struct inode *lqinode,
313 struct ocfs2_local_disk_dqinfo *ldinfo,
314 int type,
315 struct list_head *head)
316{
317 struct super_block *sb = lqinode->i_sb;
318 struct buffer_head *hbh;
319 struct ocfs2_local_disk_chunk *dchunk;
320 int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
321 int status = 0;
322
323 for (i = 0; i < chunks; i++) {
324 hbh = NULL;
325 status = ocfs2_read_quota_block(lqinode,
326 ol_quota_chunk_block(sb, i),
327 &hbh);
328 if (status) {
329 mlog_errno(status);
330 break;
331 }
332 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
333 if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
334 status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
335 brelse(hbh);
336 if (status < 0)
337 break;
338 }
339 if (status < 0)
340 free_recovery_list(head);
341 return status;
342}
343
344static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
345{
346 int type;
347 struct ocfs2_quota_recovery *rec;
348
349 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
350 if (!rec)
351 return NULL;
352 for (type = 0; type < MAXQUOTAS; type++)
353 INIT_LIST_HEAD(&(rec->r_list[type]));
354 return rec;
355}
356
357/* Load information we need for quota recovery into memory */
358struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
359 struct ocfs2_super *osb,
360 int slot_num)
361{
362 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
363 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
364 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
365 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
366 struct super_block *sb = osb->sb;
367 struct ocfs2_local_disk_dqinfo *ldinfo;
368 struct inode *lqinode;
369 struct buffer_head *bh;
370 int type;
371 int status = 0;
372 struct ocfs2_quota_recovery *rec;
373
374 mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
375 rec = ocfs2_alloc_quota_recovery();
376 if (!rec)
377 return ERR_PTR(-ENOMEM);
378 /* First init... */
379
380 for (type = 0; type < MAXQUOTAS; type++) {
381 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
382 continue;
 383		/* At this point, the slot's journal is already replayed so
 384		 * we can trust the metadata and data of the quota file */
385 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
386 if (!lqinode) {
387 status = -ENOENT;
388 goto out;
389 }
390 status = ocfs2_inode_lock_full(lqinode, NULL, 1,
391 OCFS2_META_LOCK_RECOVERY);
392 if (status < 0) {
393 mlog_errno(status);
394 goto out_put;
395 }
396 /* Now read local header */
397 bh = NULL;
398 status = ocfs2_read_quota_block(lqinode, 0, &bh);
399 if (status) {
400 mlog_errno(status);
401 mlog(ML_ERROR, "failed to read quota file info header "
402 "(slot=%d type=%d)\n", slot_num, type);
403 goto out_lock;
404 }
405 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
406 OCFS2_LOCAL_INFO_OFF);
407 status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
408 &rec->r_list[type]);
409 brelse(bh);
410out_lock:
411 ocfs2_inode_unlock(lqinode, 1);
412out_put:
413 iput(lqinode);
414 if (status < 0)
415 break;
416 }
417out:
418 if (status < 0) {
419 ocfs2_free_quota_recovery(rec);
420 rec = ERR_PTR(status);
421 }
422 return rec;
423}
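/* Recovery is deliberately split in two phases (as the "Beginning"/
 * "Finishing" messages suggest): ocfs2_begin_quota_recovery() runs while
 * the slot is still protected by OCFS2_META_LOCK_RECOVERY and only
 * snapshots which chunk entries need work, while
 * ocfs2_finish_quota_recovery() below replays those entries later against
 * the global file, skipping files some other node is already recovering. */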
424
 425/* Sync changes in the local quota file into the global quota file and
 426 * reinitialize the local quota file.
 427 * The function expects the local quota file to be already locked and
 428 * dqonoff_mutex to be held. */
429static int ocfs2_recover_local_quota_file(struct inode *lqinode,
430 int type,
431 struct ocfs2_quota_recovery *rec)
432{
433 struct super_block *sb = lqinode->i_sb;
434 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
435 struct ocfs2_local_disk_chunk *dchunk;
436 struct ocfs2_local_disk_dqblk *dqblk;
437 struct dquot *dquot;
438 handle_t *handle;
439 struct buffer_head *hbh = NULL, *qbh = NULL;
440 int status = 0;
441 int bit, chunk;
442 struct ocfs2_recovery_chunk *rchunk, *next;
443 qsize_t spacechange, inodechange;
444
445 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
446
447 status = ocfs2_lock_global_qf(oinfo, 1);
448 if (status < 0)
449 goto out;
450
451 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
452 chunk = rchunk->rc_chunk;
453 hbh = NULL;
454 status = ocfs2_read_quota_block(lqinode,
455 ol_quota_chunk_block(sb, chunk),
456 &hbh);
457 if (status) {
458 mlog_errno(status);
459 break;
460 }
461 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
462 for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
463 qbh = NULL;
464 status = ocfs2_read_quota_block(lqinode,
465 ol_dqblk_block(sb, chunk, bit),
466 &qbh);
467 if (status) {
468 mlog_errno(status);
469 break;
470 }
471 dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
472 ol_dqblk_block_off(sb, chunk, bit));
473 dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
474 if (!dquot) {
475 status = -EIO;
476 mlog(ML_ERROR, "Failed to get quota structure "
477 "for id %u, type %d. Cannot finish quota "
478 "file recovery.\n",
479 (unsigned)le64_to_cpu(dqblk->dqb_id),
480 type);
481 goto out_put_bh;
482 }
483 handle = ocfs2_start_trans(OCFS2_SB(sb),
484 OCFS2_QSYNC_CREDITS);
485 if (IS_ERR(handle)) {
486 status = PTR_ERR(handle);
487 mlog_errno(status);
488 goto out_put_dquot;
489 }
490 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
491 spin_lock(&dq_data_lock);
492 /* Add usage from quota entry into quota changes
493 * of our node. Auxiliary variables are important
494 * due to signedness */
495 spacechange = le64_to_cpu(dqblk->dqb_spacemod);
496 inodechange = le64_to_cpu(dqblk->dqb_inodemod);
497 dquot->dq_dqb.dqb_curspace += spacechange;
498 dquot->dq_dqb.dqb_curinodes += inodechange;
499 spin_unlock(&dq_data_lock);
 500			/* We want to drop the reference held by the crashed
 501			 * node. Since we have our own reference we know the
 502			 * global structure actually won't be freed. */
503 status = ocfs2_global_release_dquot(dquot);
504 if (status < 0) {
505 mlog_errno(status);
506 goto out_commit;
507 }
508 /* Release local quota file entry */
509 status = ocfs2_journal_access_dq(handle, lqinode,
510 qbh, OCFS2_JOURNAL_ACCESS_WRITE);
511 if (status < 0) {
512 mlog_errno(status);
513 goto out_commit;
514 }
515 lock_buffer(qbh);
516 WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
517 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
518 le32_add_cpu(&dchunk->dqc_free, 1);
519 unlock_buffer(qbh);
520 status = ocfs2_journal_dirty(handle, qbh);
521 if (status < 0)
522 mlog_errno(status);
523out_commit:
524 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
525 ocfs2_commit_trans(OCFS2_SB(sb), handle);
526out_put_dquot:
527 dqput(dquot);
528out_put_bh:
529 brelse(qbh);
530 if (status < 0)
531 break;
532 }
533 brelse(hbh);
534 list_del(&rchunk->rc_list);
535 kfree(rchunk->rc_bitmap);
536 kfree(rchunk);
537 if (status < 0)
538 break;
539 }
540 ocfs2_unlock_global_qf(oinfo, 1);
541out:
542 if (status < 0)
543 free_recovery_list(&(rec->r_list[type]));
544 mlog_exit(status);
545 return status;
546}
547
548/* Recover local quota files for given node different from us */
549int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
550 struct ocfs2_quota_recovery *rec,
551 int slot_num)
552{
553 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
554 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
555 struct super_block *sb = osb->sb;
556 struct ocfs2_local_disk_dqinfo *ldinfo;
557 struct buffer_head *bh;
558 handle_t *handle;
559 int type;
560 int status = 0;
561 struct inode *lqinode;
562 unsigned int flags;
563
564 mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
565 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
566 for (type = 0; type < MAXQUOTAS; type++) {
567 if (list_empty(&(rec->r_list[type])))
568 continue;
569 mlog(0, "Recovering quota in slot %d\n", slot_num);
570 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
571 if (!lqinode) {
572 status = -ENOENT;
573 goto out;
574 }
575 status = ocfs2_inode_lock_full(lqinode, NULL, 1,
576 OCFS2_META_LOCK_NOQUEUE);
577 /* Someone else is holding the lock? Then he must be
578 * doing the recovery. Just skip the file... */
579 if (status == -EAGAIN) {
580 mlog(ML_NOTICE, "skipping quota recovery for slot %d "
581 "because quota file is locked.\n", slot_num);
582 status = 0;
583 goto out_put;
584 } else if (status < 0) {
585 mlog_errno(status);
586 goto out_put;
587 }
588 /* Now read local header */
589 bh = NULL;
590 status = ocfs2_read_quota_block(lqinode, 0, &bh);
591 if (status) {
592 mlog_errno(status);
593 mlog(ML_ERROR, "failed to read quota file info header "
594 "(slot=%d type=%d)\n", slot_num, type);
595 goto out_lock;
596 }
597 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
598 OCFS2_LOCAL_INFO_OFF);
599 /* Is recovery still needed? */
600 flags = le32_to_cpu(ldinfo->dqi_flags);
601 if (!(flags & OLQF_CLEAN))
602 status = ocfs2_recover_local_quota_file(lqinode,
603 type,
604 rec);
 605		/* We don't want to mark the file as clean when it is
 606		 * actually active */
607 if (slot_num == osb->slot_num)
608 goto out_bh;
609 /* Mark quota file as clean if we are recovering quota file of
610 * some other node. */
611 handle = ocfs2_start_trans(osb, 1);
612 if (IS_ERR(handle)) {
613 status = PTR_ERR(handle);
614 mlog_errno(status);
615 goto out_bh;
616 }
617 status = ocfs2_journal_access_dq(handle, lqinode, bh,
618 OCFS2_JOURNAL_ACCESS_WRITE);
619 if (status < 0) {
620 mlog_errno(status);
621 goto out_trans;
622 }
623 lock_buffer(bh);
624 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
625 unlock_buffer(bh);
626 status = ocfs2_journal_dirty(handle, bh);
627 if (status < 0)
628 mlog_errno(status);
629out_trans:
630 ocfs2_commit_trans(osb, handle);
631out_bh:
632 brelse(bh);
633out_lock:
634 ocfs2_inode_unlock(lqinode, 1);
635out_put:
636 iput(lqinode);
637 if (status < 0)
638 break;
639 }
640out:
641 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
642 kfree(rec);
643 return status;
644}
645
646/* Read information header from quota file */
647static int ocfs2_local_read_info(struct super_block *sb, int type)
648{
649 struct ocfs2_local_disk_dqinfo *ldinfo;
650 struct mem_dqinfo *info = sb_dqinfo(sb, type);
651 struct ocfs2_mem_dqinfo *oinfo;
652 struct inode *lqinode = sb_dqopt(sb)->files[type];
653 int status;
654 struct buffer_head *bh = NULL;
655 struct ocfs2_quota_recovery *rec;
656 int locked = 0;
657
658 info->dqi_maxblimit = 0x7fffffffffffffffLL;
659 info->dqi_maxilimit = 0x7fffffffffffffffLL;
660 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
 661	if (!oinfo) {
 662		status = -ENOMEM;
 663		mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota info.");
 664		goto out_err;
 665	}
666 info->dqi_priv = oinfo;
667 oinfo->dqi_type = type;
668 INIT_LIST_HEAD(&oinfo->dqi_chunk);
669 oinfo->dqi_rec = NULL;
670 oinfo->dqi_lqi_bh = NULL;
671 oinfo->dqi_ibh = NULL;
672
673 status = ocfs2_global_read_info(sb, type);
674 if (status < 0)
675 goto out_err;
676
677 status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
678 if (status < 0) {
679 mlog_errno(status);
680 goto out_err;
681 }
682 locked = 1;
683
684 /* Now read local header */
685 status = ocfs2_read_quota_block(lqinode, 0, &bh);
686 if (status) {
687 mlog_errno(status);
688 mlog(ML_ERROR, "failed to read quota file info header "
689 "(type=%d)\n", type);
690 goto out_err;
691 }
692 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
693 OCFS2_LOCAL_INFO_OFF);
694 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
695 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
696 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
697 oinfo->dqi_ibh = bh;
698
 699	/* Did we crash while using the local quota file? */
700 if (!(info->dqi_flags & OLQF_CLEAN)) {
701 rec = OCFS2_SB(sb)->quota_rec;
702 if (!rec) {
703 rec = ocfs2_alloc_quota_recovery();
704 if (!rec) {
705 status = -ENOMEM;
706 mlog_errno(status);
707 goto out_err;
708 }
709 OCFS2_SB(sb)->quota_rec = rec;
710 }
711
712 status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
713 &rec->r_list[type]);
714 if (status < 0) {
715 mlog_errno(status);
716 goto out_err;
717 }
718 }
719
720 status = ocfs2_load_local_quota_bitmaps(lqinode,
721 ldinfo,
722 &oinfo->dqi_chunk);
723 if (status < 0) {
724 mlog_errno(status);
725 goto out_err;
726 }
727
728 /* Now mark quota file as used */
729 info->dqi_flags &= ~OLQF_CLEAN;
730 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
731 if (status < 0) {
732 mlog_errno(status);
733 goto out_err;
734 }
735
736 return 0;
737out_err:
738 if (oinfo) {
739 iput(oinfo->dqi_gqinode);
740 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
741 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
742 brelse(oinfo->dqi_lqi_bh);
743 if (locked)
744 ocfs2_inode_unlock(lqinode, 1);
745 ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
746 kfree(oinfo);
747 }
748 brelse(bh);
749 return -1;
750}
751
752/* Write local info to quota file */
753static int ocfs2_local_write_info(struct super_block *sb, int type)
754{
755 struct mem_dqinfo *info = sb_dqinfo(sb, type);
756 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
757 ->dqi_ibh;
758 int status;
759
760 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
761 info);
762 if (status < 0) {
763 mlog_errno(status);
764 return -1;
765 }
766
767 return 0;
768}
769
770/* Release info from memory */
771static int ocfs2_local_free_info(struct super_block *sb, int type)
772{
773 struct mem_dqinfo *info = sb_dqinfo(sb, type);
774 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
775 struct ocfs2_quota_chunk *chunk;
776 struct ocfs2_local_disk_chunk *dchunk;
777 int mark_clean = 1, len;
778 int status;
779
780 /* At this point we know there are no more dquots, and thus
781 * even if there's a sync queued in pdflush, it won't
782 * find any dquots and will return without doing anything */
783 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
784 iput(oinfo->dqi_gqinode);
785 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
786 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
787 list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
788 dchunk = (struct ocfs2_local_disk_chunk *)
789 (chunk->qc_headerbh->b_data);
790 if (chunk->qc_num < oinfo->dqi_chunks - 1) {
791 len = ol_chunk_entries(sb);
792 } else {
793 len = (oinfo->dqi_blocks -
794 ol_quota_chunk_block(sb, chunk->qc_num) - 1)
795 * ol_quota_entries_per_block(sb);
796 }
797 /* Not all entries free? Bug! */
798 if (le32_to_cpu(dchunk->dqc_free) != len) {
799 mlog(ML_ERROR, "releasing quota file with used "
800 "entries (type=%d)\n", type);
801 mark_clean = 0;
802 }
803 }
804 ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
805
806 /* dqonoff_mutex protects us against racing with recovery thread... */
807 if (oinfo->dqi_rec) {
808 ocfs2_free_quota_recovery(oinfo->dqi_rec);
809 mark_clean = 0;
810 }
811
812 if (!mark_clean)
813 goto out;
814
815 /* Mark local file as clean */
816 info->dqi_flags |= OLQF_CLEAN;
817 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
818 oinfo->dqi_ibh,
819 olq_update_info,
820 info);
821 if (status < 0) {
822 mlog_errno(status);
823 goto out;
824 }
825
826out:
827 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
828 brelse(oinfo->dqi_ibh);
829 brelse(oinfo->dqi_lqi_bh);
830 kfree(oinfo);
831 return 0;
832}
833
834static void olq_set_dquot(struct buffer_head *bh, void *private)
835{
836 struct ocfs2_dquot *od = private;
837 struct ocfs2_local_disk_dqblk *dqblk;
838 struct super_block *sb = od->dq_dquot.dq_sb;
839
840 dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
841 + ol_dqblk_block_offset(sb, od->dq_local_off));
842
843 dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
844 spin_lock(&dq_data_lock);
845 dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
846 od->dq_origspace);
847 dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
848 od->dq_originodes);
849 spin_unlock(&dq_data_lock);
850 mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
851 od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
852 (long long)le64_to_cpu(dqblk->dqb_inodemod));
853}
854
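/*
 * Illustrative sketch, not part of the patch: olq_set_dquot() above
 * stores *deltas* (dqb_spacemod/dqb_inodemod) -- the change since the
 * dquot was read -- rather than absolute usage, which is what lets
 * crash recovery replay a node's local file into the global file
 * without double counting.  Stand-alone model; toy_* names are
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_dquot {
	int64_t curspace;	/* current usage, models dqb_curspace */
	int64_t origspace;	/* usage when the dquot was read */
};

static int64_t toy_spacemod(const struct toy_dquot *d)
{
	return d->curspace - d->origspace;	/* what dqb_spacemod records */
}

int main(void)
{
	struct toy_dquot d = { .curspace = 4096 + 512, .origspace = 4096 };

	/* Recovery adds the delta to the global usage. */
	printf("replay delta: %lld bytes\n", (long long)toy_spacemod(&d));
	return 0;
}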
855/* Write dquot to local quota file */
856static int ocfs2_local_write_dquot(struct dquot *dquot)
857{
858 struct super_block *sb = dquot->dq_sb;
859 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
860 struct buffer_head *bh = NULL;
861 int status;
862
863 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
864 ol_dqblk_file_block(sb, od->dq_local_off),
865 &bh);
866 if (status) {
867 mlog_errno(status);
868 goto out;
869 }
870 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
871 olq_set_dquot, od);
872 if (status < 0) {
873 mlog_errno(status);
874 goto out;
875 }
876out:
877 brelse(bh);
878 return status;
879}
880
881/* Find free entry in local quota file */
882static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
883 int type,
884 int *offset)
885{
886 struct mem_dqinfo *info = sb_dqinfo(sb, type);
887 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
888 struct ocfs2_quota_chunk *chunk;
889 struct ocfs2_local_disk_chunk *dchunk;
890 int found = 0, len;
891
892 list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
893 dchunk = (struct ocfs2_local_disk_chunk *)
894 chunk->qc_headerbh->b_data;
895 if (le32_to_cpu(dchunk->dqc_free) > 0) {
896 found = 1;
897 break;
898 }
899 }
900 if (!found)
901 return NULL;
902
903 if (chunk->qc_num < oinfo->dqi_chunks - 1) {
904 len = ol_chunk_entries(sb);
905 } else {
906 len = (oinfo->dqi_blocks -
907 ol_quota_chunk_block(sb, chunk->qc_num) - 1)
908 * ol_quota_entries_per_block(sb);
909 }
910
911 found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
912 /* We failed? */
913 if (found == len) {
914 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
915 " entries free (type=%d)\n", chunk->qc_num,
916 le32_to_cpu(dchunk->dqc_free), type);
917 return ERR_PTR(-EIO);
918 }
919 *offset = found;
920 return chunk;
921}
922
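/*
 * Illustrative sketch, not part of the patch: a minimal user-space
 * model of the free-entry scan above.  Each chunk header keeps a free
 * counter plus a bitmap; finding a slot is "first chunk with
 * dqc_free > 0, then first zero bit in its bitmap".  toy_* names and
 * sizes are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_chunk {
	uint32_t free;		/* models dchunk->dqc_free */
	uint8_t  bitmap[16];	/* models dchunk->dqc_bitmap, 128 entries */
};

static int toy_find_free(const struct toy_chunk *c, int entries)
{
	int i;

	if (!c->free)
		return -1;	/* caller moves on to the next chunk */
	for (i = 0; i < entries; i++)
		if (!(c->bitmap[i / 8] & (1 << (i % 8))))
			return i;
	return -1;		/* free count lied: corruption in ocfs2 */
}

int main(void)
{
	struct toy_chunk c = { .free = 126 };

	memset(c.bitmap, 0, sizeof(c.bitmap));
	c.bitmap[0] = 0x03;	/* entries 0 and 1 already used */
	printf("first free entry: %d\n", toy_find_free(&c, 128));	/* 2 */
	return 0;
}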
923/* Add new chunk to the local quota file */
924static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
925 struct super_block *sb,
926 int type,
927 int *offset)
928{
929 struct mem_dqinfo *info = sb_dqinfo(sb, type);
930 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
931 struct inode *lqinode = sb_dqopt(sb)->files[type];
932 struct ocfs2_quota_chunk *chunk = NULL;
933 struct ocfs2_local_disk_chunk *dchunk;
934 int status;
935 handle_t *handle;
936 struct buffer_head *bh = NULL;
937 u64 p_blkno;
938
939 /* We are protected by dqio_sem so no locking needed */
940 status = ocfs2_extend_no_holes(lqinode,
941 lqinode->i_size + 2 * sb->s_blocksize,
942 lqinode->i_size);
943 if (status < 0) {
944 mlog_errno(status);
945 goto out;
946 }
947 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
948 lqinode->i_size + 2 * sb->s_blocksize);
949 if (status < 0) {
950 mlog_errno(status);
951 goto out;
952 }
953
954 chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
955 if (!chunk) {
956 status = -ENOMEM;
957 mlog_errno(status);
958 goto out;
959 }
960
961 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
962 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
963 &p_blkno, NULL, NULL);
964 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
965 if (status < 0) {
966 mlog_errno(status);
967 goto out;
968 }
969 bh = sb_getblk(sb, p_blkno);
970 if (!bh) {
971 status = -ENOMEM;
972 mlog_errno(status);
973 goto out;
974 }
975 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
976
977 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
978 if (IS_ERR(handle)) {
979 status = PTR_ERR(handle);
980 mlog_errno(status);
981 goto out;
982 }
983
984 status = ocfs2_journal_access_dq(handle, lqinode, bh,
985 OCFS2_JOURNAL_ACCESS_WRITE);
986 if (status < 0) {
987 mlog_errno(status);
988 goto out_trans;
989 }
990 lock_buffer(bh);
991 dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
992 memset(dchunk->dqc_bitmap, 0,
993 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
994 OCFS2_QBLK_RESERVED_SPACE);
995 set_buffer_uptodate(bh);
996 unlock_buffer(bh);
997 status = ocfs2_journal_dirty(handle, bh);
998 if (status < 0) {
999 mlog_errno(status);
1000 goto out_trans;
1001 }
1002
1003 oinfo->dqi_blocks += 2;
1004 oinfo->dqi_chunks++;
1005 status = ocfs2_local_write_info(sb, type);
1006 if (status < 0) {
1007 mlog_errno(status);
1008 goto out_trans;
1009 }
1010 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
1011 if (status < 0) {
1012 mlog_errno(status);
1013 goto out;
1014 }
1015
1016 list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
1017 chunk->qc_num = list_entry(chunk->qc_chunk.prev,
1018 struct ocfs2_quota_chunk,
1019 qc_chunk)->qc_num + 1;
1020 chunk->qc_headerbh = bh;
1021 *offset = 0;
1022 return chunk;
1023out_trans:
1024 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1025out:
1026 brelse(bh);
1027 kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
1028 return ERR_PTR(status);
1029}
1030
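/*
 * Illustrative sketch, not part of the patch: the chunk geometry behind
 * ocfs2_local_quota_add_chunk() and the extend path below.  A chunk is
 * one header block (bitmap + free count) followed by data blocks of
 * quota entries, so adding a chunk grows the file by two blocks (the
 * new header plus its first data block, matching dqi_blocks += 2),
 * while extending the last chunk adds one data block.  Stand-alone
 * model; the constant is hypothetical, not the real on-disk value.
 */
#include <stdio.h>

#define TOY_ENTRIES_PER_BLOCK 32   /* models ol_quota_entries_per_block() */

/* Entries usable in a chunk whose data currently spans data_blocks. */
static int toy_chunk_entries(int data_blocks)
{
	return data_blocks * TOY_ENTRIES_PER_BLOCK;
}

int main(void)
{
	int file_blocks = 1;	/* local dqinfo header block */

	file_blocks += 2;	/* add_chunk: +1 chunk header, +1 data block */
	printf("entries after add_chunk: %d\n", toy_chunk_entries(1));
	file_blocks += 1;	/* extend: one more data block */
	printf("entries after extend:   %d\n", toy_chunk_entries(2));
	printf("file blocks:            %d\n", file_blocks);
	return 0;
}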
1031/* Extend the local quota file to make room for a new entry */
1032static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1033 struct super_block *sb,
1034 int type,
1035 int *offset)
1036{
1037 struct mem_dqinfo *info = sb_dqinfo(sb, type);
1038 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
1039 struct ocfs2_quota_chunk *chunk;
1040 struct inode *lqinode = sb_dqopt(sb)->files[type];
1041 struct ocfs2_local_disk_chunk *dchunk;
1042 int epb = ol_quota_entries_per_block(sb);
1043 unsigned int chunk_blocks;
1044 int status;
1045 handle_t *handle;
1046
1047 if (list_empty(&oinfo->dqi_chunk))
1048 return ocfs2_local_quota_add_chunk(sb, type, offset);
1049 /* Is the last chunk full? */
1050 chunk = list_entry(oinfo->dqi_chunk.prev,
1051 struct ocfs2_quota_chunk, qc_chunk);
1052 chunk_blocks = oinfo->dqi_blocks -
1053 ol_quota_chunk_block(sb, chunk->qc_num) - 1;
1054 if (ol_chunk_blocks(sb) == chunk_blocks)
1055 return ocfs2_local_quota_add_chunk(sb, type, offset);
1056
1057 /* We are protected by dqio_sem so no locking needed */
1058 status = ocfs2_extend_no_holes(lqinode,
1059 lqinode->i_size + sb->s_blocksize,
1060 lqinode->i_size);
1061 if (status < 0) {
1062 mlog_errno(status);
1063 goto out;
1064 }
1065 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
1066 lqinode->i_size + sb->s_blocksize);
1067 if (status < 0) {
1068 mlog_errno(status);
1069 goto out;
1070 }
1071 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
1072 if (IS_ERR(handle)) {
1073 status = PTR_ERR(handle);
1074 mlog_errno(status);
1075 goto out;
1076 }
1077 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
1078 OCFS2_JOURNAL_ACCESS_WRITE);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto out_trans;
1082 }
1083
1084 dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
1085 lock_buffer(chunk->qc_headerbh);
1086 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1087 unlock_buffer(chunk->qc_headerbh);
1088 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1089 if (status < 0) {
1090 mlog_errno(status);
1091 goto out_trans;
1092 }
1093 oinfo->dqi_blocks++;
1094 status = ocfs2_local_write_info(sb, type);
1095 if (status < 0) {
1096 mlog_errno(status);
1097 goto out_trans;
1098 }
1099
1100 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
1101 if (status < 0) {
1102 mlog_errno(status);
1103 goto out;
1104 }
1105 *offset = chunk_blocks * epb;
1106 return chunk;
1107out_trans:
1108 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1109out:
1110 return ERR_PTR(status);
1111}
1112
1113static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1114{
1115 int *offset = private;
1116 struct ocfs2_local_disk_chunk *dchunk;
1117
1118 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
1119 ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
1120 le32_add_cpu(&dchunk->dqc_free, -1);
1121}
1122
1123/* Create a dquot in the local file for the given id */
1124static int ocfs2_create_local_dquot(struct dquot *dquot)
1125{
1126 struct super_block *sb = dquot->dq_sb;
1127 int type = dquot->dq_type;
1128 struct inode *lqinode = sb_dqopt(sb)->files[type];
1129 struct ocfs2_quota_chunk *chunk;
1130 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1131 int offset;
1132 int status;
1133
1134 chunk = ocfs2_find_free_entry(sb, type, &offset);
1135 if (!chunk) {
1136 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1137 if (IS_ERR(chunk))
1138 return PTR_ERR(chunk);
1139 } else if (IS_ERR(chunk)) {
1140 return PTR_ERR(chunk);
1141 }
1142 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1143 od->dq_chunk = chunk;
1144
1145 /* Initialize dquot structure on disk */
1146 status = ocfs2_local_write_dquot(dquot);
1147 if (status < 0) {
1148 mlog_errno(status);
1149 goto out;
1150 }
1151
1152 /* Mark structure as allocated */
1153 status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
1154 &offset);
1155 if (status < 0) {
1156 mlog_errno(status);
1157 goto out;
1158 }
1159out:
1160 return status;
1161}
1162
1163/* Create entry in local file for dquot, load data from the global file */
1164static int ocfs2_local_read_dquot(struct dquot *dquot)
1165{
1166 int status;
1167
1168 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1169
1170 status = ocfs2_global_read_dquot(dquot);
1171 if (status < 0) {
1172 mlog_errno(status);
1173 goto out_err;
1174 }
1175
1176 /* Now create entry in the local quota file */
1177 status = ocfs2_create_local_dquot(dquot);
1178 if (status < 0) {
1179 mlog_errno(status);
1180 goto out_err;
1181 }
1182 mlog_exit(0);
1183 return 0;
1184out_err:
1185 mlog_exit(status);
1186 return status;
1187}
1188
1189/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1190 * already started a transaction and obtained an exclusive lock on the
1191 * global quota file. */
1192static int ocfs2_local_release_dquot(struct dquot *dquot)
1193{
1194 int status;
1195 int type = dquot->dq_type;
1196 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1197 struct super_block *sb = dquot->dq_sb;
1198 struct ocfs2_local_disk_chunk *dchunk;
1199 int offset;
1200 handle_t *handle = journal_current_handle();
1201
1202 BUG_ON(!handle);
1203 /* First write all local changes to global file */
1204 status = ocfs2_global_release_dquot(dquot);
1205 if (status < 0) {
1206 mlog_errno(status);
1207 goto out;
1208 }
1209
1210 status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
1211 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
1212 if (status < 0) {
1213 mlog_errno(status);
1214 goto out;
1215 }
1216 offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
1217 od->dq_local_off);
1218 dchunk = (struct ocfs2_local_disk_chunk *)
1219 (od->dq_chunk->qc_headerbh->b_data);
1220 /* Mark structure as freed */
1221 lock_buffer(od->dq_chunk->qc_headerbh);
1222 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1223 le32_add_cpu(&dchunk->dqc_free, 1);
1224 unlock_buffer(od->dq_chunk->qc_headerbh);
1225 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1226 if (status < 0) {
1227 mlog_errno(status);
1228 goto out;
1229 }
1230 status = 0;
1231out:
1232 /* Clear the read bit so that the next time someone uses this
1233 * dquot they read fresh info from disk and allocate a local
1234 * dquot structure */
1235 clear_bit(DQ_READ_B, &dquot->dq_flags);
1236 return status;
1237}
1238
1239static struct quota_format_ops ocfs2_format_ops = {
1240 .check_quota_file = ocfs2_local_check_quota_file,
1241 .read_file_info = ocfs2_local_read_info,
1242 .write_file_info = ocfs2_global_write_info,
1243 .free_file_info = ocfs2_local_free_info,
1244 .read_dqblk = ocfs2_local_read_dquot,
1245 .commit_dqblk = ocfs2_local_write_dquot,
1246 .release_dqblk = ocfs2_local_release_dquot,
1247};
1248
1249struct quota_format_type ocfs2_quota_format = {
1250 .qf_fmt_id = QFMT_OCFS2,
1251 .qf_ops = &ocfs2_format_ops,
1252 .qf_owner = THIS_MODULE
1253};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a7..424adaa5f900 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
 		   new_clusters, first_new_cluster);
 
-	ret = ocfs2_journal_access(handle, bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	}
 
 	/* update the inode accordingly. */
-	ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 
 	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
+	/* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+	 * so any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
 	    ocfs2_group_bitmap_size(osb->sb) * 8) {
 		mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 		goto out_unlock;
 	}
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
-		ret = -EIO;
-		goto out_unlock;
-	}
-
 	first_new_cluster = le32_to_cpu(fe->i_clusters);
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 
-	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
 	}
-
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
 
-	ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
 	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
 	    le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
 				 struct buffer_head *group_bh)
 {
 	int ret;
-	struct ocfs2_group_desc *gd;
+	struct ocfs2_group_desc *gd =
+		(struct ocfs2_group_desc *)group_bh->b_data;
 	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
-	unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
-				le16_to_cpu(di->id2.i_chain.cl_bpc);
-
 
-	gd = (struct ocfs2_group_desc *)group_bh->b_data;
+	ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+	if (ret)
+		goto out;
 
-	ret = -EIO;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd))
-		mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno));
-	else if (di->i_blkno != gd->bg_parent_dinode)
-		mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
-		     "pointer (%llu, expected %llu)\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-		     (unsigned long long)le64_to_cpu(di->i_blkno));
-	else if (le16_to_cpu(gd->bg_bits) > max_bits)
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits));
-	else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "claims that %u are free\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     le16_to_cpu(gd->bg_free_bits_count));
-	else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "max bitmap bits of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     8 * le16_to_cpu(gd->bg_size));
-	else if (le16_to_cpu(gd->bg_chain) != input->chain)
+	ret = -EINVAL;
+	if (le16_to_cpu(gd->bg_chain) != input->chain)
 		mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
 		     "while input has %u set.\n",
 		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
 	else
 		ret = 0;
 
+out:
 	return ret;
 }
 
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	cl = &fe->id2.i_chain;
 	cr = &cl->cl_recs[input->chain];
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f8508..40661e7824e9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
 	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-				OCFS2_BH_IGNORE_CACHE);
+				OCFS2_BH_IGNORE_CACHE, NULL);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 
 		bh = NULL; /* Acquire a fresh bh */
 		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE);
+					   OCFS2_BH_IGNORE_CACHE, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
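/*
 * Illustrative sketch, not part of the patch: the extra NULL argument
 * above is the new per-read validation callback taken by
 * ocfs2_read_blocks(); NULL means "no validation" for raw reads like
 * the slot map.  A stand-alone model of the pattern, with hypothetical
 * toy_* names:
 */
#include <stdio.h>

struct toy_buffer { const char *data; };

typedef int (*toy_validate_t)(struct toy_buffer *bh);

static int toy_read_block(struct toy_buffer *bh, toy_validate_t validate)
{
	/* ...real code would fill bh from disk here... */
	if (validate)
		return validate(bh);	/* reject bad metadata early */
	return 0;			/* NULL: caller checks, or doesn't */
}

static int toy_check_signature(struct toy_buffer *bh)
{
	return bh->data[0] == 'G' ? 0 : -22;	/* -EINVAL */
}

int main(void)
{
	struct toy_buffer bh = { .data = "GROUP01" };

	printf("validated read: %d\n", toy_read_block(&bh, toy_check_signature));
	printf("raw read:       %d\n", toy_read_block(&bh, NULL));
	return 0;
}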
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b57..a69628603e18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 }
 
-/* somewhat more expensive than our other checks, so use sparingly. */
-int ocfs2_check_group_descriptor(struct super_block *sb,
-				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd)
+#define do_error(fmt, ...)						\
+	do {								\
+		if (clean_error)					\
+			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
+		else							\
+			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
+	} while (0)
+
+static int ocfs2_validate_gd_self(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int clean_error)
 {
-	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
-		return -EIO;
+		do_error("Group descriptor #%llu has bad signature %.*s",
+			 (unsigned long long)bh->b_blocknr, 7,
+			 gd->bg_signature);
+		return -EINVAL;
 	}
 
+	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+		do_error("Group descriptor #%llu has an invalid bg_blkno "
+			 "of %llu",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+		do_error("Group descriptor #%llu has an invalid "
+			 "fs_generation of #%u",
+			 (unsigned long long)bh->b_blocknr,
+			 le32_to_cpu(gd->bg_generation));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "claims that %u are free",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 le16_to_cpu(gd->bg_free_bits_count));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "max bitmap bits of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 8 * le16_to_cpu(gd->bg_size));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+				    struct ocfs2_dinode *di,
+				    struct buffer_head *bh,
+				    int clean_error)
+{
+	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
 	if (di->i_blkno != gd->bg_parent_dinode) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad parent "
-			    "pointer (%llu, expected %llu)",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-			    (unsigned long long)le64_to_cpu(di->i_blkno));
-		return -EIO;
+		do_error("Group descriptor #%llu has bad parent "
+			 "pointer (%llu, expected %llu)",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+			 (unsigned long long)le64_to_cpu(di->i_blkno));
+		return -EINVAL;
 	}
 
 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits));
-		return -EIO;
+		do_error("Group descriptor #%llu has bit count of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits));
+		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_chain) >=
 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_chain));
-		return -EIO;
+		do_error("Group descriptor #%llu has bad chain %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_chain));
+		return -EINVAL;
 	}
 
-	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "claims that %u are free",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    le16_to_cpu(gd->bg_free_bits_count));
-		return -EIO;
-	}
+	return 0;
+}
 
-	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "max bitmap bits of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    8 * le16_to_cpu(gd->bg_size));
-		return -EIO;
+#undef do_error
+
+/*
+ * This version only prints errors.  It does not fail the filesystem, and
+ * exists only for resize.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc) {
+		mlog(ML_ERROR,
+		     "Checksum failed for group descriptor %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+	} else
+		rc = ocfs2_validate_gd_self(sb, bh, 1);
+	if (!rc)
+		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+
+	return rc;
+}
+
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+					   struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	mlog(0, "Validating group descriptor %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	return ocfs2_validate_gd_self(sb, bh, 0);
+}
+
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+			      ocfs2_validate_group_descriptor);
+	if (rc)
+		goto out;
+
+	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+	if (rc) {
+		brelse(tmp);
+		goto out;
 	}
 
-	return 0;
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc;
 }
 
 static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      bg_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 bg_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 
 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	status = ocfs2_journal_access(handle, alloc_inode,
-				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode,
+					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	ac->ac_alloc_slot = slot;
 
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read inside
+	 * ocfs2_inode_lock().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
 	int offset, start, found, status = 0;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
-		return -EIO;
-	}
+	/* Callers got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 	found = start = best_offset = best_size = 0;
 	bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* All callers get the descriptor via
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 
 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 group_bh,
+					 journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
-		status = -EIO;
-		goto out;
-	}
+	/* The caller got these descriptors from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
 
 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
-	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
 		bg->bg_next_group = cpu_to_le64(bg_ptr);
 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
 	}
-out:
+
 	mlog_exit(status);
 	return status;
 }
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	u16 found;
 	struct buffer_head *group_bh = NULL;
 	struct ocfs2_group_desc *gd;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
-		ret = -EIO;
-		goto out;
-	}
-
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
 				  ac->ac_max_block, bit_off, &found);
 	if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	     bits_wanted, chain,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 
-	status = ocfs2_read_block(alloc_inode,
-				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe,
+					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
+					     &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
 
 	status = -ENOSPC;
 	/* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		group_bh = NULL;
-		status = ocfs2_read_block(alloc_inode,
-					  next_group, &group_bh);
+		status = ocfs2_read_group_descriptor(alloc_inode, fe,
+						     next_group, &group_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
-		status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-		if (status) {
-			mlog_errno(status);
-			goto bail;
-		}
 	}
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 
 	/* Ok, claim our bits now: set the info on dinode, chainlist
 	 * and then the group */
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      ac->ac_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle,
+					 alloc_inode,
+					 ac->ac_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 	BUG_ON(!ac->ac_bh);
 
 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read during
+	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* The caller got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
+					 journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+	/* The alloc_bh comes from ocfs2_free_dinode() or
+	 * ocfs2_free_clusters().  The callers have all locked the
+	 * allocator and gotten alloc_bh from the lock call.  This
+	 * validates the dinode buffer.  Any corruption that has happened
+	 * is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
 
 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 
-	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+					     &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-
 	group = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
+
 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
 
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
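/*
 * Illustrative sketch, not part of the patch: the
 * ocfs2_journal_access_di()/_gd() conversions throughout this file give
 * the journal a per-metadata-type hook -- in this series, plausibly so
 * block check data (see the blockcheck.h include above) can be
 * recomputed per type before a buffer hits disk.  A stand-alone model
 * of "typed access" dispatch; toy_* names are hypothetical.
 */
#include <stdio.h>

enum toy_block_type { TOY_DINODE, TOY_GROUP_DESC };

/* One place per type to recompute checksums before the block is logged. */
static void toy_journal_access(enum toy_block_type type, const char *who)
{
	switch (type) {
	case TOY_DINODE:
		printf("%s: will re-checksum as dinode\n", who);
		break;
	case TOY_GROUP_DESC:
		printf("%s: will re-checksum as group descriptor\n", who);
		break;
	}
}

#define toy_journal_access_di(who) toy_journal_access(TOY_DINODE, who)
#define toy_journal_access_gd(who) toy_journal_access(TOY_GROUP_DESC, who)

int main(void)
{
	toy_journal_access_di("ocfs2_alloc_dinode_update_counts");
	toy_journal_access_gd("ocfs2_block_group_set_bits");
	return 0;
}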
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f450..e3c13c77f9e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
  * and return that block offset. */
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 
-/* somewhat more expensive than our other checks, so use sparingly. */
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem.  A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then check it with this function.  In practice, only resize does
+ * this.  Everyone else should be using ocfs2_read_group_descriptor().
+ */
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd);
+				 struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh);
+
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
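/*
 * Illustrative sketch, not part of the patch: typical use of the
 * ocfs2_read_group_descriptor() contract declared above -- pass a NULL
 * buffer_head pointer, get back a validated, cached read, and brelse()
 * it when done.  Kernel-style fragment, not compilable on its own; the
 * example_ function name is hypothetical and error handling is trimmed.
 */
static int example_use_group_descriptor(struct inode *alloc_inode,
					struct ocfs2_dinode *di,
					u64 gd_blkno)
{
	struct buffer_head *bh = NULL;	/* NULL: let the read allocate it */
	struct ocfs2_group_desc *gd;
	int ret;

	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, &bh);
	if (ret)
		return ret;	/* descriptor failed validation or I/O */

	gd = (struct ocfs2_group_desc *)bh->b_data;
	/* ... use gd; it has passed self- and parent-consistency checks ... */

	brelse(bh);
	return 0;
}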
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78cf..43ed11345b59 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/quotaops.h>
44 45
45#define MLOG_MASK_PREFIX ML_SUPER 46#define MLOG_MASK_PREFIX ML_SUPER
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "ocfs1_fs_compat.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "export.h"
 #include "extent_map.h"
@@ -65,10 +67,13 @@
 #include "uptodate.h"
 #include "ver.h"
 #include "xattr.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
 static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
 
 /* OCFS2 needs to schedule several different types of work which
  * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
 static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
 
 static const struct super_operations ocfs2_sops = {
 	.statfs		= ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
 	.show_options	= ocfs2_show_options,
+	.quota_read	= ocfs2_quota_read,
+	.quota_write	= ocfs2_quota_write,
 };
 
 enum {
@@ -158,6 +168,10 @@ enum {
 	Opt_user_xattr,
 	Opt_nouser_xattr,
 	Opt_inode64,
+	Opt_acl,
+	Opt_noacl,
+	Opt_usrquota,
+	Opt_grpquota,
 	Opt_err,
 };
 
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_inode64, "inode64"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_err, NULL}
 };
 
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+	    && (ino == USER_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+		return 0;
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+	    && (ino == GROUP_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+		return 0;
+	return 1;
+}
+
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 {
 	struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 
 	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
 	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
 	     i < NUM_SYSTEM_INODES;
 	     i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		/* Disable quota accounting before remounting RO */
+		if (*flags & MS_RDONLY) {
+			ret = ocfs2_susp_quotas(osb, 0);
+			if (ret < 0)
+				goto out;
+		}
 		/* Lock here so the check of HARD_RO and the potential
 		 * setting of SOFT_RO is atomic. */
 		spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		}
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
+		/* Enable quota accounting after remounting RW */
+		if (!ret && !(*flags & MS_RDONLY)) {
+			if (sb_any_quota_suspended(sb))
+				ret = ocfs2_susp_quotas(osb, 1);
+			else
+				ret = ocfs2_enable_quotas(osb);
+			if (ret < 0) {
+				/* Return back changes... */
+				spin_lock(&osb->osb_lock);
+				sb->s_flags |= MS_RDONLY;
+				osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+				spin_unlock(&osb->osb_lock);
+				goto out;
+			}
+		}
 	}
 
 	if (!ret) {
 		/* Only save off the new mount options in case of a successful
 		 * remount. */
+		if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+			parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
 		osb->s_mount_opt = parsed_options.mount_opt;
 		osb->s_atime_quantum = parsed_options.atime_quantum;
 		osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
619 return 0; 677 return 0;
620} 678}
621 679
680static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
681{
682 int type;
683 struct super_block *sb = osb->sb;
684 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
686 int status = 0;
687
688 for (type = 0; type < MAXQUOTAS; type++) {
689 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
690 continue;
691 if (unsuspend)
692 status = vfs_quota_enable(
693 sb_dqopt(sb)->files[type],
694 type, QFMT_OCFS2,
695 DQUOT_SUSPENDED);
696 else
697 status = vfs_quota_disable(sb, type,
698 DQUOT_SUSPENDED);
699 if (status < 0)
700 break;
701 }
702 if (status < 0)
703 mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
704 "remount (error = %d).\n", status);
705 return status;
706}
707
708static int ocfs2_enable_quotas(struct ocfs2_super *osb)
709{
710 struct inode *inode[MAXQUOTAS] = { NULL, NULL };
711 struct super_block *sb = osb->sb;
712 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
713 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
714 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
715 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
716 int status;
717 int type;
718
719 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
720 for (type = 0; type < MAXQUOTAS; type++) {
721 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
722 continue;
723 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
724 osb->slot_num);
725 if (!inode[type]) {
726 status = -ENOENT;
727 goto out_quota_off;
728 }
729 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
730 DQUOT_USAGE_ENABLED);
731 if (status < 0)
732 goto out_quota_off;
733 }
734
735 for (type = 0; type < MAXQUOTAS; type++)
736 iput(inode[type]);
737 return 0;
738out_quota_off:
739 ocfs2_disable_quotas(osb);
740 for (type = 0; type < MAXQUOTAS; type++)
741 iput(inode[type]);
742 mlog_errno(status);
743 return status;
744}
745
746static void ocfs2_disable_quotas(struct ocfs2_super *osb)
747{
748 int type;
749 struct inode *inode;
750 struct super_block *sb = osb->sb;
751
752 /* We mostly ignore errors in this function because there's not much
753 * we can do when we see them */
754 for (type = 0; type < MAXQUOTAS; type++) {
755 if (!sb_has_quota_loaded(sb, type))
756 continue;
757 inode = igrab(sb->s_dquot.files[type]);
758 /* Turn off quotas. This will remove all dquot structures from
759 * memory and so they will be automatically synced to global
760 * quota files */
761 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
762 DQUOT_LIMITS_ENABLED);
763 if (!inode)
764 continue;
765 iput(inode);
766 }
767}
768
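The three helpers above split quota state into usage tracking (tied to the mount), limit enforcement (toggled by quotactl), and a suspended state used across remounts. A condensed view of which flags each path flips, written as an illustrative comment rather than code from the patch:

/*
 * Illustrative summary (not from the patch):
 *
 *   mount rw            vfs_quota_enable(inode, type, QFMT_OCFS2,
 *                                        DQUOT_USAGE_ENABLED)
 *   quotactl Q_QUOTAON  vfs_quota_enable(..., DQUOT_LIMITS_ENABLED)
 *   remount ro          vfs_quota_disable(sb, type, DQUOT_SUSPENDED)
 *   remount rw          vfs_quota_enable(..., DQUOT_SUSPENDED)
 *   umount              vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
 *                                         DQUOT_LIMITS_ENABLED)
 */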
769/* Handle quota on quotactl */
770static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
771 char *path, int remount)
772{
773 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
774 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
775
776 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
777 return -EINVAL;
778
779 if (remount)
780 return 0; /* Just ignore it; it has been handled in
781 * ocfs2_remount() */
782 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
783 format_id, DQUOT_LIMITS_ENABLED);
784}
785
786/* Handle quota off quotactl */
787static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
788{
789 if (remount)
790 return 0; /* Ignore now and handle later in
791 * ocfs2_remount() */
792 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
793}
794
795static struct quotactl_ops ocfs2_quotactl_ops = {
796 .quota_on = ocfs2_quota_on,
797 .quota_off = ocfs2_quota_off,
798 .quota_sync = vfs_quota_sync,
799 .get_info = vfs_get_dqinfo,
800 .set_info = vfs_set_dqinfo,
801 .get_dqblk = vfs_get_dqblk,
802 .set_dqblk = vfs_set_dqblk,
803};
804
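The quotactl_ops table above wires ocfs2 into the quotactl(2) syscall. Because the quota files are system inodes, quota_on and quota_off only toggle limit enforcement (DQUOT_LIMITS_ENABLED); usage tracking stays bound to the mount life cycle shown earlier. A user-space sketch, assuming a hypothetical device path and uid:

#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>

int main(void)
{
	struct dqblk dq;

	/* Query usage for uid 1000 on a hypothetical ocfs2 device. */
	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sdb1", 1000,
		     (caddr_t)&dq) < 0) {
		perror("quotactl");
		return 1;
	}
	printf("space used: %llu bytes\n",
	       (unsigned long long)dq.dqb_curspace);
	return 0;
}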
622static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 805static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
623{ 806{
624 struct dentry *root; 807 struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
651 } 834 }
652 brelse(bh); 835 brelse(bh);
653 bh = NULL; 836 bh = NULL;
837
838 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
839 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
840
654 osb->s_mount_opt = parsed_options.mount_opt; 841 osb->s_mount_opt = parsed_options.mount_opt;
655 osb->s_atime_quantum = parsed_options.atime_quantum; 842 osb->s_atime_quantum = parsed_options.atime_quantum;
656 osb->preferred_slot = parsed_options.slot; 843 osb->preferred_slot = parsed_options.slot;
657 osb->osb_commit_interval = parsed_options.commit_interval; 844 osb->osb_commit_interval = parsed_options.commit_interval;
658 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 845 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
659 osb->local_alloc_bits = osb->local_alloc_default_bits; 846 osb->local_alloc_bits = osb->local_alloc_default_bits;
847 if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
848 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
849 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
850 status = -EINVAL;
851 mlog(ML_ERROR, "User quotas were requested, but this "
852 "filesystem does not have the feature enabled.\n");
853 goto read_super_error;
854 }
855 if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
856 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
857 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
858 status = -EINVAL;
859 mlog(ML_ERROR, "Group quotas were requested, but this "
860 "filesystem does not have the feature enabled.\n");
861 goto read_super_error;
862 }
660 863
661 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 864 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
662 if (status) 865 if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
664 867
665 sb->s_magic = OCFS2_SUPER_MAGIC; 868 sb->s_magic = OCFS2_SUPER_MAGIC;
666 869
870 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
871 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
872
667 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 873 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
668 * heartbeat=none */ 874 * heartbeat=none */
669 if (bdev_read_only(sb->s_bdev)) { 875 if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
758 atomic_set(&osb->vol_state, VOLUME_MOUNTED); 964 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
759 wake_up(&osb->osb_mount_event); 965 wake_up(&osb->osb_mount_event);
760 966
967 /* Now we can initialize quotas because we can afford to wait
968 * for cluster lock recovery now. That also means that truncation
969 * log recovery can happen, but it waits for proper quota setup */
970 if (!(sb->s_flags & MS_RDONLY)) {
971 status = ocfs2_enable_quotas(osb);
972 if (status < 0) {
973 /* We have to err-out specially here because
974 * s_root is already set */
975 mlog_errno(status);
976 atomic_set(&osb->vol_state, VOLUME_DISABLED);
977 wake_up(&osb->osb_mount_event);
978 mlog_exit(status);
979 return status;
980 }
981 }
982
983 ocfs2_complete_quota_recovery(osb);
984
985 /* Now we wake up again for processes waiting for quotas */
986 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
987 wake_up(&osb->osb_mount_event);
988
761 mlog_exit(status); 989 mlog_exit(status);
762 return status; 990 return status;
763 991
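Mount readiness is now signalled in two stages: VOLUME_MOUNTED wakes waiters that do not need quotas, and VOLUME_MOUNTED_QUOTAS (or VOLUME_DISABLED on failure) wakes those that do. An illustrative waiter, not part of this patch, would block on the second stage roughly like this:

/* Illustrative sketch only: wait until quotas are usable or the
 * mount has been disabled. */
wait_event(osb->osb_mount_event,
	   atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
	   atomic_read(&osb->vol_state) == VOLUME_DISABLED);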
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
945 case Opt_inode64: 1173 case Opt_inode64:
946 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1174 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
947 break; 1175 break;
1176 case Opt_usrquota:
1177 /* We check only on remount; otherwise the features
1178 * aren't initialized yet. */
1179 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1180 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1181 mlog(ML_ERROR, "User quota requested but "
1182 "filesystem feature is not set\n");
1183 status = 0;
1184 goto bail;
1185 }
1186 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1187 break;
1188 case Opt_grpquota:
1189 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1190 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1191 mlog(ML_ERROR, "Group quota requested but "
1192 "filesystem feature is not set\n");
1193 status = 0;
1194 goto bail;
1195 }
1196 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1197 break;
1198#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1199 case Opt_acl:
1200 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1201 break;
1202 case Opt_noacl:
1203 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1204 break;
1205#else
1206 case Opt_acl:
1207 case Opt_noacl:
1208 printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
1209 break;
1210#endif
948 default: 1211 default:
949 mlog(ML_ERROR, 1212 mlog(ML_ERROR,
950 "Unrecognized mount option \"%s\" " 1213 "Unrecognized mount option \"%s\" "
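On remount, usrquota and grpquota are rejected unless the matching RO-compat feature is set on disk, and acl/noacl are honoured only when CONFIG_OCFS2_FS_POSIX_ACL is built in. A hypothetical initial mount passing the new options (device and mount point are illustrative, requires root):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt", "ocfs2", 0,
		  "usrquota,grpquota,acl") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}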
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1008 if (osb->osb_cluster_stack[0]) 1271 if (osb->osb_cluster_stack[0])
1009 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1272 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
1010 osb->osb_cluster_stack); 1273 osb->osb_cluster_stack);
1274 if (opts & OCFS2_MOUNT_USRQUOTA)
1275 seq_printf(s, ",usrquota");
1276 if (opts & OCFS2_MOUNT_GRPQUOTA)
1277 seq_printf(s, ",grpquota");
1011 1278
1012 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1279 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1013 seq_printf(s, ",nouser_xattr"); 1280 seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1017 if (opts & OCFS2_MOUNT_INODE64) 1284 if (opts & OCFS2_MOUNT_INODE64)
1018 seq_printf(s, ",inode64"); 1285 seq_printf(s, ",inode64");
1019 1286
1287#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1288 if (opts & OCFS2_MOUNT_POSIX_ACL)
1289 seq_printf(s, ",acl");
1290 else
1291 seq_printf(s, ",noacl");
1292#endif
1293
1020 return 0; 1294 return 0;
1021} 1295}
1022 1296
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
1052 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1326 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1053 } 1327 }
1054 1328
1329 status = ocfs2_quota_setup();
1330 if (status)
1331 goto leave;
1332
1055 ocfs2_set_locking_protocol(); 1333 ocfs2_set_locking_protocol();
1056 1334
1335 status = register_quota_format(&ocfs2_quota_format);
1057leave: 1336leave:
1058 if (status < 0) { 1337 if (status < 0) {
1338 ocfs2_quota_shutdown();
1059 ocfs2_free_mem_caches(); 1339 ocfs2_free_mem_caches();
1060 exit_ocfs2_uptodate_cache(); 1340 exit_ocfs2_uptodate_cache();
1061 } 1341 }
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
1072{ 1352{
1073 mlog_entry_void(); 1353 mlog_entry_void();
1074 1354
1355 ocfs2_quota_shutdown();
1356
1075 if (ocfs2_wq) { 1357 if (ocfs2_wq) {
1076 flush_workqueue(ocfs2_wq); 1358 flush_workqueue(ocfs2_wq);
1077 destroy_workqueue(ocfs2_wq); 1359 destroy_workqueue(ocfs2_wq);
1078 } 1360 }
1079 1361
1362 unregister_quota_format(&ocfs2_quota_format);
1363
1080 debugfs_remove(ocfs2_debugfs_root); 1364 debugfs_remove(ocfs2_debugfs_root);
1081 1365
1082 ocfs2_free_mem_caches(); 1366 ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
1192 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1476 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1193 SLAB_MEM_SPREAD), 1477 SLAB_MEM_SPREAD),
1194 ocfs2_inode_init_once); 1478 ocfs2_inode_init_once);
1195 if (!ocfs2_inode_cachep) 1479 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
1480 sizeof(struct ocfs2_dquot),
1481 0,
1482 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1483 SLAB_MEM_SPREAD),
1484 NULL);
1485 ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
1486 sizeof(struct ocfs2_quota_chunk),
1487 0,
1488 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
1489 NULL);
1490 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
1491 !ocfs2_qf_chunk_cachep) {
1492 if (ocfs2_inode_cachep)
1493 kmem_cache_destroy(ocfs2_inode_cachep);
1494 if (ocfs2_dquot_cachep)
1495 kmem_cache_destroy(ocfs2_dquot_cachep);
1496 if (ocfs2_qf_chunk_cachep)
1497 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1196 return -ENOMEM; 1498 return -ENOMEM;
1499 }
1197 1500
1198 return 0; 1501 return 0;
1199} 1502}
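The two new slab caches back the quota code's per-dquot and per-chunk allocations. An illustrative consumer follows; the function names here are hypothetical, as the real allocation sites live in the quota files:

/* Sketch: typical use of the new caches. GFP_NOFS because quota
 * allocations can happen with filesystem locks held. */
static struct ocfs2_dquot *ocfs2_example_alloc_dquot(void)
{
	return kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
}

static void ocfs2_example_free_dquot(struct ocfs2_dquot *od)
{
	kmem_cache_free(ocfs2_dquot_cachep, od);
}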
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
1202{ 1505{
1203 if (ocfs2_inode_cachep) 1506 if (ocfs2_inode_cachep)
1204 kmem_cache_destroy(ocfs2_inode_cachep); 1507 kmem_cache_destroy(ocfs2_inode_cachep);
1205
1206 ocfs2_inode_cachep = NULL; 1508 ocfs2_inode_cachep = NULL;
1509
1510 if (ocfs2_dquot_cachep)
1511 kmem_cache_destroy(ocfs2_dquot_cachep);
1512 ocfs2_dquot_cachep = NULL;
1513
1514 if (ocfs2_qf_chunk_cachep)
1515 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1516 ocfs2_qf_chunk_cachep = NULL;
1207} 1517}
1208 1518
1209static int ocfs2_get_sector(struct super_block *sb, 1519static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1303 osb = OCFS2_SB(sb); 1613 osb = OCFS2_SB(sb);
1304 BUG_ON(!osb); 1614 BUG_ON(!osb);
1305 1615
1616 ocfs2_disable_quotas(osb);
1617
1306 ocfs2_shutdown_local_alloc(osb); 1618 ocfs2_shutdown_local_alloc(osb);
1307 1619
1308 ocfs2_truncate_log_shutdown(osb); 1620 ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
1413 sb->s_fs_info = osb; 1725 sb->s_fs_info = osb;
1414 sb->s_op = &ocfs2_sops; 1726 sb->s_op = &ocfs2_sops;
1415 sb->s_export_op = &ocfs2_export_ops; 1727 sb->s_export_op = &ocfs2_export_ops;
1728 sb->s_qcop = &ocfs2_quotactl_ops;
1729 sb->dq_op = &ocfs2_quota_operations;
1416 sb->s_xattr = ocfs2_xattr_handlers; 1730 sb->s_xattr = ocfs2_xattr_handlers;
1417 sb->s_time_gran = 1; 1731 sb->s_time_gran = 1;
1418 sb->s_flags |= MS_NOATIME; 1732 sb->s_flags |= MS_NOATIME;
@@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1676 1990
1677 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 1991 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
1678 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 1992 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
1993 /* We have to do a raw check of the feature here */
1994 if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
1995 OCFS2_FEATURE_INCOMPAT_META_ECC) {
1996 status = ocfs2_block_check_validate(bh->b_data,
1997 bh->b_size,
1998 &di->i_check);
1999 if (status)
2000 goto out;
2001 }
1679 status = -EINVAL; 2002 status = -EINVAL;
1680 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { 2003 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
1681 mlog(ML_ERROR, "found superblock with incorrect block " 2004 mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1717 } 2040 }
1718 } 2041 }
1719 2042
2043out:
1720 mlog_exit(status); 2044 mlog_exit(status);
1721 return status; 2045 return status;
1722} 2046}
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b9..ed0a0cfd68d2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
84 84
85 mlog_entry_void(); 85 mlog_entry_void();
86 86
87 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); 87 status = ocfs2_read_inode_block(inode, bh);
88 if (status < 0) { 88 if (status < 0) {
89 mlog_errno(status); 89 mlog_errno(status);
90 link = ERR_PTR(status); 90 link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade13..e1d638af6ac3 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/security.h>
38 39
39#define MLOG_MASK_PREFIX ML_XATTR 40#define MLOG_MASK_PREFIX ML_XATTR
40#include <cluster/masklog.h> 41#include <cluster/masklog.h>
41 42
42#include "ocfs2.h" 43#include "ocfs2.h"
43#include "alloc.h" 44#include "alloc.h"
45#include "blockcheck.h"
44#include "dlmglue.h" 46#include "dlmglue.h"
45#include "file.h" 47#include "file.h"
46#include "symlink.h" 48#include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
61}; 63};
62 64
63struct ocfs2_xattr_bucket { 65struct ocfs2_xattr_bucket {
64 struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; 66 /* The inode these xattrs are associated with */
65 struct ocfs2_xattr_header *xh; 67 struct inode *bu_inode;
68
69 /* The actual buffers that make up the bucket */
70 struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
71
72 /* How many blocks make up one bucket for this filesystem */
73 int bu_blocks;
74};
75
76struct ocfs2_xattr_set_ctxt {
77 handle_t *handle;
78 struct ocfs2_alloc_context *meta_ac;
79 struct ocfs2_alloc_context *data_ac;
80 struct ocfs2_cached_dealloc_ctxt dealloc;
66}; 81};
67 82
68#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 83#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
69#define OCFS2_XATTR_INLINE_SIZE 80 84#define OCFS2_XATTR_INLINE_SIZE 80
85#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
86 - sizeof(struct ocfs2_xattr_header) \
87 - sizeof(__u32))
88#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
89 - sizeof(struct ocfs2_xattr_block) \
90 - sizeof(struct ocfs2_xattr_header) \
91 - sizeof(__u32))
70 92
71static struct ocfs2_xattr_def_value_root def_xv = { 93static struct ocfs2_xattr_def_value_root def_xv = {
72 .xv.xr_list.l_count = cpu_to_le16(1), 94 .xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
74 96
75struct xattr_handler *ocfs2_xattr_handlers[] = { 97struct xattr_handler *ocfs2_xattr_handlers[] = {
76 &ocfs2_xattr_user_handler, 98 &ocfs2_xattr_user_handler,
99#ifdef CONFIG_OCFS2_FS_POSIX_ACL
100 &ocfs2_xattr_acl_access_handler,
101 &ocfs2_xattr_acl_default_handler,
102#endif
77 &ocfs2_xattr_trusted_handler, 103 &ocfs2_xattr_trusted_handler,
104 &ocfs2_xattr_security_handler,
78 NULL 105 NULL
79}; 106};
80 107
81static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
82 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
110#ifdef CONFIG_OCFS2_FS_POSIX_ACL
111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
112 = &ocfs2_xattr_acl_access_handler,
113 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
114 = &ocfs2_xattr_acl_default_handler,
115#endif
83 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 116 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
117 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
84}; 118};
85 119
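With the security handler always registered, and the ACL handlers registered under CONFIG_OCFS2_FS_POSIX_ACL, the generic xattr syscalls now resolve these namespaces on ocfs2. A user-space sketch (the path and attribute name are illustrative):

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	ssize_t len;

	/* "security.*" names are routed to ocfs2_xattr_security_handler. */
	len = getxattr("/mnt/file", "security.selinux", buf, sizeof(buf));
	if (len < 0)
		perror("getxattr");
	else
		printf("%zd bytes of attribute data\n", len);
	return 0;
}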
86struct ocfs2_xattr_info { 120struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
98 */ 132 */
99 struct buffer_head *xattr_bh; 133 struct buffer_head *xattr_bh;
100 struct ocfs2_xattr_header *header; 134 struct ocfs2_xattr_header *header;
101 struct ocfs2_xattr_bucket bucket; 135 struct ocfs2_xattr_bucket *bucket;
102 void *base; 136 void *base;
103 void *end; 137 void *end;
104 struct ocfs2_xattr_entry *here; 138 struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
127 size_t buffer_size); 161 size_t buffer_size);
128 162
129static int ocfs2_xattr_create_index_block(struct inode *inode, 163static int ocfs2_xattr_create_index_block(struct inode *inode,
130 struct ocfs2_xattr_search *xs); 164 struct ocfs2_xattr_search *xs,
165 struct ocfs2_xattr_set_ctxt *ctxt);
131 166
132static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 167static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
133 struct ocfs2_xattr_info *xi, 168 struct ocfs2_xattr_info *xi,
134 struct ocfs2_xattr_search *xs); 169 struct ocfs2_xattr_search *xs,
170 struct ocfs2_xattr_set_ctxt *ctxt);
135 171
136static int ocfs2_delete_xattr_index_block(struct inode *inode, 172static int ocfs2_delete_xattr_index_block(struct inode *inode,
137 struct buffer_head *xb_bh); 173 struct buffer_head *xb_bh);
174static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
175 u64 src_blk, u64 last_blk, u64 to_blk,
176 unsigned int start_bucket,
177 u32 *first_hash);
138 178
139static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 179static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
140{ 180{
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
154 return len / sizeof(struct ocfs2_xattr_entry); 194 return len / sizeof(struct ocfs2_xattr_entry);
155} 195}
156 196
197#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
198#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
199#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
200
201static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
202{
203 struct ocfs2_xattr_bucket *bucket;
204 int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
205
206 BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
207
208 bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
209 if (bucket) {
210 bucket->bu_inode = inode;
211 bucket->bu_blocks = blks;
212 }
213
214 return bucket;
215}
216
217static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
218{
219 int i;
220
221 for (i = 0; i < bucket->bu_blocks; i++) {
222 brelse(bucket->bu_bhs[i]);
223 bucket->bu_bhs[i] = NULL;
224 }
225}
226
227static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
228{
229 if (bucket) {
230 ocfs2_xattr_bucket_relse(bucket);
231 bucket->bu_inode = NULL;
232 kfree(bucket);
233 }
234}
235
236/*
237 * A bucket that has never been written to disk doesn't need to be
238 * read. We just need the buffer_heads. Don't call this for
239 * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes
240 * them fully.
241 */
242static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
243 u64 xb_blkno)
244{
245 int i, rc = 0;
246
247 for (i = 0; i < bucket->bu_blocks; i++) {
248 bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
249 xb_blkno + i);
250 if (!bucket->bu_bhs[i]) {
251 rc = -EIO;
252 mlog_errno(rc);
253 break;
254 }
255
256 if (!ocfs2_buffer_uptodate(bucket->bu_inode,
257 bucket->bu_bhs[i]))
258 ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
259 bucket->bu_bhs[i]);
260 }
261
262 if (rc)
263 ocfs2_xattr_bucket_relse(bucket);
264 return rc;
265}
266
267/* Read the xattr bucket at xb_blkno */
268static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
269 u64 xb_blkno)
270{
271 int rc;
272
273 rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
274 bucket->bu_blocks, bucket->bu_bhs, 0,
275 NULL);
276 if (!rc) {
277 rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
278 bucket->bu_bhs,
279 bucket->bu_blocks,
280 &bucket_xh(bucket)->xh_check);
281 if (rc)
282 mlog_errno(rc);
283 }
284
285 if (rc)
286 ocfs2_xattr_bucket_relse(bucket);
287 return rc;
288}
289
290static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
291 struct ocfs2_xattr_bucket *bucket,
292 int type)
293{
294 int i, rc = 0;
295
296 for (i = 0; i < bucket->bu_blocks; i++) {
297 rc = ocfs2_journal_access(handle, bucket->bu_inode,
298 bucket->bu_bhs[i], type);
299 if (rc) {
300 mlog_errno(rc);
301 break;
302 }
303 }
304
305 return rc;
306}
307
308static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
309 struct ocfs2_xattr_bucket *bucket)
310{
311 int i;
312
313 ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
314 bucket->bu_bhs, bucket->bu_blocks,
315 &bucket_xh(bucket)->xh_check);
316
317 for (i = 0; i < bucket->bu_blocks; i++)
318 ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
319}
320
321static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
322 struct ocfs2_xattr_bucket *src)
323{
324 int i;
325 int blocksize = src->bu_inode->i_sb->s_blocksize;
326
327 BUG_ON(dest->bu_blocks != src->bu_blocks);
328 BUG_ON(dest->bu_inode != src->bu_inode);
329
330 for (i = 0; i < src->bu_blocks; i++) {
331 memcpy(bucket_block(dest, i), bucket_block(src, i),
332 blocksize);
333 }
334}
335
336static int ocfs2_validate_xattr_block(struct super_block *sb,
337 struct buffer_head *bh)
338{
339 int rc;
340 struct ocfs2_xattr_block *xb =
341 (struct ocfs2_xattr_block *)bh->b_data;
342
343 mlog(0, "Validating xattr block %llu\n",
344 (unsigned long long)bh->b_blocknr);
345
346 BUG_ON(!buffer_uptodate(bh));
347
348 /*
349 * If the ecc fails, we return the error but otherwise
350 * leave the filesystem running. We know any error is
351 * local to this block.
352 */
353 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
354 if (rc)
355 return rc;
356
357 /*
358 * Errors after here are fatal
359 */
360
361 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
362 ocfs2_error(sb,
363 "Extended attribute block #%llu has bad "
364 "signature %.*s",
365 (unsigned long long)bh->b_blocknr, 7,
366 xb->xb_signature);
367 return -EINVAL;
368 }
369
370 if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
371 ocfs2_error(sb,
372 "Extended attribute block #%llu has an "
373 "invalid xb_blkno of %llu",
374 (unsigned long long)bh->b_blocknr,
375 (unsigned long long)le64_to_cpu(xb->xb_blkno));
376 return -EINVAL;
377 }
378
379 if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
380 ocfs2_error(sb,
381 "Extended attribute block #%llu has an invalid "
382 "xb_fs_generation of #%u",
383 (unsigned long long)bh->b_blocknr,
384 le32_to_cpu(xb->xb_fs_generation));
385 return -EINVAL;
386 }
387
388 return 0;
389}
390
391static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
392 struct buffer_head **bh)
393{
394 int rc;
395 struct buffer_head *tmp = *bh;
396
397 rc = ocfs2_read_block(inode, xb_blkno, &tmp,
398 ocfs2_validate_xattr_block);
399
400 /* If ocfs2_read_block() got us a new bh, pass it up. */
401 if (!rc && !*bh)
402 *bh = tmp;
403
404 return rc;
405}
406
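Together these helpers give buckets a uniform lifecycle: allocate, read (with metaecc validation), take journal access on every block, modify, dirty, release. An illustrative sketch built only from the calls above ('handle' and 'blkno' are assumed to come from the caller):

/* Illustrative sketch only: a read-modify-write cycle built from the
 * bucket helpers above. */
static int ocfs2_example_touch_bucket(struct inode *inode, handle_t *handle,
				      u64 blkno)
{
	int ret;
	struct ocfs2_xattr_bucket *bucket = ocfs2_xattr_bucket_new(inode);

	if (!bucket)
		return -ENOMEM;

	ret = ocfs2_read_xattr_bucket(bucket, blkno);
	if (!ret)
		ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
						OCFS2_JOURNAL_ACCESS_WRITE);
	if (!ret) {
		/* modify bucket_xh(bucket) / bucket_block(bucket, n) here */
		ocfs2_xattr_bucket_journal_dirty(handle, bucket);
	}

	ocfs2_xattr_bucket_free(bucket);
	return ret;
}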
157static inline const char *ocfs2_xattr_prefix(int name_index) 407static inline const char *ocfs2_xattr_prefix(int name_index)
158{ 408{
159 struct xattr_handler *handler = NULL; 409 struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
200 return; 450 return;
201} 451}
202 452
453static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
454{
455 int size = 0;
456
457 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
458 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
459 else
460 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
461 size += sizeof(struct ocfs2_xattr_entry);
462
463 return size;
464}
465
466int ocfs2_calc_security_init(struct inode *dir,
467 struct ocfs2_security_xattr_info *si,
468 int *want_clusters,
469 int *xattr_credits,
470 struct ocfs2_alloc_context **xattr_ac)
471{
472 int ret = 0;
473 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
474 int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
475 si->value_len);
476
477 /*
478 * The maximum inline space a security xattr can take is
479 * 256(name) + 80(value) + 16(entry) = 352 bytes,
480 * so reserving one metadata block for it is enough.
481 */
482 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
483 s_size > OCFS2_XATTR_FREE_IN_IBODY) {
484 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
485 if (ret) {
486 mlog_errno(ret);
487 return ret;
488 }
489 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
490 }
491
492 /* Reserve clusters for an xattr value that will be stored in the B-tree */
493 if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
494 int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
495 si->value_len);
496
497 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
498 new_clusters);
499 *want_clusters += new_clusters;
500 }
501 return ret;
502}
503
504int ocfs2_calc_xattr_init(struct inode *dir,
505 struct buffer_head *dir_bh,
506 int mode,
507 struct ocfs2_security_xattr_info *si,
508 int *want_clusters,
509 int *xattr_credits,
510 struct ocfs2_alloc_context **xattr_ac)
511{
512 int ret = 0;
513 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
514 int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
515
516 if (si->enable)
517 s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
518 si->value_len);
519
520 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
521 acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
522 OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
523 "", NULL, 0);
524 if (acl_len > 0) {
525 a_size = ocfs2_xattr_entry_real_size(0, acl_len);
526 if (S_ISDIR(mode))
527 a_size <<= 1;
528 } else if (acl_len != 0 && acl_len != -ENODATA) {
529 mlog_errno(acl_len);
530 return acl_len;
531 }
532 }
533
534 if (!(s_size + a_size))
535 return ret;
536
537 /*
538 * The maximum inline space a security xattr can take is
539 * 256(name) + 80(value) + 16(entry) = 352 bytes,
540 * and the maximum inline space an acl xattr can take is
541 * 80(value) + 16(entry) * 2(if directory) = 192 bytes.
542 * When blocksize = 512, we may need to reserve one more cluster
543 * for an xattr bucket; otherwise reserving one metadata block
544 * for them is enough.
545 */
546 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
547 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
548 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
549 if (ret) {
550 mlog_errno(ret);
551 return ret;
552 }
553 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
554 }
555
556 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
557 (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
558 *want_clusters += 1;
559 *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
560 }
561
562 /*
563 * Reserve credits and clusters for xattrs whose values are large
564 * and have to be stored outside the inode
565 */
566 if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
567 new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
568 si->value_len);
569 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
570 new_clusters);
571 *want_clusters += new_clusters;
572 }
573 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
574 acl_len > OCFS2_XATTR_INLINE_SIZE) {
575 /* a directory has two types of acls: DEFAULT and ACCESS */
576 new_clusters = (S_ISDIR(mode) ? 2 : 1) *
577 ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
578 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
579 new_clusters);
580 *want_clusters += new_clusters;
581 }
582
583 return ret;
584}
585
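Both ocfs2_calc_* helpers only accumulate reservations; the caller is expected to reserve the clusters and start a transaction with the accumulated credits before entering the xattr set paths. A sketch of the calling convention (caller-side variable names are hypothetical):

int ret, want_clusters = 0, xattr_credits = 0;
struct ocfs2_alloc_context *xattr_ac = NULL;

ret = ocfs2_calc_xattr_init(dir, dir_bh, mode, &si,
			    &want_clusters, &xattr_credits, &xattr_ac);
if (!ret) {
	/* Reserve 'want_clusters' of data and start a transaction that
	 * already includes 'xattr_credits', then call the set paths. */
}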
203static int ocfs2_xattr_extend_allocation(struct inode *inode, 586static int ocfs2_xattr_extend_allocation(struct inode *inode,
204 u32 clusters_to_add, 587 u32 clusters_to_add,
205 struct buffer_head *xattr_bh, 588 struct ocfs2_xattr_value_buf *vb,
206 struct ocfs2_xattr_value_root *xv) 589 struct ocfs2_xattr_set_ctxt *ctxt)
207{ 590{
208 int status = 0; 591 int status = 0;
209 int restart_func = 0; 592 handle_t *handle = ctxt->handle;
210 int credits = 0;
211 handle_t *handle = NULL;
212 struct ocfs2_alloc_context *data_ac = NULL;
213 struct ocfs2_alloc_context *meta_ac = NULL;
214 enum ocfs2_alloc_restarted why; 593 enum ocfs2_alloc_restarted why;
215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 594 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
216 u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); 595 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
217 struct ocfs2_extent_tree et; 596 struct ocfs2_extent_tree et;
218 597
219 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); 598 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
220 599
221 ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); 600 ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
222
223restart_all:
224
225 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
226 &data_ac, &meta_ac);
227 if (status) {
228 mlog_errno(status);
229 goto leave;
230 }
231
232 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
233 clusters_to_add);
234 handle = ocfs2_start_trans(osb, credits);
235 if (IS_ERR(handle)) {
236 status = PTR_ERR(handle);
237 handle = NULL;
238 mlog_errno(status);
239 goto leave;
240 }
241 601
242restarted_transaction: 602 status = vb->vb_access(handle, inode, vb->vb_bh,
243 status = ocfs2_journal_access(handle, inode, xattr_bh, 603 OCFS2_JOURNAL_ACCESS_WRITE);
244 OCFS2_JOURNAL_ACCESS_WRITE);
245 if (status < 0) { 604 if (status < 0) {
246 mlog_errno(status); 605 mlog_errno(status);
247 goto leave; 606 goto leave;
248 } 607 }
249 608
250 prev_clusters = le32_to_cpu(xv->xr_clusters); 609 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
251 status = ocfs2_add_clusters_in_btree(osb, 610 status = ocfs2_add_clusters_in_btree(osb,
252 inode, 611 inode,
253 &logical_start, 612 &logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
255 0, 614 0,
256 &et, 615 &et,
257 handle, 616 handle,
258 data_ac, 617 ctxt->data_ac,
259 meta_ac, 618 ctxt->meta_ac,
260 &why); 619 &why);
261 if ((status < 0) && (status != -EAGAIN)) { 620 if (status < 0) {
262 if (status != -ENOSPC) 621 mlog_errno(status);
263 mlog_errno(status);
264 goto leave; 622 goto leave;
265 } 623 }
266 624
267 status = ocfs2_journal_dirty(handle, xattr_bh); 625 status = ocfs2_journal_dirty(handle, vb->vb_bh);
268 if (status < 0) { 626 if (status < 0) {
269 mlog_errno(status); 627 mlog_errno(status);
270 goto leave; 628 goto leave;
271 } 629 }
272 630
273 clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; 631 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
274 632
275 if (why != RESTART_NONE && clusters_to_add) { 633 /*
276 if (why == RESTART_META) { 634 * We should have already allocated enough space before the transaction,
277 mlog(0, "restarting function.\n"); 635 * so no need to restart.
278 restart_func = 1; 636 */
279 } else { 637 BUG_ON(why != RESTART_NONE || clusters_to_add);
280 BUG_ON(why != RESTART_TRANS);
281
282 mlog(0, "restarting transaction.\n");
283 /* TODO: This can be more intelligent. */
284 credits = ocfs2_calc_extend_credits(osb->sb,
285 et.et_root_el,
286 clusters_to_add);
287 status = ocfs2_extend_trans(handle, credits);
288 if (status < 0) {
289 /* handle still has to be committed at
290 * this point. */
291 status = -ENOMEM;
292 mlog_errno(status);
293 goto leave;
294 }
295 goto restarted_transaction;
296 }
297 }
298 638
299leave: 639leave:
300 if (handle) {
301 ocfs2_commit_trans(osb, handle);
302 handle = NULL;
303 }
304 if (data_ac) {
305 ocfs2_free_alloc_context(data_ac);
306 data_ac = NULL;
307 }
308 if (meta_ac) {
309 ocfs2_free_alloc_context(meta_ac);
310 meta_ac = NULL;
311 }
312 if ((!status) && restart_func) {
313 restart_func = 0;
314 goto restart_all;
315 }
316 640
317 return status; 641 return status;
318} 642}
319 643
320static int __ocfs2_remove_xattr_range(struct inode *inode, 644static int __ocfs2_remove_xattr_range(struct inode *inode,
321 struct buffer_head *root_bh, 645 struct ocfs2_xattr_value_buf *vb,
322 struct ocfs2_xattr_value_root *xv,
323 u32 cpos, u32 phys_cpos, u32 len, 646 u32 cpos, u32 phys_cpos, u32 len,
324 struct ocfs2_cached_dealloc_ctxt *dealloc) 647 struct ocfs2_xattr_set_ctxt *ctxt)
325{ 648{
326 int ret; 649 int ret;
327 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
328 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 651 handle_t *handle = ctxt->handle;
329 struct inode *tl_inode = osb->osb_tl_inode;
330 handle_t *handle;
331 struct ocfs2_alloc_context *meta_ac = NULL;
332 struct ocfs2_extent_tree et; 652 struct ocfs2_extent_tree et;
333 653
334 ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); 654 ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
335 655
336 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); 656 ret = vb->vb_access(handle, inode, vb->vb_bh,
657 OCFS2_JOURNAL_ACCESS_WRITE);
337 if (ret) { 658 if (ret) {
338 mlog_errno(ret); 659 mlog_errno(ret);
339 return ret;
340 }
341
342 mutex_lock(&tl_inode->i_mutex);
343
344 if (ocfs2_truncate_log_needs_flush(osb)) {
345 ret = __ocfs2_flush_truncate_log(osb);
346 if (ret < 0) {
347 mlog_errno(ret);
348 goto out;
349 }
350 }
351
352 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
353 if (IS_ERR(handle)) {
354 ret = PTR_ERR(handle);
355 mlog_errno(ret);
356 goto out; 660 goto out;
357 } 661 }
358 662
359 ret = ocfs2_journal_access(handle, inode, root_bh, 663 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
360 OCFS2_JOURNAL_ACCESS_WRITE); 664 &ctxt->dealloc);
361 if (ret) {
362 mlog_errno(ret);
363 goto out_commit;
364 }
365
366 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
367 dealloc);
368 if (ret) { 665 if (ret) {
369 mlog_errno(ret); 666 mlog_errno(ret);
370 goto out_commit; 667 goto out;
371 } 668 }
372 669
373 le32_add_cpu(&xv->xr_clusters, -len); 670 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
374 671
375 ret = ocfs2_journal_dirty(handle, root_bh); 672 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
376 if (ret) { 673 if (ret) {
377 mlog_errno(ret); 674 mlog_errno(ret);
378 goto out_commit; 675 goto out;
379 } 676 }
380 677
381 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 678 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
382 if (ret) 679 if (ret)
383 mlog_errno(ret); 680 mlog_errno(ret);
384 681
385out_commit:
386 ocfs2_commit_trans(osb, handle);
387out: 682out:
388 mutex_unlock(&tl_inode->i_mutex);
389
390 if (meta_ac)
391 ocfs2_free_alloc_context(meta_ac);
392
393 return ret; 683 return ret;
394} 684}
395 685
396static int ocfs2_xattr_shrink_size(struct inode *inode, 686static int ocfs2_xattr_shrink_size(struct inode *inode,
397 u32 old_clusters, 687 u32 old_clusters,
398 u32 new_clusters, 688 u32 new_clusters,
399 struct buffer_head *root_bh, 689 struct ocfs2_xattr_value_buf *vb,
400 struct ocfs2_xattr_value_root *xv) 690 struct ocfs2_xattr_set_ctxt *ctxt)
401{ 691{
402 int ret = 0; 692 int ret = 0;
403 u32 trunc_len, cpos, phys_cpos, alloc_size; 693 u32 trunc_len, cpos, phys_cpos, alloc_size;
404 u64 block; 694 u64 block;
405 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
406 struct ocfs2_cached_dealloc_ctxt dealloc;
407
408 ocfs2_init_dealloc_ctxt(&dealloc);
409 695
410 if (old_clusters <= new_clusters) 696 if (old_clusters <= new_clusters)
411 return 0; 697 return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
414 trunc_len = old_clusters - new_clusters; 700 trunc_len = old_clusters - new_clusters;
415 while (trunc_len) { 701 while (trunc_len) {
416 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, 702 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
417 &alloc_size, &xv->xr_list); 703 &alloc_size,
704 &vb->vb_xv->xr_list);
418 if (ret) { 705 if (ret) {
419 mlog_errno(ret); 706 mlog_errno(ret);
420 goto out; 707 goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
423 if (alloc_size > trunc_len) 710 if (alloc_size > trunc_len)
424 alloc_size = trunc_len; 711 alloc_size = trunc_len;
425 712
426 ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, 713 ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
427 phys_cpos, alloc_size, 714 phys_cpos, alloc_size,
428 &dealloc); 715 ctxt);
429 if (ret) { 716 if (ret) {
430 mlog_errno(ret); 717 mlog_errno(ret);
431 goto out; 718 goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
439 } 726 }
440 727
441out: 728out:
442 ocfs2_schedule_truncate_log_flush(osb, 1);
443 ocfs2_run_deallocs(osb, &dealloc);
444
445 return ret; 729 return ret;
446} 730}
447 731
448static int ocfs2_xattr_value_truncate(struct inode *inode, 732static int ocfs2_xattr_value_truncate(struct inode *inode,
449 struct buffer_head *root_bh, 733 struct ocfs2_xattr_value_buf *vb,
450 struct ocfs2_xattr_value_root *xv, 734 int len,
451 int len) 735 struct ocfs2_xattr_set_ctxt *ctxt)
452{ 736{
453 int ret; 737 int ret;
454 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); 738 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
455 u32 old_clusters = le32_to_cpu(xv->xr_clusters); 739 u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
456 740
457 if (new_clusters == old_clusters) 741 if (new_clusters == old_clusters)
458 return 0; 742 return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
460 if (new_clusters > old_clusters) 744 if (new_clusters > old_clusters)
461 ret = ocfs2_xattr_extend_allocation(inode, 745 ret = ocfs2_xattr_extend_allocation(inode,
462 new_clusters - old_clusters, 746 new_clusters - old_clusters,
463 root_bh, xv); 747 vb, ctxt);
464 else 748 else
465 ret = ocfs2_xattr_shrink_size(inode, 749 ret = ocfs2_xattr_shrink_size(inode,
466 old_clusters, new_clusters, 750 old_clusters, new_clusters,
467 root_bh, xv); 751 vb, ctxt);
468 752
469 return ret; 753 return ret;
470} 754}
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
554 if (!di->i_xattr_loc) 838 if (!di->i_xattr_loc)
555 return ret; 839 return ret;
556 840
557 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); 841 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
842 &blk_bh);
558 if (ret < 0) { 843 if (ret < 0) {
559 mlog_errno(ret); 844 mlog_errno(ret);
560 return ret; 845 return ret;
561 } 846 }
562 847
563 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 848 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
564 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
565 ret = -EIO;
566 goto cleanup;
567 }
568
569 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 849 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
570 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; 850 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
571 ret = ocfs2_xattr_list_entries(inode, header, 851 ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
575 ret = ocfs2_xattr_tree_list_index_block(inode, xt, 855 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
576 buffer, buffer_size); 856 buffer, buffer_size);
577 } 857 }
578cleanup: 858
579 brelse(blk_bh); 859 brelse(blk_bh);
580 860
581 return ret; 861 return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
685 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 965 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
686 /* Copy ocfs2_xattr_value */ 966 /* Copy ocfs2_xattr_value */
687 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 967 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
688 ret = ocfs2_read_block(inode, blkno, &bh); 968 ret = ocfs2_read_block(inode, blkno, &bh, NULL);
689 if (ret) { 969 if (ret) {
690 mlog_errno(ret); 970 mlog_errno(ret);
691 goto out; 971 goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
769 size_t size; 1049 size_t size;
770 int ret = -ENODATA, name_offset, name_len, block_off, i; 1050 int ret = -ENODATA, name_offset, name_len, block_off, i;
771 1051
772 memset(&xs->bucket, 0, sizeof(xs->bucket)); 1052 xs->bucket = ocfs2_xattr_bucket_new(inode);
1053 if (!xs->bucket) {
1054 ret = -ENOMEM;
1055 mlog_errno(ret);
1056 goto cleanup;
1057 }
773 1058
774 ret = ocfs2_xattr_block_find(inode, name_index, name, xs); 1059 ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
775 if (ret) { 1060 if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
795 1080
796 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 1081 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
797 ret = ocfs2_xattr_bucket_get_name_value(inode, 1082 ret = ocfs2_xattr_bucket_get_name_value(inode,
798 xs->bucket.xh, 1083 bucket_xh(xs->bucket),
799 i, 1084 i,
800 &block_off, 1085 &block_off,
801 &name_offset); 1086 &name_offset);
802 xs->base = xs->bucket.bhs[block_off]->b_data; 1087 xs->base = bucket_block(xs->bucket, block_off);
803 } 1088 }
804 if (ocfs2_xattr_is_local(xs->here)) { 1089 if (ocfs2_xattr_is_local(xs->here)) {
805 memcpy(buffer, (void *)xs->base + 1090 memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
817 } 1102 }
818 ret = size; 1103 ret = size;
819cleanup: 1104cleanup:
820 for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) 1105 ocfs2_xattr_bucket_free(xs->bucket);
821 brelse(xs->bucket.bhs[i]);
822 memset(&xs->bucket, 0, sizeof(xs->bucket));
823 1106
824 brelse(xs->xattr_bh); 1107 brelse(xs->xattr_bh);
825 xs->xattr_bh = NULL; 1108 xs->xattr_bh = NULL;
826 return ret; 1109 return ret;
827} 1110}
828 1111
829/* ocfs2_xattr_get() 1112int ocfs2_xattr_get_nolock(struct inode *inode,
830 * 1113 struct buffer_head *di_bh,
831 * Copy an extended attribute into the buffer provided.
832 * Buffer is NULL to compute the size of buffer required.
833 */
834static int ocfs2_xattr_get(struct inode *inode,
835 int name_index, 1114 int name_index,
836 const char *name, 1115 const char *name,
837 void *buffer, 1116 void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
839{ 1118{
840 int ret; 1119 int ret;
841 struct ocfs2_dinode *di = NULL; 1120 struct ocfs2_dinode *di = NULL;
842 struct buffer_head *di_bh = NULL;
843 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1121 struct ocfs2_inode_info *oi = OCFS2_I(inode);
844 struct ocfs2_xattr_search xis = { 1122 struct ocfs2_xattr_search xis = {
845 .not_found = -ENODATA, 1123 .not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
854 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) 1132 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
855 ret = -ENODATA; 1133 ret = -ENODATA;
856 1134
857 ret = ocfs2_inode_lock(inode, &di_bh, 0);
858 if (ret < 0) {
859 mlog_errno(ret);
860 return ret;
861 }
862 xis.inode_bh = xbs.inode_bh = di_bh; 1135 xis.inode_bh = xbs.inode_bh = di_bh;
863 di = (struct ocfs2_dinode *)di_bh->b_data; 1136 di = (struct ocfs2_dinode *)di_bh->b_data;
864 1137
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
869 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, 1142 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
870 buffer_size, &xbs); 1143 buffer_size, &xbs);
871 up_read(&oi->ip_xattr_sem); 1144 up_read(&oi->ip_xattr_sem);
1145
1146 return ret;
1147}
1148
1149/* ocfs2_xattr_get()
1150 *
1151 * Copy an extended attribute into the buffer provided.
1152 * Buffer is NULL to compute the size of buffer required.
1153 */
1154static int ocfs2_xattr_get(struct inode *inode,
1155 int name_index,
1156 const char *name,
1157 void *buffer,
1158 size_t buffer_size)
1159{
1160 int ret;
1161 struct buffer_head *di_bh = NULL;
1162
1163 ret = ocfs2_inode_lock(inode, &di_bh, 0);
1164 if (ret < 0) {
1165 mlog_errno(ret);
1166 return ret;
1167 }
1168 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
1169 name, buffer, buffer_size);
1170
872 ocfs2_inode_unlock(inode, 0); 1171 ocfs2_inode_unlock(inode, 0);
873 1172
874 brelse(di_bh); 1173 brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
877} 1176}
878 1177
879static int __ocfs2_xattr_set_value_outside(struct inode *inode, 1178static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1179 handle_t *handle,
880 struct ocfs2_xattr_value_root *xv, 1180 struct ocfs2_xattr_value_root *xv,
881 const void *value, 1181 const void *value,
882 int value_len) 1182 int value_len)
883{ 1183{
884 int ret = 0, i, cp_len, credits; 1184 int ret = 0, i, cp_len;
885 u16 blocksize = inode->i_sb->s_blocksize; 1185 u16 blocksize = inode->i_sb->s_blocksize;
886 u32 p_cluster, num_clusters; 1186 u32 p_cluster, num_clusters;
887 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 1187 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
888 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); 1188 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
889 u64 blkno; 1189 u64 blkno;
890 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
891 handle_t *handle;
892 1191
893 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); 1192 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
894 1193
895 credits = clusters * bpc;
896 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
897 if (IS_ERR(handle)) {
898 ret = PTR_ERR(handle);
899 mlog_errno(ret);
900 goto out;
901 }
902
903 while (cpos < clusters) { 1194 while (cpos < clusters) {
904 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1195 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
905 &num_clusters, &xv->xr_list); 1196 &num_clusters, &xv->xr_list);
906 if (ret) { 1197 if (ret) {
907 mlog_errno(ret); 1198 mlog_errno(ret);
908 goto out_commit; 1199 goto out;
909 } 1200 }
910 1201
911 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1202 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
912 1203
913 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1204 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
914 ret = ocfs2_read_block(inode, blkno, &bh); 1205 ret = ocfs2_read_block(inode, blkno, &bh, NULL);
915 if (ret) { 1206 if (ret) {
916 mlog_errno(ret); 1207 mlog_errno(ret);
917 goto out_commit; 1208 goto out;
918 } 1209 }
919 1210
920 ret = ocfs2_journal_access(handle, 1211 ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
923 OCFS2_JOURNAL_ACCESS_WRITE); 1214 OCFS2_JOURNAL_ACCESS_WRITE);
924 if (ret < 0) { 1215 if (ret < 0) {
925 mlog_errno(ret); 1216 mlog_errno(ret);
926 goto out_commit; 1217 goto out;
927 } 1218 }
928 1219
929 cp_len = value_len > blocksize ? blocksize : value_len; 1220 cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
937 ret = ocfs2_journal_dirty(handle, bh); 1228 ret = ocfs2_journal_dirty(handle, bh);
938 if (ret < 0) { 1229 if (ret < 0) {
939 mlog_errno(ret); 1230 mlog_errno(ret);
940 goto out_commit; 1231 goto out;
941 } 1232 }
942 brelse(bh); 1233 brelse(bh);
943 bh = NULL; 1234 bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
951 } 1242 }
952 cpos += num_clusters; 1243 cpos += num_clusters;
953 } 1244 }
954out_commit:
955 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
956out: 1245out:
957 brelse(bh); 1246 brelse(bh);
958 1247
@@ -960,28 +1249,22 @@ out:
960} 1249}
961 1250
962static int ocfs2_xattr_cleanup(struct inode *inode, 1251static int ocfs2_xattr_cleanup(struct inode *inode,
1252 handle_t *handle,
963 struct ocfs2_xattr_info *xi, 1253 struct ocfs2_xattr_info *xi,
964 struct ocfs2_xattr_search *xs, 1254 struct ocfs2_xattr_search *xs,
1255 struct ocfs2_xattr_value_buf *vb,
965 size_t offs) 1256 size_t offs)
966{ 1257{
967 handle_t *handle = NULL;
968 int ret = 0; 1258 int ret = 0;
969 size_t name_len = strlen(xi->name); 1259 size_t name_len = strlen(xi->name);
970 void *val = xs->base + offs; 1260 void *val = xs->base + offs;
971 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 1261 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
972 1262
973 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1263 ret = vb->vb_access(handle, inode, vb->vb_bh,
974 OCFS2_XATTR_BLOCK_UPDATE_CREDITS); 1264 OCFS2_JOURNAL_ACCESS_WRITE);
975 if (IS_ERR(handle)) {
976 ret = PTR_ERR(handle);
977 mlog_errno(ret);
978 goto out;
979 }
980 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
981 OCFS2_JOURNAL_ACCESS_WRITE);
982 if (ret) { 1265 if (ret) {
983 mlog_errno(ret); 1266 mlog_errno(ret);
984 goto out_commit; 1267 goto out;
985 } 1268 }
986 /* Decrease xattr count */ 1269 /* Decrease xattr count */
987 le16_add_cpu(&xs->header->xh_count, -1); 1270 le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
989 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); 1272 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
990 memset(val, 0, size); 1273 memset(val, 0, size);
991 1274
992 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1275 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
993 if (ret < 0) 1276 if (ret < 0)
994 mlog_errno(ret); 1277 mlog_errno(ret);
995out_commit:
996 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
997out: 1278out:
998 return ret; 1279 return ret;
999} 1280}
1000 1281
1001static int ocfs2_xattr_update_entry(struct inode *inode, 1282static int ocfs2_xattr_update_entry(struct inode *inode,
1283 handle_t *handle,
1002 struct ocfs2_xattr_info *xi, 1284 struct ocfs2_xattr_info *xi,
1003 struct ocfs2_xattr_search *xs, 1285 struct ocfs2_xattr_search *xs,
1286 struct ocfs2_xattr_value_buf *vb,
1004 size_t offs) 1287 size_t offs)
1005{ 1288{
1006 handle_t *handle = NULL; 1289 int ret;
1007 int ret = 0;
1008 1290
1009 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1291 ret = vb->vb_access(handle, inode, vb->vb_bh,
1010 OCFS2_XATTR_BLOCK_UPDATE_CREDITS); 1292 OCFS2_JOURNAL_ACCESS_WRITE);
1011 if (IS_ERR(handle)) {
1012 ret = PTR_ERR(handle);
1013 mlog_errno(ret);
1014 goto out;
1015 }
1016 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1017 OCFS2_JOURNAL_ACCESS_WRITE);
1018 if (ret) { 1293 if (ret) {
1019 mlog_errno(ret); 1294 mlog_errno(ret);
1020 goto out_commit; 1295 goto out;
1021 } 1296 }
1022 1297
1023 xs->here->xe_name_offset = cpu_to_le16(offs); 1298 xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
1028 ocfs2_xattr_set_local(xs->here, 0); 1303 ocfs2_xattr_set_local(xs->here, 0);
1029 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1304 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1030 1305
1031 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1306 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
1032 if (ret < 0) 1307 if (ret < 0)
1033 mlog_errno(ret); 1308 mlog_errno(ret);
1034out_commit:
1035 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1036out: 1309out:
1037 return ret; 1310 return ret;
1038} 1311}
@@ -1045,6 +1318,8 @@ out:
1045static int ocfs2_xattr_set_value_outside(struct inode *inode, 1318static int ocfs2_xattr_set_value_outside(struct inode *inode,
1046 struct ocfs2_xattr_info *xi, 1319 struct ocfs2_xattr_info *xi,
1047 struct ocfs2_xattr_search *xs, 1320 struct ocfs2_xattr_search *xs,
1321 struct ocfs2_xattr_set_ctxt *ctxt,
1322 struct ocfs2_xattr_value_buf *vb,
1048 size_t offs) 1323 size_t offs)
1049{ 1324{
1050 size_t name_len = strlen(xi->name); 1325 size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
1062 xv->xr_list.l_tree_depth = 0; 1337 xv->xr_list.l_tree_depth = 0;
1063 xv->xr_list.l_count = cpu_to_le16(1); 1338 xv->xr_list.l_count = cpu_to_le16(1);
1064 xv->xr_list.l_next_free_rec = 0; 1339 xv->xr_list.l_next_free_rec = 0;
1340 vb->vb_xv = xv;
1065 1341
1066 ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, 1342 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
1067 xi->value_len);
1068 if (ret < 0) { 1343 if (ret < 0) {
1069 mlog_errno(ret); 1344 mlog_errno(ret);
1070 return ret; 1345 return ret;
1071 } 1346 }
1072 ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, 1347 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
1073 xi->value_len);
1074 if (ret < 0) { 1348 if (ret < 0) {
1075 mlog_errno(ret); 1349 mlog_errno(ret);
1076 return ret; 1350 return ret;
1077 } 1351 }
1078 ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); 1352 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
1353 xi->value, xi->value_len);
1079 if (ret < 0) 1354 if (ret < 0)
1080 mlog_errno(ret); 1355 mlog_errno(ret);
1081 1356
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
1195static int ocfs2_xattr_set_entry(struct inode *inode, 1470static int ocfs2_xattr_set_entry(struct inode *inode,
1196 struct ocfs2_xattr_info *xi, 1471 struct ocfs2_xattr_info *xi,
1197 struct ocfs2_xattr_search *xs, 1472 struct ocfs2_xattr_search *xs,
1473 struct ocfs2_xattr_set_ctxt *ctxt,
1198 int flag) 1474 int flag)
1199{ 1475{
1200 struct ocfs2_xattr_entry *last; 1476 struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1202 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1478 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1203 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1479 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
1204 size_t size_l = 0; 1480 size_t size_l = 0;
1205 handle_t *handle = NULL; 1481 handle_t *handle = ctxt->handle;
1206 int free, i, ret; 1482 int free, i, ret;
1207 struct ocfs2_xattr_info xi_l = { 1483 struct ocfs2_xattr_info xi_l = {
1208 .name_index = xi->name_index, 1484 .name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1210 .value = xi->value, 1486 .value = xi->value,
1211 .value_len = xi->value_len, 1487 .value_len = xi->value_len,
1212 }; 1488 };
1489 struct ocfs2_xattr_value_buf vb = {
1490 .vb_bh = xs->xattr_bh,
1491 .vb_access = ocfs2_journal_access_di,
1492 };
1493
1494 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1495 BUG_ON(xs->xattr_bh == xs->inode_bh);
1496 vb.vb_access = ocfs2_journal_access_xb;
1497 } else
1498 BUG_ON(xs->xattr_bh != xs->inode_bh);
1213 1499
1214 /* Compute min_offs, last and free space. */ 1500 /* Compute min_offs, last and free space. */
1215 last = xs->header->xh_entries; 1501 last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1265 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 1551 if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
1266 /* Replace existing local xattr with tree root */ 1552 /* Replace existing local xattr with tree root */
1267 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 1553 ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
1268 offs); 1554 ctxt, &vb, offs);
1269 if (ret < 0) 1555 if (ret < 0)
1270 mlog_errno(ret); 1556 mlog_errno(ret);
1271 goto out; 1557 goto out;
1272 } else if (!ocfs2_xattr_is_local(xs->here)) { 1558 } else if (!ocfs2_xattr_is_local(xs->here)) {
1273 /* For existing xattr which has value outside */ 1559 /* For existing xattr which has value outside */
1274 struct ocfs2_xattr_value_root *xv = NULL; 1560 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1275 xv = (struct ocfs2_xattr_value_root *)(val + 1561 (val + OCFS2_XATTR_SIZE(name_len));
1276 OCFS2_XATTR_SIZE(name_len));
1277 1562
1278 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 1563 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1279 /* 1564 /*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1282 * then set new value with set_value_outside(). 1567 * then set new value with set_value_outside().
1283 */ 1568 */
1284 ret = ocfs2_xattr_value_truncate(inode, 1569 ret = ocfs2_xattr_value_truncate(inode,
1285 xs->xattr_bh, 1570 &vb,
1286 xv, 1571 xi->value_len,
1287 xi->value_len); 1572 ctxt);
1288 if (ret < 0) { 1573 if (ret < 0) {
1289 mlog_errno(ret); 1574 mlog_errno(ret);
1290 goto out; 1575 goto out;
1291 } 1576 }
1292 1577
1293 ret = __ocfs2_xattr_set_value_outside(inode, 1578 ret = ocfs2_xattr_update_entry(inode,
1294 xv, 1579 handle,
1295 xi->value, 1580 xi,
1296 xi->value_len); 1581 xs,
1582 &vb,
1583 offs);
1297 if (ret < 0) { 1584 if (ret < 0) {
1298 mlog_errno(ret); 1585 mlog_errno(ret);
1299 goto out; 1586 goto out;
1300 } 1587 }
1301 1588
1302 ret = ocfs2_xattr_update_entry(inode, 1589 ret = __ocfs2_xattr_set_value_outside(inode,
1303 xi, 1590 handle,
1304 xs, 1591 vb.vb_xv,
1305 offs); 1592 xi->value,
1593 xi->value_len);
1306 if (ret < 0) 1594 if (ret < 0)
1307 mlog_errno(ret); 1595 mlog_errno(ret);
1308 goto out; 1596 goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1312 * just truncate the old value to zero. 1600 * just truncate the old value to zero.
1313 */ 1601 */
1314 ret = ocfs2_xattr_value_truncate(inode, 1602 ret = ocfs2_xattr_value_truncate(inode,
1315 xs->xattr_bh, 1603 &vb,
1316 xv, 1604 0,
1317 0); 1605 ctxt);
1318 if (ret < 0) 1606 if (ret < 0)
1319 mlog_errno(ret); 1607 mlog_errno(ret);
1320 } 1608 }
1321 } 1609 }
1322 } 1610 }
1323 1611
1324 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1612 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
1325 OCFS2_INODE_UPDATE_CREDITS); 1613 OCFS2_JOURNAL_ACCESS_WRITE);
1326 if (IS_ERR(handle)) {
1327 ret = PTR_ERR(handle);
1328 mlog_errno(ret);
1329 goto out;
1330 }
1331
1332 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1333 OCFS2_JOURNAL_ACCESS_WRITE);
1334 if (ret) { 1614 if (ret) {
1335 mlog_errno(ret); 1615 mlog_errno(ret);
1336 goto out_commit; 1616 goto out;
1337 } 1617 }
1338 1618
1339 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 1619 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1340 /* set extended attribute in external block. */ 1620 ret = vb.vb_access(handle, inode, vb.vb_bh,
1341 ret = ocfs2_extend_trans(handle, 1621 OCFS2_JOURNAL_ACCESS_WRITE);
1342 OCFS2_INODE_UPDATE_CREDITS +
1343 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1344 if (ret) {
1345 mlog_errno(ret);
1346 goto out_commit;
1347 }
1348 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1349 OCFS2_JOURNAL_ACCESS_WRITE);
1350 if (ret) { 1622 if (ret) {
1351 mlog_errno(ret); 1623 mlog_errno(ret);
1352 goto out_commit; 1624 goto out;
1353 } 1625 }
1354 } 1626 }
1355 1627
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1363 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1635 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1364 if (ret < 0) { 1636 if (ret < 0) {
1365 mlog_errno(ret); 1637 mlog_errno(ret);
1366 goto out_commit; 1638 goto out;
1367 } 1639 }
1368 } 1640 }
1369 1641
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1391 oi->ip_dyn_features |= flag; 1663 oi->ip_dyn_features |= flag;
1392 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 1664 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1393 spin_unlock(&oi->ip_lock); 1665 spin_unlock(&oi->ip_lock);
1394 /* Update inode ctime */
1395 inode->i_ctime = CURRENT_TIME;
1396 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1397 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1398 1666
1399 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 1667 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1400 if (ret < 0) 1668 if (ret < 0)
1401 mlog_errno(ret); 1669 mlog_errno(ret);
1402 1670
1403out_commit:
1404 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1405
1406 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 1671 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1407 /* 1672 /*
1408 * Set value outside in B tree. 1673 * Set value outside in B tree.
1409 * This is the second step for value size > INLINE_SIZE. 1674 * This is the second step for value size > INLINE_SIZE.
1410 */ 1675 */
1411 size_t offs = le16_to_cpu(xs->here->xe_name_offset); 1676 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1412 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); 1677 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1678 &vb, offs);
1413 if (ret < 0) { 1679 if (ret < 0) {
1414 int ret2; 1680 int ret2;
1415 1681
@@ -1418,41 +1684,56 @@ out_commit:
1418 * If setting the value outside failed, we have to clean 1684 * If setting the value outside failed, we have to clean
1419 * up the junk tree root we have already set locally. 1685 * up the junk tree root we have already set locally.
1420 */ 1686 */
1421 ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); 1687 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1688 xi, xs, &vb, offs);
1422 if (ret2 < 0) 1689 if (ret2 < 0)
1423 mlog_errno(ret2); 1690 mlog_errno(ret2);
1424 } 1691 }
1425 } 1692 }
1426out: 1693out:
1427 return ret; 1694 return ret;
1428
1429} 1695}
1430 1696
1431static int ocfs2_remove_value_outside(struct inode *inode, 1697static int ocfs2_remove_value_outside(struct inode *inode,
1432 struct buffer_head *bh, 1698 struct ocfs2_xattr_value_buf *vb,
1433 struct ocfs2_xattr_header *header) 1699 struct ocfs2_xattr_header *header)
1434{ 1700{
1435 int ret = 0, i; 1701 int ret = 0, i;
1702 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1703 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1704
1705 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
1706
1707 ctxt.handle = ocfs2_start_trans(osb,
1708 ocfs2_remove_extent_credits(osb->sb));
1709 if (IS_ERR(ctxt.handle)) {
1710 ret = PTR_ERR(ctxt.handle);
1711 mlog_errno(ret);
1712 goto out;
1713 }
1436 1714
1437 for (i = 0; i < le16_to_cpu(header->xh_count); i++) { 1715 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1438 struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; 1716 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1439 1717
1440 if (!ocfs2_xattr_is_local(entry)) { 1718 if (!ocfs2_xattr_is_local(entry)) {
1441 struct ocfs2_xattr_value_root *xv;
1442 void *val; 1719 void *val;
1443 1720
1444 val = (void *)header + 1721 val = (void *)header +
1445 le16_to_cpu(entry->xe_name_offset); 1722 le16_to_cpu(entry->xe_name_offset);
1446 xv = (struct ocfs2_xattr_value_root *) 1723 vb->vb_xv = (struct ocfs2_xattr_value_root *)
1447 (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); 1724 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1448 ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); 1725 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
1449 if (ret < 0) { 1726 if (ret < 0) {
1450 mlog_errno(ret); 1727 mlog_errno(ret);
1451 return ret; 1728 break;
1452 } 1729 }
1453 } 1730 }
1454 } 1731 }
1455 1732
1733 ocfs2_commit_trans(osb, ctxt.handle);
1734 ocfs2_schedule_truncate_log_flush(osb, 1);
1735 ocfs2_run_deallocs(osb, &ctxt.dealloc);
1736out:
1456 return ret; 1737 return ret;
1457} 1738}
1458 1739
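
ocfs2_remove_value_outside() above is the smallest complete user of the new set context, and its shape recurs through the rest of the patch: reserve, run one transaction, commit, then perform the deferred deallocations outside the handle. Isolated as a sketch (error paths trimmed; do_xattr_work is a hypothetical stand-in for the per-entry truncates):

	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };

	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
	ctxt.handle = ocfs2_start_trans(osb, credits);	/* one handle for the whole job */
	if (!IS_ERR(ctxt.handle)) {
		ret = do_xattr_work(inode, &ctxt);	/* hypothetical worker */
		ocfs2_commit_trans(osb, ctxt.handle);
		ocfs2_schedule_truncate_log_flush(osb, 1);
		ocfs2_run_deallocs(osb, &ctxt.dealloc);	/* deallocs happen outside the handle */
	}
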
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
1463 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1744 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1464 struct ocfs2_xattr_header *header; 1745 struct ocfs2_xattr_header *header;
1465 int ret; 1746 int ret;
1747 struct ocfs2_xattr_value_buf vb = {
1748 .vb_bh = di_bh,
1749 .vb_access = ocfs2_journal_access_di,
1750 };
1466 1751
1467 header = (struct ocfs2_xattr_header *) 1752 header = (struct ocfs2_xattr_header *)
1468 ((void *)di + inode->i_sb->s_blocksize - 1753 ((void *)di + inode->i_sb->s_blocksize -
1469 le16_to_cpu(di->i_xattr_inline_size)); 1754 le16_to_cpu(di->i_xattr_inline_size));
1470 1755
1471 ret = ocfs2_remove_value_outside(inode, di_bh, header); 1756 ret = ocfs2_remove_value_outside(inode, &vb, header);
1472 1757
1473 return ret; 1758 return ret;
1474} 1759}
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
1478{ 1763{
1479 struct ocfs2_xattr_block *xb; 1764 struct ocfs2_xattr_block *xb;
1480 int ret = 0; 1765 int ret = 0;
1766 struct ocfs2_xattr_value_buf vb = {
1767 .vb_bh = blk_bh,
1768 .vb_access = ocfs2_journal_access_xb,
1769 };
1481 1770
1482 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 1771 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1483 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 1772 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1484 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); 1773 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1485 ret = ocfs2_remove_value_outside(inode, blk_bh, header); 1774 ret = ocfs2_remove_value_outside(inode, &vb, header);
1486 } else 1775 } else
1487 ret = ocfs2_delete_xattr_index_block(inode, blk_bh); 1776 ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
1488 1777
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
1502 u64 blk, bg_blkno; 1791 u64 blk, bg_blkno;
1503 u16 bit; 1792 u16 bit;
1504 1793
1505 ret = ocfs2_read_block(inode, block, &blk_bh); 1794 ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
1506 if (ret < 0) { 1795 if (ret < 0) {
1507 mlog_errno(ret); 1796 mlog_errno(ret);
1508 goto out; 1797 goto out;
1509 } 1798 }
1510 1799
1511 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1512 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
1513 ret = -EIO;
1514 goto out;
1515 }
1516
1517 ret = ocfs2_xattr_block_remove(inode, blk_bh); 1800 ret = ocfs2_xattr_block_remove(inode, blk_bh);
1518 if (ret < 0) { 1801 if (ret < 0) {
1519 mlog_errno(ret); 1802 mlog_errno(ret);
1520 goto out; 1803 goto out;
1521 } 1804 }
1522 1805
1806 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1523 blk = le64_to_cpu(xb->xb_blkno); 1807 blk = le64_to_cpu(xb->xb_blkno);
1524 bit = le16_to_cpu(xb->xb_suballoc_bit); 1808 bit = le16_to_cpu(xb->xb_suballoc_bit);
1525 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 1809 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1606 mlog_errno(ret); 1890 mlog_errno(ret);
1607 goto out; 1891 goto out;
1608 } 1892 }
1609 ret = ocfs2_journal_access(handle, inode, di_bh, 1893 ret = ocfs2_journal_access_di(handle, inode, di_bh,
1610 OCFS2_JOURNAL_ACCESS_WRITE); 1894 OCFS2_JOURNAL_ACCESS_WRITE);
1611 if (ret) { 1895 if (ret) {
1612 mlog_errno(ret); 1896 mlog_errno(ret);
1613 goto out_commit; 1897 goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
1714 */ 1998 */
1715static int ocfs2_xattr_ibody_set(struct inode *inode, 1999static int ocfs2_xattr_ibody_set(struct inode *inode,
1716 struct ocfs2_xattr_info *xi, 2000 struct ocfs2_xattr_info *xi,
1717 struct ocfs2_xattr_search *xs) 2001 struct ocfs2_xattr_search *xs,
2002 struct ocfs2_xattr_set_ctxt *ctxt)
1718{ 2003{
1719 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2004 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1720 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2005 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
1731 } 2016 }
1732 } 2017 }
1733 2018
1734 ret = ocfs2_xattr_set_entry(inode, xi, xs, 2019 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
1735 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2020 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
1736out: 2021out:
1737 up_write(&oi->ip_alloc_sem); 2022 up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
1758 if (!di->i_xattr_loc) 2043 if (!di->i_xattr_loc)
1759 return ret; 2044 return ret;
1760 2045
1761 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); 2046 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
2047 &blk_bh);
1762 if (ret < 0) { 2048 if (ret < 0) {
1763 mlog_errno(ret); 2049 mlog_errno(ret);
1764 return ret; 2050 return ret;
1765 } 2051 }
1766 2052
1767 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1768 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
1769 ret = -EIO;
1770 goto cleanup;
1771 }
1772
1773 xs->xattr_bh = blk_bh; 2053 xs->xattr_bh = blk_bh;
2054 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1774 2055
1775 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 2056 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1776 xs->header = &xb->xb_attrs.xb_header; 2057 xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
1804 */ 2085 */
1805static int ocfs2_xattr_block_set(struct inode *inode, 2086static int ocfs2_xattr_block_set(struct inode *inode,
1806 struct ocfs2_xattr_info *xi, 2087 struct ocfs2_xattr_info *xi,
1807 struct ocfs2_xattr_search *xs) 2088 struct ocfs2_xattr_search *xs,
2089 struct ocfs2_xattr_set_ctxt *ctxt)
1808{ 2090{
1809 struct buffer_head *new_bh = NULL; 2091 struct buffer_head *new_bh = NULL;
1810 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2092 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1811 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2093 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1812 struct ocfs2_alloc_context *meta_ac = NULL; 2094 handle_t *handle = ctxt->handle;
1813 handle_t *handle = NULL;
1814 struct ocfs2_xattr_block *xblk = NULL; 2095 struct ocfs2_xattr_block *xblk = NULL;
1815 u16 suballoc_bit_start; 2096 u16 suballoc_bit_start;
1816 u32 num_got; 2097 u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
1818 int ret; 2099 int ret;
1819 2100
1820 if (!xs->xattr_bh) { 2101 if (!xs->xattr_bh) {
1821 /* 2102 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
1822 * Alloc one external block for extended attribute 2103 OCFS2_JOURNAL_ACCESS_CREATE);
1823 * outside of inode.
1824 */
1825 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
1826 if (ret < 0) { 2104 if (ret < 0) {
1827 mlog_errno(ret); 2105 mlog_errno(ret);
1828 goto out; 2106 goto end;
1829 }
1830 handle = ocfs2_start_trans(osb,
1831 OCFS2_XATTR_BLOCK_CREATE_CREDITS);
1832 if (IS_ERR(handle)) {
1833 ret = PTR_ERR(handle);
1834 mlog_errno(ret);
1835 goto out;
1836 }
1837 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1838 OCFS2_JOURNAL_ACCESS_CREATE);
1839 if (ret < 0) {
1840 mlog_errno(ret);
1841 goto out_commit;
1842 } 2107 }
1843 2108
1844 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2109 ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
1845 &suballoc_bit_start, &num_got, 2110 &suballoc_bit_start, &num_got,
1846 &first_blkno); 2111 &first_blkno);
1847 if (ret < 0) { 2112 if (ret < 0) {
1848 mlog_errno(ret); 2113 mlog_errno(ret);
1849 goto out_commit; 2114 goto end;
1850 } 2115 }
1851 2116
1852 new_bh = sb_getblk(inode->i_sb, first_blkno); 2117 new_bh = sb_getblk(inode->i_sb, first_blkno);
1853 ocfs2_set_new_buffer_uptodate(inode, new_bh); 2118 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1854 2119
1855 ret = ocfs2_journal_access(handle, inode, new_bh, 2120 ret = ocfs2_journal_access_xb(handle, inode, new_bh,
1856 OCFS2_JOURNAL_ACCESS_CREATE); 2121 OCFS2_JOURNAL_ACCESS_CREATE);
1857 if (ret < 0) { 2122 if (ret < 0) {
1858 mlog_errno(ret); 2123 mlog_errno(ret);
1859 goto out_commit; 2124 goto end;
1860 } 2125 }
1861 2126
1862 /* Initialize ocfs2_xattr_block */ 2127 /* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
1874 xs->end = (void *)xblk + inode->i_sb->s_blocksize; 2139 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
1875 xs->here = xs->header->xh_entries; 2140 xs->here = xs->header->xh_entries;
1876 2141
1877
1878 ret = ocfs2_journal_dirty(handle, new_bh); 2142 ret = ocfs2_journal_dirty(handle, new_bh);
1879 if (ret < 0) { 2143 if (ret < 0) {
1880 mlog_errno(ret); 2144 mlog_errno(ret);
1881 goto out_commit; 2145 goto end;
1882 } 2146 }
1883 di->i_xattr_loc = cpu_to_le64(first_blkno); 2147 di->i_xattr_loc = cpu_to_le64(first_blkno);
1884 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2148 ocfs2_journal_dirty(handle, xs->inode_bh);
1885 if (ret < 0)
1886 mlog_errno(ret);
1887out_commit:
1888 ocfs2_commit_trans(osb, handle);
1889out:
1890 if (meta_ac)
1891 ocfs2_free_alloc_context(meta_ac);
1892 if (ret < 0)
1893 return ret;
1894 } else 2149 } else
1895 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2150 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1896 2151
1897 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2152 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
1898 /* Set extended attribute into external block */ 2153 /* Set extended attribute into external block */
1899 ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); 2154 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
2155 OCFS2_HAS_XATTR_FL);
1900 if (!ret || ret != -ENOSPC) 2156 if (!ret || ret != -ENOSPC)
1901 goto end; 2157 goto end;
1902 2158
1903 ret = ocfs2_xattr_create_index_block(inode, xs); 2159 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
1904 if (ret) 2160 if (ret)
1905 goto end; 2161 goto end;
1906 } 2162 }
1907 2163
1908 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); 2164 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
1909 2165
1910end: 2166end:
1911 2167
1912 return ret; 2168 return ret;
1913} 2169}
1914 2170
2171/* Check whether the new xattr can be inserted into the inode. */
2172static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2173 struct ocfs2_xattr_info *xi,
2174 struct ocfs2_xattr_search *xs)
2175{
2176 u64 value_size;
2177 struct ocfs2_xattr_entry *last;
2178 int free, i;
2179 size_t min_offs = xs->end - xs->base;
2180
2181 if (!xs->header)
2182 return 0;
2183
2184 last = xs->header->xh_entries;
2185
2186 for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
2187 size_t offs = le16_to_cpu(last->xe_name_offset);
2188 if (offs < min_offs)
2189 min_offs = offs;
2190 last += 1;
2191 }
2192
2193 free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
2194 if (free < 0)
2195 return 0;
2196
2197 BUG_ON(!xs->not_found);
2198
2199 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
2200 value_size = OCFS2_XATTR_ROOT_SIZE;
2201 else
2202 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2203
2204 if (free >= sizeof(struct ocfs2_xattr_entry) +
2205 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2206 return 1;
2207
2208 return 0;
2209}
2210
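
Restating the fit test above as one expression (assuming, as in ocfs2, that OCFS2_XATTR_SIZE() rounds up to a 4-byte boundary): the new entry fits if the slack between the entry array and the lowest name/value blob covers one entry record, the padded name, and either the padded value (stored locally) or a fixed tree-root record (stored out of line):

	size_t need = sizeof(struct ocfs2_xattr_entry) +
		      OCFS2_XATTR_SIZE(strlen(xi->name)) +
		      (xi->value_len > OCFS2_XATTR_INLINE_SIZE ?
		       OCFS2_XATTR_ROOT_SIZE :			/* value goes out of line */
		       OCFS2_XATTR_SIZE(xi->value_len));	/* value stays local */
	/* fits iff free >= need, with free computed as in the function above */
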
2211static int ocfs2_calc_xattr_set_need(struct inode *inode,
2212 struct ocfs2_dinode *di,
2213 struct ocfs2_xattr_info *xi,
2214 struct ocfs2_xattr_search *xis,
2215 struct ocfs2_xattr_search *xbs,
2216 int *clusters_need,
2217 int *meta_need,
2218 int *credits_need)
2219{
2220 int ret = 0, old_in_xb = 0;
2221 int clusters_add = 0, meta_add = 0, credits = 0;
2222 struct buffer_head *bh = NULL;
2223 struct ocfs2_xattr_block *xb = NULL;
2224 struct ocfs2_xattr_entry *xe = NULL;
2225 struct ocfs2_xattr_value_root *xv = NULL;
2226 char *base = NULL;
2227 int name_offset, name_len = 0;
2228 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2229 xi->value_len);
2230 u64 value_size;
2231
2232 /*
2233 * Calculate the clusters we need to write.
2234 * No matter whether we replace an old one or add a new one,
2235 * we need this for writing.
2236 */
2237 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
2238 credits += new_clusters *
2239 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2240
2241 if (xis->not_found && xbs->not_found) {
2242 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2243
2244 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
2245 clusters_add += new_clusters;
2246 credits += ocfs2_calc_extend_credits(inode->i_sb,
2247 &def_xv.xv.xr_list,
2248 new_clusters);
2249 }
2250
2251 goto meta_guess;
2252 }
2253
2254 if (!xis->not_found) {
2255 xe = xis->here;
2256 name_offset = le16_to_cpu(xe->xe_name_offset);
2257 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
2258 base = xis->base;
2259 credits += OCFS2_INODE_UPDATE_CREDITS;
2260 } else {
2261 int i, block_off = 0;
2262 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
2263 xe = xbs->here;
2264 name_offset = le16_to_cpu(xe->xe_name_offset);
2265 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
2266 i = xbs->here - xbs->header->xh_entries;
2267 old_in_xb = 1;
2268
2269 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2270 ret = ocfs2_xattr_bucket_get_name_value(inode,
2271 bucket_xh(xbs->bucket),
2272 i, &block_off,
2273 &name_offset);
2274 base = bucket_block(xbs->bucket, block_off);
2275 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2276 } else {
2277 base = xbs->base;
2278 credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
2279 }
2280 }
2281
2282 /*
2283 * Deleting an xattr doesn't need metadata or cluster allocation,
2284 * so just calculate the credits and return.
2285 *
2286 * The credits for removing the value tree will be extended
2287 * by ocfs2_remove_extent itself.
2288 */
2289 if (!xi->value) {
2290 if (!ocfs2_xattr_is_local(xe))
2291 credits += ocfs2_remove_extent_credits(inode->i_sb);
2292
2293 goto out;
2294 }
2295
2296 /* do cluster allocation guess first. */
2297 value_size = le64_to_cpu(xe->xe_value_size);
2298
2299 if (old_in_xb) {
2300 /*
2301 * In xattr set, we always try to set the xe in the inode first,
2302 * so if it can be inserted into the inode successfully, the old
2303 * one will be removed from the xattr block, and this xattr
2304 * will be inserted into the inode as a new in-inode xattr.
2305 */
2306 if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
2307 clusters_add += new_clusters;
2308 credits += ocfs2_remove_extent_credits(inode->i_sb) +
2309 OCFS2_INODE_UPDATE_CREDITS;
2310 if (!ocfs2_xattr_is_local(xe))
2311 credits += ocfs2_calc_extend_credits(
2312 inode->i_sb,
2313 &def_xv.xv.xr_list,
2314 new_clusters);
2315 goto out;
2316 }
2317 }
2318
2319 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
2320 /* the new values will be stored outside. */
2321 u32 old_clusters = 0;
2322
2323 if (!ocfs2_xattr_is_local(xe)) {
2324 old_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2325 value_size);
2326 xv = (struct ocfs2_xattr_value_root *)
2327 (base + name_offset + name_len);
2328 value_size = OCFS2_XATTR_ROOT_SIZE;
2329 } else
2330 xv = &def_xv.xv;
2331
2332 if (old_clusters >= new_clusters) {
2333 credits += ocfs2_remove_extent_credits(inode->i_sb);
2334 goto out;
2335 } else {
2336 meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
2337 clusters_add += new_clusters - old_clusters;
2338 credits += ocfs2_calc_extend_credits(inode->i_sb,
2339 &xv->xr_list,
2340 new_clusters -
2341 old_clusters);
2342 if (value_size >= OCFS2_XATTR_ROOT_SIZE)
2343 goto out;
2344 }
2345 } else {
2346 /*
2347 * Now the new value will be stored inside. So if the new
2348 * value is smaller than the size of the value root or the old
2349 * value, we don't need any allocation; otherwise we have
2350 * to guess the metadata allocation.
2351 */
2352 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
2353 (!ocfs2_xattr_is_local(xe) &&
2354 OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
2355 goto out;
2356 }
2357
2358meta_guess:
2359 /* calculate metadata allocation. */
2360 if (di->i_xattr_loc) {
2361 if (!xbs->xattr_bh) {
2362 ret = ocfs2_read_xattr_block(inode,
2363 le64_to_cpu(di->i_xattr_loc),
2364 &bh);
2365 if (ret) {
2366 mlog_errno(ret);
2367 goto out;
2368 }
2369
2370 xb = (struct ocfs2_xattr_block *)bh->b_data;
2371 } else
2372 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
2373
2374 /*
2375 * If there is already an xattr tree, good, we can calculate
2376 * like other b-trees. Otherwise we may have to create
2377 * a tree; the credit calculation is borrowed from
2378 * ocfs2_calc_extend_credits with root_el = NULL. And the
2379 * new tree will be cluster based, so no meta is needed.
2380 */
2381 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2382 struct ocfs2_extent_list *el =
2383 &xb->xb_attrs.xb_root.xt_list;
2384 meta_add += ocfs2_extend_meta_needed(el);
2385 credits += ocfs2_calc_extend_credits(inode->i_sb,
2386 el, 1);
2387 } else
2388 credits += OCFS2_SUBALLOC_ALLOC + 1;
2389
2390 /*
2391 * This cluster will be used either for a new bucket or for
2392 * a new xattr block.
2393 * If the cluster size is the same as the bucket size, one
2394 * more is needed since we may need to extend the bucket
2395 * also.
2396 */
2397 clusters_add += 1;
2398 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2399 if (OCFS2_XATTR_BUCKET_SIZE ==
2400 OCFS2_SB(inode->i_sb)->s_clustersize) {
2401 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2402 clusters_add += 1;
2403 }
2404 } else {
2405 meta_add += 1;
2406 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
2407 }
2408out:
2409 if (clusters_need)
2410 *clusters_need = clusters_add;
2411 if (meta_need)
2412 *meta_need = meta_add;
2413 if (credits_need)
2414 *credits_need = credits;
2415 brelse(bh);
2416 return ret;
2417}
2418
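
To put numbers on the first branch of ocfs2_calc_xattr_set_need() (a brand-new xattr whose value is stored out of line), a toy walk-through with assumed 4 KB blocks and clusters; the helpers are the ones used above, the figures are illustrative only:

	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 10240); /* 10 KB -> 3 clusters */

	credits  = new_clusters * ocfs2_clusters_to_blocks(inode->i_sb, 1); /* value data blocks */
	credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);	/* bucket-sized metadata update */
	clusters_add = new_clusters;				/* reserved later via ctxt->data_ac */
	credits += ocfs2_calc_extend_credits(inode->i_sb, &def_xv.xv.xr_list,
					     new_clusters);	/* growing the value tree */
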
2419static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2420 struct ocfs2_dinode *di,
2421 struct ocfs2_xattr_info *xi,
2422 struct ocfs2_xattr_search *xis,
2423 struct ocfs2_xattr_search *xbs,
2424 struct ocfs2_xattr_set_ctxt *ctxt,
2425 int *credits)
2426{
2427 int clusters_add, meta_add, ret;
2428 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2429
2430 memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
2431
2432 ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
2433
2434 ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
2435 &clusters_add, &meta_add, credits);
2436 if (ret) {
2437 mlog_errno(ret);
2438 return ret;
2439 }
2440
2441 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2442 "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
2443
2444 if (meta_add) {
2445 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
2446 &ctxt->meta_ac);
2447 if (ret) {
2448 mlog_errno(ret);
2449 goto out;
2450 }
2451 }
2452
2453 if (clusters_add) {
2454 ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
2455 if (ret)
2456 mlog_errno(ret);
2457 }
2458out:
2459 if (ret) {
2460 if (ctxt->meta_ac) {
2461 ocfs2_free_alloc_context(ctxt->meta_ac);
2462 ctxt->meta_ac = NULL;
2463 }
2464
2465 /*
2466 * We cannot have an error and a non-NULL ctxt->data_ac.
2467 */
2468 }
2469
2470 return ret;
2471}
2472
2473static int __ocfs2_xattr_set_handle(struct inode *inode,
2474 struct ocfs2_dinode *di,
2475 struct ocfs2_xattr_info *xi,
2476 struct ocfs2_xattr_search *xis,
2477 struct ocfs2_xattr_search *xbs,
2478 struct ocfs2_xattr_set_ctxt *ctxt)
2479{
2480 int ret = 0, credits, old_found;
2481
2482 if (!xi->value) {
2483 /* Remove existing extended attribute */
2484 if (!xis->not_found)
2485 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
2486 else if (!xbs->not_found)
2487 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2488 } else {
2489 /* We always try to set the extended attribute in the inode first. */
2490 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
2491 if (!ret && !xbs->not_found) {
2492 /*
2493 * If that succeeds and the extended attribute exists in an
2494 * external block, then we will remove it.
2495 */
2496 xi->value = NULL;
2497 xi->value_len = 0;
2498
2499 old_found = xis->not_found;
2500 xis->not_found = -ENODATA;
2501 ret = ocfs2_calc_xattr_set_need(inode,
2502 di,
2503 xi,
2504 xis,
2505 xbs,
2506 NULL,
2507 NULL,
2508 &credits);
2509 xis->not_found = old_found;
2510 if (ret) {
2511 mlog_errno(ret);
2512 goto out;
2513 }
2514
2515 ret = ocfs2_extend_trans(ctxt->handle, credits +
2516 ctxt->handle->h_buffer_credits);
2517 if (ret) {
2518 mlog_errno(ret);
2519 goto out;
2520 }
2521 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2522 } else if (ret == -ENOSPC) {
2523 if (di->i_xattr_loc && !xbs->xattr_bh) {
2524 ret = ocfs2_xattr_block_find(inode,
2525 xi->name_index,
2526 xi->name, xbs);
2527 if (ret)
2528 goto out;
2529
2530 old_found = xis->not_found;
2531 xis->not_found = -ENODATA;
2532 ret = ocfs2_calc_xattr_set_need(inode,
2533 di,
2534 xi,
2535 xis,
2536 xbs,
2537 NULL,
2538 NULL,
2539 &credits);
2540 xis->not_found = old_found;
2541 if (ret) {
2542 mlog_errno(ret);
2543 goto out;
2544 }
2545
2546 ret = ocfs2_extend_trans(ctxt->handle, credits +
2547 ctxt->handle->h_buffer_credits);
2548 if (ret) {
2549 mlog_errno(ret);
2550 goto out;
2551 }
2552 }
2553 /*
2554 * If there is no space in the inode, we will set the extended
2555 * attribute in an external block.
2556 */
2557 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2558 if (ret)
2559 goto out;
2560 if (!xis->not_found) {
2561 /*
2562 * If that succeeds and the extended attribute
2563 * exists in the inode, we will remove it.
2564 */
2565 xi->value = NULL;
2566 xi->value_len = 0;
2567 xbs->not_found = -ENODATA;
2568 ret = ocfs2_calc_xattr_set_need(inode,
2569 di,
2570 xi,
2571 xis,
2572 xbs,
2573 NULL,
2574 NULL,
2575 &credits);
2576 if (ret) {
2577 mlog_errno(ret);
2578 goto out;
2579 }
2580
2581 ret = ocfs2_extend_trans(ctxt->handle, credits +
2582 ctxt->handle->h_buffer_credits);
2583 if (ret) {
2584 mlog_errno(ret);
2585 goto out;
2586 }
2587 ret = ocfs2_xattr_ibody_set(inode, xi,
2588 xis, ctxt);
2589 }
2590 }
2591 }
2592
2593 if (!ret) {
2594 /* Update inode ctime. */
2595 ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
2596 OCFS2_JOURNAL_ACCESS_WRITE);
2597 if (ret) {
2598 mlog_errno(ret);
2599 goto out;
2600 }
2601
2602 inode->i_ctime = CURRENT_TIME;
2603 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
2604 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
2605 ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
2606 }
2607out:
2608 return ret;
2609}
2610
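
One pattern repeats three times in __ocfs2_xattr_set_handle() above: whenever the operation grows a second leg (removing from the block after an in-inode insert, or vice versa), the code re-prices just that leg and enlarges the running handle instead of opening a new transaction. The kernel of it, lifted from the hunks (h_buffer_credits is the JBD count of credits still unused on the handle):

	ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
					NULL, NULL, &credits);	/* price the next leg only */
	if (!ret)
		ret = ocfs2_extend_trans(ctxt->handle,
					 credits + ctxt->handle->h_buffer_credits);
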
2611/*
2612 * This function is only called during inode creation,
2613 * to init the security/acl xattrs of the new inode.
2614 * All transaction credits have been reserved in mknod.
2615 */
2616int ocfs2_xattr_set_handle(handle_t *handle,
2617 struct inode *inode,
2618 struct buffer_head *di_bh,
2619 int name_index,
2620 const char *name,
2621 const void *value,
2622 size_t value_len,
2623 int flags,
2624 struct ocfs2_alloc_context *meta_ac,
2625 struct ocfs2_alloc_context *data_ac)
2626{
2627 struct ocfs2_dinode *di;
2628 int ret;
2629
2630 struct ocfs2_xattr_info xi = {
2631 .name_index = name_index,
2632 .name = name,
2633 .value = value,
2634 .value_len = value_len,
2635 };
2636
2637 struct ocfs2_xattr_search xis = {
2638 .not_found = -ENODATA,
2639 };
2640
2641 struct ocfs2_xattr_search xbs = {
2642 .not_found = -ENODATA,
2643 };
2644
2645 struct ocfs2_xattr_set_ctxt ctxt = {
2646 .handle = handle,
2647 .meta_ac = meta_ac,
2648 .data_ac = data_ac,
2649 };
2650
2651 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
2652 return -EOPNOTSUPP;
2653
2654 /*
2655 * In an extreme situation, we may need an xattr bucket when
2656 * the block size is too small. We have already reserved
2657 * the credits for the bucket in mknod.
2658 */
2659 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
2660 xbs.bucket = ocfs2_xattr_bucket_new(inode);
2661 if (!xbs.bucket) {
2662 mlog_errno(-ENOMEM);
2663 return -ENOMEM;
2664 }
2665 }
2666
2667 xis.inode_bh = xbs.inode_bh = di_bh;
2668 di = (struct ocfs2_dinode *)di_bh->b_data;
2669
2670 down_write(&OCFS2_I(inode)->ip_xattr_sem);
2671
2672 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
2673 if (ret)
2674 goto cleanup;
2675 if (xis.not_found) {
2676 ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
2677 if (ret)
2678 goto cleanup;
2679 }
2680
2681 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
2682
2683cleanup:
2684 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2685 brelse(xbs.xattr_bh);
2686 ocfs2_xattr_bucket_free(xbs.bucket);
2687
2688 return ret;
2689}
2690
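
A hypothetical call site for ocfs2_xattr_set_handle(), the shape mknod-time security xattr initialization would take; the handle and both allocators come from the caller's earlier reservation, and the name_index constant and names here are illustrative only:

	ret = ocfs2_xattr_set_handle(handle, inode, new_di_bh,
				     OCFS2_XATTR_INDEX_SECURITY,	/* assumed index */
				     "selinux", value, value_len,
				     0 /* flags */, meta_ac, data_ac);
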
1915/* 2691/*
1916 * ocfs2_xattr_set() 2692 * ocfs2_xattr_set()
1917 * 2693 *
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
1928{ 2704{
1929 struct buffer_head *di_bh = NULL; 2705 struct buffer_head *di_bh = NULL;
1930 struct ocfs2_dinode *di; 2706 struct ocfs2_dinode *di;
1931 int ret; 2707 int ret, credits;
1932 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 2708 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2709 struct inode *tl_inode = osb->osb_tl_inode;
2710 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1933 2711
1934 struct ocfs2_xattr_info xi = { 2712 struct ocfs2_xattr_info xi = {
1935 .name_index = name_index, 2713 .name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
1949 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) 2727 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
1950 return -EOPNOTSUPP; 2728 return -EOPNOTSUPP;
1951 2729
2730 /*
2731 * Only xbs will be used on indexed trees. xis doesn't need a
2732 * bucket.
2733 */
2734 xbs.bucket = ocfs2_xattr_bucket_new(inode);
2735 if (!xbs.bucket) {
2736 mlog_errno(-ENOMEM);
2737 return -ENOMEM;
2738 }
2739
1952 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2740 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1953 if (ret < 0) { 2741 if (ret < 0) {
1954 mlog_errno(ret); 2742 mlog_errno(ret);
1955 return ret; 2743 goto cleanup_nolock;
1956 } 2744 }
1957 xis.inode_bh = xbs.inode_bh = di_bh; 2745 xis.inode_bh = xbs.inode_bh = di_bh;
1958 di = (struct ocfs2_dinode *)di_bh->b_data; 2746 di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
1984 goto cleanup; 2772 goto cleanup;
1985 } 2773 }
1986 2774
1987 if (!value) { 2775
1988 /* Remove existing extended attribute */ 2776 mutex_lock(&tl_inode->i_mutex);
1989 if (!xis.not_found) 2777
1990 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); 2778 if (ocfs2_truncate_log_needs_flush(osb)) {
1991 else if (!xbs.not_found) 2779 ret = __ocfs2_flush_truncate_log(osb);
1992 ret = ocfs2_xattr_block_set(inode, &xi, &xbs); 2780 if (ret < 0) {
1993 } else { 2781 mutex_unlock(&tl_inode->i_mutex);
1994 /* We always try to set extended attribute into inode first*/ 2782 mlog_errno(ret);
1995 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); 2783 goto cleanup;
1996 if (!ret && !xbs.not_found) {
1997 /*
1998 * If succeed and that extended attribute existing in
1999 * external block, then we will remove it.
2000 */
2001 xi.value = NULL;
2002 xi.value_len = 0;
2003 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2004 } else if (ret == -ENOSPC) {
2005 if (di->i_xattr_loc && !xbs.xattr_bh) {
2006 ret = ocfs2_xattr_block_find(inode, name_index,
2007 name, &xbs);
2008 if (ret)
2009 goto cleanup;
2010 }
2011 /*
2012 * If no space in inode, we will set extended attribute
2013 * into external block.
2014 */
2015 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2016 if (ret)
2017 goto cleanup;
2018 if (!xis.not_found) {
2019 /*
2020 * If succeed and that extended attribute
2021 * existing in inode, we will remove it.
2022 */
2023 xi.value = NULL;
2024 xi.value_len = 0;
2025 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2026 }
2027 } 2784 }
2028 } 2785 }
2786 mutex_unlock(&tl_inode->i_mutex);
2787
2788 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
2789 &xbs, &ctxt, &credits);
2790 if (ret) {
2791 mlog_errno(ret);
2792 goto cleanup;
2793 }
2794
2795 /* We need to update the inode's ctime field, so add a credit for it. */
2796 credits += OCFS2_INODE_UPDATE_CREDITS;
2797 ctxt.handle = ocfs2_start_trans(osb, credits);
2798 if (IS_ERR(ctxt.handle)) {
2799 ret = PTR_ERR(ctxt.handle);
2800 mlog_errno(ret);
2801 goto cleanup;
2802 }
2803
2804 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
2805
2806 ocfs2_commit_trans(osb, ctxt.handle);
2807
2808 if (ctxt.data_ac)
2809 ocfs2_free_alloc_context(ctxt.data_ac);
2810 if (ctxt.meta_ac)
2811 ocfs2_free_alloc_context(ctxt.meta_ac);
2812 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
2813 ocfs2_schedule_truncate_log_flush(osb, 1);
2814 ocfs2_run_deallocs(osb, &ctxt.dealloc);
2029cleanup: 2815cleanup:
2030 up_write(&OCFS2_I(inode)->ip_xattr_sem); 2816 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2031 ocfs2_inode_unlock(inode, 1); 2817 ocfs2_inode_unlock(inode, 1);
2818cleanup_nolock:
2032 brelse(di_bh); 2819 brelse(di_bh);
2033 brelse(xbs.xattr_bh); 2820 brelse(xbs.xattr_bh);
2034 for (i = 0; i < blk_per_bucket; i++) 2821 ocfs2_xattr_bucket_free(xbs.bucket);
2035 brelse(xbs.bucket.bhs[i]);
2036 2822
2037 return ret; 2823 return ret;
2038} 2824}
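
Also new in ocfs2_xattr_set() above is the truncate-log guard taken before the transaction is sized, so a later ocfs2_run_deallocs() cannot find the log full. Isolated, with the locking kept:

	mutex_lock(&tl_inode->i_mutex);
	if (ocfs2_truncate_log_needs_flush(osb)) {
		ret = __ocfs2_flush_truncate_log(osb);	/* make room for our deallocs */
		if (ret < 0)
			mlog_errno(ret);
	}
	mutex_unlock(&tl_inode->i_mutex);
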
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
2107 void *para); 2893 void *para);
2108 2894
2109static int ocfs2_find_xe_in_bucket(struct inode *inode, 2895static int ocfs2_find_xe_in_bucket(struct inode *inode,
2110 struct buffer_head *header_bh, 2896 struct ocfs2_xattr_bucket *bucket,
2111 int name_index, 2897 int name_index,
2112 const char *name, 2898 const char *name,
2113 u32 name_hash, 2899 u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2115 int *found) 2901 int *found)
2116{ 2902{
2117 int i, ret = 0, cmp = 1, block_off, new_offset; 2903 int i, ret = 0, cmp = 1, block_off, new_offset;
2118 struct ocfs2_xattr_header *xh = 2904 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
2119 (struct ocfs2_xattr_header *)header_bh->b_data;
2120 size_t name_len = strlen(name); 2905 size_t name_len = strlen(name);
2121 struct ocfs2_xattr_entry *xe = NULL; 2906 struct ocfs2_xattr_entry *xe = NULL;
2122 struct buffer_head *name_bh = NULL;
2123 char *xe_name; 2907 char *xe_name;
2124 2908
2125 /* 2909 /*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2150 break; 2934 break;
2151 } 2935 }
2152 2936
2153 ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
2154 &name_bh);
2155 if (ret) {
2156 mlog_errno(ret);
2157 break;
2158 }
2159 xe_name = name_bh->b_data + new_offset;
2160 2937
2161 cmp = memcmp(name, xe_name, name_len); 2938 xe_name = bucket_block(bucket, block_off) + new_offset;
2162 brelse(name_bh); 2939 if (!memcmp(name, xe_name, name_len)) {
2163 name_bh = NULL;
2164
2165 if (cmp == 0) {
2166 *xe_index = i; 2940 *xe_index = i;
2167 *found = 1; 2941 *found = 1;
2168 ret = 0; 2942 ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2192 struct ocfs2_xattr_search *xs) 2966 struct ocfs2_xattr_search *xs)
2193{ 2967{
2194 int ret, found = 0; 2968 int ret, found = 0;
2195 struct buffer_head *bh = NULL;
2196 struct buffer_head *lower_bh = NULL;
2197 struct ocfs2_xattr_header *xh = NULL; 2969 struct ocfs2_xattr_header *xh = NULL;
2198 struct ocfs2_xattr_entry *xe = NULL; 2970 struct ocfs2_xattr_entry *xe = NULL;
2199 u16 index = 0; 2971 u16 index = 0;
2200 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 2972 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2201 int low_bucket = 0, bucket, high_bucket; 2973 int low_bucket = 0, bucket, high_bucket;
2974 struct ocfs2_xattr_bucket *search;
2202 u32 last_hash; 2975 u32 last_hash;
2203 u64 blkno; 2976 u64 blkno, lower_blkno = 0;
2204 2977
2205 ret = ocfs2_read_block(inode, p_blkno, &bh); 2978 search = ocfs2_xattr_bucket_new(inode);
2979 if (!search) {
2980 ret = -ENOMEM;
2981 mlog_errno(ret);
2982 goto out;
2983 }
2984
2985 ret = ocfs2_read_xattr_bucket(search, p_blkno);
2206 if (ret) { 2986 if (ret) {
2207 mlog_errno(ret); 2987 mlog_errno(ret);
2208 goto out; 2988 goto out;
2209 } 2989 }
2210 2990
2211 xh = (struct ocfs2_xattr_header *)bh->b_data; 2991 xh = bucket_xh(search);
2212 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; 2992 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
2213
2214 while (low_bucket <= high_bucket) { 2993 while (low_bucket <= high_bucket) {
2215 brelse(bh); 2994 ocfs2_xattr_bucket_relse(search);
2216 bh = NULL;
2217 bucket = (low_bucket + high_bucket) / 2;
2218 2995
2996 bucket = (low_bucket + high_bucket) / 2;
2219 blkno = p_blkno + bucket * blk_per_bucket; 2997 blkno = p_blkno + bucket * blk_per_bucket;
2220 2998 ret = ocfs2_read_xattr_bucket(search, blkno);
2221 ret = ocfs2_read_block(inode, blkno, &bh);
2222 if (ret) { 2999 if (ret) {
2223 mlog_errno(ret); 3000 mlog_errno(ret);
2224 goto out; 3001 goto out;
2225 } 3002 }
2226 3003
2227 xh = (struct ocfs2_xattr_header *)bh->b_data; 3004 xh = bucket_xh(search);
2228 xe = &xh->xh_entries[0]; 3005 xe = &xh->xh_entries[0];
2229 if (name_hash < le32_to_cpu(xe->xe_name_hash)) { 3006 if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
2230 high_bucket = bucket - 1; 3007 high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2241 3018
2242 last_hash = le32_to_cpu(xe->xe_name_hash); 3019 last_hash = le32_to_cpu(xe->xe_name_hash);
2243 3020
2244 /* record lower_bh which may be the insert place. */ 3021 /* record lower_blkno which may be the insert place. */
2245 brelse(lower_bh); 3022 lower_blkno = blkno;
2246 lower_bh = bh;
2247 bh = NULL;
2248 3023
2249 if (name_hash > le32_to_cpu(xe->xe_name_hash)) { 3024 if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
2250 low_bucket = bucket + 1; 3025 low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2252 } 3027 }
2253 3028
2254 /* the searched xattr should reside in this bucket if it exists. */ 3029 /* the searched xattr should reside in this bucket if it exists. */
2255 ret = ocfs2_find_xe_in_bucket(inode, lower_bh, 3030 ret = ocfs2_find_xe_in_bucket(inode, search,
2256 name_index, name, name_hash, 3031 name_index, name, name_hash,
2257 &index, &found); 3032 &index, &found);
2258 if (ret) { 3033 if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2267 * When the xattr's hash value falls in the gap between 2 buckets, 3042 * When the xattr's hash value falls in the gap between 2 buckets,
2268 * we will always set it to the previous bucket. 3043 * we will always set it to the previous bucket.
2269 */ 3044 */
2270 if (!lower_bh) { 3045 if (!lower_blkno)
2271 /* 3046 lower_blkno = p_blkno;
2272 * We can't find any bucket whose first name_hash is less 3047
2273 * than the find name_hash. 3048 /* This should be in cache - we just read it during the search */
2274 */ 3049 ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
2275 BUG_ON(bh->b_blocknr != p_blkno); 3050 if (ret) {
2276 lower_bh = bh; 3051 mlog_errno(ret);
2277 bh = NULL; 3052 goto out;
2278 } 3053 }
2279 xs->bucket.bhs[0] = lower_bh;
2280 xs->bucket.xh = (struct ocfs2_xattr_header *)
2281 xs->bucket.bhs[0]->b_data;
2282 lower_bh = NULL;
2283 3054
2284 xs->header = xs->bucket.xh; 3055 xs->header = bucket_xh(xs->bucket);
2285 xs->base = xs->bucket.bhs[0]->b_data; 3056 xs->base = bucket_block(xs->bucket, 0);
2286 xs->end = xs->base + inode->i_sb->s_blocksize; 3057 xs->end = xs->base + inode->i_sb->s_blocksize;
2287 3058
2288 if (found) { 3059 if (found) {
2289 /*
2290 * If we have found the xattr enty, read all the blocks in
2291 * this bucket.
2292 */
2293 ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
2294 blk_per_bucket - 1, &xs->bucket.bhs[1],
2295 0);
2296 if (ret) {
2297 mlog_errno(ret);
2298 goto out;
2299 }
2300
2301 xs->here = &xs->header->xh_entries[index]; 3060 xs->here = &xs->header->xh_entries[index];
2302 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, 3061 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
2303 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); 3062 (unsigned long long)bucket_blkno(xs->bucket), index);
2304 } else 3063 } else
2305 ret = -ENODATA; 3064 ret = -ENODATA;
2306 3065
2307out: 3066out:
2308 brelse(bh); 3067 ocfs2_xattr_bucket_free(search);
2309 brelse(lower_bh);
2310 return ret; 3068 return ret;
2311} 3069}
2312 3070
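
A self-contained toy model of the bucket search above: buckets are sorted by the hash of their first entry, and the loop keeps the last bucket whose first hash is <= the lookup hash (lower_blkno in the real code), which is also the insert target when the hash falls in a gap. This sketch deliberately drops the real code's extra probe of each bucket's last entry:

	#include <stdint.h>

	/* Return the index of the bucket that must hold (or receive) hash h. */
	static int find_bucket(const uint32_t *first_hash, int nbuckets, uint32_t h)
	{
		int low = 0, high = nbuckets - 1, lower = 0;

		while (low <= high) {
			int mid = (low + high) / 2;

			if (h < first_hash[mid]) {
				high = mid - 1;		/* h sorts before this bucket */
			} else {
				lower = mid;		/* candidate home for h */
				if (h == first_hash[mid])
					break;		/* must live in this bucket */
				low = mid + 1;
			}
		}
		return lower;	/* falls back to bucket 0, like the p_blkno fallback above */
	}
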
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
2357 xattr_bucket_func *func, 3115 xattr_bucket_func *func,
2358 void *para) 3116 void *para)
2359{ 3117{
2360 int i, j, ret = 0; 3118 int i, ret = 0;
2361 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2362 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); 3119 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
2363 u32 num_buckets = clusters * bpc; 3120 u32 num_buckets = clusters * bpc;
2364 struct ocfs2_xattr_bucket bucket; 3121 struct ocfs2_xattr_bucket *bucket;
2365 3122
2366 memset(&bucket, 0, sizeof(bucket)); 3123 bucket = ocfs2_xattr_bucket_new(inode);
3124 if (!bucket) {
3125 mlog_errno(-ENOMEM);
3126 return -ENOMEM;
3127 }
2367 3128
2368 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", 3129 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
2369 clusters, (unsigned long long)blkno); 3130 clusters, (unsigned long long)blkno);
2370 3131
2371 for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { 3132 for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
2372 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, 3133 ret = ocfs2_read_xattr_bucket(bucket, blkno);
2373 bucket.bhs, 0);
2374 if (ret) { 3134 if (ret) {
2375 mlog_errno(ret); 3135 mlog_errno(ret);
2376 goto out; 3136 break;
2377 } 3137 }
2378 3138
2379 bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
2380 /* 3139 /*
2381 * The real bucket num in this series of blocks is stored 3140 * The real bucket num in this series of blocks is stored
2382 * in the 1st bucket. 3141 * in the 1st bucket.
2383 */ 3142 */
2384 if (i == 0) 3143 if (i == 0)
2385 num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); 3144 num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
2386 3145
2387 mlog(0, "iterating xattr bucket %llu, first hash %u\n", 3146 mlog(0, "iterating xattr bucket %llu, first hash %u\n",
2388 (unsigned long long)blkno, 3147 (unsigned long long)blkno,
2389 le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); 3148 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
2390 if (func) { 3149 if (func) {
2391 ret = func(inode, &bucket, para); 3150 ret = func(inode, bucket, para);
2392 if (ret) { 3151 if (ret)
2393 mlog_errno(ret); 3152 mlog_errno(ret);
2394 break; 3153 /* Fall through to bucket_relse() */
2395 }
2396 } 3154 }
2397 3155
2398 for (j = 0; j < blk_per_bucket; j++) 3156 ocfs2_xattr_bucket_relse(bucket);
2399 brelse(bucket.bhs[j]); 3157 if (ret)
2400 memset(&bucket, 0, sizeof(bucket)); 3158 break;
2401 } 3159 }
2402 3160
2403out: 3161 ocfs2_xattr_bucket_free(bucket);
2404 for (j = 0; j < blk_per_bucket; j++)
2405 brelse(bucket.bhs[j]);
2406
2407 return ret; 3162 return ret;
2408} 3163}
2409 3164
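
For orientation, a minimal xattr_bucket_func callback matching the typedef shown earlier in this diff; the entry-counting callback itself is hypothetical:

	static int count_bucket_entries(struct inode *inode,
					struct ocfs2_xattr_bucket *bucket,
					void *para)
	{
		int *total = para;

		*total += le16_to_cpu(bucket_xh(bucket)->xh_count);
		return 0;	/* non-zero makes ocfs2_iterate_xattr_buckets() stop */
	}
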
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
2441 int i, block_off, new_offset; 3196 int i, block_off, new_offset;
2442 const char *prefix, *name; 3197 const char *prefix, *name;
2443 3198
2444 for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { 3199 for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
2445 struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; 3200 struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
2446 type = ocfs2_xattr_get_type(entry); 3201 type = ocfs2_xattr_get_type(entry);
2447 prefix = ocfs2_xattr_prefix(type); 3202 prefix = ocfs2_xattr_prefix(type);
2448 3203
2449 if (prefix) { 3204 if (prefix) {
2450 ret = ocfs2_xattr_bucket_get_name_value(inode, 3205 ret = ocfs2_xattr_bucket_get_name_value(inode,
2451 bucket->xh, 3206 bucket_xh(bucket),
2452 i, 3207 i,
2453 &block_off, 3208 &block_off,
2454 &new_offset); 3209 &new_offset);
2455 if (ret) 3210 if (ret)
2456 break; 3211 break;
2457 3212
2458 name = (const char *)bucket->bhs[block_off]->b_data + 3213 name = (const char *)bucket_block(bucket, block_off) +
2459 new_offset; 3214 new_offset;
2460 ret = ocfs2_xattr_list_entry(xl->buffer, 3215 ret = ocfs2_xattr_list_entry(xl->buffer,
2461 xl->buffer_size, 3216 xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
2540/* 3295/*
2541 * When the ocfs2_xattr_block is filled up, a new bucket will be created 3296 * When the ocfs2_xattr_block is filled up, a new bucket will be created
2542 * and all the xattr entries will be moved to the new bucket. 3297 * and all the xattr entries will be moved to the new bucket.
3298 * The header goes at the start of the bucket, and the names+values are
3299 * filled from the end. This is why *target starts as the last buffer.
2543 * Note: we need to sort the entries since they are not saved in order 3300 * Note: we need to sort the entries since they are not saved in order
2544 * in the ocfs2_xattr_block. 3301 * in the ocfs2_xattr_block.
2545 */ 3302 */
2546static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, 3303static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2547 struct buffer_head *xb_bh, 3304 struct buffer_head *xb_bh,
2548 struct buffer_head *xh_bh, 3305 struct ocfs2_xattr_bucket *bucket)
2549 struct buffer_head *data_bh)
2550{ 3306{
2551 int i, blocksize = inode->i_sb->s_blocksize; 3307 int i, blocksize = inode->i_sb->s_blocksize;
3308 int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2552 u16 offset, size, off_change; 3309 u16 offset, size, off_change;
2553 struct ocfs2_xattr_entry *xe; 3310 struct ocfs2_xattr_entry *xe;
2554 struct ocfs2_xattr_block *xb = 3311 struct ocfs2_xattr_block *xb =
2555 (struct ocfs2_xattr_block *)xb_bh->b_data; 3312 (struct ocfs2_xattr_block *)xb_bh->b_data;
2556 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; 3313 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
2557 struct ocfs2_xattr_header *xh = 3314 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
2558 (struct ocfs2_xattr_header *)xh_bh->b_data;
2559 u16 count = le16_to_cpu(xb_xh->xh_count); 3315 u16 count = le16_to_cpu(xb_xh->xh_count);
2560 char *target = xh_bh->b_data, *src = xb_bh->b_data; 3316 char *src = xb_bh->b_data;
3317 char *target = bucket_block(bucket, blks - 1);
2561 3318
2562 mlog(0, "cp xattr from block %llu to bucket %llu\n", 3319 mlog(0, "cp xattr from block %llu to bucket %llu\n",
2563 (unsigned long long)xb_bh->b_blocknr, 3320 (unsigned long long)xb_bh->b_blocknr,
2564 (unsigned long long)xh_bh->b_blocknr); 3321 (unsigned long long)bucket_blkno(bucket));
3322
3323 for (i = 0; i < blks; i++)
3324 memset(bucket_block(bucket, i), 0, blocksize);
2565 3325
2566 memset(xh_bh->b_data, 0, blocksize);
2567 if (data_bh)
2568 memset(data_bh->b_data, 0, blocksize);
2569 /* 3326 /*
2570 * Since the xe_name_offset is based on ocfs2_xattr_header, 3327 * Since the xe_name_offset is based on ocfs2_xattr_header,
2571 * there is an offset change corresponding to the change of 3328 * there is an offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2577 size = blocksize - offset; 3334 size = blocksize - offset;
2578 3335
2579 /* copy all the names and values. */ 3336 /* copy all the names and values. */
2580 if (data_bh)
2581 target = data_bh->b_data;
2582 memcpy(target + offset, src + offset, size); 3337 memcpy(target + offset, src + offset, size);
2583 3338
2584 /* Init new header now. */ 3339 /* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2588 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); 3343 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
2589 3344
2590 /* copy all the entries. */ 3345 /* copy all the entries. */
2591 target = xh_bh->b_data; 3346 target = bucket_block(bucket, 0);
2592 offset = offsetof(struct ocfs2_xattr_header, xh_entries); 3347 offset = offsetof(struct ocfs2_xattr_header, xh_entries);
2593 size = count * sizeof(struct ocfs2_xattr_entry); 3348 size = count * sizeof(struct ocfs2_xattr_entry);
2594 memcpy(target + offset, (char *)xb_xh + offset, size); 3349 memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2614 * While if the entry is in index b-tree, "bucket" indicates the 3369 * While if the entry is in index b-tree, "bucket" indicates the
2615 * real place of the xattr. 3370 * real place of the xattr.
2616 */ 3371 */
2617static int ocfs2_xattr_update_xattr_search(struct inode *inode, 3372static void ocfs2_xattr_update_xattr_search(struct inode *inode,
2618 struct ocfs2_xattr_search *xs, 3373 struct ocfs2_xattr_search *xs,
2619 struct buffer_head *old_bh, 3374 struct buffer_head *old_bh)
2620 struct buffer_head *new_bh)
2621{ 3375{
2622 int ret = 0;
2623 char *buf = old_bh->b_data; 3376 char *buf = old_bh->b_data;
2624 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; 3377 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
2625 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; 3378 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
2626 int i, blocksize = inode->i_sb->s_blocksize; 3379 int i;
2627 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2628
2629 xs->bucket.bhs[0] = new_bh;
2630 get_bh(new_bh);
2631 xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
2632 xs->header = xs->bucket.xh;
2633 3380
2634 xs->base = new_bh->b_data; 3381 xs->header = bucket_xh(xs->bucket);
3382 xs->base = bucket_block(xs->bucket, 0);
2635 xs->end = xs->base + inode->i_sb->s_blocksize; 3383 xs->end = xs->base + inode->i_sb->s_blocksize;
2636 3384
2637 if (!xs->not_found) { 3385 if (xs->not_found)
2638 if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { 3386 return;
2639 ret = ocfs2_read_blocks(inode,
2640 xs->bucket.bhs[0]->b_blocknr + 1,
2641 blk_per_bucket - 1, &xs->bucket.bhs[1],
2642 0);
2643 if (ret) {
2644 mlog_errno(ret);
2645 return ret;
2646 }
2647
2648 }
2649 i = xs->here - old_xh->xh_entries;
2650 xs->here = &xs->header->xh_entries[i];
2651 }
2652 3387
2653 return ret; 3388 i = xs->here - old_xh->xh_entries;
3389 xs->here = &xs->header->xh_entries[i];
2654} 3390}
2655 3391
2656static int ocfs2_xattr_create_index_block(struct inode *inode, 3392static int ocfs2_xattr_create_index_block(struct inode *inode,
2657 struct ocfs2_xattr_search *xs) 3393 struct ocfs2_xattr_search *xs,
3394 struct ocfs2_xattr_set_ctxt *ctxt)
2658{ 3395{
2659 int ret, credits = OCFS2_SUBALLOC_ALLOC; 3396 int ret;
2660 u32 bit_off, len; 3397 u32 bit_off, len;
2661 u64 blkno; 3398 u64 blkno;
2662 handle_t *handle; 3399 handle_t *handle = ctxt->handle;
2663 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3400 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2664 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3401 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2665 struct ocfs2_alloc_context *data_ac;
2666 struct buffer_head *xh_bh = NULL, *data_bh = NULL;
2667 struct buffer_head *xb_bh = xs->xattr_bh; 3402 struct buffer_head *xb_bh = xs->xattr_bh;
2668 struct ocfs2_xattr_block *xb = 3403 struct ocfs2_xattr_block *xb =
2669 (struct ocfs2_xattr_block *)xb_bh->b_data; 3404 (struct ocfs2_xattr_block *)xb_bh->b_data;
2670 struct ocfs2_xattr_tree_root *xr; 3405 struct ocfs2_xattr_tree_root *xr;
2671 u16 xb_flags = le16_to_cpu(xb->xb_flags); 3406 u16 xb_flags = le16_to_cpu(xb->xb_flags);
2672 u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2673 3407
2674 mlog(0, "create xattr index block for %llu\n", 3408 mlog(0, "create xattr index block for %llu\n",
2675 (unsigned long long)xb_bh->b_blocknr); 3409 (unsigned long long)xb_bh->b_blocknr);
2676 3410
2677 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); 3411 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
2678 3412 BUG_ON(!xs->bucket);
2679 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
2680 if (ret) {
2681 mlog_errno(ret);
2682 goto out;
2683 }
2684 3413
2685 /* 3414 /*
2686 * XXX: 3415 * XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2689 */ 3418 */
2690 down_write(&oi->ip_alloc_sem); 3419 down_write(&oi->ip_alloc_sem);
2691 3420
2692 /* 3421 ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
2693 * 3 more credits, one for xattr block update, one for the 1st block 3422 OCFS2_JOURNAL_ACCESS_WRITE);
2694 * of the new xattr bucket and one for the value/data.
2695 */
2696 credits += 3;
2697 handle = ocfs2_start_trans(osb, credits);
2698 if (IS_ERR(handle)) {
2699 ret = PTR_ERR(handle);
2700 mlog_errno(ret);
2701 goto out_sem;
2702 }
2703
2704 ret = ocfs2_journal_access(handle, inode, xb_bh,
2705 OCFS2_JOURNAL_ACCESS_WRITE);
2706 if (ret) { 3423 if (ret) {
2707 mlog_errno(ret); 3424 mlog_errno(ret);
2708 goto out_commit; 3425 goto out;
2709 } 3426 }
2710 3427
2711 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 3428 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
3429 1, 1, &bit_off, &len);
2712 if (ret) { 3430 if (ret) {
2713 mlog_errno(ret); 3431 mlog_errno(ret);
2714 goto out_commit; 3432 goto out;
2715 } 3433 }
2716 3434
2717 /* 3435 /*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2724 mlog(0, "allocate 1 cluster from %llu to xattr block\n", 3442 mlog(0, "allocate 1 cluster from %llu to xattr block\n",
2725 (unsigned long long)blkno); 3443 (unsigned long long)blkno);
2726 3444
2727 xh_bh = sb_getblk(inode->i_sb, blkno); 3445 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
2728 if (!xh_bh) { 3446 if (ret) {
2729 ret = -EIO;
2730 mlog_errno(ret); 3447 mlog_errno(ret);
2731 goto out_commit; 3448 goto out;
2732 } 3449 }
2733 3450
2734 ocfs2_set_new_buffer_uptodate(inode, xh_bh); 3451 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
2735 3452 OCFS2_JOURNAL_ACCESS_CREATE);
2736 ret = ocfs2_journal_access(handle, inode, xh_bh,
2737 OCFS2_JOURNAL_ACCESS_CREATE);
2738 if (ret) { 3453 if (ret) {
2739 mlog_errno(ret); 3454 mlog_errno(ret);
2740 goto out_commit; 3455 goto out;
2741 }
2742
2743 if (bpb > 1) {
2744 data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
2745 if (!data_bh) {
2746 ret = -EIO;
2747 mlog_errno(ret);
2748 goto out_commit;
2749 }
2750
2751 ocfs2_set_new_buffer_uptodate(inode, data_bh);
2752
2753 ret = ocfs2_journal_access(handle, inode, data_bh,
2754 OCFS2_JOURNAL_ACCESS_CREATE);
2755 if (ret) {
2756 mlog_errno(ret);
2757 goto out_commit;
2758 }
2759 } 3456 }
2760 3457
2761 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); 3458 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
3459 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
2762 3460
2763 ocfs2_journal_dirty(handle, xh_bh); 3461 ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
2764 if (data_bh)
2765 ocfs2_journal_dirty(handle, data_bh);
2766
2767 ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
2768 if (ret) {
2769 mlog_errno(ret);
2770 goto out_commit;
2771 }
2772 3462
2773 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ 3463 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
2774 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - 3464 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2787 3477
2788 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); 3478 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
2789 3479
2790 ret = ocfs2_journal_dirty(handle, xb_bh); 3480 ocfs2_journal_dirty(handle, xb_bh);
2791 if (ret) {
2792 mlog_errno(ret);
2793 goto out_commit;
2794 }
2795
2796out_commit:
2797 ocfs2_commit_trans(osb, handle);
2798
2799out_sem:
2800 up_write(&oi->ip_alloc_sem);
2801 3481
2802out: 3482out:
2803 if (data_ac) 3483 up_write(&oi->ip_alloc_sem);
2804 ocfs2_free_alloc_context(data_ac);
2805
2806 brelse(xh_bh);
2807 brelse(data_bh);
2808 3484
2809 return ret; 3485 return ret;
2810} 3486}
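
A minimal user-space model of the ocfs2_xattr_set_ctxt refactor visible in the hunk above: the caller now owns the transaction handle and the allocator reservations, and helpers consume them through one context struct instead of starting and committing their own transaction. All types and functions below are stand-ins for illustration, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct handle { int credits; };
struct alloc_ctx { int reserved; };

struct set_ctxt {
        struct handle *handle;      /* owned by the caller */
        struct alloc_ctx *data_ac;  /* cluster reservation */
        struct alloc_ctx *meta_ac;  /* metadata reservation */
};

/* Helper only consumes resources from the context; no start/commit. */
static int create_index_block(struct set_ctxt *ctxt)
{
        if (ctxt->handle->credits < 3 || ctxt->data_ac->reserved < 1)
                return -1;              /* caller under-provisioned */
        ctxt->handle->credits -= 3;     /* xattr block + bucket + value */
        ctxt->data_ac->reserved -= 1;   /* one cluster claimed */
        return 0;
}

int main(void)
{
        struct handle h = { .credits = 8 };
        struct alloc_ctx d = { .reserved = 1 }, m = { .reserved = 1 };
        struct set_ctxt ctxt = { &h, &d, &m };

        if (create_index_block(&ctxt))  /* caller commits afterwards */
                return EXIT_FAILURE;
        printf("credits left: %d\n", h.credits);
        return 0;
}

Centralizing the handle is why the error paths above can collapse from out_commit/out_sem/out down to a single out label.
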
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
2829 * so that we can spare some space for insertion. 3505 * so that we can spare some space for insertion.
2830 */ 3506 */
2831static int ocfs2_defrag_xattr_bucket(struct inode *inode, 3507static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3508 handle_t *handle,
2832 struct ocfs2_xattr_bucket *bucket) 3509 struct ocfs2_xattr_bucket *bucket)
2833{ 3510{
2834 int ret, i; 3511 int ret, i;
2835 size_t end, offset, len, value_len; 3512 size_t end, offset, len, value_len;
2836 struct ocfs2_xattr_header *xh; 3513 struct ocfs2_xattr_header *xh;
2837 char *entries, *buf, *bucket_buf = NULL; 3514 char *entries, *buf, *bucket_buf = NULL;
2838 u64 blkno = bucket->bhs[0]->b_blocknr; 3515 u64 blkno = bucket_blkno(bucket);
2839 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2840 u16 xh_free_start; 3516 u16 xh_free_start;
2841 size_t blocksize = inode->i_sb->s_blocksize; 3517 size_t blocksize = inode->i_sb->s_blocksize;
2842 handle_t *handle;
2843 struct buffer_head **bhs;
2844 struct ocfs2_xattr_entry *xe; 3518 struct ocfs2_xattr_entry *xe;
2845 3519
2846 bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
2847 GFP_NOFS);
2848 if (!bhs)
2849 return -ENOMEM;
2850
2851 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
2852 if (ret)
2853 goto out;
2854
2855 /* 3520 /*
2856 * In order to make the operation more efficient and generic, 3521 * In order to make the operation more efficient and generic,
2857 * we copy all the blocks into contiguous memory and do the 3522 * we copy all the blocks into contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2865 } 3530 }
2866 3531
2867 buf = bucket_buf; 3532 buf = bucket_buf;
2868 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) 3533 for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
2869 memcpy(buf, bhs[i]->b_data, blocksize); 3534 memcpy(buf, bucket_block(bucket, i), blocksize);
2870 3535
2871 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); 3536 ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
2872 if (IS_ERR(handle)) { 3537 OCFS2_JOURNAL_ACCESS_WRITE);
2873 ret = PTR_ERR(handle); 3538 if (ret < 0) {
2874 handle = NULL;
2875 mlog_errno(ret); 3539 mlog_errno(ret);
2876 goto out; 3540 goto out;
2877 } 3541 }
2878 3542
2879 for (i = 0; i < blk_per_bucket; i++) {
2880 ret = ocfs2_journal_access(handle, inode, bhs[i],
2881 OCFS2_JOURNAL_ACCESS_WRITE);
2882 if (ret < 0) {
2883 mlog_errno(ret);
2884 goto commit;
2885 }
2886 }
2887
2888 xh = (struct ocfs2_xattr_header *)bucket_buf; 3543 xh = (struct ocfs2_xattr_header *)bucket_buf;
2889 entries = (char *)xh->xh_entries; 3544 entries = (char *)xh->xh_entries;
2890 xh_free_start = le16_to_cpu(xh->xh_free_start); 3545 xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2940 "bucket %llu\n", (unsigned long long)blkno); 3595 "bucket %llu\n", (unsigned long long)blkno);
2941 3596
2942 if (xh_free_start == end) 3597 if (xh_free_start == end)
2943 goto commit; 3598 goto out;
2944 3599
2945 memset(bucket_buf + xh_free_start, 0, end - xh_free_start); 3600 memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
2946 xh->xh_free_start = cpu_to_le16(end); 3601 xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2951 cmp_xe, swap_xe); 3606 cmp_xe, swap_xe);
2952 3607
2953 buf = bucket_buf; 3608 buf = bucket_buf;
2954 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { 3609 for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
2955 memcpy(bhs[i]->b_data, buf, blocksize); 3610 memcpy(bucket_block(bucket, i), buf, blocksize);
2956 ocfs2_journal_dirty(handle, bhs[i]); 3611 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
2957 }
2958 3612
2959commit:
2960 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2961out: 3613out:
2962
2963 if (bhs) {
2964 for (i = 0; i < blk_per_bucket; i++)
2965 brelse(bhs[i]);
2966 }
2967 kfree(bhs);
2968
2969 kfree(bucket_buf); 3614 kfree(bucket_buf);
2970 return ret; 3615 return ret;
2971} 3616}
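
A user-space sketch of the defrag idea in ocfs2_defrag_xattr_bucket above: entries are packed from the front of the bucket, name/value payloads from the back, and compaction slides each payload toward the end so the free gap in the middle becomes contiguous. The layout and field names below are simplified stand-ins.

#include <stdio.h>
#include <string.h>

#define BUCKET_SIZE 64

struct ent { unsigned short off, len; };

static void compact(char *buf, struct ent *ents, int count)
{
        unsigned short end = BUCKET_SIZE;
        /* Assume ents[] is already sorted by descending offset, the
         * way the kernel sorts with cmp_xe_offset before compacting. */
        for (int i = 0; i < count; i++) {
                end -= ents[i].len;
                /* Regions may overlap, so memmove, not memcpy. */
                memmove(buf + end, buf + ents[i].off, ents[i].len);
                ents[i].off = end;
        }
}

int main(void)
{
        char buf[BUCKET_SIZE] = {0};
        struct ent ents[2] = { { 40, 5 }, { 20, 5 } };

        memcpy(buf + 40, "world", 5);
        memcpy(buf + 20, "hello", 5);
        compact(buf, ents, 2);
        printf("free space now ends at %u\n", ents[1].off);
        return 0;
}

Doing this on a contiguous scratch buffer, as the kernel code does, keeps the per-block buffer_heads untouched until the compacted image is copied back.
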
2972 3617
2973/* 3618/*
2974 * Move half of the xattr buckets in the previous cluster to this new 3619 * prev_blkno points to the start of an existing extent. new_blkno
2975 * cluster. We only touch the last cluster of the previous extent record. 3620 * points to a newly allocated extent. Because we know each of our
3621 * clusters contains more than one bucket, we can easily split one cluster
3622 * at a bucket boundary. So we take the last cluster of the existing
3623 * extent and split it down the middle. We move the last half of the
3624 * buckets in the last cluster of the existing extent over to the new
3625 * extent.
3626 *
3627 * first_bh is the buffer at prev_blkno so we can update the existing
3628 * extent's bucket count. header_bh is the bucket where we were hoping
3629 * to insert our xattr. If the bucket move places the target in the new
3630 * extent, we'll update first_bh and header_bh after modifying the old
3631 * extent.
2976 * 3632 *
2977 * first_bh is the first buffer_head of a series of buckets in the same 3633 * first_hash will be set as the 1st xe's name_hash in the new extent.
2978 * extent rec and header_bh is the header of one bucket in this cluster.
2979 * They will be updated if we move the data header_bh contains to the new
2980 * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
2981 */ 3634 */
2982static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, 3635static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
2983 handle_t *handle, 3636 handle_t *handle,
2984 struct buffer_head **first_bh, 3637 struct ocfs2_xattr_bucket *first,
2985 struct buffer_head **header_bh, 3638 struct ocfs2_xattr_bucket *target,
2986 u64 new_blkno, 3639 u64 new_blkno,
2987 u64 prev_blkno,
2988 u32 num_clusters, 3640 u32 num_clusters,
2989 u32 *first_hash) 3641 u32 *first_hash)
2990{ 3642{
2991 int i, ret, credits; 3643 int ret;
2992 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3644 struct super_block *sb = inode->i_sb;
2993 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 3645 int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
2994 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 3646 int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
2995 int blocksize = inode->i_sb->s_blocksize; 3647 int to_move = num_buckets / 2;
2996 struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; 3648 u64 src_blkno;
2997 struct ocfs2_xattr_header *new_xh; 3649 u64 last_cluster_blkno = bucket_blkno(first) +
2998 struct ocfs2_xattr_header *xh = 3650 ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
2999 (struct ocfs2_xattr_header *)((*first_bh)->b_data);
3000
3001 BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
3002 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
3003
3004 prev_bh = *first_bh;
3005 get_bh(prev_bh);
3006 xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
3007 3651
3008 prev_blkno += (num_clusters - 1) * bpc + bpc / 2; 3652 BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
3653 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
3009 3654
3010 mlog(0, "move half of xattrs in cluster %llu to %llu\n", 3655 mlog(0, "move half of xattrs in cluster %llu to %llu\n",
3011 (unsigned long long)prev_blkno, (unsigned long long)new_blkno); 3656 (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
3012 3657
3013 /* 3658 ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
3014 * We need to update the 1st half of the new cluster and 3659 last_cluster_blkno, new_blkno,
3015 * 1 more for the update of the 1st bucket of the previous 3660 to_move, first_hash);
3016 * extent record.
3017 */
3018 credits = bpc / 2 + 1;
3019 ret = ocfs2_extend_trans(handle, credits);
3020 if (ret) { 3661 if (ret) {
3021 mlog_errno(ret); 3662 mlog_errno(ret);
3022 goto out; 3663 goto out;
3023 } 3664 }
3024 3665
3025 ret = ocfs2_journal_access(handle, inode, prev_bh, 3666 /* This is the first bucket that got moved */
3026 OCFS2_JOURNAL_ACCESS_WRITE); 3667 src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
3027 if (ret) {
3028 mlog_errno(ret);
3029 goto out;
3030 }
3031 3668
3032 for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { 3669 /*
3033 old_bh = new_bh = NULL; 3670 * If the target bucket was part of the moved buckets, we need to
3034 new_bh = sb_getblk(inode->i_sb, new_blkno); 3671 * update first and target.
3035 if (!new_bh) { 3672 */
3036 ret = -EIO; 3673 if (bucket_blkno(target) >= src_blkno) {
3037 mlog_errno(ret); 3674 /* Find the block for the new target bucket */
3038 goto out; 3675 src_blkno = new_blkno +
3039 } 3676 (bucket_blkno(target) - src_blkno);
3040 3677
3041 ocfs2_set_new_buffer_uptodate(inode, new_bh); 3678 ocfs2_xattr_bucket_relse(first);
3679 ocfs2_xattr_bucket_relse(target);
3042 3680
3043 ret = ocfs2_journal_access(handle, inode, new_bh, 3681 /*
3044 OCFS2_JOURNAL_ACCESS_CREATE); 3682 * These shouldn't fail - the buffers are in the
3045 if (ret < 0) { 3683 * journal from ocfs2_cp_xattr_bucket().
3684 */
3685 ret = ocfs2_read_xattr_bucket(first, new_blkno);
3686 if (ret) {
3046 mlog_errno(ret); 3687 mlog_errno(ret);
3047 brelse(new_bh);
3048 goto out; 3688 goto out;
3049 } 3689 }
3050 3690 ret = ocfs2_read_xattr_bucket(target, src_blkno);
3051 ret = ocfs2_read_block(inode, prev_blkno, &old_bh); 3691 if (ret)
3052 if (ret < 0) {
3053 mlog_errno(ret); 3692 mlog_errno(ret);
3054 brelse(new_bh);
3055 goto out;
3056 }
3057 3693
3058 memcpy(new_bh->b_data, old_bh->b_data, blocksize);
3059
3060 if (i == 0) {
3061 new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
3062 new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
3063
3064 if (first_hash)
3065 *first_hash = le32_to_cpu(
3066 new_xh->xh_entries[0].xe_name_hash);
3067 new_first_bh = new_bh;
3068 get_bh(new_first_bh);
3069 }
3070
3071 ocfs2_journal_dirty(handle, new_bh);
3072
3073 if (*header_bh == old_bh) {
3074 brelse(*header_bh);
3075 *header_bh = new_bh;
3076 get_bh(*header_bh);
3077
3078 brelse(*first_bh);
3079 *first_bh = new_first_bh;
3080 get_bh(*first_bh);
3081 }
3082 brelse(new_bh);
3083 brelse(old_bh);
3084 } 3694 }
3085 3695
3086 le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
3087
3088 ocfs2_journal_dirty(handle, prev_bh);
3089out: 3696out:
3090 brelse(prev_bh);
3091 brelse(new_first_bh);
3092 return ret;
3093}
3094
3095static int ocfs2_read_xattr_bucket(struct inode *inode,
3096 u64 blkno,
3097 struct buffer_head **bhs,
3098 int new)
3099{
3100 int ret = 0;
3101 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3102
3103 if (!new)
3104 return ocfs2_read_blocks(inode, blkno,
3105 blk_per_bucket, bhs, 0);
3106
3107 for (i = 0; i < blk_per_bucket; i++) {
3108 bhs[i] = sb_getblk(inode->i_sb, blkno + i);
3109 if (bhs[i] == NULL) {
3110 ret = -EIO;
3111 mlog_errno(ret);
3112 break;
3113 }
3114 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
3115 }
3116
3117 return ret; 3697 return ret;
3118} 3698}
3119 3699
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3178{ 3758{
3179 int ret, i; 3759 int ret, i;
3180 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 3760 int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
3181 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3761 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
3182 struct buffer_head **s_bhs, **t_bhs = NULL;
3183 struct ocfs2_xattr_header *xh; 3762 struct ocfs2_xattr_header *xh;
3184 struct ocfs2_xattr_entry *xe; 3763 struct ocfs2_xattr_entry *xe;
3185 int blocksize = inode->i_sb->s_blocksize; 3764 int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3187 mlog(0, "move some of xattrs from bucket %llu to %llu\n", 3766 mlog(0, "move some of xattrs from bucket %llu to %llu\n",
3188 (unsigned long long)blk, (unsigned long long)new_blk); 3767 (unsigned long long)blk, (unsigned long long)new_blk);
3189 3768
3190 s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); 3769 s_bucket = ocfs2_xattr_bucket_new(inode);
3191 if (!s_bhs) 3770 t_bucket = ocfs2_xattr_bucket_new(inode);
3192 return -ENOMEM; 3771 if (!s_bucket || !t_bucket) {
3193 3772 ret = -ENOMEM;
3194 ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
3195 if (ret) {
3196 mlog_errno(ret); 3773 mlog_errno(ret);
3197 goto out; 3774 goto out;
3198 } 3775 }
3199 3776
3200 ret = ocfs2_journal_access(handle, inode, s_bhs[0], 3777 ret = ocfs2_read_xattr_bucket(s_bucket, blk);
3201 OCFS2_JOURNAL_ACCESS_WRITE);
3202 if (ret) { 3778 if (ret) {
3203 mlog_errno(ret); 3779 mlog_errno(ret);
3204 goto out; 3780 goto out;
3205 } 3781 }
3206 3782
3207 t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); 3783 ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
3208 if (!t_bhs) { 3784 OCFS2_JOURNAL_ACCESS_WRITE);
3209 ret = -ENOMEM; 3785 if (ret) {
3786 mlog_errno(ret);
3210 goto out; 3787 goto out;
3211 } 3788 }
3212 3789
3213 ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); 3790 /*
3791 * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
3792 * there's no need to read it.
3793 */
3794 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
3214 if (ret) { 3795 if (ret) {
3215 mlog_errno(ret); 3796 mlog_errno(ret);
3216 goto out; 3797 goto out;
3217 } 3798 }
3218 3799
3219 for (i = 0; i < blk_per_bucket; i++) { 3800 /*
3220 ret = ocfs2_journal_access(handle, inode, t_bhs[i], 3801 * Hey, if we're overwriting t_bucket, what difference does
3221 new_bucket_head ? 3802 * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the
3222 OCFS2_JOURNAL_ACCESS_CREATE : 3803 * same part of ocfs2_cp_xattr_bucket().
3223 OCFS2_JOURNAL_ACCESS_WRITE); 3804 */
3224 if (ret) { 3805 ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
3225 mlog_errno(ret); 3806 new_bucket_head ?
3226 goto out; 3807 OCFS2_JOURNAL_ACCESS_CREATE :
3227 } 3808 OCFS2_JOURNAL_ACCESS_WRITE);
3809 if (ret) {
3810 mlog_errno(ret);
3811 goto out;
3228 } 3812 }
3229 3813
3230 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; 3814 xh = bucket_xh(s_bucket);
3231 count = le16_to_cpu(xh->xh_count); 3815 count = le16_to_cpu(xh->xh_count);
3232 start = ocfs2_xattr_find_divide_pos(xh); 3816 start = ocfs2_xattr_find_divide_pos(xh);
3233 3817
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3239 * The hash value is set as one larger than 3823 * The hash value is set as one larger than
3240 * that of the last entry in the previous bucket. 3824 * that of the last entry in the previous bucket.
3241 */ 3825 */
3242 for (i = 0; i < blk_per_bucket; i++) 3826 for (i = 0; i < t_bucket->bu_blocks; i++)
3243 memset(t_bhs[i]->b_data, 0, blocksize); 3827 memset(bucket_block(t_bucket, i), 0, blocksize);
3244 3828
3245 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; 3829 xh = bucket_xh(t_bucket);
3246 xh->xh_free_start = cpu_to_le16(blocksize); 3830 xh->xh_free_start = cpu_to_le16(blocksize);
3247 xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; 3831 xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
3248 le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); 3832 le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3251 } 3835 }
3252 3836
3253 /* copy the whole bucket to the new first. */ 3837 /* copy the whole bucket to the new first. */
3254 for (i = 0; i < blk_per_bucket; i++) 3838 ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
3255 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3256 3839
3257 /* update the new bucket. */ 3840 /* update the new bucket. */
3258 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; 3841 xh = bucket_xh(t_bucket);
3259 3842
3260 /* 3843 /*
3261 * Calculate the total name/value len and xh_free_start for 3844 * Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
3319 else 3902 else
3320 xh->xh_num_buckets = 0; 3903 xh->xh_num_buckets = 0;
3321 3904
3322 for (i = 0; i < blk_per_bucket; i++) { 3905 ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
3323 ocfs2_journal_dirty(handle, t_bhs[i]);
3324 if (ret)
3325 mlog_errno(ret);
3326 }
3327 3906
3328 /* store the first_hash of the new bucket. */ 3907 /* store the first_hash of the new bucket. */
3329 if (first_hash) 3908 if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
3337 if (start == count) 3916 if (start == count)
3338 goto out; 3917 goto out;
3339 3918
3340 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; 3919 xh = bucket_xh(s_bucket);
3341 memset(&xh->xh_entries[start], 0, 3920 memset(&xh->xh_entries[start], 0,
3342 sizeof(struct ocfs2_xattr_entry) * (count - start)); 3921 sizeof(struct ocfs2_xattr_entry) * (count - start));
3343 xh->xh_count = cpu_to_le16(start); 3922 xh->xh_count = cpu_to_le16(start);
3344 xh->xh_free_start = cpu_to_le16(name_offset); 3923 xh->xh_free_start = cpu_to_le16(name_offset);
3345 xh->xh_name_value_len = cpu_to_le16(name_value_len); 3924 xh->xh_name_value_len = cpu_to_le16(name_value_len);
3346 3925
3347 ocfs2_journal_dirty(handle, s_bhs[0]); 3926 ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
3348 if (ret)
3349 mlog_errno(ret);
3350 3927
3351out: 3928out:
3352 if (s_bhs) { 3929 ocfs2_xattr_bucket_free(s_bucket);
3353 for (i = 0; i < blk_per_bucket; i++) 3930 ocfs2_xattr_bucket_free(t_bucket);
3354 brelse(s_bhs[i]);
3355 }
3356 kfree(s_bhs);
3357
3358 if (t_bhs) {
3359 for (i = 0; i < blk_per_bucket; i++)
3360 brelse(t_bhs[i]);
3361 }
3362 kfree(t_bhs);
3363 3931
3364 return ret; 3932 return ret;
3365} 3933}
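
A user-space sketch of the split performed by ocfs2_divide_xattr_bucket above: entries sorted by name_hash are cut at a divide position, the tail moves to the new bucket, and the source bucket's count shrinks while the moved slots are zeroed. Field names are illustrative stand-ins.

#include <stdio.h>
#include <string.h>

struct xentry { unsigned int hash; };

struct xbucket {
        int count;
        struct xentry entries[8];
};

static void divide(struct xbucket *s, struct xbucket *t, int start)
{
        int moved = s->count - start;

        /* Tail of the source becomes the head of the target... */
        memcpy(t->entries, &s->entries[start],
               moved * sizeof(struct xentry));
        t->count = moved;

        /* ...and is zeroed out of the source, like the memset above. */
        memset(&s->entries[start], 0, moved * sizeof(struct xentry));
        s->count = start;
}

int main(void)
{
        struct xbucket s = { 4, { {1}, {2}, {7}, {9} } }, t = {0};

        divide(&s, &t, 2);
        printf("old=%d new=%d first_hash=%u\n", s.count, t.count,
               t.entries[0].hash);
        return 0;
}

Keeping the entries hash-sorted is what allows the new bucket's first_hash to be reported back to the caller for the extent map.
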
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
3376 u64 t_blkno, 3944 u64 t_blkno,
3377 int t_is_new) 3945 int t_is_new)
3378{ 3946{
3379 int ret, i; 3947 int ret;
3380 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3948 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
3381 int blocksize = inode->i_sb->s_blocksize;
3382 struct buffer_head **s_bhs, **t_bhs = NULL;
3383 3949
3384 BUG_ON(s_blkno == t_blkno); 3950 BUG_ON(s_blkno == t_blkno);
3385 3951
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
3387 (unsigned long long)s_blkno, (unsigned long long)t_blkno, 3953 (unsigned long long)s_blkno, (unsigned long long)t_blkno,
3388 t_is_new); 3954 t_is_new);
3389 3955
3390 s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, 3956 s_bucket = ocfs2_xattr_bucket_new(inode);
3391 GFP_NOFS); 3957 t_bucket = ocfs2_xattr_bucket_new(inode);
3392 if (!s_bhs) 3958 if (!s_bucket || !t_bucket) {
3393 return -ENOMEM; 3959 ret = -ENOMEM;
3960 mlog_errno(ret);
3961 goto out;
3962 }
3394 3963
3395 ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); 3964 ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
3396 if (ret) 3965 if (ret)
3397 goto out; 3966 goto out;
3398 3967
3399 t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, 3968 /*
3400 GFP_NOFS); 3969 * Even if !t_is_new, we're overwriting t_bucket. Thus,
3401 if (!t_bhs) { 3970 * there's no need to read it.
3402 ret = -ENOMEM; 3971 */
3972 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
3973 if (ret)
3403 goto out; 3974 goto out;
3404 }
3405 3975
3406 ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); 3976 /*
3977 * Hey, if we're overwriting t_bucket, what difference does
3978 * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new
3979 * cluster to fill, we came here from
3980 * ocfs2_mv_xattr_buckets(), and it is really new -
3981 * ACCESS_CREATE is required. But we also might have moved data
3982 * out of t_bucket before extending back into it.
3983 * ocfs2_add_new_xattr_bucket() can do this - its call to
3984 * ocfs2_add_new_xattr_cluster() may have created a new extent
3985 * and copied out the end of the old extent. Then it re-extends
3986 * the old extent back to create space for new xattrs. That's
3987 * how we get here, and the bucket isn't really new.
3988 */
3989 ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
3990 t_is_new ?
3991 OCFS2_JOURNAL_ACCESS_CREATE :
3992 OCFS2_JOURNAL_ACCESS_WRITE);
3407 if (ret) 3993 if (ret)
3408 goto out; 3994 goto out;
3409 3995
3410 for (i = 0; i < blk_per_bucket; i++) { 3996 ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
3411 ret = ocfs2_journal_access(handle, inode, t_bhs[i], 3997 ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
3412 t_is_new ?
3413 OCFS2_JOURNAL_ACCESS_CREATE :
3414 OCFS2_JOURNAL_ACCESS_WRITE);
3415 if (ret)
3416 goto out;
3417 }
3418
3419 for (i = 0; i < blk_per_bucket; i++) {
3420 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3421 ocfs2_journal_dirty(handle, t_bhs[i]);
3422 }
3423 3998
3424out: 3999out:
3425 if (s_bhs) { 4000 ocfs2_xattr_bucket_free(t_bucket);
3426 for (i = 0; i < blk_per_bucket; i++) 4001 ocfs2_xattr_bucket_free(s_bucket);
3427 brelse(s_bhs[i]);
3428 }
3429 kfree(s_bhs);
3430
3431 if (t_bhs) {
3432 for (i = 0; i < blk_per_bucket; i++)
3433 brelse(t_bhs[i]);
3434 }
3435 kfree(t_bhs);
3436 4002
3437 return ret; 4003 return ret;
3438} 4004}
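
A user-space sketch of the per-block copy that ocfs2_xattr_bucket_copy_data performs in the hunk above: a bucket spans several blocks, so copying it means one memcpy per block into the destination's buffers. The bucket layout below is a stand-in with assumed geometry.

#include <stdio.h>
#include <string.h>

#define BLOCKS 4
#define BLKSZ  8

struct bucket { char blk[BLOCKS][BLKSZ]; };

static void bucket_copy_data(struct bucket *dst, const struct bucket *src)
{
        for (int i = 0; i < BLOCKS; i++)    /* one copy per block */
                memcpy(dst->blk[i], src->blk[i], BLKSZ);
}

int main(void)
{
        struct bucket s, t;

        memset(&s, 'a', sizeof(s));
        memset(&t, 0, sizeof(t));
        bucket_copy_data(&t, &s);
        printf("copied: %.*s\n", BLKSZ, t.blk[BLOCKS - 1]);
        return 0;
}

Hiding the per-block loop inside the bucket helper is the whole point of the refactor: callers no longer carry blk_per_bucket arrays of buffer_heads around.
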
3439 4005
3440/* 4006/*
3441 * Copy one xattr cluster from src_blk to to_blk. 4007 * src_blk points to the start of an existing extent. last_blk points to
3442 * The to_blk will become the first bucket header of the cluster, so its 4008 * last cluster in that extent. to_blk points to a newly allocated
3443 * xh_num_buckets will be initialized as the bucket num in the cluster. 4009 * extent. We copy the buckets from the cluster at last_blk to the new
4010 * extent. If start_bucket is non-zero, we skip that many buckets before
4011 * we start copying. The new extent's xh_num_buckets gets set to the
4012 * number of buckets we copied. The old extent's xh_num_buckets shrinks
4013 * by the same amount.
3444 */ 4014 */
3445static int ocfs2_cp_xattr_cluster(struct inode *inode, 4015static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
3446 handle_t *handle, 4016 u64 src_blk, u64 last_blk, u64 to_blk,
3447 struct buffer_head *first_bh, 4017 unsigned int start_bucket,
3448 u64 src_blk,
3449 u64 to_blk,
3450 u32 *first_hash) 4018 u32 *first_hash)
3451{ 4019{
3452 int i, ret, credits; 4020 int i, ret, credits;
3453 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4021 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3454 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 4022 int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3455 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 4023 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3456 struct buffer_head *bh = NULL; 4024 struct ocfs2_xattr_bucket *old_first, *new_first;
3457 struct ocfs2_xattr_header *xh; 4025
3458 u64 to_blk_start = to_blk; 4026 mlog(0, "mv xattrs from cluster %llu to %llu\n",
4027 (unsigned long long)last_blk, (unsigned long long)to_blk);
4028
4029 BUG_ON(start_bucket >= num_buckets);
4030 if (start_bucket) {
4031 num_buckets -= start_bucket;
4032 last_blk += (start_bucket * blks_per_bucket);
4033 }
4034
4035 /* The first bucket of the original extent */
4036 old_first = ocfs2_xattr_bucket_new(inode);
4037 /* The first bucket of the new extent */
4038 new_first = ocfs2_xattr_bucket_new(inode);
4039 if (!old_first || !new_first) {
4040 ret = -ENOMEM;
4041 mlog_errno(ret);
4042 goto out;
4043 }
3459 4044
3460 mlog(0, "cp xattrs from cluster %llu to %llu\n", 4045 ret = ocfs2_read_xattr_bucket(old_first, src_blk);
3461 (unsigned long long)src_blk, (unsigned long long)to_blk); 4046 if (ret) {
4047 mlog_errno(ret);
4048 goto out;
4049 }
3462 4050
3463 /* 4051 /*
3464 * We need to update the new cluster and 1 more for the update of 4052 * We need to update the first bucket of the old extent and all
3465 * the 1st bucket of the previous extent rec. 4053 * the buckets going to the new extent.
3466 */ 4054 */
3467 credits = bpc + 1; 4055 credits = ((num_buckets + 1) * blks_per_bucket) +
4056 handle->h_buffer_credits;
3468 ret = ocfs2_extend_trans(handle, credits); 4057 ret = ocfs2_extend_trans(handle, credits);
3469 if (ret) { 4058 if (ret) {
3470 mlog_errno(ret); 4059 mlog_errno(ret);
3471 goto out; 4060 goto out;
3472 } 4061 }
3473 4062
3474 ret = ocfs2_journal_access(handle, inode, first_bh, 4063 ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
3475 OCFS2_JOURNAL_ACCESS_WRITE); 4064 OCFS2_JOURNAL_ACCESS_WRITE);
3476 if (ret) { 4065 if (ret) {
3477 mlog_errno(ret); 4066 mlog_errno(ret);
3478 goto out; 4067 goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
3480 4069
3481 for (i = 0; i < num_buckets; i++) { 4070 for (i = 0; i < num_buckets; i++) {
3482 ret = ocfs2_cp_xattr_bucket(inode, handle, 4071 ret = ocfs2_cp_xattr_bucket(inode, handle,
3483 src_blk, to_blk, 1); 4072 last_blk + (i * blks_per_bucket),
4073 to_blk + (i * blks_per_bucket),
4074 1);
3484 if (ret) { 4075 if (ret) {
3485 mlog_errno(ret); 4076 mlog_errno(ret);
3486 goto out; 4077 goto out;
3487 } 4078 }
3488
3489 src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3490 to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3491 } 4079 }
3492 4080
3493 /* update the old bucket header. */ 4081 /*
3494 xh = (struct ocfs2_xattr_header *)first_bh->b_data; 4082 * Get the new bucket ready before we dirty anything
3495 le16_add_cpu(&xh->xh_num_buckets, -num_buckets); 4083 * (This actually shouldn't fail, because we already dirtied
3496 4084 * it once in ocfs2_cp_xattr_bucket()).
3497 ocfs2_journal_dirty(handle, first_bh); 4085 */
3498 4086 ret = ocfs2_read_xattr_bucket(new_first, to_blk);
3499 /* update the new bucket header. */ 4087 if (ret) {
3500 ret = ocfs2_read_block(inode, to_blk_start, &bh);
3501 if (ret < 0) {
3502 mlog_errno(ret); 4088 mlog_errno(ret);
3503 goto out; 4089 goto out;
3504 } 4090 }
3505 4091 ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
3506 ret = ocfs2_journal_access(handle, inode, bh, 4092 OCFS2_JOURNAL_ACCESS_WRITE);
3507 OCFS2_JOURNAL_ACCESS_WRITE);
3508 if (ret) { 4093 if (ret) {
3509 mlog_errno(ret); 4094 mlog_errno(ret);
3510 goto out; 4095 goto out;
3511 } 4096 }
3512 4097
3513 xh = (struct ocfs2_xattr_header *)bh->b_data; 4098 /* Now update the headers */
3514 xh->xh_num_buckets = cpu_to_le16(num_buckets); 4099 le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
4100 ocfs2_xattr_bucket_journal_dirty(handle, old_first);
3515 4101
3516 ocfs2_journal_dirty(handle, bh); 4102 bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
4103 ocfs2_xattr_bucket_journal_dirty(handle, new_first);
3517 4104
3518 if (first_hash) 4105 if (first_hash)
3519 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); 4106 *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
4107
3520out: 4108out:
3521 brelse(bh); 4109 ocfs2_xattr_bucket_free(new_first);
4110 ocfs2_xattr_bucket_free(old_first);
3522 return ret; 4111 return ret;
3523} 4112}
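
A user-space sketch of the credit accounting in ocfs2_mv_xattr_buckets above: the move dirties every bucket landing in the new extent plus a first-bucket header for each extent, so the transaction is extended to cover those blocks on top of whatever credits the handle already holds. The numbers below are illustrative only.

#include <stdio.h>

int main(void)
{
        int blks_per_bucket = 2;        /* assumed geometry */
        int num_buckets = 4;            /* buckets being moved */
        int h_buffer_credits = 5;       /* credits already in the handle */

        /* (moved buckets + one first-bucket) worth of blocks, added to
         * what the handle already has, mirroring the code above. */
        int credits = (num_buckets + 1) * blks_per_bucket +
                      h_buffer_credits;

        printf("extend transaction to %d credits\n", credits);
        return 0;
}

Adding h_buffer_credits matters because the handle is shared with the caller now; extending to an absolute count would silently drop credits the caller still needs.
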
3524 4113
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
3534 u32 *first_hash) 4123 u32 *first_hash)
3535{ 4124{
3536 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4125 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3537 int ret, credits = 2 * blk_per_bucket; 4126 int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
3538 4127
3539 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); 4128 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
3540 4129
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
3577 */ 4166 */
3578static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, 4167static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3579 handle_t *handle, 4168 handle_t *handle,
3580 struct buffer_head **first_bh, 4169 struct ocfs2_xattr_bucket *first,
3581 struct buffer_head **header_bh, 4170 struct ocfs2_xattr_bucket *target,
3582 u64 new_blk, 4171 u64 new_blk,
3583 u64 prev_blk,
3584 u32 prev_clusters, 4172 u32 prev_clusters,
3585 u32 *v_start, 4173 u32 *v_start,
3586 int *extend) 4174 int *extend)
3587{ 4175{
3588 int ret = 0; 4176 int ret;
3589 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3590 4177
3591 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", 4178 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
3592 (unsigned long long)prev_blk, prev_clusters, 4179 (unsigned long long)bucket_blkno(first), prev_clusters,
3593 (unsigned long long)new_blk); 4180 (unsigned long long)new_blk);
3594 4181
3595 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) 4182 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
3596 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, 4183 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
3597 handle, 4184 handle,
3598 first_bh, 4185 first, target,
3599 header_bh,
3600 new_blk, 4186 new_blk,
3601 prev_blk,
3602 prev_clusters, 4187 prev_clusters,
3603 v_start); 4188 v_start);
3604 else { 4189 if (ret)
3605 u64 last_blk = prev_blk + bpc * (prev_clusters - 1); 4190 mlog_errno(ret);
3606 4191 } else {
3607 if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) 4192 /* The start of the last cluster in the first extent */
3608 ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, 4193 u64 last_blk = bucket_blkno(first) +
3609 last_blk, new_blk, 4194 ((prev_clusters - 1) *
4195 ocfs2_clusters_to_blocks(inode->i_sb, 1));
4196
4197 if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
4198 ret = ocfs2_mv_xattr_buckets(inode, handle,
4199 bucket_blkno(first),
4200 last_blk, new_blk, 0,
3610 v_start); 4201 v_start);
3611 else { 4202 if (ret)
4203 mlog_errno(ret);
4204 } else {
3612 ret = ocfs2_divide_xattr_cluster(inode, handle, 4205 ret = ocfs2_divide_xattr_cluster(inode, handle,
3613 last_blk, new_blk, 4206 last_blk, new_blk,
3614 v_start); 4207 v_start);
4208 if (ret)
4209 mlog_errno(ret);
3615 4210
3616 if ((*header_bh)->b_blocknr == last_blk && extend) 4211 if ((bucket_blkno(target) == last_blk) && extend)
3617 *extend = 0; 4212 *extend = 0;
3618 } 4213 }
3619 } 4214 }
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3639 */ 4234 */
3640static int ocfs2_add_new_xattr_cluster(struct inode *inode, 4235static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3641 struct buffer_head *root_bh, 4236 struct buffer_head *root_bh,
3642 struct buffer_head **first_bh, 4237 struct ocfs2_xattr_bucket *first,
3643 struct buffer_head **header_bh, 4238 struct ocfs2_xattr_bucket *target,
3644 u32 *num_clusters, 4239 u32 *num_clusters,
3645 u32 prev_cpos, 4240 u32 prev_cpos,
3646 u64 prev_blkno, 4241 int *extend,
3647 int *extend) 4242 struct ocfs2_xattr_set_ctxt *ctxt)
3648{ 4243{
3649 int ret, credits; 4244 int ret;
3650 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 4245 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3651 u32 prev_clusters = *num_clusters; 4246 u32 prev_clusters = *num_clusters;
3652 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; 4247 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
3653 u64 block; 4248 u64 block;
3654 handle_t *handle = NULL; 4249 handle_t *handle = ctxt->handle;
3655 struct ocfs2_alloc_context *data_ac = NULL;
3656 struct ocfs2_alloc_context *meta_ac = NULL;
3657 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3658 struct ocfs2_extent_tree et; 4251 struct ocfs2_extent_tree et;
3659 4252
3660 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " 4253 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
3661 "previous xattr blkno = %llu\n", 4254 "previous xattr blkno = %llu\n",
3662 (unsigned long long)OCFS2_I(inode)->ip_blkno, 4255 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3663 prev_cpos, (unsigned long long)prev_blkno); 4256 prev_cpos, (unsigned long long)bucket_blkno(first));
3664 4257
3665 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 4258 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
3666 4259
3667 ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 4260 ret = ocfs2_journal_access_xb(handle, inode, root_bh,
3668 &data_ac, &meta_ac); 4261 OCFS2_JOURNAL_ACCESS_WRITE);
3669 if (ret) {
3670 mlog_errno(ret);
3671 goto leave;
3672 }
3673
3674 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
3675 clusters_to_add);
3676 handle = ocfs2_start_trans(osb, credits);
3677 if (IS_ERR(handle)) {
3678 ret = PTR_ERR(handle);
3679 handle = NULL;
3680 mlog_errno(ret);
3681 goto leave;
3682 }
3683
3684 ret = ocfs2_journal_access(handle, inode, root_bh,
3685 OCFS2_JOURNAL_ACCESS_WRITE);
3686 if (ret < 0) { 4262 if (ret < 0) {
3687 mlog_errno(ret); 4263 mlog_errno(ret);
3688 goto leave; 4264 goto leave;
3689 } 4265 }
3690 4266
3691 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4267 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
3692 clusters_to_add, &bit_off, &num_bits); 4268 clusters_to_add, &bit_off, &num_bits);
3693 if (ret < 0) { 4269 if (ret < 0) {
3694 if (ret != -ENOSPC) 4270 if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3702 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", 4278 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
3703 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4279 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3704 4280
3705 if (prev_blkno + prev_clusters * bpc == block && 4281 if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
3706 (prev_clusters + num_bits) << osb->s_clustersize_bits <= 4282 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
3707 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { 4283 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
3708 /* 4284 /*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3721 } else { 4297 } else {
3722 ret = ocfs2_adjust_xattr_cross_cluster(inode, 4298 ret = ocfs2_adjust_xattr_cross_cluster(inode,
3723 handle, 4299 handle,
3724 first_bh, 4300 first,
3725 header_bh, 4301 target,
3726 block, 4302 block,
3727 prev_blkno,
3728 prev_clusters, 4303 prev_clusters,
3729 &v_start, 4304 &v_start,
3730 extend); 4305 extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3734 } 4309 }
3735 } 4310 }
3736 4311
3737 if (handle->h_buffer_credits < credits) {
3738 /*
3739 * The journal has been restarted before, and don't
3740 * have enough space for the insertion, so extend it
3741 * here.
3742 */
3743 ret = ocfs2_extend_trans(handle, credits);
3744 if (ret) {
3745 mlog_errno(ret);
3746 goto leave;
3747 }
3748 }
3749 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 4312 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
3750 num_bits, (unsigned long long)block, v_start); 4313 num_bits, (unsigned long long)block, v_start);
3751 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, 4314 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
3752 num_bits, 0, meta_ac); 4315 num_bits, 0, ctxt->meta_ac);
3753 if (ret < 0) { 4316 if (ret < 0) {
3754 mlog_errno(ret); 4317 mlog_errno(ret);
3755 goto leave; 4318 goto leave;
3756 } 4319 }
3757 4320
3758 ret = ocfs2_journal_dirty(handle, root_bh); 4321 ret = ocfs2_journal_dirty(handle, root_bh);
3759 if (ret < 0) { 4322 if (ret < 0)
3760 mlog_errno(ret); 4323 mlog_errno(ret);
3761 goto leave;
3762 }
3763 4324
3764leave: 4325leave:
3765 if (handle)
3766 ocfs2_commit_trans(osb, handle);
3767 if (data_ac)
3768 ocfs2_free_alloc_context(data_ac);
3769 if (meta_ac)
3770 ocfs2_free_alloc_context(meta_ac);
3771
3772 return ret; 4326 return ret;
3773} 4327}
3774 4328
3775/* 4329/*
3776 * Extend a new xattr bucket and move xattrs to the end one by one until 4330 * We are given an extent. 'first' is the bucket at the very front of
3777 * we meet start_bh. Only move half of the xattrs to the bucket after it. 4331 * the extent. The extent has space for an additional bucket past
4332 * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number
4333 * of the target bucket. We wish to shift every bucket past the target
4334 * down one, filling in that additional space. When we get back to the
4335 * target, we split the target between itself and the now-empty bucket
4336 * at target+1 (aka, target_blkno + blks_per_bucket).
3778 */ 4337 */
3779static int ocfs2_extend_xattr_bucket(struct inode *inode, 4338static int ocfs2_extend_xattr_bucket(struct inode *inode,
3780 struct buffer_head *first_bh, 4339 handle_t *handle,
3781 struct buffer_head *start_bh, 4340 struct ocfs2_xattr_bucket *first,
4341 u64 target_blk,
3782 u32 num_clusters) 4342 u32 num_clusters)
3783{ 4343{
3784 int ret, credits; 4344 int ret, credits;
3785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3786 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4346 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3787 u64 start_blk = start_bh->b_blocknr, end_blk; 4347 u64 end_blk;
3788 u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); 4348 u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
3789 handle_t *handle;
3790 struct ocfs2_xattr_header *first_xh =
3791 (struct ocfs2_xattr_header *)first_bh->b_data;
3792 u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
3793 4349
3794 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " 4350 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
3795 "from %llu, len = %u\n", (unsigned long long)start_blk, 4351 "from %llu, len = %u\n", (unsigned long long)target_blk,
3796 (unsigned long long)first_bh->b_blocknr, num_clusters); 4352 (unsigned long long)bucket_blkno(first), num_clusters);
3797 4353
3798 BUG_ON(bucket >= num_buckets); 4354 /* The extent must have room for an additional bucket */
4355 BUG_ON(new_bucket >=
4356 (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
3799 4357
3800 end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; 4358 /* end_blk points to the last existing bucket */
4359 end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
3801 4360
3802 /* 4361 /*
3803 * We will touch all the buckets after the start_bh(include it). 4362 * end_blk is the start of the last existing bucket.
3804 * Add one more bucket and modify the first_bh. 4363 * Thus, (end_blk - target_blk) covers the target bucket and
4364 * every bucket after it up to, but not including, the last
4365 * existing bucket. Then we add the last existing bucket, the
4366 * new bucket, and the first bucket (3 * blk_per_bucket).
3805 */ 4367 */
3806 credits = end_blk - start_blk + 2 * blk_per_bucket + 1; 4368 credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
3807 handle = ocfs2_start_trans(osb, credits); 4369 handle->h_buffer_credits;
3808 if (IS_ERR(handle)) { 4370 ret = ocfs2_extend_trans(handle, credits);
3809 ret = PTR_ERR(handle); 4371 if (ret) {
3810 handle = NULL;
3811 mlog_errno(ret); 4372 mlog_errno(ret);
3812 goto out; 4373 goto out;
3813 } 4374 }
3814 4375
3815 ret = ocfs2_journal_access(handle, inode, first_bh, 4376 ret = ocfs2_xattr_bucket_journal_access(handle, first,
3816 OCFS2_JOURNAL_ACCESS_WRITE); 4377 OCFS2_JOURNAL_ACCESS_WRITE);
3817 if (ret) { 4378 if (ret) {
3818 mlog_errno(ret); 4379 mlog_errno(ret);
3819 goto commit; 4380 goto out;
3820 } 4381 }
3821 4382
3822 while (end_blk != start_blk) { 4383 while (end_blk != target_blk) {
3823 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, 4384 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
3824 end_blk + blk_per_bucket, 0); 4385 end_blk + blk_per_bucket, 0);
3825 if (ret) 4386 if (ret)
3826 goto commit; 4387 goto out;
3827 end_blk -= blk_per_bucket; 4388 end_blk -= blk_per_bucket;
3828 } 4389 }
3829 4390
3830 /* Move half of the xattr in start_blk to the next bucket. */ 4391 /* Move half of the xattr in target_blkno to the next bucket. */
3831 ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk, 4392 ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
3832 start_blk + blk_per_bucket, NULL, 0); 4393 target_blk + blk_per_bucket, NULL, 0);
3833 4394
3834 le16_add_cpu(&first_xh->xh_num_buckets, 1); 4395 le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
3835 ocfs2_journal_dirty(handle, first_bh); 4396 ocfs2_xattr_bucket_journal_dirty(handle, first);
3836 4397
3837commit:
3838 ocfs2_commit_trans(osb, handle);
3839out: 4398out:
3840 return ret; 4399 return ret;
3841} 4400}
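
A user-space sketch of the shift-down loop in ocfs2_extend_xattr_bucket above: starting from the last existing bucket, each bucket is copied one slot toward the end until the target is reached, leaving an empty slot at target+1 for the split. Array slots stand in for on-disk buckets.

#include <stdio.h>
#include <string.h>

int main(void)
{
        char buckets[8][8] = { "A", "B", "C", "D" };
        int num = 4, target = 1;        /* insert next to bucket "B" */

        /* Walk backwards so each copy lands in a free or stale slot. */
        for (int end = num - 1; end != target; end--)
                memcpy(buckets[end + 1], buckets[end], sizeof(buckets[0]));

        /* target+1 is now free; the real code splits half of the
         * target's xattrs into it via ocfs2_divide_xattr_bucket. */
        memcpy(buckets[target + 1], "B2", 3);
        for (int i = 0; i <= num; i++)
                printf("%s ", buckets[i]);
        printf("\n");
        return 0;
}

Walking from end_blk back toward the target is also why the credit estimate above is proportional to (end_blk - target_blk): every bucket in that span gets journaled.
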
3842 4401
3843/* 4402/*
3844 * Add new xattr bucket in an extent record and adjust the buckets accordingly. 4403 * Add new xattr bucket in an extent record and adjust the buckets
3845 * xb_bh is the ocfs2_xattr_block. 4404 * accordingly. xb_bh is the ocfs2_xattr_block, and target is the
3846 * We will move all the buckets starting from header_bh to the next place. As 4405 * bucket we want to insert into.
3847 * for this one, half of its xattrs will be moved to the next one. 4407 * In the easy case, we will move all the buckets after target down by
4407 * In the easy case, we will move all the buckets after target down by
4408 * one. Half of target's xattrs will be moved to the next bucket.
3848 * 4409 *
3849 * We will allocate a new cluster if current cluster is full and adjust 4410 * If current cluster is full, we'll allocate a new one. This may not
3850 * header_bh and first_bh if the insert place is moved to the new cluster. 4411 * be contiguous. The underlying calls will make sure that there is
4412 * space for the insert, shifting buckets around if necessary.
4413 * 'target' may be moved by those calls.
3851 */ 4414 */
3852static int ocfs2_add_new_xattr_bucket(struct inode *inode, 4415static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3853 struct buffer_head *xb_bh, 4416 struct buffer_head *xb_bh,
3854 struct buffer_head *header_bh) 4417 struct ocfs2_xattr_bucket *target,
4418 struct ocfs2_xattr_set_ctxt *ctxt)
3855{ 4419{
3856 struct ocfs2_xattr_header *first_xh = NULL;
3857 struct buffer_head *first_bh = NULL;
3858 struct ocfs2_xattr_block *xb = 4420 struct ocfs2_xattr_block *xb =
3859 (struct ocfs2_xattr_block *)xb_bh->b_data; 4421 (struct ocfs2_xattr_block *)xb_bh->b_data;
3860 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; 4422 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
3861 struct ocfs2_extent_list *el = &xb_root->xt_list; 4423 struct ocfs2_extent_list *el = &xb_root->xt_list;
3862 struct ocfs2_xattr_header *xh = 4424 u32 name_hash =
3863 (struct ocfs2_xattr_header *)header_bh->b_data; 4425 le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
3864 u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); 4426 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3865 struct super_block *sb = inode->i_sb;
3866 struct ocfs2_super *osb = OCFS2_SB(sb);
3867 int ret, num_buckets, extend = 1; 4427 int ret, num_buckets, extend = 1;
3868 u64 p_blkno; 4428 u64 p_blkno;
3869 u32 e_cpos, num_clusters; 4429 u32 e_cpos, num_clusters;
4430 /* The bucket at the front of the extent */
4431 struct ocfs2_xattr_bucket *first;
3870 4432
3871 mlog(0, "Add new xattr bucket starting form %llu\n", 4433 mlog(0, "Add new xattr bucket starting from %llu\n",
3872 (unsigned long long)header_bh->b_blocknr); 4434 (unsigned long long)bucket_blkno(target));
3873 4435
3874 /* 4436 /* The first bucket of the original extent */
3875 * Add a reference for header_bh here because it may be 4437 first = ocfs2_xattr_bucket_new(inode);
3876 * changed in ocfs2_add_new_xattr_cluster and we need 4438 if (!first) {
3877 * to free it in the end. 4439 ret = -ENOMEM;
3878 */ 4440 mlog_errno(ret);
3879 get_bh(header_bh); 4441 goto out;
4442 }
3880 4443
3881 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, 4444 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
3882 &num_clusters, el); 4445 &num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3885 goto out; 4448 goto out;
3886 } 4449 }
3887 4450
3888 ret = ocfs2_read_block(inode, p_blkno, &first_bh); 4451 ret = ocfs2_read_xattr_bucket(first, p_blkno);
3889 if (ret) { 4452 if (ret) {
3890 mlog_errno(ret); 4453 mlog_errno(ret);
3891 goto out; 4454 goto out;
3892 } 4455 }
3893 4456
3894 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; 4457 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
3895 first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; 4458 if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
3896 4459 /*
3897 if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { 4460 * This can move first+target if the target bucket moves
4461 * to the new extent.
4462 */
3898 ret = ocfs2_add_new_xattr_cluster(inode, 4463 ret = ocfs2_add_new_xattr_cluster(inode,
3899 xb_bh, 4464 xb_bh,
3900 &first_bh, 4465 first,
3901 &header_bh, 4466 target,
3902 &num_clusters, 4467 &num_clusters,
3903 e_cpos, 4468 e_cpos,
3904 p_blkno, 4469 &extend,
3905 &extend); 4470 ctxt);
3906 if (ret) { 4471 if (ret) {
3907 mlog_errno(ret); 4472 mlog_errno(ret);
3908 goto out; 4473 goto out;
3909 } 4474 }
3910 } 4475 }
3911 4476
3912 if (extend) 4477 if (extend) {
3913 ret = ocfs2_extend_xattr_bucket(inode, 4478 ret = ocfs2_extend_xattr_bucket(inode,
3914 first_bh, 4479 ctxt->handle,
3915 header_bh, 4480 first,
4481 bucket_blkno(target),
3916 num_clusters); 4482 num_clusters);
3917 if (ret) 4483 if (ret)
3918 mlog_errno(ret); 4484 mlog_errno(ret);
4485 }
4486
3919out: 4487out:
3920 brelse(first_bh); 4488 ocfs2_xattr_bucket_free(first);
3921 brelse(header_bh); 4489
3922 return ret; 4490 return ret;
3923} 4491}
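
A user-space sketch of the top-level flow in ocfs2_add_new_xattr_bucket above: if the extent is already full of buckets, grow it by a cluster first (which may relocate the target); unless that call placed the target at the end of the new extent, shift buckets to open a slot next to the target. All names below are stand-ins.

#include <stdio.h>

struct extent { int num_buckets, capacity; };

static int add_new_cluster(struct extent *e, int *extend)
{
        e->capacity += 4;       /* assumed buckets per cluster */
        /* In the real code, *extend is cleared only when the target
         * bucket ended up as the last bucket of the new extent. */
        (void)extend;
        return 0;
}

static int add_new_bucket(struct extent *e)
{
        int extend = 1;

        if (e->num_buckets == e->capacity && add_new_cluster(e, &extend))
                return -1;
        if (extend)
                e->num_buckets++;   /* shift + split opens the slot */
        return 0;
}

int main(void)
{
        struct extent e = { 4, 4 };

        if (!add_new_bucket(&e))
                printf("buckets=%d capacity=%d\n", e.num_buckets,
                       e.capacity);
        return 0;
}
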
3924 4492
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
3929 int block_off = offs >> inode->i_sb->s_blocksize_bits; 4497 int block_off = offs >> inode->i_sb->s_blocksize_bits;
3930 4498
3931 offs = offs % inode->i_sb->s_blocksize; 4499 offs = offs % inode->i_sb->s_blocksize;
3932 return bucket->bhs[block_off]->b_data + offs; 4500 return bucket_block(bucket, block_off) + offs;
3933} 4501}
3934 4502
3935/* 4503/*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
3984 xe->xe_value_size = 0; 4552 xe->xe_value_size = 0;
3985 4553
3986 val = ocfs2_xattr_bucket_get_val(inode, 4554 val = ocfs2_xattr_bucket_get_val(inode,
3987 &xs->bucket, offs); 4555 xs->bucket, offs);
3988 memset(val + OCFS2_XATTR_SIZE(name_len), 0, 4556 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
3989 size - OCFS2_XATTR_SIZE(name_len)); 4557 size - OCFS2_XATTR_SIZE(name_len));
3990 if (OCFS2_XATTR_SIZE(xi->value_len) > 0) 4558 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
4062 xh->xh_free_start = cpu_to_le16(offs); 4630 xh->xh_free_start = cpu_to_le16(offs);
4063 } 4631 }
4064 4632
4065 val = ocfs2_xattr_bucket_get_val(inode, 4633 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4066 &xs->bucket, offs - size);
4067 xe->xe_name_offset = cpu_to_le16(offs - size); 4634 xe->xe_name_offset = cpu_to_le16(offs - size);
4068 4635
4069 memset(val, 0, size); 4636 memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
4079 return; 4646 return;
4080} 4647}
4081 4648
4082static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
4083 handle_t *handle,
4084 struct ocfs2_xattr_search *xs,
4085 struct buffer_head **bhs,
4086 u16 bh_num)
4087{
4088 int ret = 0, off, block_off;
4089 struct ocfs2_xattr_entry *xe = xs->here;
4090
4091 /*
4092 * First calculate all the blocks we should journal_access
4093 * and journal_dirty. The first block should always be touched.
4094 */
4095 ret = ocfs2_journal_dirty(handle, bhs[0]);
4096 if (ret)
4097 mlog_errno(ret);
4098
4099 /* calc the data. */
4100 off = le16_to_cpu(xe->xe_name_offset);
4101 block_off = off >> inode->i_sb->s_blocksize_bits;
4102 ret = ocfs2_journal_dirty(handle, bhs[block_off]);
4103 if (ret)
4104 mlog_errno(ret);
4105
4106 return ret;
4107}
4108
4109/* 4649/*
4110 * Set the xattr entry in the specified bucket. 4650 * Set the xattr entry in the specified bucket.
4111 * The bucket is indicated by xs->bucket and it should have enough 4651 * The bucket is indicated by xs->bucket and it should have enough
4112 * space for the xattr insertion. 4652 * space for the xattr insertion.
4113 */ 4653 */
4114static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, 4654static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4655 handle_t *handle,
4115 struct ocfs2_xattr_info *xi, 4656 struct ocfs2_xattr_info *xi,
4116 struct ocfs2_xattr_search *xs, 4657 struct ocfs2_xattr_search *xs,
4117 u32 name_hash, 4658 u32 name_hash,
4118 int local) 4659 int local)
4119{ 4660{
4120 int i, ret; 4661 int ret;
4121 handle_t *handle = NULL; 4662 u64 blkno;
4122 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4124 4663
4125 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", 4664 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4126 (unsigned long)xi->value_len, xi->name_index, 4665 (unsigned long)xi->value_len, xi->name_index,
4127 (unsigned long long)xs->bucket.bhs[0]->b_blocknr); 4666 (unsigned long long)bucket_blkno(xs->bucket));
4128 4667
4129 if (!xs->bucket.bhs[1]) { 4668 if (!xs->bucket->bu_bhs[1]) {
4130 ret = ocfs2_read_blocks(inode, 4669 blkno = bucket_blkno(xs->bucket);
4131 xs->bucket.bhs[0]->b_blocknr + 1, 4670 ocfs2_xattr_bucket_relse(xs->bucket);
4132 blk_per_bucket - 1, &xs->bucket.bhs[1], 4671 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4133 0);
4134 if (ret) { 4672 if (ret) {
4135 mlog_errno(ret); 4673 mlog_errno(ret);
4136 goto out; 4674 goto out;
4137 } 4675 }
4138 } 4676 }
4139 4677
4140 handle = ocfs2_start_trans(osb, blk_per_bucket); 4678 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4141 if (IS_ERR(handle)) { 4679 OCFS2_JOURNAL_ACCESS_WRITE);
4142 ret = PTR_ERR(handle); 4680 if (ret < 0) {
4143 handle = NULL;
4144 mlog_errno(ret); 4681 mlog_errno(ret);
4145 goto out; 4682 goto out;
4146 } 4683 }
4147 4684
4148 for (i = 0; i < blk_per_bucket; i++) {
4149 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
4150 OCFS2_JOURNAL_ACCESS_WRITE);
4151 if (ret < 0) {
4152 mlog_errno(ret);
4153 goto out;
4154 }
4155 }
4156
4157 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); 4685 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4686 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4158 4687
4159 /*Only dirty the blocks we have touched in set xattr. */
4160 ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
4161 xs->bucket.bhs, blk_per_bucket);
4162 if (ret)
4163 mlog_errno(ret);
4164out:
4165 ocfs2_commit_trans(osb, handle);
4166
4167 return ret;
4168}
4169
4170static int ocfs2_xattr_value_update_size(struct inode *inode,
4171 struct buffer_head *xe_bh,
4172 struct ocfs2_xattr_entry *xe,
4173 u64 new_size)
4174{
4175 int ret;
4176 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4177 handle_t *handle = NULL;
4178
4179 handle = ocfs2_start_trans(osb, 1);
4180 if (IS_ERR(handle)) {
4181 ret = -ENOMEM;
4182 mlog_errno(ret);
4183 goto out;
4184 }
4185
4186 ret = ocfs2_journal_access(handle, inode, xe_bh,
4187 OCFS2_JOURNAL_ACCESS_WRITE);
4188 if (ret < 0) {
4189 mlog_errno(ret);
4190 goto out_commit;
4191 }
4192
4193 xe->xe_value_size = cpu_to_le64(new_size);
4194
4195 ret = ocfs2_journal_dirty(handle, xe_bh);
4196 if (ret < 0)
4197 mlog_errno(ret);
4198
4199out_commit:
4200 ocfs2_commit_trans(osb, handle);
4201out: 4688out:
4202 return ret; 4689 return ret;
4203} 4690}
@@ -4210,18 +4697,19 @@ out:
4210 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. 4697 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
4211 */ 4698 */
4212static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, 4699static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4213 struct buffer_head *header_bh, 4700 struct ocfs2_xattr_bucket *bucket,
4214 int xe_off, 4701 int xe_off,
4215 int len) 4702 int len,
4703 struct ocfs2_xattr_set_ctxt *ctxt)
4216{ 4704{
4217 int ret, offset; 4705 int ret, offset;
4218 u64 value_blk; 4706 u64 value_blk;
4219 struct buffer_head *value_bh = NULL;
4220 struct ocfs2_xattr_value_root *xv;
4221 struct ocfs2_xattr_entry *xe; 4707 struct ocfs2_xattr_entry *xe;
4222 struct ocfs2_xattr_header *xh = 4708 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4223 (struct ocfs2_xattr_header *)header_bh->b_data;
4224 size_t blocksize = inode->i_sb->s_blocksize; 4709 size_t blocksize = inode->i_sb->s_blocksize;
4710 struct ocfs2_xattr_value_buf vb = {
4711 .vb_access = ocfs2_journal_access,
4712 };
4225 4713
4226 xe = &xh->xh_entries[xe_off]; 4714 xe = &xh->xh_entries[xe_off];
4227 4715
@@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4234 4722
4235 /* We don't allow ocfs2_xattr_value to be stored in different block. */ 4723 /* We don't allow ocfs2_xattr_value to be stored in different block. */
4236 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); 4724 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
4237 value_blk += header_bh->b_blocknr;
4238 4725
4239 ret = ocfs2_read_block(inode, value_blk, &value_bh); 4726 vb.vb_bh = bucket->bu_bhs[value_blk];
4240 if (ret) { 4727 BUG_ON(!vb.vb_bh);
4241 mlog_errno(ret);
4242 goto out;
4243 }
4244 4728
4245 xv = (struct ocfs2_xattr_value_root *) 4729 vb.vb_xv = (struct ocfs2_xattr_value_root *)
4246 (value_bh->b_data + offset % blocksize); 4730 (vb.vb_bh->b_data + offset % blocksize);
4247 4731
4248 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", 4732 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4249 xe_off, (unsigned long long)header_bh->b_blocknr, len); 4733 OCFS2_JOURNAL_ACCESS_WRITE);
4250 ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
4251 if (ret) { 4734 if (ret) {
4252 mlog_errno(ret); 4735 mlog_errno(ret);
4253 goto out; 4736 goto out;
4254 } 4737 }
4255 4738
4256 ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); 4739 /*
4740 * From here on out we have to dirty the bucket. The generic
4741 * value calls only modify one of the bucket's bhs, but we need
4742 * to send the bucket at once. So if they error, they *could* have
4743 * modified something. We have to assume they did, and dirty
4744 * the whole bucket. This leaves us in a consistent state.
4745 */
4746 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
4747 xe_off, (unsigned long long)bucket_blkno(bucket), len);
4748 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
4257 if (ret) { 4749 if (ret) {
4258 mlog_errno(ret); 4750 mlog_errno(ret);
4259 goto out; 4751 goto out_dirty;
4260 } 4752 }
4261 4753
4754 xe->xe_value_size = cpu_to_le64(len);
4755
4756out_dirty:
4757 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
4758
4262out: 4759out:
4263 brelse(value_bh);
4264 return ret; 4760 return ret;
4265} 4761}
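
The ocfs2_xattr_value_buf used here (declared in the xattr.h hunk further down) is what lets ocfs2_xattr_value_truncate() stop caring where a value root lives: the generic code journals through vb_access and reads the root through vb_xv, whether the backing bh belongs to an inode, an external block, or a bucket. A condensed sketch of the bucket case as wired up above; the final vb_access call illustrates the indirection and is an assumption about how the generic helper uses the struct:

	struct ocfs2_xattr_value_buf vb = {
		.vb_access = ocfs2_journal_access,
		.vb_bh = bucket->bu_bhs[value_blk],	/* bh inside the bucket */
	};

	vb.vb_xv = (struct ocfs2_xattr_value_root *)
		(vb.vb_bh->b_data + offset % blocksize);

	/* Generic value code can then journal without knowing the container: */
	ret = vb.vb_access(ctxt->handle, inode, vb.vb_bh,
			   OCFS2_JOURNAL_ACCESS_WRITE);
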
4266 4762
4267static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, 4763static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4268 struct ocfs2_xattr_search *xs, 4764 struct ocfs2_xattr_search *xs,
4269 int len) 4765 int len,
4766 struct ocfs2_xattr_set_ctxt *ctxt)
4270{ 4767{
4271 int ret, offset; 4768 int ret, offset;
4272 struct ocfs2_xattr_entry *xe = xs->here; 4769 struct ocfs2_xattr_entry *xe = xs->here;
4273 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; 4770 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
4274 4771
4275 BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); 4772 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
4276 4773
4277 offset = xe - xh->xh_entries; 4774 offset = xe - xh->xh_entries;
4278 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], 4775 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
4279 offset, len); 4776 offset, len, ctxt);
4280 if (ret) 4777 if (ret)
4281 mlog_errno(ret); 4778 mlog_errno(ret);
4282 4779
@@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4284} 4781}
4285 4782
4286static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, 4783static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4784 handle_t *handle,
4287 struct ocfs2_xattr_search *xs, 4785 struct ocfs2_xattr_search *xs,
4288 char *val, 4786 char *val,
4289 int value_len) 4787 int value_len)
@@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4299 4797
4300 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); 4798 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
4301 4799
4302 return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); 4800 return __ocfs2_xattr_set_value_outside(inode, handle,
4801 xv, val, value_len);
4303} 4802}
4304 4803
4305static int ocfs2_rm_xattr_cluster(struct inode *inode, 4804static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4343 } 4842 }
4344 } 4843 }
4345 4844
4346 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); 4845 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
4347 if (IS_ERR(handle)) { 4846 if (IS_ERR(handle)) {
4348 ret = -ENOMEM; 4847 ret = -ENOMEM;
4349 mlog_errno(ret); 4848 mlog_errno(ret);
4350 goto out; 4849 goto out;
4351 } 4850 }
4352 4851
4353 ret = ocfs2_journal_access(handle, inode, root_bh, 4852 ret = ocfs2_journal_access_xb(handle, inode, root_bh,
4354 OCFS2_JOURNAL_ACCESS_WRITE); 4853 OCFS2_JOURNAL_ACCESS_WRITE);
4355 if (ret) { 4854 if (ret) {
4356 mlog_errno(ret); 4855 mlog_errno(ret);
4357 goto out_commit; 4856 goto out_commit;
@@ -4392,26 +4891,19 @@ out:
4392} 4891}
4393 4892
4394static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, 4893static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4894 handle_t *handle,
4395 struct ocfs2_xattr_search *xs) 4895 struct ocfs2_xattr_search *xs)
4396{ 4896{
4397 handle_t *handle = NULL; 4897 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4398 struct ocfs2_xattr_header *xh = xs->bucket.xh;
4399 struct ocfs2_xattr_entry *last = &xh->xh_entries[ 4898 struct ocfs2_xattr_entry *last = &xh->xh_entries[
4400 le16_to_cpu(xh->xh_count) - 1]; 4899 le16_to_cpu(xh->xh_count) - 1];
4401 int ret = 0; 4900 int ret = 0;
4402 4901
4403 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); 4902 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4404 if (IS_ERR(handle)) { 4903 OCFS2_JOURNAL_ACCESS_WRITE);
4405 ret = PTR_ERR(handle);
4406 mlog_errno(ret);
4407 return;
4408 }
4409
4410 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
4411 OCFS2_JOURNAL_ACCESS_WRITE);
4412 if (ret) { 4904 if (ret) {
4413 mlog_errno(ret); 4905 mlog_errno(ret);
4414 goto out_commit; 4906 return;
4415 } 4907 }
4416 4908
4417 /* Remove the old entry. */ 4909 /* Remove the old entry. */
@@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4420 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 4912 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
4421 le16_add_cpu(&xh->xh_count, -1); 4913 le16_add_cpu(&xh->xh_count, -1);
4422 4914
4423 ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); 4915 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4424 if (ret < 0)
4425 mlog_errno(ret);
4426out_commit:
4427 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
4428} 4916}
4429 4917
4430/* 4918/*
@@ -4440,7 +4928,8 @@ out_commit:
4440 */ 4928 */
4441static int ocfs2_xattr_set_in_bucket(struct inode *inode, 4929static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4442 struct ocfs2_xattr_info *xi, 4930 struct ocfs2_xattr_info *xi,
4443 struct ocfs2_xattr_search *xs) 4931 struct ocfs2_xattr_search *xs,
4932 struct ocfs2_xattr_set_ctxt *ctxt)
4444{ 4933{
4445 int ret, local = 1; 4934 int ret, local = 1;
4446 size_t value_len; 4935 size_t value_len;
@@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4468 value_len = 0; 4957 value_len = 0;
4469 4958
4470 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, 4959 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4471 value_len); 4960 value_len,
4961 ctxt);
4472 if (ret) 4962 if (ret)
4473 goto out; 4963 goto out;
4474 4964
@@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4488 xi->value_len = OCFS2_XATTR_ROOT_SIZE; 4978 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
4489 } 4979 }
4490 4980
4491 ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); 4981 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
4982 name_hash, local);
4492 if (ret) { 4983 if (ret) {
4493 mlog_errno(ret); 4984 mlog_errno(ret);
4494 goto out; 4985 goto out;
@@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4499 4990
4500 /* allocate the space now for the outside block storage. */ 4991 /* allocate the space now for the outside block storage. */
4501 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, 4992 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4502 value_len); 4993 value_len, ctxt);
4503 if (ret) { 4994 if (ret) {
4504 mlog_errno(ret); 4995 mlog_errno(ret);
4505 4996
@@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4509 * storage and we have allocated xattr already, 5000 * storage and we have allocated xattr already,
4510 * so need to remove it. 5001 * so need to remove it.
4511 */ 5002 */
4512 ocfs2_xattr_bucket_remove_xs(inode, xs); 5003 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
4513 } 5004 }
4514 goto out; 5005 goto out;
4515 } 5006 }
4516 5007
4517set_value_outside: 5008set_value_outside:
4518 ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); 5009 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5010 xs, val, value_len);
4519out: 5011out:
4520 return ret; 5012 return ret;
4521} 5013}
@@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4530 struct ocfs2_xattr_bucket *bucket, 5022 struct ocfs2_xattr_bucket *bucket,
4531 const char *name) 5023 const char *name)
4532{ 5024{
4533 struct ocfs2_xattr_header *xh = bucket->xh; 5025 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4534 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); 5026 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
4535 5027
4536 if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) 5028 if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4540 xh->xh_entries[0].xe_name_hash) { 5032 xh->xh_entries[0].xe_name_hash) {
4541 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " 5033 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
4542 "hash = %u\n", 5034 "hash = %u\n",
4543 (unsigned long long)bucket->bhs[0]->b_blocknr, 5035 (unsigned long long)bucket_blkno(bucket),
4544 le32_to_cpu(xh->xh_entries[0].xe_name_hash)); 5036 le32_to_cpu(xh->xh_entries[0].xe_name_hash));
4545 return -ENOSPC; 5037 return -ENOSPC;
4546 } 5038 }
@@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4550 5042
4551static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5043static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
4552 struct ocfs2_xattr_info *xi, 5044 struct ocfs2_xattr_info *xi,
4553 struct ocfs2_xattr_search *xs) 5045 struct ocfs2_xattr_search *xs,
5046 struct ocfs2_xattr_set_ctxt *ctxt)
4554{ 5047{
4555 struct ocfs2_xattr_header *xh; 5048 struct ocfs2_xattr_header *xh;
4556 struct ocfs2_xattr_entry *xe; 5049 struct ocfs2_xattr_entry *xe;
4557 u16 count, header_size, xh_free_start; 5050 u16 count, header_size, xh_free_start;
4558 int i, free, max_free, need, old; 5051 int free, max_free, need, old;
4559 size_t value_size = 0, name_len = strlen(xi->name); 5052 size_t value_size = 0, name_len = strlen(xi->name);
4560 size_t blocksize = inode->i_sb->s_blocksize; 5053 size_t blocksize = inode->i_sb->s_blocksize;
4561 int ret, allocation = 0; 5054 int ret, allocation = 0;
4562 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4563 5055
4564 mlog_entry("Set xattr %s in xattr index block\n", xi->name); 5056 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
4565 5057
@@ -4574,7 +5066,7 @@ try_again:
4574 5066
4575 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " 5067 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
4576 "of %u which exceed block size\n", 5068 "of %u which exceed block size\n",
4577 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, 5069 (unsigned long long)bucket_blkno(xs->bucket),
4578 header_size); 5070 header_size);
4579 5071
4580 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5072 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5106,13 @@ try_again:
4614 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " 5106 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
4615 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" 5107 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
4616 " %u\n", xs->not_found, 5108 " %u\n", xs->not_found,
4617 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, 5109 (unsigned long long)bucket_blkno(xs->bucket),
4618 free, need, max_free, le16_to_cpu(xh->xh_free_start), 5110 free, need, max_free, le16_to_cpu(xh->xh_free_start),
4619 le16_to_cpu(xh->xh_name_value_len)); 5111 le16_to_cpu(xh->xh_name_value_len));
4620 5112
4621 if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { 5113 if (free < need ||
5114 (xs->not_found &&
5115 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
4622 if (need <= max_free && 5116 if (need <= max_free &&
4623 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { 5117 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4624 /* 5118 /*
@@ -4626,7 +5120,8 @@ try_again:
4626 * name/value will be moved, the xe shouldn't be changed 5120 * name/value will be moved, the xe shouldn't be changed
4627 * in xs. 5121 * in xs.
4628 */ 5122 */
4629 ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); 5123 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5124 xs->bucket);
4630 if (ret) { 5125 if (ret) {
4631 mlog_errno(ret); 5126 mlog_errno(ret);
4632 goto out; 5127 goto out;
@@ -4658,7 +5153,7 @@ try_again:
4658 * add a new bucket for the insert. 5153 * add a new bucket for the insert.
4659 */ 5154 */
4660 ret = ocfs2_check_xattr_bucket_collision(inode, 5155 ret = ocfs2_check_xattr_bucket_collision(inode,
4661 &xs->bucket, 5156 xs->bucket,
4662 xi->name); 5157 xi->name);
4663 if (ret) { 5158 if (ret) {
4664 mlog_errno(ret); 5159 mlog_errno(ret);
@@ -4667,17 +5162,21 @@ try_again:
4667 5162
4668 ret = ocfs2_add_new_xattr_bucket(inode, 5163 ret = ocfs2_add_new_xattr_bucket(inode,
4669 xs->xattr_bh, 5164 xs->xattr_bh,
4670 xs->bucket.bhs[0]); 5165 xs->bucket,
5166 ctxt);
4671 if (ret) { 5167 if (ret) {
4672 mlog_errno(ret); 5168 mlog_errno(ret);
4673 goto out; 5169 goto out;
4674 } 5170 }
4675 5171
4676 for (i = 0; i < blk_per_bucket; i++) 5172 /*
4677 brelse(xs->bucket.bhs[i]); 5173 * ocfs2_add_new_xattr_bucket() will have updated
4678 5174 * xs->bucket if it moved, but it will not have updated
4679 memset(&xs->bucket, 0, sizeof(xs->bucket)); 5175 * any of the other search fields. Thus, we drop it and
4680 5176 * re-search. Everything should be cached, so it'll be
5177 * quick.
5178 */
5179 ocfs2_xattr_bucket_relse(xs->bucket);
4681 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, 5180 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
4682 xi->name_index, 5181 xi->name_index,
4683 xi->name, xs); 5182 xi->name, xs);
@@ -4689,7 +5188,7 @@ try_again:
4689 } 5188 }
4690 5189
4691xattr_set: 5190xattr_set:
4692 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); 5191 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
4693out: 5192out:
4694 mlog_exit(ret); 5193 mlog_exit(ret);
4695 return ret; 5194 return ret;
@@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
4700 void *para) 5199 void *para)
4701{ 5200{
4702 int ret = 0; 5201 int ret = 0;
4703 struct ocfs2_xattr_header *xh = bucket->xh; 5202 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4704 u16 i; 5203 u16 i;
4705 struct ocfs2_xattr_entry *xe; 5204 struct ocfs2_xattr_entry *xe;
5205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5206 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
5207 int credits = ocfs2_remove_extent_credits(osb->sb) +
5208 ocfs2_blocks_per_xattr_bucket(inode->i_sb);
5209
5210
5211 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
4706 5212
4707 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 5213 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4708 xe = &xh->xh_entries[i]; 5214 xe = &xh->xh_entries[i];
4709 if (ocfs2_xattr_is_local(xe)) 5215 if (ocfs2_xattr_is_local(xe))
4710 continue; 5216 continue;
4711 5217
4712 ret = ocfs2_xattr_bucket_value_truncate(inode, 5218 ctxt.handle = ocfs2_start_trans(osb, credits);
4713 bucket->bhs[0], 5219 if (IS_ERR(ctxt.handle)) {
4714 i, 0); 5220 ret = PTR_ERR(ctxt.handle);
5221 mlog_errno(ret);
5222 break;
5223 }
5224
5225 ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
5226 i, 0, &ctxt);
5227
5228 ocfs2_commit_trans(osb, ctxt.handle);
4715 if (ret) { 5229 if (ret) {
4716 mlog_errno(ret); 5230 mlog_errno(ret);
4717 break; 5231 break;
4718 } 5232 }
4719 } 5233 }
4720 5234
5235 ocfs2_schedule_truncate_log_flush(osb, 1);
5236 ocfs2_run_deallocs(osb, &ctxt.dealloc);
4721 return ret; 5237 return ret;
4722} 5238}
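
Two patterns in this hunk deserve a note. First, each loop iteration now gets its own transaction, sized up front for the worst case: one extent removal plus redirtying every block of the bucket. Second, freed clusters are not returned to the allocator inside the loop; they are queued on the context's dealloc list and released once, after the last entry is truncated. In outline, using only the calls shown above:

	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
	int credits = ocfs2_remove_extent_credits(osb->sb) +
		      ocfs2_blocks_per_xattr_bucket(inode->i_sb);

	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);

	/* ...one ocfs2_start_trans()/ocfs2_commit_trans() pair per
	 * non-local entry, each truncate queueing clusters on
	 * ctxt.dealloc... */

	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &ctxt.dealloc);
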
4723 5239
@@ -4768,6 +5284,74 @@ out:
4768} 5284}
4769 5285
4770/* 5286/*
5287 * 'security' attributes support
5288 */
5289static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
5290 size_t list_size, const char *name,
5291 size_t name_len)
5292{
5293 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
5294 const size_t total_len = prefix_len + name_len + 1;
5295
5296 if (list && total_len <= list_size) {
5297 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
5298 memcpy(list + prefix_len, name, name_len);
5299 list[prefix_len + name_len] = '\0';
5300 }
5301 return total_len;
5302}
5303
5304static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
5305 void *buffer, size_t size)
5306{
5307 if (strcmp(name, "") == 0)
5308 return -EINVAL;
5309 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
5310 buffer, size);
5311}
5312
5313static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
5314 const void *value, size_t size, int flags)
5315{
5316 if (strcmp(name, "") == 0)
5317 return -EINVAL;
5318
5319 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
5320 size, flags);
5321}
5322
5323int ocfs2_init_security_get(struct inode *inode,
5324 struct inode *dir,
5325 struct ocfs2_security_xattr_info *si)
5326{
5327 /* check whether ocfs2 support feature xattr */
5328 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
5329 return -EOPNOTSUPP;
5330 return security_inode_init_security(inode, dir, &si->name, &si->value,
5331 &si->value_len);
5332}
5333
5334int ocfs2_init_security_set(handle_t *handle,
5335 struct inode *inode,
5336 struct buffer_head *di_bh,
5337 struct ocfs2_security_xattr_info *si,
5338 struct ocfs2_alloc_context *xattr_ac,
5339 struct ocfs2_alloc_context *data_ac)
5340{
5341 return ocfs2_xattr_set_handle(handle, inode, di_bh,
5342 OCFS2_XATTR_INDEX_SECURITY,
5343 si->name, si->value, si->value_len, 0,
5344 xattr_ac, data_ac);
5345}
5346
5347struct xattr_handler ocfs2_xattr_security_handler = {
5348 .prefix = XATTR_SECURITY_PREFIX,
5349 .list = ocfs2_xattr_security_list,
5350 .get = ocfs2_xattr_security_get,
5351 .set = ocfs2_xattr_security_set,
5352};
5353
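
The two exported helpers above give inode creation a path into the LSM: ocfs2_init_security_get() asks security_inode_init_security() for the attribute's name/value pair, and ocfs2_init_security_set() writes it under the caller's transaction and allocation reservations. A hypothetical call sequence from a create path follows; error handling is trimmed, the si.enable initialization is an assumed convention, and the kfree() calls rest on the usual rule that the LSM kmallocs the name and value buffers:

	struct ocfs2_security_xattr_info si = { .enable = 1, };
	int ret;

	ret = ocfs2_init_security_get(inode, dir, &si);
	if (!ret) {
		ret = ocfs2_init_security_set(handle, inode, di_bh, &si,
					      xattr_ac, data_ac);
		kfree(si.name);
		kfree(si.value);
	}
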
5354/*
4771 * 'trusted' attributes support 5355 * 'trusted' attributes support
4772 */ 5356 */
4773static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, 5357static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656d..5a1ebc789f7e 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
30 OCFS2_XATTR_MAX 30 OCFS2_XATTR_MAX
31}; 31};
32 32
33struct ocfs2_security_xattr_info {
34 int enable;
35 char *name;
36 void *value;
37 size_t value_len;
38};
39
33extern struct xattr_handler ocfs2_xattr_user_handler; 40extern struct xattr_handler ocfs2_xattr_user_handler;
34extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler;
43#ifdef CONFIG_OCFS2_FS_POSIX_ACL
44extern struct xattr_handler ocfs2_xattr_acl_access_handler;
45extern struct xattr_handler ocfs2_xattr_acl_default_handler;
46#endif
35extern struct xattr_handler *ocfs2_xattr_handlers[]; 47extern struct xattr_handler *ocfs2_xattr_handlers[];
36 48
37ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 49ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
50int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
51 const char *, void *, size_t);
38int ocfs2_xattr_set(struct inode *, int, const char *, const void *, 52int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
39 size_t, int); 53 size_t, int);
54int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
55 int, const char *, const void *, size_t, int,
56 struct ocfs2_alloc_context *,
57 struct ocfs2_alloc_context *);
40int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *,
60 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *,
63 struct ocfs2_security_xattr_info *,
64 struct ocfs2_alloc_context *,
65 struct ocfs2_alloc_context *);
66int ocfs2_calc_security_init(struct inode *,
67 struct ocfs2_security_xattr_info *,
68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **);
72
73/*
74 * xattrs can live inside an inode, as part of an external xattr block,
75 * or inside an xattr bucket, which is the leaf of a tree rooted in an
76 * xattr block. Some of the xattr calls, especially the value setting
77 * functions, want to treat each of these locations as equal. Let's wrap
78 * them in a structure that we can pass around instead of raw buffer_heads.
79 */
80struct ocfs2_xattr_value_buf {
81 struct buffer_head *vb_bh;
82 ocfs2_journal_access_func vb_access;
83 struct ocfs2_xattr_value_root *vb_xv;
84};
85
41 86
42#endif /* OCFS2_XATTR_H */ 87#endif /* OCFS2_XATTR_H */
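
With ocfs2_xattr_security_handler exported here, and the ACL handlers guarded by CONFIG_OCFS2_FS_POSIX_ACL, the registration table the VFS walks for prefix dispatch would plausibly look like the sketch below. This is the standard xattr_handler plumbing, not code quoted from the patch:

	struct xattr_handler *ocfs2_xattr_handlers[] = {
		&ocfs2_xattr_user_handler,
	#ifdef CONFIG_OCFS2_FS_POSIX_ACL
		&ocfs2_xattr_acl_access_handler,
		&ocfs2_xattr_acl_default_handler,
	#endif
		&ocfs2_xattr_trusted_handler,
		&ocfs2_xattr_security_handler,
		NULL
	};
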
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f84..633e9dc972bb 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = current_fsuid(); 40 inode->i_uid = current_fsuid();
41 inode->i_gid = current_fsgid(); 41 inode->i_gid = current_fsgid();
42 inode->i_blocks = 0;
43 inode->i_mapping->a_ops = &omfs_aops; 42 inode->i_mapping->a_ops = &omfs_aops;
44 43
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index c0a426d5766c..d882fd2351d6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
272 goto put_write_and_out; 272 goto put_write_and_out;
273 273
274 error = locks_verify_truncate(inode, NULL, length); 274 error = locks_verify_truncate(inode, NULL, length);
275 if (!error)
276 error = security_path_truncate(&path, length, 0);
275 if (!error) { 277 if (!error) {
276 DQUOT_INIT(inode); 278 DQUOT_INIT(inode);
277 error = do_truncate(path.dentry, length, 0, NULL); 279 error = do_truncate(path.dentry, length, 0, NULL);
@@ -329,6 +331,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
329 331
330 error = locks_verify_truncate(inode, file, length); 332 error = locks_verify_truncate(inode, file, length);
331 if (!error) 333 if (!error)
334 error = security_path_truncate(&file->f_path, length,
335 ATTR_MTIME|ATTR_CTIME);
336 if (!error)
332 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 337 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
333out_putf: 338out_putf:
334 fput(file); 339 fput(file);
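
Both truncate entry points now run the same gauntlet before touching the file: lock verification, then the new pathname-based LSM hook, then do_truncate(). Condensed from the two hunks (the ftruncate flavor shown; the sys_truncate path passes 0 for the time attributes):

	error = locks_verify_truncate(inode, file, length);
	if (!error)
		error = security_path_truncate(&file->f_path, length,
					       ATTR_MTIME|ATTR_CTIME);
	if (!error)
		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME,
				    file);
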
@@ -407,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
407 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 412 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
408 goto out_fput; 413 goto out_fput;
409 414
410 if (inode->i_op && inode->i_op->fallocate) 415 if (inode->i_op->fallocate)
411 ret = inode->i_op->fallocate(inode, mode, offset, len); 416 ret = inode->i_op->fallocate(inode, mode, offset, len);
412 else 417 else
413 ret = -EOPNOTSUPP; 418 ret = -EOPNOTSUPP;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de4..ffcd04f0012c 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
256 break; 256 break;
257 } 257 }
258 258
259 inode->i_gid = 0;
260 inode->i_uid = 0;
261
262 d_add(dentry, inode); 259 d_add(dentry, inode);
263 return NULL; 260 return NULL;
264} 261}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d5b213b8a9b..6d720243f5f4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno)
334 334
335 blk_free_devt(part_devt(part)); 335 blk_free_devt(part_devt(part));
336 rcu_assign_pointer(ptbl->part[partno], NULL); 336 rcu_assign_pointer(ptbl->part[partno], NULL);
337 rcu_assign_pointer(ptbl->last_lookup, NULL);
337 kobject_put(part->holder_dir); 338 kobject_put(part->holder_dir);
338 device_del(part_to_dev(part)); 339 device_del(part_to_dev(part));
339 340
@@ -384,9 +385,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
384 385
385 dname = dev_name(ddev); 386 dname = dev_name(ddev);
386 if (isdigit(dname[strlen(dname) - 1])) 387 if (isdigit(dname[strlen(dname) - 1]))
387 snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); 388 dev_set_name(pdev, "%sp%d", dname, partno);
388 else 389 else
389 snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); 390 dev_set_name(pdev, "%s%d", dname, partno);
390 391
391 device_initialize(pdev); 392 device_initialize(pdev);
392 pdev->class = &block_class; 393 pdev->class = &block_class;
@@ -447,16 +448,11 @@ void register_disk(struct gendisk *disk)
447 struct block_device *bdev; 448 struct block_device *bdev;
448 struct disk_part_iter piter; 449 struct disk_part_iter piter;
449 struct hd_struct *part; 450 struct hd_struct *part;
450 char *s;
451 int err; 451 int err;
452 452
453 ddev->parent = disk->driverfs_dev; 453 ddev->parent = disk->driverfs_dev;
454 454
455 strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); 455 dev_set_name(ddev, disk->disk_name);
456 /* ewww... some of these buggers have / in the name... */
457 s = strchr(ddev->bus_id, '/');
458 if (s)
459 *s = '!';
460 456
461 /* delay uevents, until we scanned partition table */ 457 /* delay uevents, until we scanned partition table */
462 ddev->uevent_suppress = 1; 458 ddev->uevent_suppress = 1;
diff --git a/fs/pipe.c b/fs/pipe.c
index aaf797bd57b9..891697112f66 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
1016 goto err_fdr; 1016 goto err_fdr;
1017 fdw = error; 1017 fdw = error;
1018 1018
1019 error = audit_fd_pair(fdr, fdw); 1019 audit_fd_pair(fdr, fdw);
1020 if (error < 0)
1021 goto err_fdw;
1022
1023 fd_install(fdr, fr); 1020 fd_install(fdr, fr);
1024 fd_install(fdw, fw); 1021 fd_install(fdw, fw);
1025 fd[0] = fdr; 1022 fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
1027 1024
1028 return 0; 1025 return 0;
1029 1026
1030 err_fdw:
1031 put_unused_fd(fdw);
1032 err_fdr: 1027 err_fdr:
1033 put_unused_fd(fdr); 1028 put_unused_fd(fdr);
1034 err_read_pipe: 1029 err_read_pipe:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b3..0c9de19a1633 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
65#include <linux/mm.h> 65#include <linux/mm.h>
66#include <linux/rcupdate.h> 66#include <linux/rcupdate.h>
67#include <linux/kallsyms.h> 67#include <linux/kallsyms.h>
68#include <linux/stacktrace.h>
68#include <linux/resource.h> 69#include <linux/resource.h>
69#include <linux/module.h> 70#include <linux/module.h>
70#include <linux/mount.h> 71#include <linux/mount.h>
@@ -109,25 +110,22 @@ struct pid_entry {
109 .op = OP, \ 110 .op = OP, \
110} 111}
111 112
112#define DIR(NAME, MODE, OTYPE) \ 113#define DIR(NAME, MODE, iops, fops) \
113 NOD(NAME, (S_IFDIR|(MODE)), \ 114 NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
114 &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ 115#define LNK(NAME, get_link) \
115 {} )
116#define LNK(NAME, OTYPE) \
117 NOD(NAME, (S_IFLNK|S_IRWXUGO), \ 116 NOD(NAME, (S_IFLNK|S_IRWXUGO), \
118 &proc_pid_link_inode_operations, NULL, \ 117 &proc_pid_link_inode_operations, NULL, \
119 { .proc_get_link = &proc_##OTYPE##_link } ) 118 { .proc_get_link = get_link } )
120#define REG(NAME, MODE, OTYPE) \ 119#define REG(NAME, MODE, fops) \
121 NOD(NAME, (S_IFREG|(MODE)), NULL, \ 120 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
122 &proc_##OTYPE##_operations, {}) 121#define INF(NAME, MODE, read) \
123#define INF(NAME, MODE, OTYPE) \
124 NOD(NAME, (S_IFREG|(MODE)), \ 122 NOD(NAME, (S_IFREG|(MODE)), \
125 NULL, &proc_info_file_operations, \ 123 NULL, &proc_info_file_operations, \
126 { .proc_read = &proc_##OTYPE } ) 124 { .proc_read = read } )
127#define ONE(NAME, MODE, OTYPE) \ 125#define ONE(NAME, MODE, show) \
128 NOD(NAME, (S_IFREG|(MODE)), \ 126 NOD(NAME, (S_IFREG|(MODE)), \
129 NULL, &proc_single_file_operations, \ 127 NULL, &proc_single_file_operations, \
130 { .proc_show = &proc_##OTYPE } ) 128 { .proc_show = show } )
131 129
132/* 130/*
133 * Count the number of hardlinks for the pid_entry table, excluding the . 131 * Count the number of hardlinks for the pid_entry table, excluding the .
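
The macro rework above trades token pasting for explicit identifiers: DIR/LNK/REG/INF/ONE callers now spell out the full operations symbol, which is why the big tgid/tid tables below change on every line, and why those symbols finally show up under grep and ctags. A self-contained userspace model of the difference; every name in it is illustrative, not a kernel symbol:

	#include <stdio.h>

	struct file_operations { const char *name; };
	static const struct file_operations proc_maps_operations = { "maps" };

	/* Old style: paste the short name into an identifier. */
	#define REG_OLD(NAME, OTYPE)	{ NAME, &proc_##OTYPE##_operations }
	/* New style: the caller writes the full identifier. */
	#define REG_NEW(NAME, fops)	{ NAME, &fops }

	struct pid_entry {
		const char *name;
		const struct file_operations *fops;
	};

	static const struct pid_entry tbl[] = {
		REG_OLD("maps", maps),		/* -> &proc_maps_operations */
		REG_NEW("maps", proc_maps_operations),	/* same, but greppable */
	};

	int main(void)
	{
		printf("%s %s\n", tbl[0].fops->name, tbl[1].fops->name);
		return 0;
	}
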
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
308 struct mm_struct *mm = get_task_mm(task); 306 struct mm_struct *mm = get_task_mm(task);
309 if (mm) { 307 if (mm) {
310 unsigned int nwords = 0; 308 unsigned int nwords = 0;
311 do 309 do {
312 nwords += 2; 310 nwords += 2;
313 while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ 311 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
314 res = nwords * sizeof(mm->saved_auxv[0]); 312 res = nwords * sizeof(mm->saved_auxv[0]);
315 if (res > PAGE_SIZE) 313 if (res > PAGE_SIZE)
316 res = PAGE_SIZE; 314 res = PAGE_SIZE;
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
340} 338}
341#endif /* CONFIG_KALLSYMS */ 339#endif /* CONFIG_KALLSYMS */
342 340
341#ifdef CONFIG_STACKTRACE
342
343#define MAX_STACK_TRACE_DEPTH 64
344
345static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
346 struct pid *pid, struct task_struct *task)
347{
348 struct stack_trace trace;
349 unsigned long *entries;
350 int i;
351
352 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
353 if (!entries)
354 return -ENOMEM;
355
356 trace.nr_entries = 0;
357 trace.max_entries = MAX_STACK_TRACE_DEPTH;
358 trace.entries = entries;
359 trace.skip = 0;
360 save_stack_trace_tsk(task, &trace);
361
362 for (i = 0; i < trace.nr_entries; i++) {
363 seq_printf(m, "[<%p>] %pS\n",
364 (void *)entries[i], (void *)entries[i]);
365 }
366 kfree(entries);
367
368 return 0;
369}
370#endif
371
343#ifdef CONFIG_SCHEDSTATS 372#ifdef CONFIG_SCHEDSTATS
344/* 373/*
345 * Provides /proc/PID/schedstat 374 * Provides /proc/PID/schedstat
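
proc_pid_stack() above saves up to 64 frames with save_stack_trace_tsk() and prints each one through the %pS printk extension, so every line of the new /proc/PID/stack file reads like "[<address>] symbol+offset/size". A small userspace reader, assuming a kernel built with CONFIG_STACKTRACE; the file is created 0400, so read your own processes or run as root:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/self/stack", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}
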
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v)
1186 struct inode *inode = m->private; 1215 struct inode *inode = m->private;
1187 struct task_struct *p; 1216 struct task_struct *p;
1188 1217
1189 WARN_ON(!inode);
1190
1191 p = get_proc_task(inode); 1218 p = get_proc_task(inode);
1192 if (!p) 1219 if (!p)
1193 return -ESRCH; 1220 return -ESRCH;
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf,
1205 struct inode *inode = file->f_path.dentry->d_inode; 1232 struct inode *inode = file->f_path.dentry->d_inode;
1206 struct task_struct *p; 1233 struct task_struct *p;
1207 1234
1208 WARN_ON(!inode);
1209
1210 p = get_proc_task(inode); 1235 p = get_proc_task(inode);
1211 if (!p) 1236 if (!p)
1212 return -ESRCH; 1237 return -ESRCH;
@@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1426 if (!ei->pid) 1451 if (!ei->pid)
1427 goto out_unlock; 1452 goto out_unlock;
1428 1453
1429 inode->i_uid = 0;
1430 inode->i_gid = 0;
1431 if (task_dumpable(task)) { 1454 if (task_dumpable(task)) {
1432 rcu_read_lock(); 1455 rcu_read_lock();
1433 cred = __task_cred(task); 1456 cred = __task_cred(task);
@@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1976 const struct pid_entry *ents, 1999 const struct pid_entry *ents,
1977 unsigned int nents) 2000 unsigned int nents)
1978{ 2001{
1979 struct inode *inode;
1980 struct dentry *error; 2002 struct dentry *error;
1981 struct task_struct *task = get_proc_task(dir); 2003 struct task_struct *task = get_proc_task(dir);
1982 const struct pid_entry *p, *last; 2004 const struct pid_entry *p, *last;
1983 2005
1984 error = ERR_PTR(-ENOENT); 2006 error = ERR_PTR(-ENOENT);
1985 inode = NULL;
1986 2007
1987 if (!task) 2008 if (!task)
1988 goto out_no_task; 2009 goto out_no_task;
@@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = {
2138}; 2159};
2139 2160
2140static const struct pid_entry attr_dir_stuff[] = { 2161static const struct pid_entry attr_dir_stuff[] = {
2141 REG("current", S_IRUGO|S_IWUGO, pid_attr), 2162 REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2142 REG("prev", S_IRUGO, pid_attr), 2163 REG("prev", S_IRUGO, proc_pid_attr_operations),
2143 REG("exec", S_IRUGO|S_IWUGO, pid_attr), 2164 REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2144 REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), 2165 REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2145 REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), 2166 REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2146 REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), 2167 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2147}; 2168};
2148 2169
2149static int proc_attr_dir_readdir(struct file * filp, 2170static int proc_attr_dir_readdir(struct file * filp,
@@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2349 if (!ei->pid) 2370 if (!ei->pid)
2350 goto out_iput; 2371 goto out_iput;
2351 2372
2352 inode->i_uid = 0;
2353 inode->i_gid = 0;
2354 inode->i_mode = p->mode; 2373 inode->i_mode = p->mode;
2355 if (S_ISDIR(inode->i_mode)) 2374 if (S_ISDIR(inode->i_mode))
2356 inode->i_nlink = 2; 2375 inode->i_nlink = 2;
@@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations;
2465static const struct inode_operations proc_task_inode_operations; 2484static const struct inode_operations proc_task_inode_operations;
2466 2485
2467static const struct pid_entry tgid_base_stuff[] = { 2486static const struct pid_entry tgid_base_stuff[] = {
2468 DIR("task", S_IRUGO|S_IXUGO, task), 2487 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2469 DIR("fd", S_IRUSR|S_IXUSR, fd), 2488 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2470 DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), 2489 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2471#ifdef CONFIG_NET 2490#ifdef CONFIG_NET
2472 DIR("net", S_IRUGO|S_IXUGO, net), 2491 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2473#endif 2492#endif
2474 REG("environ", S_IRUSR, environ), 2493 REG("environ", S_IRUSR, proc_environ_operations),
2475 INF("auxv", S_IRUSR, pid_auxv), 2494 INF("auxv", S_IRUSR, proc_pid_auxv),
2476 ONE("status", S_IRUGO, pid_status), 2495 ONE("status", S_IRUGO, proc_pid_status),
2477 ONE("personality", S_IRUSR, pid_personality), 2496 ONE("personality", S_IRUSR, proc_pid_personality),
2478 INF("limits", S_IRUSR, pid_limits), 2497 INF("limits", S_IRUSR, proc_pid_limits),
2479#ifdef CONFIG_SCHED_DEBUG 2498#ifdef CONFIG_SCHED_DEBUG
2480 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2499 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2481#endif 2500#endif
2482#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2501#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2483 INF("syscall", S_IRUSR, pid_syscall), 2502 INF("syscall", S_IRUSR, proc_pid_syscall),
2484#endif 2503#endif
2485 INF("cmdline", S_IRUGO, pid_cmdline), 2504 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2486 ONE("stat", S_IRUGO, tgid_stat), 2505 ONE("stat", S_IRUGO, proc_tgid_stat),
2487 ONE("statm", S_IRUGO, pid_statm), 2506 ONE("statm", S_IRUGO, proc_pid_statm),
2488 REG("maps", S_IRUGO, maps), 2507 REG("maps", S_IRUGO, proc_maps_operations),
2489#ifdef CONFIG_NUMA 2508#ifdef CONFIG_NUMA
2490 REG("numa_maps", S_IRUGO, numa_maps), 2509 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2491#endif 2510#endif
2492 REG("mem", S_IRUSR|S_IWUSR, mem), 2511 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2493 LNK("cwd", cwd), 2512 LNK("cwd", proc_cwd_link),
2494 LNK("root", root), 2513 LNK("root", proc_root_link),
2495 LNK("exe", exe), 2514 LNK("exe", proc_exe_link),
2496 REG("mounts", S_IRUGO, mounts), 2515 REG("mounts", S_IRUGO, proc_mounts_operations),
2497 REG("mountinfo", S_IRUGO, mountinfo), 2516 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2498 REG("mountstats", S_IRUSR, mountstats), 2517 REG("mountstats", S_IRUSR, proc_mountstats_operations),
2499#ifdef CONFIG_PROC_PAGE_MONITOR 2518#ifdef CONFIG_PROC_PAGE_MONITOR
2500 REG("clear_refs", S_IWUSR, clear_refs), 2519 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2501 REG("smaps", S_IRUGO, smaps), 2520 REG("smaps", S_IRUGO, proc_smaps_operations),
2502 REG("pagemap", S_IRUSR, pagemap), 2521 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2503#endif 2522#endif
2504#ifdef CONFIG_SECURITY 2523#ifdef CONFIG_SECURITY
2505 DIR("attr", S_IRUGO|S_IXUGO, attr_dir), 2524 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2506#endif 2525#endif
2507#ifdef CONFIG_KALLSYMS 2526#ifdef CONFIG_KALLSYMS
2508 INF("wchan", S_IRUGO, pid_wchan), 2527 INF("wchan", S_IRUGO, proc_pid_wchan),
2528#endif
2529#ifdef CONFIG_STACKTRACE
2530 ONE("stack", S_IRUSR, proc_pid_stack),
2509#endif 2531#endif
2510#ifdef CONFIG_SCHEDSTATS 2532#ifdef CONFIG_SCHEDSTATS
2511 INF("schedstat", S_IRUGO, pid_schedstat), 2533 INF("schedstat", S_IRUGO, proc_pid_schedstat),
2512#endif 2534#endif
2513#ifdef CONFIG_LATENCYTOP 2535#ifdef CONFIG_LATENCYTOP
2514 REG("latency", S_IRUGO, lstats), 2536 REG("latency", S_IRUGO, proc_lstats_operations),
2515#endif 2537#endif
2516#ifdef CONFIG_PROC_PID_CPUSET 2538#ifdef CONFIG_PROC_PID_CPUSET
2517 REG("cpuset", S_IRUGO, cpuset), 2539 REG("cpuset", S_IRUGO, proc_cpuset_operations),
2518#endif 2540#endif
2519#ifdef CONFIG_CGROUPS 2541#ifdef CONFIG_CGROUPS
2520 REG("cgroup", S_IRUGO, cgroup), 2542 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2521#endif 2543#endif
2522 INF("oom_score", S_IRUGO, oom_score), 2544 INF("oom_score", S_IRUGO, proc_oom_score),
2523 REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), 2545 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2524#ifdef CONFIG_AUDITSYSCALL 2546#ifdef CONFIG_AUDITSYSCALL
2525 REG("loginuid", S_IWUSR|S_IRUGO, loginuid), 2547 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2526 REG("sessionid", S_IRUGO, sessionid), 2548 REG("sessionid", S_IRUGO, proc_sessionid_operations),
2527#endif 2549#endif
2528#ifdef CONFIG_FAULT_INJECTION 2550#ifdef CONFIG_FAULT_INJECTION
2529 REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), 2551 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2530#endif 2552#endif
2531#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 2553#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2532 REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), 2554 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2533#endif 2555#endif
2534#ifdef CONFIG_TASK_IO_ACCOUNTING 2556#ifdef CONFIG_TASK_IO_ACCOUNTING
2535 INF("io", S_IRUGO, tgid_io_accounting), 2557 INF("io", S_IRUGO, proc_tgid_io_accounting),
2536#endif 2558#endif
2537}; 2559};
2538 2560
@@ -2805,66 +2827,69 @@ out_no_task:
2805 * Tasks 2827 * Tasks
2806 */ 2828 */
2807static const struct pid_entry tid_base_stuff[] = { 2829static const struct pid_entry tid_base_stuff[] = {
2808 DIR("fd", S_IRUSR|S_IXUSR, fd), 2830 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2809 DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), 2831 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
2810 REG("environ", S_IRUSR, environ), 2832 REG("environ", S_IRUSR, proc_environ_operations),
2811 INF("auxv", S_IRUSR, pid_auxv), 2833 INF("auxv", S_IRUSR, proc_pid_auxv),
2812 ONE("status", S_IRUGO, pid_status), 2834 ONE("status", S_IRUGO, proc_pid_status),
2813 ONE("personality", S_IRUSR, pid_personality), 2835 ONE("personality", S_IRUSR, proc_pid_personality),
2814 INF("limits", S_IRUSR, pid_limits), 2836 INF("limits", S_IRUSR, proc_pid_limits),
2815#ifdef CONFIG_SCHED_DEBUG 2837#ifdef CONFIG_SCHED_DEBUG
2816 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2838 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2817#endif 2839#endif
2818#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2840#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2819 INF("syscall", S_IRUSR, pid_syscall), 2841 INF("syscall", S_IRUSR, proc_pid_syscall),
2820#endif 2842#endif
2821 INF("cmdline", S_IRUGO, pid_cmdline), 2843 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2822 ONE("stat", S_IRUGO, tid_stat), 2844 ONE("stat", S_IRUGO, proc_tid_stat),
2823 ONE("statm", S_IRUGO, pid_statm), 2845 ONE("statm", S_IRUGO, proc_pid_statm),
2824 REG("maps", S_IRUGO, maps), 2846 REG("maps", S_IRUGO, proc_maps_operations),
2825#ifdef CONFIG_NUMA 2847#ifdef CONFIG_NUMA
2826 REG("numa_maps", S_IRUGO, numa_maps), 2848 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2827#endif 2849#endif
2828 REG("mem", S_IRUSR|S_IWUSR, mem), 2850 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2829 LNK("cwd", cwd), 2851 LNK("cwd", proc_cwd_link),
2830 LNK("root", root), 2852 LNK("root", proc_root_link),
2831 LNK("exe", exe), 2853 LNK("exe", proc_exe_link),
2832 REG("mounts", S_IRUGO, mounts), 2854 REG("mounts", S_IRUGO, proc_mounts_operations),
2833 REG("mountinfo", S_IRUGO, mountinfo), 2855 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2834#ifdef CONFIG_PROC_PAGE_MONITOR 2856#ifdef CONFIG_PROC_PAGE_MONITOR
2835 REG("clear_refs", S_IWUSR, clear_refs), 2857 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2836 REG("smaps", S_IRUGO, smaps), 2858 REG("smaps", S_IRUGO, proc_smaps_operations),
2837 REG("pagemap", S_IRUSR, pagemap), 2859 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2838#endif 2860#endif
2839#ifdef CONFIG_SECURITY 2861#ifdef CONFIG_SECURITY
2840 DIR("attr", S_IRUGO|S_IXUGO, attr_dir), 2862 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2841#endif 2863#endif
2842#ifdef CONFIG_KALLSYMS 2864#ifdef CONFIG_KALLSYMS
2843 INF("wchan", S_IRUGO, pid_wchan), 2865 INF("wchan", S_IRUGO, proc_pid_wchan),
2866#endif
2867#ifdef CONFIG_STACKTRACE
2868 ONE("stack", S_IRUSR, proc_pid_stack),
2844#endif 2869#endif
2845#ifdef CONFIG_SCHEDSTATS 2870#ifdef CONFIG_SCHEDSTATS
2846 INF("schedstat", S_IRUGO, pid_schedstat), 2871 INF("schedstat", S_IRUGO, proc_pid_schedstat),
2847#endif 2872#endif
2848#ifdef CONFIG_LATENCYTOP 2873#ifdef CONFIG_LATENCYTOP
2849 REG("latency", S_IRUGO, lstats), 2874 REG("latency", S_IRUGO, proc_lstats_operations),
2850#endif 2875#endif
2851#ifdef CONFIG_PROC_PID_CPUSET 2876#ifdef CONFIG_PROC_PID_CPUSET
2852 REG("cpuset", S_IRUGO, cpuset), 2877 REG("cpuset", S_IRUGO, proc_cpuset_operations),
2853#endif 2878#endif
2854#ifdef CONFIG_CGROUPS 2879#ifdef CONFIG_CGROUPS
2855 REG("cgroup", S_IRUGO, cgroup), 2880 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2856#endif 2881#endif
2857 INF("oom_score", S_IRUGO, oom_score), 2882 INF("oom_score", S_IRUGO, proc_oom_score),
2858 REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), 2883 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2859#ifdef CONFIG_AUDITSYSCALL 2884#ifdef CONFIG_AUDITSYSCALL
2860 REG("loginuid", S_IWUSR|S_IRUGO, loginuid), 2885 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2861 REG("sessionid", S_IRUSR, sessionid), 2886 REG("sessionid", S_IRUSR, proc_sessionid_operations),
2862#endif 2887#endif
2863#ifdef CONFIG_FAULT_INJECTION 2888#ifdef CONFIG_FAULT_INJECTION
2864 REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), 2889 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2865#endif 2890#endif
2866#ifdef CONFIG_TASK_IO_ACCOUNTING 2891#ifdef CONFIG_TASK_IO_ACCOUNTING
2867 INF("io", S_IRUGO, tid_io_accounting), 2892 INF("io", S_IRUGO, proc_tid_io_accounting),
2868#endif 2893#endif
2869}; 2894};
2870 2895
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 60a359b35582..db7fa5cab988 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -14,7 +14,6 @@
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/smp_lock.h>
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/idr.h> 18#include <linux/idr.h>
20#include <linux/namei.h> 19#include <linux/namei.h>
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
379 struct inode *inode = NULL; 378 struct inode *inode = NULL;
380 int error = -ENOENT; 379 int error = -ENOENT;
381 380
382 lock_kernel();
383 spin_lock(&proc_subdir_lock); 381 spin_lock(&proc_subdir_lock);
384 for (de = de->subdir; de ; de = de->next) { 382 for (de = de->subdir; de ; de = de->next) {
385 if (de->namelen != dentry->d_name.len) 383 if (de->namelen != dentry->d_name.len)
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
397 } 395 }
398 spin_unlock(&proc_subdir_lock); 396 spin_unlock(&proc_subdir_lock);
399out_unlock: 397out_unlock:
400 unlock_kernel();
401 398
402 if (inode) { 399 if (inode) {
403 dentry->d_op = &proc_dentry_operations; 400 dentry->d_op = &proc_dentry_operations;
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
432 struct inode *inode = filp->f_path.dentry->d_inode; 429 struct inode *inode = filp->f_path.dentry->d_inode;
433 int ret = 0; 430 int ret = 0;
434 431
435 lock_kernel();
436
437 ino = inode->i_ino; 432 ino = inode->i_ino;
438 i = filp->f_pos; 433 i = filp->f_pos;
439 switch (i) { 434 switch (i) {
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
487 spin_unlock(&proc_subdir_lock); 482 spin_unlock(&proc_subdir_lock);
488 } 483 }
489 ret = 1; 484 ret = 1;
490out: unlock_kernel(); 485out:
491 return ret; 486 return ret;
492} 487}
493 488
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
504 * the /proc directory. 499 * the /proc directory.
505 */ 500 */
506static const struct file_operations proc_dir_operations = { 501static const struct file_operations proc_dir_operations = {
502 .llseek = generic_file_llseek,
507 .read = generic_read_dir, 503 .read = generic_read_dir,
508 .readdir = proc_readdir, 504 .readdir = proc_readdir,
509}; 505};
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c658..3e76bb9b3ad6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de)
35 */ 35 */
36void de_put(struct proc_dir_entry *de) 36void de_put(struct proc_dir_entry *de)
37{ 37{
38 lock_kernel();
39 if (!atomic_read(&de->count)) { 38 if (!atomic_read(&de->count)) {
40 printk("de_put: entry %s already free!\n", de->name); 39 printk("de_put: entry %s already free!\n", de->name);
41 unlock_kernel();
42 return; 40 return;
43 } 41 }
44 42
45 if (atomic_dec_and_test(&de->count)) 43 if (atomic_dec_and_test(&de->count))
46 free_proc_entry(de); 44 free_proc_entry(de);
47 unlock_kernel();
48} 45}
49 46
50/* 47/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e8aeb8b61ce..cd53ff838498 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -41,8 +41,6 @@ do { \
41 (vmi)->used = 0; \ 41 (vmi)->used = 0; \
42 (vmi)->largest_chunk = 0; \ 42 (vmi)->largest_chunk = 0; \
43} while(0) 43} while(0)
44
45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
46#endif 44#endif
47 45
48extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, 46extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b1675c4e66da..43d23948384a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
74 "LowTotal: %8lu kB\n" 74 "LowTotal: %8lu kB\n"
75 "LowFree: %8lu kB\n" 75 "LowFree: %8lu kB\n"
76#endif 76#endif
77#ifndef CONFIG_MMU
78 "MmapCopy: %8lu kB\n"
79#endif
77 "SwapTotal: %8lu kB\n" 80 "SwapTotal: %8lu kB\n"
78 "SwapFree: %8lu kB\n" 81 "SwapFree: %8lu kB\n"
79 "Dirty: %8lu kB\n" 82 "Dirty: %8lu kB\n"
@@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
116 K(i.totalram-i.totalhigh), 119 K(i.totalram-i.totalhigh),
117 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
118#endif 121#endif
122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)),
124#endif
119 K(i.totalswap), 125 K(i.totalswap),
120 K(i.freeswap), 126 K(i.freeswap),
121 K(global_page_state(NR_FILE_DIRTY)), 127 K(global_page_state(NR_FILE_DIRTY)),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3f87d2632947..b446d7ad0b0d 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,33 +33,33 @@
33#include "internal.h" 33#include "internal.h"
34 34
35/* 35/*
36 * display a single VMA to a sequenced file 36 * display a single region to a sequenced file
37 */ 37 */
38int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) 38static int nommu_region_show(struct seq_file *m, struct vm_region *region)
39{ 39{
40 unsigned long ino = 0; 40 unsigned long ino = 0;
41 struct file *file; 41 struct file *file;
42 dev_t dev = 0; 42 dev_t dev = 0;
43 int flags, len; 43 int flags, len;
44 44
45 flags = vma->vm_flags; 45 flags = region->vm_flags;
46 file = vma->vm_file; 46 file = region->vm_file;
47 47
48 if (file) { 48 if (file) {
49 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 49 struct inode *inode = region->vm_file->f_path.dentry->d_inode;
50 dev = inode->i_sb->s_dev; 50 dev = inode->i_sb->s_dev;
51 ino = inode->i_ino; 51 ino = inode->i_ino;
52 } 52 }
53 53
54 seq_printf(m, 54 seq_printf(m,
55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
56 vma->vm_start, 56 region->vm_start,
57 vma->vm_end, 57 region->vm_end,
58 flags & VM_READ ? 'r' : '-', 58 flags & VM_READ ? 'r' : '-',
59 flags & VM_WRITE ? 'w' : '-', 59 flags & VM_WRITE ? 'w' : '-',
60 flags & VM_EXEC ? 'x' : '-', 60 flags & VM_EXEC ? 'x' : '-',
61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
62 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 62 ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
63 MAJOR(dev), MINOR(dev), ino, &len); 63 MAJOR(dev), MINOR(dev), ino, &len);
64 64
65 if (file) { 65 if (file) {
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
75} 75}
76 76
77/* 77/*
78 * display a list of all the VMAs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
 79 * - nommu kernels have a single flat list 79 * - nommu kernels have a single flat list
80 */ 80 */
81static int nommu_vma_list_show(struct seq_file *m, void *v) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
83 struct vm_area_struct *vma; 83 struct rb_node *p = _p;
84 84
85 vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); 85 return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
86 return nommu_vma_show(m, vma);
87} 86}
88 87
89static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) 88static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
90{ 89{
91 struct rb_node *_rb; 90 struct rb_node *p;
92 loff_t pos = *_pos; 91 loff_t pos = *_pos;
93 void *next = NULL;
94 92
95 down_read(&nommu_vma_sem); 93 down_read(&nommu_region_sem);
96 94
97 for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { 95 for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
98 if (pos == 0) { 96 if (pos-- == 0)
99 next = _rb; 97 return p;
100 break; 98 return NULL;
101 }
102 pos--;
103 }
104
105 return next;
106} 99}
107 100
108static void nommu_vma_list_stop(struct seq_file *m, void *v) 101static void nommu_region_list_stop(struct seq_file *m, void *v)
109{ 102{
110 up_read(&nommu_vma_sem); 103 up_read(&nommu_region_sem);
111} 104}
112 105
113static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) 106static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
114{ 107{
115 (*pos)++; 108 (*pos)++;
116 return rb_next((struct rb_node *) v); 109 return rb_next((struct rb_node *) v);
117} 110}
118 111
119static const struct seq_operations proc_nommu_vma_list_seqop = { 112static struct seq_operations proc_nommu_region_list_seqop = {
120 .start = nommu_vma_list_start, 113 .start = nommu_region_list_start,
121 .next = nommu_vma_list_next, 114 .next = nommu_region_list_next,
122 .stop = nommu_vma_list_stop, 115 .stop = nommu_region_list_stop,
123 .show = nommu_vma_list_show 116 .show = nommu_region_list_show
124}; 117};
125 118
126static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) 119static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
127{ 120{
128 return seq_open(file, &proc_nommu_vma_list_seqop); 121 return seq_open(file, &proc_nommu_region_list_seqop);
129} 122}
130 123
131static const struct file_operations proc_nommu_vma_list_operations = { 124static const struct file_operations proc_nommu_region_list_operations = {
132 .open = proc_nommu_vma_list_open, 125 .open = proc_nommu_region_list_open,
133 .read = seq_read, 126 .read = seq_read,
134 .llseek = seq_lseek, 127 .llseek = seq_lseek,
135 .release = seq_release, 128 .release = seq_release,
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = {
137 130
138static int __init proc_nommu_init(void) 131static int __init proc_nommu_init(void)
139{ 132{
140 proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); 133 proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
141 return 0; 134 return 0;
142} 135}
143 136
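
The rewritten ->start callback is the compact seq_file idiom for positioning an iterator in an rbtree: take the semaphore, then walk forward while decrementing the requested position, returning the node where it hits zero. ->stop releases the semaphore, so the lock is held across one read batch but never between reads. A userspace model of just the decrement-and-test walk (illustrative):

	#include <stdio.h>

	static int demo[] = { 10, 20, 30, 40 };
	#define N (sizeof(demo) / sizeof(demo[0]))

	/* Visit exactly pos elements, then return the next one
	 * (or NULL past the end), as nommu_region_list_start() does. */
	static int *walk_to(long pos)
	{
		for (unsigned int i = 0; i < N; i++)
			if (pos-- == 0)
				return &demo[i];
		return NULL;
	}

	int main(void)
	{
		int *p = walk_to(2);
		printf("%d\n", p ? *p : -1);	/* prints 30 */
		return 0;
	}
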
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7bc296f424ae..04d1270f1c38 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -18,7 +18,6 @@
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/smp_lock.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
23#include <linux/nsproxy.h> 22#include <linux/nsproxy.h>
24#include <net/net_namespace.h> 23#include <net/net_namespace.h>
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
172} 171}
173 172
174const struct file_operations proc_net_operations = { 173const struct file_operations proc_net_operations = {
174 .llseek = generic_file_llseek,
175 .read = generic_read_dir, 175 .read = generic_read_dir,
176 .readdir = proc_tgid_net_readdir, 176 .readdir = proc_tgid_net_readdir,
177}; 177};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9e..94fcfff6863a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
 	inode->i_mode = table->mode;
-	inode->i_uid = inode->i_gid = 0;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7761602af9de..f6299a25594e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,7 +16,6 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp,
 	unsigned int nr = filp->f_pos;
 	int ret;
 
-	lock_kernel();
-
 	if (nr < FIRST_PROCESS_ENTRY) {
 		int error = proc_readdir(filp, dirent, filldir);
-		if (error <= 0) {
-			unlock_kernel();
+		if (error <= 0)
 			return error;
-		}
 		filp->f_pos = FIRST_PROCESS_ENTRY;
 	}
-	unlock_kernel();
 
 	ret = proc_pid_readdir(filp, dirent, filldir);
 	return ret;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81904f07679d..f75efa22df5e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,6 +9,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/irqnr.h>
 #include <asm/cputime.h>
 
 #ifndef arch_irq_stat_cpu
@@ -44,10 +45,9 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
-
+		}
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -92,7 +92,6 @@ static int show_stat(struct seq_file *p, void *v)
 	/* sum again ? it could be updated? */
 	for_each_irq_nr(j) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3a8bdd7f5756..94063840832a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v)
396 "Private_Clean: %8lu kB\n" 396 "Private_Clean: %8lu kB\n"
397 "Private_Dirty: %8lu kB\n" 397 "Private_Dirty: %8lu kB\n"
398 "Referenced: %8lu kB\n" 398 "Referenced: %8lu kB\n"
399 "Swap: %8lu kB\n", 399 "Swap: %8lu kB\n"
400 "KernelPageSize: %8lu kB\n"
401 "MMUPageSize: %8lu kB\n",
400 (vma->vm_end - vma->vm_start) >> 10, 402 (vma->vm_end - vma->vm_start) >> 10,
401 mss.resident >> 10, 403 mss.resident >> 10,
402 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 404 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.private_clean >> 10,
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
-		   mss.swap >> 10);
+		   mss.swap >> 10,
+		   vma_kernel_pagesize(vma) >> 10,
+		   vma_mmu_pagesize(vma) >> 10);
 
 	if (m->count < m->size) /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
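Editor's note: after this change each /proc/pid/smaps record gains two lines. An illustrative tail of one record (the kB values are hypothetical; on most configurations both new fields simply report the base page size, and they only diverge where the MMU backs the VMA with a different page size, e.g. hugepage mappings):

	Referenced:           12 kB
	Swap:                  0 kB
	KernelPageSize:        4 kB
	MMUPageSize:           4 kB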
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 219bd79ea894..343ea1216bc8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -9,31 +9,38 @@
 
 /*
  * Logic: we've got two memory sums for each process, "shared", and
- * "non-shared". Shared memory may get counted more then once, for
+ * "non-shared". Shared memory may get counted more than once, for
  * each process that owns it. Non-shared memory is counted
  * accurately.
  */
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	struct vm_list_struct *vml;
-	unsigned long bytes = 0, sbytes = 0, slack = 0;
+	struct vm_area_struct *vma;
+	struct vm_region *region;
+	struct rb_node *p;
+	unsigned long bytes = 0, sbytes = 0, slack = 0, size;
 
 	down_read(&mm->mmap_sem);
-	for (vml = mm->context.vmlist; vml; vml = vml->next) {
-		if (!vml->vma)
-			continue;
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+
+		bytes += kobjsize(vma);
+
+		region = vma->vm_region;
+		if (region) {
+			size = kobjsize(region);
+			size += region->vm_end - region->vm_start;
+		} else {
+			size = vma->vm_end - vma->vm_start;
+		}
 
-		bytes += kobjsize(vml);
 		if (atomic_read(&mm->mm_count) > 1 ||
-		    atomic_read(&vml->vma->vm_usage) > 1
-		    ) {
-			sbytes += kobjsize((void *) vml->vma->vm_start);
-			sbytes += kobjsize(vml->vma);
+		    vma->vm_flags & VM_MAYSHARE) {
+			sbytes += size;
 		} else {
-			bytes += kobjsize((void *) vml->vma->vm_start);
-			bytes += kobjsize(vml->vma);
-			slack += kobjsize((void *) vml->vma->vm_start) -
-				(vml->vma->vm_end - vml->vma->vm_start);
+			bytes += size;
+			if (region)
+				slack = region->vm_end - vma->vm_end;
 		}
 	}
 
@@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 
 unsigned long task_vsize(struct mm_struct *mm)
 {
-	struct vm_list_struct *tbp;
+	struct vm_area_struct *vma;
+	struct rb_node *p;
 	unsigned long vsize = 0;
 
 	down_read(&mm->mmap_sem);
-	for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
-		if (tbp->vma)
-			vsize += kobjsize((void *) tbp->vma->vm_start);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		vsize += vma->vm_end - vma->vm_start;
 	}
 	up_read(&mm->mmap_sem);
 	return vsize;
@@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	struct vm_list_struct *tbp;
+	struct vm_area_struct *vma;
+	struct vm_region *region;
+	struct rb_node *p;
 	int size = kobjsize(mm);
 
 	down_read(&mm->mmap_sem);
-	for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
-		size += kobjsize(tbp);
-		if (tbp->vma) {
-			size += kobjsize(tbp->vma);
-			size += kobjsize((void *) tbp->vma->vm_start);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		size += kobjsize(vma);
+		region = vma->vm_region;
+		if (region) {
+			size += kobjsize(region);
+			size += region->vm_end - region->vm_start;
 		}
 	}
 
@@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 }
 
 /*
+ * display a single VMA to a sequenced file
+ */
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+{
+	unsigned long ino = 0;
+	struct file *file;
+	dev_t dev = 0;
+	int flags, len;
+
+	flags = vma->vm_flags;
+	file = vma->vm_file;
+
+	if (file) {
+		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		dev = inode->i_sb->s_dev;
+		ino = inode->i_ino;
+	}
+
+	seq_printf(m,
+		   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+		   vma->vm_start,
+		   vma->vm_end,
+		   flags & VM_READ ? 'r' : '-',
+		   flags & VM_WRITE ? 'w' : '-',
+		   flags & VM_EXEC ? 'x' : '-',
+		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+		   vma->vm_pgoff << PAGE_SHIFT,
+		   MAJOR(dev), MINOR(dev), ino, &len);
+
+	if (file) {
+		len = 25 + sizeof(void *) * 6 - len;
+		if (len < 1)
+			len = 1;
+		seq_printf(m, "%*c", len, ' ');
+		seq_path(m, &file->f_path, "");
+	}
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+/*
  * display mapping lines for a particular process's /proc/pid/maps
  */
-static int show_map(struct seq_file *m, void *_vml)
+static int show_map(struct seq_file *m, void *_p)
 {
-	struct vm_list_struct *vml = _vml;
+	struct rb_node *p = _p;
 
-	return nommu_vma_show(m, vml->vma);
+	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
 }
 
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
-	struct vm_list_struct *vml;
 	struct mm_struct *mm;
+	struct rb_node *p;
 	loff_t n = *pos;
 
 	/* pin the task and mm whilst we play with them */
@@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	}
 
 	/* start from the Nth VMA */
-	for (vml = mm->context.vmlist; vml; vml = vml->next)
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
 		if (n-- == 0)
-			return vml;
+			return p;
 	return NULL;
 }
 
@@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml)
 	}
 }
 
-static void *m_next(struct seq_file *m, void *_vml, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
 {
-	struct vm_list_struct *vml = _vml;
+	struct rb_node *p = _p;
 
 	(*pos)++;
-	return vml ? vml->next : NULL;
+	return p ? rb_next(p) : NULL;
 }
 
 static const struct seq_operations proc_pid_maps_ops = {
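Editor's note: the conversion above repeatedly uses rb_entry(p, struct vm_area_struct, vm_rb) to get from the rb_node embedded in a VMA back to the VMA itself; rb_entry() is just container_of(). A self-contained sketch with an illustrative structure (vma_like is a hypothetical stand-in, not a kernel type):

	#include <linux/rbtree.h>

	struct vma_like {			/* stand-in for vm_area_struct */
		unsigned long vm_start, vm_end;
		struct rb_node vm_rb;		/* node embedded in the object */
	};

	static unsigned long vma_length(struct rb_node *p)
	{
		/* rb_entry() subtracts offsetof(struct vma_like, vm_rb) from p */
		struct vma_like *vma = rb_entry(p, struct vma_like, vm_rb);

		return vma->vm_end - vma->vm_start;
	}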
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec59504906..5edcc3f92ba7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
 
 	offset = (unsigned long)(*ppos % PAGE_SIZE);
 	pfn = (unsigned long)(*ppos / PAGE_SIZE);
-	if (pfn > saved_max_pfn)
-		return -EINVAL;
 
 	do {
 		if (count > (PAGE_SIZE - offset))
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e01618..4a8c94f05f76 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 	case Q_SETQUOTA:
 	case Q_GETQUOTA:
 		/* This is just informative test so we are satisfied without a lock */
-		if (!sb_has_quota_enabled(sb, type))
+		if (!sb_has_quota_active(sb, type))
 			return -ESRCH;
 	}
 
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	int cnt;
 
 	sb->s_qcop->quota_sync(sb, type);
+
+	if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
+		return;
 	/* This is not very clever (and fast) but currently I don't know about
 	 * any other simple way of getting quota data to disk and we must get
 	 * them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && type != cnt)
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 		if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
 		    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 	__u32 fmt;
 
 	down_read(&sb_dqopt(sb)->dqptr_sem);
-	if (!sb_has_quota_enabled(sb, type)) {
+	if (!sb_has_quota_active(sb, type)) {
 		up_read(&sb_dqopt(sb)->dqptr_sem);
 		return -ESRCH;
 	}
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 000000000000..953404c95b17
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
1/*
2 * vfsv0 quota IO operations on file
3 */
4
5#include <linux/errno.h>
6#include <linux/fs.h>
7#include <linux/mount.h>
8#include <linux/dqblk_v2.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/quotaops.h>
14
15#include <asm/byteorder.h>
16
17#include "quota_tree.h"
18
19MODULE_AUTHOR("Jan Kara");
20MODULE_DESCRIPTION("Quota trie support");
21MODULE_LICENSE("GPL");
22
23#define __QUOTA_QT_PARANOIA
24
25typedef char *dqbuf_t;
26
27static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
28{
29 unsigned int epb = info->dqi_usable_bs >> 2;
30
31 depth = info->dqi_qtree_depth - depth - 1;
32 while (depth--)
33 id /= epb;
34 return id % epb;
35}
36
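Editor's worked example (not part of the file): with the v2 format's 1024-byte blocks, epb = 1024 >> 2 = 256 references per tree block, and for a 4-level tree get_index() reduces to picking one byte of the 32-bit id per level:

	get_index(info, id, 0) == (id >> 24) & 0xff
	get_index(info, id, 1) == (id >> 16) & 0xff
	get_index(info, id, 2) == (id >>  8) & 0xff
	get_index(info, id, 3) ==  id        & 0xff

i.e. exactly the GETIDINDEX() macro that this patch deletes from quota_v2.c.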
37/* Number of entries in one block */
38static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
39{
40 return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
41 / info->dqi_entry_size;
42}
43
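Editor's note: concretely, for the v2 format below (1024-byte usable blocks, 48-byte on-disk entries) this works out to

	(1024 - sizeof(struct qt_disk_dqdbheader)) / 48 = (1024 - 16) / 48 = 21

entries per data block -- the "exactly 21 quota-entries" promised by the header comment in quota_tree.h.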
44static dqbuf_t getdqbuf(size_t size)
45{
46 dqbuf_t buf = kmalloc(size, GFP_NOFS);
47 if (!buf)
48 printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
49 return buf;
50}
51
52static inline void freedqbuf(dqbuf_t buf)
53{
54 kfree(buf);
55}
56
57static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
58{
59 struct super_block *sb = info->dqi_sb;
60
61 memset(buf, 0, info->dqi_usable_bs);
62 return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
63 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
64}
65
66static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
67{
68 struct super_block *sb = info->dqi_sb;
69
70 return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
71 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
72}
73
74/* Remove empty block from list and return it */
75static int get_free_dqblk(struct qtree_mem_dqinfo *info)
76{
77 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
78 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
79 int ret, blk;
80
81 if (!buf)
82 return -ENOMEM;
83 if (info->dqi_free_blk) {
84 blk = info->dqi_free_blk;
85 ret = read_blk(info, blk, buf);
86 if (ret < 0)
87 goto out_buf;
88 info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
89 }
90 else {
91 memset(buf, 0, info->dqi_usable_bs);
92 /* Assure block allocation... */
93 ret = write_blk(info, info->dqi_blocks, buf);
94 if (ret < 0)
95 goto out_buf;
96 blk = info->dqi_blocks++;
97 }
98 mark_info_dirty(info->dqi_sb, info->dqi_type);
99 ret = blk;
100out_buf:
101 freedqbuf(buf);
102 return ret;
103}
104
105/* Insert empty block to the list */
106static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
107{
108 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
109 int err;
110
111 dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
112 dh->dqdh_prev_free = cpu_to_le32(0);
113 dh->dqdh_entries = cpu_to_le16(0);
114 err = write_blk(info, blk, buf);
115 if (err < 0)
116 return err;
117 info->dqi_free_blk = blk;
118 mark_info_dirty(info->dqi_sb, info->dqi_type);
119 return 0;
120}
121
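Editor's sketch of the structure the two helpers above maintain -- a LIFO list of wholly free blocks threaded through the on-disk headers (block numbers illustrative):

	info->dqi_free_blk --> [blk 7: dqdh_next_free = 3] --> [blk 3: dqdh_next_free = 0]

get_free_dqblk() pops the head, or extends the file at dqi_blocks when the list is empty; put_free_dqblk() pushes a freshly emptied block back on and marks the quota info dirty.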
122/* Remove given block from the list of blocks with free entries */
123static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
124{
125 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
126 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
127 uint nextblk = le32_to_cpu(dh->dqdh_next_free);
128 uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
129 int err;
130
131 if (!tmpbuf)
132 return -ENOMEM;
133 if (nextblk) {
134 err = read_blk(info, nextblk, tmpbuf);
135 if (err < 0)
136 goto out_buf;
137 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
138 dh->dqdh_prev_free;
139 err = write_blk(info, nextblk, tmpbuf);
140 if (err < 0)
141 goto out_buf;
142 }
143 if (prevblk) {
144 err = read_blk(info, prevblk, tmpbuf);
145 if (err < 0)
146 goto out_buf;
147 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
148 dh->dqdh_next_free;
149 err = write_blk(info, prevblk, tmpbuf);
150 if (err < 0)
151 goto out_buf;
152 } else {
153 info->dqi_free_entry = nextblk;
154 mark_info_dirty(info->dqi_sb, info->dqi_type);
155 }
156 freedqbuf(tmpbuf);
157 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
158 /* No matter whether the write succeeds, the block is out of the list */
159 if (write_blk(info, blk, buf) < 0)
160 printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
161 return 0;
162out_buf:
163 freedqbuf(tmpbuf);
164 return err;
165}
166
167/* Insert given block to the beginning of list with free entries */
168static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
169{
170 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
171 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
172 int err;
173
174 if (!tmpbuf)
175 return -ENOMEM;
176 dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
177 dh->dqdh_prev_free = cpu_to_le32(0);
178 err = write_blk(info, blk, buf);
179 if (err < 0)
180 goto out_buf;
181 if (info->dqi_free_entry) {
182 err = read_blk(info, info->dqi_free_entry, tmpbuf);
183 if (err < 0)
184 goto out_buf;
185 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
186 cpu_to_le32(blk);
187 err = write_blk(info, info->dqi_free_entry, tmpbuf);
188 if (err < 0)
189 goto out_buf;
190 }
191 freedqbuf(tmpbuf);
192 info->dqi_free_entry = blk;
193 mark_info_dirty(info->dqi_sb, info->dqi_type);
194 return 0;
195out_buf:
196 freedqbuf(tmpbuf);
197 return err;
198}
199
200/* Is the entry in the block free? */
201int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
202{
203 int i;
204
205 for (i = 0; i < info->dqi_entry_size; i++)
206 if (disk[i])
207 return 0;
208 return 1;
209}
210EXPORT_SYMBOL(qtree_entry_unused);
211
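Editor's note: "unused" here means literally all-zero bytes, so a live dquot whose on-disk image happened to be all zeroes would be indistinguishable from a free slot. The v2 callbacks later in this patch (v2_mem2diskdqb()/v2_disk2memdqb() in quota_v2.c) escape that case by storing dqb_itime = 1 on disk and mapping it back to 0 on read.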
212/* Find space for dquot */
213static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
214 struct dquot *dquot, int *err)
215{
216 uint blk, i;
217 struct qt_disk_dqdbheader *dh;
218 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
219 char *ddquot;
220
221 *err = 0;
222 if (!buf) {
223 *err = -ENOMEM;
224 return 0;
225 }
226 dh = (struct qt_disk_dqdbheader *)buf;
227 if (info->dqi_free_entry) {
228 blk = info->dqi_free_entry;
229 *err = read_blk(info, blk, buf);
230 if (*err < 0)
231 goto out_buf;
232 } else {
233 blk = get_free_dqblk(info);
234 if ((int)blk < 0) {
235 *err = blk;
236 freedqbuf(buf);
237 return 0;
238 }
239 memset(buf, 0, info->dqi_usable_bs);
240 /* This is enough as block is already zeroed and entry list is empty... */
241 info->dqi_free_entry = blk;
242 mark_info_dirty(dquot->dq_sb, dquot->dq_type);
243 }
244 /* Block will be full? */
245 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
246 *err = remove_free_dqentry(info, buf, blk);
247 if (*err < 0) {
248 printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
249 "remove block (%u) from entry free list.\n",
250 blk);
251 goto out_buf;
252 }
253 }
254 le16_add_cpu(&dh->dqdh_entries, 1);
255 /* Find free structure in block */
256 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
257 i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
258 i++, ddquot += info->dqi_entry_size);
259#ifdef __QUOTA_QT_PARANOIA
260 if (i == qtree_dqstr_in_blk(info)) {
261 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
262 "but it shouldn't.\n");
263 *err = -EIO;
264 goto out_buf;
265 }
266#endif
267 *err = write_blk(info, blk, buf);
268 if (*err < 0) {
269 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
270 "data block %u.\n", blk);
271 goto out_buf;
272 }
273 dquot->dq_off = (blk << info->dqi_blocksize_bits) +
274 sizeof(struct qt_disk_dqdbheader) +
275 i * info->dqi_entry_size;
276 freedqbuf(buf);
277 return blk;
278out_buf:
279 freedqbuf(buf);
280 return 0;
281}
282
283/* Insert reference to structure into the trie */
284static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
285 uint *treeblk, int depth)
286{
287 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
288 int ret = 0, newson = 0, newact = 0;
289 __le32 *ref;
290 uint newblk;
291
292 if (!buf)
293 return -ENOMEM;
294 if (!*treeblk) {
295 ret = get_free_dqblk(info);
296 if (ret < 0)
297 goto out_buf;
298 *treeblk = ret;
299 memset(buf, 0, info->dqi_usable_bs);
300 newact = 1;
301 } else {
302 ret = read_blk(info, *treeblk, buf);
303 if (ret < 0) {
304 printk(KERN_ERR "VFS: Can't read tree quota block "
305 "%u.\n", *treeblk);
306 goto out_buf;
307 }
308 }
309 ref = (__le32 *)buf;
310 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
311 if (!newblk)
312 newson = 1;
313 if (depth == info->dqi_qtree_depth - 1) {
314#ifdef __QUOTA_QT_PARANOIA
315 if (newblk) {
316 printk(KERN_ERR "VFS: Inserting already present quota "
317 "entry (block %u).\n",
318 le32_to_cpu(ref[get_index(info,
319 dquot->dq_id, depth)]));
320 ret = -EIO;
321 goto out_buf;
322 }
323#endif
324 newblk = find_free_dqentry(info, dquot, &ret);
325 } else {
326 ret = do_insert_tree(info, dquot, &newblk, depth+1);
327 }
328 if (newson && ret >= 0) {
329 ref[get_index(info, dquot->dq_id, depth)] =
330 cpu_to_le32(newblk);
331 ret = write_blk(info, *treeblk, buf);
332 } else if (newact && ret < 0) {
333 put_free_dqblk(info, buf, *treeblk);
334 }
335out_buf:
336 freedqbuf(buf);
337 return ret;
338}
339
340/* Wrapper for inserting quota structure into tree */
341static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
342 struct dquot *dquot)
343{
344 int tmp = QT_TREEOFF;
345 return do_insert_tree(info, dquot, &tmp, 0);
346}
347
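Editor's sketch of one insert, for id 0x00010203 in a 4-level tree (path bytes 0x00, 0x01, 0x02, 0x03 per the get_index() example earlier):

	root block (QT_TREEOFF)
	  ref[0x00] --> interior block
	    ref[0x01] --> interior block
	      ref[0x02] --> interior block
	        ref[0x03] --> data block holding the dquot entry

do_insert_tree() allocates missing blocks on the way down (newact), lets find_free_dqentry() place the entry at the leaf, and only writes the new reference once the deeper level succeeded; on failure a freshly allocated block is handed back via put_free_dqblk().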
348/*
349 * We don't have to be afraid of deadlocks as we never have quotas on quota files...
350 */
351int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
352{
353 int type = dquot->dq_type;
354 struct super_block *sb = dquot->dq_sb;
355 ssize_t ret;
356 dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
357
358 if (!ddquot)
359 return -ENOMEM;
360
361 /* dq_off is guarded by dqio_mutex */
362 if (!dquot->dq_off) {
363 ret = dq_insert_tree(info, dquot);
364 if (ret < 0) {
365 printk(KERN_ERR "VFS: Error %zd occurred while "
366 "creating quota.\n", ret);
367 freedqbuf(ddquot);
368 return ret;
369 }
370 }
371 spin_lock(&dq_data_lock);
372 info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
373 spin_unlock(&dq_data_lock);
374 ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
375 info->dqi_entry_size, dquot->dq_off);
376 if (ret != info->dqi_entry_size) {
377 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
378 sb->s_id);
379 if (ret >= 0)
380 ret = -ENOSPC;
381 } else {
382 ret = 0;
383 }
384 dqstats.writes++;
385 freedqbuf(ddquot);
386
387 return ret;
388}
389EXPORT_SYMBOL(qtree_write_dquot);
390
391/* Free dquot entry in data block */
392static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
393 uint blk)
394{
395 struct qt_disk_dqdbheader *dh;
396 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
397 int ret = 0;
398
399 if (!buf)
400 return -ENOMEM;
401 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
402 printk(KERN_ERR "VFS: Quota structure has offset to other "
403 "block (%u) than it should (%u).\n", blk,
404 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
405 goto out_buf;
406 }
407 ret = read_blk(info, blk, buf);
408 if (ret < 0) {
409 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
410 goto out_buf;
411 }
412 dh = (struct qt_disk_dqdbheader *)buf;
413 le16_add_cpu(&dh->dqdh_entries, -1);
414 if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
415 ret = remove_free_dqentry(info, buf, blk);
416 if (ret >= 0)
417 ret = put_free_dqblk(info, buf, blk);
418 if (ret < 0) {
419 printk(KERN_ERR "VFS: Can't move quota data block (%u) "
420 "to free list.\n", blk);
421 goto out_buf;
422 }
423 } else {
424 memset(buf +
425 (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
426 0, info->dqi_entry_size);
427 if (le16_to_cpu(dh->dqdh_entries) ==
428 qtree_dqstr_in_blk(info) - 1) {
429 /* Insert will write block itself */
430 ret = insert_free_dqentry(info, buf, blk);
431 if (ret < 0) {
432 printk(KERN_ERR "VFS: Can't insert quota data "
433 "block (%u) to free entry list.\n", blk);
434 goto out_buf;
435 }
436 } else {
437 ret = write_blk(info, blk, buf);
438 if (ret < 0) {
439 printk(KERN_ERR "VFS: Can't write quota data "
440 "block %u\n", blk);
441 goto out_buf;
442 }
443 }
444 }
445 dquot->dq_off = 0; /* Quota is now unattached */
446out_buf:
447 freedqbuf(buf);
448 return ret;
449}
450
451/* Remove reference to dquot from tree */
452static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
453 uint *blk, int depth)
454{
455 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
456 int ret = 0;
457 uint newblk;
458 __le32 *ref = (__le32 *)buf;
459
460 if (!buf)
461 return -ENOMEM;
462 ret = read_blk(info, *blk, buf);
463 if (ret < 0) {
464 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
465 goto out_buf;
466 }
467 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
468 if (depth == info->dqi_qtree_depth - 1) {
469 ret = free_dqentry(info, dquot, newblk);
470 newblk = 0;
471 } else {
472 ret = remove_tree(info, dquot, &newblk, depth+1);
473 }
474 if (ret >= 0 && !newblk) {
475 int i;
476 ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
477 /* Block got empty? */
478 for (i = 0;
479 i < (info->dqi_usable_bs >> 2) && !ref[i];
480 i++);
481 /* Don't put the root block into the free block list */
482 if (i == (info->dqi_usable_bs >> 2)
483 && *blk != QT_TREEOFF) {
484 put_free_dqblk(info, buf, *blk);
485 *blk = 0;
486 } else {
487 ret = write_blk(info, *blk, buf);
488 if (ret < 0)
489 printk(KERN_ERR "VFS: Can't write quota tree "
490 "block %u.\n", *blk);
491 }
492 }
493out_buf:
494 freedqbuf(buf);
495 return ret;
496}
497
498/* Delete dquot from tree */
499int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
500{
501 uint tmp = QT_TREEOFF;
502
503 if (!dquot->dq_off) /* Even not allocated? */
504 return 0;
505 return remove_tree(info, dquot, &tmp, 0);
506}
507EXPORT_SYMBOL(qtree_delete_dquot);
508
509/* Find entry in block */
510static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
511 struct dquot *dquot, uint blk)
512{
513 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
514 loff_t ret = 0;
515 int i;
516 char *ddquot;
517
518 if (!buf)
519 return -ENOMEM;
520 ret = read_blk(info, blk, buf);
521 if (ret < 0) {
522 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
523 goto out_buf;
524 }
525 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
526 i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
527 i++, ddquot += info->dqi_entry_size);
528 if (i == qtree_dqstr_in_blk(info)) {
529 printk(KERN_ERR "VFS: Quota for id %u referenced "
530 "but not present.\n", dquot->dq_id);
531 ret = -EIO;
532 goto out_buf;
533 } else {
534 ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
535 qt_disk_dqdbheader) + i * info->dqi_entry_size;
536 }
537out_buf:
538 freedqbuf(buf);
539 return ret;
540}
541
542/* Find entry for given id in the tree */
543static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
544 struct dquot *dquot, uint blk, int depth)
545{
546 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
547 loff_t ret = 0;
548 __le32 *ref = (__le32 *)buf;
549
550 if (!buf)
551 return -ENOMEM;
552 ret = read_blk(info, blk, buf);
553 if (ret < 0) {
554 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
555 goto out_buf;
556 }
557 ret = 0;
558 blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
559 if (!blk) /* No reference? */
560 goto out_buf;
561 if (depth < info->dqi_qtree_depth - 1)
562 ret = find_tree_dqentry(info, dquot, blk, depth+1);
563 else
564 ret = find_block_dqentry(info, dquot, blk);
565out_buf:
566 freedqbuf(buf);
567 return ret;
568}
569
570/* Find entry for given id in the tree - wrapper function */
571static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
572 struct dquot *dquot)
573{
574 return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
575}
576
577int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
578{
579 int type = dquot->dq_type;
580 struct super_block *sb = dquot->dq_sb;
581 loff_t offset;
582 dqbuf_t ddquot;
583 int ret = 0;
584
585#ifdef __QUOTA_QT_PARANOIA
586 /* Invalidated quota? */
587 if (!sb_dqopt(dquot->dq_sb)->files[type]) {
588 printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
589 return -EIO;
590 }
591#endif
592 /* Do we know offset of the dquot entry in the quota file? */
593 if (!dquot->dq_off) {
594 offset = find_dqentry(info, dquot);
595 if (offset <= 0) { /* Entry not present? */
596 if (offset < 0)
597 printk(KERN_ERR "VFS: Can't read quota "
598 "structure for id %u.\n", dquot->dq_id);
599 dquot->dq_off = 0;
600 set_bit(DQ_FAKE_B, &dquot->dq_flags);
601 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
602 ret = offset;
603 goto out;
604 }
605 dquot->dq_off = offset;
606 }
607 ddquot = getdqbuf(info->dqi_entry_size);
608 if (!ddquot)
609 return -ENOMEM;
610 ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
611 info->dqi_entry_size, dquot->dq_off);
612 if (ret != info->dqi_entry_size) {
613 if (ret >= 0)
614 ret = -EIO;
615 printk(KERN_ERR "VFS: Error while reading quota "
616 "structure for id %u.\n", dquot->dq_id);
617 set_bit(DQ_FAKE_B, &dquot->dq_flags);
618 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
619 freedqbuf(ddquot);
620 goto out;
621 }
622 spin_lock(&dq_data_lock);
623 info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
624 if (!dquot->dq_dqb.dqb_bhardlimit &&
625 !dquot->dq_dqb.dqb_bsoftlimit &&
626 !dquot->dq_dqb.dqb_ihardlimit &&
627 !dquot->dq_dqb.dqb_isoftlimit)
628 set_bit(DQ_FAKE_B, &dquot->dq_flags);
629 spin_unlock(&dq_data_lock);
630 freedqbuf(ddquot);
631out:
632 dqstats.reads++;
633 return ret;
634}
635EXPORT_SYMBOL(qtree_read_dquot);
636
637/* Check whether dquot should not be deleted. We know we are
638 * the only one operating on dquot (thanks to dq_lock) */
639int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
640{
641 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
642 return qtree_delete_dquot(info, dquot);
643 return 0;
644}
645EXPORT_SYMBOL(qtree_release_dquot);
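Editor's note: a condensed sketch (not from the patch) of how a format driver is expected to plug into these helpers; quota_v2.c below does exactly this in v2_read_file_info() and its callbacks. All my_* names are hypothetical, and a real driver would also load dqi_blocks/dqi_free_blk/dqi_free_entry from its on-disk info block:

	static void my_mem2diskdqb(void *dp, struct dquot *dquot) { /* pack entry */ }
	static void my_disk2memdqb(struct dquot *dquot, void *dp) { /* unpack entry */ }
	static int my_is_id(void *dp, struct dquot *dquot) { return 0; /* match id */ }

	static struct qtree_fmt_operations my_qtree_ops = {
		.mem2disk_dqblk	= my_mem2diskdqb,
		.disk2mem_dqblk	= my_disk2memdqb,
		.is_id		= my_is_id,
	};

	static int my_read_file_info(struct super_block *sb, int type)
	{
		struct qtree_mem_dqinfo *qinfo = kmalloc(sizeof(*qinfo), GFP_NOFS);

		if (!qinfo)
			return -1;
		qinfo->dqi_sb = sb;
		qinfo->dqi_type = type;
		qinfo->dqi_blocksize_bits = 10;		/* 1024-byte blocks */
		qinfo->dqi_usable_bs = 1024;
		qinfo->dqi_qtree_depth = 4;
		qinfo->dqi_entry_size = 48;		/* on-disk dquot size */
		qinfo->dqi_ops = &my_qtree_ops;
		sb_dqinfo(sb, type)->dqi_priv = qinfo;
		/* quota_operations then route through the generic helpers,
		 * e.g. qtree_read_dquot(sb_dqinfo(sb, type)->dqi_priv, dquot),
		 * just as v2_read_dquot() does below. */
		return 0;
	}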
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 000000000000..a1ab8db81a51
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
1/*
2 * Definitions of structures for vfsv0 quota format
3 */
4
5#ifndef _LINUX_QUOTA_TREE_H
6#define _LINUX_QUOTA_TREE_H
7
8#include <linux/types.h>
9#include <linux/quota.h>
10
11/*
12 * Structure of header of block with quota structures. It is padded to 16 bytes so
13 * there will be space for exactly 21 quota-entries in a block
14 */
15struct qt_disk_dqdbheader {
16 __le32 dqdh_next_free; /* Number of next block with free entry */
17 __le32 dqdh_prev_free; /* Number of previous block with free entry */
18 __le16 dqdh_entries; /* Number of valid entries in block */
19 __le16 dqdh_pad1;
20 __le32 dqdh_pad2;
21};
22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24
25#endif /* _LINUX_QUOTA_TREE_H */
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb0..b4af1c69ad16 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,25 +3,39 @@
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_v1.h>
-#include <linux/quotaio_v1.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 
 #include <asm/byteorder.h>
 
+#include "quotaio_v1.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Old quota format support");
 MODULE_LICENSE("GPL");
 
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
+
 static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
 {
 	m->dqb_ihardlimit = d->dqb_ihardlimit;
 	m->dqb_isoftlimit = d->dqb_isoftlimit;
 	m->dqb_curinodes = d->dqb_curinodes;
-	m->dqb_bhardlimit = d->dqb_bhardlimit;
-	m->dqb_bsoftlimit = d->dqb_bsoftlimit;
-	m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS;
+	m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
+	m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
+	m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
 	m->dqb_itime = d->dqb_itime;
 	m->dqb_btime = d->dqb_btime;
 }
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
 	d->dqb_ihardlimit = m->dqb_ihardlimit;
 	d->dqb_isoftlimit = m->dqb_isoftlimit;
 	d->dqb_curinodes = m->dqb_curinodes;
-	d->dqb_bhardlimit = m->dqb_bhardlimit;
-	d->dqb_bsoftlimit = m->dqb_bsoftlimit;
-	d->dqb_curblocks = toqb(m->dqb_curspace);
+	d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
+	d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
 	d->dqb_itime = m->dqb_itime;
 	d->dqb_btime = m->dqb_btime;
 }
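Editor's worked example for the new helpers: with QUOTABLOCK_BITS = 10 a quota block is 1 KiB, so

	v1_stoqb(1025) = (1025 + 1023) >> 10 = 2	(round up to whole blocks)
	v1_qbtos(2)    = 2 << 10 = 2048			(back to bytes)

which is why the hunks above now convert the limits explicitly: the in-memory mem_dqblk limits are kept as byte counts in this kernel, while the v1 on-disk format keeps them in 1 KiB blocks.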
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d9..b618b563635c 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/dqblk_v2.h>
-#include <linux/quotaio_v2.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -15,16 +14,37 @@
 
 #include <asm/byteorder.h>
 
+#include "quota_tree.h"
+#include "quotaio_v2.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Quota format v2 support");
 MODULE_LICENSE("GPL");
 
 #define __QUOTA_V2_PARANOIA
 
-typedef char *dqbuf_t;
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2_is_id(void *dp, struct dquot *dquot);
+
+static struct qtree_fmt_operations v2_qtree_ops = {
+	.mem2disk_dqblk = v2_mem2diskdqb,
+	.disk2mem_dqblk = v2_disk2memdqb,
+	.is_id = v2_is_id,
+};
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
 
-#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
+static inline qsize_t v2_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
 
 /* Check whether given file is really vfsv0 quotafile */
 static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 static int v2_read_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo;
 	ssize_t size;
 
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
 			sb->s_id);
 		return -1;
 	}
+	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+	if (!info->dqi_priv) {
+		printk(KERN_WARNING
+		       "Not enough memory for quota information structure.\n");
+		return -1;
+	}
+	qinfo = info->dqi_priv;
 	/* limits are stored as unsigned 32-bit data */
 	info->dqi_maxblimit = 0xffffffff;
 	info->dqi_maxilimit = 0xffffffff;
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
-	info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
-	info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
-	info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_sb = sb;
+	qinfo->dqi_type = type;
+	qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+	qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+	qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+	qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+	qinfo->dqi_ops = &v2_qtree_ops;
 	return 0;
 }
74 109
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
 static int v2_write_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
 	ssize_t size;
 
 	spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
 	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
 	spin_unlock(&dq_data_lock);
-	dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks);
-	dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry);
+	dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 			sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	return 0;
 }
 
-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
+static void v2_disk2memdqb(struct dquot *dquot, void *dp)
 {
+	struct v2_disk_dqblk *d = dp, empty;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
 	m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
 	m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
 	m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
 	m->dqb_itime = le64_to_cpu(d->dqb_itime);
-	m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
-	m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+	m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
+	m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
 	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
 	m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	/* We need to escape back all-zero structure */
+	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+	empty.dqb_itime = cpu_to_le64(1);
+	if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+		m->dqb_itime = 0;
 }
 
-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
 {
+	struct v2_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
 	d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
 	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
-	d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
-	d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+	d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
-	d->dqb_id = cpu_to_le32(id);
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	if (qtree_entry_unused(info, dp))
+		d->dqb_itime = cpu_to_le64(1);
 }
 
-static dqbuf_t getdqbuf(void)
-{
-	dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
-	if (!buf)
-		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
-	return buf;
-}
-
-static inline void freedqbuf(dqbuf_t buf)
-{
-	kfree(buf);
-}
-
-static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	memset(buf, 0, V2_DQBLKSIZE);
-	return sb->s_op->quota_read(sb, type, (char *)buf,
-		V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	return sb->s_op->quota_write(sb, type, (char *)buf,
-		V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-/* Remove empty block from list and return it */
-static int get_free_dqblk(struct super_block *sb, int type)
-{
-	dqbuf_t buf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int ret, blk;
-
-	if (!buf)
-		return -ENOMEM;
-	if (info->u.v2_i.dqi_free_blk) {
-		blk = info->u.v2_i.dqi_free_blk;
-		if ((ret = read_blk(sb, type, blk, buf)) < 0)
-			goto out_buf;
-		info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
-	}
-	else {
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* Assure block allocation... */
-		if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
-			goto out_buf;
-		blk = info->u.v2_i.dqi_blocks++;
-	}
-	mark_info_dirty(sb, type);
-	ret = blk;
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Insert empty block to the list */
-static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	dh->dqdh_entries = cpu_to_le16(0);
-	info->u.v2_i.dqi_free_blk = blk;
-	mark_info_dirty(sb, type);
-	/* Some strange block. We had better leave it... */
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		return err;
-	return 0;
-}
 
+static int v2_is_id(void *dp, struct dquot *dquot)
+{
+	struct v2_disk_dqblk *d = dp;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	if (qtree_entry_unused(info, dp))
+		return 0;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+}
+
-/* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
-	int err;
-
-	if (!tmpbuf)
-		return -ENOMEM;
-	if (nextblk) {
-		if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
-		if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	if (prevblk) {
-		if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
-		if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	else {
-		info->u.v2_i.dqi_free_entry = nextblk;
-		mark_info_dirty(sb, type);
-	}
-	freedqbuf(tmpbuf);
-	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
-	/* No matter whether write succeeds block is out of list */
-	if (write_blk(sb, type, blk, buf) < 0)
-		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		goto out_buf;
-	if (info->u.v2_i.dqi_free_entry) {
-		if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
-		if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	freedqbuf(tmpbuf);
-	info->u.v2_i.dqi_free_entry = blk;
-	mark_info_dirty(sb, type);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Find space for dquot */
-static uint find_free_dqentry(struct dquot *dquot, int *err)
-{
-	struct super_block *sb = dquot->dq_sb;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
-	uint blk, i;
-	struct v2_disk_dqdbheader *dh;
-	struct v2_disk_dqblk *ddquot;
-	struct v2_disk_dqblk fakedquot;
-	dqbuf_t buf;
-
-	*err = 0;
-	if (!(buf = getdqbuf())) {
-		*err = -ENOMEM;
-		return 0;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	ddquot = GETENTRIES(buf);
-	if (info->u.v2_i.dqi_free_entry) {
-		blk = info->u.v2_i.dqi_free_entry;
-		if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
-			goto out_buf;
-	}
-	else {
-		blk = get_free_dqblk(sb, dquot->dq_type);
-		if ((int)blk < 0) {
-			*err = blk;
-			freedqbuf(buf);
-			return 0;
-		}
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* This is enough as block is already zeroed and entry list is empty... */
-		info->u.v2_i.dqi_free_entry = blk;
-		mark_info_dirty(sb, dquot->dq_type);
-	}
-	if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */
-		if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
-			goto out_buf;
-		}
-	le16_add_cpu(&dh->dqdh_entries, 1);
-	memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-	/* Find free structure in block */
-	for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-#ifdef __QUOTA_V2_PARANOIA
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
-		*err = -EIO;
-		goto out_buf;
-	}
-#endif
-	if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
-		goto out_buf;
-	}
-	dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
-	freedqbuf(buf);
-	return blk;
-out_buf:
-	freedqbuf(buf);
-	return 0;
-}
-
-/* Insert reference to structure into the trie */
-static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	dqbuf_t buf;
-	int ret = 0, newson = 0, newact = 0;
-	__le32 *ref;
-	uint newblk;
-
-	if (!(buf = getdqbuf()))
-		return -ENOMEM;
-	if (!*treeblk) {
-		ret = get_free_dqblk(sb, dquot->dq_type);
-		if (ret < 0)
-			goto out_buf;
-		*treeblk = ret;
-		memset(buf, 0, V2_DQBLKSIZE);
-		newact = 1;
-	}
-	else {
-		if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
-			printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
-			goto out_buf;
-		}
-	}
-	ref = (__le32 *)buf;
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (!newblk)
-		newson = 1;
-	if (depth == V2_DQTREEDEPTH-1) {
-#ifdef __QUOTA_V2_PARANOIA
-		if (newblk) {
-			printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
-			ret = -EIO;
-			goto out_buf;
-		}
-#endif
-		newblk = find_free_dqentry(dquot, &ret);
-	}
-	else
-		ret = do_insert_tree(dquot, &newblk, depth+1);
-	if (newson && ret >= 0) {
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
-		ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
-	}
-	else if (newact && ret < 0)
-		put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
 
+static int v2_read_dquot(struct dquot *dquot)
+{
+	return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
+}
+
-/* Wrapper for inserting quota structure into tree */
-static inline int dq_insert_tree(struct dquot *dquot)
-{
-	int tmp = V2_DQTREEOFF;
-	return do_insert_tree(dquot, &tmp, 0);
-}
-
-/*
- * We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */
 static int v2_write_dquot(struct dquot *dquot)
 {
-	int type = dquot->dq_type;
-	ssize_t ret;
-	struct v2_disk_dqblk ddquot, empty;
-
-	/* dq_off is guarded by dqio_mutex */
-	if (!dquot->dq_off)
-		if ((ret = dq_insert_tree(dquot)) < 0) {
-			printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
-			return ret;
-		}
-	spin_lock(&dq_data_lock);
-	mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
-	/* Argh... We may need to write structure full of zeroes but that would be
-	 * treated as an empty place by the rest of the code. Format change would
-	 * be definitely cleaner but the problems probably are not worth it */
-	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-	if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-		ddquot.dqb_itime = cpu_to_le64(1);
-	spin_unlock(&dq_data_lock);
-	ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
-		(char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
-	if (ret != sizeof(struct v2_disk_dqblk)) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
-		if (ret >= 0)
-			ret = -ENOSPC;
-	}
-	else
-		ret = 0;
-	dqstats.writes++;
-
-	return ret;
+	return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 
+static int v2_release_dquot(struct dquot *dquot)
-/* Free dquot entry in data block */
-static int free_dqentry(struct dquot *dquot, uint blk)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	struct v2_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-
-	if (!buf)
-		return -ENOMEM;
-	if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
-		printk(KERN_ERR "VFS: Quota structure has offset to other "
-			"block (%u) than it should (%u).\n", blk,
-			(uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
-		goto out_buf;
-	}
-	if ((ret = read_blk(sb, type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
-		goto out_buf;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	le16_add_cpu(&dh->dqdh_entries, -1);
-	if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
-		if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
-		    (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
-				"to free list.\n", blk);
-			goto out_buf;
-		}
-	}
-	else {
-		memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
-			sizeof(struct v2_disk_dqblk));
-		if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
-			/* Insert will write block itself */
-			if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
-				printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
-				goto out_buf;
-			}
-		}
-		else
-			if ((ret = write_blk(sb, type, blk, buf)) < 0) {
-				printk(KERN_ERR "VFS: Can't write quota data "
-					"block %u\n", blk);
-				goto out_buf;
-			}
-	}
-	dquot->dq_off = 0;	/* Quota is now unattached */
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Remove reference to dquot from tree */
-static int remove_tree(struct dquot *dquot, uint *blk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-	uint newblk;
-	__le32 *ref = (__le32 *)buf;
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
-		goto out_buf;
-	}
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (depth == V2_DQTREEDEPTH-1) {
-		ret = free_dqentry(dquot, newblk);
-		newblk = 0;
-	}
-	else
-		ret = remove_tree(dquot, &newblk, depth+1);
-	if (ret >= 0 && !newblk) {
-		int i;
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
-		for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */
-		/* Don't put the root block into the free block list */
-		if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
-			put_free_dqblk(sb, type, buf, *blk);
-			*blk = 0;
-		}
-		else
-			if ((ret = write_blk(sb, type, *blk, buf)) < 0)
-				printk(KERN_ERR "VFS: Can't write quota tree "
-					"block %u.\n", *blk);
-	}
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Delete dquot from tree */
-static int v2_delete_dquot(struct dquot *dquot)
-{
-	uint tmp = V2_DQTREEOFF;
-
-	if (!dquot->dq_off) /* Even not allocated? */
-		return 0;
-	return remove_tree(dquot, &tmp, 0);
-}
-
-/* Find entry in block */
-static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
-{
-	dqbuf_t buf = getdqbuf();
-	loff_t ret = 0;
-	int i;
-	struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-		goto out_buf;
-	}
-	if (dquot->dq_id)
-		for (i = 0; i < V2_DQSTRINBLK &&
-		     le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
-	else { /* ID 0 as a bit more complicated searching... */
-		struct v2_disk_dqblk fakedquot;
-
-		memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-		for (i = 0; i < V2_DQSTRINBLK; i++)
-			if (!le32_to_cpu(ddquot[i].dqb_id) &&
-			    memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
-				break;
-	}
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: Quota for id %u referenced "
-			"but not present.\n", dquot->dq_id);
-		ret = -EIO;
-		goto out_buf;
-	}
-	else
-		ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
-			v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
571out_buf:
572 freedqbuf(buf);
573 return ret;
574}
575
576/* Find entry for given id in the tree */
577static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
578{
579 dqbuf_t buf = getdqbuf();
580 loff_t ret = 0;
581 __le32 *ref = (__le32 *)buf;
582
583 if (!buf)
584 return -ENOMEM;
585 if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
586 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
587 goto out_buf;
588 }
589 ret = 0;
590 blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
591 if (!blk) /* No reference? */
592 goto out_buf;
593 if (depth < V2_DQTREEDEPTH-1)
594 ret = find_tree_dqentry(dquot, blk, depth+1);
595 else
596 ret = find_block_dqentry(dquot, blk);
597out_buf:
598 freedqbuf(buf);
599 return ret;
600}
601
602/* Find entry for given id in the tree - wrapper function */
603static inline loff_t find_dqentry(struct dquot *dquot)
604{
605 return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
606}
607
608static int v2_read_dquot(struct dquot *dquot)
609{ 199{
610 int type = dquot->dq_type; 200 return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
611 loff_t offset;
612 struct v2_disk_dqblk ddquot, empty;
613 int ret = 0;
614
615#ifdef __QUOTA_V2_PARANOIA
616 /* Invalidated quota? */
617 if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
618 printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
619 return -EIO;
620 }
621#endif
622 offset = find_dqentry(dquot);
623 if (offset <= 0) { /* Entry not present? */
624 if (offset < 0)
625 printk(KERN_ERR "VFS: Can't read quota "
626 "structure for id %u.\n", dquot->dq_id);
627 dquot->dq_off = 0;
628 set_bit(DQ_FAKE_B, &dquot->dq_flags);
629 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
630 ret = offset;
631 }
632 else {
633 dquot->dq_off = offset;
634 if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
635 (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
636 != sizeof(struct v2_disk_dqblk)) {
637 if (ret >= 0)
638 ret = -EIO;
639 printk(KERN_ERR "VFS: Error while reading quota "
640 "structure for id %u.\n", dquot->dq_id);
641 memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
642 }
643 else {
644 ret = 0;
645 /* We need to escape back all-zero structure */
646 memset(&empty, 0, sizeof(struct v2_disk_dqblk));
647 empty.dqb_itime = cpu_to_le64(1);
648 if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
649 ddquot.dqb_itime = 0;
650 }
651 disk2memdqb(&dquot->dq_dqb, &ddquot);
652 if (!dquot->dq_dqb.dqb_bhardlimit &&
653 !dquot->dq_dqb.dqb_bsoftlimit &&
654 !dquot->dq_dqb.dqb_ihardlimit &&
655 !dquot->dq_dqb.dqb_isoftlimit)
656 set_bit(DQ_FAKE_B, &dquot->dq_flags);
657 }
658 dqstats.reads++;
659
660 return ret;
661} 201}
662 202
663/* Check whether dquot should not be deleted. We know we are 203static int v2_free_file_info(struct super_block *sb, int type)
664 * the only one operating on dquot (thanks to dq_lock) */
665static int v2_release_dquot(struct dquot *dquot)
666{ 204{
667 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) 205 kfree(sb_dqinfo(sb, type)->dqi_priv);
668 return v2_delete_dquot(dquot);
669 return 0; 206 return 0;
670} 207}
671 208
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
673 .check_quota_file = v2_check_quota_file, 210 .check_quota_file = v2_check_quota_file,
674 .read_file_info = v2_read_file_info, 211 .read_file_info = v2_read_file_info,
675 .write_file_info = v2_write_file_info, 212 .write_file_info = v2_write_file_info,
676 .free_file_info = NULL, 213 .free_file_info = v2_free_file_info,
677 .read_dqblk = v2_read_dquot, 214 .read_dqblk = v2_read_dquot,
678 .commit_dqblk = v2_write_dquot, 215 .commit_dqblk = v2_write_dquot,
679 .release_dqblk = v2_release_dquot, 216 .release_dqblk = v2_release_dquot,
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 000000000000..746654b5de70
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_QUOTAIO_V1_H
2#define _LINUX_QUOTAIO_V1_H
3
4#include <linux/types.h>
5
6/*
7 * The following constants define the amount of time given a user
8 * before the soft limits are treated as hard limits (usually resulting
9 * in an allocation failure). The timer is started when the user crosses
10 * their soft limit and is reset when they go below their soft limit.
11 */
12#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
13#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
14
15/*
16 * The following structure defines the format of the disk quota file
17 * (as it appears on disk) - the file is an array of these structures
18 * indexed by user or group number.
19 */
20struct v1_disk_dqblk {
21 __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */
22 __u32 dqb_bsoftlimit; /* preferred limit on disk blks */
23 __u32 dqb_curblocks; /* current block count */
24 __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
25 __u32 dqb_isoftlimit; /* preferred inode limit */
26 __u32 dqb_curinodes; /* current # allocated inodes */
27 time_t dqb_btime; /* time limit for excessive disk use */
28 time_t dqb_itime; /* time limit for excessive inode use */
29};
30
31#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
32
33#endif /* _LINUX_QUOTAIO_V1_H */
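The v1 format above is simply a flat on-disk array indexed by quota id, so locating a record is pure pointer arithmetic via v1_dqoff(). A minimal userspace sketch of that lookup (illustrative only: the file name "quota.user" and the id are made up, and the struct mirrors the header above, including the arch-dependent time_t fields):

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>

struct v1_disk_dqblk {
	uint32_t dqb_bhardlimit;	/* absolute limit on disk blks alloc */
	uint32_t dqb_bsoftlimit;	/* preferred limit on disk blks */
	uint32_t dqb_curblocks;		/* current block count */
	uint32_t dqb_ihardlimit;	/* absolute limit on allocated inodes */
	uint32_t dqb_isoftlimit;	/* preferred inode limit */
	uint32_t dqb_curinodes;		/* current # allocated inodes */
	time_t dqb_btime;		/* time limit for excessive disk use */
	time_t dqb_itime;		/* time limit for excessive inode use */
};

#define v1_dqoff(UID) ((off_t)((UID) * sizeof(struct v1_disk_dqblk)))

int main(void)
{
	struct v1_disk_dqblk dq;
	unsigned int uid = 1000;			/* example id */
	int fd = open("quota.user", O_RDONLY);		/* hypothetical path */

	if (fd < 0)
		return 1;
	/* the record for 'uid' sits at a fixed offset in the flat array */
	if (pread(fd, &dq, sizeof(dq), v1_dqoff(uid)) != sizeof(dq))
		return 1;
	printf("uid %u: %u blocks used, soft limit %u\n",
	       uid, (unsigned)dq.dqb_curblocks, (unsigned)dq.dqb_bsoftlimit);
	close(fd);
	return 0;
}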
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 000000000000..530fe580685c
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,60 @@
1/*
2 * Definitions of structures for vfsv0 quota format
3 */
4
5#ifndef _LINUX_QUOTAIO_V2_H
6#define _LINUX_QUOTAIO_V2_H
7
8#include <linux/types.h>
9#include <linux/quota.h>
10
11/*
12 * Definitions of magics and versions of current quota files
13 */
14#define V2_INITQMAGICS {\
15 0xd9c01f11, /* USRQUOTA */\
16 0xd9c01927 /* GRPQUOTA */\
17}
18
19#define V2_INITQVERSIONS {\
20 0, /* USRQUOTA */\
21 0 /* GRPQUOTA */\
22}
23
24/* First generic header */
25struct v2_disk_dqheader {
26 __le32 dqh_magic; /* Magic number identifying file */
27 __le32 dqh_version; /* File version */
28};
29
30/*
31 * The following structure defines the format of the disk quota file
32 * (as it appears on disk) - the file is a radix tree whose leaves point
33 * to blocks of these structures.
34 */
35struct v2_disk_dqblk {
36 __le32 dqb_id; /* id this quota applies to */
37 __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
38 __le32 dqb_isoftlimit; /* preferred inode limit */
39 __le32 dqb_curinodes; /* current # allocated inodes */
40 __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
41 __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
42 __le64 dqb_curspace; /* current space occupied (in bytes) */
43 __le64 dqb_btime; /* time limit for excessive disk use */
44 __le64 dqb_itime; /* time limit for excessive inode use */
45};
46
47/* Header with type and version specific information */
48struct v2_disk_dqinfo {
49 __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
50 __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
51 __le32 dqi_flags; /* Flags for quotafile (DQF_*) */
52 __le32 dqi_blocks; /* Number of blocks in file */
53 __le32 dqi_free_blk; /* Number of first free block in the list */
54 __le32 dqi_free_entry; /* Number of block with at least one free entry */
55};
56
57#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */
58#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */
59
60#endif /* _LINUX_QUOTAIO_V2_H */
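Unlike v1, the v2 file is a radix tree: each level of the tree consumes one byte of the 32-bit id, giving up to 256 references per 1K tree block across four levels. A small sketch of that index computation, mirroring the GETIDINDEX() macro used by the removed quota_v2.c code above (V2_DQTREEDEPTH is 4 in this format; the example id is arbitrary):

#include <stdio.h>
#include <stdint.h>

#define V2_DQTREEDEPTH 4
#define GETIDINDEX(id, depth) \
	(((id) >> ((V2_DQTREEDEPTH - (depth) - 1) * 8)) & 0xff)

int main(void)
{
	uint32_t id = 123456;	/* example quota id */
	int depth;

	/* walk from the root: each level picks one of 256 slots */
	for (depth = 0; depth < V2_DQTREEDEPTH; depth++)
		printf("level %d -> slot %u\n", depth,
		       (unsigned)GETIDINDEX(id, depth));
	return 0;
}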
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 76acdbc34611..b9b567a28376 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 	ret = -ENOMEM;
 	pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
-		goto out;
+		goto out_free;
 
 	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
 	if (nr != lpages)
-		goto out; /* leave if some pages were missing */
+		goto out_free_pages; /* leave if some pages were missing */
 
 	/* check the pages for physical adjacency */
 	ptr = pages;
@@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 	page++;
 	for (loop = lpages; loop > 1; loop--)
 		if (*ptr++ != page++)
-			goto out;
+			goto out_free_pages;
 
 	/* okay - all conditions fulfilled */
 	ret = (unsigned long) page_address(pages[0]);
 
- out:
-	if (pages) {
-		ptr = pages;
-		for (loop = lpages; loop > 0; loop--)
-			put_page(*ptr++);
-		kfree(pages);
-	}
-
+out_free_pages:
+	ptr = pages;
+	for (loop = nr; loop > 0; loop--)
+		put_page(*ptr++);
+out_free:
+	kfree(pages);
+out:
 	return ret;
 }
 
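The fix above replaces one catch-all label with an ordered unwind ladder, and drops only the nr page references actually obtained rather than all lpages slots. A generic, purely illustrative sketch of the idiom (names are made up, not from the kernel): each failure point jumps to the label that releases exactly what has been acquired so far.

#include <stdio.h>
#include <stdlib.h>

static int do_work(void)
{
	int ret = -1;
	char *a, *b;

	a = malloc(64);
	if (!a)
		goto out;		/* nothing acquired yet */
	b = malloc(64);
	if (!b)
		goto out_free_a;	/* only 'a' needs freeing */

	ret = 0;			/* success: fall through the ladder */

	free(b);
out_free_a:
	free(a);
out:
	return ret;
}

int main(void)
{
	return do_work();
}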
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae33..b7e6ac706b87 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020b..5cc6924eb158 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 		offset += inode->i_size;
 		break;
 	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation. Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0)
+			return file->f_pos;
 		offset += file->f_pos;
 		break;
 	}
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		offset += i_size_read(file->f_path.dentry->d_inode);
 		break;
 	case SEEK_CUR:
+		if (offset == 0) {
+			retval = file->f_pos;
+			goto out;
+		}
 		offset += file->f_pos;
 	}
 	retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		}
 		retval = offset;
 	}
+out:
 	unlock_kernel();
 	return retval;
 }
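The hunks above treat lseek(fd, 0, SEEK_CUR) as a pure position query, so nothing is written back to f_pos that a concurrent read(), write() or lseek() might race with. A small userspace demonstration of that idiom (the file path is just an example):

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	char buf[16];
	off_t pos;
	int fd = open("/etc/hostname", O_RDONLY);	/* any readable file */

	if (fd < 0)
		return 1;
	if (read(fd, buf, sizeof(buf)) < 0)
		return 1;
	pos = lseek(fd, 0, SEEK_CUR);	/* query position, changes nothing */
	printf("current offset: %lld\n", (long long)pos);
	close(fd);
	return 0;
}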
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6c4c2c69449f..55fce92cdf18 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 				   struct inode *inode)
 {
 	struct super_block *sb;
+	struct reiserfs_iget_args args;
 	INITIALIZE_PATH(path_to_key);
 	struct cpu_key key;
 	struct item_head ih;
@@ -1780,6 +1781,20 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		err = -ENOMEM;
 		goto out_bad_inode;
 	}
+	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+	if (old_format_only(sb))
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+	else
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
+	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
+	if (insert_inode_locked4(inode, args.objectid,
+				 reiserfs_find_actor, &args) < 0) {
+		err = -EINVAL;
+		goto out_bad_inode;
+	}
 	if (old_format_only(sb))
 		/* not a perfect generation count, as object ids can be reused, but
 		 ** this is as good as reiserfs can do right now.
@@ -1825,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	reiserfs_init_acl_default(inode);
 	reiserfs_init_xattr_rwsem(inode);
 
-	if (old_format_only(sb))
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
-	else
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-
 	/* key to search for correct place for new stat data */
 	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
 		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
@@ -1859,13 +1867,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	} else {
 		inode2sd(&sd, inode, inode->i_size);
 	}
-	// these do not go to on-disk stat data
-	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-
 	// store in in-core inode the key of stat data and version all
 	// object items will have (directory items will have old offset
 	// format, other new objects will consist of new items)
-	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
 		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
 	else
@@ -1929,7 +1933,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		reiserfs_mark_inode_private(inode);
 	}
 
-	insert_inode_hash(inode);
 	reiserfs_update_sd(th, inode);
 	reiserfs_check_path(&path_to_key);
 
@@ -1956,6 +1959,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
       out_inserted_sd:
 	inode->i_nlink = 0;
 	th->t_trans_id = 0;	/* so the caller can't use this handle later */
+	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
 
 	/* If we were inheriting an ACL, we need to release the lock so that
 	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
@@ -2556,7 +2560,7 @@ static int reiserfs_write_begin(struct file *file,
 	}
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 4f322e5ed840..738967f6c8ee 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
@@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	reiserfs_update_inode_transaction(dir);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
@@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
@@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
@@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	reiserfs_update_sd(&th, dir);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
       out_failed:
 	if (locked)
@@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
 		err = journal_end(&th, parent_dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
       out_failed:
 	reiserfs_write_unlock(parent_dir->i_sb);
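The pattern these reiserfs hunks converge on is: the inode is hashed early in the locked I_NEW state (insert_inode_locked4() in inode.c above), so concurrent lookups can find it but must wait, and every exit path then calls unlock_new_inode() before iput() or after d_instantiate(). A loose userspace analogue of that publish-then-unlock protocol, using pthreads (purely illustrative; names are invented, not kernel APIs):

#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int new;		/* analogue of the I_NEW state bit */
	int value;
};

static struct obj slot = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1, 0
};

static void unlock_new_obj(struct obj *o)	/* cf. unlock_new_inode() */
{
	pthread_mutex_lock(&o->lock);
	o->new = 0;
	pthread_cond_broadcast(&o->cond);
	pthread_mutex_unlock(&o->lock);
}

static void *lookup(void *arg)
{
	struct obj *o = &slot;

	pthread_mutex_lock(&o->lock);
	while (o->new)			/* cf. waiting on an I_NEW inode */
		pthread_cond_wait(&o->cond, &o->lock);
	printf("lookup sees fully initialised value %d\n", o->value);
	pthread_mutex_unlock(&o->lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, lookup, NULL);
	slot.value = 42;		/* creator initialises... */
	unlock_new_obj(&slot);		/* ...then publishes */
	pthread_join(t, NULL);
	return 0;
}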
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce8..f3c820b75829 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s)
 	reiserfs_sync_fs(s, 1);
 }
 
-static void reiserfs_write_super_lockfs(struct super_block *s)
+static int reiserfs_freeze(struct super_block *s)
 {
 	struct reiserfs_transaction_handle th;
 	reiserfs_write_lock(s);
@@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s)
 	}
 	s->s_dirt = 0;
 	reiserfs_write_unlock(s);
+	return 0;
 }
 
-static void reiserfs_unlockfs(struct super_block *s)
+static int reiserfs_unfreeze(struct super_block *s)
 {
 	reiserfs_allow_writes(s);
+	return 0;
 }
 
 extern const struct in_core_key MAX_IN_CORE_KEY;
@@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = {
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
 	.sync_fs = reiserfs_sync_fs,
-	.write_super_lockfs = reiserfs_write_super_lockfs,
-	.unlockfs = reiserfs_unlockfs,
+	.freeze_fs = reiserfs_freeze,
+	.unfreeze_fs = reiserfs_unfreeze,
 	.statfs = reiserfs_statfs,
 	.remount_fs = reiserfs_remount,
 	.show_options = generic_show_options,
@@ -649,6 +651,8 @@ static struct dquot_operations reiserfs_quota_operations = {
 	.release_dquot = reiserfs_release_dquot,
 	.mark_dirty = reiserfs_mark_dquot_dirty,
 	.write_info = reiserfs_write_info,
+	.alloc_dquot = dquot_alloc,
+	.destroy_dquot = dquot_destroy,
 };
 
 static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +998,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
 			if (c == 'u' || c == 'g') {
 				int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
 
-				if ((sb_any_quota_enabled(s) ||
-				     sb_any_quota_suspended(s)) &&
+				if (sb_any_quota_loaded(s) &&
 				    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
 					reiserfs_warning(s,
 						"reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1044,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
 				"reiserfs_parse_options: unknown quota format specified.");
 				return 0;
 			}
-			if ((sb_any_quota_enabled(s) ||
-			     sb_any_quota_suspended(s)) &&
+			if (sb_any_quota_loaded(s) &&
 			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
 				reiserfs_warning(s,
 					"reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1069,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
 	}
 	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
 	if (!(*mount_options & (1 << REISERFS_QUOTA))
-	    && sb_any_quota_enabled(s)) {
+	    && sb_any_quota_loaded(s)) {
 		reiserfs_warning(s,
 			"reiserfs_parse_options: quota options must be present when quota is turned on.");
 		return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87b..98a232f7196b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
 static struct inode *
 romfs_iget(struct super_block *sb, unsigned long ino)
 {
-	int nextfh;
+	int nextfh, ret;
 	struct romfs_inode ri;
 	struct inode *i;
 
@@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino)
 	i->i_size = be32_to_cpu(ri.size);
 	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
 	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
-	i->i_uid = i->i_gid = 0;
 
 	/* Precalculate the data offset */
-	ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
-	if (ino >= 0)
-		ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK);
+	ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
+	if (ret >= 0)
+		ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
 	else
 		ino = 0;
 
 	ROMFS_I(i)->i_metasize = ino;
 	ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
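The romfs change exists because 'ino' is an unsigned long, so the old 'if (ino >= 0)' test was always true and could never catch a negative error return from romfs_strnlen(); the fix routes the return value through a signed 'ret'. A minimal demonstration of that class of bug (illustrative, with an invented helper):

#include <stdio.h>

static int might_fail(void)
{
	return -1;	/* error */
}

int main(void)
{
	unsigned long ino = might_fail();	/* -1 wraps to a huge value */
	int ret = might_fail();

	if (ino >= 0)	/* always true for an unsigned type */
		printf("unsigned check: error missed (ino=%lu)\n", ino);
	if (ret < 0)
		printf("signed check: error caught (ret=%d)\n", ret);
	return 0;
}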
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf2..08b91beed806 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 void poll_initwait(struct poll_wqueues *pwq)
 {
 	init_poll_funcptr(&pwq->pt, __pollwait);
+	pwq->polling_task = current;
 	pwq->error = 0;
 	pwq->table = NULL;
 	pwq->inline_index = 0;
 }
-
 EXPORT_SYMBOL(poll_initwait);
 
 static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
 		free_page((unsigned long) old);
 	}
 }
-
 EXPORT_SYMBOL(poll_freewait);
 
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 {
-	struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
 	struct poll_table_page *table = p->table;
 
 	if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
 		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
 		if (!new_table) {
 			p->error = -ENOMEM;
-			__set_current_state(TASK_RUNNING);
 			return NULL;
 		}
 		new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
 	return table->entry++;
 }
 
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_wqueues *pwq = wait->private;
+	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+	/*
+	 * Although this function is called under waitqueue lock, LOCK
+	 * doesn't imply write barrier and the users expect write
+	 * barrier semantics on wakeup functions.  The following
+	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+	 * and is paired with set_mb() in poll_schedule_timeout.
+	 */
+	smp_wmb();
+	pwq->triggered = 1;
+
+	/*
+	 * Perform the default wake up operation using a dummy
+	 * waitqueue.
+	 *
+	 * TODO: This is hacky but there currently is no interface to
+	 * pass in @sync.  @sync is scheduled to be removed and once
+	 * that happens, wake_up_process() can be used directly.
+	 */
+	return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 				poll_table *p)
 {
-	struct poll_table_entry *entry = poll_get_entry(p);
+	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+	struct poll_table_entry *entry = poll_get_entry(pwq);
 	if (!entry)
 		return;
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
-	init_waitqueue_entry(&entry->wait, current);
+	init_waitqueue_func_entry(&entry->wait, pollwake);
+	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
 }
 
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+			  ktime_t *expires, unsigned long slack)
+{
+	int rc = -EINTR;
+
+	set_current_state(state);
+	if (!pwq->triggered)
+		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+	__set_current_state(TASK_RUNNING);
+
+	/*
+	 * Prepare for the next iteration.
+	 *
+	 * The following set_mb() serves two purposes.  First, it's
+	 * the counterpart rmb of the wmb in pollwake() such that data
+	 * written before wake up is always visible after wake up.
+	 * Second, the full barrier guarantees that triggered clearing
+	 * doesn't pass event check of the next iteration.  Note that
+	 * this problem doesn't exist for the first iteration as
+	 * add_wait_queue() has full barrier semantics.
+	 */
+	set_mb(pwq->triggered, 0);
+
+	return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
 /**
  * poll_select_set_timeout - helper function to setup the timeout value
  * @to: pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
 
-		set_current_state(TASK_INTERRUPTIBLE);
-
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
 
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+					   to, slack))
 			timed_out = 1;
 	}
-	__set_current_state(TASK_RUNNING);
 
 	poll_freewait(&table);
 
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 	for (;;) {
 		struct poll_list *walk;
 
-		set_current_state(TASK_INTERRUPTIBLE);
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
 
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
 			timed_out = 1;
 	}
-	__set_current_state(TASK_RUNNING);
 	return count;
 }
 
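The handshake added above relies on pairing a write barrier in pollwake() with the full barrier of set_mb() in poll_schedule_timeout(): event data is published before the ->triggered flag, and the flag is cleared with a full barrier before the next event scan. A compressed userspace sketch of the same discipline using C11 atomics in place of the kernel barrier primitives (illustrative only; single-threaded here, names invented):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int triggered;
static int event_data;	/* data the waker publishes before the flag */

static void pollwake_sketch(void)
{
	event_data = 42;				/* write data... */
	atomic_store_explicit(&triggered, 1,
			      memory_order_release);	/* ...then the flag */
}

static int poll_schedule_sketch(void)
{
	if (!atomic_load_explicit(&triggered, memory_order_acquire))
		return -1;	/* would sleep (schedule_hrtimeout_range) */
	/* counterpart of set_mb(pwq->triggered, 0): clear with a full
	 * barrier so the clear cannot pass the next event check */
	atomic_exchange(&triggered, 0);
	return event_data;	/* visible thanks to the release/acquire pair */
}

int main(void)
{
	pollwake_sketch();
	printf("poller saw %d\n", poll_schedule_sketch());
	return 0;
}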
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c211558c22..b569ff1c4dc8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -389,8 +389,14 @@ char *mangle_path(char *s, char *p, char *esc)
 }
 EXPORT_SYMBOL(mangle_path);
 
-/*
- * return the absolute path of 'dentry' residing in mount 'mnt'.
+/**
+ * seq_path - seq_file interface to print a pathname
+ * @m: the seq_file handle
+ * @path: the struct path to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path of 'path', as represented by the
+ * dentry / mnt pair in the path parameter.
  */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
@@ -462,7 +468,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 	return -1;
 }
 
-int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
+int seq_bitmap(struct seq_file *m, const unsigned long *bits,
+		unsigned int nr_bits)
 {
 	if (m->count < m->size) {
 		int len = bitmap_scnprintf(m->buf + m->count,
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a5553..92d5e8ffb639 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	*pagep = __grab_cache_page(mapping, index);
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4ba..a54b3e3f10a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/splice.h>
+#include <linux/memcontrol.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
new file mode 100644
index 000000000000..8258cf9a0317
--- /dev/null
+++ b/fs/squashfs/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the linux squashfs routines.
3#
4
5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o
8#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
new file mode 100644
index 000000000000..c837dfc2b3c6
--- /dev/null
+++ b/fs/squashfs/block.c
@@ -0,0 +1,274 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * block.c
22 */
23
24/*
25 * This file implements the low-level routines to read and decompress
26 * datablocks and metadata blocks.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32#include <linux/mutex.h>
33#include <linux/string.h>
34#include <linux/buffer_head.h>
35#include <linux/zlib.h>
36
37#include "squashfs_fs.h"
38#include "squashfs_fs_sb.h"
39#include "squashfs_fs_i.h"
40#include "squashfs.h"
41
42/*
43 * Read the metadata block length, this is stored in the first two
44 * bytes of the metadata block.
45 */
46static struct buffer_head *get_block_length(struct super_block *sb,
47 u64 *cur_index, int *offset, int *length)
48{
49 struct squashfs_sb_info *msblk = sb->s_fs_info;
50 struct buffer_head *bh;
51
52 bh = sb_bread(sb, *cur_index);
53 if (bh == NULL)
54 return NULL;
55
56 if (msblk->devblksize - *offset == 1) {
57 *length = (unsigned char) bh->b_data[*offset];
58 put_bh(bh);
59 bh = sb_bread(sb, ++(*cur_index));
60 if (bh == NULL)
61 return NULL;
62 *length |= (unsigned char) bh->b_data[0] << 8;
63 *offset = 1;
64 } else {
65 *length = (unsigned char) bh->b_data[*offset] |
66 (unsigned char) bh->b_data[*offset + 1] << 8;
67 *offset += 2;
68 }
69
70 return bh;
71}
72
73
74/*
75 * Read and decompress a metadata block or datablock. Length is non-zero
76 * if a datablock is being read (the size is stored elsewhere in the
77 * filesystem), otherwise the length is obtained from the first two bytes of
78 * the metadata block. A bit in the length field indicates if the block
79 * is stored uncompressed in the filesystem (usually because compression
80 * generated a larger block - this does occasionally happen with zlib).
81 */
82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
83 int length, u64 *next_index, int srclength)
84{
85 struct squashfs_sb_info *msblk = sb->s_fs_info;
86 struct buffer_head **bh;
87 int offset = index & ((1 << msblk->devblksize_log2) - 1);
88 u64 cur_index = index >> msblk->devblksize_log2;
89 int bytes, compressed, b = 0, k = 0, page = 0, avail;
90
91
92 bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
93 sizeof(*bh), GFP_KERNEL);
94 if (bh == NULL)
95 return -ENOMEM;
96
97 if (length) {
98 /*
99 * Datablock.
100 */
101 bytes = -offset;
102 compressed = SQUASHFS_COMPRESSED_BLOCK(length);
103 length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
104 if (next_index)
105 *next_index = index + length;
106
107 TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
108 index, compressed ? "" : "un", length, srclength);
109
110 if (length < 0 || length > srclength ||
111 (index + length) > msblk->bytes_used)
112 goto read_failure;
113
114 for (b = 0; bytes < length; b++, cur_index++) {
115 bh[b] = sb_getblk(sb, cur_index);
116 if (bh[b] == NULL)
117 goto block_release;
118 bytes += msblk->devblksize;
119 }
120 ll_rw_block(READ, b, bh);
121 } else {
122 /*
123 * Metadata block.
124 */
125 if ((index + 2) > msblk->bytes_used)
126 goto read_failure;
127
128 bh[0] = get_block_length(sb, &cur_index, &offset, &length);
129 if (bh[0] == NULL)
130 goto read_failure;
131 b = 1;
132
133 bytes = msblk->devblksize - offset;
134 compressed = SQUASHFS_COMPRESSED(length);
135 length = SQUASHFS_COMPRESSED_SIZE(length);
136 if (next_index)
137 *next_index = index + length + 2;
138
139 TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
140 compressed ? "" : "un", length);
141
142 if (length < 0 || length > srclength ||
143 (index + length) > msblk->bytes_used)
144 goto block_release;
145
146 for (; bytes < length; b++) {
147 bh[b] = sb_getblk(sb, ++cur_index);
148 if (bh[b] == NULL)
149 goto block_release;
150 bytes += msblk->devblksize;
151 }
152 ll_rw_block(READ, b - 1, bh + 1);
153 }
154
155 if (compressed) {
156 int zlib_err = 0, zlib_init = 0;
157
158 /*
159 * Uncompress block.
160 */
161
162 mutex_lock(&msblk->read_data_mutex);
163
164 msblk->stream.avail_out = 0;
165 msblk->stream.avail_in = 0;
166
167 bytes = length;
168 do {
169 if (msblk->stream.avail_in == 0 && k < b) {
170 avail = min(bytes, msblk->devblksize - offset);
171 bytes -= avail;
172 wait_on_buffer(bh[k]);
173 if (!buffer_uptodate(bh[k]))
174 goto release_mutex;
175
176 if (avail == 0) {
177 offset = 0;
178 put_bh(bh[k++]);
179 continue;
180 }
181
182 msblk->stream.next_in = bh[k]->b_data + offset;
183 msblk->stream.avail_in = avail;
184 offset = 0;
185 }
186
187 if (msblk->stream.avail_out == 0) {
188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 }
191
192 if (!zlib_init) {
193 zlib_err = zlib_inflateInit(&msblk->stream);
194 if (zlib_err != Z_OK) {
195 ERROR("zlib_inflateInit returned"
196 " unexpected result 0x%x,"
197 " srclength %d\n", zlib_err,
198 srclength);
199 goto release_mutex;
200 }
201 zlib_init = 1;
202 }
203
204 zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
205
206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK);
209
210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate returned unexpected result"
212 " 0x%x, srclength %d, avail_in %d,"
213 " avail_out %d\n", zlib_err, srclength,
214 msblk->stream.avail_in,
215 msblk->stream.avail_out);
216 goto release_mutex;
217 }
218
219 zlib_err = zlib_inflateEnd(&msblk->stream);
220 if (zlib_err != Z_OK) {
221 ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
222 " srclength %d\n", zlib_err, srclength);
223 goto release_mutex;
224 }
225 length = msblk->stream.total_out;
226 mutex_unlock(&msblk->read_data_mutex);
227 } else {
228 /*
229 * Block is uncompressed.
230 */
231 int i, in, pg_offset = 0;
232
233 for (i = 0; i < b; i++) {
234 wait_on_buffer(bh[i]);
235 if (!buffer_uptodate(bh[i]))
236 goto block_release;
237 }
238
239 for (bytes = length; k < b; k++) {
240 in = min(bytes, msblk->devblksize - offset);
241 bytes -= in;
242 while (in) {
243 if (pg_offset == PAGE_CACHE_SIZE) {
244 page++;
245 pg_offset = 0;
246 }
247 avail = min_t(int, in, PAGE_CACHE_SIZE -
248 pg_offset);
249 memcpy(buffer[page] + pg_offset,
250 bh[k]->b_data + offset, avail);
251 in -= avail;
252 pg_offset += avail;
253 offset += avail;
254 }
255 offset = 0;
256 put_bh(bh[k]);
257 }
258 }
259
260 kfree(bh);
261 return length;
262
263release_mutex:
264 mutex_unlock(&msblk->read_data_mutex);
265
266block_release:
267 for (; k < b; k++)
268 put_bh(bh[k]);
269
270read_failure:
271 ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
272 kfree(bh);
273 return -EIO;
274}
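The length word decoded by get_block_length() above is a little-endian 16-bit value whose top bit flags an uncompressed metadata block, with the remaining bits giving the on-disk size; data blocks use an analogous flag bit at 1 << 24 (SQUASHFS_COMPRESSED_BIT_BLOCK, referenced in squashfs_read_table() above). A small sketch of the metadata case, with the bit definition mirroring squashfs_fs.h:

#include <stdio.h>

#define SQUASHFS_COMPRESSED_BIT (1 << 15)

int main(void)
{
	unsigned char hdr[2] = { 0x34, 0x80 };	/* example on-disk bytes */
	int length = hdr[0] | (hdr[1] << 8);	/* as in get_block_length() */
	int compressed = !(length & SQUASHFS_COMPRESSED_BIT);

	length &= ~SQUASHFS_COMPRESSED_BIT;
	printf("%scompressed metadata block, %d bytes\n",
	       compressed ? "" : "un", length);
	return 0;
}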
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 000000000000..f29eda16d25e
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,412 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * cache.c
22 */
23
24/*
25 * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
26 * recently accessed data Squashfs uses two small metadata and fragment caches.
27 *
28 * This file implements a generic cache implementation used for both caches,
29 * plus functions layered on top of the generic cache implementation to
30 * access the metadata and fragment caches.
31 *
32 * To avoid out-of-memory and fragmentation issues with vmalloc the cache
33 * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
34 *
35 * It should be noted that the cache is not used for file datablocks, these
36 * are decompressed and cached in the page-cache in the normal way. The
37 * cache is only used to temporarily cache fragment and metadata blocks
38 * which have been read as a result of a metadata (i.e. inode or
39 * directory) or fragment access. Because metadata and fragments are packed
40 * together into blocks (to gain greater compression) the read of a particular
41 * piece of metadata or fragment will retrieve other metadata/fragments which
42 * have been packed with it; because of locality-of-reference these may be read
43 * in the near future. Temporarily caching them ensures they are available for
44 * near future access without requiring an additional read and decompress.
45 */
46
47#include <linux/fs.h>
48#include <linux/vfs.h>
49#include <linux/slab.h>
50#include <linux/vmalloc.h>
51#include <linux/sched.h>
52#include <linux/spinlock.h>
53#include <linux/wait.h>
54#include <linux/zlib.h>
55#include <linux/pagemap.h>
56
57#include "squashfs_fs.h"
58#include "squashfs_fs_sb.h"
59#include "squashfs_fs_i.h"
60#include "squashfs.h"
61
62/*
63 * Look-up block in cache, and increment usage count. If not in cache, read
64 * and decompress it from disk.
65 */
66struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
67 struct squashfs_cache *cache, u64 block, int length)
68{
69 int i, n;
70 struct squashfs_cache_entry *entry;
71
72 spin_lock(&cache->lock);
73
74 while (1) {
75 for (i = 0; i < cache->entries; i++)
76 if (cache->entry[i].block == block)
77 break;
78
79 if (i == cache->entries) {
80 /*
81 * Block not in cache, if all cache entries are used
82 * go to sleep waiting for one to become available.
83 */
84 if (cache->unused == 0) {
85 cache->num_waiters++;
86 spin_unlock(&cache->lock);
87 wait_event(cache->wait_queue, cache->unused);
88 spin_lock(&cache->lock);
89 cache->num_waiters--;
90 continue;
91 }
92
93 /*
94 * At least one unused cache entry. A simple
95 * round-robin strategy is used to choose the entry to
96 * be evicted from the cache.
97 */
98 i = cache->next_blk;
99 for (n = 0; n < cache->entries; n++) {
100 if (cache->entry[i].refcount == 0)
101 break;
102 i = (i + 1) % cache->entries;
103 }
104
105 cache->next_blk = (i + 1) % cache->entries;
106 entry = &cache->entry[i];
107
108 /*
109 * Initialise chosen cache entry, and fill it in from
110 * disk.
111 */
112 cache->unused--;
113 entry->block = block;
114 entry->refcount = 1;
115 entry->pending = 1;
116 entry->num_waiters = 0;
117 entry->error = 0;
118 spin_unlock(&cache->lock);
119
120 entry->length = squashfs_read_data(sb, entry->data,
121 block, length, &entry->next_index,
122 cache->block_size);
123
124 spin_lock(&cache->lock);
125
126 if (entry->length < 0)
127 entry->error = entry->length;
128
129 entry->pending = 0;
130
131 /*
132 * While filling this entry one or more other processes
133 * have looked it up in the cache, and have slept
134 * waiting for it to become available.
135 */
136 if (entry->num_waiters) {
137 spin_unlock(&cache->lock);
138 wake_up_all(&entry->wait_queue);
139 } else
140 spin_unlock(&cache->lock);
141
142 goto out;
143 }
144
145 /*
146 * Block already in cache. Increment refcount so it doesn't
147 * get reused until we're finished with it, if it was
148 * previously unused there's one less cache entry available
149 * for reuse.
150 */
151 entry = &cache->entry[i];
152 if (entry->refcount == 0)
153 cache->unused--;
154 entry->refcount++;
155
156 /*
157 * If the entry is currently being filled in by another process
158 * go to sleep waiting for it to become available.
159 */
160 if (entry->pending) {
161 entry->num_waiters++;
162 spin_unlock(&cache->lock);
163 wait_event(entry->wait_queue, !entry->pending);
164 } else
165 spin_unlock(&cache->lock);
166
167 goto out;
168 }
169
170out:
171 TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
172 cache->name, i, entry->block, entry->refcount, entry->error);
173
174 if (entry->error)
175 ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
176 block);
177 return entry;
178}
179
180
181/*
182 * Release cache entry, once usage count is zero it can be reused.
183 */
184void squashfs_cache_put(struct squashfs_cache_entry *entry)
185{
186 struct squashfs_cache *cache = entry->cache;
187
188 spin_lock(&cache->lock);
189 entry->refcount--;
190 if (entry->refcount == 0) {
191 cache->unused++;
192 /*
193 * If there are any processes waiting for a block to become
194 * available, wake one up.
195 */
196 if (cache->num_waiters) {
197 spin_unlock(&cache->lock);
198 wake_up(&cache->wait_queue);
199 return;
200 }
201 }
202 spin_unlock(&cache->lock);
203}
204
205/*
206 * Delete cache reclaiming all kmalloced buffers.
207 */
208void squashfs_cache_delete(struct squashfs_cache *cache)
209{
210 int i, j;
211
212 if (cache == NULL)
213 return;
214
215 for (i = 0; i < cache->entries; i++) {
216 if (cache->entry[i].data) {
217 for (j = 0; j < cache->pages; j++)
218 kfree(cache->entry[i].data[j]);
219 kfree(cache->entry[i].data);
220 }
221 }
222
223 kfree(cache->entry);
224 kfree(cache);
225}
226
227
228/*
229 * Initialise cache allocating the specified number of entries, each of
230 * size block_size. To avoid vmalloc fragmentation issues each entry
231 * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
232 */
233struct squashfs_cache *squashfs_cache_init(char *name, int entries,
234 int block_size)
235{
236 int i, j;
237 struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
238
239 if (cache == NULL) {
240 ERROR("Failed to allocate %s cache\n", name);
241 return NULL;
242 }
243
244 cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
245 if (cache->entry == NULL) {
246 ERROR("Failed to allocate %s cache\n", name);
247 goto cleanup;
248 }
249
250 cache->next_blk = 0;
251 cache->unused = entries;
252 cache->entries = entries;
253 cache->block_size = block_size;
254 cache->pages = block_size >> PAGE_CACHE_SHIFT;
255 cache->name = name;
256 cache->num_waiters = 0;
257 spin_lock_init(&cache->lock);
258 init_waitqueue_head(&cache->wait_queue);
259
260 for (i = 0; i < entries; i++) {
261 struct squashfs_cache_entry *entry = &cache->entry[i];
262
263 init_waitqueue_head(&cache->entry[i].wait_queue);
264 entry->cache = cache;
265 entry->block = SQUASHFS_INVALID_BLK;
266 entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
267 if (entry->data == NULL) {
268 ERROR("Failed to allocate %s cache entry\n", name);
269 goto cleanup;
270 }
271
272 for (j = 0; j < cache->pages; j++) {
273 entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
274 if (entry->data[j] == NULL) {
275 ERROR("Failed to allocate %s buffer\n", name);
276 goto cleanup;
277 }
278 }
279 }
280
281 return cache;
282
283cleanup:
284 squashfs_cache_delete(cache);
285 return NULL;
286}
287
288
289/*
290 * Copy up to length bytes from cache entry to buffer starting at offset bytes
291 * into the cache entry. If there aren't length bytes then copy the number of
292 * bytes available. In all cases return the number of bytes copied.
293 */
294int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
295 int offset, int length)
296{
297 int remaining = length;
298
299 if (length == 0)
300 return 0;
301 else if (buffer == NULL)
302 return min(length, entry->length - offset);
303
304 while (offset < entry->length) {
305 void *buff = entry->data[offset / PAGE_CACHE_SIZE]
306 + (offset % PAGE_CACHE_SIZE);
307 int bytes = min_t(int, entry->length - offset,
308 PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
309
310 if (bytes >= remaining) {
311 memcpy(buffer, buff, remaining);
312 remaining = 0;
313 break;
314 }
315
316 memcpy(buffer, buff, bytes);
317 buffer += bytes;
318 remaining -= bytes;
319 offset += bytes;
320 }
321
322 return length - remaining;
323}
324
325
326/*
327 * Read length bytes from metadata position <block, offset> (block is the
328 * start of the compressed block on disk, and offset is the offset into
329 * the block once decompressed). Data is packed into consecutive blocks,
330 * and length bytes may require reading more than one block.
331 */
332int squashfs_read_metadata(struct super_block *sb, void *buffer,
333 u64 *block, int *offset, int length)
334{
335 struct squashfs_sb_info *msblk = sb->s_fs_info;
336 int bytes, copied = length;
337 struct squashfs_cache_entry *entry;
338
339 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
340
341 while (length) {
342 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
343 if (entry->error)
344 return entry->error;
345 else if (*offset >= entry->length)
346 return -EIO;
347
348 bytes = squashfs_copy_data(buffer, entry, *offset, length);
349 if (buffer)
350 buffer += bytes;
351 length -= bytes;
352 *offset += bytes;
353
354 if (*offset == entry->length) {
355 *block = entry->next_index;
356 *offset = 0;
357 }
358
359 squashfs_cache_put(entry);
360 }
361
362 return copied;
363}
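Note the cursor semantics: <*block, *offset> is advanced in place, so consecutive calls walk the packed metadata stream, crossing compressed-block boundaries via next_index. A hedged caller sketch (variables hypothetical, error handling trimmed), matching the pattern the directory code below uses:

	u64 block = start_block;	/* assumed starting position */
	int offset = start_offset;
	struct squashfs_dir_header dirh;

	err = squashfs_read_metadata(sb, &dirh, &block, &offset,
			sizeof(dirh));
	if (err < 0)
		return err;
	/* The cursor now points just past the header, ready for the
	 * first directory entry. */
	err = squashfs_read_metadata(sb, dire, &block, &offset,
			sizeof(*dire));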
364
365
366/*
 367 * Look up in the fragment cache the fragment located at <start_block> in the
368 * filesystem. If necessary read and decompress it from disk.
369 */
370struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb,
371 u64 start_block, int length)
372{
373 struct squashfs_sb_info *msblk = sb->s_fs_info;
374
375 return squashfs_cache_get(sb, msblk->fragment_cache, start_block,
376 length);
377}
378
379
380/*
381 * Read and decompress the datablock located at <start_block> in the
382 * filesystem. The cache is used here to avoid duplicating locking and
383 * read/decompress code.
384 */
385struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
386 u64 start_block, int length)
387{
388 struct squashfs_sb_info *msblk = sb->s_fs_info;
389
390 return squashfs_cache_get(sb, msblk->read_page, start_block, length);
391}
392
393
394/*
395 * Read a filesystem table (uncompressed sequence of bytes) from disk
396 */
397int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
398 int length)
399{
400 int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
401 int i, res;
402 void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
403 if (data == NULL)
404 return -ENOMEM;
405
406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
407 data[i] = buffer;
408 res = squashfs_read_data(sb, data, block, length |
409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
410 kfree(data);
411 return res;
412}
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
new file mode 100644
index 000000000000..566b0eaed868
--- /dev/null
+++ b/fs/squashfs/dir.c
@@ -0,0 +1,235 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * dir.c
22 */
23
24/*
25 * This file implements code to read directories from disk.
26 *
27 * See namei.c for a description of directory organisation on disk.
28 */
29
30#include <linux/fs.h>
31#include <linux/vfs.h>
32#include <linux/slab.h>
33#include <linux/zlib.h>
34
35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h"
39
40static const unsigned char squashfs_filetype_table[] = {
41 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK
42};
43
44/*
45 * Lookup offset (f_pos) in the directory index, returning the
46 * metadata block containing it.
47 *
48 * If we get an error reading the index then return the part of the index
49 * (if any) we have managed to read - the index isn't essential, just
50 * quicker.
51 */
52static int get_dir_index_using_offset(struct super_block *sb,
53 u64 *next_block, int *next_offset, u64 index_start, int index_offset,
54 int i_count, u64 f_pos)
55{
56 struct squashfs_sb_info *msblk = sb->s_fs_info;
57 int err, i, index, length = 0;
58 struct squashfs_dir_index dir_index;
59
60 TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
61 i_count, f_pos);
62
63 /*
64 * Translate from external f_pos to the internal f_pos. This
65 * is offset by 3 because we invent "." and ".." entries which are
66 * not actually stored in the directory.
67 */
68 if (f_pos < 3)
69 return f_pos;
70 f_pos -= 3;
71
72 for (i = 0; i < i_count; i++) {
73 err = squashfs_read_metadata(sb, &dir_index, &index_start,
74 &index_offset, sizeof(dir_index));
75 if (err < 0)
76 break;
77
78 index = le32_to_cpu(dir_index.index);
79 if (index > f_pos)
80 /*
81 * Found the index we're looking for.
82 */
83 break;
84
85 err = squashfs_read_metadata(sb, NULL, &index_start,
86 &index_offset, le32_to_cpu(dir_index.size) + 1);
87 if (err < 0)
88 break;
89
90 length = index;
91 *next_block = le32_to_cpu(dir_index.start_block) +
92 msblk->directory_table;
93 }
94
95 *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
96
97 /*
98 * Translate back from internal f_pos to external f_pos.
99 */
100 return length + 3;
101}
102
103
104static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
105{
106 struct inode *inode = file->f_dentry->d_inode;
107 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
108 u64 block = squashfs_i(inode)->start + msblk->directory_table;
109 int offset = squashfs_i(inode)->offset, length = 0, dir_count, size,
110 type, err;
111 unsigned int inode_number;
112 struct squashfs_dir_header dirh;
113 struct squashfs_dir_entry *dire;
114
115 TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);
116
117 dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
118 if (dire == NULL) {
119 ERROR("Failed to allocate squashfs_dir_entry\n");
120 goto finish;
121 }
122
123 /*
124 * Return "." and ".." entries as the first two filenames in the
125 * directory. To maximise compression these two entries are not
126 * stored in the directory, and so we invent them here.
127 *
128 * It also means that the external f_pos is offset by 3 from the
129 * on-disk directory f_pos.
130 */
131 while (file->f_pos < 3) {
132 char *name;
133 int i_ino;
134
135 if (file->f_pos == 0) {
136 name = ".";
137 size = 1;
138 i_ino = inode->i_ino;
139 } else {
140 name = "..";
141 size = 2;
142 i_ino = squashfs_i(inode)->parent;
143 }
144
145 TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
146 dirent, name, size, file->f_pos, i_ino,
147 squashfs_filetype_table[1]);
148
149 if (filldir(dirent, name, size, file->f_pos, i_ino,
150 squashfs_filetype_table[1]) < 0) {
151 TRACE("Filldir returned less than 0\n");
152 goto finish;
153 }
154
155 file->f_pos += size;
156 }
157
158 length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
159 squashfs_i(inode)->dir_idx_start,
160 squashfs_i(inode)->dir_idx_offset,
161 squashfs_i(inode)->dir_idx_cnt,
162 file->f_pos);
163
164 while (length < i_size_read(inode)) {
165 /*
166 * Read directory header
167 */
168 err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
169 &offset, sizeof(dirh));
170 if (err < 0)
171 goto failed_read;
172
173 length += sizeof(dirh);
174
175 dir_count = le32_to_cpu(dirh.count) + 1;
176 while (dir_count--) {
177 /*
178 * Read directory entry.
179 */
180 err = squashfs_read_metadata(inode->i_sb, dire, &block,
181 &offset, sizeof(*dire));
182 if (err < 0)
183 goto failed_read;
184
185 size = le16_to_cpu(dire->size) + 1;
186
187 err = squashfs_read_metadata(inode->i_sb, dire->name,
188 &block, &offset, size);
189 if (err < 0)
190 goto failed_read;
191
192 length += sizeof(*dire) + size;
193
194 if (file->f_pos >= length)
195 continue;
196
197 dire->name[size] = '\0';
198 inode_number = le32_to_cpu(dirh.inode_number) +
199 ((short) le16_to_cpu(dire->inode_number));
200 type = le16_to_cpu(dire->type);
201
202 TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
203 "\n", dirent, dire->name, size,
204 file->f_pos,
205 le32_to_cpu(dirh.start_block),
206 le16_to_cpu(dire->offset),
207 inode_number,
208 squashfs_filetype_table[type]);
209
210 if (filldir(dirent, dire->name, size, file->f_pos,
211 inode_number,
212 squashfs_filetype_table[type]) < 0) {
213 TRACE("Filldir returned less than 0\n");
214 goto finish;
215 }
216
217 file->f_pos = length;
218 }
219 }
220
221finish:
222 kfree(dire);
223 return 0;
224
225failed_read:
226 ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
227 kfree(dire);
228 return 0;
229}
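To make the offset-by-3 convention concrete: readdir starts at external f_pos 0 and emits "." (size 1, advancing f_pos to 1), then ".." (size 2, advancing it to 3). Only from f_pos 3 onwards do positions correspond to on-disk directory data, which is why get_dir_index_using_offset() subtracts 3 on entry and adds it back on return.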
230
231
232const struct file_operations squashfs_dir_ops = {
233 .read = generic_read_dir,
234 .readdir = squashfs_readdir
235};
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
new file mode 100644
index 000000000000..69e971d5ddc1
--- /dev/null
+++ b/fs/squashfs/export.c
@@ -0,0 +1,155 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * export.c
22 */
23
24/*
25 * This file implements code to make Squashfs filesystems exportable (NFS etc.)
26 *
27 * The export code uses an inode lookup table to map inode numbers passed in
28 * filehandles to an inode location on disk. This table is stored compressed
 29 * into metadata blocks. A second index table is used to locate these. For
 30 * speed of access (and because it is small) this second index table is read
 31 * at mount time and cached in memory.
32 *
33 * The inode lookup table is used only by the export code, inode disk
34 * locations are directly encoded in directories, enabling direct access
35 * without an intermediate lookup for all operations except the export ops.
36 */
37
38#include <linux/fs.h>
39#include <linux/vfs.h>
40#include <linux/dcache.h>
41#include <linux/exportfs.h>
42#include <linux/zlib.h>
43
44#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h"
47#include "squashfs.h"
48
49/*
50 * Look-up inode number (ino) in table, returning the inode location.
51 */
52static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
53{
54 struct squashfs_sb_info *msblk = sb->s_fs_info;
55 int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
56 int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
57 u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
58 __le64 ino;
59 int err;
60
61 TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
62
63 err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
64 if (err < 0)
65 return err;
66
67 TRACE("squashfs_inode_lookup, inode = 0x%llx\n",
68 (u64) le64_to_cpu(ino));
69
70 return le64_to_cpu(ino);
71}
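SQUASHFS_LOOKUP_BLOCK() and SQUASHFS_LOOKUP_BLOCK_OFFSET() are not visible in this hunk. Assuming the conventional split (8-byte __le64 entries packed into 8 KiB metadata blocks), the indexing arithmetic reduces to the following runnable sketch:

	#include <stdio.h>

	#define METADATA_SIZE 8192	/* SQUASHFS_METADATA_SIZE */

	int main(void)
	{
		unsigned int ino_num = 2000;	/* 1-based inode number */
		unsigned int per_blk = METADATA_SIZE / 8;	/* 1024 entries */

		/* Mirrors the assumed SQUASHFS_LOOKUP_BLOCK{,_OFFSET} macros:
		 * inode 2000 lands in index block 1 at byte offset 7800. */
		printf("index block %u, byte offset %u\n",
		       (ino_num - 1) / per_blk, ((ino_num - 1) % per_blk) * 8);
		return 0;
	}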
72
73
74static struct dentry *squashfs_export_iget(struct super_block *sb,
75 unsigned int ino_num)
76{
77 long long ino;
78 struct dentry *dentry = ERR_PTR(-ENOENT);
79
80 TRACE("Entered squashfs_export_iget\n");
81
82 ino = squashfs_inode_lookup(sb, ino_num);
83 if (ino >= 0)
84 dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num));
85
86 return dentry;
87}
88
89
90static struct dentry *squashfs_fh_to_dentry(struct super_block *sb,
91 struct fid *fid, int fh_len, int fh_type)
92{
93 if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT)
94 || fh_len < 2)
95 return NULL;
96
97 return squashfs_export_iget(sb, fid->i32.ino);
98}
99
100
101static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
102 struct fid *fid, int fh_len, int fh_type)
103{
104 if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4)
105 return NULL;
106
107 return squashfs_export_iget(sb, fid->i32.parent_ino);
108}
109
110
111static struct dentry *squashfs_get_parent(struct dentry *child)
112{
113 struct inode *inode = child->d_inode;
114 unsigned int parent_ino = squashfs_i(inode)->parent;
115
116 return squashfs_export_iget(inode->i_sb, parent_ino);
117}
118
119
120/*
121 * Read uncompressed inode lookup table indexes off disk into memory
122 */
123__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
124 u64 lookup_table_start, unsigned int inodes)
125{
126 unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
127 __le64 *inode_lookup_table;
128 int err;
129
130 TRACE("In read_inode_lookup_table, length %d\n", length);
131
132 /* Allocate inode lookup table indexes */
133 inode_lookup_table = kmalloc(length, GFP_KERNEL);
134 if (inode_lookup_table == NULL) {
135 ERROR("Failed to allocate inode lookup table\n");
136 return ERR_PTR(-ENOMEM);
137 }
138
139 err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start,
140 length);
141 if (err < 0) {
142 ERROR("unable to read inode lookup table\n");
143 kfree(inode_lookup_table);
144 return ERR_PTR(err);
145 }
146
147 return inode_lookup_table;
148}
149
150
151const struct export_operations squashfs_export_ops = {
152 .fh_to_dentry = squashfs_fh_to_dentry,
153 .fh_to_parent = squashfs_fh_to_parent,
154 .get_parent = squashfs_get_parent
155};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
new file mode 100644
index 000000000000..717767d831df
--- /dev/null
+++ b/fs/squashfs/file.c
@@ -0,0 +1,502 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * file.c
22 */
23
24/*
25 * This file contains code for handling regular files. A regular file
26 * consists of a sequence of contiguous compressed blocks, and/or a
27 * compressed fragment block (tail-end packed block). The compressed size
28 * of each datablock is stored in a block list contained within the
29 * file inode (itself stored in one or more compressed metadata blocks).
30 *
31 * To speed up access to datablocks when reading 'large' files (256 Mbytes or
32 * larger), the code implements an index cache that caches the mapping from
33 * block index to datablock location on disk.
34 *
35 * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
36 * retaining a simple and space-efficient block list on disk. The cache
37 * is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
38 * Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
39 * The index cache is designed to be memory efficient, and by default uses
40 * 16 KiB.
41 */
42
43#include <linux/fs.h>
44#include <linux/vfs.h>
45#include <linux/kernel.h>
46#include <linux/slab.h>
47#include <linux/string.h>
48#include <linux/pagemap.h>
49#include <linux/mutex.h>
50#include <linux/zlib.h>
51
52#include "squashfs_fs.h"
53#include "squashfs_fs_sb.h"
54#include "squashfs_fs_i.h"
55#include "squashfs.h"
56
57/*
58 * Locate cache slot in range [offset, index] for specified inode. If
 59 * there's more than one, return the slot closest to index.
60 */
61static struct meta_index *locate_meta_index(struct inode *inode, int offset,
62 int index)
63{
64 struct meta_index *meta = NULL;
65 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
66 int i;
67
68 mutex_lock(&msblk->meta_index_mutex);
69
70 TRACE("locate_meta_index: index %d, offset %d\n", index, offset);
71
72 if (msblk->meta_index == NULL)
73 goto not_allocated;
74
75 for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
76 if (msblk->meta_index[i].inode_number == inode->i_ino &&
77 msblk->meta_index[i].offset >= offset &&
78 msblk->meta_index[i].offset <= index &&
79 msblk->meta_index[i].locked == 0) {
80 TRACE("locate_meta_index: entry %d, offset %d\n", i,
81 msblk->meta_index[i].offset);
82 meta = &msblk->meta_index[i];
83 offset = meta->offset;
84 }
85 }
86
87 if (meta)
88 meta->locked = 1;
89
90not_allocated:
91 mutex_unlock(&msblk->meta_index_mutex);
92
93 return meta;
94}
95
96
97/*
98 * Find and initialise an empty cache slot for index offset.
99 */
100static struct meta_index *empty_meta_index(struct inode *inode, int offset,
101 int skip)
102{
103 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
104 struct meta_index *meta = NULL;
105 int i;
106
107 mutex_lock(&msblk->meta_index_mutex);
108
109 TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip);
110
111 if (msblk->meta_index == NULL) {
112 /*
113 * First time cache index has been used, allocate and
114 * initialise. The cache index could be allocated at
115 * mount time but doing it here means it is allocated only
116 * if a 'large' file is read.
117 */
118 msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS,
119 sizeof(*(msblk->meta_index)), GFP_KERNEL);
120 if (msblk->meta_index == NULL) {
121 ERROR("Failed to allocate meta_index\n");
122 goto failed;
123 }
124 for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
125 msblk->meta_index[i].inode_number = 0;
126 msblk->meta_index[i].locked = 0;
127 }
128 msblk->next_meta_index = 0;
129 }
130
131 for (i = SQUASHFS_META_SLOTS; i &&
132 msblk->meta_index[msblk->next_meta_index].locked; i--)
133 msblk->next_meta_index = (msblk->next_meta_index + 1) %
134 SQUASHFS_META_SLOTS;
135
136 if (i == 0) {
137 TRACE("empty_meta_index: failed!\n");
138 goto failed;
139 }
140
141 TRACE("empty_meta_index: returned meta entry %d, %p\n",
142 msblk->next_meta_index,
143 &msblk->meta_index[msblk->next_meta_index]);
144
145 meta = &msblk->meta_index[msblk->next_meta_index];
146 msblk->next_meta_index = (msblk->next_meta_index + 1) %
147 SQUASHFS_META_SLOTS;
148
149 meta->inode_number = inode->i_ino;
150 meta->offset = offset;
151 meta->skip = skip;
152 meta->entries = 0;
153 meta->locked = 1;
154
155failed:
156 mutex_unlock(&msblk->meta_index_mutex);
157 return meta;
158}
159
160
161static void release_meta_index(struct inode *inode, struct meta_index *meta)
162{
163 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
164 mutex_lock(&msblk->meta_index_mutex);
165 meta->locked = 0;
166 mutex_unlock(&msblk->meta_index_mutex);
167}
168
169
170/*
171 * Read the next n blocks from the block list, starting from
172 * metadata block <start_block, offset>.
173 */
174static long long read_indexes(struct super_block *sb, int n,
175 u64 *start_block, int *offset)
176{
177 int err, i;
178 long long block = 0;
179 __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
180
181 if (blist == NULL) {
182 ERROR("read_indexes: Failed to allocate block_list\n");
183 return -ENOMEM;
184 }
185
186 while (n) {
187 int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
188
189 err = squashfs_read_metadata(sb, blist, start_block,
190 offset, blocks << 2);
191 if (err < 0) {
192 ERROR("read_indexes: reading block [%llx:%x]\n",
193 *start_block, *offset);
194 goto failure;
195 }
196
197 for (i = 0; i < blocks; i++) {
198 int size = le32_to_cpu(blist[i]);
199 block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size);
200 }
201 n -= blocks;
202 }
203
204 kfree(blist);
205 return block;
206
207failure:
208 kfree(blist);
209 return err;
210}
211
212
213/*
 214 * Each cache index slot has SQUASHFS_META_ENTRIES entries, each of which
215 * can cache one index -> datablock/blocklist-block mapping. We wish
216 * to distribute these over the length of the file, entry[0] maps index x,
217 * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on.
218 * The larger the file, the greater the skip factor. The skip factor is
219 * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure
220 * the number of metadata blocks that need to be read fits into the cache.
221 * If the skip factor is limited in this way then the file will use multiple
222 * slots.
223 */
224static inline int calculate_skip(int blocks)
225{
226 int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
227 * SQUASHFS_META_INDEXES);
228 return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
229}
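A worked example, assuming the conventional constants (SQUASHFS_META_ENTRIES = 127, SQUASHFS_META_INDEXES = 2048 and SQUASHFS_CACHED_BLKS = 8, none of which appear in this hunk):

	/* A 4 GiB file of 128 KiB blocks has 32768 blocks:
	 *   skip = 32768 / ((127 + 1) * 2048) = 0, clamped to
	 *   min(8 - 1, 0 + 1) = 1,
	 * so each slot entry covers 2048 consecutive blocks (256 MiB).
	 * Files of roughly 32 GiB and above push skip to 2 and beyond;
	 * around 192 GiB the min() caps it at 7, and still-larger files
	 * simply consume additional slots. */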
230
231
232/*
233 * Search and grow the index cache for the specified inode, returning the
234 * on-disk locations of the datablock and block list metadata block
235 * <index_block, index_offset> for index (scaled to nearest cache index).
236 */
237static int fill_meta_index(struct inode *inode, int index,
238 u64 *index_block, int *index_offset, u64 *data_block)
239{
240 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
241 int skip = calculate_skip(i_size_read(inode) >> msblk->block_log);
242 int offset = 0;
243 struct meta_index *meta;
244 struct meta_entry *meta_entry;
245 u64 cur_index_block = squashfs_i(inode)->block_list_start;
246 int cur_offset = squashfs_i(inode)->offset;
247 u64 cur_data_block = squashfs_i(inode)->start;
248 int err, i;
249
250 /*
251 * Scale index to cache index (cache slot entry)
252 */
253 index /= SQUASHFS_META_INDEXES * skip;
254
255 while (offset < index) {
256 meta = locate_meta_index(inode, offset + 1, index);
257
258 if (meta == NULL) {
259 meta = empty_meta_index(inode, offset + 1, skip);
260 if (meta == NULL)
261 goto all_done;
262 } else {
263 offset = index < meta->offset + meta->entries ? index :
264 meta->offset + meta->entries - 1;
265 meta_entry = &meta->meta_entry[offset - meta->offset];
266 cur_index_block = meta_entry->index_block +
267 msblk->inode_table;
268 cur_offset = meta_entry->offset;
269 cur_data_block = meta_entry->data_block;
270 TRACE("get_meta_index: offset %d, meta->offset %d, "
271 "meta->entries %d\n", offset, meta->offset,
272 meta->entries);
273 TRACE("get_meta_index: index_block 0x%llx, offset 0x%x"
274 " data_block 0x%llx\n", cur_index_block,
275 cur_offset, cur_data_block);
276 }
277
278 /*
279 * If necessary grow cache slot by reading block list. Cache
280 * slot is extended up to index or to the end of the slot, in
281 * which case further slots will be used.
282 */
283 for (i = meta->offset + meta->entries; i <= index &&
284 i < meta->offset + SQUASHFS_META_ENTRIES; i++) {
285 int blocks = skip * SQUASHFS_META_INDEXES;
286 long long res = read_indexes(inode->i_sb, blocks,
287 &cur_index_block, &cur_offset);
288
289 if (res < 0) {
290 if (meta->entries == 0)
291 /*
292 * Don't leave an empty slot on read
293 * error allocated to this inode...
294 */
295 meta->inode_number = 0;
296 err = res;
297 goto failed;
298 }
299
300 cur_data_block += res;
301 meta_entry = &meta->meta_entry[i - meta->offset];
302 meta_entry->index_block = cur_index_block -
303 msblk->inode_table;
304 meta_entry->offset = cur_offset;
305 meta_entry->data_block = cur_data_block;
306 meta->entries++;
307 offset++;
308 }
309
310 TRACE("get_meta_index: meta->offset %d, meta->entries %d\n",
311 meta->offset, meta->entries);
312
313 release_meta_index(inode, meta);
314 }
315
316all_done:
317 *index_block = cur_index_block;
318 *index_offset = cur_offset;
319 *data_block = cur_data_block;
320
321 /*
322 * Scale cache index (cache slot entry) to index
323 */
324 return offset * SQUASHFS_META_INDEXES * skip;
325
326failed:
327 release_meta_index(inode, meta);
328 return err;
329}
330
331
332/*
333 * Get the on-disk location and compressed size of the datablock
334 * specified by index. Fill_meta_index() does most of the work.
335 */
336static int read_blocklist(struct inode *inode, int index, u64 *block)
337{
338 u64 start;
339 long long blks;
340 int offset;
341 __le32 size;
342 int res = fill_meta_index(inode, index, &start, &offset, block);
343
344 TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
345 " 0x%x, block 0x%llx\n", res, index, start, offset,
346 *block);
347
348 if (res < 0)
349 return res;
350
351 /*
 352	 * res contains the index of the mapping returned by fill_meta_index();
 353	 * this will likely be less than the desired index (because the
354 * meta_index cache works at a higher granularity). Read any
355 * extra block indexes needed.
356 */
357 if (res < index) {
358 blks = read_indexes(inode->i_sb, index - res, &start, &offset);
359 if (blks < 0)
360 return (int) blks;
361 *block += blks;
362 }
363
364 /*
365 * Read length of block specified by index.
366 */
367 res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
368 sizeof(size));
369 if (res < 0)
370 return res;
371 return le32_to_cpu(size);
372}
373
374
375static int squashfs_readpage(struct file *file, struct page *page)
376{
377 struct inode *inode = page->mapping->host;
378 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
379 int bytes, i, offset = 0, sparse = 0;
380 struct squashfs_cache_entry *buffer = NULL;
381 void *pageaddr;
382
383 int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
384 int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
385 int start_index = page->index & ~mask;
386 int end_index = start_index | mask;
387 int file_end = i_size_read(inode) >> msblk->block_log;
388
389 TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
390 page->index, squashfs_i(inode)->start);
391
392 if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
393 PAGE_CACHE_SHIFT))
394 goto out;
395
396 if (index < file_end || squashfs_i(inode)->fragment_block ==
397 SQUASHFS_INVALID_BLK) {
398 /*
399 * Reading a datablock from disk. Need to read block list
400 * to get location and block size.
401 */
402 u64 block = 0;
403 int bsize = read_blocklist(inode, index, &block);
404 if (bsize < 0)
405 goto error_out;
406
407 if (bsize == 0) { /* hole */
408 bytes = index == file_end ?
409 (i_size_read(inode) & (msblk->block_size - 1)) :
410 msblk->block_size;
411 sparse = 1;
412 } else {
413 /*
414 * Read and decompress datablock.
415 */
416 buffer = squashfs_get_datablock(inode->i_sb,
417 block, bsize);
418 if (buffer->error) {
419 ERROR("Unable to read page, block %llx, size %x"
420 "\n", block, bsize);
421 squashfs_cache_put(buffer);
422 goto error_out;
423 }
424 bytes = buffer->length;
425 }
426 } else {
427 /*
428 * Datablock is stored inside a fragment (tail-end packed
429 * block).
430 */
431 buffer = squashfs_get_fragment(inode->i_sb,
432 squashfs_i(inode)->fragment_block,
433 squashfs_i(inode)->fragment_size);
434
435 if (buffer->error) {
436 ERROR("Unable to read page, block %llx, size %x\n",
437 squashfs_i(inode)->fragment_block,
438 squashfs_i(inode)->fragment_size);
439 squashfs_cache_put(buffer);
440 goto error_out;
441 }
442 bytes = i_size_read(inode) & (msblk->block_size - 1);
443 offset = squashfs_i(inode)->fragment_offset;
444 }
445
446 /*
447 * Loop copying datablock into pages. As the datablock likely covers
448 * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
449 * grab the pages from the page cache, except for the page that we've
450 * been called to fill.
451 */
452 for (i = start_index; i <= end_index && bytes > 0; i++,
453 bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
454 struct page *push_page;
455 int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
456
457 TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
458
459 push_page = (i == page->index) ? page :
460 grab_cache_page_nowait(page->mapping, i);
461
462 if (!push_page)
463 continue;
464
465 if (PageUptodate(push_page))
466 goto skip_page;
467
468 pageaddr = kmap_atomic(push_page, KM_USER0);
469 squashfs_copy_data(pageaddr, buffer, offset, avail);
470 memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
471 kunmap_atomic(pageaddr, KM_USER0);
472 flush_dcache_page(push_page);
473 SetPageUptodate(push_page);
474skip_page:
475 unlock_page(push_page);
476 if (i != page->index)
477 page_cache_release(push_page);
478 }
479
480 if (!sparse)
481 squashfs_cache_put(buffer);
482
483 return 0;
484
485error_out:
486 SetPageError(page);
487out:
488 pageaddr = kmap_atomic(page, KM_USER0);
489 memset(pageaddr, 0, PAGE_CACHE_SIZE);
490 kunmap_atomic(pageaddr, KM_USER0);
491 flush_dcache_page(page);
492 if (!PageError(page))
493 SetPageUptodate(page);
494 unlock_page(page);
495
496 return 0;
497}
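The shift/mask bookkeeping at the top of squashfs_readpage() is easiest to check with numbers. Assuming 4 KiB pages (PAGE_CACHE_SHIFT = 12) and the default 128 KiB block (block_log = 17):

	/* One datablock spans 1 << (17 - 12) = 32 pages, so:
	 *   mask        = 31
	 *   page 100 -> index = 100 >> 5 = 3	(its datablock)
	 *   start_index = 100 & ~31 = 96
	 *   end_index   = 96 | 31 = 127
	 * and a single decompress fills pages 96..127 of the mapping. */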
498
499
500const struct address_space_operations squashfs_aops = {
501 .readpage = squashfs_readpage
502};
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
new file mode 100644
index 000000000000..b5a2c15bbbc7
--- /dev/null
+++ b/fs/squashfs/fragment.c
@@ -0,0 +1,98 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * fragment.c
22 */
23
24/*
25 * This file implements code to handle compressed fragments (tail-end packed
26 * datablocks).
27 *
28 * Regular files contain a fragment index which is mapped to a fragment
29 * location on disk and compressed size using a fragment lookup table.
30 * Like everything in Squashfs this fragment lookup table is itself stored
31 * compressed into metadata blocks. A second index table is used to locate
 32 * these. For speed of access (and because it is small) this second index
 33 * table is read at mount time and cached in memory.
34 */
35
36#include <linux/fs.h>
37#include <linux/vfs.h>
38#include <linux/slab.h>
39#include <linux/zlib.h>
40
41#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h"
44#include "squashfs.h"
45
46/*
 47 * Look up a fragment using the fragment index table. Return the on-disk
 48 * location of the fragment and its compressed size.
49 */
50int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
51 u64 *fragment_block)
52{
53 struct squashfs_sb_info *msblk = sb->s_fs_info;
54 int block = SQUASHFS_FRAGMENT_INDEX(fragment);
55 int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment);
56 u64 start_block = le64_to_cpu(msblk->fragment_index[block]);
57 struct squashfs_fragment_entry fragment_entry;
58 int size;
59
60 size = squashfs_read_metadata(sb, &fragment_entry, &start_block,
61 &offset, sizeof(fragment_entry));
62 if (size < 0)
63 return size;
64
65 *fragment_block = le64_to_cpu(fragment_entry.start_block);
66 size = le32_to_cpu(fragment_entry.size);
67
68 return size;
69}
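The SQUASHFS_FRAGMENT_INDEX{,_OFFSET} macros follow the same two-level pattern as the inode lookup table above. Assuming the usual 16-byte squashfs_fragment_entry (a __le64 start block, a __le32 size, plus padding), 512 entries fit in one 8 KiB metadata block, so fragment 700, say, resolves to index block 700 / 512 = 1 at byte offset (700 % 512) * 16 = 3008.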
70
71
72/*
73 * Read the uncompressed fragment lookup table indexes off disk into memory
74 */
75__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
76 u64 fragment_table_start, unsigned int fragments)
77{
78 unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
79 __le64 *fragment_index;
80 int err;
81
82 /* Allocate fragment lookup table indexes */
83 fragment_index = kmalloc(length, GFP_KERNEL);
84 if (fragment_index == NULL) {
85 ERROR("Failed to allocate fragment index table\n");
86 return ERR_PTR(-ENOMEM);
87 }
88
89 err = squashfs_read_table(sb, fragment_index, fragment_table_start,
90 length);
91 if (err < 0) {
92 ERROR("unable to read fragment index table\n");
93 kfree(fragment_index);
94 return ERR_PTR(err);
95 }
96
97 return fragment_index;
98}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
new file mode 100644
index 000000000000..3795b837ba28
--- /dev/null
+++ b/fs/squashfs/id.c
@@ -0,0 +1,94 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * id.c
22 */
23
24/*
25 * This file implements code to handle uids and gids.
26 *
27 * For space efficiency regular files store uid and gid indexes, which are
28 * converted to 32-bit uids/gids using an id look up table. This table is
29 * stored compressed into metadata blocks. A second index table is used to
 30 * locate these. For speed of access (and because it is small) this second
 31 * index table is read at mount time and cached in memory.
32 */
33
34#include <linux/fs.h>
35#include <linux/vfs.h>
36#include <linux/slab.h>
37#include <linux/zlib.h>
38
39#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h"
42#include "squashfs.h"
43
44/*
 45 * Map a uid/gid index to a real 32-bit uid/gid using the id lookup table.
46 */
47int squashfs_get_id(struct super_block *sb, unsigned int index,
48 unsigned int *id)
49{
50 struct squashfs_sb_info *msblk = sb->s_fs_info;
51 int block = SQUASHFS_ID_BLOCK(index);
52 int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
53 u64 start_block = le64_to_cpu(msblk->id_table[block]);
54 __le32 disk_id;
55 int err;
56
57 err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
58 sizeof(disk_id));
59 if (err < 0)
60 return err;
61
62 *id = le32_to_cpu(disk_id);
63 return 0;
64}
65
66
67/*
68 * Read uncompressed id lookup table indexes from disk into memory
69 */
70__le64 *squashfs_read_id_index_table(struct super_block *sb,
71 u64 id_table_start, unsigned short no_ids)
72{
73 unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
74 __le64 *id_table;
75 int err;
76
77 TRACE("In read_id_index_table, length %d\n", length);
78
79 /* Allocate id lookup table indexes */
80 id_table = kmalloc(length, GFP_KERNEL);
81 if (id_table == NULL) {
82 ERROR("Failed to allocate id index table\n");
83 return ERR_PTR(-ENOMEM);
84 }
85
86 err = squashfs_read_table(sb, id_table, id_table_start, length);
87 if (err < 0) {
88 ERROR("unable to read id index table\n");
89 kfree(id_table);
90 return ERR_PTR(err);
91 }
92
93 return id_table;
94}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
new file mode 100644
index 000000000000..7a63398bb855
--- /dev/null
+++ b/fs/squashfs/inode.c
@@ -0,0 +1,346 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * inode.c
22 */
23
24/*
25 * This file implements code to create and read inodes from disk.
26 *
 27 * Inodes in Squashfs are identified by a 48-bit inode number which encodes the
28 * location of the compressed metadata block containing the inode, and the byte
29 * offset into that block where the inode is placed (<block, offset>).
30 *
31 * To maximise compression there are different inodes for each file type
32 * (regular file, directory, device, etc.), the inode contents and length
33 * varying with the type.
34 *
35 * To further maximise compression, two types of regular file inode and
36 * directory inode are defined: inodes optimised for frequently occurring
37 * regular files and directories, and extended types where extra
38 * information has to be stored.
39 */
40
41#include <linux/fs.h>
42#include <linux/vfs.h>
43#include <linux/zlib.h>
44
45#include "squashfs_fs.h"
46#include "squashfs_fs_sb.h"
47#include "squashfs_fs_i.h"
48#include "squashfs.h"
49
50/*
51 * Initialise VFS inode with the base inode information common to all
52 * Squashfs inode types. Sqsh_ino contains the unswapped base inode
53 * off disk.
54 */
55static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
56 struct squashfs_base_inode *sqsh_ino)
57{
58 int err;
59
60 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid);
61 if (err)
62 return err;
63
64 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid);
65 if (err)
66 return err;
67
68 inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
69 inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
70 inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
71 inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
72 inode->i_mode = le16_to_cpu(sqsh_ino->mode);
73 inode->i_size = 0;
74
75 return err;
76}
77
78
79struct inode *squashfs_iget(struct super_block *sb, long long ino,
80 unsigned int ino_number)
81{
82 struct inode *inode = iget_locked(sb, ino_number);
83 int err;
84
85 TRACE("Entered squashfs_iget\n");
86
87 if (!inode)
88 return ERR_PTR(-ENOMEM);
89 if (!(inode->i_state & I_NEW))
90 return inode;
91
92 err = squashfs_read_inode(inode, ino);
93 if (err) {
94 iget_failed(inode);
95 return ERR_PTR(err);
96 }
97
98 unlock_new_inode(inode);
99 return inode;
100}
101
102
103/*
104 * Initialise VFS inode by reading inode from inode table (compressed
 105 * metadata). The format and amount of data read depend on the inode type.
106 */
107int squashfs_read_inode(struct inode *inode, long long ino)
108{
109 struct super_block *sb = inode->i_sb;
110 struct squashfs_sb_info *msblk = sb->s_fs_info;
111 u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
112 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
113 union squashfs_inode squashfs_ino;
114 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
115
116 TRACE("Entered squashfs_read_inode\n");
117
118 /*
119 * Read inode base common to all inode types.
120 */
121 err = squashfs_read_metadata(sb, sqshb_ino, &block,
122 &offset, sizeof(*sqshb_ino));
123 if (err < 0)
124 goto failed_read;
125
126 err = squashfs_new_inode(sb, inode, sqshb_ino);
127 if (err)
128 goto failed_read;
129
130 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
131 offset = SQUASHFS_INODE_OFFSET(ino);
132
133 type = le16_to_cpu(sqshb_ino->inode_type);
134 switch (type) {
135 case SQUASHFS_REG_TYPE: {
 136		unsigned int frag_offset, frag; int frag_size;	/* may be -ve */
137 u64 frag_blk;
138 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
139
140 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
141 sizeof(*sqsh_ino));
142 if (err < 0)
143 goto failed_read;
144
145 frag = le32_to_cpu(sqsh_ino->fragment);
146 if (frag != SQUASHFS_INVALID_FRAG) {
147 frag_offset = le32_to_cpu(sqsh_ino->offset);
148 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
149 if (frag_size < 0) {
150 err = frag_size;
151 goto failed_read;
152 }
153 } else {
154 frag_blk = SQUASHFS_INVALID_BLK;
155 frag_size = 0;
156 frag_offset = 0;
157 }
158
159 inode->i_nlink = 1;
160 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
161 inode->i_fop = &generic_ro_fops;
162 inode->i_mode |= S_IFREG;
163 inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
164 squashfs_i(inode)->fragment_block = frag_blk;
165 squashfs_i(inode)->fragment_size = frag_size;
166 squashfs_i(inode)->fragment_offset = frag_offset;
167 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
168 squashfs_i(inode)->block_list_start = block;
169 squashfs_i(inode)->offset = offset;
170 inode->i_data.a_ops = &squashfs_aops;
171
172 TRACE("File inode %x:%x, start_block %llx, block_list_start "
173 "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
174 offset, squashfs_i(inode)->start, block, offset);
175 break;
176 }
177 case SQUASHFS_LREG_TYPE: {
 178		unsigned int frag_offset, frag; int frag_size;	/* may be -ve */
179 u64 frag_blk;
180 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
181
182 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
183 sizeof(*sqsh_ino));
184 if (err < 0)
185 goto failed_read;
186
187 frag = le32_to_cpu(sqsh_ino->fragment);
188 if (frag != SQUASHFS_INVALID_FRAG) {
189 frag_offset = le32_to_cpu(sqsh_ino->offset);
190 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
191 if (frag_size < 0) {
192 err = frag_size;
193 goto failed_read;
194 }
195 } else {
196 frag_blk = SQUASHFS_INVALID_BLK;
197 frag_size = 0;
198 frag_offset = 0;
199 }
200
201 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
202 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
203 inode->i_fop = &generic_ro_fops;
204 inode->i_mode |= S_IFREG;
205 inode->i_blocks = ((inode->i_size -
206 le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
207
208 squashfs_i(inode)->fragment_block = frag_blk;
209 squashfs_i(inode)->fragment_size = frag_size;
210 squashfs_i(inode)->fragment_offset = frag_offset;
211 squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
212 squashfs_i(inode)->block_list_start = block;
213 squashfs_i(inode)->offset = offset;
214 inode->i_data.a_ops = &squashfs_aops;
215
216 TRACE("File inode %x:%x, start_block %llx, block_list_start "
217 "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
218 offset, squashfs_i(inode)->start, block, offset);
219 break;
220 }
221 case SQUASHFS_DIR_TYPE: {
222 struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir;
223
224 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
225 sizeof(*sqsh_ino));
226 if (err < 0)
227 goto failed_read;
228
229 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
230 inode->i_size = le16_to_cpu(sqsh_ino->file_size);
231 inode->i_op = &squashfs_dir_inode_ops;
232 inode->i_fop = &squashfs_dir_ops;
233 inode->i_mode |= S_IFDIR;
234 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
235 squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
236 squashfs_i(inode)->dir_idx_cnt = 0;
237 squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
238
239 TRACE("Directory inode %x:%x, start_block %llx, offset %x\n",
240 SQUASHFS_INODE_BLK(ino), offset,
241 squashfs_i(inode)->start,
242 le16_to_cpu(sqsh_ino->offset));
243 break;
244 }
245 case SQUASHFS_LDIR_TYPE: {
246 struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir;
247
248 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
249 sizeof(*sqsh_ino));
250 if (err < 0)
251 goto failed_read;
252
253 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
254 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
255 inode->i_op = &squashfs_dir_inode_ops;
256 inode->i_fop = &squashfs_dir_ops;
257 inode->i_mode |= S_IFDIR;
258 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
259 squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
260 squashfs_i(inode)->dir_idx_start = block;
261 squashfs_i(inode)->dir_idx_offset = offset;
262 squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count);
263 squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
264
265 TRACE("Long directory inode %x:%x, start_block %llx, offset "
266 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
267 squashfs_i(inode)->start,
268 le16_to_cpu(sqsh_ino->offset));
269 break;
270 }
271 case SQUASHFS_SYMLINK_TYPE:
272 case SQUASHFS_LSYMLINK_TYPE: {
273 struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink;
274
275 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
276 sizeof(*sqsh_ino));
277 if (err < 0)
278 goto failed_read;
279
280 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
281 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
282 inode->i_op = &page_symlink_inode_operations;
283 inode->i_data.a_ops = &squashfs_symlink_aops;
284 inode->i_mode |= S_IFLNK;
285 squashfs_i(inode)->start = block;
286 squashfs_i(inode)->offset = offset;
287
288 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
289 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
290 block, offset);
291 break;
292 }
293 case SQUASHFS_BLKDEV_TYPE:
294 case SQUASHFS_CHRDEV_TYPE:
295 case SQUASHFS_LBLKDEV_TYPE:
296 case SQUASHFS_LCHRDEV_TYPE: {
297 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
298 unsigned int rdev;
299
300 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
301 sizeof(*sqsh_ino));
302 if (err < 0)
303 goto failed_read;
304
305 if (type == SQUASHFS_CHRDEV_TYPE)
306 inode->i_mode |= S_IFCHR;
307 else
308 inode->i_mode |= S_IFBLK;
309 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
310 rdev = le32_to_cpu(sqsh_ino->rdev);
311 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
312
313 TRACE("Device inode %x:%x, rdev %x\n",
314 SQUASHFS_INODE_BLK(ino), offset, rdev);
315 break;
316 }
317 case SQUASHFS_FIFO_TYPE:
318 case SQUASHFS_SOCKET_TYPE:
319 case SQUASHFS_LFIFO_TYPE:
320 case SQUASHFS_LSOCKET_TYPE: {
321 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
322
323 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
324 sizeof(*sqsh_ino));
325 if (err < 0)
326 goto failed_read;
327
328 if (type == SQUASHFS_FIFO_TYPE)
329 inode->i_mode |= S_IFIFO;
330 else
331 inode->i_mode |= S_IFSOCK;
332 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
333 init_special_inode(inode, inode->i_mode, 0);
334 break;
335 }
336 default:
337 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
338 return -EINVAL;
339 }
340
341 return 0;
342
343failed_read:
344 ERROR("Unable to read inode 0x%llx\n", ino);
345 return err;
346}
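SQUASHFS_INODE_BLK() and SQUASHFS_INODE_OFFSET() are defined beyond the part of squashfs_fs.h included in this section. Assuming the conventional packing (byte offset in the low 16 bits, metadata block start above them), the 48-bit encoding round-trips as this runnable sketch shows:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed layout, mirroring SQUASHFS_MKINODE(blk, off):
		 * low 16 bits = offset into the decompressed block,
		 * upper bits  = block start within the inode table. */
		long long blk = 0x1a000, off = 0x1c4;
		long long ino = (blk << 16) | off;

		printf("ino 0x%llx -> blk 0x%llx, off 0x%llx\n",
		       ino, ino >> 16, ino & 0xffff);
		return 0;
	}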
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
new file mode 100644
index 000000000000..9e398653b22b
--- /dev/null
+++ b/fs/squashfs/namei.c
@@ -0,0 +1,242 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * namei.c
22 */
23
24/*
25 * This file implements code to do filename lookup in directories.
26 *
27 * Like inodes, directories are packed into compressed metadata blocks, stored
28 * in a directory table. Directories are accessed using the start address of
29 * the metablock containing the directory and the offset into the
30 * decompressed block (<block, offset>).
31 *
32 * Directories are organised in a slightly complex way, and are not simply
33 * a list of file names. The organisation takes advantage of the
34 * fact that (in most cases) the inodes of the files will be in the same
35 * compressed metadata block, and therefore, can share the start block.
36 * Directories are therefore organised in a two level list, a directory
37 * header containing the shared start block value, and a sequence of directory
 38 * entries, each of which shares that start block. A new directory header
39 * is written once/if the inode start block changes. The directory
40 * header/directory entry list is repeated as many times as necessary.
41 *
42 * Directories are sorted, and can contain a directory index to speed up
43 * file lookup. Directory indexes store one entry per metablock, each entry
44 * storing the index/filename mapping to the first directory header
45 * in each metadata block. Directories are sorted in alphabetical order,
46 * and at lookup the index is scanned linearly looking for the first filename
47 * alphabetically larger than the filename being looked up. At this point the
48 * location of the metadata block the filename is in has been found.
 49 * The general idea of the index is to ensure that only one metadata block
 50 * needs to be decompressed per lookup, irrespective of the directory length.
51 * This scheme has the advantage that it doesn't require extra memory overhead
52 * and doesn't require much extra storage on disk.
53 */
54
55#include <linux/fs.h>
56#include <linux/vfs.h>
57#include <linux/slab.h>
58#include <linux/string.h>
59#include <linux/dcache.h>
60#include <linux/zlib.h>
61
62#include "squashfs_fs.h"
63#include "squashfs_fs_sb.h"
64#include "squashfs_fs_i.h"
65#include "squashfs.h"
66
67/*
68 * Lookup name in the directory index, returning the location of the metadata
69 * block containing it, and the directory index this represents.
70 *
71 * If we get an error reading the index then return the part of the index
72 * (if any) we have managed to read - the index isn't essential, just
73 * quicker.
74 */
75static int get_dir_index_using_name(struct super_block *sb,
76 u64 *next_block, int *next_offset, u64 index_start,
77 int index_offset, int i_count, const char *name,
78 int len)
79{
80 struct squashfs_sb_info *msblk = sb->s_fs_info;
81 int i, size, length = 0, err;
82 struct squashfs_dir_index *index;
83 char *str;
84
85 TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
86
87 index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
88 if (index == NULL) {
89 ERROR("Failed to allocate squashfs_dir_index\n");
90 goto out;
91 }
92
93 str = &index->name[SQUASHFS_NAME_LEN + 1];
94 strncpy(str, name, len);
95 str[len] = '\0';
96
97 for (i = 0; i < i_count; i++) {
98 err = squashfs_read_metadata(sb, index, &index_start,
99 &index_offset, sizeof(*index));
100 if (err < 0)
101 break;
102
103
104 size = le32_to_cpu(index->size) + 1;
105
106 err = squashfs_read_metadata(sb, index->name, &index_start,
107 &index_offset, size);
108 if (err < 0)
109 break;
110
111 index->name[size] = '\0';
112
113 if (strcmp(index->name, str) > 0)
114 break;
115
116 length = le32_to_cpu(index->index);
117 *next_block = le32_to_cpu(index->start_block) +
118 msblk->directory_table;
119 }
120
121 *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
122 kfree(index);
123
124out:
125 /*
126 * Return index (f_pos) of the looked up metadata block. Translate
127 * from internal f_pos to external f_pos which is offset by 3 because
128 * we invent "." and ".." entries which are not actually stored in the
129 * directory.
130 */
131 return length + 3;
132}
133
134
135static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
136 struct nameidata *nd)
137{
138 const unsigned char *name = dentry->d_name.name;
139 int len = dentry->d_name.len;
140 struct inode *inode = NULL;
141 struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info;
142 struct squashfs_dir_header dirh;
143 struct squashfs_dir_entry *dire;
144 u64 block = squashfs_i(dir)->start + msblk->directory_table;
145 int offset = squashfs_i(dir)->offset;
146 int err, length = 0, dir_count, size;
147
148 TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
149
150 dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
151 if (dire == NULL) {
152 ERROR("Failed to allocate squashfs_dir_entry\n");
153 return ERR_PTR(-ENOMEM);
154 }
155
156 if (len > SQUASHFS_NAME_LEN) {
157 err = -ENAMETOOLONG;
158 goto failed;
159 }
160
161 length = get_dir_index_using_name(dir->i_sb, &block, &offset,
162 squashfs_i(dir)->dir_idx_start,
163 squashfs_i(dir)->dir_idx_offset,
164 squashfs_i(dir)->dir_idx_cnt, name, len);
165
166 while (length < i_size_read(dir)) {
167 /*
168 * Read directory header.
169 */
170 err = squashfs_read_metadata(dir->i_sb, &dirh, &block,
171 &offset, sizeof(dirh));
172 if (err < 0)
173 goto read_failure;
174
175 length += sizeof(dirh);
176
177 dir_count = le32_to_cpu(dirh.count) + 1;
178 while (dir_count--) {
179 /*
180 * Read directory entry.
181 */
182 err = squashfs_read_metadata(dir->i_sb, dire, &block,
183 &offset, sizeof(*dire));
184 if (err < 0)
185 goto read_failure;
186
187 size = le16_to_cpu(dire->size) + 1;
188
189 err = squashfs_read_metadata(dir->i_sb, dire->name,
190 &block, &offset, size);
191 if (err < 0)
192 goto read_failure;
193
194 length += sizeof(*dire) + size;
195
196 if (name[0] < dire->name[0])
197 goto exit_lookup;
198
199 if (len == size && !strncmp(name, dire->name, len)) {
200 unsigned int blk, off, ino_num;
201 long long ino;
202 blk = le32_to_cpu(dirh.start_block);
203 off = le16_to_cpu(dire->offset);
204 ino_num = le32_to_cpu(dirh.inode_number) +
205 (short) le16_to_cpu(dire->inode_number);
206 ino = SQUASHFS_MKINODE(blk, off);
207
208 TRACE("calling squashfs_iget for directory "
209 "entry %s, inode %x:%x, %d\n", name,
210 blk, off, ino_num);
211
212 inode = squashfs_iget(dir->i_sb, ino, ino_num);
213 if (IS_ERR(inode)) {
214 err = PTR_ERR(inode);
215 goto failed;
216 }
217
218 goto exit_lookup;
219 }
220 }
221 }
222
223exit_lookup:
224 kfree(dire);
225 if (inode)
226 return d_splice_alias(inode, dentry);
227 d_add(dentry, inode);
228 return ERR_PTR(0);
229
230read_failure:
231 ERROR("Unable to read directory block [%llx:%x]\n",
232 squashfs_i(dir)->start + msblk->directory_table,
233 squashfs_i(dir)->offset);
234failed:
235 kfree(dire);
236 return ERR_PTR(err);
237}
238
239
240const struct inode_operations squashfs_dir_inode_ops = {
241 .lookup = squashfs_lookup
242};
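A concrete illustration of the two-level layout described above: a directory of 100 files whose inodes all land in one metadata block is encoded as one header (carrying the shared start block and a base inode number) followed by 100 entries, each needing only a 16-bit offset into that block and a signed 16-bit delta from the base inode number; the (short) cast on dire->inode_number in the lookup code is what unpacks that delta.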
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
new file mode 100644
index 000000000000..6b2515d027d5
--- /dev/null
+++ b/fs/squashfs/squashfs.h
@@ -0,0 +1,90 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * squashfs.h
22 */
23
24#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args)
25
26#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args)
27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int);
38
39/* cache.c */
40extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
41extern void squashfs_cache_delete(struct squashfs_cache *);
42extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *,
43 struct squashfs_cache *, u64, int);
44extern void squashfs_cache_put(struct squashfs_cache_entry *);
45extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int);
46extern int squashfs_read_metadata(struct super_block *, void *, u64 *,
47 int *, int);
48extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
49 u64, int);
50extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
51 u64, int);
52extern int squashfs_read_table(struct super_block *, void *, u64, int);
53
54/* export.c */
55extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
56 unsigned int);
57
58/* fragment.c */
59extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
60extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
61 u64, unsigned int);
62
63/* id.c */
64extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
65extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
66 unsigned short);
67
68/* inode.c */
69extern struct inode *squashfs_iget(struct super_block *, long long,
70 unsigned int);
71extern int squashfs_read_inode(struct inode *, long long);
72
73/*
74 * Inode and file operations
75 */
76
77/* dir.c */
78extern const struct file_operations squashfs_dir_ops;
79
80/* export.c */
81extern const struct export_operations squashfs_export_ops;
82
83/* file.c */
84extern const struct address_space_operations squashfs_aops;
85
86/* namei.c */
87extern const struct inode_operations squashfs_dir_inode_ops;
88
89/* symlink.c */
90extern const struct address_space_operations squashfs_symlink_aops;
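
squashfs_i() above maps a VFS inode back to its enclosing squashfs_inode_info with the list_entry()/container_of() pattern: subtract the offset of the embedded member from the member's address. A reduced userspace sketch of the same arithmetic (stub types, not the kernel's):

#include <stddef.h>

struct vfs_inode_stub { int i_mode; };

struct info_stub {
	unsigned long long start;
	struct vfs_inode_stub vfs_inode;	/* embedded member */
};

/* Recover the container from a pointer to its embedded member. */
static struct info_stub *info_of(struct vfs_inode_stub *inode)
{
	return (struct info_stub *)
		((char *)inode - offsetof(struct info_stub, vfs_inode));
}
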
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
new file mode 100644
index 000000000000..6840da1bf21e
--- /dev/null
+++ b/fs/squashfs/squashfs_fs.h
@@ -0,0 +1,381 @@
1#ifndef SQUASHFS_FS
2#define SQUASHFS_FS
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs.h
24 */
25
26#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
27#define SQUASHFS_MAJOR 4
28#define SQUASHFS_MINOR 0
29#define SQUASHFS_MAGIC 0x73717368
30#define SQUASHFS_START 0
31
32/* size of metadata (inode and directory) blocks */
33#define SQUASHFS_METADATA_SIZE 8192
34#define SQUASHFS_METADATA_LOG 13
35
36/* default size of data blocks */
37#define SQUASHFS_FILE_SIZE 131072
38#define SQUASHFS_FILE_LOG 17
39
40#define SQUASHFS_FILE_MAX_SIZE 1048576
41#define SQUASHFS_FILE_MAX_LOG 20
42
43/* Max number of uids and gids */
44#define SQUASHFS_IDS 65536
45
46/* Max length of filename (not 255) */
47#define SQUASHFS_NAME_LEN 256
48
49#define SQUASHFS_INVALID_FRAG (0xffffffffU)
50#define SQUASHFS_INVALID_BLK (-1LL)
51
52/* Filesystem flags */
53#define SQUASHFS_NOI 0
54#define SQUASHFS_NOD 1
55#define SQUASHFS_NOF 3
56#define SQUASHFS_NO_FRAG 4
57#define SQUASHFS_ALWAYS_FRAG 5
58#define SQUASHFS_DUPLICATE 6
59#define SQUASHFS_EXPORT 7
60
61#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1)
62
63#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \
64 SQUASHFS_NOI)
65
66#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \
67 SQUASHFS_NOD)
68
69#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
70 SQUASHFS_NOF)
71
72#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
73 SQUASHFS_NO_FRAG)
74
75#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
76 SQUASHFS_ALWAYS_FRAG)
77
78#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \
79 SQUASHFS_DUPLICATE)
80
81#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \
82 SQUASHFS_EXPORT)
83
84/* Max number of types and file types */
85#define SQUASHFS_DIR_TYPE 1
86#define SQUASHFS_REG_TYPE 2
87#define SQUASHFS_SYMLINK_TYPE 3
88#define SQUASHFS_BLKDEV_TYPE 4
89#define SQUASHFS_CHRDEV_TYPE 5
90#define SQUASHFS_FIFO_TYPE 6
91#define SQUASHFS_SOCKET_TYPE 7
92#define SQUASHFS_LDIR_TYPE 8
93#define SQUASHFS_LREG_TYPE 9
94#define SQUASHFS_LSYMLINK_TYPE 10
95#define SQUASHFS_LBLKDEV_TYPE 11
96#define SQUASHFS_LCHRDEV_TYPE 12
97#define SQUASHFS_LFIFO_TYPE 13
98#define SQUASHFS_LSOCKET_TYPE 14
99
100/* Flag whether block is compressed or uncompressed, bit is set if block is
101 * uncompressed */
102#define SQUASHFS_COMPRESSED_BIT (1 << 15)
103
104#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \
105 (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT)
106
107#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT))
108
109#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24)
110
111#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \
112 ~SQUASHFS_COMPRESSED_BIT_BLOCK)
113
114#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK))
115
116/*
117 * Inode number ops. Inodes consist of a compressed block number, and an
118 * uncompressed offset within that block
119 */
120#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16))
121
122#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff))
123
124#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\
125 << 16) + (B)))
126
127/* Translate between VFS mode and squashfs mode */
128#define SQUASHFS_MODE(A) ((A) & 0xfff)
129
130/* fragment and fragment table defines */
131#define SQUASHFS_FRAGMENT_BYTES(A) \
132 ((A) * sizeof(struct squashfs_fragment_entry))
133
134#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \
135 SQUASHFS_METADATA_SIZE)
136
137#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \
138 SQUASHFS_METADATA_SIZE)
139
140#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \
141 SQUASHFS_METADATA_SIZE - 1) / \
142 SQUASHFS_METADATA_SIZE)
143
144#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\
145 sizeof(u64))
146
147/* inode lookup table defines */
148#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64))
149
150#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \
151 SQUASHFS_METADATA_SIZE)
152
153#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \
154 SQUASHFS_METADATA_SIZE)
155
156#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \
157 SQUASHFS_METADATA_SIZE - 1) / \
158 SQUASHFS_METADATA_SIZE)
159
160#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\
161 sizeof(u64))
162
163/* uid/gid lookup table defines */
164#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int))
165
166#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \
167 SQUASHFS_METADATA_SIZE)
168
169#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \
170 SQUASHFS_METADATA_SIZE)
171
172#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \
173 SQUASHFS_METADATA_SIZE - 1) / \
174 SQUASHFS_METADATA_SIZE)
175
176#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
177 sizeof(u64))
178
179/* cached data constants for filesystem */
180#define SQUASHFS_CACHED_BLKS 8
181
182#define SQUASHFS_MAX_FILE_SIZE_LOG 64
183
184#define SQUASHFS_MAX_FILE_SIZE (1LL << \
185 (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
186
187#define SQUASHFS_MARKER_BYTE 0xff
188
189/* meta index cache */
190#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
191#define SQUASHFS_META_ENTRIES 127
192#define SQUASHFS_META_SLOTS 8
193
194struct meta_entry {
195 u64 data_block;
196 unsigned int index_block;
197 unsigned short offset;
198 unsigned short pad;
199};
200
201struct meta_index {
202 unsigned int inode_number;
203 unsigned int offset;
204 unsigned short entries;
205 unsigned short skip;
206 unsigned short locked;
207 unsigned short pad;
208 struct meta_entry meta_entry[SQUASHFS_META_ENTRIES];
209};
210
211
212/*
213 * definitions for structures on disk
214 */
215#define ZLIB_COMPRESSION 1
216
217struct squashfs_super_block {
218 __le32 s_magic;
219 __le32 inodes;
220 __le32 mkfs_time;
221 __le32 block_size;
222 __le32 fragments;
223 __le16 compression;
224 __le16 block_log;
225 __le16 flags;
226 __le16 no_ids;
227 __le16 s_major;
228 __le16 s_minor;
229 __le64 root_inode;
230 __le64 bytes_used;
231 __le64 id_table_start;
232 __le64 xattr_table_start;
233 __le64 inode_table_start;
234 __le64 directory_table_start;
235 __le64 fragment_table_start;
236 __le64 lookup_table_start;
237};
238
239struct squashfs_dir_index {
240 __le32 index;
241 __le32 start_block;
242 __le32 size;
243 unsigned char name[0];
244};
245
246struct squashfs_base_inode {
247 __le16 inode_type;
248 __le16 mode;
249 __le16 uid;
250 __le16 guid;
251 __le32 mtime;
252 __le32 inode_number;
253};
254
255struct squashfs_ipc_inode {
256 __le16 inode_type;
257 __le16 mode;
258 __le16 uid;
259 __le16 guid;
260 __le32 mtime;
261 __le32 inode_number;
262 __le32 nlink;
263};
264
265struct squashfs_dev_inode {
266 __le16 inode_type;
267 __le16 mode;
268 __le16 uid;
269 __le16 guid;
270 __le32 mtime;
271 __le32 inode_number;
272 __le32 nlink;
273 __le32 rdev;
274};
275
276struct squashfs_symlink_inode {
277 __le16 inode_type;
278 __le16 mode;
279 __le16 uid;
280 __le16 guid;
281 __le32 mtime;
282 __le32 inode_number;
283 __le32 nlink;
284 __le32 symlink_size;
285 char symlink[0];
286};
287
288struct squashfs_reg_inode {
289 __le16 inode_type;
290 __le16 mode;
291 __le16 uid;
292 __le16 guid;
293 __le32 mtime;
294 __le32 inode_number;
295 __le32 start_block;
296 __le32 fragment;
297 __le32 offset;
298 __le32 file_size;
299 __le16 block_list[0];
300};
301
302struct squashfs_lreg_inode {
303 __le16 inode_type;
304 __le16 mode;
305 __le16 uid;
306 __le16 guid;
307 __le32 mtime;
308 __le32 inode_number;
309 __le64 start_block;
310 __le64 file_size;
311 __le64 sparse;
312 __le32 nlink;
313 __le32 fragment;
314 __le32 offset;
315 __le32 xattr;
316 __le16 block_list[0];
317};
318
319struct squashfs_dir_inode {
320 __le16 inode_type;
321 __le16 mode;
322 __le16 uid;
323 __le16 guid;
324 __le32 mtime;
325 __le32 inode_number;
326 __le32 start_block;
327 __le32 nlink;
328 __le16 file_size;
329 __le16 offset;
330 __le32 parent_inode;
331};
332
333struct squashfs_ldir_inode {
334 __le16 inode_type;
335 __le16 mode;
336 __le16 uid;
337 __le16 guid;
338 __le32 mtime;
339 __le32 inode_number;
340 __le32 nlink;
341 __le32 file_size;
342 __le32 start_block;
343 __le32 parent_inode;
344 __le16 i_count;
345 __le16 offset;
346 __le32 xattr;
347 struct squashfs_dir_index index[0];
348};
349
350union squashfs_inode {
351 struct squashfs_base_inode base;
352 struct squashfs_dev_inode dev;
353 struct squashfs_symlink_inode symlink;
354 struct squashfs_reg_inode reg;
355 struct squashfs_lreg_inode lreg;
356 struct squashfs_dir_inode dir;
357 struct squashfs_ldir_inode ldir;
358 struct squashfs_ipc_inode ipc;
359};
360
361struct squashfs_dir_entry {
362 __le16 offset;
363 __le16 inode_number;
364 __le16 type;
365 __le16 size;
366 char name[0];
367};
368
369struct squashfs_dir_header {
370 __le32 count;
371 __le32 start_block;
372 __le32 inode_number;
373};
374
375struct squashfs_fragment_entry {
376 __le64 start_block;
377 __le32 size;
378 unsigned int unused;
379};
380
381#endif
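
The inode number macros above pack a metadata block address into the upper bits of a 64-bit value and the byte offset within the uncompressed block into the low 16 bits; 16 bits suffice because a metadata block is at most SQUASHFS_METADATA_SIZE (8K) when uncompressed. A small self-checking sketch of the round trip, assuming nothing beyond the shifts shown in the header:

#include <assert.h>

static long long mk_inode(unsigned int blk, unsigned int off)
{
	return ((long long)blk << 16) + off;	/* SQUASHFS_MKINODE */
}

int main(void)
{
	long long ino = mk_inode(0x123456, 0x0abc);

	assert((unsigned int)(ino >> 16) == 0x123456);	/* ..._INODE_BLK */
	assert((unsigned int)(ino & 0xffff) == 0x0abc);	/* ..._INODE_OFFSET */
	return 0;
}
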
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
new file mode 100644
index 000000000000..fbfca30c0c68
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -0,0 +1,45 @@
1#ifndef SQUASHFS_FS_I
2#define SQUASHFS_FS_I
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs_i.h
24 */
25
26struct squashfs_inode_info {
27 u64 start;
28 int offset;
29 union {
30 struct {
31 u64 fragment_block;
32 int fragment_size;
33 int fragment_offset;
34 u64 block_list_start;
35 };
36 struct {
37 u64 dir_idx_start;
38 int dir_idx_offset;
39 int dir_idx_cnt;
40 int parent;
41 };
42 };
43 struct inode vfs_inode;
44};
45#endif
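
The anonymous union above lets an inode carry either the regular-file fields (fragment_*, block_list_start) or the directory fields (dir_idx_*, parent) in the same storage, since an inode is only ever one type; start and offset sit outside the union because every inode needs its metadata location. A reduced sketch of the layout idea, using C11/GNU C anonymous members as the kernel header does (hypothetical field subset):

struct demo_inode_info {
	unsigned long long start;	/* metadata block holding the inode */
	int offset;			/* byte offset within that block */
	union {
		struct {		/* regular files only */
			unsigned long long fragment_block;
			int fragment_offset;
		};
		struct {		/* directories only */
			unsigned long long dir_idx_start;
			int parent;
		};
	};
};
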
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
new file mode 100644
index 000000000000..c8c65614dd1c
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -0,0 +1,76 @@
1#ifndef SQUASHFS_FS_SB
2#define SQUASHFS_FS_SB
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs_sb.h
24 */
25
26#include "squashfs_fs.h"
27
28struct squashfs_cache {
29 char *name;
30 int entries;
31 int next_blk;
32 int num_waiters;
33 int unused;
34 int block_size;
35 int pages;
36 spinlock_t lock;
37 wait_queue_head_t wait_queue;
38 struct squashfs_cache_entry *entry;
39};
40
41struct squashfs_cache_entry {
42 u64 block;
43 int length;
44 int refcount;
45 u64 next_index;
46 int pending;
47 int error;
48 int num_waiters;
49 wait_queue_head_t wait_queue;
50 struct squashfs_cache *cache;
51 void **data;
52};
53
54struct squashfs_sb_info {
55 int devblksize;
56 int devblksize_log2;
57 struct squashfs_cache *block_cache;
58 struct squashfs_cache *fragment_cache;
59 struct squashfs_cache *read_page;
60 int next_meta_index;
61 __le64 *id_table;
62 __le64 *fragment_index;
63 unsigned int *fragment_index_2;
64 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex;
66 struct meta_index *meta_index;
67 z_stream stream;
68 __le64 *inode_lookup_table;
69 u64 inode_table;
70 u64 directory_table;
71 unsigned int block_size;
72 unsigned short block_log;
73 long long bytes_used;
74 unsigned int inodes;
75};
76#endif
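
struct squashfs_cache_entry above combines a refcount with a wait queue: a reader either finds the block cached and takes a reference, or claims a slot and fills it while concurrent readers sleep until the read completes; dropping the last reference makes the slot reusable. A miniature of that get/put discipline, with pthreads standing in for the kernel primitives (hit path only; slot claiming and disk I/O elided):

#include <stddef.h>
#include <pthread.h>

#define SLOTS 8

struct cache_entry {
	unsigned long long block;
	int refcount;
	int pending;		/* block read still in flight */
};

struct cache {
	pthread_mutex_t lock;
	pthread_cond_t ready;
	struct cache_entry entry[SLOTS];
};

static struct cache_entry *cache_get(struct cache *c, unsigned long long blk)
{
	int i;

	pthread_mutex_lock(&c->lock);
	for (i = 0; i < SLOTS; i++) {
		if (c->entry[i].block == blk) {
			c->entry[i].refcount++;
			while (c->entry[i].pending)	/* filler not done */
				pthread_cond_wait(&c->ready, &c->lock);
			pthread_mutex_unlock(&c->lock);
			return &c->entry[i];
		}
	}
	pthread_mutex_unlock(&c->lock);
	return NULL;		/* miss: caller would claim and fill a slot */
}

static void cache_put(struct cache *c, struct cache_entry *e)
{
	pthread_mutex_lock(&c->lock);
	e->refcount--;		/* slot becomes reclaimable at zero */
	pthread_mutex_unlock(&c->lock);
}
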
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
new file mode 100644
index 000000000000..a0466d7467b2
--- /dev/null
+++ b/fs/squashfs/super.c
@@ -0,0 +1,440 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * super.c
22 */
23
24/*
25 * This file implements code to read the superblock, read and initialise
26 * in-memory structures at mount time, and all the VFS glue code to register
27 * the filesystem.
28 */
29
30#include <linux/fs.h>
31#include <linux/vfs.h>
32#include <linux/slab.h>
33#include <linux/mutex.h>
34#include <linux/pagemap.h>
35#include <linux/init.h>
36#include <linux/module.h>
37#include <linux/zlib.h>
38
39#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h"
42#include "squashfs.h"
43
44static struct file_system_type squashfs_fs_type;
45static struct super_operations squashfs_super_ops;
46
47static int supported_squashfs_filesystem(short major, short minor, short comp)
48{
49 if (major < SQUASHFS_MAJOR) {
50 ERROR("Major/Minor mismatch, older Squashfs %d.%d "
51 "filesystems are unsupported\n", major, minor);
52 return -EINVAL;
53 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
54 ERROR("Major/Minor mismatch, trying to mount newer "
55 "%d.%d filesystem\n", major, minor);
56 ERROR("Please update your kernel\n");
57 return -EINVAL;
58 }
59
60 if (comp != ZLIB_COMPRESSION)
61 return -EINVAL;
62
63 return 0;
64}
65
66
67static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
68{
69 struct squashfs_sb_info *msblk;
70 struct squashfs_super_block *sblk = NULL;
71 char b[BDEVNAME_SIZE];
72 struct inode *root;
73 long long root_inode;
74 unsigned short flags;
75 unsigned int fragments;
76 u64 lookup_table_start;
77 int err;
78
79	TRACE("Entered squashfs_fill_super\n");
80
81 sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
82 if (sb->s_fs_info == NULL) {
83 ERROR("Failed to allocate squashfs_sb_info\n");
84 return -ENOMEM;
85 }
86 msblk = sb->s_fs_info;
87
88 msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
89 GFP_KERNEL);
90 if (msblk->stream.workspace == NULL) {
91 ERROR("Failed to allocate zlib workspace\n");
92 goto failure;
93 }
94
95 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
96 if (sblk == NULL) {
97 ERROR("Failed to allocate squashfs_super_block\n");
98 goto failure;
99 }
100
101 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
102 msblk->devblksize_log2 = ffz(~msblk->devblksize);
103
104 mutex_init(&msblk->read_data_mutex);
105 mutex_init(&msblk->meta_index_mutex);
106
107 /*
108 * msblk->bytes_used is checked in squashfs_read_table to ensure reads
109 * are not beyond filesystem end. But as we're using
110 * squashfs_read_table here to read the superblock (including the value
111 * of bytes_used) we need to set it to an initial sensible dummy value
112 */
113 msblk->bytes_used = sizeof(*sblk);
114 err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
115
116 if (err < 0) {
117 ERROR("unable to read squashfs_super_block\n");
118 goto failed_mount;
119 }
120
121 /* Check it is a SQUASHFS superblock */
122 sb->s_magic = le32_to_cpu(sblk->s_magic);
123 if (sb->s_magic != SQUASHFS_MAGIC) {
124 if (!silent)
125 ERROR("Can't find a SQUASHFS superblock on %s\n",
126 bdevname(sb->s_bdev, b));
127 err = -EINVAL;
128 goto failed_mount;
129 }
130
131 /* Check the MAJOR & MINOR versions and compression type */
132 err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
133 le16_to_cpu(sblk->s_minor),
134 le16_to_cpu(sblk->compression));
135 if (err < 0)
136 goto failed_mount;
137
138 err = -EINVAL;
139
140 /*
141	 * Check if there are xattrs in the filesystem.  These are not
142 * supported in this version, so warn that they will be ignored.
143 */
144 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
145 ERROR("Xattrs in filesystem, these will be ignored\n");
146
147 /* Check the filesystem does not extend beyond the end of the
148 block device */
149 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
150 if (msblk->bytes_used < 0 || msblk->bytes_used >
151 i_size_read(sb->s_bdev->bd_inode))
152 goto failed_mount;
153
154 /* Check block size for sanity */
155 msblk->block_size = le32_to_cpu(sblk->block_size);
156 if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
157 goto failed_mount;
158
159 msblk->block_log = le16_to_cpu(sblk->block_log);
160 if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
161 goto failed_mount;
162
163 /* Check the root inode for sanity */
164 root_inode = le64_to_cpu(sblk->root_inode);
165 if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
166 goto failed_mount;
167
168 msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
169 msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
170 msblk->inodes = le32_to_cpu(sblk->inodes);
171 flags = le16_to_cpu(sblk->flags);
172
173 TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
174 TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
175 ? "un" : "");
176 TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
177 ? "un" : "");
178 TRACE("Filesystem size %lld bytes\n", msblk->bytes_used);
179 TRACE("Block size %d\n", msblk->block_size);
180 TRACE("Number of inodes %d\n", msblk->inodes);
181 TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments));
182 TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
183 TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
184 TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
185 TRACE("sblk->fragment_table_start %llx\n",
186 (u64) le64_to_cpu(sblk->fragment_table_start));
187 TRACE("sblk->id_table_start %llx\n",
188 (u64) le64_to_cpu(sblk->id_table_start));
189
190 sb->s_maxbytes = MAX_LFS_FILESIZE;
191 sb->s_flags |= MS_RDONLY;
192 sb->s_op = &squashfs_super_ops;
193
194 err = -ENOMEM;
195
196 msblk->block_cache = squashfs_cache_init("metadata",
197 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
198 if (msblk->block_cache == NULL)
199 goto failed_mount;
200
201 /* Allocate read_page block */
202 msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
203 if (msblk->read_page == NULL) {
204 ERROR("Failed to allocate read_page block\n");
205 goto failed_mount;
206 }
207
208 /* Allocate and read id index table */
209 msblk->id_table = squashfs_read_id_index_table(sb,
210 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
211 if (IS_ERR(msblk->id_table)) {
212 err = PTR_ERR(msblk->id_table);
213 msblk->id_table = NULL;
214 goto failed_mount;
215 }
216
217 fragments = le32_to_cpu(sblk->fragments);
218 if (fragments == 0)
219 goto allocate_lookup_table;
220
221 msblk->fragment_cache = squashfs_cache_init("fragment",
222 SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
223 if (msblk->fragment_cache == NULL) {
224 err = -ENOMEM;
225 goto failed_mount;
226 }
227
228 /* Allocate and read fragment index table */
229 msblk->fragment_index = squashfs_read_fragment_index_table(sb,
230 le64_to_cpu(sblk->fragment_table_start), fragments);
231 if (IS_ERR(msblk->fragment_index)) {
232 err = PTR_ERR(msblk->fragment_index);
233 msblk->fragment_index = NULL;
234 goto failed_mount;
235 }
236
237allocate_lookup_table:
238 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
239 if (lookup_table_start == SQUASHFS_INVALID_BLK)
240 goto allocate_root;
241
242 /* Allocate and read inode lookup table */
243 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
244 lookup_table_start, msblk->inodes);
245 if (IS_ERR(msblk->inode_lookup_table)) {
246 err = PTR_ERR(msblk->inode_lookup_table);
247 msblk->inode_lookup_table = NULL;
248 goto failed_mount;
249 }
250
251 sb->s_export_op = &squashfs_export_ops;
252
253allocate_root:
254 root = new_inode(sb);
255 if (!root) {
256 err = -ENOMEM;
257 goto failed_mount;
258 }
259
260 err = squashfs_read_inode(root, root_inode);
261 if (err) {
262 iget_failed(root);
263 goto failed_mount;
264 }
265 insert_inode_hash(root);
266
267 sb->s_root = d_alloc_root(root);
268 if (sb->s_root == NULL) {
269 ERROR("Root inode create failed\n");
270 err = -ENOMEM;
271 iput(root);
272 goto failed_mount;
273 }
274
275 TRACE("Leaving squashfs_fill_super\n");
276 kfree(sblk);
277 return 0;
278
279failed_mount:
280 squashfs_cache_delete(msblk->block_cache);
281 squashfs_cache_delete(msblk->fragment_cache);
282 squashfs_cache_delete(msblk->read_page);
283 kfree(msblk->inode_lookup_table);
284 kfree(msblk->fragment_index);
285 kfree(msblk->id_table);
286 kfree(msblk->stream.workspace);
287 kfree(sb->s_fs_info);
288 sb->s_fs_info = NULL;
289 kfree(sblk);
290 return err;
291
292failure:
293 kfree(msblk->stream.workspace);
294 kfree(sb->s_fs_info);
295 sb->s_fs_info = NULL;
296 return -ENOMEM;
297}
298
299
300static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
301{
302 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
303
304 TRACE("Entered squashfs_statfs\n");
305
306 buf->f_type = SQUASHFS_MAGIC;
307 buf->f_bsize = msblk->block_size;
308 buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1;
309 buf->f_bfree = buf->f_bavail = 0;
310 buf->f_files = msblk->inodes;
311 buf->f_ffree = 0;
312 buf->f_namelen = SQUASHFS_NAME_LEN;
313
314 return 0;
315}
316
317
318static int squashfs_remount(struct super_block *sb, int *flags, char *data)
319{
320 *flags |= MS_RDONLY;
321 return 0;
322}
323
324
325static void squashfs_put_super(struct super_block *sb)
326{
327 if (sb->s_fs_info) {
328 struct squashfs_sb_info *sbi = sb->s_fs_info;
329 squashfs_cache_delete(sbi->block_cache);
330 squashfs_cache_delete(sbi->fragment_cache);
331 squashfs_cache_delete(sbi->read_page);
332 kfree(sbi->id_table);
333 kfree(sbi->fragment_index);
334 kfree(sbi->meta_index);
335 kfree(sbi->stream.workspace);
336 kfree(sb->s_fs_info);
337 sb->s_fs_info = NULL;
338 }
339}
340
341
342static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
343 const char *dev_name, void *data,
344 struct vfsmount *mnt)
345{
346 return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
347 mnt);
348}
349
350
351static struct kmem_cache *squashfs_inode_cachep;
352
353
354static void init_once(void *foo)
355{
356 struct squashfs_inode_info *ei = foo;
357
358 inode_init_once(&ei->vfs_inode);
359}
360
361
362static int __init init_inodecache(void)
363{
364 squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
365 sizeof(struct squashfs_inode_info), 0,
366 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
367
368 return squashfs_inode_cachep ? 0 : -ENOMEM;
369}
370
371
372static void destroy_inodecache(void)
373{
374 kmem_cache_destroy(squashfs_inode_cachep);
375}
376
377
378static int __init init_squashfs_fs(void)
379{
380 int err = init_inodecache();
381
382 if (err)
383 return err;
384
385 err = register_filesystem(&squashfs_fs_type);
386 if (err) {
387 destroy_inodecache();
388 return err;
389 }
390
391 printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
392 "Phillip Lougher\n");
393
394 return 0;
395}
396
397
398static void __exit exit_squashfs_fs(void)
399{
400 unregister_filesystem(&squashfs_fs_type);
401 destroy_inodecache();
402}
403
404
405static struct inode *squashfs_alloc_inode(struct super_block *sb)
406{
407 struct squashfs_inode_info *ei =
408 kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
409
410 return ei ? &ei->vfs_inode : NULL;
411}
412
413
414static void squashfs_destroy_inode(struct inode *inode)
415{
416 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
417}
418
419
420static struct file_system_type squashfs_fs_type = {
421 .owner = THIS_MODULE,
422 .name = "squashfs",
423 .get_sb = squashfs_get_sb,
424 .kill_sb = kill_block_super,
425 .fs_flags = FS_REQUIRES_DEV
426};
427
428static struct super_operations squashfs_super_ops = {
429 .alloc_inode = squashfs_alloc_inode,
430 .destroy_inode = squashfs_destroy_inode,
431 .statfs = squashfs_statfs,
432 .put_super = squashfs_put_super,
433 .remount_fs = squashfs_remount
434};
435
436module_init(init_squashfs_fs);
437module_exit(exit_squashfs_fs);
438MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
439MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
440MODULE_LICENSE("GPL");
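
One detail of the mount path above: devblksize_log2 is computed as ffz(~devblksize), the index of the first zero bit of the complement, which equals log2 of a power-of-two block size. A self-checking sketch of why that works:

#include <assert.h>

/* Index of the first zero bit, counting from bit 0 (the ffz() contract). */
static int first_zero_bit(unsigned int v)
{
	int n = 0;

	while (v & 1u) {
		v >>= 1;
		n++;
	}
	return n;
}

int main(void)
{
	/*
	 * For a power-of-two x, ~x has ones in every position below the
	 * set bit, so its first zero bit lands exactly at log2(x).
	 */
	assert(first_zero_bit(~4096u) == 12);
	assert(first_zero_bit(~512u) == 9);
	return 0;
}
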
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
new file mode 100644
index 000000000000..83d87880aac8
--- /dev/null
+++ b/fs/squashfs/symlink.c
@@ -0,0 +1,118 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * symlink.c
22 */
23
24/*
25 * This file implements code to handle symbolic links.
26 *
27 * The data contents of symbolic links are stored inside the symbolic
28 * link inode within the inode table. This allows the normally small symbolic
29 * link to be compressed as part of the inode table, achieving much greater
30 * compression than if the symbolic link was compressed individually.
31 */
32
33#include <linux/fs.h>
34#include <linux/vfs.h>
35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/string.h>
38#include <linux/pagemap.h>
39#include <linux/zlib.h>
40
41#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h"
44#include "squashfs.h"
45
46static int squashfs_symlink_readpage(struct file *file, struct page *page)
47{
48 struct inode *inode = page->mapping->host;
49 struct super_block *sb = inode->i_sb;
50 struct squashfs_sb_info *msblk = sb->s_fs_info;
51 int index = page->index << PAGE_CACHE_SHIFT;
52 u64 block = squashfs_i(inode)->start;
53 int offset = squashfs_i(inode)->offset;
54 int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
55 int bytes, copied;
56 void *pageaddr;
57 struct squashfs_cache_entry *entry;
58
59 TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
60 "%llx, offset %x\n", page->index, block, offset);
61
62 /*
63 * Skip index bytes into symlink metadata.
64 */
65 if (index) {
66 bytes = squashfs_read_metadata(sb, NULL, &block, &offset,
67 index);
68 if (bytes < 0) {
69 ERROR("Unable to read symlink [%llx:%x]\n",
70 squashfs_i(inode)->start,
71 squashfs_i(inode)->offset);
72 goto error_out;
73 }
74 }
75
76 /*
77 * Read length bytes from symlink metadata. Squashfs_read_metadata
78 * is not used here because it can sleep and we want to use
79 * kmap_atomic to map the page. Instead call the underlying
80 * squashfs_cache_get routine. As length bytes may overlap metadata
81 * blocks, we may need to call squashfs_cache_get multiple times.
82 */
83 for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
84 entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
85 if (entry->error) {
86 ERROR("Unable to read symlink [%llx:%x]\n",
87 squashfs_i(inode)->start,
88 squashfs_i(inode)->offset);
89 squashfs_cache_put(entry);
90 goto error_out;
91 }
92
93 pageaddr = kmap_atomic(page, KM_USER0);
94 copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
95 length - bytes);
96 if (copied == length - bytes)
97 memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
98 else
99 block = entry->next_index;
100 kunmap_atomic(pageaddr, KM_USER0);
101 squashfs_cache_put(entry);
102 }
103
104 flush_dcache_page(page);
105 SetPageUptodate(page);
106 unlock_page(page);
107 return 0;
108
109error_out:
110 SetPageError(page);
111 unlock_page(page);
112 return 0;
113}
114
115
116const struct address_space_operations squashfs_symlink_aops = {
117 .readpage = squashfs_symlink_readpage
118};
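
The copy loop in squashfs_symlink_readpage() above may cross metadata block boundaries: each squashfs_cache_get() yields one block, squashfs_copy_data() copies what that block holds, and the intra-block offset drops to zero for every block after the first. A reduced sketch of that spanning copy, with plain buffers in place of cache entries:

#include <string.h>

/*
 * Copy `length` bytes that may straddle fixed-size source blocks,
 * starting `offset` bytes into the first block.
 */
static void copy_spanning(char *dst, const char *const *blocks, int blksz,
			  int offset, int length)
{
	int bytes = 0, blk = 0;

	while (bytes < length) {
		int avail = blksz - offset;
		int n = length - bytes < avail ? length - bytes : avail;

		memcpy(dst + bytes, blocks[blk] + offset, n);
		bytes += n;
		blk++;
		offset = 0;	/* later blocks are consumed from the start */
	}
}
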
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b76..7e12a6f82795 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
305 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
306 306
307 error = -EINVAL; 307 error = -EINVAL;
308 if (inode->i_op && inode->i_op->readlink) { 308 if (inode->i_op->readlink) {
309 error = security_inode_readlink(path.dentry); 309 error = security_inode_readlink(path.dentry);
310 if (!error) { 310 if (!error) {
311 touch_atime(path.mnt, path.dentry); 311 touch_atime(path.mnt, path.dentry);
diff --git a/fs/super.c b/fs/super.c
index ddba069d7a99..ed080c417167 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,7 @@
38#include <linux/kobject.h> 38#include <linux/kobject.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <linux/file.h> 40#include <linux/file.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include "internal.h" 43#include "internal.h"
43 44
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71 INIT_HLIST_HEAD(&s->s_anon); 72 INIT_HLIST_HEAD(&s->s_anon);
72 INIT_LIST_HEAD(&s->s_inodes); 73 INIT_LIST_HEAD(&s->s_inodes);
73 INIT_LIST_HEAD(&s->s_dentry_lru); 74 INIT_LIST_HEAD(&s->s_dentry_lru);
75 INIT_LIST_HEAD(&s->s_async_list);
74 init_rwsem(&s->s_umount); 76 init_rwsem(&s->s_umount);
75 mutex_init(&s->s_lock); 77 mutex_init(&s->s_lock);
76 lockdep_set_class(&s->s_umount, &type->s_umount_key); 78 lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb)
289{ 291{
290 const struct super_operations *sop = sb->s_op; 292 const struct super_operations *sop = sb->s_op;
291 293
294
292 if (sb->s_root) { 295 if (sb->s_root) {
293 shrink_dcache_for_umount(sb); 296 shrink_dcache_for_umount(sb);
294 fsync_super(sb); 297 fsync_super(sb);
295 lock_super(sb); 298 lock_super(sb);
296 sb->s_flags &= ~MS_ACTIVE; 299 sb->s_flags &= ~MS_ACTIVE;
300
301 /*
302 * wait for asynchronous fs operations to finish before going further
303 */
304 async_synchronize_full_special(&sb->s_async_list);
305
297 /* bad name - it should be evict_inodes() */ 306 /* bad name - it should be evict_inodes() */
298 invalidate_inodes(sb); 307 invalidate_inodes(sb);
299 lock_kernel(); 308 lock_kernel();
@@ -461,6 +470,7 @@ restart:
461 sb->s_count++; 470 sb->s_count++;
462 spin_unlock(&sb_lock); 471 spin_unlock(&sb_lock);
463 down_read(&sb->s_umount); 472 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list);
464 if (sb->s_root && (wait || sb->s_dirt)) 474 if (sb->s_root && (wait || sb->s_dirt))
465 sb->s_op->sync_fs(sb, wait); 475 sb->s_op->sync_fs(sb, wait);
466 up_read(&sb->s_umount); 476 up_read(&sb->s_umount);
@@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
800 } 810 }
801 811
802 s->s_flags |= MS_ACTIVE; 812 s->s_flags |= MS_ACTIVE;
813 bdev->bd_super = s;
803 } 814 }
804 815
805 return simple_set_mnt(mnt, s); 816 return simple_set_mnt(mnt, s);
@@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb)
819 struct block_device *bdev = sb->s_bdev; 830 struct block_device *bdev = sb->s_bdev;
820 fmode_t mode = sb->s_mode; 831 fmode_t mode = sb->s_mode;
821 832
833 bdev->bd_super = 0;
822 generic_shutdown_super(sb); 834 generic_shutdown_super(sb);
823 sync_blockdev(bdev); 835 sync_blockdev(bdev);
824 close_bdev_exclusive(bdev, mode); 836 close_bdev_exclusive(bdev, mode);
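
The fs/super.c hunks above wire the superblock into the new async infrastructure: both umount and sync must drain s_async_list before touching inodes, and get_sb_bdev()/kill_block_super() now maintain a bdev->bd_super back-pointer. A miniature of the "wait until outstanding work drains" idea, with pthreads and a completion counter standing in for the kernel's async machinery:

#include <pthread.h>

struct async_domain {
	pthread_mutex_t lock;
	pthread_cond_t idle;
	int pending;		/* queued but unfinished work items */
};

static void async_done(struct async_domain *d)
{
	pthread_mutex_lock(&d->lock);
	if (--d->pending == 0)
		pthread_cond_broadcast(&d->idle);
	pthread_mutex_unlock(&d->lock);
}

/*
 * Analogue of async_synchronize_full_special(): block the caller until
 * every item outstanding against this domain has completed.
 */
static void async_synchronize(struct async_domain *d)
{
	pthread_mutex_lock(&d->lock);
	while (d->pending > 0)
		pthread_cond_wait(&d->idle, &d->lock);
	pthread_mutex_unlock(&d->lock);
}
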
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416f..ac02b56548bc 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
75 return ret; 75 return ret;
76} 76}
77 77
78long do_fsync(struct file *file, int datasync) 78/**
79 * vfs_fsync - perform a fsync or fdatasync on a file
80 * @file: file to sync
81 * @dentry: dentry of @file
 82 * @datasync: only perform a fdatasync operation
83 *
84 * Write back data and metadata for @file to disk. If @datasync is
85 * set only metadata needed to access modified file data is written.
86 *
87 * In case this function is called from nfsd @file may be %NULL and
88 * only @dentry is set. This can only happen when the filesystem
89 * implements the export_operations API.
90 */
91int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
79{ 92{
80 int ret; 93 const struct file_operations *fop;
81 int err; 94 struct address_space *mapping;
82 struct address_space *mapping = file->f_mapping; 95 int err, ret;
96
97 /*
98 * Get mapping and operations from the file in case we have
99 * as file, or get the default values for them in case we
100 * don't have a struct file available. Damn nfsd..
101 */
102 if (file) {
103 mapping = file->f_mapping;
104 fop = file->f_op;
105 } else {
106 mapping = dentry->d_inode->i_mapping;
107 fop = dentry->d_inode->i_fop;
108 }
83 109
84 if (!file->f_op || !file->f_op->fsync) { 110 if (!fop || !fop->fsync) {
85 /* Why? We can still call filemap_fdatawrite */
86 ret = -EINVAL; 111 ret = -EINVAL;
87 goto out; 112 goto out;
88 } 113 }
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
94 * livelocks in fsync_buffers_list(). 119 * livelocks in fsync_buffers_list().
95 */ 120 */
96 mutex_lock(&mapping->host->i_mutex); 121 mutex_lock(&mapping->host->i_mutex);
97 err = file->f_op->fsync(file, file->f_path.dentry, datasync); 122 err = fop->fsync(file, dentry, datasync);
98 if (!ret) 123 if (!ret)
99 ret = err; 124 ret = err;
100 mutex_unlock(&mapping->host->i_mutex); 125 mutex_unlock(&mapping->host->i_mutex);
@@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync)
104out: 129out:
105 return ret; 130 return ret;
106} 131}
132EXPORT_SYMBOL(vfs_fsync);
107 133
108static long __do_fsync(unsigned int fd, int datasync) 134static int do_fsync(unsigned int fd, int datasync)
109{ 135{
110 struct file *file; 136 struct file *file;
111 int ret = -EBADF; 137 int ret = -EBADF;
112 138
113 file = fget(fd); 139 file = fget(fd);
114 if (file) { 140 if (file) {
115 ret = do_fsync(file, datasync); 141 ret = vfs_fsync(file, file->f_path.dentry, datasync);
116 fput(file); 142 fput(file);
117 } 143 }
118 return ret; 144 return ret;
@@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync)
120 146
121asmlinkage long sys_fsync(unsigned int fd) 147asmlinkage long sys_fsync(unsigned int fd)
122{ 148{
123 return __do_fsync(fd, 0); 149 return do_fsync(fd, 0);
124} 150}
125 151
126asmlinkage long sys_fdatasync(unsigned int fd) 152asmlinkage long sys_fdatasync(unsigned int fd)
127{ 153{
128 return __do_fsync(fd, 1); 154 return do_fsync(fd, 1);
129} 155}
130 156
131/* 157/*
@@ -269,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
269 295
270 if (flags & SYNC_FILE_RANGE_WRITE) { 296 if (flags & SYNC_FILE_RANGE_WRITE) {
271 ret = __filemap_fdatawrite_range(mapping, offset, endbyte, 297 ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
272 WB_SYNC_NONE); 298 WB_SYNC_ALL);
273 if (ret < 0) 299 if (ret < 0)
274 goto out; 300 goto out;
275 } 301 }
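
vfs_fsync() above exists so nfsd can sync a file it never opened: when the struct file is NULL, the mapping and fsync method come from the dentry's inode instead. A reduced sketch of that fallback selection (stub types and a toy error code, not the kernel's):

#include <stddef.h>

struct mapping_stub { int dirty; };

struct inode_stub {
	struct mapping_stub *i_mapping;
	int (*fsync)(void);
};

struct file_stub {
	struct mapping_stub *f_mapping;
	int (*fsync)(void);
};

/* Prefer the struct file when present; fall back to the inode (nfsd). */
static int sketch_vfs_fsync(struct file_stub *file, struct inode_stub *inode)
{
	struct mapping_stub *mapping = file ? file->f_mapping
					    : inode->i_mapping;
	int (*fsync)(void) = file ? file->fsync : inode->fsync;

	if (!fsync)
		return -1;	/* the kernel returns -EINVAL here */
	(void)mapping;		/* writeback bookkeeping elided */
	return fsync();
}
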
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f856..dfa3d94cfc74 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
107static inline void set_default_inode_attr(struct inode * inode, mode_t mode) 107static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
108{ 108{
109 inode->i_mode = mode; 109 inode->i_mode = mode;
110 inode->i_uid = 0;
111 inode->i_gid = 0;
112 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 110 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
113} 111}
114 112
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
149{ 147{
150 struct bin_attribute *bin_attr; 148 struct bin_attribute *bin_attr;
151 149
152 inode->i_blocks = 0;
153 inode->i_mapping->a_ops = &sysfs_aops; 150 inode->i_mapping->a_ops = &sysfs_aops;
154 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 151 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
155 inode->i_op = &sysfs_inode_operations; 152 inode->i_op = &sysfs_inode_operations;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index df0d435baa48..3d81bf58dae2 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -27,6 +27,7 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/vfs.h> 29#include <linux/vfs.h>
30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
32 33
@@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
163 if (inode->i_blocks) { 164 if (inode->i_blocks) {
164 inode->i_op = &sysv_symlink_inode_operations; 165 inode->i_op = &sysv_symlink_inode_operations;
165 inode->i_mapping->a_ops = &sysv_aops; 166 inode->i_mapping->a_ops = &sysv_aops;
166 } else 167 } else {
167 inode->i_op = &sysv_fast_symlink_inode_operations; 168 inode->i_op = &sysv_fast_symlink_inode_operations;
169 nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
170 sizeof(SYSV_I(inode)->i_data) - 1);
171 }
168 } else 172 } else
169 init_special_inode(inode, inode->i_mode, rdev); 173 init_special_inode(inode, inode->i_mode, rdev);
170} 174}
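
The sysv change above NUL-terminates fast symlinks whose body lives directly in the inode's i_data, since the on-disk bytes carry no terminator. nd_terminate_link() amounts to writing '\0' at min(len, maxlen); a one-function sketch of that bound:

/* Terminate an in-inode link body at min(len, maxlen). */
static void terminate_link(char *name, unsigned int len, unsigned int maxlen)
{
	name[len < maxlen ? len : maxlen] = '\0';
}
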
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 91ceeda7e5bf..e35b54d5059d 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB
40 depends on UBIFS_FS 40 depends on UBIFS_FS
41 default y 41 default y
42 help 42 help
43 Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. 43 Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
44 44
45# Debugging-related stuff 45# Debugging-related stuff
46config UBIFS_FS_DEBUG 46config UBIFS_FS_DEBUG
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 4a18f084cc42..175f9c590b77 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -32,18 +32,15 @@
32 32
33#include "ubifs.h" 33#include "ubifs.h"
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <asm/div64.h> 35#include <linux/math64.h>
36 36
37/* 37/*
38 * When pessimistic budget calculations say that there is no enough space, 38 * When pessimistic budget calculations say that there is no enough space,
39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection, 39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
40 * or committing. The below constants define maximum number of times UBIFS 40 * or committing. The below constant defines maximum number of times UBIFS
41 * repeats the operations. 41 * repeats the operations.
42 */ 42 */
43#define MAX_SHRINK_RETRIES 8 43#define MAX_MKSPC_RETRIES 3
44#define MAX_GC_RETRIES 4
45#define MAX_CMT_RETRIES 2
46#define MAX_NOSPC_RETRIES 1
47 44
48/* 45/*
49 * The below constant defines amount of dirty pages which should be written 46 * The below constant defines amount of dirty pages which should be written
@@ -52,30 +49,6 @@
52#define NR_TO_WRITE 16 49#define NR_TO_WRITE 16
53 50
54/** 51/**
55 * struct retries_info - information about re-tries while making free space.
56 * @prev_liability: previous liability
57 * @shrink_cnt: how many times the liability was shrinked
58 * @shrink_retries: count of liability shrink re-tries (increased when
59 * liability does not shrink)
60 * @try_gc: GC should be tried first
61 * @gc_retries: how many times GC was run
62 * @cmt_retries: how many times commit has been done
63 * @nospc_retries: how many times GC returned %-ENOSPC
64 *
65 * Since we consider budgeting to be the fast-path, and this structure has to
66 * be allocated on stack and zeroed out, we make it smaller using bit-fields.
67 */
68struct retries_info {
69 long long prev_liability;
70 unsigned int shrink_cnt;
71 unsigned int shrink_retries:5;
72 unsigned int try_gc:1;
73 unsigned int gc_retries:4;
74 unsigned int cmt_retries:3;
75 unsigned int nospc_retries:1;
76};
77
78/**
79 * shrink_liability - write-back some dirty pages/inodes. 52 * shrink_liability - write-back some dirty pages/inodes.
80 * @c: UBIFS file-system description object 53 * @c: UBIFS file-system description object
81 * @nr_to_write: how many dirty pages to write-back 54 * @nr_to_write: how many dirty pages to write-back
@@ -147,13 +120,29 @@ static int run_gc(struct ubifs_info *c)
147} 120}
148 121
149/** 122/**
123 * get_liability - calculate current liability.
124 * @c: UBIFS file-system description object
125 *
126 * This function calculates and returns current UBIFS liability, i.e. the
127 * amount of bytes UBIFS has "promised" to write to the media.
128 */
129static long long get_liability(struct ubifs_info *c)
130{
131 long long liab;
132
133 spin_lock(&c->space_lock);
134 liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
135 spin_unlock(&c->space_lock);
136 return liab;
137}
138
139/**
150 * make_free_space - make more free space on the file-system. 140 * make_free_space - make more free space on the file-system.
151 * @c: UBIFS file-system description object 141 * @c: UBIFS file-system description object
152 * @ri: information about previous invocations of this function
153 * 142 *
154 * This function is called when an operation cannot be budgeted because there 143 * This function is called when an operation cannot be budgeted because there
155 * is supposedly no free space. But in most cases there is some free space: 144 * is supposedly no free space. But in most cases there is some free space:
156 * o budgeting is pessimistic, so it always budgets more then it is actually 145 * o budgeting is pessimistic, so it always budgets more than it is actually
157 * needed, so shrinking the liability is one way to make free space - the 146 * needed, so shrinking the liability is one way to make free space - the
158 * cached data will take less space than it was budgeted for; 147 * cached data will take less space than it was budgeted for;
159 * o GC may turn some dark space into free space (budgeting treats dark space 148 * o GC may turn some dark space into free space (budgeting treats dark space
@@ -165,87 +154,42 @@ static int run_gc(struct ubifs_info *c)
165 * Returns %-ENOSPC if it couldn't do more free space, and other negative error 154 * Returns %-ENOSPC if it couldn't do more free space, and other negative error
166 * codes on failures. 155 * codes on failures.
167 */ 156 */
168static int make_free_space(struct ubifs_info *c, struct retries_info *ri) 157static int make_free_space(struct ubifs_info *c)
169{ 158{
170 int err; 159 int err, retries = 0;
171 160 long long liab1, liab2;
172 /*
173 * If we have some dirty pages and inodes (liability), try to write
174 * them back unless this was tried too many times without effect
175 * already.
176 */
177 if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
178 long long liability;
179
180 spin_lock(&c->space_lock);
181 liability = c->budg_idx_growth + c->budg_data_growth +
182 c->budg_dd_growth;
183 spin_unlock(&c->space_lock);
184 161
185 if (ri->prev_liability >= liability) { 162 do {
186 /* Liability does not shrink, next time try GC then */ 163 liab1 = get_liability(c);
187 ri->shrink_retries += 1; 164 /*
188 if (ri->gc_retries < MAX_GC_RETRIES) 165 * We probably have some dirty pages or inodes (liability), try
189 ri->try_gc = 1; 166 * to write them back.
190 dbg_budg("liability did not shrink: retries %d of %d", 167 */
191 ri->shrink_retries, MAX_SHRINK_RETRIES); 168 dbg_budg("liability %lld, run write-back", liab1);
192 } 169 shrink_liability(c, NR_TO_WRITE);
193 170
194 dbg_budg("force write-back (count %d)", ri->shrink_cnt); 171 liab2 = get_liability(c);
195 shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt); 172 if (liab2 < liab1)
173 return -EAGAIN;
196 174
197 ri->prev_liability = liability; 175 dbg_budg("new liability %lld (not shrinked)", liab2);
198 ri->shrink_cnt += 1;
199 return -EAGAIN;
200 }
201 176
202 /* 177 /* Liability did not shrink again, try GC */
203 * Try to run garbage collector unless it was already tried too many 178 dbg_budg("Run GC");
204 * times.
205 */
206 if (ri->gc_retries < MAX_GC_RETRIES) {
207 ri->gc_retries += 1;
208 dbg_budg("run GC, retries %d of %d",
209 ri->gc_retries, MAX_GC_RETRIES);
210
211 ri->try_gc = 0;
212 err = run_gc(c); 179 err = run_gc(c);
213 if (!err) 180 if (!err)
214 return -EAGAIN; 181 return -EAGAIN;
215 182
216 if (err == -EAGAIN) { 183 if (err != -EAGAIN && err != -ENOSPC)
217 dbg_budg("GC asked to commit"); 184 /* Some real error happened */
218 err = ubifs_run_commit(c);
219 if (err)
220 return err;
221 return -EAGAIN;
222 }
223
224 if (err != -ENOSPC)
225 return err;
226
227 /*
228 * GC could not make any progress. If this is the first time,
229 * then it makes sense to try to commit, because it might make
230 * some dirty space.
231 */
232 dbg_budg("GC returned -ENOSPC, retries %d",
233 ri->nospc_retries);
234 if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
235 return err; 185 return err;
236 ri->nospc_retries += 1;
237 }
238 186
239 /* Neither GC nor write-back helped, try to commit */ 187 dbg_budg("Run commit (retries %d)", retries);
240 if (ri->cmt_retries < MAX_CMT_RETRIES) {
241 ri->cmt_retries += 1;
242 dbg_budg("run commit, retries %d of %d",
243 ri->cmt_retries, MAX_CMT_RETRIES);
244 err = ubifs_run_commit(c); 188 err = ubifs_run_commit(c);
245 if (err) 189 if (err)
246 return err; 190 return err;
247 return -EAGAIN; 191 } while (retries++ < MAX_MKSPC_RETRIES);
248 } 192
249 return -ENOSPC; 193 return -ENOSPC;
250} 194}
251 195
@@ -258,8 +202,8 @@ static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
258 */ 202 */
259int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
260{ 204{
261 int ret; 205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
262 uint64_t idx_size; 206 long long idx_size;
263 207
264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
265 209
@@ -271,23 +215,16 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
271 * pair, nor similarly the two variables for the new index size, so we 215 * pair, nor similarly the two variables for the new index size, so we
272 * have to do this costly 64-bit division on fast-path. 216 * have to do this costly 64-bit division on fast-path.
273 */ 217 */
274 if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) 218 idx_size += eff_leb_size - 1;
275 ret = idx_size + 1; 219 idx_lebs = div_u64(idx_size, eff_leb_size);
276 else
277 ret = idx_size;
278 /* 220 /*
279 * The index head is not available for the in-the-gaps method, so add an 221 * The index head is not available for the in-the-gaps method, so add an
280 * extra LEB to compensate. 222 * extra LEB to compensate.
281 */ 223 */
282 ret += 1; 224 idx_lebs += 1;
283 /* 225 if (idx_lebs < MIN_INDEX_LEBS)
284 * At present the index needs at least 2 LEBs: one for the index head 226 idx_lebs = MIN_INDEX_LEBS;
285 * and one for in-the-gaps method (which currently does not cater for 227 return idx_lebs;
286 * the index head and so excludes it from consideration).
287 */
288 if (ret < 2)
289 ret = 2;
290 return ret;
291} 228}
292 229
293/** 230/**
@@ -530,8 +467,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
530int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) 467int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
531{ 468{
532 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); 469 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
533 int err, idx_growth, data_growth, dd_growth; 470 int err, idx_growth, data_growth, dd_growth, retried = 0;
534 struct retries_info ri;
535 471
536 ubifs_assert(req->new_page <= 1); 472 ubifs_assert(req->new_page <= 1);
537 ubifs_assert(req->dirtied_page <= 1); 473 ubifs_assert(req->dirtied_page <= 1);
@@ -549,7 +485,6 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
549 if (!data_growth && !dd_growth) 485 if (!data_growth && !dd_growth)
550 return 0; 486 return 0;
551 idx_growth = calc_idx_growth(c, req); 487 idx_growth = calc_idx_growth(c, req);
552 memset(&ri, 0, sizeof(struct retries_info));
553 488
554again: 489again:
555 spin_lock(&c->space_lock); 490 spin_lock(&c->space_lock);
@@ -587,12 +522,17 @@ again:
587 return err; 522 return err;
588 } 523 }
589 524
590 err = make_free_space(c, &ri); 525 err = make_free_space(c);
526 cond_resched();
591 if (err == -EAGAIN) { 527 if (err == -EAGAIN) {
592 dbg_budg("try again"); 528 dbg_budg("try again");
593 cond_resched();
594 goto again; 529 goto again;
595 } else if (err == -ENOSPC) { 530 } else if (err == -ENOSPC) {
531 if (!retried) {
532 retried = 1;
533 dbg_budg("-ENOSPC, but anyway try once again");
534 goto again;
535 }
596 dbg_budg("FS is full, -ENOSPC"); 536 dbg_budg("FS is full, -ENOSPC");
597 c->nospace = 1; 537 c->nospace = 1;
598 if (can_use_rp(c) || c->rp_size == 0) 538 if (can_use_rp(c) || c->rp_size == 0)
@@ -666,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
666 * @c: UBIFS file-system description object 606 * @c: UBIFS file-system description object
667 * 607 *
668 * This function converts budget which was allocated for a new page of data to 608 * This function converts budget which was allocated for a new page of data to
669 * the budget of changing an existing page of data. The latter is smaller then 609 * the budget of changing an existing page of data. The latter is smaller than
670 * the former, so this function only does simple re-calculation and does not 610 * the former, so this function only does simple re-calculation and does not
671 * involve any write-back. 611 * involve any write-back.
672 */ 612 */
@@ -712,9 +652,9 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
712 * user-space. User-space application tend to expect that if the file-system 652 * user-space. User-space application tend to expect that if the file-system
713 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they 653 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
714 * are able to write a file of size N. UBIFS attaches node headers to each data 654 * are able to write a file of size N. UBIFS attaches node headers to each data
715 * node and it has to write indexind nodes as well. This introduces additional 655 * node and it has to write indexing nodes as well. This introduces additional
716 * overhead, and UBIFS it has to report sligtly less free space to meet the 656 * overhead, and UBIFS has to report slightly less free space to meet the above
717 * above expectetion. 657 * expectations.
718 * 658 *
719 * This function assumes free space is made up of uncompressed data nodes and 659 * This function assumes free space is made up of uncompressed data nodes and
720 * full index nodes (one per data node, tripled because we always allow enough 660 * full index nodes (one per data node, tripled because we always allow enough
@@ -723,7 +663,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
723 * Note, the calculation is pessimistic, which means that most of the time 663 * Note, the calculation is pessimistic, which means that most of the time
724 * UBIFS reports less space than it actually has. 664 * UBIFS reports less space than it actually has.
725 */ 665 */
726long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) 666long long ubifs_reported_space(const struct ubifs_info *c, long long free)
727{ 667{
728 int divisor, factor, f; 668 int divisor, factor, f;
729 669
@@ -737,7 +677,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
737 * of data nodes, f - fanout. Because effective UBIFS fanout is twice 677 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
738 * as less than maximum fanout, we assume that each data node 678 * as less than maximum fanout, we assume that each data node
739 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. 679 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
740 * Note, the multiplier 3 is because UBIFS reseves thrice as more space 680 * Note, the multiplier 3 is because UBIFS reserves thrice as more space
741 * for the index. 681 * for the index.
742 */ 682 */
743 f = c->fanout > 3 ? c->fanout >> 1 : 2; 683 f = c->fanout > 3 ? c->fanout >> 1 : 2;
@@ -745,8 +685,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
745 divisor = UBIFS_MAX_DATA_NODE_SZ; 685 divisor = UBIFS_MAX_DATA_NODE_SZ;
746 divisor += (c->max_idx_node_sz * 3) / (f - 1); 686 divisor += (c->max_idx_node_sz * 3) / (f - 1);
747 free *= factor; 687 free *= factor;
748 do_div(free, divisor); 688 return div_u64(free, divisor);
749 return free;
750} 689}
751 690
752/** 691/**
@@ -756,10 +695,10 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
756 * This function calculates amount of free space to report to user-space. 695 * This function calculates amount of free space to report to user-space.
757 * 696 *
758 * Because UBIFS may introduce substantial overhead (the index, node headers, 697 * Because UBIFS may introduce substantial overhead (the index, node headers,
759 * alighment, wastage at the end of eraseblocks, etc), it cannot report real 698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real
760 * amount of free flash space it has (well, because not all dirty space is 699 * amount of free flash space it has (well, because not all dirty space is
761 * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, 700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
762 * it would bread user expectetion about what free space is. Users seem to 701 * it would break user expectations about what free space is. Users seem to
763 * be accustomed to assuming that if the file-system reports N bytes of free space, 702 * be accustomed to assuming that if the file-system reports N bytes of free space,
764 * they would be able to fit a file of N bytes to the FS. This almost works for 703 * they would be able to fit a file of N bytes to the FS. This almost works for
765 * traditional file-systems, because they have way less overhead than UBIFS. 704 * traditional file-systems, because they have way less overhead than UBIFS.
@@ -771,18 +710,9 @@ long long ubifs_get_free_space(struct ubifs_info *c)
771 long long available, outstanding, free; 710 long long available, outstanding, free;
772 711
773 spin_lock(&c->space_lock); 712 spin_lock(&c->space_lock);
774 min_idx_lebs = ubifs_calc_min_idx_lebs(c); 713 min_idx_lebs = c->min_idx_lebs;
714 ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
775 outstanding = c->budg_data_growth + c->budg_dd_growth; 715 outstanding = c->budg_data_growth + c->budg_dd_growth;
776
777 /*
778 * Force the amount available to the total size reported if the used
779 * space is zero.
780 */
781 if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
782 spin_unlock(&c->space_lock);
783 return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
784 }
785
786 available = ubifs_calc_available(c, min_idx_lebs); 716 available = ubifs_calc_available(c, min_idx_lebs);
787 717
788 /* 718 /*
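For reference, the free-space arithmetic the budget.c hunks above converge on: reported space is free * factor / divisor, where factor is the UBIFS block size and divisor is the worst-case on-flash cost of one block (the maximum data node size plus the tripled indexing overhead at half the maximum fanout). The change also swaps the open-coded do_div() sequence, which stores the quotient back into its argument, for div_u64(), which simply returns it. A minimal user-space sketch of the calculation; the 4096/4144 constants stand in for UBIFS_BLOCK_SIZE and UBIFS_MAX_DATA_NODE_SZ and should be read as assumptions, not authoritative values:

    #include <stdio.h>

    #define BLOCK_SIZE       4096   /* stand-in for UBIFS_BLOCK_SIZE */
    #define MAX_DATA_NODE_SZ 4144   /* stand-in for UBIFS_MAX_DATA_NODE_SZ */

    static long long reported_space(long long free, int fanout,
                                    int max_idx_node_sz)
    {
            int f = fanout > 3 ? fanout >> 1 : 2;   /* effective fanout */
            int divisor = MAX_DATA_NODE_SZ;

            /* each data node drags 3 index-node shares behind it */
            divisor += (max_idx_node_sz * 3) / (f - 1);
            return free * BLOCK_SIZE / divisor;     /* div_u64() in-kernel */
    }

    int main(void)
    {
            /* 1 MiB raw free space, fanout 8, 512-byte index nodes */
            printf("%lld\n", reported_space(1 << 20, 8, 512));
            return 0;
    }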
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index b49884c8c10e..f3a7945527fb 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -470,12 +470,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
470{ 470{
471 struct ubifs_idx_node *idx; 471 struct ubifs_idx_node *idx;
472 int lnum, offs, len, err = 0; 472 int lnum, offs, len, err = 0;
473 struct ubifs_debug_info *d = c->dbg;
473 474
474 c->old_zroot = *zroot; 475 d->old_zroot = *zroot;
475 476 lnum = d->old_zroot.lnum;
476 lnum = c->old_zroot.lnum; 477 offs = d->old_zroot.offs;
477 offs = c->old_zroot.offs; 478 len = d->old_zroot.len;
478 len = c->old_zroot.len;
479 479
480 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); 480 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
481 if (!idx) 481 if (!idx)
@@ -485,8 +485,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
485 if (err) 485 if (err)
486 goto out; 486 goto out;
487 487
488 c->old_zroot_level = le16_to_cpu(idx->level); 488 d->old_zroot_level = le16_to_cpu(idx->level);
489 c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); 489 d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
490out: 490out:
491 kfree(idx); 491 kfree(idx);
492 return err; 492 return err;
@@ -509,6 +509,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
509{ 509{
510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; 510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
511 int first = 1, iip; 511 int first = 1, iip;
512 struct ubifs_debug_info *d = c->dbg;
512 union ubifs_key lower_key, upper_key, l_key, u_key; 513 union ubifs_key lower_key, upper_key, l_key, u_key;
513 unsigned long long uninitialized_var(last_sqnum); 514 unsigned long long uninitialized_var(last_sqnum);
514 struct ubifs_idx_node *idx; 515 struct ubifs_idx_node *idx;
@@ -525,9 +526,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
525 UBIFS_IDX_NODE_SZ; 526 UBIFS_IDX_NODE_SZ;
526 527
527 /* Start at the old zroot */ 528 /* Start at the old zroot */
528 lnum = c->old_zroot.lnum; 529 lnum = d->old_zroot.lnum;
529 offs = c->old_zroot.offs; 530 offs = d->old_zroot.offs;
530 len = c->old_zroot.len; 531 len = d->old_zroot.len;
531 iip = 0; 532 iip = 0;
532 533
533 /* 534 /*
@@ -560,11 +561,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
560 if (first) { 561 if (first) {
561 first = 0; 562 first = 0;
562 /* Check root level and sqnum */ 563 /* Check root level and sqnum */
563 if (le16_to_cpu(idx->level) != c->old_zroot_level) { 564 if (le16_to_cpu(idx->level) != d->old_zroot_level) {
564 err = 2; 565 err = 2;
565 goto out_dump; 566 goto out_dump;
566 } 567 }
567 if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) { 568 if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) {
568 err = 3; 569 err = 3;
569 goto out_dump; 570 goto out_dump;
570 } 571 }
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index a0ada596b17c..11e4132f314a 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -33,7 +33,7 @@
33/* Fake description object for the "none" compressor */ 33/* Fake description object for the "none" compressor */
34static struct ubifs_compressor none_compr = { 34static struct ubifs_compressor none_compr = {
35 .compr_type = UBIFS_COMPR_NONE, 35 .compr_type = UBIFS_COMPR_NONE,
36 .name = "no compression", 36 .name = "none",
37 .capi_name = "", 37 .capi_name = "",
38}; 38};
39 39
@@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex);
43static struct ubifs_compressor lzo_compr = { 43static struct ubifs_compressor lzo_compr = {
44 .compr_type = UBIFS_COMPR_LZO, 44 .compr_type = UBIFS_COMPR_LZO,
45 .comp_mutex = &lzo_mutex, 45 .comp_mutex = &lzo_mutex,
46 .name = "LZO", 46 .name = "lzo",
47 .capi_name = "lzo", 47 .capi_name = "lzo",
48}; 48};
49#else 49#else
50static struct ubifs_compressor lzo_compr = { 50static struct ubifs_compressor lzo_compr = {
51 .compr_type = UBIFS_COMPR_LZO, 51 .compr_type = UBIFS_COMPR_LZO,
52 .name = "LZO", 52 .name = "lzo",
53}; 53};
54#endif 54#endif
55 55
@@ -108,7 +108,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
108 if (compr->comp_mutex) 108 if (compr->comp_mutex)
109 mutex_lock(compr->comp_mutex); 109 mutex_lock(compr->comp_mutex);
110 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, 110 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
111 out_len); 111 (unsigned int *)out_len);
112 if (compr->comp_mutex) 112 if (compr->comp_mutex)
113 mutex_unlock(compr->comp_mutex); 113 mutex_unlock(compr->comp_mutex);
114 if (unlikely(err)) { 114 if (unlikely(err)) {
@@ -119,10 +119,10 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
119 } 119 }
120 120
121 /* 121 /*
122 * Presently, we just require that compression results in less data, 122 * If the data compressed only slightly, it is better to leave it
123 * rather than any defined minimum compression ratio or amount. 123 * uncompressed to improve read speed.
124 */ 124 */
125 if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8)) 125 if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
126 goto no_compr; 126 goto no_compr;
127 127
128 return; 128 return;
@@ -172,7 +172,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
172 if (compr->decomp_mutex) 172 if (compr->decomp_mutex)
173 mutex_lock(compr->decomp_mutex); 173 mutex_lock(compr->decomp_mutex);
174 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, 174 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
175 out_len); 175 (unsigned int *)out_len);
176 if (compr->decomp_mutex) 176 if (compr->decomp_mutex)
177 mutex_unlock(compr->decomp_mutex); 177 mutex_unlock(compr->decomp_mutex);
178 if (err) 178 if (err)
@@ -244,7 +244,7 @@ out_lzo:
244/** 244/**
245 * ubifs_compressors_exit - de-initialize UBIFS compressors. 245 * ubifs_compressors_exit - de-initialize UBIFS compressors.
246 */ 246 */
247void __exit ubifs_compressors_exit(void) 247void ubifs_compressors_exit(void)
248{ 248{
249 compr_exit(&lzo_compr); 249 compr_exit(&lzo_compr);
250 compr_exit(&zlib_compr); 250 compr_exit(&zlib_compr);
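The ubifs_compress() hunk above replaces the old rule, keep the result whenever it is smaller after 8-byte alignment, with a fixed minimum saving so that marginally compressible data stays uncompressed and reads stay fast. A side-by-side sketch of the two tests; the 64-byte value for UBIFS_MIN_COMPRESS_DIFF is an assumption taken from this kernel's ubifs.h:

    #define UBIFS_MIN_COMPRESS_DIFF 64              /* assumed threshold */
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    static int old_keep_compressed(int in_len, int out_len)
    {
            return ALIGN(out_len, 8) < ALIGN(in_len, 8);
    }

    static int new_keep_compressed(int in_len, int out_len)
    {
            return in_len - out_len >= UBIFS_MIN_COMPRESS_DIFF;
    }

    /* in_len = 4096, out_len = 4060: the old test keeps the compressed
     * form for a 36-byte saving, the new test stores the data as-is. */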
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 510ffa0bbda4..792c5a16c182 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,8 @@
32#include "ubifs.h" 32#include "ubifs.h"
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h>
36#include <linux/math64.h>
35 37
36#ifdef CONFIG_UBIFS_FS_DEBUG 38#ifdef CONFIG_UBIFS_FS_DEBUG
37 39
@@ -596,7 +598,9 @@ void dbg_dump_budg(struct ubifs_info *c)
596 struct rb_node *rb; 598 struct rb_node *rb;
597 struct ubifs_bud *bud; 599 struct ubifs_bud *bud;
598 struct ubifs_gced_idx_leb *idx_gc; 600 struct ubifs_gced_idx_leb *idx_gc;
601 long long available, outstanding, free;
599 602
603 ubifs_assert(spin_is_locked(&c->space_lock));
600 spin_lock(&dbg_lock); 604 spin_lock(&dbg_lock);
601 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " 605 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
602 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, 606 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
@@ -629,6 +633,17 @@ void dbg_dump_budg(struct ubifs_info *c)
629 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", 633 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
630 idx_gc->lnum, idx_gc->unmap); 634 idx_gc->lnum, idx_gc->unmap);
631 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); 635 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
636
637 /* Print budgeting predictions */
638 available = ubifs_calc_available(c, c->min_idx_lebs);
639 outstanding = c->budg_data_growth + c->budg_dd_growth;
640 if (available > outstanding)
641 free = ubifs_reported_space(c, available - outstanding);
642 else
643 free = 0;
644 printk(KERN_DEBUG "Budgeting predictions:\n");
645 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
646 available, outstanding, free);
632 spin_unlock(&dbg_lock); 647 spin_unlock(&dbg_lock);
633} 648}
634 649
@@ -645,7 +660,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
645 struct ubifs_lprops lp; 660 struct ubifs_lprops lp;
646 struct ubifs_lp_stats lst; 661 struct ubifs_lp_stats lst;
647 662
648 printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); 663 printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
664 current->pid);
649 ubifs_get_lp_stats(c, &lst); 665 ubifs_get_lp_stats(c, &lst);
650 dbg_dump_lstats(&lst); 666 dbg_dump_lstats(&lst);
651 667
@@ -656,6 +672,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
656 672
657 dbg_dump_lprop(c, &lp); 673 dbg_dump_lprop(c, &lp);
658 } 674 }
675 printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
676 current->pid);
659} 677}
660 678
661void dbg_dump_lpt_info(struct ubifs_info *c) 679void dbg_dump_lpt_info(struct ubifs_info *c)
@@ -663,6 +681,7 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
663 int i; 681 int i;
664 682
665 spin_lock(&dbg_lock); 683 spin_lock(&dbg_lock);
684 printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
666 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); 685 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz);
667 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); 686 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz);
668 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); 687 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz);
@@ -684,7 +703,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
684 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); 703 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
685 printk(KERN_DEBUG "\tLPT head is at %d:%d\n", 704 printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
686 c->nhead_lnum, c->nhead_offs); 705 c->nhead_lnum, c->nhead_offs);
687 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); 706 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
707 c->ltab_lnum, c->ltab_offs);
688 if (c->big_lpt) 708 if (c->big_lpt)
689 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", 709 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
690 c->lsave_lnum, c->lsave_offs); 710 c->lsave_lnum, c->lsave_offs);
@@ -703,9 +723,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
703 if (dbg_failure_mode) 723 if (dbg_failure_mode)
704 return; 724 return;
705 725
706 printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); 726 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
707 727 current->pid, lnum);
708 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 728 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
709 if (IS_ERR(sleb)) { 729 if (IS_ERR(sleb)) {
710 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 730 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
711 return; 731 return;
@@ -721,6 +741,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
721 dbg_dump_node(c, snod->node); 741 dbg_dump_node(c, snod->node);
722 } 742 }
723 743
744 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
745 current->pid, lnum);
724 ubifs_scan_destroy(sleb); 746 ubifs_scan_destroy(sleb);
725 return; 747 return;
726} 748}
@@ -768,7 +790,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
768{ 790{
769 int i; 791 int i;
770 792
771 printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", 793 printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
772 current->pid, cat, heap->cnt); 794 current->pid, cat, heap->cnt);
773 for (i = 0; i < heap->cnt; i++) { 795 for (i = 0; i < heap->cnt; i++) {
774 struct ubifs_lprops *lprops = heap->arr[i]; 796 struct ubifs_lprops *lprops = heap->arr[i];
@@ -777,6 +799,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
777 "flags %d\n", i, lprops->lnum, lprops->hpos, 799 "flags %d\n", i, lprops->lnum, lprops->hpos,
778 lprops->free, lprops->dirty, lprops->flags); 800 lprops->free, lprops->dirty, lprops->flags);
779 } 801 }
802 printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
780} 803}
781 804
782void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 805void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -784,7 +807,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
784{ 807{
785 int i; 808 int i;
786 809
787 printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); 810 printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
788 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", 811 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
789 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 812 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
790 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", 813 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -803,7 +826,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
803 int level; 826 int level;
804 827
805 printk(KERN_DEBUG "\n"); 828 printk(KERN_DEBUG "\n");
806 printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); 829 printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
807 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 830 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
808 level = znode->level; 831 level = znode->level;
809 printk(KERN_DEBUG "== Level %d ==\n", level); 832 printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -815,8 +838,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
815 dbg_dump_znode(c, znode); 838 dbg_dump_znode(c, znode);
816 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); 839 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
817 } 840 }
818 841 printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
819 printk(KERN_DEBUG "\n");
820} 842}
821 843
822static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, 844static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -992,8 +1014,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
992 zbr1->offs, DBGKEY(&key)); 1014 zbr1->offs, DBGKEY(&key));
993 dbg_err("but it should have key %s according to tnc", 1015 dbg_err("but it should have key %s according to tnc",
994 DBGKEY(&zbr1->key)); 1016 DBGKEY(&zbr1->key));
995 dbg_dump_node(c, dent1); 1017 dbg_dump_node(c, dent1);
996 goto out_free; 1018 goto out_free;
997 } 1019 }
998 1020
999 key_read(c, &dent2->key, &key); 1021 key_read(c, &dent2->key, &key);
@@ -1002,8 +1024,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1002 zbr1->offs, DBGKEY(&key)); 1024 zbr1->offs, DBGKEY(&key));
1003 dbg_err("but it should have key %s according to tnc", 1025 dbg_err("but it should have key %s according to tnc",
1004 DBGKEY(&zbr2->key)); 1026 DBGKEY(&zbr2->key));
1005 dbg_dump_node(c, dent2); 1027 dbg_dump_node(c, dent2);
1006 goto out_free; 1028 goto out_free;
1007 } 1029 }
1008 1030
1009 nlen1 = le16_to_cpu(dent1->nlen); 1031 nlen1 = le16_to_cpu(dent1->nlen);
@@ -1020,9 +1042,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1020 dbg_err("bad order of colliding key %s", 1042 dbg_err("bad order of colliding key %s",
1021 DBGKEY(&key)); 1043 DBGKEY(&key));
1022 1044
1023 dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); 1045 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
1024 dbg_dump_node(c, dent1); 1046 dbg_dump_node(c, dent1);
1025 dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); 1047 ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
1026 dbg_dump_node(c, dent2); 1048 dbg_dump_node(c, dent2);
1027 1049
1028out_free: 1050out_free:
@@ -2097,13 +2119,13 @@ static int simple_rand(void)
2097 return (next >> 16) & 32767; 2119 return (next >> 16) & 32767;
2098} 2120}
2099 2121
2100void dbg_failure_mode_registration(struct ubifs_info *c) 2122static void failure_mode_init(struct ubifs_info *c)
2101{ 2123{
2102 struct failure_mode_info *fmi; 2124 struct failure_mode_info *fmi;
2103 2125
2104 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); 2126 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
2105 if (!fmi) { 2127 if (!fmi) {
2106 dbg_err("Failed to register failure mode - no memory"); 2128 ubifs_err("Failed to register failure mode - no memory");
2107 return; 2129 return;
2108 } 2130 }
2109 fmi->c = c; 2131 fmi->c = c;
@@ -2112,7 +2134,7 @@ void dbg_failure_mode_registration(struct ubifs_info *c)
2112 spin_unlock(&fmi_lock); 2134 spin_unlock(&fmi_lock);
2113} 2135}
2114 2136
2115void dbg_failure_mode_deregistration(struct ubifs_info *c) 2137static void failure_mode_exit(struct ubifs_info *c)
2116{ 2138{
2117 struct failure_mode_info *fmi, *tmp; 2139 struct failure_mode_info *fmi, *tmp;
2118 2140
@@ -2146,42 +2168,44 @@ static int in_failure_mode(struct ubi_volume_desc *desc)
2146 struct ubifs_info *c = dbg_find_info(desc); 2168 struct ubifs_info *c = dbg_find_info(desc);
2147 2169
2148 if (c && dbg_failure_mode) 2170 if (c && dbg_failure_mode)
2149 return c->failure_mode; 2171 return c->dbg->failure_mode;
2150 return 0; 2172 return 0;
2151} 2173}
2152 2174
2153static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) 2175static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2154{ 2176{
2155 struct ubifs_info *c = dbg_find_info(desc); 2177 struct ubifs_info *c = dbg_find_info(desc);
2178 struct ubifs_debug_info *d;
2156 2179
2157 if (!c || !dbg_failure_mode) 2180 if (!c || !dbg_failure_mode)
2158 return 0; 2181 return 0;
2159 if (c->failure_mode) 2182 d = c->dbg;
2183 if (d->failure_mode)
2160 return 1; 2184 return 1;
2161 if (!c->fail_cnt) { 2185 if (!d->fail_cnt) {
2162 /* First call - decide delay to failure */ 2186 /* First call - decide delay to failure */
2163 if (chance(1, 2)) { 2187 if (chance(1, 2)) {
2164 unsigned int delay = 1 << (simple_rand() >> 11); 2188 unsigned int delay = 1 << (simple_rand() >> 11);
2165 2189
2166 if (chance(1, 2)) { 2190 if (chance(1, 2)) {
2167 c->fail_delay = 1; 2191 d->fail_delay = 1;
2168 c->fail_timeout = jiffies + 2192 d->fail_timeout = jiffies +
2169 msecs_to_jiffies(delay); 2193 msecs_to_jiffies(delay);
2170 dbg_rcvry("failing after %ums", delay); 2194 dbg_rcvry("failing after %ums", delay);
2171 } else { 2195 } else {
2172 c->fail_delay = 2; 2196 d->fail_delay = 2;
2173 c->fail_cnt_max = delay; 2197 d->fail_cnt_max = delay;
2174 dbg_rcvry("failing after %u calls", delay); 2198 dbg_rcvry("failing after %u calls", delay);
2175 } 2199 }
2176 } 2200 }
2177 c->fail_cnt += 1; 2201 d->fail_cnt += 1;
2178 } 2202 }
2179 /* Determine if failure delay has expired */ 2203 /* Determine if failure delay has expired */
2180 if (c->fail_delay == 1) { 2204 if (d->fail_delay == 1) {
2181 if (time_before(jiffies, c->fail_timeout)) 2205 if (time_before(jiffies, d->fail_timeout))
2182 return 0; 2206 return 0;
2183 } else if (c->fail_delay == 2) 2207 } else if (d->fail_delay == 2)
2184 if (c->fail_cnt++ < c->fail_cnt_max) 2208 if (d->fail_cnt++ < d->fail_cnt_max)
2185 return 0; 2209 return 0;
2186 if (lnum == UBIFS_SB_LNUM) { 2210 if (lnum == UBIFS_SB_LNUM) {
2187 if (write) { 2211 if (write) {
@@ -2239,7 +2263,7 @@ static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2239 dbg_rcvry("failing in bud LEB %d commit not running", lnum); 2263 dbg_rcvry("failing in bud LEB %d commit not running", lnum);
2240 } 2264 }
2241 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); 2265 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
2242 c->failure_mode = 1; 2266 d->failure_mode = 1;
2243 dump_stack(); 2267 dump_stack();
2244 return 1; 2268 return 1;
2245} 2269}
@@ -2344,4 +2368,181 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2344 return 0; 2368 return 0;
2345} 2369}
2346 2370
2371/**
2372 * ubifs_debugging_init - initialize UBIFS debugging.
2373 * @c: UBIFS file-system description object
2374 *
2375 * This function initializes debugging-related data for the file system.
2376 * Returns zero in case of success and a negative error code in case of
2377 * failure.
2378 */
2379int ubifs_debugging_init(struct ubifs_info *c)
2380{
2381 c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
2382 if (!c->dbg)
2383 return -ENOMEM;
2384
2385 c->dbg->buf = vmalloc(c->leb_size);
2386 if (!c->dbg->buf)
2387 goto out;
2388
2389 failure_mode_init(c);
2390 return 0;
2391
2392out:
2393 kfree(c->dbg);
2394 return -ENOMEM;
2395}
2396
2397/**
2398 * ubifs_debugging_exit - free debugging data.
2399 * @c: UBIFS file-system description object
2400 */
2401void ubifs_debugging_exit(struct ubifs_info *c)
2402{
2403 failure_mode_exit(c);
2404 vfree(c->dbg->buf);
2405 kfree(c->dbg);
2406}
2407
2408/*
2409 * Root directory for UBIFS entries in debugfs. Contains sub-directories which
2410 * hold the files specific to particular file-system mounts.
2411 */
2412static struct dentry *debugfs_rootdir;
2413
2414/**
2415 * dbg_debugfs_init - initialize debugfs file-system.
2416 *
2417 * UBIFS uses the debugfs file-system to expose various debugging knobs to
2418 * user-space. This function creates the "ubifs" directory in the debugfs
2419 * file-system. Returns zero in case of success and a negative error code in
2420 * case of failure.
2421 */
2422int dbg_debugfs_init(void)
2423{
2424 debugfs_rootdir = debugfs_create_dir("ubifs", NULL);
2425 if (IS_ERR(debugfs_rootdir)) {
2426 int err = PTR_ERR(debugfs_rootdir);
2427 ubifs_err("cannot create \"ubifs\" debugfs directory, "
2428 "error %d\n", err);
2429 return err;
2430 }
2431
2432 return 0;
2433}
2434
2435/**
2436 * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
2437 */
2438void dbg_debugfs_exit(void)
2439{
2440 debugfs_remove(debugfs_rootdir);
2441}
2442
2443static int open_debugfs_file(struct inode *inode, struct file *file)
2444{
2445 file->private_data = inode->i_private;
2446 return 0;
2447}
2448
2449static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2450 size_t count, loff_t *ppos)
2451{
2452 struct ubifs_info *c = file->private_data;
2453 struct ubifs_debug_info *d = c->dbg;
2454
2455 if (file->f_path.dentry == d->dump_lprops)
2456 dbg_dump_lprops(c);
2457 else if (file->f_path.dentry == d->dump_budg) {
2458 spin_lock(&c->space_lock);
2459 dbg_dump_budg(c);
2460 spin_unlock(&c->space_lock);
2461 } else if (file->f_path.dentry == d->dump_tnc) {
2462 mutex_lock(&c->tnc_mutex);
2463 dbg_dump_tnc(c);
2464 mutex_unlock(&c->tnc_mutex);
2465 } else
2466 return -EINVAL;
2467
2468 *ppos += count;
2469 return count;
2470}
2471
2472static const struct file_operations debugfs_fops = {
2473 .open = open_debugfs_file,
2474 .write = write_debugfs_file,
2475 .owner = THIS_MODULE,
2476};
2477
2478/**
2479 * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
2480 * @c: UBIFS file-system description object
2481 *
2482 * This function creates all debugfs files for this instance of UBIFS. Returns
2483 * zero in case of success and a negative error code in case of failure.
2484 *
2485 * Note, the only reason we have not merged this function with the
2486 * 'ubifs_debugging_init()' function is because it is better to initialize
2487 * debugfs interfaces at the very end of the mount process, and remove them at
2488 * the very beginning of the un-mount process.
2489 */
2490int dbg_debugfs_init_fs(struct ubifs_info *c)
2491{
2492 int err;
2493 const char *fname;
2494 struct dentry *dent;
2495 struct ubifs_debug_info *d = c->dbg;
2496
2497 sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2498 d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name,
2499 debugfs_rootdir);
2500 if (IS_ERR(d->debugfs_dir)) {
2501 err = PTR_ERR(d->debugfs_dir);
2502 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2503 d->debugfs_dir_name, err);
2504 goto out;
2505 }
2506
2507 fname = "dump_lprops";
2508 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2509 &debugfs_fops);
2510 if (IS_ERR(dent))
2511 goto out_remove;
2512 d->dump_lprops = dent;
2513
2514 fname = "dump_budg";
2515 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2516 &debugfs_fops);
2517 if (IS_ERR(dent))
2518 goto out_remove;
2519 d->dump_budg = dent;
2520
2521 fname = "dump_tnc";
2522 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2523 &debugfs_fops);
2524 if (IS_ERR(dent))
2525 goto out_remove;
2526 d->dump_tnc = dent;
2527
2528 return 0;
2529
2530out_remove:
2531 err = PTR_ERR(dent);
2532 ubifs_err("cannot create \"%s\" debugfs file, error %d\n",
2533 fname, err);
2534 debugfs_remove_recursive(d->debugfs_dir);
2535out:
2536 return err;
2537}
2538
2539/**
2540 * dbg_debugfs_exit_fs - remove all debugfs files.
2541 * @c: UBIFS file-system description object
2542 */
2543void dbg_debugfs_exit_fs(struct ubifs_info *c)
2544{
2545 debugfs_remove_recursive(c->dbg->debugfs_dir);
2546}
2547
2347#endif /* CONFIG_UBIFS_FS_DEBUG */ 2548#endif /* CONFIG_UBIFS_FS_DEBUG */
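The additions to debug.c wire up three write-only debugfs knobs; writing any byte to one of them triggers the matching dump under the proper lock (note how write_debugfs_file() takes c->space_lock before dbg_dump_budg(), satisfying the new spin_is_locked() assertion). A hypothetical user-space trigger, assuming debugfs is mounted at /sys/kernel/debug and the file-system sits on UBI device 0, volume 0, matching the "ubi%d_%d" directory name built in dbg_debugfs_init_fs():

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/kernel/debug/ubifs/ubi0_0/dump_budg",
                          O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, "1", 1);      /* content is ignored, any write fires */
            close(fd);
            return 0;
    }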
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 33d6b95071e4..9820d6999f7e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -25,7 +25,56 @@
25 25
26#ifdef CONFIG_UBIFS_FS_DEBUG 26#ifdef CONFIG_UBIFS_FS_DEBUG
27 27
28#define UBIFS_DBG(op) op 28/**
29 * struct ubifs_debug_info - per-FS debugging information.
30 * @buf: a buffer of LEB size, used for various purposes
31 * @old_zroot: old index root - used by 'dbg_check_old_index()'
32 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
33 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
34 * @failure_mode: failure mode for recovery testing
35 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
36 * @fail_timeout: time in jiffies when delay of failure mode expires
37 * @fail_cnt: current number of calls to failure mode I/O functions
38 * @fail_cnt_max: number of calls by which to delay failure mode
39 * @chk_lpt_sz: used by LPT tree size checker
40 * @chk_lpt_sz2: used by LPT tree size checker
41 * @chk_lpt_wastage: used by LPT tree size checker
42 * @chk_lpt_lebs: used by LPT tree size checker
43 * @new_nhead_offs: used by LPT tree size checker
44 * @new_ihead_lnum: used by debugging to check ihead_lnum
45 * @new_ihead_offs: used by debugging to check ihead_offs
46 *
47 * @debugfs_dir_name: name of debugfs directory containing this file-system's
48 *                    files
49 * @debugfs_dir: direntry object of the file-system debugfs directory
50 * @dump_lprops: "dump lprops" debugfs knob
51 * @dump_budg: "dump budgeting information" debugfs knob
52 * @dump_tnc: "dump TNC" debugfs knob
53 */
54struct ubifs_debug_info {
55 void *buf;
56 struct ubifs_zbranch old_zroot;
57 int old_zroot_level;
58 unsigned long long old_zroot_sqnum;
59 int failure_mode;
60 int fail_delay;
61 unsigned long fail_timeout;
62 unsigned int fail_cnt;
63 unsigned int fail_cnt_max;
64 long long chk_lpt_sz;
65 long long chk_lpt_sz2;
66 long long chk_lpt_wastage;
67 int chk_lpt_lebs;
68 int new_nhead_offs;
69 int new_ihead_lnum;
70 int new_ihead_offs;
71
72 char debugfs_dir_name[100];
73 struct dentry *debugfs_dir;
74 struct dentry *dump_lprops;
75 struct dentry *dump_budg;
76 struct dentry *dump_tnc;
77};
29 78
30#define ubifs_assert(expr) do { \ 79#define ubifs_assert(expr) do { \
31 if (unlikely(!(expr))) { \ 80 if (unlikely(!(expr))) { \
@@ -211,14 +260,18 @@ extern unsigned int ubifs_msg_flags;
211extern unsigned int ubifs_chk_flags; 260extern unsigned int ubifs_chk_flags;
212extern unsigned int ubifs_tst_flags; 261extern unsigned int ubifs_tst_flags;
213 262
214/* Dump functions */ 263int ubifs_debugging_init(struct ubifs_info *c);
264void ubifs_debugging_exit(struct ubifs_info *c);
215 265
266/* Dump functions */
216const char *dbg_ntype(int type); 267const char *dbg_ntype(int type);
217const char *dbg_cstate(int cmt_state); 268const char *dbg_cstate(int cmt_state);
218const char *dbg_get_key_dump(const struct ubifs_info *c, 269const char *dbg_get_key_dump(const struct ubifs_info *c,
219 const union ubifs_key *key); 270 const union ubifs_key *key);
220void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); 271void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
221void dbg_dump_node(const struct ubifs_info *c, const void *node); 272void dbg_dump_node(const struct ubifs_info *c, const void *node);
273void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
274 int offs);
222void dbg_dump_budget_req(const struct ubifs_budget_req *req); 275void dbg_dump_budget_req(const struct ubifs_budget_req *req);
223void dbg_dump_lstats(const struct ubifs_lp_stats *lst); 276void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
224void dbg_dump_budg(struct ubifs_info *c); 277void dbg_dump_budg(struct ubifs_info *c);
@@ -233,9 +286,9 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
233 struct ubifs_nnode *parent, int iip); 286 struct ubifs_nnode *parent, int iip);
234void dbg_dump_tnc(struct ubifs_info *c); 287void dbg_dump_tnc(struct ubifs_info *c);
235void dbg_dump_index(struct ubifs_info *c); 288void dbg_dump_index(struct ubifs_info *c);
289void dbg_dump_lpt_lebs(const struct ubifs_info *c);
236 290
237/* Checking helper functions */ 291/* Checking helper functions */
238
239typedef int (*dbg_leaf_callback)(struct ubifs_info *c, 292typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
240 struct ubifs_zbranch *zbr, void *priv); 293 struct ubifs_zbranch *zbr, void *priv);
241typedef int (*dbg_znode_callback)(struct ubifs_info *c, 294typedef int (*dbg_znode_callback)(struct ubifs_info *c,
@@ -274,9 +327,6 @@ int dbg_force_in_the_gaps(void);
274 327
275#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) 328#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
276 329
277void dbg_failure_mode_registration(struct ubifs_info *c);
278void dbg_failure_mode_deregistration(struct ubifs_info *c);
279
280#ifndef UBIFS_DBG_PRESERVE_UBI 330#ifndef UBIFS_DBG_PRESERVE_UBI
281 331
282#define ubi_leb_read dbg_leb_read 332#define ubi_leb_read dbg_leb_read
@@ -318,9 +368,13 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
318 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); 368 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
319} 369}
320 370
321#else /* !CONFIG_UBIFS_FS_DEBUG */ 371/* Debugfs-related stuff */
372int dbg_debugfs_init(void);
373void dbg_debugfs_exit(void);
374int dbg_debugfs_init_fs(struct ubifs_info *c);
375void dbg_debugfs_exit_fs(struct ubifs_info *c);
322 376
323#define UBIFS_DBG(op) 377#else /* !CONFIG_UBIFS_FS_DEBUG */
324 378
325/* Use "if (0)" to make compiler check arguments even if debugging is off */ 379/* Use "if (0)" to make compiler check arguments even if debugging is off */
326#define ubifs_assert(expr) do { \ 380#define ubifs_assert(expr) do { \
@@ -360,23 +414,28 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
360#define DBGKEY(key) ((char *)(key)) 414#define DBGKEY(key) ((char *)(key))
361#define DBGKEY1(key) ((char *)(key)) 415#define DBGKEY1(key) ((char *)(key))
362 416
363#define dbg_ntype(type) "" 417#define ubifs_debugging_init(c) 0
364#define dbg_cstate(cmt_state) "" 418#define ubifs_debugging_exit(c) ({})
365#define dbg_get_key_dump(c, key) ({}) 419
366#define dbg_dump_inode(c, inode) ({}) 420#define dbg_ntype(type) ""
367#define dbg_dump_node(c, node) ({}) 421#define dbg_cstate(cmt_state) ""
368#define dbg_dump_budget_req(req) ({}) 422#define dbg_get_key_dump(c, key) ({})
369#define dbg_dump_lstats(lst) ({}) 423#define dbg_dump_inode(c, inode) ({})
370#define dbg_dump_budg(c) ({}) 424#define dbg_dump_node(c, node) ({})
371#define dbg_dump_lprop(c, lp) ({}) 425#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
372#define dbg_dump_lprops(c) ({}) 426#define dbg_dump_budget_req(req) ({})
373#define dbg_dump_lpt_info(c) ({}) 427#define dbg_dump_lstats(lst) ({})
374#define dbg_dump_leb(c, lnum) ({}) 428#define dbg_dump_budg(c) ({})
375#define dbg_dump_znode(c, znode) ({}) 429#define dbg_dump_lprop(c, lp) ({})
376#define dbg_dump_heap(c, heap, cat) ({}) 430#define dbg_dump_lprops(c) ({})
377#define dbg_dump_pnode(c, pnode, parent, iip) ({}) 431#define dbg_dump_lpt_info(c) ({})
378#define dbg_dump_tnc(c) ({}) 432#define dbg_dump_leb(c, lnum) ({})
379#define dbg_dump_index(c) ({}) 433#define dbg_dump_znode(c, znode) ({})
434#define dbg_dump_heap(c, heap, cat) ({})
435#define dbg_dump_pnode(c, pnode, parent, iip) ({})
436#define dbg_dump_tnc(c) ({})
437#define dbg_dump_index(c) ({})
438#define dbg_dump_lpt_lebs(c) ({})
380 439
381#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 440#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
382#define dbg_old_index_check_init(c, zroot) 0 441#define dbg_old_index_check_init(c, zroot) 0
@@ -396,9 +455,11 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
396#define dbg_force_in_the_gaps_enabled 0 455#define dbg_force_in_the_gaps_enabled 0
397#define dbg_force_in_the_gaps() 0 456#define dbg_force_in_the_gaps() 0
398#define dbg_failure_mode 0 457#define dbg_failure_mode 0
399#define dbg_failure_mode_registration(c) ({})
400#define dbg_failure_mode_deregistration(c) ({})
401 458
402#endif /* !CONFIG_UBIFS_FS_DEBUG */ 459#define dbg_debugfs_init() 0
460#define dbg_debugfs_exit()
461#define dbg_debugfs_init_fs(c) 0
462#define dbg_debugfs_exit_fs(c) 0
403 463
464#endif /* !CONFIG_UBIFS_FS_DEBUG */
404#endif /* !__UBIFS_DEBUG_H__ */ 465#endif /* !__UBIFS_DEBUG_H__ */
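Two C idioms carry the !CONFIG_UBIFS_FS_DEBUG half of the header above: '({})' is a GCC statement expression that expands to nothing yet remains usable where an expression is expected, and 'if (0)' keeps a macro argument parsed and type-checked while the optimizer discards the code. A minimal sketch of the pattern, not the header's exact text:

    #define dbg_dump_budg(c) ({})   /* no-op that is still an expression */

    /* "expr" is type-checked at compile time but never evaluated */
    #define ubifs_assert(expr) do { \
            if (0 && (expr))        \
                    ;               \
    } while (0)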
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2624411d9758..bf37374567fa 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
72 return err; 72 return err;
73 } 73 }
74 74
75 ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum); 75 ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
76 76 ubifs_inode(inode)->creat_sqnum);
77 len = le32_to_cpu(dn->size); 77 len = le32_to_cpu(dn->size);
78 if (len <= 0 || len > UBIFS_BLOCK_SIZE) 78 if (len <= 0 || len > UBIFS_BLOCK_SIZE)
79 goto dump; 79 goto dump;
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
219} 219}
220 220
221static int write_begin_slow(struct address_space *mapping, 221static int write_begin_slow(struct address_space *mapping,
222 loff_t pos, unsigned len, struct page **pagep) 222 loff_t pos, unsigned len, struct page **pagep,
223 unsigned flags)
223{ 224{
224 struct inode *inode = mapping->host; 225 struct inode *inode = mapping->host;
225 struct ubifs_info *c = inode->i_sb->s_fs_info; 226 struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping,
247 if (unlikely(err)) 248 if (unlikely(err))
248 return err; 249 return err;
249 250
250 page = __grab_cache_page(mapping, index); 251 page = grab_cache_page_write_begin(mapping, index, flags);
251 if (unlikely(!page)) { 252 if (unlikely(!page)) {
252 ubifs_release_budget(c, &req); 253 ubifs_release_budget(c, &req);
253 return -ENOMEM; 254 return -ENOMEM;
254 } 255 }
255 256
256 if (!PageUptodate(page)) { 257 if (!PageUptodate(page)) {
257 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 258 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
258 SetPageChecked(page); 259 SetPageChecked(page);
259 else { 260 else {
260 err = do_readpage(page); 261 err = do_readpage(page);
@@ -438,13 +439,13 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
438 return -EROFS; 439 return -EROFS;
439 440
440 /* Try out the fast-path part first */ 441 /* Try out the fast-path part first */
441 page = __grab_cache_page(mapping, index); 442 page = grab_cache_page_write_begin(mapping, index, flags);
442 if (unlikely(!page)) 443 if (unlikely(!page))
443 return -ENOMEM; 444 return -ENOMEM;
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
483 unlock_page(page); 484 unlock_page(page);
484 page_cache_release(page); 485 page_cache_release(page);
485 486
486 return write_begin_slow(mapping, pos, len, pagep); 487 return write_begin_slow(mapping, pos, len, pagep, flags);
487 } 488 }
488 489
489 /* 490 /*
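The file.c hunks do two things: they switch to grab_cache_page_write_begin(), which passes the write_begin() flags (e.g. AOP_FLAG_NOFS) down to page allocation, and they flip PAGE_CACHE_MASK to ~PAGE_CACHE_MASK in the full-page write test. The flip is a genuine bug fix: PAGE_CACHE_MASK keeps the page-aligned part of an offset and is therefore non-zero for every position past the first page, whereas ~PAGE_CACHE_MASK keeps the offset within the page. A sketch assuming 4 KiB pages:

    #define PAGE_CACHE_SIZE 4096UL                  /* assumed page size */
    #define PAGE_CACHE_MASK (~(PAGE_CACHE_SIZE - 1))

    static int page_aligned(unsigned long long pos)
    {
            return !(pos & ~PAGE_CACHE_MASK);
    }

    /* pos = 0x2000, a page-aligned offset:
     *   pos &  PAGE_CACHE_MASK == 0x2000 -> old test !(...) is false and
     *                                       the aligned write is missed
     *   pos & ~PAGE_CACHE_MASK == 0      -> new test !(...) is true      */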
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 0bef6501d58a..9832f9abe28e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -45,7 +45,7 @@
45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ 45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
46 46
47/* 47/*
48 * GC may need to move more then one LEB to make progress. The below constants 48 * GC may need to move more than one LEB to make progress. The below constants
49 * define "soft" and "hard" limits on the number of LEBs the garbage collector 49 * define "soft" and "hard" limits on the number of LEBs the garbage collector
50 * may move. 50 * may move.
51 */ 51 */
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 5e82cffe9695..6db7a6be6c97 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -154,6 +154,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
154 case FS_IOC_GETFLAGS: 154 case FS_IOC_GETFLAGS:
155 flags = ubifs2ioctl(ubifs_inode(inode)->flags); 155 flags = ubifs2ioctl(ubifs_inode(inode)->flags);
156 156
157 dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
157 return put_user(flags, (int __user *) arg); 158 return put_user(flags, (int __user *) arg);
158 159
159 case FS_IOC_SETFLAGS: { 160 case FS_IOC_SETFLAGS: {
@@ -176,6 +177,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
176 err = mnt_want_write(file->f_path.mnt); 177 err = mnt_want_write(file->f_path.mnt);
177 if (err) 178 if (err)
178 return err; 179 return err;
180 dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
179 err = setflags(inode, flags); 181 err = setflags(inode, flags);
180 mnt_drop_write(file->f_path.mnt); 182 mnt_drop_write(file->f_path.mnt);
181 return err; 183 return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f91b745908ea..9b7c54e0cd2a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -191,7 +191,7 @@ again:
191 if (wbuf->lnum != -1 && avail >= len) { 191 if (wbuf->lnum != -1 && avail >= len) {
192 /* 192 /*
193 * Someone else has switched the journal head and we have 193 * Someone else has switched the journal head and we have
194 * enough space now. This happens when more then one process is 194 * enough space now. This happens when more than one process is
195 * trying to write to the same journal head at the same time. 195 * trying to write to the same journal head at the same time.
196 */ 196 */
197 dbg_jnl("return LEB %d back, already have LEB %d:%d", 197 dbg_jnl("return LEB %d back, already have LEB %d:%d",
@@ -704,7 +704,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
704 data->size = cpu_to_le32(len); 704 data->size = cpu_to_le32(len);
705 zero_data_node_unused(data); 705 zero_data_node_unused(data);
706 706
707 if (!(ui->flags && UBIFS_COMPR_FL)) 707 if (!(ui->flags & UBIFS_COMPR_FL))
708 /* Compression is disabled for this inode */ 708 /* Compression is disabled for this inode */
709 compr_type = UBIFS_COMPR_NONE; 709 compr_type = UBIFS_COMPR_NONE;
710 else 710 else
@@ -1220,7 +1220,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1220 data_key_init(c, &key, inum, blk); 1220 data_key_init(c, &key, inum, blk);
1221 1221
1222 bit = old_size & (UBIFS_BLOCK_SIZE - 1); 1222 bit = old_size & (UBIFS_BLOCK_SIZE - 1);
1223 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1); 1223 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
1224 data_key_init(c, &to_key, inum, blk); 1224 data_key_init(c, &to_key, inum, blk);
1225 1225
1226 err = ubifs_tnc_remove_range(c, &key, &to_key); 1226 err = ubifs_tnc_remove_range(c, &key, &to_key);
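The one-character journal.c fix above is easy to miss: '&&' is a logical AND that is true whenever both operands are non-zero, so the old test reduced to 'ui->flags != 0' and never looked at the compression bit itself, while '&' tests the actual bit. Illustrated with an assumed flag layout (UBIFS_COMPR_FL as bit 0):

    #define UBIFS_COMPR_FL 0x01     /* assumed bit value */

    static int compression_disabled(unsigned int flags)
    {
            return !(flags & UBIFS_COMPR_FL);   /* was: flags && ... */
    }

    /* For flags = 0x04 (some unrelated flag set): the old logical test
     * gave !(0x04 && 0x01) == 0, so compression looked enabled; the
     * bitwise test gives !(0x04 & 0x01) == 1, correctly disabled. */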
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 3f1f16bc25c9..efb3430a2581 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -38,6 +38,22 @@
38#define __UBIFS_KEY_H__ 38#define __UBIFS_KEY_H__
39 39
40/** 40/**
41 * key_mask_hash - mask a valid hash value.
42 * @val: value to be masked
43 *
44 * We use hash values as offsets in directories, so values %0 and %1 are
45 * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This
46 * function makes sure the reserved values are not used.
47 */
48static inline uint32_t key_mask_hash(uint32_t hash)
49{
50 hash &= UBIFS_S_KEY_HASH_MASK;
51 if (unlikely(hash <= 2))
52 hash += 3;
53 return hash;
54}
55
56/**
41 * key_r5_hash - R5 hash function (borrowed from reiserfs). 57 * key_r5_hash - R5 hash function (borrowed from reiserfs).
42 * @s: direntry name 58 * @s: direntry name
43 * @len: name length 59 * @len: name length
@@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
54 str++; 70 str++;
55 } 71 }
56 72
57 a &= UBIFS_S_KEY_HASH_MASK; 73 return key_mask_hash(a);
58
59 /*
60 * We use hash values as offset in directories, so values %0 and %1 are
61 * reserved for "." and "..". %2 is reserved for "end of readdir"
62 * marker.
63 */
64 if (unlikely(a >= 0 && a <= 2))
65 a += 3;
66 return a;
67} 74}
68 75
69/** 76/**
@@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len)
77 84
78 len = min_t(uint32_t, len, 4); 85 len = min_t(uint32_t, len, 4);
79 memcpy(&a, str, len); 86 memcpy(&a, str, len);
80 a &= UBIFS_S_KEY_HASH_MASK; 87 return key_mask_hash(a);
81 if (unlikely(a >= 0 && a <= 2))
82 a += 3;
83 return a;
84} 88}
85 89
86/** 90/**
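key_mask_hash() centralizes what key_r5_hash() and key_test_hash() used to open-code: directory-entry hashes double as readdir offsets, and offsets 0, 1 and 2 are reserved for '.', '..' and the end-of-readdir marker, so any name hashing below 3 is bumped past them. The rewrite also drops the 'a >= 0' half of the old test, which was always true for an unsigned value. A standalone sketch; the 29-bit mask value is an assumption:

    #include <stdint.h>

    #define UBIFS_S_KEY_HASH_MASK 0x1fffffff        /* assumed 29-bit mask */

    static uint32_t key_mask_hash(uint32_t hash)
    {
            hash &= UBIFS_S_KEY_HASH_MASK;
            if (hash <= 2)
                    hash += 3;      /* skip ".", ".." and EOF markers */
            return hash;
    }

    /* key_mask_hash(0) == 3, key_mask_hash(2) == 5; anything >= 3 passes
     * through unchanged after masking. */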
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index f27176e9b70d..dfd2bcece27a 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -520,13 +520,13 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
520 * @flags: new flags 520 * @flags: new flags
521 * @idx_gc_cnt: change to the count of idx_gc list 521 * @idx_gc_cnt: change to the count of idx_gc list
522 * 522 *
523 * This function changes LEB properties. This function does not change a LEB 523 * This function changes LEB properties (@free, @dirty or @flag). However, the
524 * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC. 524 * property which has the %LPROPS_NC value is not changed. Returns a pointer to
525 * the updated LEB properties on success and a negative error code on failure.
525 * 526 *
526 * This function returns a pointer to the updated LEB properties on success 527 * Note, the LEB properties may have had to be copied (due to COW) and
527 * and a negative error code on failure. N.B. the LEB properties may have had to 528 * consequently the pointer returned may not be the same as the pointer
528 * be copied (due to COW) and consequently the pointer returned may not be the 529 * passed.
529 * same as the pointer passed.
530 */ 530 */
531const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, 531const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
532 const struct ubifs_lprops *lp, 532 const struct ubifs_lprops *lp,
@@ -1088,7 +1088,7 @@ static int scan_check_cb(struct ubifs_info *c,
1088 } 1088 }
1089 } 1089 }
1090 1090
1091 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 1091 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
1092 if (IS_ERR(sleb)) { 1092 if (IS_ERR(sleb)) {
1093 /* 1093 /*
1094 * After an unclean unmount, empty and freeable LEBs 1094 * After an unclean unmount, empty and freeable LEBs
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index db8bd0e518b2..b2792e84d245 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -36,15 +36,16 @@
36 * can be written into a single eraseblock. In that case, garbage collection 36 * can be written into a single eraseblock. In that case, garbage collection
37 * consists of just writing the whole table, which therefore makes all other 37 * consists of just writing the whole table, which therefore makes all other
38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are 38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
39 * selected for garbage collection, which consists are marking the nodes in 39 * selected for garbage collection, which consists of marking the clean nodes in
40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in 40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in
41 * the case of the big model, a table of LEB numbers is saved so that the entire 41 * the case of the big model, a table of LEB numbers is saved so that the entire
42 * LPT does not have to be scanned looking for empty eraseblocks when UBIFS is first 42 * LPT does not have to be scanned looking for empty eraseblocks when UBIFS is first
43 * mounted. 43 * mounted.
44 */ 44 */
45 45
46#include <linux/crc16.h>
47#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h>
48#include <linux/math64.h>
48 49
49/** 50/**
50 * do_calc_lpt_geom - calculate sizes for the LPT area. 51 * do_calc_lpt_geom - calculate sizes for the LPT area.
@@ -135,15 +136,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
135int ubifs_calc_lpt_geom(struct ubifs_info *c) 136int ubifs_calc_lpt_geom(struct ubifs_info *c)
136{ 137{
137 int lebs_needed; 138 int lebs_needed;
138 uint64_t sz; 139 long long sz;
139 140
140 do_calc_lpt_geom(c); 141 do_calc_lpt_geom(c);
141 142
142 /* Verify that lpt_lebs is big enough */ 143 /* Verify that lpt_lebs is big enough */
143 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ 144 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
144 sz += c->leb_size - 1; 145 lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
145 do_div(sz, c->leb_size);
146 lebs_needed = sz;
147 if (lebs_needed > c->lpt_lebs) { 146 if (lebs_needed > c->lpt_lebs) {
148 ubifs_err("too few LPT LEBs"); 147 ubifs_err("too few LPT LEBs");
149 return -EINVAL; 148 return -EINVAL;
@@ -156,7 +155,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c)
156 } 155 }
157 156
158 c->check_lpt_free = c->big_lpt; 157 c->check_lpt_free = c->big_lpt;
159
160 return 0; 158 return 0;
161} 159}
162 160
@@ -176,7 +174,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
176 int *big_lpt) 174 int *big_lpt)
177{ 175{
178 int i, lebs_needed; 176 int i, lebs_needed;
179 uint64_t sz; 177 long long sz;
180 178
181 /* Start by assuming the minimum number of LPT LEBs */ 179 /* Start by assuming the minimum number of LPT LEBs */
182 c->lpt_lebs = UBIFS_MIN_LPT_LEBS; 180 c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
@@ -203,9 +201,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
203 /* Now check there are enough LPT LEBs */ 201 /* Now check there are enough LPT LEBs */
204 for (i = 0; i < 64 ; i++) { 202 for (i = 0; i < 64 ; i++) {
205 sz = c->lpt_sz * 4; /* Allow 4 times the size */ 203 sz = c->lpt_sz * 4; /* Allow 4 times the size */
206 sz += c->leb_size - 1; 204 lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
207 do_div(sz, c->leb_size);
208 lebs_needed = sz;
209 if (lebs_needed > c->lpt_lebs) { 205 if (lebs_needed > c->lpt_lebs) {
210 /* Not enough LPT LEBs so try again with more */ 206 /* Not enough LPT LEBs so try again with more */
211 c->lpt_lebs = lebs_needed; 207 c->lpt_lebs = lebs_needed;
@@ -558,7 +554,7 @@ static int calc_nnode_num(int row, int col)
558 * This function calculates and returns the nnode number based on the parent's 554 * This function calculates and returns the nnode number based on the parent's
559 * nnode number and the index in parent. 555 * nnode number and the index in parent.
560 */ 556 */
561static int calc_nnode_num_from_parent(struct ubifs_info *c, 557static int calc_nnode_num_from_parent(const struct ubifs_info *c,
562 struct ubifs_nnode *parent, int iip) 558 struct ubifs_nnode *parent, int iip)
563{ 559{
564 int num, shft; 560 int num, shft;
@@ -583,7 +579,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c,
583 * This function calculates and returns the pnode number based on the parent's 579 * This function calculates and returns the pnode number based on the parent's
584 * nnode number and the index in parent. 580 * nnode number and the index in parent.
585 */ 581 */
586static int calc_pnode_num_from_parent(struct ubifs_info *c, 582static int calc_pnode_num_from_parent(const struct ubifs_info *c,
587 struct ubifs_nnode *parent, int iip) 583 struct ubifs_nnode *parent, int iip)
588{ 584{
589 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; 585 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
@@ -966,7 +962,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type)
966 * 962 *
967 * This function returns %0 on success and a negative error code on failure. 963 * This function returns %0 on success and a negative error code on failure.
968 */ 964 */
969static int unpack_pnode(struct ubifs_info *c, void *buf, 965static int unpack_pnode(const struct ubifs_info *c, void *buf,
970 struct ubifs_pnode *pnode) 966 struct ubifs_pnode *pnode)
971{ 967{
972 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 968 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
@@ -996,15 +992,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf,
996} 992}
997 993
998/** 994/**
999 * unpack_nnode - unpack a nnode. 995 * ubifs_unpack_nnode - unpack a nnode.
1000 * @c: UBIFS file-system description object 996 * @c: UBIFS file-system description object
1001 * @buf: buffer containing packed nnode to unpack 997 * @buf: buffer containing packed nnode to unpack
1002 * @nnode: nnode structure to fill 998 * @nnode: nnode structure to fill
1003 * 999 *
1004 * This function returns %0 on success and a negative error code on failure. 1000 * This function returns %0 on success and a negative error code on failure.
1005 */ 1001 */
1006static int unpack_nnode(struct ubifs_info *c, void *buf, 1002int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
1007 struct ubifs_nnode *nnode) 1003 struct ubifs_nnode *nnode)
1008{ 1004{
1009 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1005 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1010 int i, pos = 0, err; 1006 int i, pos = 0, err;
@@ -1036,7 +1032,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf,
1036 * 1032 *
1037 * This function returns %0 on success and a negative error code on failure. 1033 * This function returns %0 on success and a negative error code on failure.
1038 */ 1034 */
1039static int unpack_ltab(struct ubifs_info *c, void *buf) 1035static int unpack_ltab(const struct ubifs_info *c, void *buf)
1040{ 1036{
1041 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1037 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1042 int i, pos = 0, err; 1038 int i, pos = 0, err;
@@ -1068,7 +1064,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf)
1068 * 1064 *
1069 * This function returns %0 on success and a negative error code on failure. 1065 * This function returns %0 on success and a negative error code on failure.
1070 */ 1066 */
1071static int unpack_lsave(struct ubifs_info *c, void *buf) 1067static int unpack_lsave(const struct ubifs_info *c, void *buf)
1072{ 1068{
1073 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1069 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1074 int i, pos = 0, err; 1070 int i, pos = 0, err;
@@ -1096,7 +1092,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf)
1096 * 1092 *
1097 * This function returns %0 on success and a negative error code on failure. 1093 * This function returns %0 on success and a negative error code on failure.
1098 */ 1094 */
1099static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, 1095static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
1100 struct ubifs_nnode *parent, int iip) 1096 struct ubifs_nnode *parent, int iip)
1101{ 1097{
1102 int i, lvl, max_offs; 1098 int i, lvl, max_offs;
@@ -1140,7 +1136,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
1140 * 1136 *
1141 * This function returns %0 on success and a negative error code on failure. 1137 * This function returns %0 on success and a negative error code on failure.
1142 */ 1138 */
1143static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 1139static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
1144 struct ubifs_nnode *parent, int iip) 1140 struct ubifs_nnode *parent, int iip)
1145{ 1141{
1146 int i; 1142 int i;
@@ -1174,7 +1170,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
1174 * This function calculates the LEB numbers for the LEB properties it contains 1170 * This function calculates the LEB numbers for the LEB properties it contains
1175 * based on the pnode number. 1171 * based on the pnode number.
1176 */ 1172 */
1177static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) 1173static void set_pnode_lnum(const struct ubifs_info *c,
1174 struct ubifs_pnode *pnode)
1178{ 1175{
1179 int i, lnum; 1176 int i, lnum;
1180 1177
@@ -1227,7 +1224,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1227 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); 1224 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
1228 if (err) 1225 if (err)
1229 goto out; 1226 goto out;
1230 err = unpack_nnode(c, buf, nnode); 1227 err = ubifs_unpack_nnode(c, buf, nnode);
1231 if (err) 1228 if (err)
1232 goto out; 1229 goto out;
1233 } 1230 }
@@ -1816,7 +1813,7 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
1816 c->nnode_sz); 1813 c->nnode_sz);
1817 if (err) 1814 if (err)
1818 return ERR_PTR(err); 1815 return ERR_PTR(err);
1819 err = unpack_nnode(c, buf, nnode); 1816 err = ubifs_unpack_nnode(c, buf, nnode);
1820 if (err) 1817 if (err)
1821 return ERR_PTR(err); 1818 return ERR_PTR(err);
1822 } 1819 }
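For illustration, the unpack_* and validate_* helpers changed in the hunks above all share one access pattern: skip the UBIFS_LPT_CRC_BYTES prefix, then pull bit-packed fields out of the buffer while advancing a bit position. A minimal userspace sketch of that pattern follows; the 2-byte CRC size and the field widths are assumptions for illustration, not the on-flash format.

#include <stdint.h>
#include <stdio.h>

#define LPT_CRC_BYTES 2 /* stands in for UBIFS_LPT_CRC_BYTES (assumed: 2) */

/* Read 'nrbits' bits at bit offset *pos from addr, LSB first, and advance
 * *pos -- the same contract the kernel's LPT unpack helpers rely on. */
static uint32_t unpack_bits(const uint8_t *addr, int *pos, int nrbits)
{
        uint32_t val = 0;
        int i;

        for (i = 0; i < nrbits; i++, (*pos)++)
                val |= (uint32_t)((addr[*pos >> 3] >> (*pos & 7)) & 1) << i;
        return val;
}

int main(void)
{
        uint8_t buf[8] = { 0, 0, 0x2b, 0 }; /* 2 CRC bytes, then payload */
        const uint8_t *addr = buf + LPT_CRC_BYTES;
        int pos = 0;
        uint32_t type, num;

        /* e.g. a 4-bit type field followed by a 6-bit number field */
        type = unpack_bits(addr, &pos, 4);
        num = unpack_bits(addr, &pos, 6);
        printf("type=%u num=%u\n", type, num);
        return 0;
}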
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index a41434b42785..96ca95707175 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,6 +320,8 @@ no_space:
320 dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " 320 dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
321 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 321 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
322 dbg_dump_lpt_info(c); 322 dbg_dump_lpt_info(c);
323 dbg_dump_lpt_lebs(c);
324 dump_stack();
323 return err; 325 return err;
324} 326}
325 327
@@ -546,8 +548,10 @@ static int write_cnodes(struct ubifs_info *c)
546no_space: 548no_space:
547 ubifs_err("LPT out of space mismatch"); 549 ubifs_err("LPT out of space mismatch");
548 dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " 550 dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
549 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 551 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
550 dbg_dump_lpt_info(c); 552 dbg_dump_lpt_info(c);
553 dbg_dump_lpt_lebs(c);
554 dump_stack();
551 return err; 555 return err;
552} 556}
553 557
@@ -749,7 +753,7 @@ static void lpt_tgc_start(struct ubifs_info *c)
749 * LPT trivial garbage collection is where a LPT LEB contains only dirty and 753 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
750 * free space and so may be reused as soon as the next commit is completed. 754 * free space and so may be reused as soon as the next commit is completed.
751 * This function is called after the commit is completed (master node has been 755 * This function is called after the commit is completed (master node has been
752 * written) and unmaps LPT LEBs that were marked for trivial GC. 756 * written) and un-maps LPT LEBs that were marked for trivial GC.
753 */ 757 */
754static int lpt_tgc_end(struct ubifs_info *c) 758static int lpt_tgc_end(struct ubifs_info *c)
755{ 759{
@@ -1025,7 +1029,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
1025 * @c: UBIFS file-system description object 1029 * @c: UBIFS file-system description object
1026 * @node_type: LPT node type 1030 * @node_type: LPT node type
1027 */ 1031 */
1028static int get_lpt_node_len(struct ubifs_info *c, int node_type) 1032static int get_lpt_node_len(const struct ubifs_info *c, int node_type)
1029{ 1033{
1030 switch (node_type) { 1034 switch (node_type) {
1031 case UBIFS_LPT_NNODE: 1035 case UBIFS_LPT_NNODE:
@@ -1046,7 +1050,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type)
1046 * @buf: buffer 1050 * @buf: buffer
1047 * @len: length of buffer 1051 * @len: length of buffer
1048 */ 1052 */
1049static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) 1053static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len)
1050{ 1054{
1051 int offs, pad_len; 1055 int offs, pad_len;
1052 1056
@@ -1063,7 +1067,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
1063 * @buf: buffer 1067 * @buf: buffer
1064 * @node_num: node number is returned here 1068 * @node_num: node number is returned here
1065 */ 1069 */
1066static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) 1070static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf,
1071 int *node_num)
1067{ 1072{
1068 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1073 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1069 int pos = 0, node_type; 1074 int pos = 0, node_type;
@@ -1081,7 +1086,7 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
1081 * 1086 *
1082 * This function returns %1 if the buffer contains a node or %0 if it does not. 1087 * This function returns %1 if the buffer contains a node or %0 if it does not.
1083 */ 1088 */
1084static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) 1089static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len)
1085{ 1090{
1086 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1091 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1087 int pos = 0, node_type, node_len; 1092 int pos = 0, node_type, node_len;
@@ -1105,7 +1110,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
1105 return 1; 1110 return 1;
1106} 1111}
1107 1112
1108
1109/** 1113/**
1110 * lpt_gc_lnum - garbage collect a LPT LEB. 1114 * lpt_gc_lnum - garbage collect a LPT LEB.
1111 * @c: UBIFS file-system description object 1115 * @c: UBIFS file-system description object
@@ -1463,7 +1467,7 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
1463#ifdef CONFIG_UBIFS_FS_DEBUG 1467#ifdef CONFIG_UBIFS_FS_DEBUG
1464 1468
1465/** 1469/**
1466 * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. 1470 * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
1467 * @buf: buffer 1471 * @buf: buffer
1468 * @len: buffer length 1472 * @len: buffer length
1469 */ 1473 */
@@ -1488,7 +1492,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
1488 struct ubifs_nnode *nnode; 1492 struct ubifs_nnode *nnode;
1489 int hght; 1493 int hght;
1490 1494
1491 /* Entire tree is in memory so first_nnode / next_nnode are ok */ 1495 /* Entire tree is in memory so first_nnode / next_nnode are OK */
1492 nnode = first_nnode(c, &hght); 1496 nnode = first_nnode(c, &hght);
1493 for (; nnode; nnode = next_nnode(c, nnode, &hght)) { 1497 for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
1494 struct ubifs_nbranch *branch; 1498 struct ubifs_nbranch *branch;
@@ -1602,7 +1606,10 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1602{ 1606{
1603 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; 1607 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1604 int ret; 1608 int ret;
1605 void *buf = c->dbg_buf; 1609 void *buf = c->dbg->buf;
1610
1611 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1612 return 0;
1606 1613
1607 dbg_lp("LEB %d", lnum); 1614 dbg_lp("LEB %d", lnum);
1608 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1615 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
@@ -1704,6 +1711,9 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1704 long long free = 0; 1711 long long free = 0;
1705 int i; 1712 int i;
1706 1713
1714 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1715 return 0;
1716
1707 for (i = 0; i < c->lpt_lebs; i++) { 1717 for (i = 0; i < c->lpt_lebs; i++) {
1708 if (c->ltab[i].tgc || c->ltab[i].cmt) 1718 if (c->ltab[i].tgc || c->ltab[i].cmt)
1709 continue; 1719 continue;
@@ -1716,6 +1726,8 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1716 dbg_err("LPT space error: free %lld lpt_sz %lld", 1726 dbg_err("LPT space error: free %lld lpt_sz %lld",
1717 free, c->lpt_sz); 1727 free, c->lpt_sz);
1718 dbg_dump_lpt_info(c); 1728 dbg_dump_lpt_info(c);
1729 dbg_dump_lpt_lebs(c);
1730 dump_stack();
1719 return -EINVAL; 1731 return -EINVAL;
1720 } 1732 }
1721 return 0; 1733 return 0;
@@ -1731,15 +1743,19 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1731 */ 1743 */
1732int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1744int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1733{ 1745{
1746 struct ubifs_debug_info *d = c->dbg;
1734 long long chk_lpt_sz, lpt_sz; 1747 long long chk_lpt_sz, lpt_sz;
1735 int err = 0; 1748 int err = 0;
1736 1749
1750 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1751 return 0;
1752
1737 switch (action) { 1753 switch (action) {
1738 case 0: 1754 case 0:
1739 c->chk_lpt_sz = 0; 1755 d->chk_lpt_sz = 0;
1740 c->chk_lpt_sz2 = 0; 1756 d->chk_lpt_sz2 = 0;
1741 c->chk_lpt_lebs = 0; 1757 d->chk_lpt_lebs = 0;
1742 c->chk_lpt_wastage = 0; 1758 d->chk_lpt_wastage = 0;
1743 if (c->dirty_pn_cnt > c->pnode_cnt) { 1759 if (c->dirty_pn_cnt > c->pnode_cnt) {
1744 dbg_err("dirty pnodes %d exceed max %d", 1760 dbg_err("dirty pnodes %d exceed max %d",
1745 c->dirty_pn_cnt, c->pnode_cnt); 1761 c->dirty_pn_cnt, c->pnode_cnt);
@@ -1752,35 +1768,35 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1752 } 1768 }
1753 return err; 1769 return err;
1754 case 1: 1770 case 1:
1755 c->chk_lpt_sz += len; 1771 d->chk_lpt_sz += len;
1756 return 0; 1772 return 0;
1757 case 2: 1773 case 2:
1758 c->chk_lpt_sz += len; 1774 d->chk_lpt_sz += len;
1759 c->chk_lpt_wastage += len; 1775 d->chk_lpt_wastage += len;
1760 c->chk_lpt_lebs += 1; 1776 d->chk_lpt_lebs += 1;
1761 return 0; 1777 return 0;
1762 case 3: 1778 case 3:
1763 chk_lpt_sz = c->leb_size; 1779 chk_lpt_sz = c->leb_size;
1764 chk_lpt_sz *= c->chk_lpt_lebs; 1780 chk_lpt_sz *= d->chk_lpt_lebs;
1765 chk_lpt_sz += len - c->nhead_offs; 1781 chk_lpt_sz += len - c->nhead_offs;
1766 if (c->chk_lpt_sz != chk_lpt_sz) { 1782 if (d->chk_lpt_sz != chk_lpt_sz) {
1767 dbg_err("LPT wrote %lld but space used was %lld", 1783 dbg_err("LPT wrote %lld but space used was %lld",
1768 c->chk_lpt_sz, chk_lpt_sz); 1784 d->chk_lpt_sz, chk_lpt_sz);
1769 err = -EINVAL; 1785 err = -EINVAL;
1770 } 1786 }
1771 if (c->chk_lpt_sz > c->lpt_sz) { 1787 if (d->chk_lpt_sz > c->lpt_sz) {
1772 dbg_err("LPT wrote %lld but lpt_sz is %lld", 1788 dbg_err("LPT wrote %lld but lpt_sz is %lld",
1773 c->chk_lpt_sz, c->lpt_sz); 1789 d->chk_lpt_sz, c->lpt_sz);
1774 err = -EINVAL; 1790 err = -EINVAL;
1775 } 1791 }
1776 if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) { 1792 if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
1777 dbg_err("LPT layout size %lld but wrote %lld", 1793 dbg_err("LPT layout size %lld but wrote %lld",
1778 c->chk_lpt_sz, c->chk_lpt_sz2); 1794 d->chk_lpt_sz, d->chk_lpt_sz2);
1779 err = -EINVAL; 1795 err = -EINVAL;
1780 } 1796 }
1781 if (c->chk_lpt_sz2 && c->new_nhead_offs != len) { 1797 if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
1782 dbg_err("LPT new nhead offs: expected %d was %d", 1798 dbg_err("LPT new nhead offs: expected %d was %d",
1783 c->new_nhead_offs, len); 1799 d->new_nhead_offs, len);
1784 err = -EINVAL; 1800 err = -EINVAL;
1785 } 1801 }
1786 lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; 1802 lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1788,26 +1804,146 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1788 lpt_sz += c->ltab_sz; 1804 lpt_sz += c->ltab_sz;
1789 if (c->big_lpt) 1805 if (c->big_lpt)
1790 lpt_sz += c->lsave_sz; 1806 lpt_sz += c->lsave_sz;
1791 if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) { 1807 if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
1792 dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", 1808 dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
1793 c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz); 1809 d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
1794 err = -EINVAL; 1810 err = -EINVAL;
1795 } 1811 }
1796 if (err) 1812 if (err) {
1797 dbg_dump_lpt_info(c); 1813 dbg_dump_lpt_info(c);
1798 c->chk_lpt_sz2 = c->chk_lpt_sz; 1814 dbg_dump_lpt_lebs(c);
1799 c->chk_lpt_sz = 0; 1815 dump_stack();
1800 c->chk_lpt_wastage = 0; 1816 }
1801 c->chk_lpt_lebs = 0; 1817 d->chk_lpt_sz2 = d->chk_lpt_sz;
1802 c->new_nhead_offs = len; 1818 d->chk_lpt_sz = 0;
1819 d->chk_lpt_wastage = 0;
1820 d->chk_lpt_lebs = 0;
1821 d->new_nhead_offs = len;
1803 return err; 1822 return err;
1804 case 4: 1823 case 4:
1805 c->chk_lpt_sz += len; 1824 d->chk_lpt_sz += len;
1806 c->chk_lpt_wastage += len; 1825 d->chk_lpt_wastage += len;
1807 return 0; 1826 return 0;
1808 default: 1827 default:
1809 return -EINVAL; 1828 return -EINVAL;
1810 } 1829 }
1811} 1830}
1812 1831
1832/**
1833 * dbg_dump_lpt_leb - dump an LPT LEB.
1834 * @c: UBIFS file-system description object
1835 * @lnum: LEB number to dump
1836 *
 1837 * This function dumps an LEB from the LPT area. Nodes in this area are very
 1838 * different from nodes in the main area (e.g., they do not have common headers,
1839 * they do not have 8-byte alignments, etc), so we have a separate function to
1840 * dump LPT area LEBs. Note, LPT has to be locked by the caller.
1841 */
1842static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1843{
1844 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1845 void *buf = c->dbg->buf;
1846
1847 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
1848 current->pid, lnum);
1849 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1850 if (err) {
1851 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1852 return;
1853 }
1854 while (1) {
1855 offs = c->leb_size - len;
1856 if (!is_a_node(c, buf, len)) {
1857 int pad_len;
1858
1859 pad_len = get_pad_len(c, buf, len);
1860 if (pad_len) {
1861 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
1862 lnum, offs, pad_len);
1863 buf += pad_len;
1864 len -= pad_len;
1865 continue;
1866 }
1867 if (len)
1868 printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n",
1869 lnum, offs, len);
1870 break;
1871 }
1872
1873 node_type = get_lpt_node_type(c, buf, &node_num);
1874 switch (node_type) {
1875 case UBIFS_LPT_PNODE:
1876 {
1877 node_len = c->pnode_sz;
1878 if (c->big_lpt)
1879 printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n",
1880 lnum, offs, node_num);
1881 else
1882 printk(KERN_DEBUG "LEB %d:%d, pnode\n",
1883 lnum, offs);
1884 break;
1885 }
1886 case UBIFS_LPT_NNODE:
1887 {
1888 int i;
1889 struct ubifs_nnode nnode;
1890
1891 node_len = c->nnode_sz;
1892 if (c->big_lpt)
1893 printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ",
1894 lnum, offs, node_num);
1895 else
1896 printk(KERN_DEBUG "LEB %d:%d, nnode, ",
1897 lnum, offs);
1898 err = ubifs_unpack_nnode(c, buf, &nnode);
1899 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1900 printk("%d:%d", nnode.nbranch[i].lnum,
1901 nnode.nbranch[i].offs);
1902 if (i != UBIFS_LPT_FANOUT - 1)
1903 printk(", ");
1904 }
1905 printk("\n");
1906 break;
1907 }
1908 case UBIFS_LPT_LTAB:
1909 node_len = c->ltab_sz;
1910 printk(KERN_DEBUG "LEB %d:%d, ltab\n",
1911 lnum, offs);
1912 break;
1913 case UBIFS_LPT_LSAVE:
1914 node_len = c->lsave_sz;
 1915 printk(KERN_DEBUG "LEB %d:%d, lsave\n", lnum, offs);
1916 break;
1917 default:
1918 ubifs_err("LPT node type %d not recognized", node_type);
1919 return;
1920 }
1921
1922 buf += node_len;
1923 len -= node_len;
1924 }
1925
1926 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
1927 current->pid, lnum);
1928}
1929
1930/**
1931 * dbg_dump_lpt_lebs - dump LPT lebs.
1932 * @c: UBIFS file-system description object
1933 *
1934 * This function dumps all LPT LEBs. The caller has to make sure the LPT is
1935 * locked.
1936 */
1937void dbg_dump_lpt_lebs(const struct ubifs_info *c)
1938{
1939 int i;
1940
1941 printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n",
1942 current->pid);
1943 for (i = 0; i < c->lpt_lebs; i++)
1944 dump_lpt_leb(c, i + c->lpt_first);
1945 printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n",
1946 current->pid);
1947}
1948
1813#endif /* CONFIG_UBIFS_FS_DEBUG */ 1949#endif /* CONFIG_UBIFS_FS_DEBUG */
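The new dump_lpt_leb() above is a cursor walk over one LEB: anything that is not a node must be padding or trailing free space, and otherwise the node type decides how far the cursor advances. The same loop shape in a self-contained userspace sketch; the record layout here is invented for illustration, not the LPT format.

#include <stdio.h>

/* Toy layout, invented: each record is [type][len][payload]; type 0x00 is
 * padding (len covers the whole pad record) and 0xFF marks unwritten free
 * space running to the end of the buffer. */
static void walk(const unsigned char *buf, int size)
{
        int offs = 0;

        while (offs < size) {
                int type = buf[offs];
                int len;

                if (type == 0xFF) {                     /* free space */
                        printf("offs %d: free %d bytes\n", offs, size - offs);
                        break;
                }
                len = buf[offs + 1];
                if (type == 0x00)                       /* padding */
                        printf("offs %d: pad %d bytes\n", offs, len);
                else                                    /* a real node */
                        printf("offs %d: node type %d, len %d\n",
                               offs, type, len);
                offs += len;                            /* advance cursor */
        }
}

int main(void)
{
        unsigned char leb[16] = {
                0x01, 0x04, 0xaa, 0xbb,                 /* node, 4 bytes */
                0x00, 0x04, 0x00, 0x00,                 /* 4 pad bytes   */
                0x02, 0x03, 0xcc,                       /* node, 3 bytes */
                0xFF, 0xFF, 0xFF, 0xFF, 0xFF,           /* free space    */
        };

        walk(leb, (int)sizeof(leb));
        return 0;
}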
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9bd5a43d4526..9e6f403f170e 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -899,7 +899,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
900 struct ubifs_scan_leb *sleb; 900 struct ubifs_scan_leb *sleb;
901 901
902 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 902 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
903 if (IS_ERR(sleb)) { 903 if (IS_ERR(sleb)) {
904 err = PTR_ERR(sleb); 904 err = PTR_ERR(sleb);
905 break; 905 break;
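The c->dbg_buf to c->dbg->buf switch here, as in the other files of this patch, comes from gathering all debugging state into a separately allocated struct ubifs_debug_info, so the main description object carries only a pointer. A schematic of that refactor, with an illustrative field set (not the kernel's actual struct):

#include <stdlib.h>

struct debug_info {             /* stands in for struct ubifs_debug_info */
        void *buf;              /* LEB-sized scratch buffer for checks   */
        long long chk_lpt_sz;   /* ... plus the other chk_* state        */
};

struct fs_info {                /* stands in for struct ubifs_info */
        struct debug_info *dbg; /* allocated only when debugging is on */
        int leb_size;
};

static int debugging_init(struct fs_info *c)
{
        c->dbg = calloc(1, sizeof(*c->dbg));
        if (!c->dbg)
                return -1;
        c->dbg->buf = malloc(c->leb_size);
        if (!c->dbg->buf) {
                free(c->dbg);
                c->dbg = NULL;
                return -1;
        }
        return 0;
}

static void debugging_exit(struct fs_info *c)
{
        free(c->dbg->buf);
        free(c->dbg);
        c->dbg = NULL;
}

int main(void)
{
        struct fs_info c = { .dbg = NULL, .leb_size = 128 * 1024 };

        if (debugging_init(&c) == 0)
                debugging_exit(&c);
        return 0;
}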
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 21f7d047c306..ce42a7b0ca5a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -144,7 +144,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
 146 * zero. The order is not perfect because the journal heads 146 * zero. The order is not perfect because the journal heads
 147 * race with eachother. This is not a problem but is does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
150 */ 150 */
@@ -656,7 +656,7 @@ out_dump:
656 * @dirty: amount of dirty space from padding and deletion nodes 656 * @dirty: amount of dirty space from padding and deletion nodes
657 * 657 *
658 * This function inserts a reference node to the replay tree and returns zero 658 * This function inserts a reference node to the replay tree and returns zero
659 * in case of success ort a negative error code in case of failure. 659 * in case of success or a negative error code in case of failure.
660 */ 660 */
661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, 661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
662 unsigned long long sqnum, int free, int dirty) 662 unsigned long long sqnum, int free, int dirty)
@@ -883,7 +883,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 883 * This means that we reached the end of the log and now 883 * This means that we reached the end of the log and now
884 * look to the older log data, which was already 884 * look to the older log data, which was already
885 * committed but the eraseblock was not erased (UBIFS 885 * committed but the eraseblock was not erased (UBIFS
886 * only unmaps it). So this basically means we have to 886 * only un-maps it). So this basically means we have to
887 * exit with "end of log" code. 887 * exit with "end of log" code.
888 */ 888 */
889 err = 1; 889 err = 1;
@@ -1062,6 +1062,15 @@ int ubifs_replay_journal(struct ubifs_info *c)
1062 if (err) 1062 if (err)
1063 goto out; 1063 goto out;
1064 1064
1065 /*
1066 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
1067 * to roughly estimate index growth. Things like @c->min_idx_lebs
1068 * depend on it. This means we have to initialize it to make sure
1069 * budgeting works properly.
1070 */
1071 c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
1072 c->budg_uncommitted_idx *= c->max_idx_node_sz;
1073
1065 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); 1074 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1066 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " 1075 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1067 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, 1076 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 0f392351dc5a..e070c643d1bb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,6 +28,7 @@
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/random.h> 30#include <linux/random.h>
31#include <linux/math64.h>
31 32
32/* 33/*
33 * Default journal size in logical eraseblocks as a percent of total 34 * Default journal size in logical eraseblocks as a percent of total
@@ -80,7 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
80 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; 81 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
81 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; 82 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
82 int min_leb_cnt = UBIFS_MIN_LEB_CNT; 83 int min_leb_cnt = UBIFS_MIN_LEB_CNT;
83 uint64_t tmp64, main_bytes; 84 long long tmp64, main_bytes;
84 __le64 tmp_le64; 85 __le64 tmp_le64;
85 86
 86 /* Some functions called from here depend on the @c->key_len field */ 87 /* Some functions called from here depend on the @c->key_len field */
@@ -160,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
160 if (!sup) 161 if (!sup)
161 return -ENOMEM; 162 return -ENOMEM;
162 163
163 tmp64 = (uint64_t)max_buds * c->leb_size; 164 tmp64 = (long long)max_buds * c->leb_size;
164 if (big_lpt) 165 if (big_lpt)
165 sup_flags |= UBIFS_FLG_BIGLPT; 166 sup_flags |= UBIFS_FLG_BIGLPT;
166 167
@@ -179,14 +180,16 @@ static int create_default_filesystem(struct ubifs_info *c)
179 sup->fanout = cpu_to_le32(DEFAULT_FANOUT); 180 sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
180 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); 181 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
181 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); 182 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
182 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
183 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); 183 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
184 if (c->mount_opts.override_compr)
185 sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
186 else
187 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
184 188
185 generate_random_uuid(sup->uuid); 189 generate_random_uuid(sup->uuid);
186 190
187 main_bytes = (uint64_t)main_lebs * c->leb_size; 191 main_bytes = (long long)main_lebs * c->leb_size;
188 tmp64 = main_bytes * DEFAULT_RP_PERCENT; 192 tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
189 do_div(tmp64, 100);
190 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
191 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
192 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
@@ -582,16 +585,15 @@ int ubifs_read_superblock(struct ubifs_info *c)
582 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; 585 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
583 c->fanout = le32_to_cpu(sup->fanout); 586 c->fanout = le32_to_cpu(sup->fanout);
584 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); 587 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
585 c->default_compr = le16_to_cpu(sup->default_compr);
586 c->rp_size = le64_to_cpu(sup->rp_size); 588 c->rp_size = le64_to_cpu(sup->rp_size);
587 c->rp_uid = le32_to_cpu(sup->rp_uid); 589 c->rp_uid = le32_to_cpu(sup->rp_uid);
588 c->rp_gid = le32_to_cpu(sup->rp_gid); 590 c->rp_gid = le32_to_cpu(sup->rp_gid);
589 sup_flags = le32_to_cpu(sup->flags); 591 sup_flags = le32_to_cpu(sup->flags);
592 if (!c->mount_opts.override_compr)
593 c->default_compr = le16_to_cpu(sup->default_compr);
590 594
591 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); 595 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
592
593 memcpy(&c->uuid, &sup->uuid, 16); 596 memcpy(&c->uuid, &sup->uuid, 16);
594
595 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); 597 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
596 598
597 /* Automatically increase file system size to the maximum size */ 599 /* Automatically increase file system size to the maximum size */
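The sb.c changes above swap do_div() for div_u64() without changing the formula: the default reserved-pool size is DEFAULT_RP_PERCENT of the main area, capped at DEFAULT_MAX_RP_SIZE. Restated in userspace C; the constant values and geometry below are assumptions for illustration:

#include <stdio.h>

#define DEFAULT_RP_PERCENT 5                      /* assumed value       */
#define DEFAULT_MAX_RP_SIZE (5LL * 1024 * 1024)   /* assumed: 5 MiB cap  */

int main(void)
{
        long long main_lebs = 2000, leb_size = 129024; /* example geometry */
        long long main_bytes = main_lebs * leb_size;
        long long rp_size = main_bytes * DEFAULT_RP_PERCENT / 100;

        if (rp_size > DEFAULT_MAX_RP_SIZE)
                rp_size = DEFAULT_MAX_RP_SIZE;
        printf("rp_size = %lld bytes\n", rp_size); /* capped: 5242880 */
        return 0;
}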
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f248533841a2..e7bab52a1410 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
151 * @contention: if any contention, this is set to %1 151 * @contention: if any contention, this is set to %1
152 * 152 *
153 * This function walks the list of mounted UBIFS file-systems and frees clean 153 * This function walks the list of mounted UBIFS file-systems and frees clean
154 * znodes which are older then @age, until at least @nr znodes are freed. 154 * znodes which are older than @age, until at least @nr znodes are freed.
155 * Returns the number of freed znodes. 155 * Returns the number of freed znodes.
156 */ 156 */
157static int shrink_tnc_trees(int nr, int age, int *contention) 157static int shrink_tnc_trees(int nr, int age, int *contention)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d80b2aef42b6..89556ee72518 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -34,6 +34,8 @@
34#include <linux/parser.h> 34#include <linux/parser.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h>
38#include <linux/writeback.h>
37#include "ubifs.h" 39#include "ubifs.h"
38 40
39/* 41/*
@@ -417,39 +419,61 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
417 else if (c->mount_opts.chk_data_crc == 1) 419 else if (c->mount_opts.chk_data_crc == 1)
418 seq_printf(s, ",no_chk_data_crc"); 420 seq_printf(s, ",no_chk_data_crc");
419 421
422 if (c->mount_opts.override_compr) {
423 seq_printf(s, ",compr=");
424 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
425 }
426
420 return 0; 427 return 0;
421} 428}
422 429
423static int ubifs_sync_fs(struct super_block *sb, int wait) 430static int ubifs_sync_fs(struct super_block *sb, int wait)
424{ 431{
432 int i, err;
425 struct ubifs_info *c = sb->s_fs_info; 433 struct ubifs_info *c = sb->s_fs_info;
426 int i, ret = 0, err; 434 struct writeback_control wbc = {
427 long long bud_bytes; 435 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
436 .range_start = 0,
437 .range_end = LLONG_MAX,
438 .nr_to_write = LONG_MAX,
439 };
428 440
429 if (c->jheads) { 441 /*
430 for (i = 0; i < c->jhead_cnt; i++) { 442 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
431 err = ubifs_wbuf_sync(&c->jheads[i].wbuf); 443 * advisory thing to help the file system shove lots of data into the
432 if (err && !ret) 444 * queues. If some gets missed then it'll be picked up on the second
433 ret = err; 445 * '->sync_fs()' call, with non-zero @wait.
434 } 446 */
435 447
436 /* Commit the journal unless it has too little data */ 448 if (sb->s_flags & MS_RDONLY)
437 spin_lock(&c->buds_lock); 449 return 0;
438 bud_bytes = c->bud_bytes; 450
439 spin_unlock(&c->buds_lock); 451 /*
440 if (bud_bytes > c->leb_size) { 452 * Synchronize write buffers, because 'ubifs_run_commit()' does not
441 err = ubifs_run_commit(c); 453 * do this if it waits for an already running commit.
442 if (err) 454 */
443 return err; 455 for (i = 0; i < c->jhead_cnt; i++) {
444 } 456 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
457 if (err)
458 return err;
445 } 459 }
446 460
447 /* 461 /*
448 * We ought to call sync for c->ubi but it does not have one. If it had 462 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
449 * it would in turn call mtd->sync, however mtd operations are 463 * pages, so synchronize them first, then commit the journal. Strictly
450 * synchronous anyway, so we don't lose any sleep here. 464 * speaking, it is not necessary to commit the journal here,
465 * synchronizing write-buffers would be enough. But committing makes
466 * UBIFS free space predictions much more accurate, so we want to let
467 * the user be able to get more accurate results of 'statfs()' after
468 * they synchronize the file system.
451 */ 469 */
452 return ret; 470 generic_sync_sb_inodes(sb, &wbc);
471
472 err = ubifs_run_commit(c);
473 if (err)
474 return err;
475
476 return ubi_sync(c->vi.ubi_num);
453} 477}
454 478
455/** 479/**
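The rewritten ubifs_sync_fs() above establishes a fixed ordering: journal write-buffers, then dirty pages via generic_sync_sb_inodes(), then a journal commit, then a device flush via ubi_sync(). The control flow, reduced to stubs so the order stands out (a sketch, not the kernel code):

#include <stdio.h>

static int sync_wbufs(void)   { puts("1. sync journal write-buffers");    return 0; }
static int sync_inodes(void)  { puts("2. write back dirty pages/inodes"); return 0; }
static int run_commit(void)   { puts("3. commit the journal");            return 0; }
static int flush_device(void) { puts("4. flush the underlying device");   return 0; }

/* Mirrors the order in ubifs_sync_fs(); a read-only FS returns early. */
static int sync_fs(int read_only)
{
        int err;

        if (read_only)
                return 0;
        if ((err = sync_wbufs()))
                return err;
        sync_inodes();
        if ((err = run_commit()))
                return err;
        return flush_device();
}

int main(void)
{
        return sync_fs(0);
}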
@@ -596,7 +620,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
596} 620}
597 621
598/* 622/*
599 * init_constants_late - initialize UBIFS constants. 623 * init_constants_sb - initialize UBIFS constants.
600 * @c: UBIFS file-system description object 624 * @c: UBIFS file-system description object
601 * 625 *
602 * This is a helper function which initializes various UBIFS constants after 626 * This is a helper function which initializes various UBIFS constants after
@@ -604,10 +628,10 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
604 * makes sure they are all right. Returns zero in case of success and a 628 * makes sure they are all right. Returns zero in case of success and a
605 * negative error code in case of failure. 629 * negative error code in case of failure.
606 */ 630 */
607static int init_constants_late(struct ubifs_info *c) 631static int init_constants_sb(struct ubifs_info *c)
608{ 632{
609 int tmp, err; 633 int tmp, err;
610 uint64_t tmp64; 634 long long tmp64;
611 635
612 c->main_bytes = (long long)c->main_lebs * c->leb_size; 636 c->main_bytes = (long long)c->main_lebs * c->leb_size;
613 c->max_znode_sz = sizeof(struct ubifs_znode) + 637 c->max_znode_sz = sizeof(struct ubifs_znode) +
@@ -634,9 +658,8 @@ static int init_constants_late(struct ubifs_info *c)
634 * Make sure that the log is large enough to fit reference nodes for 658 * Make sure that the log is large enough to fit reference nodes for
635 * all buds plus one reserved LEB. 659 * all buds plus one reserved LEB.
636 */ 660 */
637 tmp64 = c->max_bud_bytes; 661 tmp64 = c->max_bud_bytes + c->leb_size - 1;
638 tmp = do_div(tmp64, c->leb_size); 662 c->max_bud_cnt = div_u64(tmp64, c->leb_size);
639 c->max_bud_cnt = tmp64 + !!tmp;
640 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); 663 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
641 tmp /= c->leb_size; 664 tmp /= c->leb_size;
642 tmp += 1; 665 tmp += 1;
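The two-line hunk above is the standard ceiling-division rewrite: the old code took the quotient and added one when do_div() returned a remainder, while the new code adds leb_size - 1 before dividing. Both count how many LEBs max_bud_bytes can span, as this small check shows (values are arbitrary examples):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t max_bud_bytes = 1000000, leb_size = 129024;

        /* old: quotient plus one if a remainder exists */
        uint64_t old_cnt = max_bud_bytes / leb_size +
                           !!(max_bud_bytes % leb_size);
        /* new: single division after rounding up */
        uint64_t new_cnt = (max_bud_bytes + leb_size - 1) / leb_size;

        assert(old_cnt == new_cnt);     /* 8 for these values */
        return 0;
}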
@@ -672,7 +695,7 @@ static int init_constants_late(struct ubifs_info *c)
672 * Consequently, if the journal is too small, UBIFS will treat it as 695 * Consequently, if the journal is too small, UBIFS will treat it as
673 * always full. 696 * always full.
674 */ 697 */
675 tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1; 698 tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
676 if (c->bg_bud_bytes < tmp64) 699 if (c->bg_bud_bytes < tmp64)
677 c->bg_bud_bytes = tmp64; 700 c->bg_bud_bytes = tmp64;
678 if (c->max_bud_bytes < tmp64 + c->leb_size) 701 if (c->max_bud_bytes < tmp64 + c->leb_size)
@@ -682,6 +705,21 @@ static int init_constants_late(struct ubifs_info *c)
682 if (err) 705 if (err)
683 return err; 706 return err;
684 707
708 return 0;
709}
710
711/*
712 * init_constants_master - initialize UBIFS constants.
713 * @c: UBIFS file-system description object
714 *
715 * This is a helper function which initializes various UBIFS constants after
716 * the master node has been read. It also checks various UBIFS parameters and
717 * makes sure they are all right.
718 */
719static void init_constants_master(struct ubifs_info *c)
720{
721 long long tmp64;
722
685 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 723 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
686 724
687 /* 725 /*
@@ -690,14 +728,13 @@ static int init_constants_late(struct ubifs_info *c)
690 * necessary to report something for the 'statfs()' call. 728 * necessary to report something for the 'statfs()' call.
691 * 729 *
692 * Subtract the LEB reserved for GC, the LEB which is reserved for 730 * Subtract the LEB reserved for GC, the LEB which is reserved for
693 * deletions, and assume only one journal head is available. 731 * deletions, minimum LEBs for the index, and assume only one journal
732 * head is available.
694 */ 733 */
695 tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; 734 tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
696 tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; 735 tmp64 *= (long long)c->leb_size - c->leb_overhead;
697 tmp64 = ubifs_reported_space(c, tmp64); 736 tmp64 = ubifs_reported_space(c, tmp64);
698 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; 737 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
699
700 return 0;
701} 738}
702 739
703/** 740/**
@@ -878,6 +915,7 @@ static int check_volume_empty(struct ubifs_info *c)
878 * Opt_no_bulk_read: disable bulk-reads 915 * Opt_no_bulk_read: disable bulk-reads
879 * Opt_chk_data_crc: check CRCs when reading data nodes 916 * Opt_chk_data_crc: check CRCs when reading data nodes
880 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes 917 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
918 * Opt_override_compr: override default compressor
881 * Opt_err: just end of array marker 919 * Opt_err: just end of array marker
882 */ 920 */
883enum { 921enum {
@@ -887,6 +925,7 @@ enum {
887 Opt_no_bulk_read, 925 Opt_no_bulk_read,
888 Opt_chk_data_crc, 926 Opt_chk_data_crc,
889 Opt_no_chk_data_crc, 927 Opt_no_chk_data_crc,
928 Opt_override_compr,
890 Opt_err, 929 Opt_err,
891}; 930};
892 931
@@ -897,6 +936,7 @@ static const match_table_t tokens = {
897 {Opt_no_bulk_read, "no_bulk_read"}, 936 {Opt_no_bulk_read, "no_bulk_read"},
898 {Opt_chk_data_crc, "chk_data_crc"}, 937 {Opt_chk_data_crc, "chk_data_crc"},
899 {Opt_no_chk_data_crc, "no_chk_data_crc"}, 938 {Opt_no_chk_data_crc, "no_chk_data_crc"},
939 {Opt_override_compr, "compr=%s"},
900 {Opt_err, NULL}, 940 {Opt_err, NULL},
901}; 941};
902 942
@@ -950,6 +990,28 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
950 c->mount_opts.chk_data_crc = 1; 990 c->mount_opts.chk_data_crc = 1;
951 c->no_chk_data_crc = 1; 991 c->no_chk_data_crc = 1;
952 break; 992 break;
993 case Opt_override_compr:
994 {
995 char *name = match_strdup(&args[0]);
996
997 if (!name)
998 return -ENOMEM;
999 if (!strcmp(name, "none"))
1000 c->mount_opts.compr_type = UBIFS_COMPR_NONE;
1001 else if (!strcmp(name, "lzo"))
1002 c->mount_opts.compr_type = UBIFS_COMPR_LZO;
1003 else if (!strcmp(name, "zlib"))
1004 c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
1005 else {
1006 ubifs_err("unknown compressor \"%s\"", name);
1007 kfree(name);
1008 return -EINVAL;
1009 }
1010 kfree(name);
1011 c->mount_opts.override_compr = 1;
1012 c->default_compr = c->mount_opts.compr_type;
1013 break;
1014 }
953 default: 1015 default:
954 ubifs_err("unrecognized mount option \"%s\" " 1016 ubifs_err("unrecognized mount option \"%s\" "
955 "or missing value", p); 1017 "or missing value", p);
@@ -1019,6 +1081,30 @@ again:
1019} 1081}
1020 1082
1021/** 1083/**
1084 * check_free_space - check if there is enough free space to mount.
1085 * @c: UBIFS file-system description object
1086 *
1087 * This function makes sure UBIFS has enough free space to be mounted in
1088 * read/write mode. UBIFS must always have some free space to allow deletions.
1089 */
1090static int check_free_space(struct ubifs_info *c)
1091{
1092 ubifs_assert(c->dark_wm > 0);
1093 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
1094 ubifs_err("insufficient free space to mount in read/write mode");
1095 dbg_dump_budg(c);
1096 dbg_dump_lprops(c);
1097 /*
1098 * We return %-EINVAL instead of %-ENOSPC because it seems to
1099 * be the closest error code mentioned in the mount function
1100 * documentation.
1101 */
1102 return -EINVAL;
1103 }
1104 return 0;
1105}
1106
1107/**
1022 * mount_ubifs - mount UBIFS file-system. 1108 * mount_ubifs - mount UBIFS file-system.
1023 * @c: UBIFS file-system description object 1109 * @c: UBIFS file-system description object
1024 * 1110 *
@@ -1039,11 +1125,9 @@ static int mount_ubifs(struct ubifs_info *c)
1039 if (err) 1125 if (err)
1040 return err; 1126 return err;
1041 1127
1042#ifdef CONFIG_UBIFS_FS_DEBUG 1128 err = ubifs_debugging_init(c);
1043 c->dbg_buf = vmalloc(c->leb_size); 1129 if (err)
1044 if (!c->dbg_buf) 1130 return err;
1045 return -ENOMEM;
1046#endif
1047 1131
1048 err = check_volume_empty(c); 1132 err = check_volume_empty(c);
1049 if (err) 1133 if (err)
@@ -1100,27 +1184,25 @@ static int mount_ubifs(struct ubifs_info *c)
1100 goto out_free; 1184 goto out_free;
1101 1185
1102 /* 1186 /*
1103 * Make sure the compressor which is set as the default on in the 1187 * Make sure the compressor which is set as default in the superblock
1104 * superblock was actually compiled in. 1188 * or overridden by mount options is actually compiled in.
1105 */ 1189 */
1106 if (!ubifs_compr_present(c->default_compr)) { 1190 if (!ubifs_compr_present(c->default_compr)) {
 1107 ubifs_warn("'%s' compressor is set by superblock, but not " 1191 ubifs_err("compressor \"%s\" is not compiled in",
1108 "compiled in", ubifs_compr_name(c->default_compr)); 1192 ubifs_compr_name(c->default_compr));
1109 c->default_compr = UBIFS_COMPR_NONE; 1193 goto out_free;
1110 } 1194 }
1111 1195
1112 dbg_failure_mode_registration(c); 1196 err = init_constants_sb(c);
1113
1114 err = init_constants_late(c);
1115 if (err) 1197 if (err)
1116 goto out_dereg; 1198 goto out_free;
1117 1199
1118 sz = ALIGN(c->max_idx_node_sz, c->min_io_size); 1200 sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
1119 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); 1201 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
1120 c->cbuf = kmalloc(sz, GFP_NOFS); 1202 c->cbuf = kmalloc(sz, GFP_NOFS);
1121 if (!c->cbuf) { 1203 if (!c->cbuf) {
1122 err = -ENOMEM; 1204 err = -ENOMEM;
1123 goto out_dereg; 1205 goto out_free;
1124 } 1206 }
1125 1207
1126 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1208 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
@@ -1145,6 +1227,8 @@ static int mount_ubifs(struct ubifs_info *c)
1145 if (err) 1227 if (err)
1146 goto out_master; 1228 goto out_master;
1147 1229
1230 init_constants_master(c);
1231
1148 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1232 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1149 ubifs_msg("recovery needed"); 1233 ubifs_msg("recovery needed");
1150 c->need_recovery = 1; 1234 c->need_recovery = 1;
@@ -1183,12 +1267,9 @@ static int mount_ubifs(struct ubifs_info *c)
1183 if (!mounted_read_only) { 1267 if (!mounted_read_only) {
1184 int lnum; 1268 int lnum;
1185 1269
1186 /* Check for enough free space */ 1270 err = check_free_space(c);
1187 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { 1271 if (err)
1188 ubifs_err("insufficient available space");
1189 err = -EINVAL;
1190 goto out_orphans; 1272 goto out_orphans;
1191 }
1192 1273
1193 /* Check for enough log space */ 1274 /* Check for enough log space */
1194 lnum = c->lhead_lnum + 1; 1275 lnum = c->lhead_lnum + 1;
@@ -1232,6 +1313,10 @@ static int mount_ubifs(struct ubifs_info *c)
1232 } 1313 }
1233 } 1314 }
1234 1315
1316 err = dbg_debugfs_init_fs(c);
1317 if (err)
1318 goto out_infos;
1319
1235 err = dbg_check_filesystem(c); 1320 err = dbg_check_filesystem(c);
1236 if (err) 1321 if (err)
1237 goto out_infos; 1322 goto out_infos;
@@ -1283,8 +1368,20 @@ static int mount_ubifs(struct ubifs_info *c)
1283 dbg_msg("tree fanout: %d", c->fanout); 1368 dbg_msg("tree fanout: %d", c->fanout);
1284 dbg_msg("reserved GC LEB: %d", c->gc_lnum); 1369 dbg_msg("reserved GC LEB: %d", c->gc_lnum);
1285 dbg_msg("first main LEB: %d", c->main_first); 1370 dbg_msg("first main LEB: %d", c->main_first);
1371 dbg_msg("max. znode size %d", c->max_znode_sz);
1372 dbg_msg("max. index node size %d", c->max_idx_node_sz);
1373 dbg_msg("node sizes: data %zu, inode %zu, dentry %zu",
1374 UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
1375 dbg_msg("node sizes: trun %zu, sb %zu, master %zu",
1376 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1377 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1378 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1379 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu",
1380 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1381 UBIFS_MAX_DENT_NODE_SZ);
1286 dbg_msg("dead watermark: %d", c->dead_wm); 1382 dbg_msg("dead watermark: %d", c->dead_wm);
1287 dbg_msg("dark watermark: %d", c->dark_wm); 1383 dbg_msg("dark watermark: %d", c->dark_wm);
1384 dbg_msg("LEB overhead: %d", c->leb_overhead);
1288 x = (long long)c->main_lebs * c->dark_wm; 1385 x = (long long)c->main_lebs * c->dark_wm;
1289 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", 1386 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
1290 x, x >> 10, x >> 20); 1387 x, x >> 10, x >> 20);
@@ -1320,14 +1417,12 @@ out_wbufs:
1320 free_wbufs(c); 1417 free_wbufs(c);
1321out_cbuf: 1418out_cbuf:
1322 kfree(c->cbuf); 1419 kfree(c->cbuf);
1323out_dereg:
1324 dbg_failure_mode_deregistration(c);
1325out_free: 1420out_free:
1326 kfree(c->bu.buf); 1421 kfree(c->bu.buf);
1327 vfree(c->ileb_buf); 1422 vfree(c->ileb_buf);
1328 vfree(c->sbuf); 1423 vfree(c->sbuf);
1329 kfree(c->bottom_up_buf); 1424 kfree(c->bottom_up_buf);
1330 UBIFS_DBG(vfree(c->dbg_buf)); 1425 ubifs_debugging_exit(c);
1331 return err; 1426 return err;
1332} 1427}
1333 1428
@@ -1345,6 +1440,7 @@ static void ubifs_umount(struct ubifs_info *c)
1345 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, 1440 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
1346 c->vi.vol_id); 1441 c->vi.vol_id);
1347 1442
1443 dbg_debugfs_exit_fs(c);
1348 spin_lock(&ubifs_infos_lock); 1444 spin_lock(&ubifs_infos_lock);
1349 list_del(&c->infos_list); 1445 list_del(&c->infos_list);
1350 spin_unlock(&ubifs_infos_lock); 1446 spin_unlock(&ubifs_infos_lock);
@@ -1364,8 +1460,7 @@ static void ubifs_umount(struct ubifs_info *c)
1364 vfree(c->ileb_buf); 1460 vfree(c->ileb_buf);
1365 vfree(c->sbuf); 1461 vfree(c->sbuf);
1366 kfree(c->bottom_up_buf); 1462 kfree(c->bottom_up_buf);
1367 UBIFS_DBG(vfree(c->dbg_buf)); 1463 ubifs_debugging_exit(c);
1368 dbg_failure_mode_deregistration(c);
1369} 1464}
1370 1465
1371/** 1466/**
@@ -1387,12 +1482,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1387 c->remounting_rw = 1; 1482 c->remounting_rw = 1;
1388 c->always_chk_crc = 1; 1483 c->always_chk_crc = 1;
1389 1484
1390 /* Check for enough free space */ 1485 err = check_free_space(c);
1391 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { 1486 if (err)
1392 ubifs_err("insufficient available space");
1393 err = -EINVAL;
1394 goto out; 1487 goto out;
1395 }
1396 1488
1397 if (c->old_leb_cnt != c->leb_cnt) { 1489 if (c->old_leb_cnt != c->leb_cnt) {
1398 struct ubifs_sb_node *sup; 1490 struct ubifs_sb_node *sup;
@@ -1515,20 +1607,24 @@ out:
1515 * @c: UBIFS file-system description object 1607 * @c: UBIFS file-system description object
1516 * 1608 *
1517 * This function is called during un-mounting and re-mounting, and it commits 1609 * This function is called during un-mounting and re-mounting, and it commits
1518 * the journal unless the "fast unmount" mode is enabled. It also avoids 1610 * the journal unless the "fast unmount" mode is enabled.
1519 * committing the journal if it contains too few data.
1520 */ 1611 */
1521static void commit_on_unmount(struct ubifs_info *c) 1612static void commit_on_unmount(struct ubifs_info *c)
1522{ 1613{
1523 if (!c->fast_unmount) { 1614 struct super_block *sb = c->vfs_sb;
1524 long long bud_bytes; 1615 long long bud_bytes;
1525 1616
1526 spin_lock(&c->buds_lock); 1617 /*
1527 bud_bytes = c->bud_bytes; 1618 * This function is called before the background thread is stopped, so
1528 spin_unlock(&c->buds_lock); 1619 * we may race with ongoing commit, which means we have to take
1529 if (bud_bytes > c->leb_size) 1620 * @c->bud_lock to access @c->bud_bytes.
1530 ubifs_run_commit(c); 1621 */
1531 } 1622 spin_lock(&c->buds_lock);
1623 bud_bytes = c->bud_bytes;
1624 spin_unlock(&c->buds_lock);
1625
1626 if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
1627 ubifs_run_commit(c);
1532} 1628}
1533 1629
1534/** 1630/**
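The comment added to commit_on_unmount() is the crux: @c->bud_bytes is a 64-bit counter updated under @c->buds_lock, so a reader racing with an ongoing commit must take the same lock to get an untorn snapshot. The same pattern in portable pthreads form (a sketch, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t buds_lock = PTHREAD_MUTEX_INITIALIZER;
static long long bud_bytes;             /* written by the commit path */

/* Snapshot a value another thread updates under the same lock; on 32-bit
 * targets a bare 64-bit read could otherwise be torn mid-update. */
static long long read_bud_bytes(void)
{
        long long v;

        pthread_mutex_lock(&buds_lock);
        v = bud_bytes;
        pthread_mutex_unlock(&buds_lock);
        return v;
}

int main(void)
{
        printf("bud_bytes snapshot: %lld\n", read_bud_bytes());
        return 0;
}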
@@ -1849,7 +1945,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1849 goto out_iput; 1945 goto out_iput;
1850 1946
1851 mutex_unlock(&c->umount_mutex); 1947 mutex_unlock(&c->umount_mutex);
1852
1853 return 0; 1948 return 0;
1854 1949
1855out_iput: 1950out_iput:
@@ -1955,7 +2050,7 @@ static void ubifs_kill_sb(struct super_block *sb)
1955 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' 2050 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
1956 * in order to be outside BKL. 2051 * in order to be outside BKL.
1957 */ 2052 */
1958 if (sb->s_root && !(sb->s_flags & MS_RDONLY)) 2053 if (sb->s_root)
1959 commit_on_unmount(c); 2054 commit_on_unmount(c);
1960 /* The un-mount routine is actually done in put_super() */ 2055 /* The un-mount routine is actually done in put_super() */
1961 generic_shutdown_super(sb); 2056 generic_shutdown_super(sb);
@@ -2021,6 +2116,14 @@ static int __init ubifs_init(void)
2021 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); 2116 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
2022 2117
2023 /* 2118 /*
2119 * We use 2 bit wide bit-fields to store compression type, which should
2120 * be amended if more compressors are added. The bit-fields are:
2121 * @compr_type in 'struct ubifs_inode', @default_compr in
2122 * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
2123 */
2124 BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
2125
2126 /*
2024 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to 2127 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
2025 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 2128 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
2026 */ 2129 */
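The new BUILD_BUG_ON() guards the 2-bit compr_type bit-fields listed in the comment above it: two bits store at most four distinct values, so the compressor count must never exceed four. The equivalent compile-time check in plain C11:

#include <assert.h>

enum { COMPR_NONE, COMPR_LZO, COMPR_ZLIB, COMPR_TYPES_CNT };

struct inode_flags {
        unsigned int compr_type:2;      /* can store values 0..3 only */
};

/* C11 counterpart of BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4) */
static_assert(COMPR_TYPES_CNT <= 4, "compr_type bit-field too narrow");

int main(void)
{
        struct inode_flags f = { .compr_type = COMPR_ZLIB };

        return f.compr_type == COMPR_ZLIB ? 0 : 1;
}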
@@ -2049,11 +2152,17 @@ static int __init ubifs_init(void)
2049 2152
2050 err = ubifs_compressors_init(); 2153 err = ubifs_compressors_init();
2051 if (err) 2154 if (err)
2155 goto out_shrinker;
2156
2157 err = dbg_debugfs_init();
2158 if (err)
2052 goto out_compr; 2159 goto out_compr;
2053 2160
2054 return 0; 2161 return 0;
2055 2162
2056out_compr: 2163out_compr:
2164 ubifs_compressors_exit();
2165out_shrinker:
2057 unregister_shrinker(&ubifs_shrinker_info); 2166 unregister_shrinker(&ubifs_shrinker_info);
2058 kmem_cache_destroy(ubifs_inode_slab); 2167 kmem_cache_destroy(ubifs_inode_slab);
2059out_reg: 2168out_reg:
@@ -2068,6 +2177,7 @@ static void __exit ubifs_exit(void)
2068 ubifs_assert(list_empty(&ubifs_infos)); 2177 ubifs_assert(list_empty(&ubifs_infos));
2069 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); 2178 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
2070 2179
2180 dbg_debugfs_exit();
2071 ubifs_compressors_exit(); 2181 ubifs_compressors_exit();
2072 unregister_shrinker(&ubifs_shrinker_info); 2182 unregister_shrinker(&ubifs_shrinker_info);
2073 kmem_cache_destroy(ubifs_inode_slab); 2183 kmem_cache_destroy(ubifs_inode_slab);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6eef5344a145..f7e36f545527 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2245,12 +2245,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
2245 if (found) { 2245 if (found) {
2246 /* Ensure the znode is dirtied */ 2246 /* Ensure the znode is dirtied */
2247 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2247 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2248 znode = dirty_cow_bottom_up(c, 2248 znode = dirty_cow_bottom_up(c, znode);
2249 znode); 2249 if (IS_ERR(znode)) {
2250 if (IS_ERR(znode)) { 2250 err = PTR_ERR(znode);
2251 err = PTR_ERR(znode); 2251 goto out_unlock;
2252 goto out_unlock; 2252 }
2253 }
2254 } 2253 }
2255 zbr = &znode->zbranch[n]; 2254 zbr = &znode->zbranch[n];
2256 lnc_free(zbr); 2255 lnc_free(zbr);
@@ -2317,11 +2316,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
2317 2316
2318 /* Ensure the znode is dirtied */ 2317 /* Ensure the znode is dirtied */
2319 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2318 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2320 znode = dirty_cow_bottom_up(c, znode); 2319 znode = dirty_cow_bottom_up(c, znode);
2321 if (IS_ERR(znode)) { 2320 if (IS_ERR(znode)) {
2322 err = PTR_ERR(znode); 2321 err = PTR_ERR(znode);
2323 goto out_unlock; 2322 goto out_unlock;
2324 } 2323 }
2325 } 2324 }
2326 2325
2327 if (found == 1) { 2326 if (found == 1) {
@@ -2627,11 +2626,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2627 2626
2628 /* Ensure the znode is dirtied */ 2627 /* Ensure the znode is dirtied */
2629 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2628 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2630 znode = dirty_cow_bottom_up(c, znode); 2629 znode = dirty_cow_bottom_up(c, znode);
2631 if (IS_ERR(znode)) { 2630 if (IS_ERR(znode)) {
2632 err = PTR_ERR(znode); 2631 err = PTR_ERR(znode);
2633 goto out_unlock; 2632 goto out_unlock;
2634 } 2633 }
2635 } 2634 }
2636 2635
2637 /* Remove all keys in range except the first */ 2636 /* Remove all keys in range except the first */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8ac76b1c2d55..fde8d127c768 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -553,8 +553,8 @@ static int layout_in_empty_space(struct ubifs_info *c)
553 } 553 }
554 554
555#ifdef CONFIG_UBIFS_FS_DEBUG 555#ifdef CONFIG_UBIFS_FS_DEBUG
556 c->new_ihead_lnum = lnum; 556 c->dbg->new_ihead_lnum = lnum;
557 c->new_ihead_offs = buf_offs; 557 c->dbg->new_ihead_offs = buf_offs;
558#endif 558#endif
559 559
560 return 0; 560 return 0;
@@ -802,8 +802,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
802 * budgeting subsystem to assume the index is already committed, 802 * budgeting subsystem to assume the index is already committed,
803 * even though it is not. 803 * even though it is not.
804 */ 804 */
805 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
805 c->old_idx_sz = c->calc_idx_sz; 806 c->old_idx_sz = c->calc_idx_sz;
806 c->budg_uncommitted_idx = 0; 807 c->budg_uncommitted_idx = 0;
808 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
807 spin_unlock(&c->space_lock); 809 spin_unlock(&c->space_lock);
808 mutex_unlock(&c->tnc_mutex); 810 mutex_unlock(&c->tnc_mutex);
809 811
@@ -1002,7 +1004,8 @@ static int write_index(struct ubifs_info *c)
1002 } 1004 }
1003 1005
1004#ifdef CONFIG_UBIFS_FS_DEBUG 1006#ifdef CONFIG_UBIFS_FS_DEBUG
1005 if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { 1007 if (lnum != c->dbg->new_ihead_lnum ||
1008 buf_offs != c->dbg->new_ihead_offs) {
1006 ubifs_err("inconsistent ihead"); 1009 ubifs_err("inconsistent ihead");
1007 return -EINVAL; 1010 return -EINVAL;
1008 } 1011 }
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0b378042a3a2..b25fc36cf72f 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -51,6 +51,13 @@
51 */ 51 */
52#define UBIFS_MIN_COMPR_LEN 128 52#define UBIFS_MIN_COMPR_LEN 128
53 53
54/*
 55 * If the compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
 56 * shorter than the uncompressed data length, UBIFS prefers to leave this data
 57 * node uncompressed, because it will be read faster.
58 */
59#define UBIFS_MIN_COMPRESS_DIFF 64
60
54/* Root inode number */ 61/* Root inode number */
55#define UBIFS_ROOT_INO 1 62#define UBIFS_ROOT_INO 1
56 63
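The new UBIFS_MIN_COMPRESS_DIFF constant encodes a read-speed trade-off: compressed output that saves fewer than 64 bytes is not worth the decompression cost on reads. The decision, restated as a tiny predicate:

#include <stdio.h>

#define MIN_COMPRESS_DIFF 64    /* mirrors UBIFS_MIN_COMPRESS_DIFF */

/* Returns 1 if the node is worth storing compressed. */
static int worth_compressing(int in_len, int out_len)
{
        return in_len - out_len >= MIN_COMPRESS_DIFF;
}

int main(void)
{
        printf("%d\n", worth_compressing(4096, 4050)); /* 0: only 46 saved */
        printf("%d\n", worth_compressing(4096, 2000)); /* 1 */
        return 0;
}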
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 46b172560a06..fc2a4cc66d03 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -63,6 +63,14 @@
63#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL 63#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
64#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL 64#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
65 65
66/*
67 * Minimum amount of LEBs reserved for the index. At present the index needs at
 68 * least 2 LEBs: one for the index head and one for the in-the-gaps method (which
69 * currently does not cater for the index head and so excludes it from
70 * consideration).
71 */
72#define MIN_INDEX_LEBS 2
73
66/* Minimum amount of data UBIFS writes to the flash */ 74/* Minimum amount of data UBIFS writes to the flash */
67#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) 75#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
68 76
@@ -386,12 +394,12 @@ struct ubifs_inode {
386 unsigned int dirty:1; 394 unsigned int dirty:1;
387 unsigned int xattr:1; 395 unsigned int xattr:1;
388 unsigned int bulk_read:1; 396 unsigned int bulk_read:1;
397 unsigned int compr_type:2;
389 struct mutex ui_mutex; 398 struct mutex ui_mutex;
390 spinlock_t ui_lock; 399 spinlock_t ui_lock;
391 loff_t synced_i_size; 400 loff_t synced_i_size;
392 loff_t ui_size; 401 loff_t ui_size;
393 int flags; 402 int flags;
394 int compr_type;
395 pgoff_t last_page_read; 403 pgoff_t last_page_read;
396 pgoff_t read_in_a_row; 404 pgoff_t read_in_a_row;
397 int data_len; 405 int data_len;
@@ -419,7 +427,7 @@ struct ubifs_unclean_leb {
419 * 427 *
420 * LPROPS_UNCAT: not categorized 428 * LPROPS_UNCAT: not categorized
421 * LPROPS_DIRTY: dirty > 0, not index 429 * LPROPS_DIRTY: dirty > 0, not index
 422 * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index 430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sz and index
423 * LPROPS_FREE: free > 0, not empty, not index 431 * LPROPS_FREE: free > 0, not empty, not index
424 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs 432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
425 * LPROPS_EMPTY: LEB is empty, not taken 433 * LPROPS_EMPTY: LEB is empty, not taken
@@ -473,8 +481,8 @@ struct ubifs_lprops {
473struct ubifs_lpt_lprops { 481struct ubifs_lpt_lprops {
474 int free; 482 int free;
475 int dirty; 483 int dirty;
476 unsigned tgc : 1; 484 unsigned tgc:1;
477 unsigned cmt : 1; 485 unsigned cmt:1;
478}; 486};
479 487
480/** 488/**
@@ -482,24 +490,26 @@ struct ubifs_lpt_lprops {
482 * @empty_lebs: number of empty LEBs 490 * @empty_lebs: number of empty LEBs
483 * @taken_empty_lebs: number of taken LEBs 491 * @taken_empty_lebs: number of taken LEBs
484 * @idx_lebs: number of indexing LEBs 492 * @idx_lebs: number of indexing LEBs
485 * @total_free: total free space in bytes 493 * @total_free: total free space in bytes (includes all LEBs)
486 * @total_dirty: total dirty space in bytes 494 * @total_dirty: total dirty space in bytes (includes all LEBs)
487 * @total_used: total used space in bytes (includes only data LEBs) 495 * @total_used: total used space in bytes (does not include index LEBs)
488 * @total_dead: total dead space in bytes (includes only data LEBs) 496 * @total_dead: total dead space in bytes (does not include index LEBs)
489 * @total_dark: total dark space in bytes (includes only data LEBs) 497 * @total_dark: total dark space in bytes (does not include index LEBs)
498 *
499 * The @taken_empty_lebs field counts the LEBs that are in the transient state
500 * of having been "taken" for use but not yet written to. @taken_empty_lebs is
501 * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be
502 * used by itself (in which case 'unused_lebs' would be a better name). In the
503 * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained
504 * by GC, but unlike other empty LEBs that are "taken", it may not be written
505 * straight away (i.e. before the next commit start or unmount), so either
506 * @gc_lnum must be specially accounted for, or the current approach followed
507 * i.e. count it under @taken_empty_lebs.
490 * 508 *
491 * N.B. total_dirty and total_used are different to other total_* fields, 509 * @empty_lebs includes @taken_empty_lebs.
492 * because they account _all_ LEBs, not just data LEBs.
493 * 510 *
494 * 'taken_empty_lebs' counts the LEBs that are in the transient state of having 511 * @total_used, @total_dead and @total_dark fields do not account indexing
495 * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed 512 * LEBs.
496 * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
497 * by itself (in which case 'unused_lebs' would be a better name). In the case
498 * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
499 * but unlike other empty LEBs that are 'taken', it may not be written straight
500 * away (i.e. before the next commit start or unmount), so either gc_lnum must
501 * be specially accounted for, or the current approach followed i.e. count it
502 * under 'taken_empty_lebs'.
503 */ 513 */
504struct ubifs_lp_stats { 514struct ubifs_lp_stats {
505 int empty_lebs; 515 int empty_lebs;
@@ -893,15 +903,25 @@ struct ubifs_orphan {
893/** 903/**
894 * struct ubifs_mount_opts - UBIFS-specific mount options information. 904 * struct ubifs_mount_opts - UBIFS-specific mount options information.
895 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) 905 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
 896 * @bulk_read: enable bulk-reads 906 * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable)
897 * @chk_data_crc: check CRCs when reading data nodes 907 * @chk_data_crc: enable/disable CRC data checking when reading data nodes
 908 * (%0 default, %1 disable, %2 enable)
909 * @override_compr: override default compressor (%0 - do not override and use
910 * superblock compressor, %1 - override and use compressor
911 * specified in @compr_type)
912 * @compr_type: compressor type to override the superblock compressor with
913 * (%UBIFS_COMPR_NONE, etc)
898 */ 914 */
899struct ubifs_mount_opts { 915struct ubifs_mount_opts {
900 unsigned int unmount_mode:2; 916 unsigned int unmount_mode:2;
901 unsigned int bulk_read:2; 917 unsigned int bulk_read:2;
902 unsigned int chk_data_crc:2; 918 unsigned int chk_data_crc:2;
919 unsigned int override_compr:1;
920 unsigned int compr_type:2;
903}; 921};
904 922
923struct ubifs_debug_info;
924
905/** 925/**
906 * struct ubifs_info - UBIFS file-system description data structure 926 * struct ubifs_info - UBIFS file-system description data structure
907 * (per-superblock). 927 * (per-superblock).
@@ -946,6 +966,7 @@ struct ubifs_mount_opts {
946 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 966 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
947 * recovery) 967 * recovery)
948 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
949 * 970 *
950 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 971 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
951 * @calc_idx_sz 972 * @calc_idx_sz
@@ -963,8 +984,6 @@ struct ubifs_mount_opts {
 963 * @ileb_nxt: next pre-allocated index LEB 984 * @ileb_nxt: next pre-allocated index LEB
964 * @old_idx: tree of index nodes obsoleted since the last commit start 985 * @old_idx: tree of index nodes obsoleted since the last commit start
965 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c 986 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
966 * @new_ihead_lnum: used by debugging to check ihead_lnum
967 * @new_ihead_offs: used by debugging to check ihead_offs
968 * 987 *
969 * @mst_node: master node 988 * @mst_node: master node
970 * @mst_offs: offset of valid master node 989 * @mst_offs: offset of valid master node
@@ -986,7 +1005,6 @@ struct ubifs_mount_opts {
986 * @main_lebs: count of LEBs in the main area 1005 * @main_lebs: count of LEBs in the main area
987 * @main_first: first LEB of the main area 1006 * @main_first: first LEB of the main area
988 * @main_bytes: main area size in bytes 1007 * @main_bytes: main area size in bytes
989 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
990 * 1008 *
991 * @key_hash_type: type of the key hash 1009 * @key_hash_type: type of the key hash
992 * @key_hash: direntry key hash function 1010 * @key_hash: direntry key hash function
@@ -1149,15 +1167,7 @@ struct ubifs_mount_opts {
1149 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1167 * @always_chk_crc: always check CRCs (while mounting and remounting rw)
1150 * @mount_opts: UBIFS-specific mount options 1168 * @mount_opts: UBIFS-specific mount options
1151 * 1169 *
1152 * @dbg_buf: a buffer of LEB size used for debugging purposes 1170 * @dbg: debugging-related information
1153 * @old_zroot: old index root - used by 'dbg_check_old_index()'
1154 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
1155 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
1156 * @failure_mode: failure mode for recovery testing
1157 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
1158 * @fail_timeout: time in jiffies when delay of failure mode expires
1159 * @fail_cnt: current number of calls to failure mode I/O functions
1160 * @fail_cnt_max: number of calls by which to delay failure mode
1161 */ 1171 */
1162struct ubifs_info { 1172struct ubifs_info {
1163 struct super_block *vfs_sb; 1173 struct super_block *vfs_sb;
@@ -1196,6 +1206,7 @@ struct ubifs_info {
1196 unsigned int big_lpt:1; 1206 unsigned int big_lpt:1;
1197 unsigned int no_chk_data_crc:1; 1207 unsigned int no_chk_data_crc:1;
1198 unsigned int bulk_read:1; 1208 unsigned int bulk_read:1;
1209 unsigned int default_compr:2;
1199 1210
1200 struct mutex tnc_mutex; 1211 struct mutex tnc_mutex;
1201 struct ubifs_zbranch zroot; 1212 struct ubifs_zbranch zroot;
@@ -1212,10 +1223,6 @@ struct ubifs_info {
1212 int ileb_nxt; 1223 int ileb_nxt;
1213 struct rb_root old_idx; 1224 struct rb_root old_idx;
1214 int *bottom_up_buf; 1225 int *bottom_up_buf;
1215#ifdef CONFIG_UBIFS_FS_DEBUG
1216 int new_ihead_lnum;
1217 int new_ihead_offs;
1218#endif
1219 1226
1220 struct ubifs_mst_node *mst_node; 1227 struct ubifs_mst_node *mst_node;
1221 int mst_offs; 1228 int mst_offs;
@@ -1237,7 +1244,6 @@ struct ubifs_info {
1237 int main_lebs; 1244 int main_lebs;
1238 int main_first; 1245 int main_first;
1239 long long main_bytes; 1246 long long main_bytes;
1240 int default_compr;
1241 1247
1242 uint8_t key_hash_type; 1248 uint8_t key_hash_type;
1243 uint32_t (*key_hash)(const char *str, int len); 1249 uint32_t (*key_hash)(const char *str, int len);
@@ -1315,8 +1321,8 @@ struct ubifs_info {
1315 void *sbuf; 1321 void *sbuf;
1316 struct list_head idx_gc; 1322 struct list_head idx_gc;
1317 int idx_gc_cnt; 1323 int idx_gc_cnt;
1318 volatile int gc_seq; 1324 int gc_seq;
1319 volatile int gced_lnum; 1325 int gced_lnum;
1320 1326
1321 struct list_head infos_list; 1327 struct list_head infos_list;
1322 struct mutex umount_mutex; 1328 struct mutex umount_mutex;
@@ -1391,21 +1397,7 @@ struct ubifs_info {
1391 struct ubifs_mount_opts mount_opts; 1397 struct ubifs_mount_opts mount_opts;
1392 1398
1393#ifdef CONFIG_UBIFS_FS_DEBUG 1399#ifdef CONFIG_UBIFS_FS_DEBUG
1394 void *dbg_buf; 1400 struct ubifs_debug_info *dbg;
1395 struct ubifs_zbranch old_zroot;
1396 int old_zroot_level;
1397 unsigned long long old_zroot_sqnum;
1398 int failure_mode;
1399 int fail_delay;
1400 unsigned long fail_timeout;
1401 unsigned int fail_cnt;
1402 unsigned int fail_cnt_max;
1403 long long chk_lpt_sz;
1404 long long chk_lpt_sz2;
1405 long long chk_lpt_wastage;
1406 int chk_lpt_lebs;
1407 int new_nhead_lnum;
1408 int new_nhead_offs;
1409#endif 1401#endif
1410}; 1402};
1411 1403
@@ -1505,7 +1497,7 @@ void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1505long long ubifs_get_free_space(struct ubifs_info *c); 1497long long ubifs_get_free_space(struct ubifs_info *c);
1506int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1498int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1507void ubifs_convert_page_budget(struct ubifs_info *c); 1499void ubifs_convert_page_budget(struct ubifs_info *c);
1508long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); 1500long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1509long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1501long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1510 1502
1511/* find.c */ 1503/* find.c */
@@ -1639,6 +1631,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
1639void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); 1631void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
1640uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); 1632uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
1641struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); 1633struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
1634/* Needed only in debugging code in lpt_commit.c */
1635int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
1636 struct ubifs_nnode *nnode);
1642 1637
1643/* lpt_commit.c */ 1638/* lpt_commit.c */
1644int ubifs_lpt_start_commit(struct ubifs_info *c); 1639int ubifs_lpt_start_commit(struct ubifs_info *c);
@@ -1714,7 +1709,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1714 1709
1715/* compressor.c */ 1710/* compressor.c */
1716int __init ubifs_compressors_init(void); 1711int __init ubifs_compressors_init(void);
1717void __exit ubifs_compressors_exit(void); 1712void ubifs_compressors_exit(void);
1718void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, 1713void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
1719 int *compr_type); 1714 int *compr_type);
1720int ubifs_decompress(const void *buf, int len, void *out, int *out_len, 1715int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e66531..237804cd6b56 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
175 if (error) 175 if (error)
176 return error; 176 return error;
177 error = -EOPNOTSUPP; 177 error = -EOPNOTSUPP;
178 if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { 178 if (d->d_inode->i_op->listxattr) {
179 error = d->d_inode->i_op->listxattr(d, list, size); 179 error = d->d_inode->i_op->listxattr(d, list, size);
180 } else { 180 } else {
181 error = security_inode_listsecurity(d->d_inode, list, size); 181 error = security_inode_listsecurity(d->d_inode, list, size);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 737c9a425361..c3dc491fff89 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -85,13 +85,13 @@ xfs-y += xfs_alloc.o \
85 xfs_trans_inode.o \ 85 xfs_trans_inode.o \
86 xfs_trans_item.o \ 86 xfs_trans_item.o \
87 xfs_utils.o \ 87 xfs_utils.o \
88 xfs_vfsops.o \
89 xfs_vnodeops.o \ 88 xfs_vnodeops.o \
90 xfs_rw.o \ 89 xfs_rw.o \
91 xfs_dmops.o \ 90 xfs_dmops.o \
92 xfs_qmops.o 91 xfs_qmops.o
93 92
94xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o 93xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \
94 xfs_dir2_trace.o
95 95
96# Objects in linux/ 96# Objects in linux/
97xfs-y += $(addprefix $(XFS_LINUX)/, \ 97xfs-y += $(addprefix $(XFS_LINUX)/, \
@@ -106,7 +106,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
106 xfs_iops.o \ 106 xfs_iops.o \
107 xfs_lrw.o \ 107 xfs_lrw.o \
108 xfs_super.o \ 108 xfs_super.o \
109 xfs_vnode.o \ 109 xfs_sync.o \
110 xfs_xattr.o) 110 xfs_xattr.o)
111 111
112# Objects in support/ 112# Objects in support/
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
index 351a8f454bd1..4dfc7c370819 100644
--- a/fs/xfs/linux-2.6/sv.h
+++ b/fs/xfs/linux-2.6/sv.h
@@ -32,23 +32,15 @@ typedef struct sv_s {
32 wait_queue_head_t waiters; 32 wait_queue_head_t waiters;
33} sv_t; 33} sv_t;
34 34
35#define SV_FIFO 0x0 /* sv_t is FIFO type */ 35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36#define SV_LIFO 0x2 /* sv_t is LIFO type */
37#define SV_PRIO 0x4 /* sv_t is PRIO type */
38#define SV_KEYED 0x6 /* sv_t is KEYED type */
39#define SV_DEFAULT SV_FIFO
40
41
42static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
43 unsigned long timeout)
44{ 36{
45 DECLARE_WAITQUEUE(wait, current); 37 DECLARE_WAITQUEUE(wait, current);
46 38
47 add_wait_queue_exclusive(&sv->waiters, &wait); 39 add_wait_queue_exclusive(&sv->waiters, &wait);
48 __set_current_state(state); 40 __set_current_state(TASK_UNINTERRUPTIBLE);
49 spin_unlock(lock); 41 spin_unlock(lock);
50 42
51 schedule_timeout(timeout); 43 schedule();
52 44
53 remove_wait_queue(&sv->waiters, &wait); 45 remove_wait_queue(&sv->waiters, &wait);
54} 46}
@@ -58,13 +50,7 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
58#define sv_destroy(sv) \ 50#define sv_destroy(sv) \
59 /*NOTHING*/ 51 /*NOTHING*/
60#define sv_wait(sv, pri, lock, s) \ 52#define sv_wait(sv, pri, lock, s) \
61 _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) 53 _sv_wait(sv, lock)
62#define sv_wait_sig(sv, pri, lock, s) \
63 _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
64#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
65 _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
66#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
67 _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
68#define sv_signal(sv) \ 54#define sv_signal(sv) \
69 wake_up(&(sv)->waiters) 55 wake_up(&(sv)->waiters)
70#define sv_broadcast(sv) \ 56#define sv_broadcast(sv) \
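Note that the simplified _sv_wait() releases the lock and returns without re-taking it, so callers follow the usual condition-variable pattern; a hedged usage sketch (the lock, sv_t and condition callback are stand-ins, not from this patch):

	static void example_wait_for_cond(sv_t *sv, spinlock_t *lock,
					  int (*cond)(void))
	{
		spin_lock(lock);
		while (!cond()) {
			sv_wait(sv, 0, lock, 0);  /* drops the lock and sleeps */
			spin_lock(lock);          /* re-take before re-testing */
		}
		spin_unlock(lock);
	}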
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a44d68eb50b5..de3a198f771e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -42,6 +42,40 @@
42#include <linux/pagevec.h> 42#include <linux/pagevec.h>
43#include <linux/writeback.h> 43#include <linux/writeback.h>
44 44
45
46/*
47 * Prime number of hash buckets since address is used as the key.
48 */
49#define NVSYNC 37
50#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
51static wait_queue_head_t xfs_ioend_wq[NVSYNC];
52
53void __init
54xfs_ioend_init(void)
55{
56 int i;
57
58 for (i = 0; i < NVSYNC; i++)
59 init_waitqueue_head(&xfs_ioend_wq[i]);
60}
61
62void
63xfs_ioend_wait(
64 xfs_inode_t *ip)
65{
66 wait_queue_head_t *wq = to_ioend_wq(ip);
67
68 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
69}
70
71STATIC void
72xfs_ioend_wake(
73 xfs_inode_t *ip)
74{
75 if (atomic_dec_and_test(&ip->i_iocount))
76 wake_up(to_ioend_wq(ip));
77}
78
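The design point, as inferred from the code above: hashing the inode address into a small table avoids embedding a wait queue in every inode, and a prime bucket count spreads slab-aligned pointers, whose low bits are largely zero, more evenly across buckets. The intended pairing, as a minimal sketch with a hypothetical caller:

	/* Hypothetical synchronization point: quiesce all in-flight ioends. */
	static void example_quiesce(xfs_inode_t *ip)
	{
		/* completing ioends call xfs_ioend_wake(), which drops
		 * i_iocount; this blocks until the count reaches zero */
		xfs_ioend_wait(ip);
	}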
45STATIC void 79STATIC void
46xfs_count_page_state( 80xfs_count_page_state(
47 struct page *page, 81 struct page *page,
@@ -146,16 +180,25 @@ xfs_destroy_ioend(
146 xfs_ioend_t *ioend) 180 xfs_ioend_t *ioend)
147{ 181{
148 struct buffer_head *bh, *next; 182 struct buffer_head *bh, *next;
183 struct xfs_inode *ip = XFS_I(ioend->io_inode);
149 184
150 for (bh = ioend->io_buffer_head; bh; bh = next) { 185 for (bh = ioend->io_buffer_head; bh; bh = next) {
151 next = bh->b_private; 186 next = bh->b_private;
152 bh->b_end_io(bh, !ioend->io_error); 187 bh->b_end_io(bh, !ioend->io_error);
153 } 188 }
154 if (unlikely(ioend->io_error)) { 189
155 vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error, 190 /*
156 __FILE__,__LINE__); 191 * Volume managers supporting multiple paths can send back ENODEV
192 * when the final path disappears. In this case continuing to fill
193 * the page cache with dirty data which cannot be written out is
194 * evil, so prevent that.
195 */
196 if (unlikely(ioend->io_error == -ENODEV)) {
197 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
198 __FILE__, __LINE__);
157 } 199 }
158 vn_iowake(XFS_I(ioend->io_inode)); 200
201 xfs_ioend_wake(ip);
159 mempool_free(ioend, xfs_ioend_pool); 202 mempool_free(ioend, xfs_ioend_pool);
160} 203}
161 204
@@ -191,7 +234,7 @@ xfs_setfilesize(
191 ip->i_d.di_size = isize; 234 ip->i_d.di_size = isize;
192 ip->i_update_core = 1; 235 ip->i_update_core = 1;
193 ip->i_update_size = 1; 236 ip->i_update_size = 1;
194 mark_inode_dirty_sync(ioend->io_inode); 237 xfs_mark_inode_dirty_sync(ip);
195 } 238 }
196 239
197 xfs_iunlock(ip, XFS_ILOCK_EXCL); 240 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -317,14 +360,9 @@ xfs_map_blocks(
317 xfs_iomap_t *mapp, 360 xfs_iomap_t *mapp,
318 int flags) 361 int flags)
319{ 362{
320 xfs_inode_t *ip = XFS_I(inode); 363 int nmaps = 1;
321 int error, nmaps = 1; 364
322 365 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
323 error = xfs_iomap(ip, offset, count,
324 flags, mapp, &nmaps);
325 if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
326 xfs_iflags_set(ip, XFS_IMODIFIED);
327 return -error;
328} 366}
329 367
330STATIC_INLINE int 368STATIC_INLINE int
@@ -512,7 +550,7 @@ xfs_cancel_ioend(
512 unlock_buffer(bh); 550 unlock_buffer(bh);
513 } while ((bh = next_bh) != NULL); 551 } while ((bh = next_bh) != NULL);
514 552
515 vn_iowake(XFS_I(ioend->io_inode)); 553 xfs_ioend_wake(XFS_I(ioend->io_inode));
516 mempool_free(ioend, xfs_ioend_pool); 554 mempool_free(ioend, xfs_ioend_pool);
517 } while ((ioend = next) != NULL); 555 } while ((ioend = next) != NULL);
518} 556}
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 3ba0631a3818..7b26f5ff9692 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -43,4 +43,7 @@ typedef struct xfs_ioend {
43extern const struct address_space_operations xfs_address_space_operations; 43extern const struct address_space_operations xfs_address_space_operations;
44extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 44extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45 45
46extern void xfs_ioend_init(void);
47extern void xfs_ioend_wait(struct xfs_inode *);
48
46#endif /* __XFS_AOPS_H__ */ 49#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 36d5fcd3f593..cb329edc925b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -630,6 +630,29 @@ xfs_buf_get_flags(
630 return NULL; 630 return NULL;
631} 631}
632 632
633STATIC int
634_xfs_buf_read(
635 xfs_buf_t *bp,
636 xfs_buf_flags_t flags)
637{
638 int status;
639
640 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
641
642 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
643 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
644
645 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
646 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
647 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
648 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
649
650 status = xfs_buf_iorequest(bp);
651 if (!status && !(flags & XBF_ASYNC))
652 status = xfs_buf_iowait(bp);
653 return status;
654}
655
633xfs_buf_t * 656xfs_buf_t *
634xfs_buf_read_flags( 657xfs_buf_read_flags(
635 xfs_buftarg_t *target, 658 xfs_buftarg_t *target,
@@ -646,7 +669,7 @@ xfs_buf_read_flags(
646 if (!XFS_BUF_ISDONE(bp)) { 669 if (!XFS_BUF_ISDONE(bp)) {
647 XB_TRACE(bp, "read", (unsigned long)flags); 670 XB_TRACE(bp, "read", (unsigned long)flags);
648 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
649 xfs_buf_iostart(bp, flags); 672 _xfs_buf_read(bp, flags);
650 } else if (flags & XBF_ASYNC) { 673 } else if (flags & XBF_ASYNC) {
651 XB_TRACE(bp, "read_async", (unsigned long)flags); 674 XB_TRACE(bp, "read_async", (unsigned long)flags);
652 /* 675 /*
@@ -1048,50 +1071,39 @@ xfs_buf_ioerror(
1048 XB_TRACE(bp, "ioerror", (unsigned long)error); 1071 XB_TRACE(bp, "ioerror", (unsigned long)error);
1049} 1072}
1050 1073
1051/*
1052 * Initiate I/O on a buffer, based on the flags supplied.
1053 * The b_iodone routine in the buffer supplied will only be called
1054 * when all of the subsidiary I/O requests, if any, have been completed.
1055 */
1056int 1074int
1057xfs_buf_iostart( 1075xfs_bawrite(
1058 xfs_buf_t *bp, 1076 void *mp,
1059 xfs_buf_flags_t flags) 1077 struct xfs_buf *bp)
1060{ 1078{
1061 int status = 0; 1079 XB_TRACE(bp, "bawrite", 0);
1062 1080
1063 XB_TRACE(bp, "iostart", (unsigned long)flags); 1081 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
1064 1082
1065 if (flags & XBF_DELWRI) { 1083 xfs_buf_delwri_dequeue(bp);
1066 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
1067 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
1068 xfs_buf_delwri_queue(bp, 1);
1069 return 0;
1070 }
1071 1084
1072 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 1085 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
1073 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1086 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
1074 bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ 1087
1075 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1088 bp->b_mount = mp;
1089 bp->b_strat = xfs_bdstrat_cb;
1090 return xfs_bdstrat_cb(bp);
1091}
1076 1092
1077 BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); 1093void
1094xfs_bdwrite(
1095 void *mp,
1096 struct xfs_buf *bp)
1097{
1098 XB_TRACE(bp, "bdwrite", 0);
1078 1099
1079 /* For writes allow an alternate strategy routine to precede 1100 bp->b_strat = xfs_bdstrat_cb;
1080 * the actual I/O request (which may not be issued at all in 1101 bp->b_mount = mp;
1081 * a shutdown situation, for example).
1082 */
1083 status = (flags & XBF_WRITE) ?
1084 xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
1085 1102
1086 /* Wait for I/O if we are not an async request. 1103 bp->b_flags &= ~XBF_READ;
1087 * Note: async I/O request completion will release the buffer, 1104 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1088 * and that can already be done by this point. So using the
1089 * buffer pointer from here on, after async I/O, is invalid.
1090 */
1091 if (!status && !(flags & XBF_ASYNC))
1092 status = xfs_buf_iowait(bp);
1093 1105
1094 return status; 1106 xfs_buf_delwri_queue(bp, 1);
1095} 1107}
1096 1108
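With the flag-driven dispatch of xfs_buf_iostart() gone, each write path has a purpose-named helper that sets exactly the flags it needs. Hedged illustrations of the two new entry points (alternatives, not a sequence; the wrappers are hypothetical):

	static int example_write_now(struct xfs_mount *mp, xfs_buf_t *bp)
	{
		return xfs_bawrite(mp, bp);	/* async write, issued immediately */
	}

	static void example_write_later(struct xfs_mount *mp, xfs_buf_t *bp)
	{
		xfs_bdwrite(mp, bp);		/* queued as delwri, flushed later */
	}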
1097STATIC_INLINE void 1109STATIC_INLINE void
@@ -1114,8 +1126,7 @@ xfs_buf_bio_end_io(
1114 unsigned int blocksize = bp->b_target->bt_bsize; 1126 unsigned int blocksize = bp->b_target->bt_bsize;
1115 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1127 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1116 1128
1117 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1129 xfs_buf_ioerror(bp, -error);
1118 bp->b_error = EIO;
1119 1130
1120 do { 1131 do {
1121 struct page *page = bvec->bv_page; 1132 struct page *page = bvec->bv_page;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 456519a088c7..288ae7c4c800 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -168,7 +168,7 @@ typedef struct xfs_buf {
168 struct completion b_iowait; /* queue for I/O waiters */ 168 struct completion b_iowait; /* queue for I/O waiters */
169 void *b_fspriv; 169 void *b_fspriv;
170 void *b_fspriv2; 170 void *b_fspriv2;
171 void *b_fspriv3; 171 struct xfs_mount *b_mount;
172 unsigned short b_error; /* error code on I/O */ 172 unsigned short b_error; /* error code on I/O */
173 unsigned int b_page_count; /* size of page array */ 173 unsigned int b_page_count; /* size of page array */
174 unsigned int b_offset; /* page offset in first page */ 174 unsigned int b_offset; /* page offset in first page */
@@ -214,9 +214,10 @@ extern void xfs_buf_lock(xfs_buf_t *);
214extern void xfs_buf_unlock(xfs_buf_t *); 214extern void xfs_buf_unlock(xfs_buf_t *);
215 215
216/* Buffer Read and Write Routines */ 216/* Buffer Read and Write Routines */
217extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
218extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
217extern void xfs_buf_ioend(xfs_buf_t *, int); 219extern void xfs_buf_ioend(xfs_buf_t *, int);
218extern void xfs_buf_ioerror(xfs_buf_t *, int); 220extern void xfs_buf_ioerror(xfs_buf_t *, int);
219extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
220extern int xfs_buf_iorequest(xfs_buf_t *); 221extern int xfs_buf_iorequest(xfs_buf_t *);
221extern int xfs_buf_iowait(xfs_buf_t *); 222extern int xfs_buf_iowait(xfs_buf_t *);
222extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 223extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
@@ -311,10 +312,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
311#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) 312#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
312#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) 313#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
313 314
314#define XFS_BUF_SHUT(bp) do { } while (0)
315#define XFS_BUF_UNSHUT(bp) do { } while (0)
316#define XFS_BUF_ISSHUT(bp) (0)
317
318#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) 315#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
319#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 316#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
320#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 317#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
@@ -334,8 +331,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
334#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 331#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
335#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 332#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
336#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 333#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
337#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3)
338#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val))
339#define XFS_BUF_SET_START(bp) do { } while (0) 334#define XFS_BUF_SET_START(bp) do { } while (0)
340#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) 335#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
341 336
@@ -366,14 +361,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
366#define XFS_BUF_TARGET(bp) ((bp)->b_target) 361#define XFS_BUF_TARGET(bp) ((bp)->b_target)
367#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) 362#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
368 363
369static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
370{
371 bp->b_fspriv3 = mp;
372 bp->b_strat = xfs_bdstrat_cb;
373 xfs_buf_delwri_dequeue(bp);
374 return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
375}
376
377static inline void xfs_buf_relse(xfs_buf_t *bp) 364static inline void xfs_buf_relse(xfs_buf_t *bp)
378{ 365{
379 if (!bp->b_relse) 366 if (!bp->b_relse)
@@ -414,17 +401,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
414 return error; 401 return error;
415} 402}
416 403
417/*
418 * No error can be returned from xfs_buf_iostart for delwri
419 * buffers as they are queued and no I/O is issued.
420 */
421static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
422{
423 bp->b_strat = xfs_bdstrat_cb;
424 bp->b_fspriv3 = mp;
425 (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
426}
427
428#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) 404#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
429 405
430#define xfs_iowait(bp) xfs_buf_iowait(bp) 406#define xfs_iowait(bp) xfs_buf_iowait(bp)
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index 8c022cd0ad67..55bddf3b6091 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -25,12 +25,4 @@
25 */ 25 */
26typedef const struct cred cred_t; 26typedef const struct cred cred_t;
27 27
28extern cred_t *sys_cred;
29
30/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
31static inline int capable_cred(cred_t *cr, int cid)
32{
33 return (cr == sys_cred) ? 1 : capable(cid);
34}
35
36#endif /* __XFS_CRED_H__ */ 28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 7f7abec25e14..595751f78350 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,7 +29,6 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_vfsops.h"
33 32
34/* 33/*
35 * Note that we only accept fileids which are long enough rather than allow 34 * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 3fee790f138b..e14c4e3aea0c 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -36,89 +36,54 @@
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_rw.h" 38#include "xfs_rw.h"
39#include "xfs_ioctl32.h"
40#include "xfs_vnodeops.h" 39#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h"
41#include "xfs_ioctl.h"
41 42
42#include <linux/dcache.h> 43#include <linux/dcache.h>
43#include <linux/smp_lock.h> 44#include <linux/smp_lock.h>
44 45
45static struct vm_operations_struct xfs_file_vm_ops; 46static struct vm_operations_struct xfs_file_vm_ops;
46 47
47STATIC_INLINE ssize_t 48STATIC ssize_t
48__xfs_file_read( 49xfs_file_aio_read(
49 struct kiocb *iocb, 50 struct kiocb *iocb,
50 const struct iovec *iov, 51 const struct iovec *iov,
51 unsigned long nr_segs, 52 unsigned long nr_segs,
52 int ioflags,
53 loff_t pos) 53 loff_t pos)
54{ 54{
55 struct file *file = iocb->ki_filp; 55 struct file *file = iocb->ki_filp;
56 int ioflags = IO_ISAIO;
56 57
57 BUG_ON(iocb->ki_pos != pos); 58 BUG_ON(iocb->ki_pos != pos);
58 if (unlikely(file->f_flags & O_DIRECT)) 59 if (unlikely(file->f_flags & O_DIRECT))
59 ioflags |= IO_ISDIRECT; 60 ioflags |= IO_ISDIRECT;
61 if (file->f_mode & FMODE_NOCMTIME)
62 ioflags |= IO_INVIS;
60 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 63 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
61 nr_segs, &iocb->ki_pos, ioflags); 64 nr_segs, &iocb->ki_pos, ioflags);
62} 65}
63 66
64STATIC ssize_t 67STATIC ssize_t
65xfs_file_aio_read( 68xfs_file_aio_write(
66 struct kiocb *iocb,
67 const struct iovec *iov,
68 unsigned long nr_segs,
69 loff_t pos)
70{
71 return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
72}
73
74STATIC ssize_t
75xfs_file_aio_read_invis(
76 struct kiocb *iocb,
77 const struct iovec *iov,
78 unsigned long nr_segs,
79 loff_t pos)
80{
81 return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
82}
83
84STATIC_INLINE ssize_t
85__xfs_file_write(
86 struct kiocb *iocb, 69 struct kiocb *iocb,
87 const struct iovec *iov, 70 const struct iovec *iov,
88 unsigned long nr_segs, 71 unsigned long nr_segs,
89 int ioflags,
90 loff_t pos) 72 loff_t pos)
91{ 73{
92 struct file *file = iocb->ki_filp; 74 struct file *file = iocb->ki_filp;
75 int ioflags = IO_ISAIO;
93 76
94 BUG_ON(iocb->ki_pos != pos); 77 BUG_ON(iocb->ki_pos != pos);
95 if (unlikely(file->f_flags & O_DIRECT)) 78 if (unlikely(file->f_flags & O_DIRECT))
96 ioflags |= IO_ISDIRECT; 79 ioflags |= IO_ISDIRECT;
80 if (file->f_mode & FMODE_NOCMTIME)
81 ioflags |= IO_INVIS;
97 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 82 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
98 &iocb->ki_pos, ioflags); 83 &iocb->ki_pos, ioflags);
99} 84}
100 85
101STATIC ssize_t 86STATIC ssize_t
102xfs_file_aio_write(
103 struct kiocb *iocb,
104 const struct iovec *iov,
105 unsigned long nr_segs,
106 loff_t pos)
107{
108 return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
109}
110
111STATIC ssize_t
112xfs_file_aio_write_invis(
113 struct kiocb *iocb,
114 const struct iovec *iov,
115 unsigned long nr_segs,
116 loff_t pos)
117{
118 return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
119}
120
121STATIC ssize_t
122xfs_file_splice_read( 87xfs_file_splice_read(
123 struct file *infilp, 88 struct file *infilp,
124 loff_t *ppos, 89 loff_t *ppos,
@@ -126,20 +91,13 @@ xfs_file_splice_read(
126 size_t len, 91 size_t len,
127 unsigned int flags) 92 unsigned int flags)
128{ 93{
129 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 94 int ioflags = 0;
130 infilp, ppos, pipe, len, flags, 0); 95
131} 96 if (infilp->f_mode & FMODE_NOCMTIME)
97 ioflags |= IO_INVIS;
132 98
133STATIC ssize_t
134xfs_file_splice_read_invis(
135 struct file *infilp,
136 loff_t *ppos,
137 struct pipe_inode_info *pipe,
138 size_t len,
139 unsigned int flags)
140{
141 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 99 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
142 infilp, ppos, pipe, len, flags, IO_INVIS); 100 infilp, ppos, pipe, len, flags, ioflags);
143} 101}
144 102
145STATIC ssize_t 103STATIC ssize_t
@@ -150,30 +108,49 @@ xfs_file_splice_write(
150 size_t len, 108 size_t len,
151 unsigned int flags) 109 unsigned int flags)
152{ 110{
153 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 111 int ioflags = 0;
154 pipe, outfilp, ppos, len, flags, 0); 112
155} 113 if (outfilp->f_mode & FMODE_NOCMTIME)
114 ioflags |= IO_INVIS;
156 115
157STATIC ssize_t
158xfs_file_splice_write_invis(
159 struct pipe_inode_info *pipe,
160 struct file *outfilp,
161 loff_t *ppos,
162 size_t len,
163 unsigned int flags)
164{
165 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 116 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
166 pipe, outfilp, ppos, len, flags, IO_INVIS); 117 pipe, outfilp, ppos, len, flags, ioflags);
167} 118}
168 119
169STATIC int 120STATIC int
170xfs_file_open( 121xfs_file_open(
171 struct inode *inode, 122 struct inode *inode,
172 struct file *filp) 123 struct file *file)
173{ 124{
174 if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 125 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
175 return -EFBIG; 126 return -EFBIG;
176 return -xfs_open(XFS_I(inode)); 127 if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
128 return -EIO;
129 return 0;
130}
131
132STATIC int
133xfs_dir_open(
134 struct inode *inode,
135 struct file *file)
136{
137 struct xfs_inode *ip = XFS_I(inode);
138 int mode;
139 int error;
140
141 error = xfs_file_open(inode, file);
142 if (error)
143 return error;
144
145 /*
146 * If there are any blocks, read-ahead block 0 as we're almost
147 * certain to have the next operation be a read there.
148 */
149 mode = xfs_ilock_map_shared(ip);
150 if (ip->i_d.di_nextents > 0)
151 xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
152 xfs_iunlock(ip, mode);
153 return 0;
177} 154}
178 155
179STATIC int 156STATIC int
@@ -227,7 +204,7 @@ xfs_file_readdir(
227 * point we can change the ->readdir prototype to include the 204 * point we can change the ->readdir prototype to include the
228 * buffer size. 205 * buffer size.
229 */ 206 */
230 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, inode->i_size); 207 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size);
231 208
232 error = xfs_readdir(ip, dirent, bufsize, 209 error = xfs_readdir(ip, dirent, bufsize,
233 (xfs_off_t *)&filp->f_pos, filldir); 210 (xfs_off_t *)&filp->f_pos, filldir);
@@ -248,48 +225,6 @@ xfs_file_mmap(
248 return 0; 225 return 0;
249} 226}
250 227
251STATIC long
252xfs_file_ioctl(
253 struct file *filp,
254 unsigned int cmd,
255 unsigned long p)
256{
257 int error;
258 struct inode *inode = filp->f_path.dentry->d_inode;
259
260 error = xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p);
261 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
262
263 /* NOTE: some of the ioctl's return positive #'s as a
264 * byte count indicating success, such as
265 * readlink_by_handle. So we don't "sign flip"
266 * like most other routines. This means true
267 * errors need to be returned as a negative value.
268 */
269 return error;
270}
271
272STATIC long
273xfs_file_ioctl_invis(
274 struct file *filp,
275 unsigned int cmd,
276 unsigned long p)
277{
278 int error;
279 struct inode *inode = filp->f_path.dentry->d_inode;
280
281 error = xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p);
282 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
283
284 /* NOTE: some of the ioctl's return positive #'s as a
285 * byte count indicating success, such as
286 * readlink_by_handle. So we don't "sign flip"
287 * like most other routines. This means true
288 * errors need to be returned as a negative value.
289 */
290 return error;
291}
292
293/* 228/*
294 * mmap()d file has taken write protection fault and is being made 229 * mmap()d file has taken write protection fault and is being made
295 * writable. We can set the page state up correctly for a writable 230 * writable. We can set the page state up correctly for a writable
@@ -325,26 +260,8 @@ const struct file_operations xfs_file_operations = {
325#endif 260#endif
326}; 261};
327 262
328const struct file_operations xfs_invis_file_operations = {
329 .llseek = generic_file_llseek,
330 .read = do_sync_read,
331 .write = do_sync_write,
332 .aio_read = xfs_file_aio_read_invis,
333 .aio_write = xfs_file_aio_write_invis,
334 .splice_read = xfs_file_splice_read_invis,
335 .splice_write = xfs_file_splice_write_invis,
336 .unlocked_ioctl = xfs_file_ioctl_invis,
337#ifdef CONFIG_COMPAT
338 .compat_ioctl = xfs_file_compat_invis_ioctl,
339#endif
340 .mmap = xfs_file_mmap,
341 .open = xfs_file_open,
342 .release = xfs_file_release,
343 .fsync = xfs_file_fsync,
344};
345
346
347const struct file_operations xfs_dir_file_operations = { 263const struct file_operations xfs_dir_file_operations = {
264 .open = xfs_dir_open,
348 .read = generic_read_dir, 265 .read = generic_read_dir,
349 .readdir = xfs_file_readdir, 266 .readdir = xfs_file_readdir,
350 .llseek = generic_file_llseek, 267 .llseek = generic_file_llseek,
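The removed *_invis file_operations table is replaced by one per-file mode bit, set at open-by-handle time and tested at each entry point; the recurring check, as a sketch (the wrapper is hypothetical):

	static int example_ioflags(struct file *file)
	{
		int ioflags = 0;

		if (file->f_mode & FMODE_NOCMTIME)  /* set in xfs_open_by_handle() */
			ioflags |= IO_INVIS;        /* timestamp-"invisible" I/O */
		return ioflags;
	}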
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 36caa6d957df..5aeb77776961 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -24,6 +24,10 @@ int fs_noerr(void) { return 0; }
24int fs_nosys(void) { return ENOSYS; } 24int fs_nosys(void) { return ENOSYS; }
25void fs_noval(void) { return; } 25void fs_noval(void) { return; }
26 26
27/*
 28 * Note: all filemap functions return negative error codes. These
29 * need to be inverted before returning to the xfs core functions.
30 */
27void 31void
28xfs_tosspages( 32xfs_tosspages(
29 xfs_inode_t *ip, 33 xfs_inode_t *ip,
@@ -53,7 +57,7 @@ xfs_flushinval_pages(
53 if (!ret) 57 if (!ret)
54 truncate_inode_pages(mapping, first); 58 truncate_inode_pages(mapping, first);
55 } 59 }
56 return ret; 60 return -ret;
57} 61}
58 62
59int 63int
@@ -72,10 +76,23 @@ xfs_flush_pages(
72 xfs_iflags_clear(ip, XFS_ITRUNCATED); 76 xfs_iflags_clear(ip, XFS_ITRUNCATED);
73 ret = filemap_fdatawrite(mapping); 77 ret = filemap_fdatawrite(mapping);
74 if (flags & XFS_B_ASYNC) 78 if (flags & XFS_B_ASYNC)
75 return ret; 79 return -ret;
76 ret2 = filemap_fdatawait(mapping); 80 ret2 = filemap_fdatawait(mapping);
77 if (!ret) 81 if (!ret)
78 ret = ret2; 82 ret = ret2;
79 } 83 }
80 return ret; 84 return -ret;
85}
86
87int
88xfs_wait_on_pages(
89 xfs_inode_t *ip,
90 xfs_off_t first,
91 xfs_off_t last)
92{
93 struct address_space *mapping = VFS_I(ip)->i_mapping;
94
95 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
96 return -filemap_fdatawait(mapping);
97 return 0;
81} 98}
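Spelling out the sign convention noted at the top of this file: Linux filemap helpers return 0 or a negative errno, while the XFS core expects positive errno values, hence the negations above. A minimal illustration (the wrapper is hypothetical):

	static int example_boundary(struct address_space *mapping)
	{
		int ret = filemap_fdatawrite(mapping);	/* 0 or -errno */

		return -ret;				/* 0 or +errno for XFS core */
	}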
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index ef90e64641e6..2ae8b1ccb02e 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -26,7 +26,6 @@
26 */ 26 */
27xfs_param_t xfs_params = { 27xfs_param_t xfs_params = {
28 /* MIN DFLT MAX */ 28 /* MIN DFLT MAX */
29 .restrict_chown = { 0, 1, 1 },
30 .sgid_inherit = { 0, 0, 1 }, 29 .sgid_inherit = { 0, 0, 1 },
31 .symlink_mode = { 0, 0, 1 }, 30 .symlink_mode = { 0, 0, 1 },
32 .panic_mask = { 0, 0, 255 }, 31 .panic_mask = { 0, 0, 255 },
@@ -43,10 +42,3 @@ xfs_param_t xfs_params = {
43 .inherit_nodfrg = { 0, 1, 1 }, 42 .inherit_nodfrg = { 0, 1, 1 },
44 .fstrm_timer = { 1, 30*100, 3600*100}, 43 .fstrm_timer = { 1, 30*100, 3600*100},
45}; 44};
46
47/*
48 * Global system credential structure.
49 */
50static cred_t sys_cred_val;
51cred_t *sys_cred = &sys_cred_val;
52
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
index 6eda8a3eb6f1..69f71caf061c 100644
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ b/fs/xfs/linux-2.6/xfs_globals.h
@@ -19,6 +19,5 @@
19#define __XFS_GLOBALS_H__ 19#define __XFS_GLOBALS_H__
20 20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */ 21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22extern cred_t *sys_cred;
23 22
24#endif /* __XFS_GLOBALS_H__ */ 23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 281cbd5a25cf..e5be1e0be802 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -68,26 +68,22 @@
68 * XFS_IOC_PATH_TO_HANDLE 68 * XFS_IOC_PATH_TO_HANDLE
69 * returns full handle for a path 69 * returns full handle for a path
70 */ 70 */
71STATIC int 71int
72xfs_find_handle( 72xfs_find_handle(
73 unsigned int cmd, 73 unsigned int cmd,
74 void __user *arg) 74 xfs_fsop_handlereq_t *hreq)
75{ 75{
76 int hsize; 76 int hsize;
77 xfs_handle_t handle; 77 xfs_handle_t handle;
78 xfs_fsop_handlereq_t hreq;
79 struct inode *inode; 78 struct inode *inode;
80 79
81 if (copy_from_user(&hreq, arg, sizeof(hreq)))
82 return -XFS_ERROR(EFAULT);
83
84 memset((char *)&handle, 0, sizeof(handle)); 80 memset((char *)&handle, 0, sizeof(handle));
85 81
86 switch (cmd) { 82 switch (cmd) {
87 case XFS_IOC_PATH_TO_FSHANDLE: 83 case XFS_IOC_PATH_TO_FSHANDLE:
88 case XFS_IOC_PATH_TO_HANDLE: { 84 case XFS_IOC_PATH_TO_HANDLE: {
89 struct path path; 85 struct path path;
90 int error = user_lpath((const char __user *)hreq.path, &path); 86 int error = user_lpath((const char __user *)hreq->path, &path);
91 if (error) 87 if (error)
92 return error; 88 return error;
93 89
@@ -101,7 +97,7 @@ xfs_find_handle(
101 case XFS_IOC_FD_TO_HANDLE: { 97 case XFS_IOC_FD_TO_HANDLE: {
102 struct file *file; 98 struct file *file;
103 99
104 file = fget(hreq.fd); 100 file = fget(hreq->fd);
105 if (!file) 101 if (!file)
106 return -EBADF; 102 return -EBADF;
107 103
@@ -158,8 +154,8 @@ xfs_find_handle(
158 } 154 }
159 155
160 /* now copy our handle into the user buffer & write out the size */ 156 /* now copy our handle into the user buffer & write out the size */
161 if (copy_to_user(hreq.ohandle, &handle, hsize) || 157 if (copy_to_user(hreq->ohandle, &handle, hsize) ||
162 copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) { 158 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
163 iput(inode); 159 iput(inode);
164 return -XFS_ERROR(EFAULT); 160 return -XFS_ERROR(EFAULT);
165 } 161 }
@@ -249,10 +245,10 @@ xfs_vget_fsop_handlereq(
249 return 0; 245 return 0;
250} 246}
251 247
252STATIC int 248int
253xfs_open_by_handle( 249xfs_open_by_handle(
254 xfs_mount_t *mp, 250 xfs_mount_t *mp,
255 void __user *arg, 251 xfs_fsop_handlereq_t *hreq,
256 struct file *parfilp, 252 struct file *parfilp,
257 struct inode *parinode) 253 struct inode *parinode)
258{ 254{
@@ -263,14 +259,11 @@ xfs_open_by_handle(
263 struct file *filp; 259 struct file *filp;
264 struct inode *inode; 260 struct inode *inode;
265 struct dentry *dentry; 261 struct dentry *dentry;
266 xfs_fsop_handlereq_t hreq;
267 262
268 if (!capable(CAP_SYS_ADMIN)) 263 if (!capable(CAP_SYS_ADMIN))
269 return -XFS_ERROR(EPERM); 264 return -XFS_ERROR(EPERM);
270 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
271 return -XFS_ERROR(EFAULT);
272 265
273 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); 266 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
274 if (error) 267 if (error)
275 return -error; 268 return -error;
276 269
@@ -281,10 +274,10 @@ xfs_open_by_handle(
281 } 274 }
282 275
283#if BITS_PER_LONG != 32 276#if BITS_PER_LONG != 32
284 hreq.oflags |= O_LARGEFILE; 277 hreq->oflags |= O_LARGEFILE;
285#endif 278#endif
286 /* Put open permission in namei format. */ 279 /* Put open permission in namei format. */
287 permflag = hreq.oflags; 280 permflag = hreq->oflags;
288 if ((permflag+1) & O_ACCMODE) 281 if ((permflag+1) & O_ACCMODE)
289 permflag++; 282 permflag++;
290 if (permflag & O_TRUNC) 283 if (permflag & O_TRUNC)
@@ -322,15 +315,16 @@ xfs_open_by_handle(
322 mntget(parfilp->f_path.mnt); 315 mntget(parfilp->f_path.mnt);
323 316
324 /* Create file pointer. */ 317 /* Create file pointer. */
325 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags, cred); 318 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
326 if (IS_ERR(filp)) { 319 if (IS_ERR(filp)) {
327 put_unused_fd(new_fd); 320 put_unused_fd(new_fd);
328 return -XFS_ERROR(-PTR_ERR(filp)); 321 return -XFS_ERROR(-PTR_ERR(filp));
329 } 322 }
323
330 if (inode->i_mode & S_IFREG) { 324 if (inode->i_mode & S_IFREG) {
331 /* invisible operation should not change atime */ 325 /* invisible operation should not change atime */
332 filp->f_flags |= O_NOATIME; 326 filp->f_flags |= O_NOATIME;
333 filp->f_op = &xfs_invis_file_operations; 327 filp->f_mode |= FMODE_NOCMTIME;
334 } 328 }
335 329
336 fd_install(new_fd, filp); 330 fd_install(new_fd, filp);
@@ -363,24 +357,21 @@ do_readlink(
363} 357}
364 358
365 359
366STATIC int 360int
367xfs_readlink_by_handle( 361xfs_readlink_by_handle(
368 xfs_mount_t *mp, 362 xfs_mount_t *mp,
369 void __user *arg, 363 xfs_fsop_handlereq_t *hreq,
370 struct inode *parinode) 364 struct inode *parinode)
371{ 365{
372 struct inode *inode; 366 struct inode *inode;
373 xfs_fsop_handlereq_t hreq;
374 __u32 olen; 367 __u32 olen;
375 void *link; 368 void *link;
376 int error; 369 int error;
377 370
378 if (!capable(CAP_SYS_ADMIN)) 371 if (!capable(CAP_SYS_ADMIN))
379 return -XFS_ERROR(EPERM); 372 return -XFS_ERROR(EPERM);
380 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
381 return -XFS_ERROR(EFAULT);
382 373
383 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); 374 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
384 if (error) 375 if (error)
385 return -error; 376 return -error;
386 377
@@ -390,7 +381,7 @@ xfs_readlink_by_handle(
390 goto out_iput; 381 goto out_iput;
391 } 382 }
392 383
393 if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { 384 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
394 error = -XFS_ERROR(EFAULT); 385 error = -XFS_ERROR(EFAULT);
395 goto out_iput; 386 goto out_iput;
396 } 387 }
@@ -402,7 +393,7 @@ xfs_readlink_by_handle(
402 error = -xfs_readlink(XFS_I(inode), link); 393 error = -xfs_readlink(XFS_I(inode), link);
403 if (error) 394 if (error)
404 goto out_kfree; 395 goto out_kfree;
405 error = do_readlink(hreq.ohandle, olen, link); 396 error = do_readlink(hreq->ohandle, olen, link);
406 if (error) 397 if (error)
407 goto out_kfree; 398 goto out_kfree;
408 399
@@ -501,7 +492,7 @@ xfs_attrlist_by_handle(
501 return -error; 492 return -error;
502} 493}
503 494
504STATIC int 495int
505xfs_attrmulti_attr_get( 496xfs_attrmulti_attr_get(
506 struct inode *inode, 497 struct inode *inode,
507 char *name, 498 char *name,
@@ -530,7 +521,7 @@ xfs_attrmulti_attr_get(
530 return error; 521 return error;
531} 522}
532 523
533STATIC int 524int
534xfs_attrmulti_attr_set( 525xfs_attrmulti_attr_set(
535 struct inode *inode, 526 struct inode *inode,
536 char *name, 527 char *name,
@@ -560,7 +551,7 @@ xfs_attrmulti_attr_set(
560 return error; 551 return error;
561} 552}
562 553
563STATIC int 554int
564xfs_attrmulti_attr_remove( 555xfs_attrmulti_attr_remove(
565 struct inode *inode, 556 struct inode *inode,
566 char *name, 557 char *name,
@@ -662,19 +653,26 @@ xfs_attrmulti_by_handle(
662 return -error; 653 return -error;
663} 654}
664 655
665STATIC int 656int
666xfs_ioc_space( 657xfs_ioc_space(
667 struct xfs_inode *ip, 658 struct xfs_inode *ip,
668 struct inode *inode, 659 struct inode *inode,
669 struct file *filp, 660 struct file *filp,
670 int ioflags, 661 int ioflags,
671 unsigned int cmd, 662 unsigned int cmd,
672 void __user *arg) 663 xfs_flock64_t *bf)
673{ 664{
674 xfs_flock64_t bf;
675 int attr_flags = 0; 665 int attr_flags = 0;
676 int error; 666 int error;
677 667
668 /*
669 * Only allow the sys admin to reserve space unless
670 * unwritten extents are enabled.
671 */
672 if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
673 !capable(CAP_SYS_ADMIN))
674 return -XFS_ERROR(EPERM);
675
678 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) 676 if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
679 return -XFS_ERROR(EPERM); 677 return -XFS_ERROR(EPERM);
680 678
@@ -684,16 +682,12 @@ xfs_ioc_space(
684 if (!S_ISREG(inode->i_mode)) 682 if (!S_ISREG(inode->i_mode))
685 return -XFS_ERROR(EINVAL); 683 return -XFS_ERROR(EINVAL);
686 684
687 if (copy_from_user(&bf, arg, sizeof(bf)))
688 return -XFS_ERROR(EFAULT);
689
690 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 685 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
691 attr_flags |= XFS_ATTR_NONBLOCK; 686 attr_flags |= XFS_ATTR_NONBLOCK;
692 if (ioflags & IO_INVIS) 687 if (ioflags & IO_INVIS)
693 attr_flags |= XFS_ATTR_DMI; 688 attr_flags |= XFS_ATTR_DMI;
694 689
695 error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, 690 error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
696 NULL, attr_flags);
697 return -error; 691 return -error;
698} 692}
699 693
@@ -1105,10 +1099,6 @@ xfs_ioctl_setattr(
1105 1099
1106 /* 1100 /*
1107 * Change file ownership. Must be the owner or privileged. 1101 * Change file ownership. Must be the owner or privileged.
1108 * If the system was configured with the "restricted_chown"
1109 * option, the owner is not permitted to give away the file,
1110 * and can change the group id only to a group of which he
1111 * or she is a member.
1112 */ 1102 */
1113 if (mask & FSX_PROJID) { 1103 if (mask & FSX_PROJID) {
1114 /* 1104 /*
@@ -1137,7 +1127,7 @@ xfs_ioctl_setattr(
1137 * the superblock version number since projids didn't 1127 * the superblock version number since projids didn't
1138 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. 1128 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
1139 */ 1129 */
1140 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) 1130 if (ip->i_d.di_version == 1)
1141 xfs_bump_ino_vers2(tp, ip); 1131 xfs_bump_ino_vers2(tp, ip);
1142 } 1132 }
1143 1133
@@ -1256,43 +1246,67 @@ xfs_ioc_setxflags(
1256} 1246}
1257 1247
1258STATIC int 1248STATIC int
1249xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1250{
1251 struct getbmap __user *base = *ap;
1252
1253 /* copy only getbmap portion (not getbmapx) */
1254 if (copy_to_user(base, bmv, sizeof(struct getbmap)))
1255 return XFS_ERROR(EFAULT);
1256
1257 *ap += sizeof(struct getbmap);
1258 return 0;
1259}
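The formatter callback decouples record layout from the extent walk, so one core routine can serve both the getbmap and getbmapx layouts. The calling convention assumed here (xfs_getbmap() itself is outside this hunk) would look roughly like this sketch:

	/* Assumed core loop, illustrative only: one formatter call per extent;
	 * the callback copies a record out, advances the opaque cursor *ap,
	 * and may stop the walk via *full or a positive errno. */
	static int emit_records(struct getbmapx *recs, int nrecs,
				int (*fmt)(void **, struct getbmapx *, int *),
				void __user *out)
	{
		void *ap = (void *)out;
		int i, full = 0, error;

		for (i = 0; i < nrecs && !full; i++) {
			error = fmt(&ap, &recs[i], &full);
			if (error)
				return error;
		}
		return 0;
	}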
1260
1261STATIC int
1259xfs_ioc_getbmap( 1262xfs_ioc_getbmap(
1260 struct xfs_inode *ip, 1263 struct xfs_inode *ip,
1261 int ioflags, 1264 int ioflags,
1262 unsigned int cmd, 1265 unsigned int cmd,
1263 void __user *arg) 1266 void __user *arg)
1264{ 1267{
1265 struct getbmap bm; 1268 struct getbmapx bmx;
1266 int iflags;
1267 int error; 1269 int error;
1268 1270
1269 if (copy_from_user(&bm, arg, sizeof(bm))) 1271 if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
1270 return -XFS_ERROR(EFAULT); 1272 return -XFS_ERROR(EFAULT);
1271 1273
1272 if (bm.bmv_count < 2) 1274 if (bmx.bmv_count < 2)
1273 return -XFS_ERROR(EINVAL); 1275 return -XFS_ERROR(EINVAL);
1274 1276
1275 iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1277 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1276 if (ioflags & IO_INVIS) 1278 if (ioflags & IO_INVIS)
1277 iflags |= BMV_IF_NO_DMAPI_READ; 1279 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1278 1280
1279 error = xfs_getbmap(ip, &bm, (struct getbmap __user *)arg+1, iflags); 1281 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
1282 (struct getbmap *)arg+1);
1280 if (error) 1283 if (error)
1281 return -error; 1284 return -error;
1282 1285
1283 if (copy_to_user(arg, &bm, sizeof(bm))) 1286 /* copy back header - only size of getbmap */
1287 if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
1284 return -XFS_ERROR(EFAULT); 1288 return -XFS_ERROR(EFAULT);
1285 return 0; 1289 return 0;
1286} 1290}
1287 1291
1288STATIC int 1292STATIC int
1293xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
1294{
1295 struct getbmapx __user *base = *ap;
1296
1297 if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
1298 return XFS_ERROR(EFAULT);
1299
1300 *ap += sizeof(struct getbmapx);
1301 return 0;
1302}
1303
1304STATIC int
1289xfs_ioc_getbmapx( 1305xfs_ioc_getbmapx(
1290 struct xfs_inode *ip, 1306 struct xfs_inode *ip,
1291 void __user *arg) 1307 void __user *arg)
1292{ 1308{
1293 struct getbmapx bmx; 1309 struct getbmapx bmx;
1294 struct getbmap bm;
1295 int iflags;
1296 int error; 1310 int error;
1297 1311
1298 if (copy_from_user(&bmx, arg, sizeof(bmx))) 1312 if (copy_from_user(&bmx, arg, sizeof(bmx)))
@@ -1301,46 +1315,46 @@ xfs_ioc_getbmapx(
1301 if (bmx.bmv_count < 2) 1315 if (bmx.bmv_count < 2)
1302 return -XFS_ERROR(EINVAL); 1316 return -XFS_ERROR(EINVAL);
1303 1317
1304 /* 1318 if (bmx.bmv_iflags & (~BMV_IF_VALID))
1305 * Map input getbmapx structure to a getbmap
1306 * structure for xfs_getbmap.
1307 */
1308 GETBMAP_CONVERT(bmx, bm);
1309
1310 iflags = bmx.bmv_iflags;
1311
1312 if (iflags & (~BMV_IF_VALID))
1313 return -XFS_ERROR(EINVAL); 1319 return -XFS_ERROR(EINVAL);
1314 1320
1315 iflags |= BMV_IF_EXTENDED; 1321 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
1316 1322 (struct getbmapx *)arg+1);
1317 error = xfs_getbmap(ip, &bm, (struct getbmapx __user *)arg+1, iflags);
1318 if (error) 1323 if (error)
1319 return -error; 1324 return -error;
1320 1325
1321 GETBMAP_CONVERT(bm, bmx); 1326 /* copy back header */
1322 1327 if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
1323 if (copy_to_user(arg, &bmx, sizeof(bmx)))
1324 return -XFS_ERROR(EFAULT); 1328 return -XFS_ERROR(EFAULT);
1325 1329
1326 return 0; 1330 return 0;
1327} 1331}
1328 1332
1329int 1333/*
1330xfs_ioctl( 1334 * Note: some of the ioctls return positive numbers as a
1331 xfs_inode_t *ip, 1335 * byte count indicating success, such as readlink_by_handle.
1336 * So we don't "sign flip" like most other routines. This means
1337 * true errors need to be returned as a negative value.
1338 */
1339long
1340xfs_file_ioctl(
1332 struct file *filp, 1341 struct file *filp,
1333 int ioflags,
1334 unsigned int cmd, 1342 unsigned int cmd,
1335 void __user *arg) 1343 unsigned long p)
1336{ 1344{
1337 struct inode *inode = filp->f_path.dentry->d_inode; 1345 struct inode *inode = filp->f_path.dentry->d_inode;
1338 xfs_mount_t *mp = ip->i_mount; 1346 struct xfs_inode *ip = XFS_I(inode);
1347 struct xfs_mount *mp = ip->i_mount;
1348 void __user *arg = (void __user *)p;
1349 int ioflags = 0;
1339 int error; 1350 int error;
1340 1351
1341 xfs_itrace_entry(XFS_I(inode)); 1352 if (filp->f_mode & FMODE_NOCMTIME)
1342 switch (cmd) { 1353 ioflags |= IO_INVIS;
1343 1354
1355 xfs_itrace_entry(ip);
1356
1357 switch (cmd) {
1344 case XFS_IOC_ALLOCSP: 1358 case XFS_IOC_ALLOCSP:
1345 case XFS_IOC_FREESP: 1359 case XFS_IOC_FREESP:
1346 case XFS_IOC_RESVSP: 1360 case XFS_IOC_RESVSP:
@@ -1348,17 +1362,13 @@ xfs_ioctl(
1348 case XFS_IOC_ALLOCSP64: 1362 case XFS_IOC_ALLOCSP64:
1349 case XFS_IOC_FREESP64: 1363 case XFS_IOC_FREESP64:
1350 case XFS_IOC_RESVSP64: 1364 case XFS_IOC_RESVSP64:
1351 case XFS_IOC_UNRESVSP64: 1365 case XFS_IOC_UNRESVSP64: {
1352 /* 1366 xfs_flock64_t bf;
1353 * Only allow the sys admin to reserve space unless
1354 * unwritten extents are enabled.
1355 */
1356 if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
1357 !capable(CAP_SYS_ADMIN))
1358 return -EPERM;
1359
1360 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
1361 1367
1368 if (copy_from_user(&bf, arg, sizeof(bf)))
1369 return -XFS_ERROR(EFAULT);
1370 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
1371 }
1362 case XFS_IOC_DIOINFO: { 1372 case XFS_IOC_DIOINFO: {
1363 struct dioattr da; 1373 struct dioattr da;
1364 xfs_buftarg_t *target = 1374 xfs_buftarg_t *target =
@@ -1418,18 +1428,30 @@ xfs_ioctl(
1418 1428
1419 case XFS_IOC_FD_TO_HANDLE: 1429 case XFS_IOC_FD_TO_HANDLE:
1420 case XFS_IOC_PATH_TO_HANDLE: 1430 case XFS_IOC_PATH_TO_HANDLE:
1421 case XFS_IOC_PATH_TO_FSHANDLE: 1431 case XFS_IOC_PATH_TO_FSHANDLE: {
1422 return xfs_find_handle(cmd, arg); 1432 xfs_fsop_handlereq_t hreq;
1423 1433
1424 case XFS_IOC_OPEN_BY_HANDLE: 1434 if (copy_from_user(&hreq, arg, sizeof(hreq)))
1425 return xfs_open_by_handle(mp, arg, filp, inode); 1435 return -XFS_ERROR(EFAULT);
1436 return xfs_find_handle(cmd, &hreq);
1437 }
1438 case XFS_IOC_OPEN_BY_HANDLE: {
1439 xfs_fsop_handlereq_t hreq;
1426 1440
1441 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1442 return -XFS_ERROR(EFAULT);
1443 return xfs_open_by_handle(mp, &hreq, filp, inode);
1444 }
1427 case XFS_IOC_FSSETDM_BY_HANDLE: 1445 case XFS_IOC_FSSETDM_BY_HANDLE:
1428 return xfs_fssetdm_by_handle(mp, arg, inode); 1446 return xfs_fssetdm_by_handle(mp, arg, inode);
1429 1447
1430 case XFS_IOC_READLINK_BY_HANDLE: 1448 case XFS_IOC_READLINK_BY_HANDLE: {
1431 return xfs_readlink_by_handle(mp, arg, inode); 1449 xfs_fsop_handlereq_t hreq;
1432 1450
1451 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1452 return -XFS_ERROR(EFAULT);
1453 return xfs_readlink_by_handle(mp, &hreq, inode);
1454 }
1433 case XFS_IOC_ATTRLIST_BY_HANDLE: 1455 case XFS_IOC_ATTRLIST_BY_HANDLE:
1434 return xfs_attrlist_by_handle(mp, arg, inode); 1456 return xfs_attrlist_by_handle(mp, arg, inode);
1435 1457
@@ -1437,7 +1459,11 @@ xfs_ioctl(
1437 return xfs_attrmulti_by_handle(mp, arg, filp, inode); 1459 return xfs_attrmulti_by_handle(mp, arg, filp, inode);
1438 1460
1439 case XFS_IOC_SWAPEXT: { 1461 case XFS_IOC_SWAPEXT: {
1440 error = xfs_swapext((struct xfs_swapext __user *)arg); 1462 struct xfs_swapext sxp;
1463
1464 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
1465 return -XFS_ERROR(EFAULT);
1466 error = xfs_swapext(&sxp);
1441 return -error; 1467 return -error;
1442 } 1468 }
1443 1469
@@ -1493,9 +1519,6 @@ xfs_ioctl(
1493 case XFS_IOC_FSGROWFSDATA: { 1519 case XFS_IOC_FSGROWFSDATA: {
1494 xfs_growfs_data_t in; 1520 xfs_growfs_data_t in;
1495 1521
1496 if (!capable(CAP_SYS_ADMIN))
1497 return -EPERM;
1498
1499 if (copy_from_user(&in, arg, sizeof(in))) 1522 if (copy_from_user(&in, arg, sizeof(in)))
1500 return -XFS_ERROR(EFAULT); 1523 return -XFS_ERROR(EFAULT);
1501 1524
@@ -1506,9 +1529,6 @@ xfs_ioctl(
1506 case XFS_IOC_FSGROWFSLOG: { 1529 case XFS_IOC_FSGROWFSLOG: {
1507 xfs_growfs_log_t in; 1530 xfs_growfs_log_t in;
1508 1531
1509 if (!capable(CAP_SYS_ADMIN))
1510 return -EPERM;
1511
1512 if (copy_from_user(&in, arg, sizeof(in))) 1532 if (copy_from_user(&in, arg, sizeof(in)))
1513 return -XFS_ERROR(EFAULT); 1533 return -XFS_ERROR(EFAULT);
1514 1534
@@ -1519,9 +1539,6 @@ xfs_ioctl(
1519 case XFS_IOC_FSGROWFSRT: { 1539 case XFS_IOC_FSGROWFSRT: {
1520 xfs_growfs_rt_t in; 1540 xfs_growfs_rt_t in;
1521 1541
1522 if (!capable(CAP_SYS_ADMIN))
1523 return -EPERM;
1524
1525 if (copy_from_user(&in, arg, sizeof(in))) 1542 if (copy_from_user(&in, arg, sizeof(in)))
1526 return -XFS_ERROR(EFAULT); 1543 return -XFS_ERROR(EFAULT);
1527 1544
@@ -1529,21 +1546,6 @@ xfs_ioctl(
1529 return -error; 1546 return -error;
1530 } 1547 }
1531 1548
1532 case XFS_IOC_FREEZE:
1533 if (!capable(CAP_SYS_ADMIN))
1534 return -EPERM;
1535
1536 if (inode->i_sb->s_frozen == SB_UNFROZEN)
1537 freeze_bdev(inode->i_sb->s_bdev);
1538 return 0;
1539
1540 case XFS_IOC_THAW:
1541 if (!capable(CAP_SYS_ADMIN))
1542 return -EPERM;
1543 if (inode->i_sb->s_frozen != SB_UNFROZEN)
1544 thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
1545 return 0;
1546
1547 case XFS_IOC_GOINGDOWN: { 1549 case XFS_IOC_GOINGDOWN: {
1548 __uint32_t in; 1550 __uint32_t in;
1549 1551
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
new file mode 100644
index 000000000000..8c16bf2d7e03
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -0,0 +1,82 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_IOCTL_H__
19#define __XFS_IOCTL_H__
20
21extern int
22xfs_ioc_space(
23 struct xfs_inode *ip,
24 struct inode *inode,
25 struct file *filp,
26 int ioflags,
27 unsigned int cmd,
28 xfs_flock64_t *bf);
29
30extern int
31xfs_find_handle(
32 unsigned int cmd,
33 xfs_fsop_handlereq_t *hreq);
34
35extern int
36xfs_open_by_handle(
37 xfs_mount_t *mp,
38 xfs_fsop_handlereq_t *hreq,
39 struct file *parfilp,
40 struct inode *parinode);
41
42extern int
43xfs_readlink_by_handle(
44 xfs_mount_t *mp,
45 xfs_fsop_handlereq_t *hreq,
46 struct inode *parinode);
47
48extern int
49xfs_attrmulti_attr_get(
50 struct inode *inode,
51 char *name,
52 char __user *ubuf,
53 __uint32_t *len,
54 __uint32_t flags);
55
56extern int
 57xfs_attrmulti_attr_set(
58 struct inode *inode,
59 char *name,
60 const char __user *ubuf,
61 __uint32_t len,
62 __uint32_t flags);
63
64extern int
65xfs_attrmulti_attr_remove(
66 struct inode *inode,
67 char *name,
68 __uint32_t flags);
69
70extern long
71xfs_file_ioctl(
72 struct file *filp,
73 unsigned int cmd,
74 unsigned long p);
75
76extern long
77xfs_file_compat_ioctl(
78 struct file *file,
79 unsigned int cmd,
80 unsigned long arg);
81
82#endif
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index a4b254eb43b2..50903ad3182e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -16,11 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/init.h>
20#include <linux/ioctl.h> 19#include <linux/ioctl.h>
21#include <linux/syscalls.h>
22#include <linux/types.h>
23#include <linux/fs.h>
24#include <asm/uaccess.h> 20#include <asm/uaccess.h>
25#include "xfs.h" 21#include "xfs.h"
26#include "xfs_fs.h" 22#include "xfs_fs.h"
@@ -36,7 +32,6 @@
36#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
37#include "xfs_attr_sf.h" 33#include "xfs_attr_sf.h"
38#include "xfs_dir2_sf.h" 34#include "xfs_dir2_sf.h"
39#include "xfs_vfs.h"
40#include "xfs_vnode.h" 35#include "xfs_vnode.h"
41#include "xfs_dinode.h" 36#include "xfs_dinode.h"
42#include "xfs_inode.h" 37#include "xfs_inode.h"
@@ -44,221 +39,219 @@
44#include "xfs_error.h" 39#include "xfs_error.h"
45#include "xfs_dfrag.h" 40#include "xfs_dfrag.h"
46#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_fsops.h"
43#include "xfs_alloc.h"
44#include "xfs_rtalloc.h"
45#include "xfs_attr.h"
46#include "xfs_ioctl.h"
47#include "xfs_ioctl32.h" 47#include "xfs_ioctl32.h"
48 48
49#define _NATIVE_IOC(cmd, type) \ 49#define _NATIVE_IOC(cmd, type) \
50 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) 50 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
51 51
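For orientation: _NATIVE_IOC rebuilds a command number with the native payload size while keeping its direction, type and sequence-number bits. A minimal sketch of the translation, not part of this commit:

	/* sketch: a 32-bit handle command becomes its native twin */
	unsigned int cmd32 = _IOWR('X', 108, struct compat_xfs_fsop_handlereq);
	unsigned int cmd = _NATIVE_IOC(cmd32, struct xfs_fsop_handlereq);
	/* cmd == XFS_IOC_READLINK_BY_HANDLE: same _IOC_DIR/_IOC_TYPE/_IOC_NR,
	 * only the _IOC_SIZE field now holds the native structure size */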
52#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 52#ifdef BROKEN_X86_ALIGNMENT
53#define BROKEN_X86_ALIGNMENT 53STATIC int
54#define _PACKED __attribute__((packed)) 54xfs_compat_flock64_copyin(
55/* on ia32 l_start is on a 32-bit boundary */ 55 xfs_flock64_t *bf,
56typedef struct xfs_flock64_32 { 56 compat_xfs_flock64_t __user *arg32)
57 __s16 l_type;
58 __s16 l_whence;
59 __s64 l_start __attribute__((packed));
60 /* len == 0 means until end of file */
61 __s64 l_len __attribute__((packed));
62 __s32 l_sysid;
63 __u32 l_pid;
64 __s32 l_pad[4]; /* reserve area */
65} xfs_flock64_32_t;
66
67#define XFS_IOC_ALLOCSP_32 _IOW ('X', 10, struct xfs_flock64_32)
68#define XFS_IOC_FREESP_32 _IOW ('X', 11, struct xfs_flock64_32)
69#define XFS_IOC_ALLOCSP64_32 _IOW ('X', 36, struct xfs_flock64_32)
70#define XFS_IOC_FREESP64_32 _IOW ('X', 37, struct xfs_flock64_32)
71#define XFS_IOC_RESVSP_32 _IOW ('X', 40, struct xfs_flock64_32)
72#define XFS_IOC_UNRESVSP_32 _IOW ('X', 41, struct xfs_flock64_32)
73#define XFS_IOC_RESVSP64_32 _IOW ('X', 42, struct xfs_flock64_32)
74#define XFS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct xfs_flock64_32)
75
76/* just account for different alignment */
77STATIC unsigned long
78xfs_ioctl32_flock(
79 unsigned long arg)
80{ 57{
81 xfs_flock64_32_t __user *p32 = (void __user *)arg; 58 if (get_user(bf->l_type, &arg32->l_type) ||
82 xfs_flock64_t __user *p = compat_alloc_user_space(sizeof(*p)); 59 get_user(bf->l_whence, &arg32->l_whence) ||
83 60 get_user(bf->l_start, &arg32->l_start) ||
84 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || 61 get_user(bf->l_len, &arg32->l_len) ||
85 copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || 62 get_user(bf->l_sysid, &arg32->l_sysid) ||
86 copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || 63 get_user(bf->l_pid, &arg32->l_pid) ||
87 copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || 64 copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
88 copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || 65 return -XFS_ERROR(EFAULT);
89 copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || 66 return 0;
90 copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
91 return -EFAULT;
92
93 return (unsigned long)p;
94} 67}
95 68
96typedef struct compat_xfs_fsop_geom_v1 { 69STATIC int
97 __u32 blocksize; /* filesystem (data) block size */ 70xfs_compat_ioc_fsgeometry_v1(
98 __u32 rtextsize; /* realtime extent size */ 71 struct xfs_mount *mp,
99 __u32 agblocks; /* fsblocks in an AG */ 72 compat_xfs_fsop_geom_v1_t __user *arg32)
100 __u32 agcount; /* number of allocation groups */
101 __u32 logblocks; /* fsblocks in the log */
102 __u32 sectsize; /* (data) sector size, bytes */
103 __u32 inodesize; /* inode size in bytes */
104 __u32 imaxpct; /* max allowed inode space(%) */
105 __u64 datablocks; /* fsblocks in data subvolume */
106 __u64 rtblocks; /* fsblocks in realtime subvol */
107 __u64 rtextents; /* rt extents in realtime subvol*/
108 __u64 logstart; /* starting fsblock of the log */
109 unsigned char uuid[16]; /* unique id of the filesystem */
110 __u32 sunit; /* stripe unit, fsblocks */
111 __u32 swidth; /* stripe width, fsblocks */
112 __s32 version; /* structure version */
113 __u32 flags; /* superblock version flags */
114 __u32 logsectsize; /* log sector size, bytes */
115 __u32 rtsectsize; /* realtime sector size, bytes */
116 __u32 dirblocksize; /* directory block size, bytes */
117} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
118
119#define XFS_IOC_FSGEOMETRY_V1_32 \
120 _IOR ('X', 100, struct compat_xfs_fsop_geom_v1)
121
122STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg)
123{ 73{
124 compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg; 74 xfs_fsop_geom_t fsgeo;
125 xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p)); 75 int error;
126 76
127 if (copy_in_user(p, p32, sizeof(*p32))) 77 error = xfs_fs_geometry(mp, &fsgeo, 3);
128 return -EFAULT; 78 if (error)
129 return (unsigned long)p; 79 return -error;
80 /* The 32-bit variant simply has some padding at the end */
81 if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
82 return -XFS_ERROR(EFAULT);
83 return 0;
130} 84}
131 85
132typedef struct compat_xfs_inogrp { 86STATIC int
133 __u64 xi_startino; /* starting inode number */ 87xfs_compat_growfs_data_copyin(
134 __s32 xi_alloccount; /* # bits set in allocmask */ 88 struct xfs_growfs_data *in,
135 __u64 xi_allocmask; /* mask of allocated inodes */ 89 compat_xfs_growfs_data_t __user *arg32)
136} __attribute__((packed)) compat_xfs_inogrp_t;
137
138STATIC int xfs_inumbers_fmt_compat(
139 void __user *ubuffer,
140 const xfs_inogrp_t *buffer,
141 long count,
142 long *written)
143{ 90{
144 compat_xfs_inogrp_t __user *p32 = ubuffer; 91 if (get_user(in->newblocks, &arg32->newblocks) ||
145 long i; 92 get_user(in->imaxpct, &arg32->imaxpct))
93 return -XFS_ERROR(EFAULT);
94 return 0;
95}
96
97STATIC int
98xfs_compat_growfs_rt_copyin(
99 struct xfs_growfs_rt *in,
100 compat_xfs_growfs_rt_t __user *arg32)
101{
102 if (get_user(in->newblocks, &arg32->newblocks) ||
103 get_user(in->extsize, &arg32->extsize))
104 return -XFS_ERROR(EFAULT);
105 return 0;
106}
107
108STATIC int
109xfs_inumbers_fmt_compat(
110 void __user *ubuffer,
111 const xfs_inogrp_t *buffer,
112 long count,
113 long *written)
114{
115 compat_xfs_inogrp_t __user *p32 = ubuffer;
116 long i;
146 117
147 for (i = 0; i < count; i++) { 118 for (i = 0; i < count; i++) {
148 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || 119 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
149 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || 120 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
150 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) 121 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
151 return -EFAULT; 122 return -XFS_ERROR(EFAULT);
152 } 123 }
153 *written = count * sizeof(*p32); 124 *written = count * sizeof(*p32);
154 return 0; 125 return 0;
155} 126}
156 127
157#else 128#else
158
159#define xfs_inumbers_fmt_compat xfs_inumbers_fmt 129#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
160#define _PACKED 130#endif /* BROKEN_X86_ALIGNMENT */
161 131
162#endif 132STATIC int
133xfs_ioctl32_bstime_copyin(
134 xfs_bstime_t *bstime,
135 compat_xfs_bstime_t __user *bstime32)
136{
137 compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */
163 138
164/* XFS_IOC_FSBULKSTAT and friends */ 139 if (get_user(sec32, &bstime32->tv_sec) ||
140 get_user(bstime->tv_nsec, &bstime32->tv_nsec))
141 return -XFS_ERROR(EFAULT);
142 bstime->tv_sec = sec32;
143 return 0;
144}
145
 146/* xfs_bstat_t has differing alignment on intel, and bstime_t sizes differ everywhere */
147STATIC int
148xfs_ioctl32_bstat_copyin(
149 xfs_bstat_t *bstat,
150 compat_xfs_bstat_t __user *bstat32)
151{
152 if (get_user(bstat->bs_ino, &bstat32->bs_ino) ||
153 get_user(bstat->bs_mode, &bstat32->bs_mode) ||
154 get_user(bstat->bs_nlink, &bstat32->bs_nlink) ||
155 get_user(bstat->bs_uid, &bstat32->bs_uid) ||
156 get_user(bstat->bs_gid, &bstat32->bs_gid) ||
157 get_user(bstat->bs_rdev, &bstat32->bs_rdev) ||
158 get_user(bstat->bs_blksize, &bstat32->bs_blksize) ||
159 get_user(bstat->bs_size, &bstat32->bs_size) ||
160 xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
161 xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
162 xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
 163 get_user(bstat->bs_blocks, &bstat32->bs_blocks) ||
 164 get_user(bstat->bs_xflags, &bstat32->bs_xflags) ||
165 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
166 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
167 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
168 get_user(bstat->bs_projid, &bstat32->bs_projid) ||
169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
172 return -XFS_ERROR(EFAULT);
173 return 0;
174}
165 175
166typedef struct compat_xfs_bstime { 176/* XFS_IOC_FSBULKSTAT and friends */
167 __s32 tv_sec; /* seconds */
168 __s32 tv_nsec; /* and nanoseconds */
169} compat_xfs_bstime_t;
170 177
171STATIC int xfs_bstime_store_compat( 178STATIC int
172 compat_xfs_bstime_t __user *p32, 179xfs_bstime_store_compat(
173 const xfs_bstime_t *p) 180 compat_xfs_bstime_t __user *p32,
181 const xfs_bstime_t *p)
174{ 182{
175 __s32 sec32; 183 __s32 sec32;
176 184
177 sec32 = p->tv_sec; 185 sec32 = p->tv_sec;
178 if (put_user(sec32, &p32->tv_sec) || 186 if (put_user(sec32, &p32->tv_sec) ||
179 put_user(p->tv_nsec, &p32->tv_nsec)) 187 put_user(p->tv_nsec, &p32->tv_nsec))
180 return -EFAULT; 188 return -XFS_ERROR(EFAULT);
181 return 0; 189 return 0;
182} 190}
183 191
184typedef struct compat_xfs_bstat { 192/* Return 0 on success or positive error (to xfs_bulkstat()) */
185 __u64 bs_ino; /* inode number */ 193STATIC int
186 __u16 bs_mode; /* type and mode */ 194xfs_bulkstat_one_fmt_compat(
187 __u16 bs_nlink; /* number of links */
188 __u32 bs_uid; /* user id */
189 __u32 bs_gid; /* group id */
190 __u32 bs_rdev; /* device value */
191 __s32 bs_blksize; /* block size */
192 __s64 bs_size; /* file size */
193 compat_xfs_bstime_t bs_atime; /* access time */
194 compat_xfs_bstime_t bs_mtime; /* modify time */
195 compat_xfs_bstime_t bs_ctime; /* inode change time */
196 int64_t bs_blocks; /* number of blocks */
197 __u32 bs_xflags; /* extended flags */
198 __s32 bs_extsize; /* extent size */
199 __s32 bs_extents; /* number of extents */
200 __u32 bs_gen; /* generation count */
201 __u16 bs_projid; /* project id */
202 unsigned char bs_pad[14]; /* pad space, unused */
203 __u32 bs_dmevmask; /* DMIG event mask */
204 __u16 bs_dmstate; /* DMIG state info */
205 __u16 bs_aextents; /* attribute number of extents */
206} _PACKED compat_xfs_bstat_t;
207
208STATIC int xfs_bulkstat_one_fmt_compat(
209 void __user *ubuffer, 195 void __user *ubuffer,
196 int ubsize,
197 int *ubused,
210 const xfs_bstat_t *buffer) 198 const xfs_bstat_t *buffer)
211{ 199{
212 compat_xfs_bstat_t __user *p32 = ubuffer; 200 compat_xfs_bstat_t __user *p32 = ubuffer;
213 201
214 if (put_user(buffer->bs_ino, &p32->bs_ino) || 202 if (ubsize < sizeof(*p32))
215 put_user(buffer->bs_mode, &p32->bs_mode) || 203 return XFS_ERROR(ENOMEM);
216 put_user(buffer->bs_nlink, &p32->bs_nlink) || 204
217 put_user(buffer->bs_uid, &p32->bs_uid) || 205 if (put_user(buffer->bs_ino, &p32->bs_ino) ||
218 put_user(buffer->bs_gid, &p32->bs_gid) || 206 put_user(buffer->bs_mode, &p32->bs_mode) ||
219 put_user(buffer->bs_rdev, &p32->bs_rdev) || 207 put_user(buffer->bs_nlink, &p32->bs_nlink) ||
220 put_user(buffer->bs_blksize, &p32->bs_blksize) || 208 put_user(buffer->bs_uid, &p32->bs_uid) ||
221 put_user(buffer->bs_size, &p32->bs_size) || 209 put_user(buffer->bs_gid, &p32->bs_gid) ||
210 put_user(buffer->bs_rdev, &p32->bs_rdev) ||
211 put_user(buffer->bs_blksize, &p32->bs_blksize) ||
212 put_user(buffer->bs_size, &p32->bs_size) ||
222 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || 213 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
223 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || 214 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
224 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || 215 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
225 put_user(buffer->bs_blocks, &p32->bs_blocks) || 216 put_user(buffer->bs_blocks, &p32->bs_blocks) ||
226 put_user(buffer->bs_xflags, &p32->bs_xflags) || 217 put_user(buffer->bs_xflags, &p32->bs_xflags) ||
227 put_user(buffer->bs_extsize, &p32->bs_extsize) || 218 put_user(buffer->bs_extsize, &p32->bs_extsize) ||
228 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
229 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
230 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
231 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 222 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
232 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 223 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
233 put_user(buffer->bs_aextents, &p32->bs_aextents)) 224 put_user(buffer->bs_aextents, &p32->bs_aextents))
234 return -EFAULT; 225 return XFS_ERROR(EFAULT);
235 return sizeof(*p32); 226 if (ubused)
227 *ubused = sizeof(*p32);
228 return 0;
236} 229}
237 230
238 231STATIC int
239 232xfs_bulkstat_one_compat(
240typedef struct compat_xfs_fsop_bulkreq { 233 xfs_mount_t *mp, /* mount point for filesystem */
241 compat_uptr_t lastip; /* last inode # pointer */ 234 xfs_ino_t ino, /* inode number to get data for */
242 __s32 icount; /* count of entries in buffer */ 235 void __user *buffer, /* buffer to place output in */
243 compat_uptr_t ubuffer; /* user buffer for inode desc. */ 236 int ubsize, /* size of buffer */
244 compat_uptr_t ocount; /* output count pointer */ 237 void *private_data, /* my private data */
245} compat_xfs_fsop_bulkreq_t; 238 xfs_daddr_t bno, /* starting bno of inode cluster */
246 239 int *ubused, /* bytes used by me */
247#define XFS_IOC_FSBULKSTAT_32 \ 240 void *dibuff, /* on-disk inode buffer */
248 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) 241 int *stat) /* BULKSTAT_RV_... */
249#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ 242{
250 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) 243 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
251#define XFS_IOC_FSINUMBERS_32 \ 244 xfs_bulkstat_one_fmt_compat, bno,
252 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) 245 ubused, dibuff, stat);
246}
253 247
254/* copied from xfs_ioctl.c */ 248/* copied from xfs_ioctl.c */
255STATIC int 249STATIC int
256xfs_ioc_bulkstat_compat( 250xfs_compat_ioc_bulkstat(
257 xfs_mount_t *mp, 251 xfs_mount_t *mp,
258 unsigned int cmd, 252 unsigned int cmd,
259 void __user *arg) 253 compat_xfs_fsop_bulkreq_t __user *p32)
260{ 254{
261 compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg;
262 u32 addr; 255 u32 addr;
263 xfs_fsop_bulkreq_t bulkreq; 256 xfs_fsop_bulkreq_t bulkreq;
264 int count; /* # of records returned */ 257 int count; /* # of records returned */
@@ -270,20 +263,20 @@ xfs_ioc_bulkstat_compat(
270 /* should be called again (unused here, but used in dmapi) */ 263 /* should be called again (unused here, but used in dmapi) */
271 264
272 if (!capable(CAP_SYS_ADMIN)) 265 if (!capable(CAP_SYS_ADMIN))
273 return -EPERM; 266 return -XFS_ERROR(EPERM);
274 267
275 if (XFS_FORCED_SHUTDOWN(mp)) 268 if (XFS_FORCED_SHUTDOWN(mp))
276 return -XFS_ERROR(EIO); 269 return -XFS_ERROR(EIO);
277 270
278 if (get_user(addr, &p32->lastip)) 271 if (get_user(addr, &p32->lastip))
279 return -EFAULT; 272 return -XFS_ERROR(EFAULT);
280 bulkreq.lastip = compat_ptr(addr); 273 bulkreq.lastip = compat_ptr(addr);
281 if (get_user(bulkreq.icount, &p32->icount) || 274 if (get_user(bulkreq.icount, &p32->icount) ||
282 get_user(addr, &p32->ubuffer)) 275 get_user(addr, &p32->ubuffer))
283 return -EFAULT; 276 return -XFS_ERROR(EFAULT);
284 bulkreq.ubuffer = compat_ptr(addr); 277 bulkreq.ubuffer = compat_ptr(addr);
285 if (get_user(addr, &p32->ocount)) 278 if (get_user(addr, &p32->ocount))
286 return -EFAULT; 279 return -XFS_ERROR(EFAULT);
287 bulkreq.ocount = compat_ptr(addr); 280 bulkreq.ocount = compat_ptr(addr);
288 281
289 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) 282 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
@@ -295,17 +288,22 @@ xfs_ioc_bulkstat_compat(
295 if (bulkreq.ubuffer == NULL) 288 if (bulkreq.ubuffer == NULL)
296 return -XFS_ERROR(EINVAL); 289 return -XFS_ERROR(EINVAL);
297 290
298 if (cmd == XFS_IOC_FSINUMBERS) 291 if (cmd == XFS_IOC_FSINUMBERS_32) {
299 error = xfs_inumbers(mp, &inlast, &count, 292 error = xfs_inumbers(mp, &inlast, &count,
300 bulkreq.ubuffer, xfs_inumbers_fmt_compat); 293 bulkreq.ubuffer, xfs_inumbers_fmt_compat);
301 else { 294 } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
302 /* declare a var to get a warning in case the type changes */ 295 int res;
303 bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat; 296
297 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
298 sizeof(compat_xfs_bstat_t),
299 NULL, 0, NULL, NULL, &res);
300 } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
304 error = xfs_bulkstat(mp, &inlast, &count, 301 error = xfs_bulkstat(mp, &inlast, &count,
305 xfs_bulkstat_one, formatter, 302 xfs_bulkstat_one_compat, NULL,
306 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, 303 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
307 BULKSTAT_FG_QUICK, &done); 304 BULKSTAT_FG_QUICK, &done);
308 } 305 } else
306 error = XFS_ERROR(EINVAL);
309 if (error) 307 if (error)
310 return -error; 308 return -error;
311 309
@@ -321,63 +319,306 @@ xfs_ioc_bulkstat_compat(
321 return 0; 319 return 0;
322} 320}
323 321
322STATIC int
323xfs_compat_handlereq_copyin(
324 xfs_fsop_handlereq_t *hreq,
325 compat_xfs_fsop_handlereq_t __user *arg32)
326{
327 compat_xfs_fsop_handlereq_t hreq32;
328
329 if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
330 return -XFS_ERROR(EFAULT);
331
332 hreq->fd = hreq32.fd;
333 hreq->path = compat_ptr(hreq32.path);
334 hreq->oflags = hreq32.oflags;
335 hreq->ihandle = compat_ptr(hreq32.ihandle);
336 hreq->ihandlen = hreq32.ihandlen;
337 hreq->ohandle = compat_ptr(hreq32.ohandle);
338 hreq->ohandlen = compat_ptr(hreq32.ohandlen);
324 339
340 return 0;
341}
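The copyin above relies on compat_ptr(), which widens a 32-bit compat_uptr_t into a native void __user * (a plain cast on most arches; s390 additionally masks the high bit). A sketch of the recurring pattern, with a hypothetical field name:

	compat_uptr_t uptr32;
	void __user *uptr;

	if (get_user(uptr32, &arg32->some_field))	/* hypothetical field */
		return -XFS_ERROR(EFAULT);
	uptr = compat_ptr(uptr32);	/* widen to a native user pointer */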
325 342
326typedef struct compat_xfs_fsop_handlereq { 343/*
327 __u32 fd; /* fd for FD_TO_HANDLE */ 344 * Convert userspace handle data into inode.
328 compat_uptr_t path; /* user pathname */ 345 *
329 __u32 oflags; /* open flags */ 346 * We use the fact that all the fsop_handlereq ioctl calls have a data
 330 compat_uptr_t ihandle; /* user supplied handle */ 347 * structure argument whose first component is always an xfs_fsop_handlereq_t,
331 __u32 ihandlen; /* user supplied length */ 348 * so we can pass that sub structure into this handy, shared routine.
332 compat_uptr_t ohandle; /* user buffer for handle */ 349 *
333 compat_uptr_t ohandlen; /* user buffer length */ 350 * If no error, caller must always iput the returned inode.
334} compat_xfs_fsop_handlereq_t; 351 */
335 352STATIC int
336#define XFS_IOC_PATH_TO_FSHANDLE_32 \ 353xfs_vget_fsop_handlereq_compat(
337 _IOWR('X', 104, struct compat_xfs_fsop_handlereq) 354 xfs_mount_t *mp,
338#define XFS_IOC_PATH_TO_HANDLE_32 \ 355 struct inode *parinode, /* parent inode pointer */
339 _IOWR('X', 105, struct compat_xfs_fsop_handlereq) 356 compat_xfs_fsop_handlereq_t *hreq,
340#define XFS_IOC_FD_TO_HANDLE_32 \ 357 struct inode **inode)
341 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
342#define XFS_IOC_OPEN_BY_HANDLE_32 \
343 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
344#define XFS_IOC_READLINK_BY_HANDLE_32 \
345 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
346
347STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg)
348{ 358{
349 compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg; 359 void __user *hanp;
350 xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p)); 360 size_t hlen;
351 u32 addr; 361 xfs_fid_t *xfid;
352 362 xfs_handle_t *handlep;
353 if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) || 363 xfs_handle_t handle;
354 get_user(addr, &p32->path) || 364 xfs_inode_t *ip;
355 put_user(compat_ptr(addr), &p->path) || 365 xfs_ino_t ino;
356 copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) || 366 __u32 igen;
357 get_user(addr, &p32->ihandle) || 367 int error;
358 put_user(compat_ptr(addr), &p->ihandle) || 368
359 copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) || 369 /*
360 get_user(addr, &p32->ohandle) || 370 * Only allow handle opens under a directory.
361 put_user(compat_ptr(addr), &p->ohandle) || 371 */
362 get_user(addr, &p32->ohandlen) || 372 if (!S_ISDIR(parinode->i_mode))
363 put_user(compat_ptr(addr), &p->ohandlen)) 373 return XFS_ERROR(ENOTDIR);
364 return -EFAULT; 374
365 375 hanp = compat_ptr(hreq->ihandle);
366 return (unsigned long)p; 376 hlen = hreq->ihandlen;
377 handlep = &handle;
378
379 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
380 return XFS_ERROR(EINVAL);
381 if (copy_from_user(handlep, hanp, hlen))
382 return XFS_ERROR(EFAULT);
383 if (hlen < sizeof(*handlep))
384 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
385 if (hlen > sizeof(handlep->ha_fsid)) {
386 if (handlep->ha_fid.fid_len !=
387 (hlen - sizeof(handlep->ha_fsid) -
388 sizeof(handlep->ha_fid.fid_len)) ||
389 handlep->ha_fid.fid_pad)
390 return XFS_ERROR(EINVAL);
391 }
392
393 /*
394 * Crack the handle, obtain the inode # & generation #
395 */
396 xfid = (struct xfs_fid *)&handlep->ha_fid;
397 if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
398 ino = xfid->fid_ino;
399 igen = xfid->fid_gen;
400 } else {
401 return XFS_ERROR(EINVAL);
402 }
403
404 /*
405 * Get the XFS inode, building a Linux inode to go with it.
406 */
407 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
408 if (error)
409 return error;
410 if (ip == NULL)
411 return XFS_ERROR(EIO);
412 if (ip->i_d.di_gen != igen) {
413 xfs_iput_new(ip, XFS_ILOCK_SHARED);
414 return XFS_ERROR(ENOENT);
415 }
416
417 xfs_iunlock(ip, XFS_ILOCK_SHARED);
418
419 *inode = VFS_I(ip);
420 return 0;
367} 421}
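The length validation above is easier to follow next to the handle layout being decoded; abridged from xfs_fs.h for reference (not part of this diff):

	typedef struct xfs_fid {
		__u16	fid_len;	/* length of remainder */
		__u16	fid_pad;	/* must be zero */
		__u32	fid_gen;	/* generation number */
		__u64	fid_ino;	/* 64-bit inode number */
	} xfs_fid_t;

	typedef struct xfs_handle {
		union {
			__s64		align;		/* forces 64-bit alignment */
			xfs_fsid_t	_ha_fsid;	/* unique fs identifier */
		} ha_u;
		xfs_fid_t	ha_fid;			/* fs specific file ID */
	} xfs_handle_t;
	#define ha_fsid ha_u._ha_fsid

	/* so a valid handle runs from sizeof(ha_fsid) bytes (fsid alone)
	 * up to sizeof(xfs_handle_t) (fsid plus a full fid) */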
368 422
423STATIC int
424xfs_compat_attrlist_by_handle(
425 xfs_mount_t *mp,
426 void __user *arg,
427 struct inode *parinode)
428{
429 int error;
430 attrlist_cursor_kern_t *cursor;
431 compat_xfs_fsop_attrlist_handlereq_t al_hreq;
432 struct inode *inode;
433 char *kbuf;
434
435 if (!capable(CAP_SYS_ADMIN))
436 return -XFS_ERROR(EPERM);
437 if (copy_from_user(&al_hreq, arg,
438 sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
439 return -XFS_ERROR(EFAULT);
440 if (al_hreq.buflen > XATTR_LIST_MAX)
441 return -XFS_ERROR(EINVAL);
442
443 /*
444 * Reject flags, only allow namespaces.
445 */
446 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
447 return -XFS_ERROR(EINVAL);
448
449 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
450 &inode);
451 if (error)
452 goto out;
453
454 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
455 if (!kbuf)
456 goto out_vn_rele;
457
458 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
459 error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
460 al_hreq.flags, cursor);
461 if (error)
462 goto out_kfree;
463
464 if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
465 error = -EFAULT;
466
467 out_kfree:
468 kfree(kbuf);
469 out_vn_rele:
470 iput(inode);
471 out:
472 return -error;
473}
369 474
370STATIC long 475STATIC int
371xfs_compat_ioctl( 476xfs_compat_attrmulti_by_handle(
372 int mode, 477 xfs_mount_t *mp,
373 struct file *file, 478 void __user *arg,
374 unsigned cmd, 479 struct inode *parinode)
375 unsigned long arg) 480{
481 int error;
482 compat_xfs_attr_multiop_t *ops;
483 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
484 struct inode *inode;
485 unsigned int i, size;
486 char *attr_name;
487
488 if (!capable(CAP_SYS_ADMIN))
489 return -XFS_ERROR(EPERM);
490 if (copy_from_user(&am_hreq, arg,
491 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
492 return -XFS_ERROR(EFAULT);
493
494 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
495 &inode);
496 if (error)
497 goto out;
498
499 error = E2BIG;
500 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
501 if (!size || size > 16 * PAGE_SIZE)
502 goto out_vn_rele;
503
504 error = ENOMEM;
505 ops = kmalloc(size, GFP_KERNEL);
506 if (!ops)
507 goto out_vn_rele;
508
509 error = EFAULT;
510 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
511 goto out_kfree_ops;
512
513 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
514 if (!attr_name)
515 goto out_kfree_ops;
516
517
518 error = 0;
519 for (i = 0; i < am_hreq.opcount; i++) {
520 ops[i].am_error = strncpy_from_user(attr_name,
521 compat_ptr(ops[i].am_attrname),
522 MAXNAMELEN);
523 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
524 error = -ERANGE;
525 if (ops[i].am_error < 0)
526 break;
527
528 switch (ops[i].am_opcode) {
529 case ATTR_OP_GET:
530 ops[i].am_error = xfs_attrmulti_attr_get(inode,
531 attr_name,
532 compat_ptr(ops[i].am_attrvalue),
533 &ops[i].am_length, ops[i].am_flags);
534 break;
535 case ATTR_OP_SET:
536 ops[i].am_error = xfs_attrmulti_attr_set(inode,
537 attr_name,
538 compat_ptr(ops[i].am_attrvalue),
539 ops[i].am_length, ops[i].am_flags);
540 break;
541 case ATTR_OP_REMOVE:
542 ops[i].am_error = xfs_attrmulti_attr_remove(inode,
543 attr_name, ops[i].am_flags);
544 break;
545 default:
546 ops[i].am_error = EINVAL;
547 }
548 }
549
550 if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
551 error = XFS_ERROR(EFAULT);
552
553 kfree(attr_name);
554 out_kfree_ops:
555 kfree(ops);
556 out_vn_rele:
557 iput(inode);
558 out:
559 return -error;
560}
561
562STATIC int
563xfs_compat_fssetdm_by_handle(
564 xfs_mount_t *mp,
565 void __user *arg,
566 struct inode *parinode)
567{
568 int error;
569 struct fsdmidata fsd;
570 compat_xfs_fsop_setdm_handlereq_t dmhreq;
571 struct inode *inode;
572
573 if (!capable(CAP_MKNOD))
574 return -XFS_ERROR(EPERM);
575 if (copy_from_user(&dmhreq, arg,
576 sizeof(compat_xfs_fsop_setdm_handlereq_t)))
577 return -XFS_ERROR(EFAULT);
578
579 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
580 &inode);
581 if (error)
582 return -error;
583
584 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
585 error = -XFS_ERROR(EPERM);
586 goto out;
587 }
588
589 if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
590 error = -XFS_ERROR(EFAULT);
591 goto out;
592 }
593
594 error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
595 fsd.fsd_dmstate);
596
597out:
598 iput(inode);
599 return error;
600}
601
602long
603xfs_file_compat_ioctl(
604 struct file *filp,
605 unsigned cmd,
606 unsigned long p)
376{ 607{
377 struct inode *inode = file->f_path.dentry->d_inode; 608 struct inode *inode = filp->f_path.dentry->d_inode;
378 int error; 609 struct xfs_inode *ip = XFS_I(inode);
610 struct xfs_mount *mp = ip->i_mount;
611 void __user *arg = (void __user *)p;
612 int ioflags = 0;
613 int error;
614
615 if (filp->f_mode & FMODE_NOCMTIME)
616 ioflags |= IO_INVIS;
617
618 xfs_itrace_entry(ip);
379 619
380 switch (cmd) { 620 switch (cmd) {
621 /* No size or alignment issues on any arch */
381 case XFS_IOC_DIOINFO: 622 case XFS_IOC_DIOINFO:
382 case XFS_IOC_FSGEOMETRY: 623 case XFS_IOC_FSGEOMETRY:
383 case XFS_IOC_FSGETXATTR: 624 case XFS_IOC_FSGETXATTR:
@@ -387,48 +628,16 @@ xfs_compat_ioctl(
387 case XFS_IOC_GETBMAP: 628 case XFS_IOC_GETBMAP:
388 case XFS_IOC_GETBMAPA: 629 case XFS_IOC_GETBMAPA:
389 case XFS_IOC_GETBMAPX: 630 case XFS_IOC_GETBMAPX:
390/* not handled
391 case XFS_IOC_FSSETDM_BY_HANDLE:
392 case XFS_IOC_ATTRLIST_BY_HANDLE:
393 case XFS_IOC_ATTRMULTI_BY_HANDLE:
394*/
395 case XFS_IOC_FSCOUNTS: 631 case XFS_IOC_FSCOUNTS:
396 case XFS_IOC_SET_RESBLKS: 632 case XFS_IOC_SET_RESBLKS:
397 case XFS_IOC_GET_RESBLKS: 633 case XFS_IOC_GET_RESBLKS:
398 case XFS_IOC_FSGROWFSDATA:
399 case XFS_IOC_FSGROWFSLOG: 634 case XFS_IOC_FSGROWFSLOG:
400 case XFS_IOC_FSGROWFSRT:
401 case XFS_IOC_FREEZE:
402 case XFS_IOC_THAW:
403 case XFS_IOC_GOINGDOWN: 635 case XFS_IOC_GOINGDOWN:
404 case XFS_IOC_ERROR_INJECTION: 636 case XFS_IOC_ERROR_INJECTION:
405 case XFS_IOC_ERROR_CLEARALL: 637 case XFS_IOC_ERROR_CLEARALL:
406 break; 638 return xfs_file_ioctl(filp, cmd, p);
407 639#ifndef BROKEN_X86_ALIGNMENT
408 case XFS_IOC32_GETXFLAGS: 640 /* These are handled fine if no alignment issues */
409 case XFS_IOC32_SETXFLAGS:
410 case XFS_IOC32_GETVERSION:
411 cmd = _NATIVE_IOC(cmd, long);
412 break;
413#ifdef BROKEN_X86_ALIGNMENT
414 /* xfs_flock_t has wrong u32 vs u64 alignment */
415 case XFS_IOC_ALLOCSP_32:
416 case XFS_IOC_FREESP_32:
417 case XFS_IOC_ALLOCSP64_32:
418 case XFS_IOC_FREESP64_32:
419 case XFS_IOC_RESVSP_32:
420 case XFS_IOC_UNRESVSP_32:
421 case XFS_IOC_RESVSP64_32:
422 case XFS_IOC_UNRESVSP64_32:
423 arg = xfs_ioctl32_flock(arg);
424 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
425 break;
426 case XFS_IOC_FSGEOMETRY_V1_32:
427 arg = xfs_ioctl32_geom_v1(arg);
428 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1);
429 break;
430
431#else /* These are handled fine if no alignment issues */
432 case XFS_IOC_ALLOCSP: 641 case XFS_IOC_ALLOCSP:
433 case XFS_IOC_FREESP: 642 case XFS_IOC_FREESP:
434 case XFS_IOC_RESVSP: 643 case XFS_IOC_RESVSP:
@@ -438,51 +647,97 @@ xfs_compat_ioctl(
438 case XFS_IOC_RESVSP64: 647 case XFS_IOC_RESVSP64:
439 case XFS_IOC_UNRESVSP64: 648 case XFS_IOC_UNRESVSP64:
440 case XFS_IOC_FSGEOMETRY_V1: 649 case XFS_IOC_FSGEOMETRY_V1:
441 break; 650 case XFS_IOC_FSGROWFSDATA:
651 case XFS_IOC_FSGROWFSRT:
652 return xfs_file_ioctl(filp, cmd, p);
653#else
654 case XFS_IOC_ALLOCSP_32:
655 case XFS_IOC_FREESP_32:
656 case XFS_IOC_ALLOCSP64_32:
657 case XFS_IOC_FREESP64_32:
658 case XFS_IOC_RESVSP_32:
659 case XFS_IOC_UNRESVSP_32:
660 case XFS_IOC_RESVSP64_32:
661 case XFS_IOC_UNRESVSP64_32: {
662 struct xfs_flock64 bf;
442 663
443 /* xfs_bstat_t still has wrong u32 vs u64 alignment */ 664 if (xfs_compat_flock64_copyin(&bf, arg))
444 case XFS_IOC_SWAPEXT: 665 return -XFS_ERROR(EFAULT);
445 break; 666 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
667 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
668 }
669 case XFS_IOC_FSGEOMETRY_V1_32:
670 return xfs_compat_ioc_fsgeometry_v1(mp, arg);
671 case XFS_IOC_FSGROWFSDATA_32: {
672 struct xfs_growfs_data in;
673
674 if (xfs_compat_growfs_data_copyin(&in, arg))
675 return -XFS_ERROR(EFAULT);
676 error = xfs_growfs_data(mp, &in);
677 return -error;
678 }
679 case XFS_IOC_FSGROWFSRT_32: {
680 struct xfs_growfs_rt in;
446 681
682 if (xfs_compat_growfs_rt_copyin(&in, arg))
683 return -XFS_ERROR(EFAULT);
684 error = xfs_growfs_rt(mp, &in);
685 return -error;
686 }
447#endif 687#endif
 688 /* long changes size, but xfs only copies out 32 bits */
689 case XFS_IOC_GETXFLAGS_32:
690 case XFS_IOC_SETXFLAGS_32:
691 case XFS_IOC_GETVERSION_32:
692 cmd = _NATIVE_IOC(cmd, long);
693 return xfs_file_ioctl(filp, cmd, p);
694 case XFS_IOC_SWAPEXT: {
695 struct xfs_swapext sxp;
696 struct compat_xfs_swapext __user *sxu = arg;
697
698 /* Bulk copy in up to the sx_stat field, then copy bstat */
699 if (copy_from_user(&sxp, sxu,
700 offsetof(struct xfs_swapext, sx_stat)) ||
701 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
702 return -XFS_ERROR(EFAULT);
703 error = xfs_swapext(&sxp);
704 return -error;
705 }
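The bulk copy works because every field ahead of sx_stat is a 64-bit quantity with identical placement in the native and compat layouts; only the trailing bstat needs the field-by-field translation. An illustrative compile-time check, assuming the two struct definitions (C11 _Static_assert used for brevity; the kernel proper would use BUILD_BUG_ON):

	#include <stddef.h>

	_Static_assert(offsetof(struct xfs_swapext, sx_stat) ==
		       offsetof(struct compat_xfs_swapext, sx_stat),
		       "prefixes must line up for the bulk copy_from_user");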
448 case XFS_IOC_FSBULKSTAT_32: 706 case XFS_IOC_FSBULKSTAT_32:
449 case XFS_IOC_FSBULKSTAT_SINGLE_32: 707 case XFS_IOC_FSBULKSTAT_SINGLE_32:
450 case XFS_IOC_FSINUMBERS_32: 708 case XFS_IOC_FSINUMBERS_32:
451 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq); 709 return xfs_compat_ioc_bulkstat(mp, cmd, arg);
452 return xfs_ioc_bulkstat_compat(XFS_I(inode)->i_mount,
453 cmd, (void __user*)arg);
454 case XFS_IOC_FD_TO_HANDLE_32: 710 case XFS_IOC_FD_TO_HANDLE_32:
455 case XFS_IOC_PATH_TO_HANDLE_32: 711 case XFS_IOC_PATH_TO_HANDLE_32:
456 case XFS_IOC_PATH_TO_FSHANDLE_32: 712 case XFS_IOC_PATH_TO_FSHANDLE_32: {
457 case XFS_IOC_OPEN_BY_HANDLE_32: 713 struct xfs_fsop_handlereq hreq;
458 case XFS_IOC_READLINK_BY_HANDLE_32: 714
459 arg = xfs_ioctl32_fshandle(arg); 715 if (xfs_compat_handlereq_copyin(&hreq, arg))
716 return -XFS_ERROR(EFAULT);
460 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); 717 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
461 break; 718 return xfs_find_handle(cmd, &hreq);
462 default:
463 return -ENOIOCTLCMD;
464 } 719 }
720 case XFS_IOC_OPEN_BY_HANDLE_32: {
721 struct xfs_fsop_handlereq hreq;
465 722
466 error = xfs_ioctl(XFS_I(inode), file, mode, cmd, (void __user *)arg); 723 if (xfs_compat_handlereq_copyin(&hreq, arg))
467 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); 724 return -XFS_ERROR(EFAULT);
468 725 return xfs_open_by_handle(mp, &hreq, filp, inode);
469 return error; 726 }
470} 727 case XFS_IOC_READLINK_BY_HANDLE_32: {
471 728 struct xfs_fsop_handlereq hreq;
472long
473xfs_file_compat_ioctl(
474 struct file *file,
475 unsigned cmd,
476 unsigned long arg)
477{
478 return xfs_compat_ioctl(0, file, cmd, arg);
479}
480 729
481long 730 if (xfs_compat_handlereq_copyin(&hreq, arg))
482xfs_file_compat_invis_ioctl( 731 return -XFS_ERROR(EFAULT);
483 struct file *file, 732 return xfs_readlink_by_handle(mp, &hreq, inode);
484 unsigned cmd, 733 }
485 unsigned long arg) 734 case XFS_IOC_ATTRLIST_BY_HANDLE_32:
486{ 735 return xfs_compat_attrlist_by_handle(mp, arg, inode);
487 return xfs_compat_ioctl(IO_INVIS, file, cmd, arg); 736 case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
737 return xfs_compat_attrmulti_by_handle(mp, arg, inode);
738 case XFS_IOC_FSSETDM_BY_HANDLE_32:
739 return xfs_compat_fssetdm_by_handle(mp, arg, inode);
740 default:
741 return -XFS_ERROR(ENOIOCTLCMD);
742 }
488} 743}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 02de6e62ee37..1024c4f8ba0d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -18,7 +18,217 @@
18#ifndef __XFS_IOCTL32_H__ 18#ifndef __XFS_IOCTL32_H__
19#define __XFS_IOCTL32_H__ 19#define __XFS_IOCTL32_H__
20 20
21extern long xfs_file_compat_ioctl(struct file *, unsigned, unsigned long); 21#include <linux/compat.h>
22extern long xfs_file_compat_invis_ioctl(struct file *, unsigned, unsigned long); 22
23/*
24 * on 32-bit arches, ioctl argument structures may have different sizes
25 * and/or alignment. We define compat structures which match the
26 * 32-bit sizes/alignments here, and their associated ioctl numbers.
27 *
28 * xfs_ioctl32.c contains routines to copy these structures in and out.
29 */
30
31/* stock kernel-level ioctls we support */
32#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS
33#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS
34#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
35
36/*
37 * On intel, even if sizes match, alignment and/or padding may differ.
38 */
39#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
40#define BROKEN_X86_ALIGNMENT
41#define __compat_packed __attribute__((packed))
42#else
43#define __compat_packed
44#endif
45
46typedef struct compat_xfs_bstime {
47 compat_time_t tv_sec; /* seconds */
48 __s32 tv_nsec; /* and nanoseconds */
49} compat_xfs_bstime_t;
50
51typedef struct compat_xfs_bstat {
52 __u64 bs_ino; /* inode number */
53 __u16 bs_mode; /* type and mode */
54 __u16 bs_nlink; /* number of links */
55 __u32 bs_uid; /* user id */
56 __u32 bs_gid; /* group id */
57 __u32 bs_rdev; /* device value */
58 __s32 bs_blksize; /* block size */
59 __s64 bs_size; /* file size */
60 compat_xfs_bstime_t bs_atime; /* access time */
61 compat_xfs_bstime_t bs_mtime; /* modify time */
62 compat_xfs_bstime_t bs_ctime; /* inode change time */
63 int64_t bs_blocks; /* number of blocks */
64 __u32 bs_xflags; /* extended flags */
65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */
69 unsigned char bs_pad[14]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */
73} __compat_packed compat_xfs_bstat_t;
74
75typedef struct compat_xfs_fsop_bulkreq {
76 compat_uptr_t lastip; /* last inode # pointer */
77 __s32 icount; /* count of entries in buffer */
78 compat_uptr_t ubuffer; /* user buffer for inode desc. */
79 compat_uptr_t ocount; /* output count pointer */
80} compat_xfs_fsop_bulkreq_t;
81
82#define XFS_IOC_FSBULKSTAT_32 \
83 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
84#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
85 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
86#define XFS_IOC_FSINUMBERS_32 \
87 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
88
89typedef struct compat_xfs_fsop_handlereq {
90 __u32 fd; /* fd for FD_TO_HANDLE */
91 compat_uptr_t path; /* user pathname */
92 __u32 oflags; /* open flags */
93 compat_uptr_t ihandle; /* user supplied handle */
94 __u32 ihandlen; /* user supplied length */
95 compat_uptr_t ohandle; /* user buffer for handle */
96 compat_uptr_t ohandlen; /* user buffer length */
97} compat_xfs_fsop_handlereq_t;
98
99#define XFS_IOC_PATH_TO_FSHANDLE_32 \
100 _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
101#define XFS_IOC_PATH_TO_HANDLE_32 \
102 _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
103#define XFS_IOC_FD_TO_HANDLE_32 \
104 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
105#define XFS_IOC_OPEN_BY_HANDLE_32 \
106 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
107#define XFS_IOC_READLINK_BY_HANDLE_32 \
108 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
109
110/* The bstat field in the swapext struct needs translation */
111typedef struct compat_xfs_swapext {
112 __int64_t sx_version; /* version */
113 __int64_t sx_fdtarget; /* fd of target file */
114 __int64_t sx_fdtmp; /* fd of tmp file */
115 xfs_off_t sx_offset; /* offset into file */
 116 xfs_off_t sx_length; /* length from offset */
117 char sx_pad[16]; /* pad space, unused */
 118 compat_xfs_bstat_t sx_stat; /* stat of target before copy */
119} __compat_packed compat_xfs_swapext_t;
120
121#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
122
123typedef struct compat_xfs_fsop_attrlist_handlereq {
124 struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
125 struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
126 __u32 flags; /* which namespace to use */
127 __u32 buflen; /* length of buffer supplied */
128 compat_uptr_t buffer; /* returned names */
129} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
130
131/* Note: actually this is read/write */
132#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
133 _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
134
135/* am_opcodes defined in xfs_fs.h */
136typedef struct compat_xfs_attr_multiop {
137 __u32 am_opcode;
138 __s32 am_error;
139 compat_uptr_t am_attrname;
140 compat_uptr_t am_attrvalue;
141 __u32 am_length;
142 __u32 am_flags;
143} compat_xfs_attr_multiop_t;
144
145typedef struct compat_xfs_fsop_attrmulti_handlereq {
146 struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
 147 __u32 opcount; /* count of following multiop */
148 /* ptr to compat_xfs_attr_multiop */
149 compat_uptr_t ops; /* attr_multi data */
150} compat_xfs_fsop_attrmulti_handlereq_t;
151
152#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
153 _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
154
155typedef struct compat_xfs_fsop_setdm_handlereq {
156 struct compat_xfs_fsop_handlereq hreq; /* handle information */
157 /* ptr to struct fsdmidata */
158 compat_uptr_t data; /* DMAPI data */
159} compat_xfs_fsop_setdm_handlereq_t;
160
161#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
162 _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
163
164#ifdef BROKEN_X86_ALIGNMENT
165/* on ia32 l_start is on a 32-bit boundary */
166typedef struct compat_xfs_flock64 {
167 __s16 l_type;
168 __s16 l_whence;
169 __s64 l_start __attribute__((packed));
170 /* len == 0 means until end of file */
171 __s64 l_len __attribute__((packed));
172 __s32 l_sysid;
173 __u32 l_pid;
174 __s32 l_pad[4]; /* reserve area */
175} compat_xfs_flock64_t;
176
177#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64)
178#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64)
179#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64)
180#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64)
181#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64)
182#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
183#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
184#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
185
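Concretely, ia32 gives __s64 only 4-byte alignment, so l_start sits at offset 4 in the layout above, while the native x86_64 struct pads it out to offset 8 and ends up 4 bytes larger overall. An illustrative check, assuming both definitions are in scope (C11 _Static_assert for brevity):

	#include <stddef.h>

	_Static_assert(offsetof(struct compat_xfs_flock64, l_start) == 4,
		       "ia32: l_start packed straight after l_whence");
	/* versus offsetof(struct xfs_flock64, l_start) == 8 natively, where
	 * 4 bytes of padding follow the two __s16 fields */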
186typedef struct compat_xfs_fsop_geom_v1 {
187 __u32 blocksize; /* filesystem (data) block size */
188 __u32 rtextsize; /* realtime extent size */
189 __u32 agblocks; /* fsblocks in an AG */
190 __u32 agcount; /* number of allocation groups */
191 __u32 logblocks; /* fsblocks in the log */
192 __u32 sectsize; /* (data) sector size, bytes */
193 __u32 inodesize; /* inode size in bytes */
194 __u32 imaxpct; /* max allowed inode space(%) */
195 __u64 datablocks; /* fsblocks in data subvolume */
196 __u64 rtblocks; /* fsblocks in realtime subvol */
197 __u64 rtextents; /* rt extents in realtime subvol*/
198 __u64 logstart; /* starting fsblock of the log */
199 unsigned char uuid[16]; /* unique id of the filesystem */
200 __u32 sunit; /* stripe unit, fsblocks */
201 __u32 swidth; /* stripe width, fsblocks */
202 __s32 version; /* structure version */
203 __u32 flags; /* superblock version flags */
204 __u32 logsectsize; /* log sector size, bytes */
205 __u32 rtsectsize; /* realtime sector size, bytes */
206 __u32 dirblocksize; /* directory block size, bytes */
207} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
208
209#define XFS_IOC_FSGEOMETRY_V1_32 \
210 _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
211
212typedef struct compat_xfs_inogrp {
213 __u64 xi_startino; /* starting inode number */
214 __s32 xi_alloccount; /* # bits set in allocmask */
215 __u64 xi_allocmask; /* mask of allocated inodes */
216} __attribute__((packed)) compat_xfs_inogrp_t;
217
218/* These growfs input structures have padding on the end, so must translate */
219typedef struct compat_xfs_growfs_data {
220 __u64 newblocks; /* new data subvol size, fsblocks */
221 __u32 imaxpct; /* new inode space percentage limit */
222} __attribute__((packed)) compat_xfs_growfs_data_t;
223
224typedef struct compat_xfs_growfs_rt {
225 __u64 newblocks; /* new realtime size, fsblocks */
226 __u32 extsize; /* new realtime extent size, fsblocks */
227} __attribute__((packed)) compat_xfs_growfs_rt_t;
228
229#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
230#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt)
231
232#endif /* BROKEN_X86_ALIGNMENT */
23 233
24#endif /* __XFS_IOCTL32_H__ */ 234#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 095d271f3434..7aa53fefc67f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -53,6 +53,7 @@
53#include <linux/namei.h> 53#include <linux/namei.h>
54#include <linux/security.h> 54#include <linux/security.h>
55#include <linux/falloc.h> 55#include <linux/falloc.h>
56#include <linux/fiemap.h>
56 57
57/* 58/*
58 * Bring the atime in the XFS inode uptodate. 59 * Bring the atime in the XFS inode uptodate.
@@ -64,14 +65,14 @@ xfs_synchronize_atime(
64{ 65{
65 struct inode *inode = VFS_I(ip); 66 struct inode *inode = VFS_I(ip);
66 67
67 if (inode) { 68 if (!(inode->i_state & I_CLEAR)) {
68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; 69 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
69 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; 70 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
70 } 71 }
71} 72}
72 73
73/* 74/*
74 * If the linux inode exists, mark it dirty. 75 * If the linux inode is valid, mark it dirty.
 75 * Used when committing a dirty inode into a transaction so that 76 * Used when committing a dirty inode into a transaction so that
76 * the inode will get written back by the linux code 77 * the inode will get written back by the linux code
77 */ 78 */
@@ -81,7 +82,7 @@ xfs_mark_inode_dirty_sync(
81{ 82{
82 struct inode *inode = VFS_I(ip); 83 struct inode *inode = VFS_I(ip);
83 84
84 if (inode) 85 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
85 mark_inode_dirty_sync(inode); 86 mark_inode_dirty_sync(inode);
86} 87}
87 88
@@ -128,7 +129,7 @@ xfs_ichgtime(
128 if (sync_it) { 129 if (sync_it) {
129 SYNCHRONIZE(); 130 SYNCHRONIZE();
130 ip->i_update_core = 1; 131 ip->i_update_core = 1;
131 mark_inode_dirty_sync(inode); 132 xfs_mark_inode_dirty_sync(ip);
132 } 133 }
133} 134}
134 135
@@ -158,8 +159,6 @@ xfs_init_security(
158 } 159 }
159 160
160 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); 161 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
161 if (!error)
162 xfs_iflags_set(ip, XFS_IMODIFIED);
163 162
164 kfree(name); 163 kfree(name);
165 kfree(value); 164 kfree(value);
@@ -260,7 +259,6 @@ xfs_vn_mknod(
260 error = _ACL_INHERIT(inode, mode, default_acl); 259 error = _ACL_INHERIT(inode, mode, default_acl);
261 if (unlikely(error)) 260 if (unlikely(error))
262 goto out_cleanup_inode; 261 goto out_cleanup_inode;
263 xfs_iflags_set(ip, XFS_IMODIFIED);
264 _ACL_FREE(default_acl); 262 _ACL_FREE(default_acl);
265 } 263 }
266 264
@@ -366,21 +364,17 @@ xfs_vn_link(
366 struct inode *dir, 364 struct inode *dir,
367 struct dentry *dentry) 365 struct dentry *dentry)
368{ 366{
369 struct inode *inode; /* inode of guy being linked to */ 367 struct inode *inode = old_dentry->d_inode;
370 struct xfs_name name; 368 struct xfs_name name;
371 int error; 369 int error;
372 370
373 inode = old_dentry->d_inode;
374 xfs_dentry_to_name(&name, dentry); 371 xfs_dentry_to_name(&name, dentry);
375 372
376 igrab(inode);
377 error = xfs_link(XFS_I(dir), XFS_I(inode), &name); 373 error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
378 if (unlikely(error)) { 374 if (unlikely(error))
379 iput(inode);
380 return -error; 375 return -error;
381 }
382 376
383 xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); 377 atomic_inc(&inode->i_count);
384 d_instantiate(dentry, inode); 378 d_instantiate(dentry, inode);
385 return 0; 379 return 0;
386} 380}
@@ -601,7 +595,7 @@ xfs_vn_setattr(
601 struct dentry *dentry, 595 struct dentry *dentry,
602 struct iattr *iattr) 596 struct iattr *iattr)
603{ 597{
604 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); 598 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
605} 599}
606 600
607/* 601/*
@@ -642,7 +636,7 @@ xfs_vn_fallocate(
642 636
643 xfs_ilock(ip, XFS_IOLOCK_EXCL); 637 xfs_ilock(ip, XFS_IOLOCK_EXCL);
644 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 638 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
645 0, NULL, XFS_ATTR_NOLOCK); 639 0, XFS_ATTR_NOLOCK);
646 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 640 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
647 offset + len > i_size_read(inode)) 641 offset + len > i_size_read(inode))
648 new_size = offset + len; 642 new_size = offset + len;
@@ -653,7 +647,7 @@ xfs_vn_fallocate(
653 647
654 iattr.ia_valid = ATTR_SIZE; 648 iattr.ia_valid = ATTR_SIZE;
655 iattr.ia_size = new_size; 649 iattr.ia_size = new_size;
656 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL); 650 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
657 } 651 }
658 652
659 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 653 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -661,6 +655,88 @@ out_error:
661 return error; 655 return error;
662} 656}
663 657
658#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
659
660/*
661 * Call fiemap helper to fill in user data.
662 * Returns positive errors to xfs_getbmap.
663 */
664STATIC int
665xfs_fiemap_format(
666 void **arg,
667 struct getbmapx *bmv,
668 int *full)
669{
670 int error;
671 struct fiemap_extent_info *fieinfo = *arg;
672 u32 fiemap_flags = 0;
673 u64 logical, physical, length;
674
675 /* Do nothing for a hole */
676 if (bmv->bmv_block == -1LL)
677 return 0;
678
679 logical = BBTOB(bmv->bmv_offset);
680 physical = BBTOB(bmv->bmv_block);
681 length = BBTOB(bmv->bmv_length);
682
683 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
684 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
685 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
686 fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
687 physical = 0; /* no block yet */
688 }
689 if (bmv->bmv_oflags & BMV_OF_LAST)
690 fiemap_flags |= FIEMAP_EXTENT_LAST;
691
692 error = fiemap_fill_next_extent(fieinfo, logical, physical,
693 length, fiemap_flags);
694 if (error > 0) {
695 error = 0;
696 *full = 1; /* user array now full */
697 }
698
699 return -error;
700}
701
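The formatter above converts units: getbmapx speaks 512-byte basic blocks, fiemap speaks bytes, and BBTOB()/BTOBB() are shift-by-nine conversions. A quick worked example with made-up numbers:

	/* hypothetical extent 16 basic blocks into the file, 8 blocks long */
	u64 logical = (u64)16 << 9;	/* BBTOB(16) == 8192 bytes */
	u64 length = (u64)8 << 9;	/* BBTOB(8) == 4096 bytes */
	/* and back again: BTOBB(8192) == (8192 + 511) >> 9 == 16 blocks */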
702STATIC int
703xfs_vn_fiemap(
704 struct inode *inode,
705 struct fiemap_extent_info *fieinfo,
706 u64 start,
707 u64 length)
708{
709 xfs_inode_t *ip = XFS_I(inode);
710 struct getbmapx bm;
711 int error;
712
713 error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
714 if (error)
715 return error;
716
717 /* Set up bmap header for xfs internal routine */
718 bm.bmv_offset = BTOBB(start);
719 /* Special case for whole file */
720 if (length == FIEMAP_MAX_OFFSET)
721 bm.bmv_length = -1LL;
722 else
723 bm.bmv_length = BTOBB(length);
724
 725 /* Our formatter will tell xfs_getbmap when to stop. */
726 bm.bmv_count = MAXEXTNUM;
727 bm.bmv_iflags = BMV_IF_PREALLOC;
728 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
729 bm.bmv_iflags |= BMV_IF_ATTRFORK;
730 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
731 bm.bmv_iflags |= BMV_IF_DELALLOC;
732
733 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
734 if (error)
735 return -error;
736
737 return 0;
738}
739
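With the .fiemap hook wired into xfs_inode_operations below, userspace can map XFS extents through the generic FS_IOC_FIEMAP ioctl. A self-contained sketch (hypothetical path, minimal error handling):

	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(void)
	{
		struct fiemap *fm;
		int fd = open("/mnt/xfs/somefile", O_RDONLY);	/* hypothetical */

		if (fd < 0)
			return 1;
		/* room for the header plus one returned extent */
		fm = calloc(1, sizeof(*fm) + sizeof(struct fiemap_extent));
		fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
		fm->fm_extent_count = 1;
		if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0 && fm->fm_mapped_extents)
			printf("logical %llu physical %llu length %llu\n",
			       (unsigned long long)fm->fm_extents[0].fe_logical,
			       (unsigned long long)fm->fm_extents[0].fe_physical,
			       (unsigned long long)fm->fm_extents[0].fe_length);
		free(fm);
		close(fd);
		return 0;
	}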
664static const struct inode_operations xfs_inode_operations = { 740static const struct inode_operations xfs_inode_operations = {
665 .permission = xfs_vn_permission, 741 .permission = xfs_vn_permission,
666 .truncate = xfs_vn_truncate, 742 .truncate = xfs_vn_truncate,
@@ -671,6 +747,7 @@ static const struct inode_operations xfs_inode_operations = {
671 .removexattr = generic_removexattr, 747 .removexattr = generic_removexattr,
672 .listxattr = xfs_vn_listxattr, 748 .listxattr = xfs_vn_listxattr,
673 .fallocate = xfs_vn_fallocate, 749 .fallocate = xfs_vn_fallocate,
750 .fiemap = xfs_vn_fiemap,
674}; 751};
675 752
676static const struct inode_operations xfs_dir_inode_operations = { 753static const struct inode_operations xfs_dir_inode_operations = {
@@ -766,12 +843,20 @@ xfs_diflags_to_iflags(
766 * When reading existing inodes from disk this is called directly 843 * When reading existing inodes from disk this is called directly
767 * from xfs_iget, when creating a new inode it is called from 844 * from xfs_iget, when creating a new inode it is called from
768 * xfs_ialloc after setting up the inode. 845 * xfs_ialloc after setting up the inode.
846 *
847 * We are always called with an uninitialised linux inode here.
848 * We need to initialise the necessary fields and take a reference
849 * on it.
769 */ 850 */
770void 851void
771xfs_setup_inode( 852xfs_setup_inode(
772 struct xfs_inode *ip) 853 struct xfs_inode *ip)
773{ 854{
774 struct inode *inode = ip->i_vnode; 855 struct inode *inode = &ip->i_vnode;
856
857 inode->i_ino = ip->i_ino;
858 inode->i_state = I_NEW|I_LOCK;
859 inode_add_to_lists(ip->i_mount->m_super, inode);
775 860
776 inode->i_mode = ip->i_d.di_mode; 861 inode->i_mode = ip->i_d.di_mode;
777 inode->i_nlink = ip->i_d.di_nlink; 862 inode->i_nlink = ip->i_d.di_nlink;
@@ -799,7 +884,6 @@ xfs_setup_inode(
799 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; 884 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
800 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; 885 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
801 xfs_diflags_to_iflags(inode, ip); 886 xfs_diflags_to_iflags(inode, ip);
802 xfs_iflags_clear(ip, XFS_IMODIFIED);
803 887
804 switch (inode->i_mode & S_IFMT) { 888 switch (inode->i_mode & S_IFMT) {
805 case S_IFREG: 889 case S_IFREG:
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index 8b1a1e31dc21..ef41c92ce66e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -22,7 +22,6 @@ struct xfs_inode;
22 22
23extern const struct file_operations xfs_file_operations; 23extern const struct file_operations xfs_file_operations;
24extern const struct file_operations xfs_dir_file_operations; 24extern const struct file_operations xfs_dir_file_operations;
25extern const struct file_operations xfs_invis_file_operations;
26 25
27extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
28 27
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index cc0f7b3a9795..507492d6dccd 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -21,18 +21,12 @@
21#include <linux/types.h> 21#include <linux/types.h>
22 22
23/* 23/*
24 * Some types are conditional depending on the target system.
25 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. 24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
26 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well 25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
27 * as requiring XFS_BIG_BLKNOS to be set.
28 */ 26 */
29#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 27#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
30# define XFS_BIG_BLKNOS 1 28# define XFS_BIG_BLKNOS 1
31# if BITS_PER_LONG == 64 29# define XFS_BIG_INUMS 1
32# define XFS_BIG_INUMS 1
33# else
34# define XFS_BIG_INUMS 0
35# endif
36#else 30#else
37# define XFS_BIG_BLKNOS 0 31# define XFS_BIG_BLKNOS 0
38# define XFS_BIG_INUMS 0 32# define XFS_BIG_INUMS 0
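
Collapsing the nested conditional changes exactly one configuration: a 32-bit
kernel with CONFIG_LBD previously got XFS_BIG_INUMS 0 and now gets 1. The
full truth table, derived from the old and new versions:

    BITS_PER_LONG   CONFIG_LBD   XFS_BIG_BLKNOS   XFS_BIG_INUMS (old -> new)
    64              any          1                1 -> 1
    32              y            1                0 -> 1
    32              n            0                0 -> 0
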
@@ -77,6 +71,7 @@
77#include <linux/spinlock.h> 71#include <linux/spinlock.h>
78#include <linux/random.h> 72#include <linux/random.h>
79#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/writeback.h>
80 75
81#include <asm/page.h> 76#include <asm/page.h>
82#include <asm/div64.h> 77#include <asm/div64.h>
@@ -85,7 +80,6 @@
85#include <asm/byteorder.h> 80#include <asm/byteorder.h>
86#include <asm/unaligned.h> 81#include <asm/unaligned.h>
87 82
88#include <xfs_vfs.h>
89#include <xfs_cred.h> 83#include <xfs_cred.h>
90#include <xfs_vnode.h> 84#include <xfs_vnode.h>
91#include <xfs_stats.h> 85#include <xfs_stats.h>
@@ -107,7 +101,6 @@
107#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ 101#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
108#endif 102#endif
109 103
110#define restricted_chown xfs_params.restrict_chown.val
111#define irix_sgid_inherit xfs_params.sgid_inherit.val 104#define irix_sgid_inherit xfs_params.sgid_inherit.val
112#define irix_symlink_mode xfs_params.symlink_mode.val 105#define irix_symlink_mode xfs_params.symlink_mode.val
113#define xfs_panic_mask xfs_params.panic_mask.val 106#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 1957e5357d04..7e90daa0d1d1 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,7 +51,6 @@
51#include "xfs_vnodeops.h" 51#include "xfs_vnodeops.h"
52 52
53#include <linux/capability.h> 53#include <linux/capability.h>
54#include <linux/mount.h>
55#include <linux/writeback.h> 54#include <linux/writeback.h>
56 55
57 56
@@ -243,7 +242,7 @@ xfs_read(
243 242
244 if (unlikely(ioflags & IO_ISDIRECT)) { 243 if (unlikely(ioflags & IO_ISDIRECT)) {
245 if (inode->i_mapping->nrpages) 244 if (inode->i_mapping->nrpages)
246 ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), 245 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
247 -1, FI_REMAPF_LOCKED); 246 -1, FI_REMAPF_LOCKED);
248 mutex_unlock(&inode->i_mutex); 247 mutex_unlock(&inode->i_mutex);
249 if (ret) { 248 if (ret) {
@@ -668,15 +667,8 @@ start:
668 if (new_size > xip->i_size) 667 if (new_size > xip->i_size)
669 xip->i_new_size = new_size; 668 xip->i_new_size = new_size;
670 669
671 /* 670 if (likely(!(ioflags & IO_INVIS)))
672 * We're not supposed to change timestamps in readonly-mounted
673 * filesystems. Throw it away if anyone asks us.
674 */
675 if (likely(!(ioflags & IO_INVIS) &&
676 !mnt_want_write(file->f_path.mnt))) {
677 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 671 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
678 mnt_drop_write(file->f_path.mnt);
679 }
680 672
681 /* 673 /*
682 * If the offset is beyond the size of the file, we have a couple 674 * If the offset is beyond the size of the file, we have a couple
@@ -715,7 +707,6 @@ start:
715 } 707 }
716 } 708 }
717 709
718retry:
719 /* We can write back this queue in page reclaim */ 710 /* We can write back this queue in page reclaim */
720 current->backing_dev_info = mapping->backing_dev_info; 711 current->backing_dev_info = mapping->backing_dev_info;
721 712
@@ -771,6 +762,17 @@ retry:
771 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) 762 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
772 ret = wait_on_sync_kiocb(iocb); 763 ret = wait_on_sync_kiocb(iocb);
773 764
765 isize = i_size_read(inode);
766 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
767 *offset = isize;
768
769 if (*offset > xip->i_size) {
770 xfs_ilock(xip, XFS_ILOCK_EXCL);
771 if (*offset > xip->i_size)
772 xip->i_size = *offset;
773 xfs_iunlock(xip, XFS_ILOCK_EXCL);
774 }
775
774 if (ret == -ENOSPC && 776 if (ret == -ENOSPC &&
775 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { 777 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
776 xfs_iunlock(xip, iolock); 778 xfs_iunlock(xip, iolock);
@@ -784,20 +786,7 @@ retry:
784 xfs_ilock(xip, iolock); 786 xfs_ilock(xip, iolock);
785 if (error) 787 if (error)
786 goto out_unlock_internal; 788 goto out_unlock_internal;
787 pos = xip->i_size; 789 goto start;
788 ret = 0;
789 goto retry;
790 }
791
792 isize = i_size_read(inode);
793 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
794 *offset = isize;
795
796 if (*offset > xip->i_size) {
797 xfs_ilock(xip, XFS_ILOCK_EXCL);
798 if (*offset > xip->i_size)
799 xip->i_size = *offset;
800 xfs_iunlock(xip, XFS_ILOCK_EXCL);
801 } 790 }
802 791
803 error = -ret; 792 error = -ret;
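
The i_size update hoisted out of the retry loop above is the classic
double-checked pattern: an unlocked test covers the common case where the
file was not extended, and the test is repeated under XFS_ILOCK_EXCL because
another writer may have raced in between. A self-contained sketch, with a
spinlock standing in for the XFS inode lock:

    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct my_inode {
            spinlock_t      i_lock;
            loff_t          i_size;
    };

    static void update_isize(struct my_inode *ip, loff_t offset)
    {
            if (offset > ip->i_size) {              /* cheap, racy check */
                    spin_lock(&ip->i_lock);
                    if (offset > ip->i_size)        /* recheck under the lock */
                            ip->i_size = offset;
                    spin_unlock(&ip->i_lock);
            }
    }
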
@@ -855,13 +844,7 @@ retry:
855int 844int
856xfs_bdstrat_cb(struct xfs_buf *bp) 845xfs_bdstrat_cb(struct xfs_buf *bp)
857{ 846{
858 xfs_mount_t *mp; 847 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
859
860 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
861 if (!XFS_FORCED_SHUTDOWN(mp)) {
862 xfs_buf_iorequest(bp);
863 return 0;
864 } else {
865 xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 848 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
866 /* 849 /*
867 * Metadata write that didn't get logged but 850 * Metadata write that didn't get logged but
@@ -874,6 +857,9 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
874 else 857 else
875 return (xfs_bioerror(bp)); 858 return (xfs_bioerror(bp));
876 } 859 }
860
861 xfs_buf_iorequest(bp);
862 return 0;
877} 863}
878 864
879/* 865/*
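
The xfs_bdstrat_cb() rewrite inverts the test so the shutdown error path is
handled and returned first, which drops the xfs_mount_t local and leaves the
common path unindented. The shape of the transformation, with placeholder
names:

    /* before: common path nested inside if/else
     * after:  unusual case first, early return */
    static int submit(struct buf *bp)
    {
            if (shut_down(bp))              /* error path handled up front */
                    return error_out(bp);

            io_request(bp);                 /* common path, no indentation */
            return 0;
    }
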
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 3d5b67c075c7..c3526d445f6a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -53,11 +53,15 @@ xfs_read_xfsstats(
53 { "icluster", XFSSTAT_END_INODE_CLUSTER }, 53 { "icluster", XFSSTAT_END_INODE_CLUSTER },
54 { "vnodes", XFSSTAT_END_VNODE_OPS }, 54 { "vnodes", XFSSTAT_END_VNODE_OPS },
55 { "buf", XFSSTAT_END_BUF }, 55 { "buf", XFSSTAT_END_BUF },
56 { "abtb2", XFSSTAT_END_ABTB_V2 },
57 { "abtc2", XFSSTAT_END_ABTC_V2 },
58 { "bmbt2", XFSSTAT_END_BMBT_V2 },
59 { "ibt2", XFSSTAT_END_IBT_V2 },
56 }; 60 };
57 61
58 /* Loop over all stats groups */ 62 /* Loop over all stats groups */
59 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { 63 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
60 len += sprintf(buffer + len, xstats[i].desc); 64 len += sprintf(buffer + len, "%s", xstats[i].desc);
61 /* inner loop does each group */ 65 /* inner loop does each group */
62 while (j < xstats[i].endpoint) { 66 while (j < xstats[i].endpoint) {
63 val = 0; 67 val = 0;
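
The sprintf() change in this hunk is a format-string fix: passing
xstats[i].desc directly means any '%' in a description would be parsed as a
conversion specifier and read a nonexistent argument. The descriptions here
are static, but the "%s" form is the safe idiom (and silences
-Wformat-security). A userspace illustration:

    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            const char *desc = "done 100%";

            /* sprintf(buf, desc);        wrong: desc parsed as a format */
            sprintf(buf, "%s", desc);  /* right: copied verbatim */
            printf("%s\n", buf);
            return 0;
    }
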
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index e83820febc9f..736854b1ca1a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -118,6 +118,71 @@ struct xfsstats {
118 __uint32_t xb_page_retries; 118 __uint32_t xb_page_retries;
119 __uint32_t xb_page_found; 119 __uint32_t xb_page_found;
120 __uint32_t xb_get_read; 120 __uint32_t xb_get_read;
121/* Version 2 btree counters */
122#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15)
123 __uint32_t xs_abtb_2_lookup;
124 __uint32_t xs_abtb_2_compare;
125 __uint32_t xs_abtb_2_insrec;
126 __uint32_t xs_abtb_2_delrec;
127 __uint32_t xs_abtb_2_newroot;
128 __uint32_t xs_abtb_2_killroot;
129 __uint32_t xs_abtb_2_increment;
130 __uint32_t xs_abtb_2_decrement;
131 __uint32_t xs_abtb_2_lshift;
132 __uint32_t xs_abtb_2_rshift;
133 __uint32_t xs_abtb_2_split;
134 __uint32_t xs_abtb_2_join;
135 __uint32_t xs_abtb_2_alloc;
136 __uint32_t xs_abtb_2_free;
137 __uint32_t xs_abtb_2_moves;
138#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15)
139 __uint32_t xs_abtc_2_lookup;
140 __uint32_t xs_abtc_2_compare;
141 __uint32_t xs_abtc_2_insrec;
142 __uint32_t xs_abtc_2_delrec;
143 __uint32_t xs_abtc_2_newroot;
144 __uint32_t xs_abtc_2_killroot;
145 __uint32_t xs_abtc_2_increment;
146 __uint32_t xs_abtc_2_decrement;
147 __uint32_t xs_abtc_2_lshift;
148 __uint32_t xs_abtc_2_rshift;
149 __uint32_t xs_abtc_2_split;
150 __uint32_t xs_abtc_2_join;
151 __uint32_t xs_abtc_2_alloc;
152 __uint32_t xs_abtc_2_free;
153 __uint32_t xs_abtc_2_moves;
154#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15)
155 __uint32_t xs_bmbt_2_lookup;
156 __uint32_t xs_bmbt_2_compare;
157 __uint32_t xs_bmbt_2_insrec;
158 __uint32_t xs_bmbt_2_delrec;
159 __uint32_t xs_bmbt_2_newroot;
160 __uint32_t xs_bmbt_2_killroot;
161 __uint32_t xs_bmbt_2_increment;
162 __uint32_t xs_bmbt_2_decrement;
163 __uint32_t xs_bmbt_2_lshift;
164 __uint32_t xs_bmbt_2_rshift;
165 __uint32_t xs_bmbt_2_split;
166 __uint32_t xs_bmbt_2_join;
167 __uint32_t xs_bmbt_2_alloc;
168 __uint32_t xs_bmbt_2_free;
169 __uint32_t xs_bmbt_2_moves;
170#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15)
171 __uint32_t xs_ibt_2_lookup;
172 __uint32_t xs_ibt_2_compare;
173 __uint32_t xs_ibt_2_insrec;
174 __uint32_t xs_ibt_2_delrec;
175 __uint32_t xs_ibt_2_newroot;
176 __uint32_t xs_ibt_2_killroot;
177 __uint32_t xs_ibt_2_increment;
178 __uint32_t xs_ibt_2_decrement;
179 __uint32_t xs_ibt_2_lshift;
180 __uint32_t xs_ibt_2_rshift;
181 __uint32_t xs_ibt_2_split;
182 __uint32_t xs_ibt_2_join;
183 __uint32_t xs_ibt_2_alloc;
184 __uint32_t xs_ibt_2_free;
185 __uint32_t xs_ibt_2_moves;
121/* Extra precision counters */ 186/* Extra precision counters */
122 __uint64_t xs_xstrat_bytes; 187 __uint64_t xs_xstrat_bytes;
123 __uint64_t xs_write_bytes; 188 __uint64_t xs_write_bytes;
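
Each new XFSSTAT_END_* macro is a cumulative offset: the index one past the
last counter of its group when the struct is viewed as a flat array of
32-bit counters, which is exactly how the reporting loop in xfs_stats.c
walks it. A sketch reusing this file's names but omitting the real code's
per-CPU summation:

    static int format_stats(char *buffer)
    {
            __uint32_t *counters = (__uint32_t *)&xfsstats;
            int i, j = 0, len = 0;

            for (i = 0; i < ARRAY_SIZE(xstats); i++) {
                    len += sprintf(buffer + len, "%s", xstats[i].desc);
                    /* group i owns indices [previous endpoint, endpoint) */
                    for (; j < xstats[i].endpoint; j++)
                            len += sprintf(buffer + len, " %u", counters[j]);
                    len += sprintf(buffer + len, "\n");
            }
            return len;
    }
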
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 37ebe36056eb..95a971080368 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -18,7 +18,6 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_bit.h" 19#include "xfs_bit.h"
20#include "xfs_log.h" 20#include "xfs_log.h"
21#include "xfs_clnt.h"
22#include "xfs_inum.h" 21#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -36,6 +35,7 @@
36#include "xfs_dinode.h" 35#include "xfs_dinode.h"
37#include "xfs_inode.h" 36#include "xfs_inode.h"
38#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
39#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
40#include "xfs_bmap.h" 40#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 41#include "xfs_rtalloc.h"
@@ -48,7 +48,6 @@
48#include "xfs_buf_item.h" 48#include "xfs_buf_item.h"
49#include "xfs_utils.h" 49#include "xfs_utils.h"
50#include "xfs_vnodeops.h" 50#include "xfs_vnodeops.h"
51#include "xfs_vfsops.h"
52#include "xfs_version.h" 51#include "xfs_version.h"
53#include "xfs_log_priv.h" 52#include "xfs_log_priv.h"
54#include "xfs_trans_priv.h" 53#include "xfs_trans_priv.h"
@@ -58,6 +57,7 @@
58#include "xfs_extfree_item.h" 57#include "xfs_extfree_item.h"
59#include "xfs_mru_cache.h" 58#include "xfs_mru_cache.h"
60#include "xfs_inode_item.h" 59#include "xfs_inode_item.h"
60#include "xfs_sync.h"
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
@@ -70,36 +70,9 @@
70 70
71static struct quotactl_ops xfs_quotactl_operations; 71static struct quotactl_ops xfs_quotactl_operations;
72static struct super_operations xfs_super_operations; 72static struct super_operations xfs_super_operations;
73static kmem_zone_t *xfs_vnode_zone;
74static kmem_zone_t *xfs_ioend_zone; 73static kmem_zone_t *xfs_ioend_zone;
75mempool_t *xfs_ioend_pool; 74mempool_t *xfs_ioend_pool;
76 75
77STATIC struct xfs_mount_args *
78xfs_args_allocate(
79 struct super_block *sb,
80 int silent)
81{
82 struct xfs_mount_args *args;
83
84 args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
85 if (!args)
86 return NULL;
87
88 args->logbufs = args->logbufsize = -1;
89 strncpy(args->fsname, sb->s_id, MAXNAMELEN);
90
91 /* Copy the already-parsed mount(2) flags we're interested in */
92 if (sb->s_flags & MS_DIRSYNC)
93 args->flags |= XFSMNT_DIRSYNC;
94 if (sb->s_flags & MS_SYNCHRONOUS)
95 args->flags |= XFSMNT_WSYNC;
96 if (silent)
97 args->flags |= XFSMNT_QUIET;
98 args->flags |= XFSMNT_32BITINODES;
99
100 return args;
101}
102
103#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ 76#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
104#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ 77#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
105#define MNTOPT_LOGDEV "logdev" /* log device */ 78#define MNTOPT_LOGDEV "logdev" /* log device */
@@ -188,26 +161,54 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
188 return simple_strtoul((const char *)s, endp, base) << shift_left_factor; 161 return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
189} 162}
190 163
164/*
165 * This function fills in xfs_mount_t fields based on mount args.
166 * Note: the superblock has _not_ yet been read in.
167 *
168 * Note that this function leaks the various device name allocations on
169 * failure. The caller takes care of them.
170 */
191STATIC int 171STATIC int
192xfs_parseargs( 172xfs_parseargs(
193 struct xfs_mount *mp, 173 struct xfs_mount *mp,
194 char *options, 174 char *options,
195 struct xfs_mount_args *args, 175 char **mtpt)
196 int update)
197{ 176{
177 struct super_block *sb = mp->m_super;
198 char *this_char, *value, *eov; 178 char *this_char, *value, *eov;
199 int dsunit, dswidth, vol_dsunit, vol_dswidth; 179 int dsunit = 0;
200 int iosize; 180 int dswidth = 0;
181 int iosize = 0;
201 int dmapi_implies_ikeep = 1; 182 int dmapi_implies_ikeep = 1;
183 uchar_t iosizelog = 0;
184
185 /*
186 * Copy binary VFS mount flags we are interested in.
187 */
188 if (sb->s_flags & MS_RDONLY)
189 mp->m_flags |= XFS_MOUNT_RDONLY;
190 if (sb->s_flags & MS_DIRSYNC)
191 mp->m_flags |= XFS_MOUNT_DIRSYNC;
192 if (sb->s_flags & MS_SYNCHRONOUS)
193 mp->m_flags |= XFS_MOUNT_WSYNC;
194
195 /*
196 * Set some default flags that could be cleared by the mount option
197 * parsing.
198 */
199 mp->m_flags |= XFS_MOUNT_BARRIER;
200 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
201 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
202 202
203 args->flags |= XFSMNT_BARRIER; 203 /*
204 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 204 * These can be overridden by the mount option parsing.
205 */
206 mp->m_logbufs = -1;
207 mp->m_logbsize = -1;
205 208
206 if (!options) 209 if (!options)
207 goto done; 210 goto done;
208 211
209 iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
210
211 while ((this_char = strsep(&options, ",")) != NULL) { 212 while ((this_char = strsep(&options, ",")) != NULL) {
212 if (!*this_char) 213 if (!*this_char)
213 continue; 214 continue;
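
xfs_parseargs() consumes the option string destructively: strsep() carves it
on commas in place, and each "opt=value" token is then split on the first
'='. A self-contained userspace model of that loop:

    #include <stdio.h>
    #include <string.h>

    static void parse(char *options)        /* must be writable */
    {
            char *this_char, *value;

            while ((this_char = strsep(&options, ",")) != NULL) {
                    if (!*this_char)
                            continue;       /* tolerate ",," */
                    value = strchr(this_char, '=');
                    if (value)
                            *value++ = '\0';
                    printf("option '%s' value '%s'\n",
                           this_char, value ? value : "");
            }
    }

    int main(void)
    {
            char opts[] = "logbufs=8,noalign,logbsize=32k";
            parse(opts);
            return 0;
    }
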
@@ -221,7 +222,7 @@ xfs_parseargs(
221 this_char); 222 this_char);
222 return EINVAL; 223 return EINVAL;
223 } 224 }
224 args->logbufs = simple_strtoul(value, &eov, 10); 225 mp->m_logbufs = simple_strtoul(value, &eov, 10);
225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 226 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
226 if (!value || !*value) { 227 if (!value || !*value) {
227 cmn_err(CE_WARN, 228 cmn_err(CE_WARN,
@@ -229,7 +230,7 @@ xfs_parseargs(
229 this_char); 230 this_char);
230 return EINVAL; 231 return EINVAL;
231 } 232 }
232 args->logbufsize = suffix_strtoul(value, &eov, 10); 233 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
233 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 234 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
234 if (!value || !*value) { 235 if (!value || !*value) {
235 cmn_err(CE_WARN, 236 cmn_err(CE_WARN,
@@ -237,7 +238,9 @@ xfs_parseargs(
237 this_char); 238 this_char);
238 return EINVAL; 239 return EINVAL;
239 } 240 }
240 strncpy(args->logname, value, MAXNAMELEN); 241 mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
242 if (!mp->m_logname)
243 return ENOMEM;
241 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 244 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
242 if (!value || !*value) { 245 if (!value || !*value) {
243 cmn_err(CE_WARN, 246 cmn_err(CE_WARN,
@@ -245,7 +248,9 @@ xfs_parseargs(
245 this_char); 248 this_char);
246 return EINVAL; 249 return EINVAL;
247 } 250 }
248 strncpy(args->mtpt, value, MAXNAMELEN); 251 *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
252 if (!*mtpt)
253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 254 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 cmn_err(CE_WARN,
@@ -253,7 +258,9 @@ xfs_parseargs(
253 this_char); 258 this_char);
254 return EINVAL; 259 return EINVAL;
255 } 260 }
256 strncpy(args->rtname, value, MAXNAMELEN); 261 mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
262 if (!mp->m_rtname)
263 return ENOMEM;
257 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 264 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
258 if (!value || !*value) { 265 if (!value || !*value) {
259 cmn_err(CE_WARN, 266 cmn_err(CE_WARN,
@@ -262,8 +269,7 @@ xfs_parseargs(
262 return EINVAL; 269 return EINVAL;
263 } 270 }
264 iosize = simple_strtoul(value, &eov, 10); 271 iosize = simple_strtoul(value, &eov, 10);
265 args->flags |= XFSMNT_IOSIZE; 272 iosizelog = ffs(iosize) - 1;
266 args->iosizelog = (uint8_t) iosize;
267 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 273 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
268 if (!value || !*value) { 274 if (!value || !*value) {
269 cmn_err(CE_WARN, 275 cmn_err(CE_WARN,
@@ -272,8 +278,7 @@ xfs_parseargs(
272 return EINVAL; 278 return EINVAL;
273 } 279 }
274 iosize = suffix_strtoul(value, &eov, 10); 280 iosize = suffix_strtoul(value, &eov, 10);
275 args->flags |= XFSMNT_IOSIZE; 281 iosizelog = ffs(iosize) - 1;
276 args->iosizelog = ffs(iosize) - 1;
277 } else if (!strcmp(this_char, MNTOPT_GRPID) || 282 } else if (!strcmp(this_char, MNTOPT_GRPID) ||
278 !strcmp(this_char, MNTOPT_BSDGROUPS)) { 283 !strcmp(this_char, MNTOPT_BSDGROUPS)) {
279 mp->m_flags |= XFS_MOUNT_GRPID; 284 mp->m_flags |= XFS_MOUNT_GRPID;
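
Both iosize branches now reduce the byte count to a log2 with
ffs(iosize) - 1, relying on ffs() returning the 1-based index of the least
significant set bit; for a non-power-of-two input this silently keys off the
lowest set bit, and the result is range-checked against XFS_MIN_IO_LOG and
XFS_MAX_IO_LOG at the done: label further down. For example:

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    int main(void)
    {
            /* 4096 == 1 << 12, so ffs() == 13 and the log is 12 */
            printf("%d\n", ffs(4096) - 1);
            return 0;
    }
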
@@ -281,23 +286,25 @@ xfs_parseargs(
281 !strcmp(this_char, MNTOPT_SYSVGROUPS)) { 286 !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
282 mp->m_flags &= ~XFS_MOUNT_GRPID; 287 mp->m_flags &= ~XFS_MOUNT_GRPID;
283 } else if (!strcmp(this_char, MNTOPT_WSYNC)) { 288 } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
284 args->flags |= XFSMNT_WSYNC; 289 mp->m_flags |= XFS_MOUNT_WSYNC;
285 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { 290 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
286 args->flags |= XFSMNT_OSYNCISOSYNC; 291 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
287 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 292 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
288 args->flags |= XFSMNT_NORECOVERY; 293 mp->m_flags |= XFS_MOUNT_NORECOVERY;
289 } else if (!strcmp(this_char, MNTOPT_INO64)) { 294 } else if (!strcmp(this_char, MNTOPT_INO64)) {
290 args->flags |= XFSMNT_INO64; 295#if XFS_BIG_INUMS
291#if !XFS_BIG_INUMS 296 mp->m_flags |= XFS_MOUNT_INO64;
297 mp->m_inoadd = XFS_INO64_OFFSET;
298#else
292 cmn_err(CE_WARN, 299 cmn_err(CE_WARN,
293 "XFS: %s option not allowed on this system", 300 "XFS: %s option not allowed on this system",
294 this_char); 301 this_char);
295 return EINVAL; 302 return EINVAL;
296#endif 303#endif
297 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 304 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
298 args->flags |= XFSMNT_NOALIGN; 305 mp->m_flags |= XFS_MOUNT_NOALIGN;
299 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { 306 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
300 args->flags |= XFSMNT_SWALLOC; 307 mp->m_flags |= XFS_MOUNT_SWALLOC;
301 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 308 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
302 if (!value || !*value) { 309 if (!value || !*value) {
303 cmn_err(CE_WARN, 310 cmn_err(CE_WARN,
@@ -315,7 +322,7 @@ xfs_parseargs(
315 } 322 }
316 dswidth = simple_strtoul(value, &eov, 10); 323 dswidth = simple_strtoul(value, &eov, 10);
317 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 324 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
318 args->flags &= ~XFSMNT_32BITINODES; 325 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
319#if !XFS_BIG_INUMS 326#if !XFS_BIG_INUMS
320 cmn_err(CE_WARN, 327 cmn_err(CE_WARN,
321 "XFS: %s option not allowed on this system", 328 "XFS: %s option not allowed on this system",
@@ -323,56 +330,61 @@ xfs_parseargs(
323 return EINVAL; 330 return EINVAL;
324#endif 331#endif
325 } else if (!strcmp(this_char, MNTOPT_NOUUID)) { 332 } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
326 args->flags |= XFSMNT_NOUUID; 333 mp->m_flags |= XFS_MOUNT_NOUUID;
327 } else if (!strcmp(this_char, MNTOPT_BARRIER)) { 334 } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
328 args->flags |= XFSMNT_BARRIER; 335 mp->m_flags |= XFS_MOUNT_BARRIER;
329 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { 336 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
330 args->flags &= ~XFSMNT_BARRIER; 337 mp->m_flags &= ~XFS_MOUNT_BARRIER;
331 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 338 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
332 args->flags |= XFSMNT_IKEEP; 339 mp->m_flags |= XFS_MOUNT_IKEEP;
333 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 340 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
334 dmapi_implies_ikeep = 0; 341 dmapi_implies_ikeep = 0;
335 args->flags &= ~XFSMNT_IKEEP; 342 mp->m_flags &= ~XFS_MOUNT_IKEEP;
336 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { 343 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
337 args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE; 344 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
338 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { 345 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
339 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 346 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
340 } else if (!strcmp(this_char, MNTOPT_ATTR2)) { 347 } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
341 args->flags |= XFSMNT_ATTR2; 348 mp->m_flags |= XFS_MOUNT_ATTR2;
342 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 349 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
343 args->flags &= ~XFSMNT_ATTR2; 350 mp->m_flags &= ~XFS_MOUNT_ATTR2;
344 args->flags |= XFSMNT_NOATTR2; 351 mp->m_flags |= XFS_MOUNT_NOATTR2;
345 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { 352 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
346 args->flags2 |= XFSMNT2_FILESTREAMS; 353 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
347 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { 354 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
348 args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); 355 mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
349 args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); 356 XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
357 XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
358 XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
350 } else if (!strcmp(this_char, MNTOPT_QUOTA) || 359 } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
351 !strcmp(this_char, MNTOPT_UQUOTA) || 360 !strcmp(this_char, MNTOPT_UQUOTA) ||
352 !strcmp(this_char, MNTOPT_USRQUOTA)) { 361 !strcmp(this_char, MNTOPT_USRQUOTA)) {
353 args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; 362 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
363 XFS_UQUOTA_ENFD);
354 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || 364 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
355 !strcmp(this_char, MNTOPT_UQUOTANOENF)) { 365 !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
356 args->flags |= XFSMNT_UQUOTA; 366 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
357 args->flags &= ~XFSMNT_UQUOTAENF; 367 mp->m_qflags &= ~XFS_UQUOTA_ENFD;
358 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 368 } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
359 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 369 !strcmp(this_char, MNTOPT_PRJQUOTA)) {
360 args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; 370 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
371 XFS_OQUOTA_ENFD);
361 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 372 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
362 args->flags |= XFSMNT_PQUOTA; 373 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
363 args->flags &= ~XFSMNT_PQUOTAENF; 374 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
364 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 375 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
365 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 376 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
366 args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; 377 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
378 XFS_OQUOTA_ENFD);
367 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 379 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
368 args->flags |= XFSMNT_GQUOTA; 380 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
369 args->flags &= ~XFSMNT_GQUOTAENF; 381 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
370 } else if (!strcmp(this_char, MNTOPT_DMAPI)) { 382 } else if (!strcmp(this_char, MNTOPT_DMAPI)) {
371 args->flags |= XFSMNT_DMAPI; 383 mp->m_flags |= XFS_MOUNT_DMAPI;
372 } else if (!strcmp(this_char, MNTOPT_XDSM)) { 384 } else if (!strcmp(this_char, MNTOPT_XDSM)) {
373 args->flags |= XFSMNT_DMAPI; 385 mp->m_flags |= XFS_MOUNT_DMAPI;
374 } else if (!strcmp(this_char, MNTOPT_DMI)) { 386 } else if (!strcmp(this_char, MNTOPT_DMI)) {
375 args->flags |= XFSMNT_DMAPI; 387 mp->m_flags |= XFS_MOUNT_DMAPI;
376 } else if (!strcmp(this_char, "ihashsize")) { 388 } else if (!strcmp(this_char, "ihashsize")) {
377 cmn_err(CE_WARN, 389 cmn_err(CE_WARN,
378 "XFS: ihashsize no longer used, option is deprecated."); 390 "XFS: ihashsize no longer used, option is deprecated.");
@@ -390,27 +402,29 @@ xfs_parseargs(
390 } 402 }
391 } 403 }
392 404
393 if (args->flags & XFSMNT_NORECOVERY) { 405 /*
394 if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) { 406 * no recovery flag requires a read-only mount
395 cmn_err(CE_WARN, 407 */
396 "XFS: no-recovery mounts must be read-only."); 408 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
397 return EINVAL; 409 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
398 } 410 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only.");
411 return EINVAL;
399 } 412 }
400 413
401 if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { 414 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
402 cmn_err(CE_WARN, 415 cmn_err(CE_WARN,
403 "XFS: sunit and swidth options incompatible with the noalign option"); 416 "XFS: sunit and swidth options incompatible with the noalign option");
404 return EINVAL; 417 return EINVAL;
405 } 418 }
406 419
407 if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { 420 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
421 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
408 cmn_err(CE_WARN, 422 cmn_err(CE_WARN,
409 "XFS: cannot mount with both project and group quota"); 423 "XFS: cannot mount with both project and group quota");
410 return EINVAL; 424 return EINVAL;
411 } 425 }
412 426
413 if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') { 427 if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
414 printk("XFS: %s option needs the mount point option as well\n", 428 printk("XFS: %s option needs the mount point option as well\n",
415 MNTOPT_DMAPI); 429 MNTOPT_DMAPI);
416 return EINVAL; 430 return EINVAL;
@@ -438,27 +452,66 @@ xfs_parseargs(
438 * Note that if "ikeep" or "noikeep" mount options are 452 * Note that if "ikeep" or "noikeep" mount options are
439 * supplied, then they are honored. 453 * supplied, then they are honored.
440 */ 454 */
441 if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep) 455 if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
442 args->flags |= XFSMNT_IKEEP; 456 mp->m_flags |= XFS_MOUNT_IKEEP;
443 457
444 if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { 458done:
459 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
460 /*
461 * At this point the superblock has not been read
462 * in, therefore we do not know the block size.
463 * Before the mount call ends we will convert
464 * these to FSBs.
465 */
445 if (dsunit) { 466 if (dsunit) {
446 args->sunit = dsunit; 467 mp->m_dalign = dsunit;
447 args->flags |= XFSMNT_RETERR; 468 mp->m_flags |= XFS_MOUNT_RETERR;
448 } else {
449 args->sunit = vol_dsunit;
450 } 469 }
451 dswidth ? (args->swidth = dswidth) : 470
452 (args->swidth = vol_dswidth); 471 if (dswidth)
453 } else { 472 mp->m_swidth = dswidth;
454 args->sunit = args->swidth = 0; 473 }
474
475 if (mp->m_logbufs != -1 &&
476 mp->m_logbufs != 0 &&
477 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
478 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
479 cmn_err(CE_WARN,
480 "XFS: invalid logbufs value: %d [not %d-%d]",
481 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
482 return XFS_ERROR(EINVAL);
483 }
484 if (mp->m_logbsize != -1 &&
485 mp->m_logbsize != 0 &&
486 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
487 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
488 !is_power_of_2(mp->m_logbsize))) {
489 cmn_err(CE_WARN,
490 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
491 mp->m_logbsize);
492 return XFS_ERROR(EINVAL);
493 }
494
495 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
496 if (!mp->m_fsname)
497 return ENOMEM;
498 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
499
500 if (iosizelog) {
501 if (iosizelog > XFS_MAX_IO_LOG ||
502 iosizelog < XFS_MIN_IO_LOG) {
503 cmn_err(CE_WARN,
504 "XFS: invalid log iosize: %d [not %d-%d]",
505 iosizelog, XFS_MIN_IO_LOG,
506 XFS_MAX_IO_LOG);
507 return XFS_ERROR(EINVAL);
508 }
509
510 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
511 mp->m_readio_log = iosizelog;
512 mp->m_writeio_log = iosizelog;
455 } 513 }
456 514
457done:
458 if (args->flags & XFSMNT_32BITINODES)
459 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
460 if (args->flags2)
461 args->flags |= XFSMNT_FLAGS2;
462 return 0; 515 return 0;
463} 516}
464 517
@@ -704,8 +757,7 @@ xfs_close_devices(
704 */ 757 */
705STATIC int 758STATIC int
706xfs_open_devices( 759xfs_open_devices(
707 struct xfs_mount *mp, 760 struct xfs_mount *mp)
708 struct xfs_mount_args *args)
709{ 761{
710 struct block_device *ddev = mp->m_super->s_bdev; 762 struct block_device *ddev = mp->m_super->s_bdev;
711 struct block_device *logdev = NULL, *rtdev = NULL; 763 struct block_device *logdev = NULL, *rtdev = NULL;
@@ -714,14 +766,14 @@ xfs_open_devices(
714 /* 766 /*
715 * Open real time and log devices - order is important. 767 * Open real time and log devices - order is important.
716 */ 768 */
717 if (args->logname[0]) { 769 if (mp->m_logname) {
718 error = xfs_blkdev_get(mp, args->logname, &logdev); 770 error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
719 if (error) 771 if (error)
720 goto out; 772 goto out;
721 } 773 }
722 774
723 if (args->rtname[0]) { 775 if (mp->m_rtname) {
724 error = xfs_blkdev_get(mp, args->rtname, &rtdev); 776 error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
725 if (error) 777 if (error)
726 goto out_close_logdev; 778 goto out_close_logdev;
727 779
@@ -813,18 +865,18 @@ xfs_setup_devices(
813 */ 865 */
814void 866void
815xfsaild_wakeup( 867xfsaild_wakeup(
816 xfs_mount_t *mp, 868 struct xfs_ail *ailp,
817 xfs_lsn_t threshold_lsn) 869 xfs_lsn_t threshold_lsn)
818{ 870{
819 mp->m_ail.xa_target = threshold_lsn; 871 ailp->xa_target = threshold_lsn;
820 wake_up_process(mp->m_ail.xa_task); 872 wake_up_process(ailp->xa_task);
821} 873}
822 874
823int 875int
824xfsaild( 876xfsaild(
825 void *data) 877 void *data)
826{ 878{
827 xfs_mount_t *mp = (xfs_mount_t *)data; 879 struct xfs_ail *ailp = data;
828 xfs_lsn_t last_pushed_lsn = 0; 880 xfs_lsn_t last_pushed_lsn = 0;
829 long tout = 0; 881 long tout = 0;
830 882
@@ -836,11 +888,11 @@ xfsaild(
836 /* swsusp */ 888 /* swsusp */
837 try_to_freeze(); 889 try_to_freeze();
838 890
839 ASSERT(mp->m_log); 891 ASSERT(ailp->xa_mount->m_log);
840 if (XFS_FORCED_SHUTDOWN(mp)) 892 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
841 continue; 893 continue;
842 894
843 tout = xfsaild_push(mp, &last_pushed_lsn); 895 tout = xfsaild_push(ailp, &last_pushed_lsn);
844 } 896 }
845 897
846 return 0; 898 return 0;
@@ -848,43 +900,82 @@ xfsaild(
848 900
849int 901int
850xfsaild_start( 902xfsaild_start(
851 xfs_mount_t *mp) 903 struct xfs_ail *ailp)
852{ 904{
853 mp->m_ail.xa_target = 0; 905 ailp->xa_target = 0;
854 mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild"); 906 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
855 if (IS_ERR(mp->m_ail.xa_task)) 907 if (IS_ERR(ailp->xa_task))
856 return -PTR_ERR(mp->m_ail.xa_task); 908 return -PTR_ERR(ailp->xa_task);
857 return 0; 909 return 0;
858} 910}
859 911
860void 912void
861xfsaild_stop( 913xfsaild_stop(
862 xfs_mount_t *mp) 914 struct xfs_ail *ailp)
863{ 915{
864 kthread_stop(mp->m_ail.xa_task); 916 kthread_stop(ailp->xa_task);
865} 917}
866 918
867 919
868 920/* Catch misguided souls that try to use this interface on XFS */
869STATIC struct inode * 921STATIC struct inode *
870xfs_fs_alloc_inode( 922xfs_fs_alloc_inode(
871 struct super_block *sb) 923 struct super_block *sb)
872{ 924{
873 return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); 925 BUG();
926 return NULL;
874} 927}
875 928
929/*
930 * Now that the generic code is guaranteed not to be accessing
931 * the linux inode, we can reclaim the inode.
932 */
876STATIC void 933STATIC void
877xfs_fs_destroy_inode( 934xfs_fs_destroy_inode(
878 struct inode *inode) 935 struct inode *inode)
879{ 936{
880 kmem_zone_free(xfs_vnode_zone, inode); 937 xfs_inode_t *ip = XFS_I(inode);
938
939 XFS_STATS_INC(vn_reclaim);
940 if (xfs_reclaim(ip))
941 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
881} 942}
882 943
944/*
945 * Slab object creation initialisation for the XFS inode.
946 * This covers only the idempotent fields in the XFS inode;
947 * all other fields need to be initialised on allocation
 948 * from the slab. This avoids the need to repeatedly initialise
 949 * fields in the xfs inode that are left in the initialised state
950 * when freeing the inode.
951 */
883STATIC void 952STATIC void
884xfs_fs_inode_init_once( 953xfs_fs_inode_init_once(
885 void *vnode) 954 void *inode)
886{ 955{
887 inode_init_once((struct inode *)vnode); 956 struct xfs_inode *ip = inode;
957
958 memset(ip, 0, sizeof(struct xfs_inode));
959
960 /* vfs inode */
961 inode_init_once(VFS_I(ip));
962
963 /* xfs inode */
964 atomic_set(&ip->i_iocount, 0);
965 atomic_set(&ip->i_pincount, 0);
966 spin_lock_init(&ip->i_flags_lock);
967 init_waitqueue_head(&ip->i_ipin_wait);
968 /*
969 * Because we want to use a counting completion, complete
970 * the flush completion once to allow a single access to
971 * the flush completion without blocking.
972 */
973 init_completion(&ip->i_flush);
974 complete(&ip->i_flush);
975
976 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
977 "xfsino", ip->i_ino);
978 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
888} 979}
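
The complete()-once-after-init trick above turns the completion into a
one-count semaphore: wait_for_completion() consumes the count (acquire,
blocking if a flush is in progress) and complete() returns it (release).
Written out as a lock API, with hypothetical flush_* wrappers:

    #include <linux/completion.h>

    static struct completion flush;

    static void flush_lock_init(void)
    {
            init_completion(&flush);
            complete(&flush);               /* count = 1: starts "free" */
    }

    static void flush_lock(void)
    {
            wait_for_completion(&flush);    /* take the single count */
    }

    static void flush_unlock(void)
    {
            complete(&flush);               /* give it back */
    }
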
889 980
890/* 981/*
@@ -898,21 +989,26 @@ xfs_fs_write_inode(
898 struct inode *inode, 989 struct inode *inode,
899 int sync) 990 int sync)
900{ 991{
992 struct xfs_inode *ip = XFS_I(inode);
901 int error = 0; 993 int error = 0;
902 int flags = 0; 994 int flags = 0;
903 995
904 xfs_itrace_entry(XFS_I(inode)); 996 xfs_itrace_entry(ip);
905 if (sync) { 997 if (sync) {
906 filemap_fdatawait(inode->i_mapping); 998 error = xfs_wait_on_pages(ip, 0, -1);
999 if (error)
1000 goto out_error;
907 flags |= FLUSH_SYNC; 1001 flags |= FLUSH_SYNC;
908 } 1002 }
909 error = xfs_inode_flush(XFS_I(inode), flags); 1003 error = xfs_inode_flush(ip, flags);
1004
1005out_error:
910 /* 1006 /*
911 * if we failed to write out the inode then mark 1007 * if we failed to write out the inode then mark
912 * it dirty again so we'll try again later. 1008 * it dirty again so we'll try again later.
913 */ 1009 */
914 if (error) 1010 if (error)
915 mark_inode_dirty_sync(inode); 1011 xfs_mark_inode_dirty_sync(ip);
916 1012
917 return -error; 1013 return -error;
918} 1014}
@@ -923,164 +1019,12 @@ xfs_fs_clear_inode(
923{ 1019{
924 xfs_inode_t *ip = XFS_I(inode); 1020 xfs_inode_t *ip = XFS_I(inode);
925 1021
926 /* 1022 xfs_itrace_entry(ip);
927 * ip can be null when xfs_iget_core calls xfs_idestroy if we 1023 XFS_STATS_INC(vn_rele);
928 * find an inode with di_mode == 0 but without IGET_CREATE set. 1024 XFS_STATS_INC(vn_remove);
929 */ 1025 XFS_STATS_DEC(vn_active);
930 if (ip) {
931 xfs_itrace_entry(ip);
932 XFS_STATS_INC(vn_rele);
933 XFS_STATS_INC(vn_remove);
934 XFS_STATS_INC(vn_reclaim);
935 XFS_STATS_DEC(vn_active);
936
937 xfs_inactive(ip);
938 xfs_iflags_clear(ip, XFS_IMODIFIED);
939 if (xfs_reclaim(ip))
940 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
941 }
942
943 ASSERT(XFS_I(inode) == NULL);
944}
945
946/*
947 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
948 * Doing this has two advantages:
949 * - It saves on stack space, which is tight in certain situations
950 * - It can be used (with care) as a mechanism to avoid deadlocks.
951 * Flushing while allocating in a full filesystem requires both.
952 */
953STATIC void
954xfs_syncd_queue_work(
955 struct xfs_mount *mp,
956 void *data,
957 void (*syncer)(struct xfs_mount *, void *))
958{
959 struct bhv_vfs_sync_work *work;
960
961 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
962 INIT_LIST_HEAD(&work->w_list);
963 work->w_syncer = syncer;
964 work->w_data = data;
965 work->w_mount = mp;
966 spin_lock(&mp->m_sync_lock);
967 list_add_tail(&work->w_list, &mp->m_sync_list);
968 spin_unlock(&mp->m_sync_lock);
969 wake_up_process(mp->m_sync_task);
970}
971
972/*
973 * Flush delayed allocate data, attempting to free up reserved space
974 * from existing allocations. At this point a new allocation attempt
975 * has failed with ENOSPC and we are in the process of scratching our
976 * heads, looking about for more room...
977 */
978STATIC void
979xfs_flush_inode_work(
980 struct xfs_mount *mp,
981 void *arg)
982{
983 struct inode *inode = arg;
984 filemap_flush(inode->i_mapping);
985 iput(inode);
986}
987
988void
989xfs_flush_inode(
990 xfs_inode_t *ip)
991{
992 struct inode *inode = VFS_I(ip);
993
994 igrab(inode);
995 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
996 delay(msecs_to_jiffies(500));
997}
998
999/*
1000 * This is the "bigger hammer" version of xfs_flush_inode_work...
1001 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
1002 */
1003STATIC void
1004xfs_flush_device_work(
1005 struct xfs_mount *mp,
1006 void *arg)
1007{
1008 struct inode *inode = arg;
1009 sync_blockdev(mp->m_super->s_bdev);
1010 iput(inode);
1011}
1012
1013void
1014xfs_flush_device(
1015 xfs_inode_t *ip)
1016{
1017 struct inode *inode = VFS_I(ip);
1018 1026
1019 igrab(inode); 1027 xfs_inactive(ip);
1020 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
1021 delay(msecs_to_jiffies(500));
1022 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
1023}
1024
1025STATIC void
1026xfs_sync_worker(
1027 struct xfs_mount *mp,
1028 void *unused)
1029{
1030 int error;
1031
1032 if (!(mp->m_flags & XFS_MOUNT_RDONLY))
1033 error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
1034 mp->m_sync_seq++;
1035 wake_up(&mp->m_wait_single_sync_task);
1036}
1037
1038STATIC int
1039xfssyncd(
1040 void *arg)
1041{
1042 struct xfs_mount *mp = arg;
1043 long timeleft;
1044 bhv_vfs_sync_work_t *work, *n;
1045 LIST_HEAD (tmp);
1046
1047 set_freezable();
1048 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
1049 for (;;) {
1050 timeleft = schedule_timeout_interruptible(timeleft);
1051 /* swsusp */
1052 try_to_freeze();
1053 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
1054 break;
1055
1056 spin_lock(&mp->m_sync_lock);
1057 /*
1058 * We can get woken by laptop mode, to do a sync -
1059 * that's the (only!) case where the list would be
1060 * empty with time remaining.
1061 */
1062 if (!timeleft || list_empty(&mp->m_sync_list)) {
1063 if (!timeleft)
1064 timeleft = xfs_syncd_centisecs *
1065 msecs_to_jiffies(10);
1066 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
1067 list_add_tail(&mp->m_sync_work.w_list,
1068 &mp->m_sync_list);
1069 }
1070 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
1071 list_move(&work->w_list, &tmp);
1072 spin_unlock(&mp->m_sync_lock);
1073
1074 list_for_each_entry_safe(work, n, &tmp, w_list) {
1075 (*work->w_syncer)(mp, work->w_data);
1076 list_del(&work->w_list);
1077 if (work == &mp->m_sync_work)
1078 continue;
1079 kmem_free(work);
1080 }
1081 }
1082
1083 return 0;
1084} 1028}
1085 1029
1086STATIC void 1030STATIC void
@@ -1099,11 +1043,9 @@ xfs_fs_put_super(
1099 struct xfs_mount *mp = XFS_M(sb); 1043 struct xfs_mount *mp = XFS_M(sb);
1100 struct xfs_inode *rip = mp->m_rootip; 1044 struct xfs_inode *rip = mp->m_rootip;
1101 int unmount_event_flags = 0; 1045 int unmount_event_flags = 0;
1102 int error;
1103
1104 kthread_stop(mp->m_sync_task);
1105 1046
1106 xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI); 1047 xfs_syncd_stop(mp);
1048 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
1107 1049
1108#ifdef HAVE_DMAPI 1050#ifdef HAVE_DMAPI
1109 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1051 if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1128,18 +1070,6 @@ xfs_fs_put_super(
1128 xfs_filestream_unmount(mp); 1070 xfs_filestream_unmount(mp);
1129 1071
1130 XFS_bflush(mp->m_ddev_targp); 1072 XFS_bflush(mp->m_ddev_targp);
1131 error = xfs_unmount_flush(mp, 0);
1132 WARN_ON(error);
1133
1134 /*
1135 * If we're forcing a shutdown, typically because of a media error,
1136 * we want to make sure we invalidate dirty pages that belong to
1137 * referenced vnodes as well.
1138 */
1139 if (XFS_FORCED_SHUTDOWN(mp)) {
1140 error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
1141 ASSERT(error != EFSCORRUPTED);
1142 }
1143 1073
1144 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1074 if (mp->m_flags & XFS_MOUNT_DMAPI) {
1145 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, 1075 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
@@ -1161,7 +1091,7 @@ xfs_fs_write_super(
1161 struct super_block *sb) 1091 struct super_block *sb)
1162{ 1092{
1163 if (!(sb->s_flags & MS_RDONLY)) 1093 if (!(sb->s_flags & MS_RDONLY))
1164 xfs_sync(XFS_M(sb), SYNC_FSDATA); 1094 xfs_sync_fsdata(XFS_M(sb), 0);
1165 sb->s_dirt = 0; 1095 sb->s_dirt = 0;
1166} 1096}
1167 1097
@@ -1172,7 +1102,6 @@ xfs_fs_sync_super(
1172{ 1102{
1173 struct xfs_mount *mp = XFS_M(sb); 1103 struct xfs_mount *mp = XFS_M(sb);
1174 int error; 1104 int error;
1175 int flags;
1176 1105
1177 /* 1106 /*
1178 * Treat a sync operation like a freeze. This is to work 1107 * Treat a sync operation like a freeze. This is to work
@@ -1186,20 +1115,10 @@ xfs_fs_sync_super(
1186 * dirty the Linux inode until after the transaction I/O 1115 * dirty the Linux inode until after the transaction I/O
1187 * completes. 1116 * completes.
1188 */ 1117 */
1189 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) { 1118 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
1190 /* 1119 error = xfs_quiesce_data(mp);
1191 * First stage of freeze - no more writers will make progress 1120 else
1192 * now we are here, so we flush delwri and delalloc buffers 1121 error = xfs_sync_fsdata(mp, 0);
1193 * here, then wait for all I/O to complete. Data is frozen at
1194 * that point. Metadata is not frozen, transactions can still
 1195 * occur here so don't bother flushing the buftarg (i.e.
1196 * SYNC_QUIESCE) because it'll just get dirty again.
1197 */
1198 flags = SYNC_DATA_QUIESCE;
1199 } else
1200 flags = SYNC_FSDATA;
1201
1202 error = xfs_sync(mp, flags);
1203 sb->s_dirt = 0; 1122 sb->s_dirt = 0;
1204 1123
1205 if (unlikely(laptop_mode)) { 1124 if (unlikely(laptop_mode)) {
@@ -1337,9 +1256,8 @@ xfs_fs_remount(
1337 1256
1338 /* rw -> ro */ 1257 /* rw -> ro */
1339 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1258 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1340 xfs_filestream_flush(mp); 1259 xfs_quiesce_data(mp);
1341 xfs_sync(mp, SYNC_DATA_QUIESCE); 1260 xfs_quiesce_attr(mp);
1342 xfs_attr_quiesce(mp);
1343 mp->m_flags |= XFS_MOUNT_RDONLY; 1261 mp->m_flags |= XFS_MOUNT_RDONLY;
1344 } 1262 }
1345 1263
@@ -1348,17 +1266,17 @@ xfs_fs_remount(
1348 1266
1349/* 1267/*
1350 * Second stage of a freeze. The data is already frozen so we only 1268 * Second stage of a freeze. The data is already frozen so we only
1351 * need to take care of themetadata. Once that's done write a dummy 1269 * need to take care of the metadata. Once that's done write a dummy
1352 * record to dirty the log in case of a crash while frozen. 1270 * record to dirty the log in case of a crash while frozen.
1353 */ 1271 */
1354STATIC void 1272STATIC int
1355xfs_fs_lockfs( 1273xfs_fs_freeze(
1356 struct super_block *sb) 1274 struct super_block *sb)
1357{ 1275{
1358 struct xfs_mount *mp = XFS_M(sb); 1276 struct xfs_mount *mp = XFS_M(sb);
1359 1277
1360 xfs_attr_quiesce(mp); 1278 xfs_quiesce_attr(mp);
1361 xfs_fs_log_dummy(mp); 1279 return -xfs_fs_log_dummy(mp);
1362} 1280}
1363 1281
1364STATIC int 1282STATIC int
@@ -1422,175 +1340,28 @@ xfs_fs_setxquota(
1422 1340
1423/* 1341/*
1424 * This function fills in xfs_mount_t fields based on mount args. 1342 * This function fills in xfs_mount_t fields based on mount args.
1425 * Note: the superblock has _not_ yet been read in.
1426 */
1427STATIC int
1428xfs_start_flags(
1429 struct xfs_mount_args *ap,
1430 struct xfs_mount *mp)
1431{
1432 int error;
1433
1434 /* Values are in BBs */
1435 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
1436 /*
1437 * At this point the superblock has not been read
1438 * in, therefore we do not know the block size.
1439 * Before the mount call ends we will convert
1440 * these to FSBs.
1441 */
1442 mp->m_dalign = ap->sunit;
1443 mp->m_swidth = ap->swidth;
1444 }
1445
1446 if (ap->logbufs != -1 &&
1447 ap->logbufs != 0 &&
1448 (ap->logbufs < XLOG_MIN_ICLOGS ||
1449 ap->logbufs > XLOG_MAX_ICLOGS)) {
1450 cmn_err(CE_WARN,
1451 "XFS: invalid logbufs value: %d [not %d-%d]",
1452 ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
1453 return XFS_ERROR(EINVAL);
1454 }
1455 mp->m_logbufs = ap->logbufs;
1456 if (ap->logbufsize != -1 &&
1457 ap->logbufsize != 0 &&
1458 (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
1459 ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
1460 !is_power_of_2(ap->logbufsize))) {
1461 cmn_err(CE_WARN,
1462 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
1463 ap->logbufsize);
1464 return XFS_ERROR(EINVAL);
1465 }
1466
1467 error = ENOMEM;
1468
1469 mp->m_logbsize = ap->logbufsize;
1470 mp->m_fsname_len = strlen(ap->fsname) + 1;
1471
1472 mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
1473 if (!mp->m_fsname)
1474 goto out;
1475
1476 if (ap->rtname[0]) {
1477 mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
1478 if (!mp->m_rtname)
1479 goto out_free_fsname;
1480
1481 }
1482
1483 if (ap->logname[0]) {
1484 mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
1485 if (!mp->m_logname)
1486 goto out_free_rtname;
1487 }
1488
1489 if (ap->flags & XFSMNT_WSYNC)
1490 mp->m_flags |= XFS_MOUNT_WSYNC;
1491#if XFS_BIG_INUMS
1492 if (ap->flags & XFSMNT_INO64) {
1493 mp->m_flags |= XFS_MOUNT_INO64;
1494 mp->m_inoadd = XFS_INO64_OFFSET;
1495 }
1496#endif
1497 if (ap->flags & XFSMNT_RETERR)
1498 mp->m_flags |= XFS_MOUNT_RETERR;
1499 if (ap->flags & XFSMNT_NOALIGN)
1500 mp->m_flags |= XFS_MOUNT_NOALIGN;
1501 if (ap->flags & XFSMNT_SWALLOC)
1502 mp->m_flags |= XFS_MOUNT_SWALLOC;
1503 if (ap->flags & XFSMNT_OSYNCISOSYNC)
1504 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
1505 if (ap->flags & XFSMNT_32BITINODES)
1506 mp->m_flags |= XFS_MOUNT_32BITINODES;
1507
1508 if (ap->flags & XFSMNT_IOSIZE) {
1509 if (ap->iosizelog > XFS_MAX_IO_LOG ||
1510 ap->iosizelog < XFS_MIN_IO_LOG) {
1511 cmn_err(CE_WARN,
1512 "XFS: invalid log iosize: %d [not %d-%d]",
1513 ap->iosizelog, XFS_MIN_IO_LOG,
1514 XFS_MAX_IO_LOG);
1515 return XFS_ERROR(EINVAL);
1516 }
1517
1518 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
1519 mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
1520 }
1521
1522 if (ap->flags & XFSMNT_IKEEP)
1523 mp->m_flags |= XFS_MOUNT_IKEEP;
1524 if (ap->flags & XFSMNT_DIRSYNC)
1525 mp->m_flags |= XFS_MOUNT_DIRSYNC;
1526 if (ap->flags & XFSMNT_ATTR2)
1527 mp->m_flags |= XFS_MOUNT_ATTR2;
1528 if (ap->flags & XFSMNT_NOATTR2)
1529 mp->m_flags |= XFS_MOUNT_NOATTR2;
1530
1531 if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
1532 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
1533
1534 /*
1535 * no recovery flag requires a read-only mount
1536 */
1537 if (ap->flags & XFSMNT_NORECOVERY) {
1538 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1539 cmn_err(CE_WARN,
1540 "XFS: tried to mount a FS read-write without recovery!");
1541 return XFS_ERROR(EINVAL);
1542 }
1543 mp->m_flags |= XFS_MOUNT_NORECOVERY;
1544 }
1545
1546 if (ap->flags & XFSMNT_NOUUID)
1547 mp->m_flags |= XFS_MOUNT_NOUUID;
1548 if (ap->flags & XFSMNT_BARRIER)
1549 mp->m_flags |= XFS_MOUNT_BARRIER;
1550 else
1551 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1552
1553 if (ap->flags2 & XFSMNT2_FILESTREAMS)
1554 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
1555
1556 if (ap->flags & XFSMNT_DMAPI)
1557 mp->m_flags |= XFS_MOUNT_DMAPI;
1558 return 0;
1559
1560
1561 out_free_rtname:
1562 kfree(mp->m_rtname);
1563 out_free_fsname:
1564 kfree(mp->m_fsname);
1565 out:
1566 return error;
1567}
1568
1569/*
1570 * This function fills in xfs_mount_t fields based on mount args.
1571 * Note: the superblock _has_ now been read in. 1343 * Note: the superblock _has_ now been read in.
1572 */ 1344 */
1573STATIC int 1345STATIC int
1574xfs_finish_flags( 1346xfs_finish_flags(
1575 struct xfs_mount_args *ap,
1576 struct xfs_mount *mp) 1347 struct xfs_mount *mp)
1577{ 1348{
1578 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); 1349 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
1579 1350
1580 /* Fail a mount where the logbuf is smaller then the log stripe */ 1351 /* Fail a mount where the logbuf is smaller than the log stripe */
1581 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1352 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1582 if ((ap->logbufsize <= 0) && 1353 if (mp->m_logbsize <= 0 &&
1583 (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { 1354 mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
1584 mp->m_logbsize = mp->m_sb.sb_logsunit; 1355 mp->m_logbsize = mp->m_sb.sb_logsunit;
1585 } else if (ap->logbufsize > 0 && 1356 } else if (mp->m_logbsize > 0 &&
1586 ap->logbufsize < mp->m_sb.sb_logsunit) { 1357 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1587 cmn_err(CE_WARN, 1358 cmn_err(CE_WARN,
1588 "XFS: logbuf size must be greater than or equal to log stripe size"); 1359 "XFS: logbuf size must be greater than or equal to log stripe size");
1589 return XFS_ERROR(EINVAL); 1360 return XFS_ERROR(EINVAL);
1590 } 1361 }
1591 } else { 1362 } else {
1592 /* Fail a mount if the logbuf is larger than 32K */ 1363 /* Fail a mount if the logbuf is larger than 32K */
1593 if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { 1364 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1594 cmn_err(CE_WARN, 1365 cmn_err(CE_WARN,
1595 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1366 "XFS: logbuf size for version 1 logs must be 16K or 32K");
1596 return XFS_ERROR(EINVAL); 1367 return XFS_ERROR(EINVAL);
@@ -1602,7 +1373,7 @@ xfs_finish_flags(
1602 * told by noattr2 to turn it off 1373 * told by noattr2 to turn it off
1603 */ 1374 */
1604 if (xfs_sb_version_hasattr2(&mp->m_sb) && 1375 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
1605 !(ap->flags & XFSMNT_NOATTR2)) 1376 !(mp->m_flags & XFS_MOUNT_NOATTR2))
1606 mp->m_flags |= XFS_MOUNT_ATTR2; 1377 mp->m_flags |= XFS_MOUNT_ATTR2;
1607 1378
1608 /* 1379 /*
@@ -1614,48 +1385,6 @@ xfs_finish_flags(
1614 return XFS_ERROR(EROFS); 1385 return XFS_ERROR(EROFS);
1615 } 1386 }
1616 1387
1617 /*
1618 * check for shared mount.
1619 */
1620 if (ap->flags & XFSMNT_SHARED) {
1621 if (!xfs_sb_version_hasshared(&mp->m_sb))
1622 return XFS_ERROR(EINVAL);
1623
1624 /*
1625 * For IRIX 6.5, shared mounts must have the shared
1626 * version bit set, have the persistent readonly
1627 * field set, must be version 0 and can only be mounted
1628 * read-only.
1629 */
1630 if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
1631 (mp->m_sb.sb_shared_vn != 0))
1632 return XFS_ERROR(EINVAL);
1633
1634 mp->m_flags |= XFS_MOUNT_SHARED;
1635
1636 /*
1637 * Shared XFS V0 can't deal with DMI. Return EINVAL.
1638 */
1639 if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
1640 return XFS_ERROR(EINVAL);
1641 }
1642
1643 if (ap->flags & XFSMNT_UQUOTA) {
1644 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
1645 if (ap->flags & XFSMNT_UQUOTAENF)
1646 mp->m_qflags |= XFS_UQUOTA_ENFD;
1647 }
1648
1649 if (ap->flags & XFSMNT_GQUOTA) {
1650 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
1651 if (ap->flags & XFSMNT_GQUOTAENF)
1652 mp->m_qflags |= XFS_OQUOTA_ENFD;
1653 } else if (ap->flags & XFSMNT_PQUOTA) {
1654 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
1655 if (ap->flags & XFSMNT_PQUOTAENF)
1656 mp->m_qflags |= XFS_OQUOTA_ENFD;
1657 }
1658
1659 return 0; 1388 return 0;
1660} 1389}
1661 1390
@@ -1667,19 +1396,14 @@ xfs_fs_fill_super(
1667{ 1396{
1668 struct inode *root; 1397 struct inode *root;
1669 struct xfs_mount *mp = NULL; 1398 struct xfs_mount *mp = NULL;
1670 struct xfs_mount_args *args;
1671 int flags = 0, error = ENOMEM; 1399 int flags = 0, error = ENOMEM;
1672 1400 char *mtpt = NULL;
1673 args = xfs_args_allocate(sb, silent);
1674 if (!args)
1675 return -ENOMEM;
1676 1401
1677 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 1402 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
1678 if (!mp) 1403 if (!mp)
1679 goto out_free_args; 1404 goto out;
1680 1405
1681 spin_lock_init(&mp->m_sb_lock); 1406 spin_lock_init(&mp->m_sb_lock);
1682 mutex_init(&mp->m_ilock);
1683 mutex_init(&mp->m_growlock); 1407 mutex_init(&mp->m_growlock);
1684 atomic_set(&mp->m_active_trans, 0); 1408 atomic_set(&mp->m_active_trans, 0);
1685 INIT_LIST_HEAD(&mp->m_sync_list); 1409 INIT_LIST_HEAD(&mp->m_sync_list);
@@ -1689,12 +1413,9 @@ xfs_fs_fill_super(
1689 mp->m_super = sb; 1413 mp->m_super = sb;
1690 sb->s_fs_info = mp; 1414 sb->s_fs_info = mp;
1691 1415
1692 if (sb->s_flags & MS_RDONLY) 1416 error = xfs_parseargs(mp, (char *)data, &mtpt);
1693 mp->m_flags |= XFS_MOUNT_RDONLY;
1694
1695 error = xfs_parseargs(mp, (char *)data, args, 0);
1696 if (error) 1417 if (error)
1697 goto out_free_mp; 1418 goto out_free_fsname;
1698 1419
1699 sb_min_blocksize(sb, BBSIZE); 1420 sb_min_blocksize(sb, BBSIZE);
1700 sb->s_xattr = xfs_xattr_handlers; 1421 sb->s_xattr = xfs_xattr_handlers;
@@ -1702,33 +1423,28 @@ xfs_fs_fill_super(
1702 sb->s_qcop = &xfs_quotactl_operations; 1423 sb->s_qcop = &xfs_quotactl_operations;
1703 sb->s_op = &xfs_super_operations; 1424 sb->s_op = &xfs_super_operations;
1704 1425
1705 error = xfs_dmops_get(mp, args); 1426 error = xfs_dmops_get(mp);
1706 if (error) 1427 if (error)
1707 goto out_free_mp; 1428 goto out_free_fsname;
1708 error = xfs_qmops_get(mp, args); 1429 error = xfs_qmops_get(mp);
1709 if (error) 1430 if (error)
1710 goto out_put_dmops; 1431 goto out_put_dmops;
1711 1432
1712 if (args->flags & XFSMNT_QUIET) 1433 if (silent)
1713 flags |= XFS_MFSI_QUIET; 1434 flags |= XFS_MFSI_QUIET;
1714 1435
1715 error = xfs_open_devices(mp, args); 1436 error = xfs_open_devices(mp);
1716 if (error) 1437 if (error)
1717 goto out_put_qmops; 1438 goto out_put_qmops;
1718 1439
1719 if (xfs_icsb_init_counters(mp)) 1440 if (xfs_icsb_init_counters(mp))
1720 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1441 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
1721 1442
1722 /*
1723 * Setup flags based on mount(2) options and then the superblock
1724 */
1725 error = xfs_start_flags(args, mp);
1726 if (error)
1727 goto out_free_fsname;
1728 error = xfs_readsb(mp, flags); 1443 error = xfs_readsb(mp, flags);
1729 if (error) 1444 if (error)
1730 goto out_free_fsname; 1445 goto out_destroy_counters;
1731 error = xfs_finish_flags(args, mp); 1446
1447 error = xfs_finish_flags(mp);
1732 if (error) 1448 if (error)
1733 goto out_free_sb; 1449 goto out_free_sb;
1734 1450
@@ -1747,7 +1463,7 @@ xfs_fs_fill_super(
1747 if (error) 1463 if (error)
1748 goto out_filestream_unmount; 1464 goto out_filestream_unmount;
1749 1465
1750 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); 1466 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
1751 1467
1752 sb->s_dirt = 1; 1468 sb->s_dirt = 1;
1753 sb->s_magic = XFS_SB_MAGIC; 1469 sb->s_magic = XFS_SB_MAGIC;
@@ -1772,35 +1488,31 @@ xfs_fs_fill_super(
1772 goto fail_vnrele; 1488 goto fail_vnrele;
1773 } 1489 }
1774 1490
1775 mp->m_sync_work.w_syncer = xfs_sync_worker; 1491 error = xfs_syncd_init(mp);
1776 mp->m_sync_work.w_mount = mp; 1492 if (error)
1777 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
1778 if (IS_ERR(mp->m_sync_task)) {
1779 error = -PTR_ERR(mp->m_sync_task);
1780 goto fail_vnrele; 1493 goto fail_vnrele;
1781 }
1782 1494
1783 xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); 1495 kfree(mtpt);
1784 1496
1785 kfree(args); 1497 xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
1786 return 0; 1498 return 0;
1787 1499
1788 out_filestream_unmount: 1500 out_filestream_unmount:
1789 xfs_filestream_unmount(mp); 1501 xfs_filestream_unmount(mp);
1790 out_free_sb: 1502 out_free_sb:
1791 xfs_freesb(mp); 1503 xfs_freesb(mp);
1792 out_free_fsname: 1504 out_destroy_counters:
1793 xfs_free_fsname(mp);
1794 xfs_icsb_destroy_counters(mp); 1505 xfs_icsb_destroy_counters(mp);
1795 xfs_close_devices(mp); 1506 xfs_close_devices(mp);
1796 out_put_qmops: 1507 out_put_qmops:
1797 xfs_qmops_put(mp); 1508 xfs_qmops_put(mp);
1798 out_put_dmops: 1509 out_put_dmops:
1799 xfs_dmops_put(mp); 1510 xfs_dmops_put(mp);
1800 out_free_mp: 1511 out_free_fsname:
1512 xfs_free_fsname(mp);
1513 kfree(mtpt);
1801 kfree(mp); 1514 kfree(mp);
1802 out_free_args: 1515 out:
1803 kfree(args);
1804 return -error; 1516 return -error;
1805 1517
1806 fail_vnrele: 1518 fail_vnrele:
@@ -1820,8 +1532,6 @@ xfs_fs_fill_super(
1820 xfs_filestream_unmount(mp); 1532 xfs_filestream_unmount(mp);
1821 1533
1822 XFS_bflush(mp->m_ddev_targp); 1534 XFS_bflush(mp->m_ddev_targp);
1823 error = xfs_unmount_flush(mp, 0);
1824 WARN_ON(error);
1825 1535
1826 xfs_unmountfs(mp); 1536 xfs_unmountfs(mp);
1827 goto out_free_sb; 1537 goto out_free_sb;
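
The reshuffled cleanup labels in this function keep the error unwind strictly last-in, first-out: each goto target releases only what had been set up before the failing step, and the option strings (m_fsname, mtpt) are now freed on every failure path. Condensed from the hunks above, nothing new added:

	error = xfs_readsb(mp, flags);
	if (error)
		goto out_destroy_counters;
	error = xfs_finish_flags(mp);
	if (error)
		goto out_free_sb;
	/* ... */
	return 0;

 out_free_sb:
	xfs_freesb(mp);
 out_destroy_counters:
	xfs_icsb_destroy_counters(mp);
	xfs_close_devices(mp);
 out_put_qmops:
	xfs_qmops_put(mp);
 out_put_dmops:
	xfs_dmops_put(mp);
 out_free_fsname:
	xfs_free_fsname(mp);
	kfree(mtpt);
	kfree(mp);
 out:
	return -error;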
@@ -1847,7 +1557,7 @@ static struct super_operations xfs_super_operations = {
1847 .put_super = xfs_fs_put_super, 1557 .put_super = xfs_fs_put_super,
1848 .write_super = xfs_fs_write_super, 1558 .write_super = xfs_fs_write_super,
1849 .sync_fs = xfs_fs_sync_super, 1559 .sync_fs = xfs_fs_sync_super,
1850 .write_super_lockfs = xfs_fs_lockfs, 1560 .freeze_fs = xfs_fs_freeze,
1851 .statfs = xfs_fs_statfs, 1561 .statfs = xfs_fs_statfs,
1852 .remount_fs = xfs_fs_remount, 1562 .remount_fs = xfs_fs_remount,
1853 .show_options = xfs_fs_show_options, 1563 .show_options = xfs_fs_show_options,
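
The .write_super_lockfs to .freeze_fs change tracks the generic VFS rename of the freeze hook that came in with the v2.6.29-rc1 merge above; the hook also gains an int return so the freeze path can report errors. A minimal sketch of the new wiring (function and table names here are hypothetical, not from this patch):

	static int example_freeze(struct super_block *sb)
	{
		/* quiesce data and metadata before the blockdev is frozen */
		return 0;
	}

	static struct super_operations example_sops = {
		.freeze_fs	= example_freeze,	/* was .write_super_lockfs */
	};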
@@ -1882,10 +1592,19 @@ xfs_alloc_trace_bufs(void)
1882 if (!xfs_bmap_trace_buf) 1592 if (!xfs_bmap_trace_buf)
1883 goto out_free_alloc_trace; 1593 goto out_free_alloc_trace;
1884#endif 1594#endif
1885#ifdef XFS_BMBT_TRACE 1595#ifdef XFS_BTREE_TRACE
1596 xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
1597 KM_MAYFAIL);
1598 if (!xfs_allocbt_trace_buf)
1599 goto out_free_bmap_trace;
1600
1601 xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
1602 if (!xfs_inobt_trace_buf)
1603 goto out_free_allocbt_trace;
1604
1886 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); 1605 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
1887 if (!xfs_bmbt_trace_buf) 1606 if (!xfs_bmbt_trace_buf)
1888 goto out_free_bmap_trace; 1607 goto out_free_inobt_trace;
1889#endif 1608#endif
1890#ifdef XFS_ATTR_TRACE 1609#ifdef XFS_ATTR_TRACE
1891 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); 1610 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
@@ -1907,8 +1626,12 @@ xfs_alloc_trace_bufs(void)
1907 ktrace_free(xfs_attr_trace_buf); 1626 ktrace_free(xfs_attr_trace_buf);
1908 out_free_bmbt_trace: 1627 out_free_bmbt_trace:
1909#endif 1628#endif
1910#ifdef XFS_BMBT_TRACE 1629#ifdef XFS_BTREE_TRACE
1911 ktrace_free(xfs_bmbt_trace_buf); 1630 ktrace_free(xfs_bmbt_trace_buf);
1631 out_free_inobt_trace:
1632 ktrace_free(xfs_inobt_trace_buf);
1633 out_free_allocbt_trace:
1634 ktrace_free(xfs_allocbt_trace_buf);
1912 out_free_bmap_trace: 1635 out_free_bmap_trace:
1913#endif 1636#endif
1914#ifdef XFS_BMAP_TRACE 1637#ifdef XFS_BMAP_TRACE
@@ -1931,8 +1654,10 @@ xfs_free_trace_bufs(void)
1931#ifdef XFS_ATTR_TRACE 1654#ifdef XFS_ATTR_TRACE
1932 ktrace_free(xfs_attr_trace_buf); 1655 ktrace_free(xfs_attr_trace_buf);
1933#endif 1656#endif
1934#ifdef XFS_BMBT_TRACE 1657#ifdef XFS_BTREE_TRACE
1935 ktrace_free(xfs_bmbt_trace_buf); 1658 ktrace_free(xfs_bmbt_trace_buf);
1659 ktrace_free(xfs_inobt_trace_buf);
1660 ktrace_free(xfs_allocbt_trace_buf);
1936#endif 1661#endif
1937#ifdef XFS_BMAP_TRACE 1662#ifdef XFS_BMAP_TRACE
1938 ktrace_free(xfs_bmap_trace_buf); 1663 ktrace_free(xfs_bmap_trace_buf);
@@ -1945,16 +1670,10 @@ xfs_free_trace_bufs(void)
1945STATIC int __init 1670STATIC int __init
1946xfs_init_zones(void) 1671xfs_init_zones(void)
1947{ 1672{
1948 xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
1949 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
1950 KM_ZONE_SPREAD,
1951 xfs_fs_inode_init_once);
1952 if (!xfs_vnode_zone)
1953 goto out;
1954 1673
1955 xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); 1674 xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
1956 if (!xfs_ioend_zone) 1675 if (!xfs_ioend_zone)
1957 goto out_destroy_vnode_zone; 1676 goto out;
1958 1677
1959 xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, 1678 xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
1960 xfs_ioend_zone); 1679 xfs_ioend_zone);
@@ -1970,6 +1689,7 @@ xfs_init_zones(void)
1970 "xfs_bmap_free_item"); 1689 "xfs_bmap_free_item");
1971 if (!xfs_bmap_free_item_zone) 1690 if (!xfs_bmap_free_item_zone)
1972 goto out_destroy_log_ticket_zone; 1691 goto out_destroy_log_ticket_zone;
1692
1973 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), 1693 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
1974 "xfs_btree_cur"); 1694 "xfs_btree_cur");
1975 if (!xfs_btree_cur_zone) 1695 if (!xfs_btree_cur_zone)
@@ -2017,8 +1737,8 @@ xfs_init_zones(void)
2017 1737
2018 xfs_inode_zone = 1738 xfs_inode_zone =
2019 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", 1739 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
2020 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | 1740 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
2021 KM_ZONE_SPREAD, NULL); 1741 xfs_fs_inode_init_once);
2022 if (!xfs_inode_zone) 1742 if (!xfs_inode_zone)
2023 goto out_destroy_efi_zone; 1743 goto out_destroy_efi_zone;
2024 1744
@@ -2066,8 +1786,6 @@ xfs_init_zones(void)
2066 mempool_destroy(xfs_ioend_pool); 1786 mempool_destroy(xfs_ioend_pool);
2067 out_destroy_ioend_zone: 1787 out_destroy_ioend_zone:
2068 kmem_zone_destroy(xfs_ioend_zone); 1788 kmem_zone_destroy(xfs_ioend_zone);
2069 out_destroy_vnode_zone:
2070 kmem_zone_destroy(xfs_vnode_zone);
2071 out: 1789 out:
2072 return -ENOMEM; 1790 return -ENOMEM;
2073} 1791}
@@ -2092,7 +1810,6 @@ xfs_destroy_zones(void)
2092 kmem_zone_destroy(xfs_log_ticket_zone); 1810 kmem_zone_destroy(xfs_log_ticket_zone);
2093 mempool_destroy(xfs_ioend_pool); 1811 mempool_destroy(xfs_ioend_pool);
2094 kmem_zone_destroy(xfs_ioend_zone); 1812 kmem_zone_destroy(xfs_ioend_zone);
2095 kmem_zone_destroy(xfs_vnode_zone);
2096 1813
2097} 1814}
2098 1815
@@ -2100,13 +1817,12 @@ STATIC int __init
2100init_xfs_fs(void) 1817init_xfs_fs(void)
2101{ 1818{
2102 int error; 1819 int error;
2103 static char message[] __initdata = KERN_INFO \
2104 XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
2105 1820
2106 printk(message); 1821 printk(KERN_INFO XFS_VERSION_STRING " with "
1822 XFS_BUILD_OPTIONS " enabled\n");
2107 1823
2108 ktrace_init(64); 1824 ktrace_init(64);
2109 vn_init(); 1825 xfs_ioend_init();
2110 xfs_dir_startup(); 1826 xfs_dir_startup();
2111 1827
2112 error = xfs_init_zones(); 1828 error = xfs_init_zones();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index fe2ef4e6a0f9..d5d776d4cd67 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -20,24 +20,12 @@
20 20
21#include <linux/exportfs.h> 21#include <linux/exportfs.h>
22 22
23#ifdef CONFIG_XFS_DMAPI
24# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops)
25# define vfs_initdmapi() dmapi_init()
26# define vfs_exitdmapi() dmapi_uninit()
27#else
28# define vfs_insertdmapi(vfs) do { } while (0)
29# define vfs_initdmapi() do { } while (0)
30# define vfs_exitdmapi() do { } while (0)
31#endif
32
33#ifdef CONFIG_XFS_QUOTA 23#ifdef CONFIG_XFS_QUOTA
34# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops)
35extern void xfs_qm_init(void); 24extern void xfs_qm_init(void);
36extern void xfs_qm_exit(void); 25extern void xfs_qm_exit(void);
37# define vfs_initquota() xfs_qm_init() 26# define vfs_initquota() xfs_qm_init()
38# define vfs_exitquota() xfs_qm_exit() 27# define vfs_exitquota() xfs_qm_exit()
39#else 28#else
40# define vfs_insertquota(vfs) do { } while (0)
41# define vfs_initquota() do { } while (0) 29# define vfs_initquota() do { } while (0)
42# define vfs_exitquota() do { } while (0) 30# define vfs_exitquota() do { } while (0)
43#endif 31#endif
@@ -101,9 +89,6 @@ struct block_device;
101 89
102extern __uint64_t xfs_max_file_offset(unsigned int); 90extern __uint64_t xfs_max_file_offset(unsigned int);
103 91
104extern void xfs_flush_inode(struct xfs_inode *);
105extern void xfs_flush_device(struct xfs_inode *);
106
107extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 92extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
108 93
109extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
new file mode 100644
index 000000000000..2ed035354c26
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -0,0 +1,762 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_inode.h"
37#include "xfs_dinode.h"
38#include "xfs_error.h"
39#include "xfs_mru_cache.h"
40#include "xfs_filestream.h"
41#include "xfs_vnodeops.h"
42#include "xfs_utils.h"
43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h"
45#include "xfs_rw.h"
46
47#include <linux/kthread.h>
48#include <linux/freezer.h>
49
50/*
51 * Sync all the inodes in the given AG according to the
52 * direction given by the flags.
53 */
54STATIC int
55xfs_sync_inodes_ag(
56 xfs_mount_t *mp,
57 int ag,
58 int flags)
59{
60 xfs_perag_t *pag = &mp->m_perag[ag];
61 int nr_found;
62 uint32_t first_index = 0;
63 int error = 0;
64 int last_error = 0;
65 int fflag = XFS_B_ASYNC;
66
67 if (flags & SYNC_DELWRI)
68 fflag = XFS_B_DELWRI;
69 if (flags & SYNC_WAIT)
70 fflag = 0; /* synchronous overrides all */
71
72 do {
73 struct inode *inode;
74 xfs_inode_t *ip = NULL;
75 int lock_flags = XFS_ILOCK_SHARED;
76
77 /*
78 * Use a gang lookup to find the next inode in the tree:
79 * the tree is sparse, and a gang lookup walks it until it
80 * has found the requested number of objects.
81 */
82 read_lock(&pag->pag_ici_lock);
83 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
84 (void**)&ip, first_index, 1);
85
86 if (!nr_found) {
87 read_unlock(&pag->pag_ici_lock);
88 break;
89 }
90
91 /*
92 * Update the index for the next lookup. Catch overflows
93 * into the next AG range which can occur if we have inodes
94 * in the last block of the AG and we are currently
95 * pointing to the last inode.
96 */
97 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
98 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
99 read_unlock(&pag->pag_ici_lock);
100 break;
101 }
102
103 /* nothing to sync during shutdown */
104 if (XFS_FORCED_SHUTDOWN(mp)) {
105 read_unlock(&pag->pag_ici_lock);
106 return 0;
107 }
108
109 /*
110 * If we can't get a reference on the inode, it must be
111 * in reclaim. Leave it for the reclaim code to flush.
112 */
113 inode = VFS_I(ip);
114 if (!igrab(inode)) {
115 read_unlock(&pag->pag_ici_lock);
116 continue;
117 }
118 read_unlock(&pag->pag_ici_lock);
119
120 /* avoid new or bad inodes */
121 if (is_bad_inode(inode) ||
122 xfs_iflags_test(ip, XFS_INEW)) {
123 IRELE(ip);
124 continue;
125 }
126
127 /*
128 * If we have to flush data or wait for I/O completion
129 * we need to hold the iolock.
130 */
131 if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
132 xfs_ilock(ip, XFS_IOLOCK_SHARED);
133 lock_flags |= XFS_IOLOCK_SHARED;
134 error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
135 if (flags & SYNC_IOWAIT)
136 xfs_ioend_wait(ip);
137 }
138 xfs_ilock(ip, XFS_ILOCK_SHARED);
139
140 if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
141 if (flags & SYNC_WAIT) {
142 xfs_iflock(ip);
143 if (!xfs_inode_clean(ip))
144 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
145 else
146 xfs_ifunlock(ip);
147 } else if (xfs_iflock_nowait(ip)) {
148 if (!xfs_inode_clean(ip))
149 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
150 else
151 xfs_ifunlock(ip);
152 }
153 }
154 xfs_iput(ip, lock_flags);
155
156 if (error)
157 last_error = error;
158 /*
159 * bail out if the filesystem is corrupted.
160 */
161 if (error == EFSCORRUPTED)
162 return XFS_ERROR(error);
163
164 } while (nr_found);
165
166 return last_error;
167}
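
This walk is the stock pattern for iterating a sparse radix tree one object at a time. Stripped of locking and the per-inode work, the loop above reduces to the following (same names as the code, nothing added):

	uint32_t	first_index = 0;
	int		nr_found;

	do {
		xfs_inode_t	*ip = NULL;

		/* fetch at most one inode at or after first_index */
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void **)&ip, first_index, 1);
		if (!nr_found)
			break;

		/* step past it; a wrap below ip means we left this AG */
		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
			break;

		/* ... grab, sync and release ip ... */
	} while (nr_found);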
168
169int
170xfs_sync_inodes(
171 xfs_mount_t *mp,
172 int flags)
173{
174 int error;
175 int last_error;
176 int i;
177 int lflags = XFS_LOG_FORCE;
178
179 if (mp->m_flags & XFS_MOUNT_RDONLY)
180 return 0;
181 error = 0;
182 last_error = 0;
183
184 if (flags & SYNC_WAIT)
185 lflags |= XFS_LOG_SYNC;
186
187 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
188 if (!mp->m_perag[i].pag_ici_init)
189 continue;
190 error = xfs_sync_inodes_ag(mp, i, flags);
191 if (error)
192 last_error = error;
193 if (error == EFSCORRUPTED)
194 break;
195 }
196 if (flags & SYNC_DELWRI)
197 xfs_log_force(mp, 0, lflags);
198
199 return XFS_ERROR(last_error);
200}
201
202STATIC int
203xfs_commit_dummy_trans(
204 struct xfs_mount *mp,
205 uint log_flags)
206{
207 struct xfs_inode *ip = mp->m_rootip;
208 struct xfs_trans *tp;
209 int error;
210
211 /*
212 * Put a dummy transaction in the log to tell recovery
213 * that all others are OK.
214 */
215 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
216 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
217 if (error) {
218 xfs_trans_cancel(tp, 0);
219 return error;
220 }
221
222 xfs_ilock(ip, XFS_ILOCK_EXCL);
223
224 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
225 xfs_trans_ihold(tp, ip);
226 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
227 /* XXX(hch): ignoring the error here.. */
228 error = xfs_trans_commit(tp, 0);
229
230 xfs_iunlock(ip, XFS_ILOCK_EXCL);
231
232 xfs_log_force(mp, 0, log_flags);
233 return 0;
234}
235
236int
237xfs_sync_fsdata(
238 struct xfs_mount *mp,
239 int flags)
240{
241 struct xfs_buf *bp;
242 struct xfs_buf_log_item *bip;
243 int error = 0;
244
245 /*
246 * If this is xfssyncd() then only sync the superblock if we can
247 * lock it without sleeping and it is not pinned.
248 */
249 if (flags & SYNC_BDFLUSH) {
250 ASSERT(!(flags & SYNC_WAIT));
251
252 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
253 if (!bp)
254 goto out;
255
256 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
257 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
258 goto out_brelse;
259 } else {
260 bp = xfs_getsb(mp, 0);
261
262 /*
263 * If the buffer is pinned then push on the log so we won't
264 * get stuck waiting in the write for someone, maybe
265 * ourselves, to flush the log.
266 *
267 * Even though we just pushed the log above, we did not have
268 * the superblock buffer locked at that point so it can
269 * become pinned in between there and here.
270 */
271 if (XFS_BUF_ISPINNED(bp))
272 xfs_log_force(mp, 0, XFS_LOG_FORCE);
273 }
274
275
276 if (flags & SYNC_WAIT)
277 XFS_BUF_UNASYNC(bp);
278 else
279 XFS_BUF_ASYNC(bp);
280
281 return xfs_bwrite(mp, bp);
282
283 out_brelse:
284 xfs_buf_relse(bp);
285 out:
286 return error;
287}
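
The SYNC_BDFLUSH branch is a try-lock fast path: the periodic daemon must never sleep on the superblock buffer, while explicit sync callers may block and push the log if the buffer is pinned. Distilled from the function above:

	if (flags & SYNC_BDFLUSH) {
		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);	/* never sleep */
		if (!bp)
			goto out;	/* busy now, retry next sync period */
	} else {
		bp = xfs_getsb(mp, 0);			/* may block */
		if (XFS_BUF_ISPINNED(bp))
			xfs_log_force(mp, 0, XFS_LOG_FORCE);
	}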
288
289/*
290 * When remounting a filesystem read-only or freezing the filesystem, we have
291 * two phases to execute. This first phase is syncing the data before we
292 * quiesce the filesystem, and the second is flushing all the inodes out after
293 * we've waited for all the transactions created by the first phase to
294 * complete. The second phase ensures that the inodes are written to their
295 * location on disk rather than just existing in transactions in the log. This
296 * means after a quiesce there is no log replay required to write the inodes to
297 * disk (this is the main difference between a sync and a quiesce).
298 */
299/*
300 * First stage of freeze - no writers will make progress now we are here,
301 * so we flush delwri and delalloc buffers here, then wait for all I/O to
302 * complete. Data is frozen at that point. Metadata is not frozen;
303 * transactions can still occur here, so don't bother flushing the buftarg
304 * because it'll just get dirty again.
305 */
306int
307xfs_quiesce_data(
308 struct xfs_mount *mp)
309{
310 int error;
311
312 /* push non-blocking */
313 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
314 XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
315 xfs_filestream_flush(mp);
316
317 /* push and block */
318 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
319 XFS_QM_DQSYNC(mp, SYNC_WAIT);
320
321 /* write superblock and hoover up shutdown errors */
322 error = xfs_sync_fsdata(mp, 0);
323
324 /* flush data-only devices */
325 if (mp->m_rtdev_targp)
326 XFS_bflush(mp->m_rtdev_targp);
327
328 return error;
329}
330
331STATIC void
332xfs_quiesce_fs(
333 struct xfs_mount *mp)
334{
335 int count = 0, pincount;
336
337 xfs_flush_buftarg(mp->m_ddev_targp, 0);
338 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
339
340 /*
341 * This loop must run at least twice. The first pass flushes most
342 * metadata, but doing so generates more metadata (typically directory
343 * updates), which must in turn be flushed and logged before we can
344 * write the unmount record.
345 */
346 do {
347 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
348 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
349 if (!pincount) {
350 delay(50);
351 count++;
352 }
353 } while (count < 2);
354}
355
356/*
357 * Second stage of a quiesce. The data is already synced, now we have to take
358 * care of the metadata. New transactions are already blocked, so we need to
359 * wait for any remaining transactions to drain out before proceeding.
360 */
361void
362xfs_quiesce_attr(
363 struct xfs_mount *mp)
364{
365 int error = 0;
366
367 /* wait for all modifications to complete */
368 while (atomic_read(&mp->m_active_trans) > 0)
369 delay(100);
370
371 /* flush inodes and push all remaining buffers out to disk */
372 xfs_quiesce_fs(mp);
373
374 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
375
376 /* Push the superblock and write an unmount record */
377 error = xfs_log_sbcount(mp, 1);
378 if (error)
379 xfs_fs_cmn_err(CE_WARN, mp,
380 "xfs_attr_quiesce: failed to log sb changes. "
381 "Frozen image may not be consistent.");
382 xfs_log_unmount_write(mp);
383 xfs_unmountfs_writesb(mp);
384}
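
Together these two routines implement the two phases described in the comment block before xfs_quiesce_data(). A sketch of how a freeze path is expected to chain them (the wrapper name is hypothetical; the two helpers are the ones defined in this file):

	int example_freeze(struct xfs_mount *mp)
	{
		int	error;

		error = xfs_quiesce_data(mp);	/* phase 1: flush data, wait for I/O */
		if (!error)
			xfs_quiesce_attr(mp);	/* phase 2: drain transactions,
						 * write inodes to their disk
						 * locations */
		return error;
	}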
385
386/*
387 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
388 * Doing this has two advantages:
389 * - It saves on stack space, which is tight in certain situations
390 * - It can be used (with care) as a mechanism to avoid deadlocks.
391 * Flushing while allocating in a full filesystem requires both.
392 */
393STATIC void
394xfs_syncd_queue_work(
395 struct xfs_mount *mp,
396 void *data,
397 void (*syncer)(struct xfs_mount *, void *))
398{
399 struct bhv_vfs_sync_work *work;
400
401 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
402 INIT_LIST_HEAD(&work->w_list);
403 work->w_syncer = syncer;
404 work->w_data = data;
405 work->w_mount = mp;
406 spin_lock(&mp->m_sync_lock);
407 list_add_tail(&work->w_list, &mp->m_sync_list);
408 spin_unlock(&mp->m_sync_lock);
409 wake_up_process(mp->m_sync_task);
410}
411
412/*
413 * Flush delayed allocate data, attempting to free up reserved space
414 * from existing allocations. At this point a new allocation attempt
415 * has failed with ENOSPC and we are in the process of scratching our
416 * heads, looking about for more room...
417 */
418STATIC void
419xfs_flush_inode_work(
420 struct xfs_mount *mp,
421 void *arg)
422{
423 struct inode *inode = arg;
424 filemap_flush(inode->i_mapping);
425 iput(inode);
426}
427
428void
429xfs_flush_inode(
430 xfs_inode_t *ip)
431{
432 struct inode *inode = VFS_I(ip);
433
434 igrab(inode);
435 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
436 delay(msecs_to_jiffies(500));
437}
438
439/*
440 * This is the "bigger hammer" version of xfs_flush_inode_work...
441 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
442 */
443STATIC void
444xfs_flush_device_work(
445 struct xfs_mount *mp,
446 void *arg)
447{
448 struct inode *inode = arg;
449 sync_blockdev(mp->m_super->s_bdev);
450 iput(inode);
451}
452
453void
454xfs_flush_device(
455 xfs_inode_t *ip)
456{
457 struct inode *inode = VFS_I(ip);
458
459 igrab(inode);
460 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
461 delay(msecs_to_jiffies(500));
462 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
463}
464
465/*
466 * Every sync period we need to unpin all items, reclaim inodes, sync
467 * quota and write out the superblock. We might need to cover the log
468 * to indicate it is idle.
469 */
470STATIC void
471xfs_sync_worker(
472 struct xfs_mount *mp,
473 void *unused)
474{
475 int error;
476
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
479 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
480 /* dgc: errors ignored here */
481 error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
482 error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
483 if (xfs_log_need_covered(mp))
484 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
485 }
486 mp->m_sync_seq++;
487 wake_up(&mp->m_wait_single_sync_task);
488}
489
490STATIC int
491xfssyncd(
492 void *arg)
493{
494 struct xfs_mount *mp = arg;
495 long timeleft;
496 bhv_vfs_sync_work_t *work, *n;
497 LIST_HEAD (tmp);
498
499 set_freezable();
500 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
501 for (;;) {
502 timeleft = schedule_timeout_interruptible(timeleft);
503 /* swsusp */
504 try_to_freeze();
505 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
506 break;
507
508 spin_lock(&mp->m_sync_lock);
509 /*
510 * We can get woken by laptop mode, to do a sync -
511 * that's the (only!) case where the list would be
512 * empty with time remaining.
513 */
514 if (!timeleft || list_empty(&mp->m_sync_list)) {
515 if (!timeleft)
516 timeleft = xfs_syncd_centisecs *
517 msecs_to_jiffies(10);
518 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
519 list_add_tail(&mp->m_sync_work.w_list,
520 &mp->m_sync_list);
521 }
522 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
523 list_move(&work->w_list, &tmp);
524 spin_unlock(&mp->m_sync_lock);
525
526 list_for_each_entry_safe(work, n, &tmp, w_list) {
527 (*work->w_syncer)(mp, work->w_data);
528 list_del(&work->w_list);
529 if (work == &mp->m_sync_work)
530 continue;
531 kmem_free(work);
532 }
533 }
534
535 return 0;
536}
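
xfssyncd() is a conventional freezable kernel thread. Its skeleton, with the work-list plumbing elided and an illustrative 30-second interval substituted for the xfs_syncd_centisecs tunable:

	static int example_syncd(void *arg)
	{
		struct xfs_mount	*mp = arg;

		set_freezable();
		for (;;) {
			schedule_timeout_interruptible(HZ * 30);
			try_to_freeze();	/* cooperate with suspend */
			if (kthread_should_stop() &&
			    list_empty(&mp->m_sync_list))
				break;
			/* ... splice m_sync_list and run each w_syncer ... */
		}
		return 0;
	}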
537
538int
539xfs_syncd_init(
540 struct xfs_mount *mp)
541{
542 mp->m_sync_work.w_syncer = xfs_sync_worker;
543 mp->m_sync_work.w_mount = mp;
544 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
545 if (IS_ERR(mp->m_sync_task))
546 return -PTR_ERR(mp->m_sync_task);
547 return 0;
548}
549
550void
551xfs_syncd_stop(
552 struct xfs_mount *mp)
553{
554 kthread_stop(mp->m_sync_task);
555}
556
557int
558xfs_reclaim_inode(
559 xfs_inode_t *ip,
560 int locked,
561 int sync_mode)
562{
563 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
564
565 /* The hash lock here protects a thread in xfs_iget_core from
566 * racing with us on linking the inode back with a vnode.
567 * Once we have the XFS_IRECLAIM flag set it will not touch
568 * us.
569 */
570 write_lock(&pag->pag_ici_lock);
571 spin_lock(&ip->i_flags_lock);
572 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
573 !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
574 spin_unlock(&ip->i_flags_lock);
575 write_unlock(&pag->pag_ici_lock);
576 if (locked) {
577 xfs_ifunlock(ip);
578 xfs_iunlock(ip, XFS_ILOCK_EXCL);
579 }
580 return 1;
581 }
582 __xfs_iflags_set(ip, XFS_IRECLAIM);
583 spin_unlock(&ip->i_flags_lock);
584 write_unlock(&pag->pag_ici_lock);
585 xfs_put_perag(ip->i_mount, pag);
586
587 /*
588 * If the inode is still dirty, then flush it out. If the inode
589 * is not in the AIL, then it will be OK to flush it delwri as
590 * long as xfs_iflush() does not keep any references to the inode.
591 * We leave that decision up to xfs_iflush() since it has the
592 * knowledge of whether it's OK to simply do a delwri flush of
593 * the inode or whether we need to wait until the inode is
594 * pulled from the AIL.
595 * We get the flush lock regardless, though, just to make sure
596 * we don't free it while it is being flushed.
597 */
598 if (!locked) {
599 xfs_ilock(ip, XFS_ILOCK_EXCL);
600 xfs_iflock(ip);
601 }
602
603 /*
604 * In the case of a forced shutdown we rely on xfs_iflush() to
605 * wait for the inode to be unpinned before returning an error.
606 */
607 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
608 /* synchronize with xfs_iflush_done */
609 xfs_iflock(ip);
610 xfs_ifunlock(ip);
611 }
612
613 xfs_iunlock(ip, XFS_ILOCK_EXCL);
614 xfs_ireclaim(ip);
615 return 0;
616}
617
618/*
619 * We set the inode flag atomically with the radix tree tag.
620 * Once we get tag lookups on the radix tree, this inode flag
621 * can go away.
622 */
623void
624xfs_inode_set_reclaim_tag(
625 xfs_inode_t *ip)
626{
627 xfs_mount_t *mp = ip->i_mount;
628 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
629
630 read_lock(&pag->pag_ici_lock);
631 spin_lock(&ip->i_flags_lock);
632 radix_tree_tag_set(&pag->pag_ici_root,
633 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
634 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
635 spin_unlock(&ip->i_flags_lock);
636 read_unlock(&pag->pag_ici_lock);
637 xfs_put_perag(mp, pag);
638}
639
640void
641__xfs_inode_clear_reclaim_tag(
642 xfs_mount_t *mp,
643 xfs_perag_t *pag,
644 xfs_inode_t *ip)
645{
646 radix_tree_tag_clear(&pag->pag_ici_root,
647 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
648}
649
650void
651xfs_inode_clear_reclaim_tag(
652 xfs_inode_t *ip)
653{
654 xfs_mount_t *mp = ip->i_mount;
655 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
656
657 read_lock(&pag->pag_ici_lock);
658 spin_lock(&ip->i_flags_lock);
659 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
660 spin_unlock(&ip->i_flags_lock);
661 read_unlock(&pag->pag_ici_lock);
662 xfs_put_perag(mp, pag);
663}
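
The inode flag and the radix tree tag are kept in lockstep so reclaim can later walk only the tagged entries rather than the whole tree (the comment above notes the flag becomes redundant once tag lookups exist). Both halves of the idiom, reduced to their core:

	/* publish: tag mirrors the XFS_IRECLAIMABLE inode flag */
	radix_tree_tag_set(&pag->pag_ici_root,
			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);

	/* consume: fetch at most one tagged inode at or after first_index */
	nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
			(void **)&ip, first_index, 1, XFS_ICI_RECLAIM_TAG);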
664
665
666STATIC void
667xfs_reclaim_inodes_ag(
668 xfs_mount_t *mp,
669 int ag,
670 int noblock,
671 int mode)
672{
673 xfs_inode_t *ip = NULL;
674 xfs_perag_t *pag = &mp->m_perag[ag];
675 int nr_found;
676 uint32_t first_index;
677 int skipped;
678
679restart:
680 first_index = 0;
681 skipped = 0;
682 do {
683 /*
684 * Use a gang lookup to find the next inode in the tree:
685 * the tree is sparse, and a gang lookup walks it until it
686 * has found the requested number of objects.
687 */
688 read_lock(&pag->pag_ici_lock);
689 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
690 (void**)&ip, first_index, 1,
691 XFS_ICI_RECLAIM_TAG);
692
693 if (!nr_found) {
694 read_unlock(&pag->pag_ici_lock);
695 break;
696 }
697
698 /*
699 * Update the index for the next lookup. Catch overflows
700 * into the next AG range which can occur if we have inodes
701 * in the last block of the AG and we are currently
702 * pointing to the last inode.
703 */
704 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
705 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
706 read_unlock(&pag->pag_ici_lock);
707 break;
708 }
709
710 /* ignore if already under reclaim */
711 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
712 read_unlock(&pag->pag_ici_lock);
713 continue;
714 }
715
716 if (noblock) {
717 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
718 read_unlock(&pag->pag_ici_lock);
719 continue;
720 }
721 if (xfs_ipincount(ip) ||
722 !xfs_iflock_nowait(ip)) {
723 xfs_iunlock(ip, XFS_ILOCK_EXCL);
724 read_unlock(&pag->pag_ici_lock);
725 continue;
726 }
727 }
728 read_unlock(&pag->pag_ici_lock);
729
730 /*
731 * hmmm - this is an inode already in reclaim. Do
732 * we even bother catching it here?
733 */
734 if (xfs_reclaim_inode(ip, noblock, mode))
735 skipped++;
736 } while (nr_found);
737
738 if (skipped) {
739 delay(1);
740 goto restart;
741 }
742 return;
743
744}
745
746int
747xfs_reclaim_inodes(
748 xfs_mount_t *mp,
749 int noblock,
750 int mode)
751{
752 int i;
753
754 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
755 if (!mp->m_perag[i].pag_ici_init)
756 continue;
757 xfs_reclaim_inodes_ag(mp, i, noblock, mode);
758 }
759 return 0;
760}
761
762
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
new file mode 100644
index 000000000000..5f6de1efe1f6
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_SYNC_H
19#define XFS_SYNC_H 1
20
21struct xfs_mount;
22
23typedef struct bhv_vfs_sync_work {
24 struct list_head w_list;
25 struct xfs_mount *w_mount;
26 void *w_data; /* syncer routine argument */
27 void (*w_syncer)(struct xfs_mount *, void *);
28} bhv_vfs_sync_work_t;
29
30#define SYNC_ATTR 0x0001 /* sync attributes */
31#define SYNC_DELWRI 0x0002 /* look at delayed writes */
32#define SYNC_WAIT 0x0004 /* wait for i/o to complete */
33#define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */
34#define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */
35
36int xfs_syncd_init(struct xfs_mount *mp);
37void xfs_syncd_stop(struct xfs_mount *mp);
38
39int xfs_sync_inodes(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41
42int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp);
44
45void xfs_flush_inode(struct xfs_inode *ip);
46void xfs_flush_device(struct xfs_inode *ip);
47
48int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
49int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
50
51void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54 struct xfs_inode *ip);
55#endif
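
For reference, the flag combinations xfs_sync.c itself passes to these entry points (copied from xfs_quiesce_data() and xfs_quiesce_fs() above):

	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_BDFLUSH);		/* non-blocking push */
	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_WAIT | SYNC_IOWAIT);	/* push and wait for I/O */
	xfs_sync_inodes(mp, SYNC_ATTR | SYNC_WAIT);			/* flush inode metadata */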
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7dacb5bbde3f..916c0ffb6083 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -56,17 +56,6 @@ xfs_stats_clear_proc_handler(
56 56
57static ctl_table xfs_table[] = { 57static ctl_table xfs_table[] = {
58 { 58 {
59 .ctl_name = XFS_RESTRICT_CHOWN,
60 .procname = "restrict_chown",
61 .data = &xfs_params.restrict_chown.val,
62 .maxlen = sizeof(int),
63 .mode = 0644,
64 .proc_handler = &proc_dointvec_minmax,
65 .strategy = &sysctl_intvec,
66 .extra1 = &xfs_params.restrict_chown.min,
67 .extra2 = &xfs_params.restrict_chown.max
68 },
69 {
70 .ctl_name = XFS_SGID_INHERIT, 59 .ctl_name = XFS_SGID_INHERIT,
71 .procname = "irix_sgid_inherit", 60 .procname = "irix_sgid_inherit",
72 .data = &xfs_params.sgid_inherit.val, 61 .data = &xfs_params.sgid_inherit.val,
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index 4aadb8056c37..b9937d450f8e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val {
31} xfs_sysctl_val_t; 31} xfs_sysctl_val_t;
32 32
33typedef struct xfs_param { 33typedef struct xfs_param {
34 xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
35 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is 34 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
36 * not a member of parent dir GID. */ 35 * not a member of parent dir GID. */
37 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ 36 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
@@ -68,7 +67,7 @@ typedef struct xfs_param {
68enum { 67enum {
69 /* XFS_REFCACHE_SIZE = 1 */ 68 /* XFS_REFCACHE_SIZE = 1 */
70 /* XFS_REFCACHE_PURGE = 2 */ 69 /* XFS_REFCACHE_PURGE = 2 */
71 XFS_RESTRICT_CHOWN = 3, 70 /* XFS_RESTRICT_CHOWN = 3 */
72 XFS_SGID_INHERIT = 4, 71 XFS_SGID_INHERIT = 4,
73 XFS_SYMLINK_MODE = 5, 72 XFS_SYMLINK_MODE = 5,
74 XFS_PANIC_MASK = 6, 73 XFS_PANIC_MASK = 6,
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
deleted file mode 100644
index 7e60c7776b1c..000000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ /dev/null
@@ -1,77 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VFS_H__
19#define __XFS_VFS_H__
20
21#include <linux/vfs.h>
22#include "xfs_fs.h"
23
24struct inode;
25
26struct fid;
27struct cred;
28struct seq_file;
29struct super_block;
30struct xfs_inode;
31struct xfs_mount;
32struct xfs_mount_args;
33
34typedef struct kstatfs bhv_statvfs_t;
35
36typedef struct bhv_vfs_sync_work {
37 struct list_head w_list;
38 struct xfs_mount *w_mount;
39 void *w_data; /* syncer routine argument */
40 void (*w_syncer)(struct xfs_mount *, void *);
41} bhv_vfs_sync_work_t;
42
43#define SYNC_ATTR 0x0001 /* sync attributes */
44#define SYNC_CLOSE 0x0002 /* close file system down */
45#define SYNC_DELWRI 0x0004 /* look at delayed writes */
46#define SYNC_WAIT 0x0008 /* wait for i/o to complete */
47#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */
48#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */
49#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
50#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
51#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */
52
53/*
54 * When remounting a filesystem read-only or freezing the filesystem,
55 * we have two phases to execute. This first phase is syncing the data
56 * before we quiesce the filesystem, and the second is flushing all the
57 * inodes out after we've waited for all the transactions created by
58 * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE
59 * to ensure that the inodes are written to their location on disk
60 * rather than just existing in transactions in the log. This means
61 * after a quiesce there is no log replay required to write the inodes
62 * to disk (this is the main difference between a sync and a quiesce).
63 */
64#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT)
65#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
66
67#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
68#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
69#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
70#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
71#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
72#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
73
74#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
75#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
76
77#endif /* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
deleted file mode 100644
index b52528bbbfff..000000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ /dev/null
@@ -1,145 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h"
22
23/*
24 * And this gunk is needed for xfs_mount.h
25 */
26#include "xfs_log.h"
27#include "xfs_trans.h"
28#include "xfs_sb.h"
29#include "xfs_dmapi.h"
30#include "xfs_inum.h"
31#include "xfs_ag.h"
32#include "xfs_mount.h"
33
34
35/*
36 * Dedicated vnode inactive/reclaim sync wait queues.
37 * Prime number of hash buckets since address is used as the key.
38 */
39#define NVSYNC 37
40#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC])
41static wait_queue_head_t vsync[NVSYNC];
42
43void __init
44vn_init(void)
45{
46 int i;
47
48 for (i = 0; i < NVSYNC; i++)
49 init_waitqueue_head(&vsync[i]);
50}
51
52void
53vn_iowait(
54 xfs_inode_t *ip)
55{
56 wait_queue_head_t *wq = vptosync(ip);
57
58 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
59}
60
61void
62vn_iowake(
63 xfs_inode_t *ip)
64{
65 if (atomic_dec_and_test(&ip->i_iocount))
66 wake_up(vptosync(ip));
67}
68
69/*
70 * Volume managers supporting multiple paths can send back ENODEV when the
71 * final path disappears. In this case continuing to fill the page cache
72 * with dirty data which cannot be written out is evil, so prevent that.
73 */
74void
75vn_ioerror(
76 xfs_inode_t *ip,
77 int error,
78 char *f,
79 int l)
80{
81 if (unlikely(error == -ENODEV))
82 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
83}
84
85#ifdef XFS_INODE_TRACE
86
87/*
88 * Reference count of Linux inode if present, -1 if the xfs_inode
89 * has no associated Linux inode.
90 */
91static inline int xfs_icount(struct xfs_inode *ip)
92{
93 struct inode *vp = VFS_I(ip);
94
95 if (vp)
96 return vn_count(vp);
97 return -1;
98}
99
100#define KTRACE_ENTER(ip, vk, s, line, ra) \
101 ktrace_enter( (ip)->i_trace, \
102/* 0 */ (void *)(__psint_t)(vk), \
103/* 1 */ (void *)(s), \
104/* 2 */ (void *)(__psint_t) line, \
105/* 3 */ (void *)(__psint_t)xfs_icount(ip), \
106/* 4 */ (void *)(ra), \
107/* 5 */ NULL, \
108/* 6 */ (void *)(__psint_t)current_cpu(), \
109/* 7 */ (void *)(__psint_t)current_pid(), \
110/* 8 */ (void *)__return_address, \
111/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
112
113/*
114 * Vnode tracing code.
115 */
116void
117_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
118{
119 KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
120}
121
122void
123_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
124{
125 KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
126}
127
128void
129xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
130{
131 KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
132}
133
134void
135_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
136{
137 KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
138}
139
140void
141xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
142{
143 KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
144}
145#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 683ce16210ff..f65983a230d3 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -18,7 +18,10 @@
18#ifndef __XFS_VNODE_H__ 18#ifndef __XFS_VNODE_H__
19#define __XFS_VNODE_H__ 19#define __XFS_VNODE_H__
20 20
21#include "xfs_fs.h"
22
21struct file; 23struct file;
24struct xfs_inode;
22struct xfs_iomap; 25struct xfs_iomap;
23struct attrlist_cursor_kern; 26struct attrlist_cursor_kern;
24 27
@@ -51,40 +54,6 @@ struct attrlist_cursor_kern;
51 Prevent VM access to the pages until 54 Prevent VM access to the pages until
52 the operation completes. */ 55 the operation completes. */
53 56
54
55extern void vn_init(void);
56
57/*
58 * Yeah, these don't take a vnode at all anymore; all this should be
59 * cleaned up at some point.
60 */
61extern void vn_iowait(struct xfs_inode *ip);
62extern void vn_iowake(struct xfs_inode *ip);
63extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
64
65static inline int vn_count(struct inode *vp)
66{
67 return atomic_read(&vp->i_count);
68}
69
70#define IHOLD(ip) \
71do { \
72 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
73 atomic_inc(&(VFS_I(ip)->i_count)); \
74 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
75} while (0)
76
77#define IRELE(ip) \
78do { \
79 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
80 iput(VFS_I(ip)); \
81} while (0)
82
83static inline struct inode *vn_grab(struct inode *vp)
84{
85 return igrab(vp);
86}
87
88/* 57/*
89 * Dealing with bad inodes 58 * Dealing with bad inodes
90 */ 59 */
@@ -121,39 +90,4 @@ static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
121 PAGECACHE_TAG_DIRTY) 90 PAGECACHE_TAG_DIRTY)
122 91
123 92
124/*
125 * Tracking vnode activity.
126 */
127#if defined(XFS_INODE_TRACE)
128
129#define INODE_TRACE_SIZE 16 /* number of trace entries */
130#define INODE_KTRACE_ENTRY 1
131#define INODE_KTRACE_EXIT 2
132#define INODE_KTRACE_HOLD 3
133#define INODE_KTRACE_REF 4
134#define INODE_KTRACE_RELE 5
135
136extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
137extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
138extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
139extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
140extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
141#define xfs_itrace_entry(ip) \
142 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
143#define xfs_itrace_exit(ip) \
144 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
145#define xfs_itrace_exit_tag(ip, tag) \
146 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
147#define xfs_itrace_ref(ip) \
148 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
149
150#else
151#define xfs_itrace_entry(a)
152#define xfs_itrace_exit(a)
153#define xfs_itrace_exit_tag(a, b)
154#define xfs_itrace_hold(a, b, c, d)
155#define xfs_itrace_ref(a)
156#define xfs_itrace_rele(a, b, c, d)
157#endif
158
159#endif /* __XFS_VNODE_H__ */ 93#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index f2705f2fd43c..591ca6602bfb 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 if (brandnewdquot) { 101 if (brandnewdquot) {
102 dqp->dq_flnext = dqp->dq_flprev = dqp; 102 dqp->dq_flnext = dqp->dq_flprev = dqp;
103 mutex_init(&dqp->q_qlock); 103 mutex_init(&dqp->q_qlock);
104 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); 104 init_waitqueue_head(&dqp->q_pinwait);
105 105
106 /* 106 /*
107 * Because we want to use a counting completion, complete 107 * Because we want to use a counting completion, complete
@@ -131,7 +131,7 @@ xfs_qm_dqinit(
131 dqp->q_res_bcount = 0; 131 dqp->q_res_bcount = 0;
132 dqp->q_res_icount = 0; 132 dqp->q_res_icount = 0;
133 dqp->q_res_rtbcount = 0; 133 dqp->q_res_rtbcount = 0;
134 dqp->q_pincount = 0; 134 atomic_set(&dqp->q_pincount, 0);
135 dqp->q_hash = NULL; 135 dqp->q_hash = NULL;
136 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 136 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
137 137
@@ -1221,16 +1221,14 @@ xfs_qm_dqflush(
1221 xfs_dqtrace_entry(dqp, "DQFLUSH"); 1221 xfs_dqtrace_entry(dqp, "DQFLUSH");
1222 1222
1223 /* 1223 /*
1224 * If not dirty, nada. 1224 * If not dirty, or it's pinned and we are not supposed to
1225 * block, nada.
1225 */ 1226 */
1226 if (!XFS_DQ_IS_DIRTY(dqp)) { 1227 if (!XFS_DQ_IS_DIRTY(dqp) ||
1228 (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
1227 xfs_dqfunlock(dqp); 1229 xfs_dqfunlock(dqp);
1228 return (0); 1230 return 0;
1229 } 1231 }
1230
1231 /*
1232 * Cant flush a pinned dquot. Wait for it.
1233 */
1234 xfs_qm_dqunpin_wait(dqp); 1232 xfs_qm_dqunpin_wait(dqp);
1235 1233
1236 /* 1234 /*
@@ -1274,10 +1272,8 @@ xfs_qm_dqflush(
1274 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1272 dqp->dq_flags &= ~(XFS_DQ_DIRTY);
1275 mp = dqp->q_mount; 1273 mp = dqp->q_mount;
1276 1274
1277 /* lsn is 64 bits */ 1275 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1278 spin_lock(&mp->m_ail_lock); 1276 &dqp->q_logitem.qli_item.li_lsn);
1279 dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
1280 spin_unlock(&mp->m_ail_lock);
1281 1277
1282 /* 1278 /*
1283 * Attach an iodone routine so that we can remove this dquot from the 1279 * Attach an iodone routine so that we can remove this dquot from the
@@ -1323,8 +1319,10 @@ xfs_qm_dqflush_done(
1323 xfs_dq_logitem_t *qip) 1319 xfs_dq_logitem_t *qip)
1324{ 1320{
1325 xfs_dquot_t *dqp; 1321 xfs_dquot_t *dqp;
1322 struct xfs_ail *ailp;
1326 1323
1327 dqp = qip->qli_dquot; 1324 dqp = qip->qli_dquot;
1325 ailp = qip->qli_item.li_ailp;
1328 1326
1329 /* 1327 /*
1330 * We only want to pull the item from the AIL if its 1328 * We only want to pull the item from the AIL if its
@@ -1337,15 +1335,12 @@ xfs_qm_dqflush_done(
1337 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && 1335 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1338 qip->qli_item.li_lsn == qip->qli_flush_lsn) { 1336 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1339 1337
1340 spin_lock(&dqp->q_mount->m_ail_lock); 1338 /* xfs_trans_ail_delete() drops the AIL lock. */
1341 /* 1339 spin_lock(&ailp->xa_lock);
1342 * xfs_trans_delete_ail() drops the AIL lock.
1343 */
1344 if (qip->qli_item.li_lsn == qip->qli_flush_lsn) 1340 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1345 xfs_trans_delete_ail(dqp->q_mount, 1341 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
1346 (xfs_log_item_t*)qip);
1347 else 1342 else
1348 spin_unlock(&dqp->q_mount->m_ail_lock); 1343 spin_unlock(&ailp->xa_lock);
1349 } 1344 }
1350 1345
1351 /* 1346 /*
@@ -1375,7 +1370,7 @@ xfs_dqunlock(
1375 mutex_unlock(&(dqp->q_qlock)); 1370 mutex_unlock(&(dqp->q_qlock));
1376 if (dqp->q_logitem.qli_dquot == dqp) { 1371 if (dqp->q_logitem.qli_dquot == dqp) {
1377 /* Once was dqp->q_mount, but might just have been cleared */ 1372 /* Once was dqp->q_mount, but might just have been cleared */
1378 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp, 1373 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
1379 (xfs_log_item_t*)&(dqp->q_logitem)); 1374 (xfs_log_item_t*)&(dqp->q_logitem));
1380 } 1375 }
1381} 1376}
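
The same conversion repeats throughout these quota hunks: the global mp->m_ail_lock is replaced by a per-AIL xa_lock reached through the log item's li_ailp back-pointer, and xfs_trans_delete_ail() becomes xfs_trans_ail_delete(). The resulting idiom, condensed (lip and flush_lsn are placeholder names):

	struct xfs_ail	*ailp = lip->li_ailp;

	spin_lock(&ailp->xa_lock);
	if (lip->li_lsn == flush_lsn)
		xfs_trans_ail_delete(ailp, lip);	/* drops xa_lock */
	else
		spin_unlock(&ailp->xa_lock);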
@@ -1489,7 +1484,7 @@ xfs_qm_dqpurge(
1489 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1484 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
1490 xfs_dqflock(dqp); 1485 xfs_dqflock(dqp);
1491 } 1486 }
1492 ASSERT(dqp->q_pincount == 0); 1487 ASSERT(atomic_read(&dqp->q_pincount) == 0);
1493 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1488 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1494 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1489 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1495 1490
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 8958d0faf8d3..7e455337e2ba 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -83,8 +83,8 @@ typedef struct xfs_dquot {
83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ 83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
84 mutex_t q_qlock; /* quota lock */ 84 mutex_t q_qlock; /* quota lock */
85 struct completion q_flush; /* flush completion queue */ 85 struct completion q_flush; /* flush completion queue */
86 uint q_pincount; /* pin count for this dquot */ 86 atomic_t q_pincount; /* dquot pin count */
87 sv_t q_pinwait; /* sync var for pinning */ 87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88#ifdef XFS_DQUOT_TRACE 88#ifdef XFS_DQUOT_TRACE
89 struct ktrace *q_trace; /* trace header structure */ 89 struct ktrace *q_trace; /* trace header structure */
90#endif 90#endif
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index f028644caa5e..1728f6a7c4f5 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -88,25 +88,22 @@ xfs_qm_dquot_logitem_format(
88 88
89/* 89/*
90 * Increment the pin count of the given dquot. 90 * Increment the pin count of the given dquot.
91 * This value is protected by pinlock spinlock in the xQM structure.
92 */ 91 */
93STATIC void 92STATIC void
94xfs_qm_dquot_logitem_pin( 93xfs_qm_dquot_logitem_pin(
95 xfs_dq_logitem_t *logitem) 94 xfs_dq_logitem_t *logitem)
96{ 95{
97 xfs_dquot_t *dqp; 96 xfs_dquot_t *dqp = logitem->qli_dquot;
98 97
99 dqp = logitem->qli_dquot;
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 98 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 99 atomic_inc(&dqp->q_pincount);
102 dqp->q_pincount++;
103 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
104} 100}
105 101
106/* 102/*
107 * Decrement the pin count of the given dquot, and wake up 103 * Decrement the pin count of the given dquot, and wake up
108 * anyone in xfs_dqwait_unpin() if the count goes to 0. The 104 * anyone in xfs_dqwait_unpin() if the count goes to 0. The
109 * dquot must have been previously pinned with a call to xfs_dqpin(). 105 * dquot must have been previously pinned with a call to
106 * xfs_qm_dquot_logitem_pin().
110 */ 107 */
111/* ARGSUSED */ 108/* ARGSUSED */
112STATIC void 109STATIC void
@@ -114,16 +111,11 @@ xfs_qm_dquot_logitem_unpin(
114 xfs_dq_logitem_t *logitem, 111 xfs_dq_logitem_t *logitem,
115 int stale) 112 int stale)
116{ 113{
117 xfs_dquot_t *dqp; 114 xfs_dquot_t *dqp = logitem->qli_dquot;
118 115
119 dqp = logitem->qli_dquot; 116 ASSERT(atomic_read(&dqp->q_pincount) > 0);
120 ASSERT(dqp->q_pincount > 0); 117 if (atomic_dec_and_test(&dqp->q_pincount))
121 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 118 wake_up(&dqp->q_pinwait);
122 dqp->q_pincount--;
123 if (dqp->q_pincount == 0) {
124 sv_broadcast(&dqp->q_pinwait);
125 }
126 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
127} 119}
128 120
129/* ARGSUSED */ 121/* ARGSUSED */
@@ -193,21 +185,14 @@ xfs_qm_dqunpin_wait(
193 xfs_dquot_t *dqp) 185 xfs_dquot_t *dqp)
194{ 186{
195 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 187 ASSERT(XFS_DQ_IS_LOCKED(dqp));
196 if (dqp->q_pincount == 0) { 188 if (atomic_read(&dqp->q_pincount) == 0)
197 return; 189 return;
198 }
199 190
200 /* 191 /*
201 * Give the log a push so we don't wait here too long. 192 * Give the log a push so we don't wait here too long.
202 */ 193 */
203 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); 194 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
204 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 195 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
205 if (dqp->q_pincount == 0) {
206 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
207 return;
208 }
209 sv_wait(&(dqp->q_pinwait), PINOD,
210 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
211} 196}
212 197
213/* 198/*
@@ -310,7 +295,7 @@ xfs_qm_dquot_logitem_trylock(
310 uint retval; 295 uint retval;
311 296
312 dqp = qip->qli_dquot; 297 dqp = qip->qli_dquot;
313 if (dqp->q_pincount > 0) 298 if (atomic_read(&dqp->q_pincount) > 0)
314 return (XFS_ITEM_PINNED); 299 return (XFS_ITEM_PINNED);
315 300
316 if (! xfs_qm_dqlock_nowait(dqp)) 301 if (! xfs_qm_dqlock_nowait(dqp))
@@ -568,14 +553,16 @@ xfs_qm_qoffend_logitem_committed(
568 xfs_lsn_t lsn) 553 xfs_lsn_t lsn)
569{ 554{
570 xfs_qoff_logitem_t *qfs; 555 xfs_qoff_logitem_t *qfs;
556 struct xfs_ail *ailp;
571 557
572 qfs = qfe->qql_start_lip; 558 qfs = qfe->qql_start_lip;
573 spin_lock(&qfs->qql_item.li_mountp->m_ail_lock); 559 ailp = qfs->qql_item.li_ailp;
560 spin_lock(&ailp->xa_lock);
574 /* 561 /*
575 * Delete the qoff-start logitem from the AIL. 562 * Delete the qoff-start logitem from the AIL.
576 * xfs_trans_delete_ail() drops the AIL lock. 563 * xfs_trans_ail_delete() drops the AIL lock.
577 */ 564 */
578 xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); 565 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
579 kmem_free(qfs); 566 kmem_free(qfs);
580 kmem_free(qfe); 567 kmem_free(qfe);
581 return (xfs_lsn_t)-1; 568 return (xfs_lsn_t)-1;
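
The other thread running through the dquot hunks above is the pin-count rework: the uint counter guarded by qi_pinlock plus an sv_t becomes a lock-free atomic_t paired with a plain wait queue. The whole protocol, using the fields declared in xfs_dquot.h above:

	atomic_inc(&dqp->q_pincount);			/* pin */

	if (atomic_dec_and_test(&dqp->q_pincount))	/* unpin */
		wake_up(&dqp->q_pinwait);

	wait_event(dqp->q_pinwait,			/* wait for unpin */
			atomic_read(&dqp->q_pincount) == 0);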
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index df0ffef9775a..6b13960cf318 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -20,7 +20,6 @@
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_clnt.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -396,13 +395,10 @@ xfs_qm_mount_quotas(
396/* 395/*
397 * Called from the vfsops layer. 396 * Called from the vfsops layer.
398 */ 397 */
399int 398void
400xfs_qm_unmount_quotas( 399xfs_qm_unmount_quotas(
401 xfs_mount_t *mp) 400 xfs_mount_t *mp)
402{ 401{
403 xfs_inode_t *uqp, *gqp;
404 int error = 0;
405
406 /* 402 /*
407 * Release the dquots that root inode, et al might be holding, 403 * Release the dquots that root inode, et al might be holding,
408 * before we flush quotas and blow away the quotainfo structure. 404 * before we flush quotas and blow away the quotainfo structure.
@@ -415,43 +411,18 @@ xfs_qm_unmount_quotas(
415 xfs_qm_dqdetach(mp->m_rsumip); 411 xfs_qm_dqdetach(mp->m_rsumip);
416 412
417 /* 413 /*
418 * Flush out the quota inodes. 414 * Release the quota inodes.
419 */ 415 */
420 uqp = gqp = NULL;
421 if (mp->m_quotainfo) { 416 if (mp->m_quotainfo) {
422 if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) { 417 if (mp->m_quotainfo->qi_uquotaip) {
423 xfs_ilock(uqp, XFS_ILOCK_EXCL); 418 IRELE(mp->m_quotainfo->qi_uquotaip);
424 xfs_iflock(uqp); 419 mp->m_quotainfo->qi_uquotaip = NULL;
425 error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
426 xfs_iunlock(uqp, XFS_ILOCK_EXCL);
427 if (unlikely(error == EFSCORRUPTED)) {
428 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
429 XFS_ERRLEVEL_LOW, mp);
430 goto out;
431 }
432 } 420 }
433 if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) { 421 if (mp->m_quotainfo->qi_gquotaip) {
434 xfs_ilock(gqp, XFS_ILOCK_EXCL); 422 IRELE(mp->m_quotainfo->qi_gquotaip);
435 xfs_iflock(gqp); 423 mp->m_quotainfo->qi_gquotaip = NULL;
436 error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
437 xfs_iunlock(gqp, XFS_ILOCK_EXCL);
438 if (unlikely(error == EFSCORRUPTED)) {
439 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
440 XFS_ERRLEVEL_LOW, mp);
441 goto out;
442 }
443 } 424 }
444 } 425 }
445 if (uqp) {
446 IRELE(uqp);
447 mp->m_quotainfo->qi_uquotaip = NULL;
448 }
449 if (gqp) {
450 IRELE(gqp);
451 mp->m_quotainfo->qi_gquotaip = NULL;
452 }
453out:
454 return XFS_ERROR(error);
455} 426}
456 427
457/* 428/*
@@ -987,14 +958,10 @@ xfs_qm_dqdetach(
987} 958}
988 959
989/* 960/*
990 * This is called by VFS_SYNC and flags arg determines the caller, 961 * This is called to sync quotas. We can be told to use non-blocking
991 * and its motives, as done in xfs_sync. 962 * semantics by either the SYNC_BDFLUSH flag or the absence of the
992 * 963 * SYNC_WAIT flag.
993 * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
994 * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
995 * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
996 */ 964 */
997
998int 965int
999xfs_qm_sync( 966xfs_qm_sync(
1000 xfs_mount_t *mp, 967 xfs_mount_t *mp,
@@ -1137,7 +1104,6 @@ xfs_qm_init_quotainfo(
1137 return error; 1104 return error;
1138 } 1105 }
1139 1106
1140 spin_lock_init(&qinf->qi_pinlock);
1141 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1107 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
1142 qinf->qi_dqreclaims = 0; 1108 qinf->qi_dqreclaims = 0;
1143 1109
@@ -1234,7 +1200,6 @@ xfs_qm_destroy_quotainfo(
1234 */ 1200 */
1235 xfs_qm_rele_quotafs_ref(mp); 1201 xfs_qm_rele_quotafs_ref(mp);
1236 1202
1237 spinlock_destroy(&qi->qi_pinlock);
1238 xfs_qm_list_destroy(&qi->qi_dqlist); 1203 xfs_qm_list_destroy(&qi->qi_dqlist);
1239 1204
1240 if (qi->qi_uquotaip) { 1205 if (qi->qi_uquotaip) {
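qi_pinlock can go away because, in the same series, dquot pinning becomes a per-dquot atomic counter rather than a counter guarded by a filesystem-wide spinlock. A hedged sketch of the lock-free pinning this enables; the q_pincount and q_pinwait field names are assumptions, not shown in this hunk:

	/* sketch: pin and unpin a dquot without any global lock */
	atomic_inc(&dqp->q_pincount);			/* pin */
	if (atomic_dec_and_test(&dqp->q_pincount))	/* unpin */
		wake_up(&dqp->q_pinwait);		/* assumed waitq */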
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 44f25349e478..ddf09166387c 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -106,7 +106,6 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 106typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 107 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 108 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 spinlock_t qi_pinlock; /* dquot pinning lock */
110 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
111 int qi_dqreclaims; /* a change here indicates 110 int qi_dqreclaims; /* a change here indicates
112 a removal in the dqlist */ 111 a removal in the dqlist */
@@ -168,7 +167,7 @@ extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
168extern void xfs_qm_mount_quotas(xfs_mount_t *); 167extern void xfs_qm_mount_quotas(xfs_mount_t *);
169extern int xfs_qm_quotacheck(xfs_mount_t *); 168extern int xfs_qm_quotacheck(xfs_mount_t *);
170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); 169extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
171extern int xfs_qm_unmount_quotas(xfs_mount_t *); 170extern void xfs_qm_unmount_quotas(xfs_mount_t *);
172extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 171extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
173extern int xfs_qm_sync(xfs_mount_t *, int); 172extern int xfs_qm_sync(xfs_mount_t *, int);
174 173
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index eea2e60b456b..bc6c5cca3e12 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -20,7 +20,6 @@
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_clnt.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -51,7 +50,7 @@
51 50
52STATIC void 51STATIC void
53xfs_fill_statvfs_from_dquot( 52xfs_fill_statvfs_from_dquot(
54 bhv_statvfs_t *statp, 53 struct kstatfs *statp,
55 xfs_disk_dquot_t *dp) 54 xfs_disk_dquot_t *dp)
56{ 55{
57 __uint64_t limit; 56 __uint64_t limit;
@@ -88,7 +87,7 @@ xfs_fill_statvfs_from_dquot(
88STATIC void 87STATIC void
89xfs_qm_statvfs( 88xfs_qm_statvfs(
90 xfs_inode_t *ip, 89 xfs_inode_t *ip,
91 bhv_statvfs_t *statp) 90 struct kstatfs *statp)
92{ 91{
93 xfs_mount_t *mp = ip->i_mount; 92 xfs_mount_t *mp = ip->i_mount;
94 xfs_dquot_t *dqp; 93 xfs_dquot_t *dqp;
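With bhv_statvfs_t gone, the quota statfs path fills the generic struct kstatfs directly. A hedged sketch of the clamping xfs_fill_statvfs_from_dquot() performs, simplified to ignore the unit conversion the real function also does:

	/* sketch: report the quota limit as the filesystem size when set */
	limit = dp->d_blk_softlimit ?
		be64_to_cpu(dp->d_blk_softlimit) :
		be64_to_cpu(dp->d_blk_hardlimit);
	if (limit && statp->f_blocks > limit) {
		statp->f_blocks = limit;
		statp->f_bfree =
			(statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
			(statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
	}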
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 1a3b803dfa55..68139b38aede 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -127,7 +127,7 @@ xfs_qm_quotactl(
127 break; 127 break;
128 128
129 case Q_XQUOTASYNC: 129 case Q_XQUOTASYNC:
130 return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL)); 130 return xfs_sync_inodes(mp, SYNC_DELWRI);
131 131
132 default: 132 default:
133 break; 133 break;
@@ -1022,101 +1022,104 @@ xfs_qm_export_flags(
1022 1022
1023 1023
1024/* 1024/*
1025 * Go thru all the inodes in the file system, releasing their dquots. 1025 * Release all the dquots on the inodes in an AG.
1026 * Note that the mount structure gets modified to indicate that quotas are off
1027 * AFTER this, in the case of quotaoff. This also gets called from
1028 * xfs_rootumount.
1029 */ 1026 */
1030void 1027STATIC void
1031xfs_qm_dqrele_all_inodes( 1028xfs_qm_dqrele_inodes_ag(
1032 struct xfs_mount *mp, 1029 xfs_mount_t *mp,
1033 uint flags) 1030 int ag,
1031 uint flags)
1034{ 1032{
1035 xfs_inode_t *ip, *topino; 1033 xfs_inode_t *ip = NULL;
1036 uint ireclaims; 1034 xfs_perag_t *pag = &mp->m_perag[ag];
1037 struct inode *vp; 1035 int first_index = 0;
1038 boolean_t vnode_refd; 1036 int nr_found;
1039 1037
1040 ASSERT(mp->m_quotainfo);
1041
1042 XFS_MOUNT_ILOCK(mp);
1043again:
1044 ip = mp->m_inodes;
1045 if (ip == NULL) {
1046 XFS_MOUNT_IUNLOCK(mp);
1047 return;
1048 }
1049 do { 1038 do {
1050 /* Skip markers inserted by xfs_sync */ 1039 /*
1051 if (ip->i_mount == NULL) { 1040 * use a gang lookup to find the next inode in the tree
1052 ip = ip->i_mnext; 1041 * as the tree is sparse and a gang lookup walks to find
1053 continue; 1042 * the number of objects requested.
1043 */
1044 read_lock(&pag->pag_ici_lock);
1045 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1046 (void**)&ip, first_index, 1);
1047
1048 if (!nr_found) {
1049 read_unlock(&pag->pag_ici_lock);
1050 break;
1054 } 1051 }
1055 /* Root inode, rbmip and rsumip have associated blocks */ 1052
1053 /*
1054 * Update the index for the next lookup. Catch overflows
1055 * into the next AG range which can occur if we have inodes
1056 * in the last block of the AG and we are currently
1057 * pointing to the last inode.
1058 */
1059 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1060 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
1061 read_unlock(&pag->pag_ici_lock);
1062 break;
1063 }
1064
1065 /* skip quota inodes */
1056 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) { 1066 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
1057 ASSERT(ip->i_udquot == NULL); 1067 ASSERT(ip->i_udquot == NULL);
1058 ASSERT(ip->i_gdquot == NULL); 1068 ASSERT(ip->i_gdquot == NULL);
1059 ip = ip->i_mnext; 1069 read_unlock(&pag->pag_ici_lock);
1060 continue; 1070 continue;
1061 } 1071 }
1062 vp = VFS_I(ip); 1072
1063 if (!vp) { 1073 /*
1064 ASSERT(ip->i_udquot == NULL); 1074 * If we can't get a reference on the inode, it must be
1065 ASSERT(ip->i_gdquot == NULL); 1075 * in reclaim. Leave it for the reclaim code to flush.
1066 ip = ip->i_mnext; 1076 */
1077 if (!igrab(VFS_I(ip))) {
1078 read_unlock(&pag->pag_ici_lock);
1067 continue; 1079 continue;
1068 } 1080 }
1069 vnode_refd = B_FALSE; 1081 read_unlock(&pag->pag_ici_lock);
1070 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { 1082
1071 ireclaims = mp->m_ireclaims; 1083 /* avoid new inodes though we shouldn't find any here */
1072 topino = mp->m_inodes; 1084 if (xfs_iflags_test(ip, XFS_INEW)) {
1073 vp = vn_grab(vp); 1085 IRELE(ip);
1074 if (!vp) 1086 continue;
1075 goto again;
1076
1077 XFS_MOUNT_IUNLOCK(mp);
1078 /* XXX restart limit ? */
1079 xfs_ilock(ip, XFS_ILOCK_EXCL);
1080 vnode_refd = B_TRUE;
1081 } else {
1082 ireclaims = mp->m_ireclaims;
1083 topino = mp->m_inodes;
1084 XFS_MOUNT_IUNLOCK(mp);
1085 } 1087 }
1086 1088
1087 /* 1089 xfs_ilock(ip, XFS_ILOCK_EXCL);
1088 * We don't keep the mountlock across the dqrele() call,
1089 * since it can take a while..
1090 */
1091 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 1090 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
1092 xfs_qm_dqrele(ip->i_udquot); 1091 xfs_qm_dqrele(ip->i_udquot);
1093 ip->i_udquot = NULL; 1092 ip->i_udquot = NULL;
1094 } 1093 }
1095 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { 1094 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
1095 ip->i_gdquot) {
1096 xfs_qm_dqrele(ip->i_gdquot); 1096 xfs_qm_dqrele(ip->i_gdquot);
1097 ip->i_gdquot = NULL; 1097 ip->i_gdquot = NULL;
1098 } 1098 }
1099 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1099 xfs_iput(ip, XFS_ILOCK_EXCL);
1100 /* 1100
1101 * Wait until we've dropped the ilock and mountlock to 1101 } while (nr_found);
1102 * do the vn_rele. Or be condemned to an eternity in the 1102}
1103 * inactive code in hell. 1103
1104 */ 1104/*
1105 if (vnode_refd) 1105 * Go thru all the inodes in the file system, releasing their dquots.
1106 IRELE(ip); 1106 * Note that the mount structure gets modified to indicate that quotas are off
1107 XFS_MOUNT_ILOCK(mp); 1107 * AFTER this, in the case of quotaoff. This also gets called from
1108 /* 1108 * xfs_rootumount.
1109 * If an inode was inserted or removed, we gotta 1109 */
1110 * start over again. 1110void
1111 */ 1111xfs_qm_dqrele_all_inodes(
1112 if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) { 1112 struct xfs_mount *mp,
1113 /* XXX use a sentinel */ 1113 uint flags)
1114 goto again; 1114{
1115 } 1115 int i;
1116 ip = ip->i_mnext;
1117 } while (ip != mp->m_inodes);
1118 1116
1119 XFS_MOUNT_IUNLOCK(mp); 1117 ASSERT(mp->m_quotainfo);
1118 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
1119 if (!mp->m_perag[i].pag_ici_init)
1120 continue;
1121 xfs_qm_dqrele_inodes_ag(mp, i, flags);
1122 }
1120} 1123}
1121 1124
1122/*------------------------------------------------------------------------*/ 1125/*------------------------------------------------------------------------*/
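The rewrite above replaces the global m_inodes list walk with a per-AG radix-tree walk. Distilled as a hedged sketch (quota-specific handling trimmed), the iteration pattern is:

	/* sketch: visit every cached inode in one AG via gang lookup */
	xfs_inode_t	*ip;
	int		first_index = 0;
	int		nr_found;

	do {
		read_lock(&pag->pag_ici_lock);
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void **)&ip, first_index, 1);
		if (!nr_found) {
			read_unlock(&pag->pag_ici_lock);
			break;			/* AG exhausted */
		}
		/* advance past this inode; stop if agino wraps to next AG */
		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
			read_unlock(&pag->pag_ici_lock);
			break;
		}
		if (!igrab(VFS_I(ip))) {	/* inode is being reclaimed */
			read_unlock(&pag->pag_ici_lock);
			continue;
		}
		read_unlock(&pag->pag_ici_lock);
		/* ... work on ip, then IRELE(ip) ... */
	} while (nr_found);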
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index c27abef7b84f..ae5482965424 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -18,6 +18,13 @@
18#include <xfs.h> 18#include <xfs.h>
19#include "debug.h" 19#include "debug.h"
20 20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_dmapi.h"
26#include "xfs_mount.h"
27
21static char message[1024]; /* keep it off the stack */ 28static char message[1024]; /* keep it off the stack */
22static DEFINE_SPINLOCK(xfs_err_lock); 29static DEFINE_SPINLOCK(xfs_err_lock);
23 30
@@ -55,22 +62,42 @@ cmn_err(register int level, char *fmt, ...)
55} 62}
56 63
57void 64void
58icmn_err(register int level, char *fmt, va_list ap) 65xfs_fs_vcmn_err(
66 int level,
67 struct xfs_mount *mp,
68 char *fmt,
69 va_list ap)
59{ 70{
60 ulong flags; 71 unsigned long flags;
61 int len; 72 int len = 0;
62 73
63 level &= XFS_ERR_MASK; 74 level &= XFS_ERR_MASK;
64 if(level > XFS_MAX_ERR_LEVEL) 75 if (level > XFS_MAX_ERR_LEVEL)
65 level = XFS_MAX_ERR_LEVEL; 76 level = XFS_MAX_ERR_LEVEL;
77
66 spin_lock_irqsave(&xfs_err_lock,flags); 78 spin_lock_irqsave(&xfs_err_lock,flags);
67 len = vsnprintf(message, sizeof(message), fmt, ap); 79
80 if (mp) {
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
82
83 /*
84 * Skip the printk if we can't print anything useful
85 * due to an over-long device name.
86 */
87 if (len >= sizeof(message))
88 goto out;
89 }
90
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
68 if (len >= sizeof(message)) 92 if (len >= sizeof(message))
69 len = sizeof(message) - 1; 93 len = sizeof(message) - 1;
70 if (message[len-1] == '\n') 94 if (message[len-1] == '\n')
71 message[len-1] = 0; 95 message[len-1] = 0;
96
72 printk("%s%s\n", err_level[level], message); 97 printk("%s%s\n", err_level[level], message);
98 out:
73 spin_unlock_irqrestore(&xfs_err_lock,flags); 99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100
74 BUG_ON(level == CE_PANIC); 101 BUG_ON(level == CE_PANIC);
75} 102}
76 103
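The prefix-then-format pattern introduced above is worth isolating. A hedged, userspace-compilable sketch of the same logic; note it uses snprintf() where the kernel hunk uses sprintf() plus an after-the-fact length check:

	#include <stdarg.h>
	#include <stdio.h>

	static char message[1024];

	static void vlog_prefixed(const char *fsname, const char *fmt,
				  va_list ap)
	{
		int len = 0;

		if (fsname) {
			len = snprintf(message, sizeof(message),
				       "Filesystem \"%s\": ", fsname);
			/* skip entirely if the prefix alone overflows */
			if (len < 0 || len >= (int)sizeof(message))
				return;
		}
		len += vsnprintf(message + len, sizeof(message) - len,
				 fmt, ap);
		if (len >= (int)sizeof(message))
			len = sizeof(message) - 1;
		if (len > 0 && message[len - 1] == '\n')
			message[len - 1] = '\0';
		puts(message);		/* printk() in the kernel */
	}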
@@ -84,5 +111,5 @@ assfail(char *expr, char *file, int line)
84void 111void
85xfs_hex_dump(void *p, int length) 112xfs_hex_dump(void *p, int length)
86{ 113{
87 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); 114 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
88} 115}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 75845f950814..6f4fd37c67af 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -27,8 +27,6 @@
27#define CE_ALERT 1 /* alert */ 27#define CE_ALERT 1 /* alert */
28#define CE_PANIC 0 /* panic */ 28#define CE_PANIC 0 /* panic */
29 29
30extern void icmn_err(int, char *, va_list)
31 __attribute__ ((format (printf, 2, 0)));
32extern void cmn_err(int, char *, ...) 30extern void cmn_err(int, char *, ...)
33 __attribute__ ((format (printf, 2, 3))); 31 __attribute__ ((format (printf, 2, 3)));
34extern void assfail(char *expr, char *f, int l); 32extern void assfail(char *expr, char *f, int l);
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index a34ef05489b1..2d494c26717f 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -113,21 +113,16 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
113void 113void
114ktrace_free(ktrace_t *ktp) 114ktrace_free(ktrace_t *ktp)
115{ 115{
116 int entries_size;
117
118 if (ktp == (ktrace_t *)NULL) 116 if (ktp == (ktrace_t *)NULL)
119 return; 117 return;
120 118
121 /* 119 /*
122 * Special treatment for the Vnode trace buffer. 120 * Special treatment for the Vnode trace buffer.
123 */ 121 */
124 if (ktp->kt_nentries == ktrace_zentries) { 122 if (ktp->kt_nentries == ktrace_zentries)
125 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); 123 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
126 } else { 124 else
127 entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
128
129 kmem_free(ktp->kt_entries); 125 kmem_free(ktp->kt_entries);
130 }
131 126
132 kmem_zone_free(ktrace_hdr_zone, ktp); 127 kmem_zone_free(ktrace_hdr_zone, ktp);
133} 128}
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c989825..17254b529c54 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -30,7 +30,7 @@
30#define XFS_ATTR_TRACE 1 30#define XFS_ATTR_TRACE 1
31#define XFS_BLI_TRACE 1 31#define XFS_BLI_TRACE 1
32#define XFS_BMAP_TRACE 1 32#define XFS_BMAP_TRACE 1
33#define XFS_BMBT_TRACE 1 33#define XFS_BTREE_TRACE 1
34#define XFS_DIR2_TRACE 1 34#define XFS_DIR2_TRACE 1
35#define XFS_DQUOT_TRACE 1 35#define XFS_DQUOT_TRACE 1
36#define XFS_ILOCK_TRACE 1 36#define XFS_ILOCK_TRACE 1
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 91d69338d3b2..a8cdd73999a4 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -758,7 +758,7 @@ xfs_acl_setmode(
758 if (gap && nomask) 758 if (gap && nomask)
759 iattr.ia_mode |= gap->ae_perm << 3; 759 iattr.ia_mode |= gap->ae_perm << 3;
760 760
761 return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred); 761 return xfs_setattr(XFS_I(vp), &iattr, 0);
762} 762}
763 763
764/* 764/*
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 61b292a9fb41..f2e21817a226 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,6 +91,8 @@ typedef struct xfs_agf {
91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
92#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) 92#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
93 93
94extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
95 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
94 96
95/* 97/*
96 * Size of the unlinked inode hash table in the agi. 98 * Size of the unlinked inode hash table in the agi.
@@ -142,6 +144,9 @@ typedef struct xfs_agi {
142#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) 144#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
143#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) 145#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
144 146
147extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
148 xfs_agnumber_t agno, struct xfs_buf **bpp);
149
145/* 150/*
146 * The third a.g. block contains the a.g. freelist, an array 151 * The third a.g. block contains the a.g. freelist, an array
147 * of block pointers to blocks owned by the allocation btree code. 152 * of block pointers to blocks owned by the allocation btree code.
@@ -192,17 +197,23 @@ typedef struct xfs_perag
192 xfs_agino_t pagi_freecount; /* number of free inodes */ 197 xfs_agino_t pagi_freecount; /* number of free inodes */
193 xfs_agino_t pagi_count; /* number of allocated inodes */ 198 xfs_agino_t pagi_count; /* number of allocated inodes */
194 int pagb_count; /* pagb slots in use */ 199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
195#ifdef __KERNEL__ 201#ifdef __KERNEL__
196 spinlock_t pagb_lock; /* lock for pagb_list */ 202 spinlock_t pagb_lock; /* lock for pagb_list */
197#endif 203
198 xfs_perag_busy_t *pagb_list; /* unstable blocks */
199 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 204 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
200 205
201 int pag_ici_init; /* incore inode cache initialised */ 206 int pag_ici_init; /* incore inode cache initialised */
202 rwlock_t pag_ici_lock; /* incore inode lock */ 207 rwlock_t pag_ici_lock; /* incore inode lock */
203 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 208 struct radix_tree_root pag_ici_root; /* incore inode cache root */
209#endif
204} xfs_perag_t; 210} xfs_perag_t;
205 211
212/*
213 * tags for inode radix tree
214 */
215#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
216
206#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 217#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
207#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ 218#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
208 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) 219 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
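XFS_ICI_RECLAIM_TAG is the first radix-tree tag defined for the per-AG inode cache. A hedged sketch of how reclaim code would use it, based only on the pag fields declared above (the exact call site is an assumption):

	/* sketch: mark an inode reclaimable in its AG's inode cache */
	write_lock(&pag->pag_ici_lock);
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(mp, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
	write_unlock(&pag->pag_ici_lock);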
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1956f83489f1..028e44e58ea9 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -90,6 +90,92 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
90 */ 90 */
91 91
92/* 92/*
93 * Lookup the record equal to [bno, len] in the btree given by cur.
94 */
95STATIC int /* error */
96xfs_alloc_lookup_eq(
97 struct xfs_btree_cur *cur, /* btree cursor */
98 xfs_agblock_t bno, /* starting block of extent */
99 xfs_extlen_t len, /* length of extent */
100 int *stat) /* success/failure */
101{
102 cur->bc_rec.a.ar_startblock = bno;
103 cur->bc_rec.a.ar_blockcount = len;
104 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
105}
106
107/*
108 * Lookup the first record greater than or equal to [bno, len]
109 * in the btree given by cur.
110 */
111STATIC int /* error */
112xfs_alloc_lookup_ge(
113 struct xfs_btree_cur *cur, /* btree cursor */
114 xfs_agblock_t bno, /* starting block of extent */
115 xfs_extlen_t len, /* length of extent */
116 int *stat) /* success/failure */
117{
118 cur->bc_rec.a.ar_startblock = bno;
119 cur->bc_rec.a.ar_blockcount = len;
120 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
121}
122
123/*
124 * Lookup the first record less than or equal to [bno, len]
125 * in the btree given by cur.
126 */
127STATIC int /* error */
128xfs_alloc_lookup_le(
129 struct xfs_btree_cur *cur, /* btree cursor */
130 xfs_agblock_t bno, /* starting block of extent */
131 xfs_extlen_t len, /* length of extent */
132 int *stat) /* success/failure */
133{
134 cur->bc_rec.a.ar_startblock = bno;
135 cur->bc_rec.a.ar_blockcount = len;
136 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
137}
138
139/*
140 * Update the record referred to by cur to the value given
141 * by [bno, len].
142 * This either works (return 0) or gets an EFSCORRUPTED error.
143 */
144STATIC int /* error */
145xfs_alloc_update(
146 struct xfs_btree_cur *cur, /* btree cursor */
147 xfs_agblock_t bno, /* starting block of extent */
148 xfs_extlen_t len) /* length of extent */
149{
150 union xfs_btree_rec rec;
151
152 rec.alloc.ar_startblock = cpu_to_be32(bno);
153 rec.alloc.ar_blockcount = cpu_to_be32(len);
154 return xfs_btree_update(cur, &rec);
155}
156
157/*
158 * Get the data from the pointed-to record.
159 */
160STATIC int /* error */
161xfs_alloc_get_rec(
162 struct xfs_btree_cur *cur, /* btree cursor */
163 xfs_agblock_t *bno, /* output: starting block of extent */
164 xfs_extlen_t *len, /* output: length of extent */
165 int *stat) /* output: success/failure */
166{
167 union xfs_btree_rec *rec;
168 int error;
169
170 error = xfs_btree_get_rec(cur, &rec, stat);
171 if (!error && *stat == 1) {
172 *bno = be32_to_cpu(rec->alloc.ar_startblock);
173 *len = be32_to_cpu(rec->alloc.ar_blockcount);
174 }
175 return error;
176}
177
178/*
93 * Compute aligned version of the found extent. 179 * Compute aligned version of the found extent.
94 * Takes alignment and min length into account. 180 * Takes alignment and min length into account.
95 */ 181 */
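The new wrappers keep the union xfs_btree_rec plumbing out of the allocator proper. A hedged usage sketch, modelled on the by-size search they support:

	/* sketch: find the first free extent of at least maxlen blocks */
	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
		return error;
	if (i) {
		if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
			return error;
		/* fbno/flen now describe the candidate extent */
	}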
@@ -294,21 +380,20 @@ xfs_alloc_fixup_trees(
294 return error; 380 return error;
295 XFS_WANT_CORRUPTED_RETURN(i == 1); 381 XFS_WANT_CORRUPTED_RETURN(i == 1);
296 } 382 }
383
297#ifdef DEBUG 384#ifdef DEBUG
298 { 385 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
299 xfs_alloc_block_t *bnoblock; 386 struct xfs_btree_block *bnoblock;
300 xfs_alloc_block_t *cntblock; 387 struct xfs_btree_block *cntblock;
301 388
302 if (bno_cur->bc_nlevels == 1 && 389 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
303 cnt_cur->bc_nlevels == 1) { 390 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
304 bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]); 391
305 cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]); 392 XFS_WANT_CORRUPTED_RETURN(
306 XFS_WANT_CORRUPTED_RETURN( 393 bnoblock->bb_numrecs == cntblock->bb_numrecs);
307 be16_to_cpu(bnoblock->bb_numrecs) ==
308 be16_to_cpu(cntblock->bb_numrecs));
309 }
310 } 394 }
311#endif 395#endif
396
312 /* 397 /*
313 * Deal with all four cases: the allocated record is contained 398 * Deal with all four cases: the allocated record is contained
314 * within the freespace record, so we can have new freespace 399 * within the freespace record, so we can have new freespace
@@ -333,7 +418,7 @@ xfs_alloc_fixup_trees(
333 /* 418 /*
334 * Delete the entry from the by-size btree. 419 * Delete the entry from the by-size btree.
335 */ 420 */
336 if ((error = xfs_alloc_delete(cnt_cur, &i))) 421 if ((error = xfs_btree_delete(cnt_cur, &i)))
337 return error; 422 return error;
338 XFS_WANT_CORRUPTED_RETURN(i == 1); 423 XFS_WANT_CORRUPTED_RETURN(i == 1);
339 /* 424 /*
@@ -343,7 +428,7 @@ xfs_alloc_fixup_trees(
343 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 428 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
344 return error; 429 return error;
345 XFS_WANT_CORRUPTED_RETURN(i == 0); 430 XFS_WANT_CORRUPTED_RETURN(i == 0);
346 if ((error = xfs_alloc_insert(cnt_cur, &i))) 431 if ((error = xfs_btree_insert(cnt_cur, &i)))
347 return error; 432 return error;
348 XFS_WANT_CORRUPTED_RETURN(i == 1); 433 XFS_WANT_CORRUPTED_RETURN(i == 1);
349 } 434 }
@@ -351,7 +436,7 @@ xfs_alloc_fixup_trees(
351 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 436 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
352 return error; 437 return error;
353 XFS_WANT_CORRUPTED_RETURN(i == 0); 438 XFS_WANT_CORRUPTED_RETURN(i == 0);
354 if ((error = xfs_alloc_insert(cnt_cur, &i))) 439 if ((error = xfs_btree_insert(cnt_cur, &i)))
355 return error; 440 return error;
356 XFS_WANT_CORRUPTED_RETURN(i == 1); 441 XFS_WANT_CORRUPTED_RETURN(i == 1);
357 } 442 }
@@ -362,7 +447,7 @@ xfs_alloc_fixup_trees(
362 /* 447 /*
363 * No remaining freespace, just delete the by-block tree entry. 448 * No remaining freespace, just delete the by-block tree entry.
364 */ 449 */
365 if ((error = xfs_alloc_delete(bno_cur, &i))) 450 if ((error = xfs_btree_delete(bno_cur, &i)))
366 return error; 451 return error;
367 XFS_WANT_CORRUPTED_RETURN(i == 1); 452 XFS_WANT_CORRUPTED_RETURN(i == 1);
368 } else { 453 } else {
@@ -379,7 +464,7 @@ xfs_alloc_fixup_trees(
379 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 464 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
380 return error; 465 return error;
381 XFS_WANT_CORRUPTED_RETURN(i == 0); 466 XFS_WANT_CORRUPTED_RETURN(i == 0);
382 if ((error = xfs_alloc_insert(bno_cur, &i))) 467 if ((error = xfs_btree_insert(bno_cur, &i)))
383 return error; 468 return error;
384 XFS_WANT_CORRUPTED_RETURN(i == 1); 469 XFS_WANT_CORRUPTED_RETURN(i == 1);
385 } 470 }
@@ -640,8 +725,8 @@ xfs_alloc_ag_vextent_exact(
640 /* 725 /*
641 * Allocate/initialize a cursor for the by-number freespace btree. 726 * Allocate/initialize a cursor for the by-number freespace btree.
642 */ 727 */
643 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 728 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
644 args->agno, XFS_BTNUM_BNO, NULL, 0); 729 args->agno, XFS_BTNUM_BNO);
645 /* 730 /*
646 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 731 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
647 * Look for the closest free block <= bno, it must contain bno 732 * Look for the closest free block <= bno, it must contain bno
@@ -696,8 +781,8 @@ xfs_alloc_ag_vextent_exact(
696 * We are allocating agbno for rlen [agbno .. end] 781 * We are allocating agbno for rlen [agbno .. end]
697 * Allocate/initialize a cursor for the by-size btree. 782 * Allocate/initialize a cursor for the by-size btree.
698 */ 783 */
699 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 784 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
700 args->agno, XFS_BTNUM_CNT, NULL, 0); 785 args->agno, XFS_BTNUM_CNT);
701 ASSERT(args->agbno + args->len <= 786 ASSERT(args->agbno + args->len <=
702 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 787 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
703 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 788 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
@@ -759,8 +844,8 @@ xfs_alloc_ag_vextent_near(
759 /* 844 /*
760 * Get a cursor for the by-size btree. 845 * Get a cursor for the by-size btree.
761 */ 846 */
762 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 847 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
763 args->agno, XFS_BTNUM_CNT, NULL, 0); 848 args->agno, XFS_BTNUM_CNT);
764 ltlen = 0; 849 ltlen = 0;
765 bno_cur_lt = bno_cur_gt = NULL; 850 bno_cur_lt = bno_cur_gt = NULL;
766 /* 851 /*
@@ -818,7 +903,7 @@ xfs_alloc_ag_vextent_near(
818 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 903 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
819 if (ltlen >= args->minlen) 904 if (ltlen >= args->minlen)
820 break; 905 break;
821 if ((error = xfs_alloc_increment(cnt_cur, 0, &i))) 906 if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
822 goto error0; 907 goto error0;
823 } while (i); 908 } while (i);
824 ASSERT(ltlen >= args->minlen); 909 ASSERT(ltlen >= args->minlen);
@@ -828,7 +913,7 @@ xfs_alloc_ag_vextent_near(
828 i = cnt_cur->bc_ptrs[0]; 913 i = cnt_cur->bc_ptrs[0];
829 for (j = 1, blen = 0, bdiff = 0; 914 for (j = 1, blen = 0, bdiff = 0;
830 !error && j && (blen < args->maxlen || bdiff > 0); 915 !error && j && (blen < args->maxlen || bdiff > 0);
831 error = xfs_alloc_increment(cnt_cur, 0, &j)) { 916 error = xfs_btree_increment(cnt_cur, 0, &j)) {
832 /* 917 /*
833 * For each entry, decide if it's better than 918 * For each entry, decide if it's better than
834 * the previous best entry. 919 * the previous best entry.
@@ -886,8 +971,8 @@ xfs_alloc_ag_vextent_near(
886 /* 971 /*
887 * Set up a cursor for the by-bno tree. 972 * Set up a cursor for the by-bno tree.
888 */ 973 */
889 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, 974 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
890 args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0); 975 args->agbp, args->agno, XFS_BTNUM_BNO);
891 /* 976 /*
892 * Fix up the btree entries. 977 * Fix up the btree entries.
893 */ 978 */
@@ -914,8 +999,8 @@ xfs_alloc_ag_vextent_near(
914 /* 999 /*
915 * Allocate and initialize the cursor for the leftward search. 1000 * Allocate and initialize the cursor for the leftward search.
916 */ 1001 */
917 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1002 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
918 args->agno, XFS_BTNUM_BNO, NULL, 0); 1003 args->agno, XFS_BTNUM_BNO);
919 /* 1004 /*
920 * Lookup <= bno to find the leftward search's starting point. 1005 * Lookup <= bno to find the leftward search's starting point.
921 */ 1006 */
@@ -938,7 +1023,7 @@ xfs_alloc_ag_vextent_near(
938 * Increment the cursor, so we will point at the entry just right 1023 * Increment the cursor, so we will point at the entry just right
939 * of the leftward entry if any, or to the leftmost entry. 1024 * of the leftward entry if any, or to the leftmost entry.
940 */ 1025 */
941 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 1026 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
942 goto error0; 1027 goto error0;
943 if (!i) { 1028 if (!i) {
944 /* 1029 /*
@@ -961,7 +1046,7 @@ xfs_alloc_ag_vextent_near(
961 args->minlen, &ltbnoa, &ltlena); 1046 args->minlen, &ltbnoa, &ltlena);
962 if (ltlena >= args->minlen) 1047 if (ltlena >= args->minlen)
963 break; 1048 break;
964 if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) 1049 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
965 goto error0; 1050 goto error0;
966 if (!i) { 1051 if (!i) {
967 xfs_btree_del_cursor(bno_cur_lt, 1052 xfs_btree_del_cursor(bno_cur_lt,
@@ -977,7 +1062,7 @@ xfs_alloc_ag_vextent_near(
977 args->minlen, &gtbnoa, &gtlena); 1062 args->minlen, &gtbnoa, &gtlena);
978 if (gtlena >= args->minlen) 1063 if (gtlena >= args->minlen)
979 break; 1064 break;
980 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 1065 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
981 goto error0; 1066 goto error0;
982 if (!i) { 1067 if (!i) {
983 xfs_btree_del_cursor(bno_cur_gt, 1068 xfs_btree_del_cursor(bno_cur_gt,
@@ -1066,7 +1151,7 @@ xfs_alloc_ag_vextent_near(
1066 /* 1151 /*
1067 * Fell off the right end. 1152 * Fell off the right end.
1068 */ 1153 */
1069 if ((error = xfs_alloc_increment( 1154 if ((error = xfs_btree_increment(
1070 bno_cur_gt, 0, &i))) 1155 bno_cur_gt, 0, &i)))
1071 goto error0; 1156 goto error0;
1072 if (!i) { 1157 if (!i) {
@@ -1162,7 +1247,7 @@ xfs_alloc_ag_vextent_near(
1162 /* 1247 /*
1163 * Fell off the left end. 1248 * Fell off the left end.
1164 */ 1249 */
1165 if ((error = xfs_alloc_decrement( 1250 if ((error = xfs_btree_decrement(
1166 bno_cur_lt, 0, &i))) 1251 bno_cur_lt, 0, &i)))
1167 goto error0; 1252 goto error0;
1168 if (!i) { 1253 if (!i) {
@@ -1267,8 +1352,8 @@ xfs_alloc_ag_vextent_size(
1267 /* 1352 /*
1268 * Allocate and initialize a cursor for the by-size btree. 1353 * Allocate and initialize a cursor for the by-size btree.
1269 */ 1354 */
1270 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1355 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1271 args->agno, XFS_BTNUM_CNT, NULL, 0); 1356 args->agno, XFS_BTNUM_CNT);
1272 bno_cur = NULL; 1357 bno_cur = NULL;
1273 /* 1358 /*
1274 * Look for an entry >= maxlen+alignment-1 blocks. 1359 * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1321,7 +1406,7 @@ xfs_alloc_ag_vextent_size(
1321 bestflen = flen; 1406 bestflen = flen;
1322 bestfbno = fbno; 1407 bestfbno = fbno;
1323 for (;;) { 1408 for (;;) {
1324 if ((error = xfs_alloc_decrement(cnt_cur, 0, &i))) 1409 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1325 goto error0; 1410 goto error0;
1326 if (i == 0) 1411 if (i == 0)
1327 break; 1412 break;
@@ -1372,8 +1457,8 @@ xfs_alloc_ag_vextent_size(
1372 /* 1457 /*
1373 * Allocate and initialize a cursor for the by-block tree. 1458 * Allocate and initialize a cursor for the by-block tree.
1374 */ 1459 */
1375 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1460 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1376 args->agno, XFS_BTNUM_BNO, NULL, 0); 1461 args->agno, XFS_BTNUM_BNO);
1377 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1462 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1378 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1463 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1379 goto error0; 1464 goto error0;
@@ -1416,7 +1501,7 @@ xfs_alloc_ag_vextent_small(
1416 xfs_extlen_t flen; 1501 xfs_extlen_t flen;
1417 int i; 1502 int i;
1418 1503
1419 if ((error = xfs_alloc_decrement(ccur, 0, &i))) 1504 if ((error = xfs_btree_decrement(ccur, 0, &i)))
1420 goto error0; 1505 goto error0;
1421 if (i) { 1506 if (i) {
1422 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) 1507 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1515,8 +1600,7 @@ xfs_free_ag_extent(
1515 /* 1600 /*
1516 * Allocate and initialize a cursor for the by-block btree. 1601 * Allocate and initialize a cursor for the by-block btree.
1517 */ 1602 */
1518 bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL, 1603 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1519 0);
1520 cnt_cur = NULL; 1604 cnt_cur = NULL;
1521 /* 1605 /*
1522 * Look for a neighboring block on the left (lower block numbers) 1606 * Look for a neighboring block on the left (lower block numbers)
@@ -1549,7 +1633,7 @@ xfs_free_ag_extent(
1549 * Look for a neighboring block on the right (higher block numbers) 1633 * Look for a neighboring block on the right (higher block numbers)
1550 * that is contiguous with this space. 1634 * that is contiguous with this space.
1551 */ 1635 */
1552 if ((error = xfs_alloc_increment(bno_cur, 0, &haveright))) 1636 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1553 goto error0; 1637 goto error0;
1554 if (haveright) { 1638 if (haveright) {
1555 /* 1639 /*
@@ -1575,8 +1659,7 @@ xfs_free_ag_extent(
1575 /* 1659 /*
1576 * Now allocate and initialize a cursor for the by-size tree. 1660 * Now allocate and initialize a cursor for the by-size tree.
1577 */ 1661 */
1578 cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL, 1662 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1579 0);
1580 /* 1663 /*
1581 * Have both left and right contiguous neighbors. 1664 * Have both left and right contiguous neighbors.
1582 * Merge all three into a single free block. 1665 * Merge all three into a single free block.
@@ -1588,7 +1671,7 @@ xfs_free_ag_extent(
1588 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1671 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1589 goto error0; 1672 goto error0;
1590 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1673 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1591 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1674 if ((error = xfs_btree_delete(cnt_cur, &i)))
1592 goto error0; 1675 goto error0;
1593 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1676 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1594 /* 1677 /*
@@ -1597,19 +1680,19 @@ xfs_free_ag_extent(
1597 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1680 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1598 goto error0; 1681 goto error0;
1599 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1682 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1600 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1683 if ((error = xfs_btree_delete(cnt_cur, &i)))
1601 goto error0; 1684 goto error0;
1602 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1685 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1603 /* 1686 /*
1604 * Delete the old by-block entry for the right block. 1687 * Delete the old by-block entry for the right block.
1605 */ 1688 */
1606 if ((error = xfs_alloc_delete(bno_cur, &i))) 1689 if ((error = xfs_btree_delete(bno_cur, &i)))
1607 goto error0; 1690 goto error0;
1608 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1691 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1609 /* 1692 /*
1610 * Move the by-block cursor back to the left neighbor. 1693 * Move the by-block cursor back to the left neighbor.
1611 */ 1694 */
1612 if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) 1695 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1613 goto error0; 1696 goto error0;
1614 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1697 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1615#ifdef DEBUG 1698#ifdef DEBUG
@@ -1648,14 +1731,14 @@ xfs_free_ag_extent(
1648 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1731 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1649 goto error0; 1732 goto error0;
1650 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1733 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1651 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1734 if ((error = xfs_btree_delete(cnt_cur, &i)))
1652 goto error0; 1735 goto error0;
1653 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1736 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1654 /* 1737 /*
1655 * Back up the by-block cursor to the left neighbor, and 1738 * Back up the by-block cursor to the left neighbor, and
1656 * update its length. 1739 * update its length.
1657 */ 1740 */
1658 if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) 1741 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1659 goto error0; 1742 goto error0;
1660 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1743 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1661 nbno = ltbno; 1744 nbno = ltbno;
@@ -1674,7 +1757,7 @@ xfs_free_ag_extent(
1674 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1757 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1675 goto error0; 1758 goto error0;
1676 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1759 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1677 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1760 if ((error = xfs_btree_delete(cnt_cur, &i)))
1678 goto error0; 1761 goto error0;
1679 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1762 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1680 /* 1763 /*
@@ -1693,7 +1776,7 @@ xfs_free_ag_extent(
1693 else { 1776 else {
1694 nbno = bno; 1777 nbno = bno;
1695 nlen = len; 1778 nlen = len;
1696 if ((error = xfs_alloc_insert(bno_cur, &i))) 1779 if ((error = xfs_btree_insert(bno_cur, &i)))
1697 goto error0; 1780 goto error0;
1698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1781 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1699 } 1782 }
@@ -1705,7 +1788,7 @@ xfs_free_ag_extent(
1705 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 1788 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1706 goto error0; 1789 goto error0;
1707 XFS_WANT_CORRUPTED_GOTO(i == 0, error0); 1790 XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
1708 if ((error = xfs_alloc_insert(cnt_cur, &i))) 1791 if ((error = xfs_btree_insert(cnt_cur, &i)))
1709 goto error0; 1792 goto error0;
1710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1793 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1711 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1794 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
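Every xfs_alloc_delete/insert/increment/decrement call in the hunks above becomes a call into the generic btree layer. The dominant idiom in xfs_free_ag_extent() is delete-then-reinsert, since a record's length is part of its by-size key; as a hedged sketch with hypothetical bno/oldlen/newlen:

	/* sketch: resize a by-size record by delete + reinsert */
	if ((error = xfs_alloc_lookup_eq(cnt_cur, bno, oldlen, &i)))
		goto error0;
	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);	/* must exist */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		goto error0;
	if ((error = xfs_alloc_lookup_eq(cnt_cur, bno, newlen, &i)))
		goto error0;
	XFS_WANT_CORRUPTED_GOTO(i == 0, error0);	/* must be absent */
	if ((error = xfs_btree_insert(cnt_cur, &i)))
		goto error0;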
@@ -2150,51 +2233,83 @@ xfs_alloc_put_freelist(
2150 * Read in the allocation group header (free/alloc section). 2233 * Read in the allocation group header (free/alloc section).
2151 */ 2234 */
2152int /* error */ 2235int /* error */
2153xfs_alloc_read_agf( 2236xfs_read_agf(
2154 xfs_mount_t *mp, /* mount point structure */ 2237 struct xfs_mount *mp, /* mount point structure */
2155 xfs_trans_t *tp, /* transaction pointer */ 2238 struct xfs_trans *tp, /* transaction pointer */
2156 xfs_agnumber_t agno, /* allocation group number */ 2239 xfs_agnumber_t agno, /* allocation group number */
2157 int flags, /* XFS_ALLOC_FLAG_... */ 2240 int flags, /* XFS_BUF_ */
2158 xfs_buf_t **bpp) /* buffer for the ag freelist header */ 2241 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2159{ 2242{
2160 xfs_agf_t *agf; /* ag freelist header */ 2243 struct xfs_agf *agf; /* ag freelist header */
2161 int agf_ok; /* set if agf is consistent */ 2244 int agf_ok; /* set if agf is consistent */
2162 xfs_buf_t *bp; /* return value */
2163 xfs_perag_t *pag; /* per allocation group data */
2164 int error; 2245 int error;
2165 2246
2166 ASSERT(agno != NULLAGNUMBER); 2247 ASSERT(agno != NULLAGNUMBER);
2167 error = xfs_trans_read_buf( 2248 error = xfs_trans_read_buf(
2168 mp, tp, mp->m_ddev_targp, 2249 mp, tp, mp->m_ddev_targp,
2169 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2250 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2170 XFS_FSS_TO_BB(mp, 1), 2251 XFS_FSS_TO_BB(mp, 1), flags, bpp);
2171 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
2172 &bp);
2173 if (error) 2252 if (error)
2174 return error; 2253 return error;
2175 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 2254 if (!*bpp)
2176 if (!bp) {
2177 *bpp = NULL;
2178 return 0; 2255 return 0;
2179 } 2256
2257 ASSERT(!XFS_BUF_GETERROR(*bpp));
2258 agf = XFS_BUF_TO_AGF(*bpp);
2259
2180 /* 2260 /*
2181 * Validate the magic number of the agf block. 2261 * Validate the magic number of the agf block.
2182 */ 2262 */
2183 agf = XFS_BUF_TO_AGF(bp);
2184 agf_ok = 2263 agf_ok =
2185 be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && 2264 be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
2186 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2265 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2187 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2266 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2188 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && 2267 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2189 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && 2268 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2190 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); 2269 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
2270 be32_to_cpu(agf->agf_seqno) == agno;
2271 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2272 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2273 be32_to_cpu(agf->agf_length);
2191 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, 2274 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2192 XFS_RANDOM_ALLOC_READ_AGF))) { 2275 XFS_RANDOM_ALLOC_READ_AGF))) {
2193 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", 2276 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2194 XFS_ERRLEVEL_LOW, mp, agf); 2277 XFS_ERRLEVEL_LOW, mp, agf);
2195 xfs_trans_brelse(tp, bp); 2278 xfs_trans_brelse(tp, *bpp);
2196 return XFS_ERROR(EFSCORRUPTED); 2279 return XFS_ERROR(EFSCORRUPTED);
2197 } 2280 }
2281
2282 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
2283 return 0;
2284}
2285
2286/*
2287 * Read in the allocation group header (free/alloc section).
2288 */
2289int /* error */
2290xfs_alloc_read_agf(
2291 struct xfs_mount *mp, /* mount point structure */
2292 struct xfs_trans *tp, /* transaction pointer */
2293 xfs_agnumber_t agno, /* allocation group number */
2294 int flags, /* XFS_ALLOC_FLAG_... */
2295 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2296{
2297 struct xfs_agf *agf; /* ag freelist header */
2298 struct xfs_perag *pag; /* per allocation group data */
2299 int error;
2300
2301 ASSERT(agno != NULLAGNUMBER);
2302
2303 error = xfs_read_agf(mp, tp, agno,
2304 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
2305 bpp);
2306 if (error)
2307 return error;
2308 if (!*bpp)
2309 return 0;
2310 ASSERT(!XFS_BUF_GETERROR(*bpp));
2311
2312 agf = XFS_BUF_TO_AGF(*bpp);
2198 pag = &mp->m_perag[agno]; 2313 pag = &mp->m_perag[agno];
2199 if (!pag->pagf_init) { 2314 if (!pag->pagf_init) {
2200 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 2315 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
@@ -2213,6 +2328,7 @@ xfs_alloc_read_agf(
2213#ifdef DEBUG 2328#ifdef DEBUG
2214 else if (!XFS_FORCED_SHUTDOWN(mp)) { 2329 else if (!XFS_FORCED_SHUTDOWN(mp)) {
2215 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 2330 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
2331 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
2216 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 2332 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
2217 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 2333 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
2218 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 2334 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
@@ -2221,8 +2337,6 @@ xfs_alloc_read_agf(
2221 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 2337 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2222 } 2338 }
2223#endif 2339#endif
2224 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
2225 *bpp = bp;
2226 return 0; 2340 return 0;
2227} 2341}
2228 2342
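Splitting xfs_read_agf() out of xfs_alloc_read_agf() lets callers read and verify an AGF without touching the perag structures. A hedged caller sketch:

	/* sketch: raw AGF read that skips perag initialisation */
	struct xfs_buf	*agfbp;
	int		error;

	error = xfs_read_agf(mp, tp, agno, XFS_BUF_TRYLOCK, &agfbp);
	if (error)
		return error;
	if (!agfbp)
		return 0;	/* trylock failed; caller retries later */
	/* ... inspect XFS_BUF_TO_AGF(agfbp) ... */
	xfs_trans_brelse(tp, agfbp);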
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 5aec15d0651e..588172796f7b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -121,6 +121,19 @@ extern ktrace_t *xfs_alloc_trace_buf;
121#define XFS_ALLOC_KTRACE_BUSYSEARCH 6 121#define XFS_ALLOC_KTRACE_BUSYSEARCH 6
122#endif 122#endif
123 123
124void
125xfs_alloc_mark_busy(xfs_trans_t *tp,
126 xfs_agnumber_t agno,
127 xfs_agblock_t bno,
128 xfs_extlen_t len);
129
130void
131xfs_alloc_clear_busy(xfs_trans_t *tp,
132 xfs_agnumber_t ag,
133 int idx);
134
135#endif /* __KERNEL__ */
136
124/* 137/*
125 * Compute and fill in value of m_ag_maxlevels. 138 * Compute and fill in value of m_ag_maxlevels.
126 */ 139 */
@@ -196,18 +209,4 @@ xfs_free_extent(
196 xfs_fsblock_t bno, /* starting block number of extent */ 209 xfs_fsblock_t bno, /* starting block number of extent */
197 xfs_extlen_t len); /* length of extent */ 210 xfs_extlen_t len); /* length of extent */
198 211
199void
200xfs_alloc_mark_busy(xfs_trans_t *tp,
201 xfs_agnumber_t agno,
202 xfs_agblock_t bno,
203 xfs_extlen_t len);
204
205void
206xfs_alloc_clear_busy(xfs_trans_t *tp,
207 xfs_agnumber_t ag,
208 int idx);
209
210
211#endif /* __KERNEL__ */
212
213#endif /* __XFS_ALLOC_H__ */ 212#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3ce2645508ae..733cb75a8c5d 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -35,2177 +35,464 @@
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 40#include "xfs_alloc.h"
40#include "xfs_error.h" 41#include "xfs_error.h"
41 42
42/*
43 * Prototypes for internal functions.
44 */
45 43
46STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int); 44STATIC struct xfs_btree_cur *
47STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); 45xfs_allocbt_dup_cursor(
48STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); 46 struct xfs_btree_cur *cur)
49STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); 47{
50STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *); 48 return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
51STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *); 49 cur->bc_private.a.agbp, cur->bc_private.a.agno,
52STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *); 50 cur->bc_btnum);
53STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *, 51}
54 xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
55STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
56 52
57/* 53STATIC void
58 * Internal functions. 54xfs_allocbt_set_root(
59 */ 55 struct xfs_btree_cur *cur,
56 union xfs_btree_ptr *ptr,
57 int inc)
58{
59 struct xfs_buf *agbp = cur->bc_private.a.agbp;
60 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
61 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
62 int btnum = cur->bc_btnum;
60 63
61/* 64 ASSERT(ptr->s != 0);
62 * Single level of the xfs_alloc_delete record deletion routine. 65
63 * Delete record pointed to by cur/level. 66 agf->agf_roots[btnum] = ptr->s;
64 * Remove the record from its block then rebalance the tree. 67 be32_add_cpu(&agf->agf_levels[btnum], inc);
65 * Return 0 for error, 1 for done, 2 to go on to the next level. 68 cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
66 */ 69
67STATIC int /* error */ 70 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
68xfs_alloc_delrec( 71}
69 xfs_btree_cur_t *cur, /* btree cursor */ 72
70 int level, /* level removing record from */ 73STATIC int
71 int *stat) /* fail/done/go-on */ 74xfs_allocbt_alloc_block(
75 struct xfs_btree_cur *cur,
76 union xfs_btree_ptr *start,
77 union xfs_btree_ptr *new,
78 int length,
79 int *stat)
72{ 80{
73 xfs_agf_t *agf; /* allocation group freelist header */ 81 int error;
74 xfs_alloc_block_t *block; /* btree block record/key lives in */ 82 xfs_agblock_t bno;
75 xfs_agblock_t bno; /* btree block number */
76 xfs_buf_t *bp; /* buffer for block */
77 int error; /* error return value */
78 int i; /* loop index */
79 xfs_alloc_key_t key; /* kp points here if block is level 0 */
80 xfs_agblock_t lbno; /* left block's block number */
81 xfs_buf_t *lbp; /* left block's buffer pointer */
82 xfs_alloc_block_t *left; /* left btree block */
83 xfs_alloc_key_t *lkp=NULL; /* left block key pointer */
84 xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */
85 int lrecs=0; /* number of records in left block */
86 xfs_alloc_rec_t *lrp; /* left block record pointer */
87 xfs_mount_t *mp; /* mount structure */
88 int ptr; /* index in btree block for this rec */
89 xfs_agblock_t rbno; /* right block's block number */
90 xfs_buf_t *rbp; /* right block's buffer pointer */
91 xfs_alloc_block_t *right; /* right btree block */
92 xfs_alloc_key_t *rkp; /* right block key pointer */
93 xfs_alloc_ptr_t *rpp; /* right block address pointer */
94 int rrecs=0; /* number of records in right block */
95 int numrecs;
96 xfs_alloc_rec_t *rrp; /* right block record pointer */
97 xfs_btree_cur_t *tcur; /* temporary btree cursor */
98 83
99 /* 84 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
100 * Get the index of the entry being deleted, check for nothing there. 85
101 */ 86 /* Allocate the new block from the freelist. If we can't, give up. */
102 ptr = cur->bc_ptrs[level]; 87 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
103 if (ptr == 0) { 88 &bno, 1);
104 *stat = 0; 89 if (error) {
105 return 0; 90 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
106 }
107 /*
108 * Get the buffer & block containing the record or key/ptr.
109 */
110 bp = cur->bc_bufs[level];
111 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
112#ifdef DEBUG
113 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
114 return error; 91 return error;
115#endif 92 }
116 /* 93
117 * Fail if we're off the end of the block. 94 if (bno == NULLAGBLOCK) {
118 */ 95 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
119 numrecs = be16_to_cpu(block->bb_numrecs);
120 if (ptr > numrecs) {
121 *stat = 0; 96 *stat = 0;
122 return 0; 97 return 0;
123 } 98 }
124 XFS_STATS_INC(xs_abt_delrec);
125 /*
126 * It's a nonleaf. Excise the key and ptr being deleted, by
127 * sliding the entries past them down one.
128 * Log the changed areas of the block.
129 */
130 if (level > 0) {
131 lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
132 lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
133#ifdef DEBUG
134 for (i = ptr; i < numrecs; i++) {
135 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
136 return error;
137 }
138#endif
139 if (ptr < numrecs) {
140 memmove(&lkp[ptr - 1], &lkp[ptr],
141 (numrecs - ptr) * sizeof(*lkp));
142 memmove(&lpp[ptr - 1], &lpp[ptr],
143 (numrecs - ptr) * sizeof(*lpp));
144 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
145 xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
146 }
147 }
148 /*
149 * It's a leaf. Excise the record being deleted, by sliding the
150 * entries past it down one. Log the changed areas of the block.
151 */
152 else {
153 lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
154 if (ptr < numrecs) {
155 memmove(&lrp[ptr - 1], &lrp[ptr],
156 (numrecs - ptr) * sizeof(*lrp));
157 xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
158 }
159 /*
160 * If it's the first record in the block, we'll need a key
161 * structure to pass up to the next level (updkey).
162 */
163 if (ptr == 1) {
164 key.ar_startblock = lrp->ar_startblock;
165 key.ar_blockcount = lrp->ar_blockcount;
166 lkp = &key;
167 }
168 }
169 /*
170 * Decrement and log the number of entries in the block.
171 */
172 numrecs--;
173 block->bb_numrecs = cpu_to_be16(numrecs);
174 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
175 /*
176 * See if the longest free extent in the allocation group was
177 * changed by this operation. True if it's the by-size btree, and
178 * this is the leaf level, and there is no right sibling block,
179 * and this was the last record.
180 */
181 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
182 mp = cur->bc_mp;
183 99
184 if (level == 0 && 100 xfs_trans_agbtree_delta(cur->bc_tp, 1);
185 cur->bc_btnum == XFS_BTNUM_CNT && 101 new->s = cpu_to_be32(bno);
186 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
187 ptr > numrecs) {
188 ASSERT(ptr == numrecs + 1);
189 /*
190 * There are still records in the block. Grab the size
191 * from the last one.
192 */
193 if (numrecs) {
194 rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
195 agf->agf_longest = rrp->ar_blockcount;
196 }
197 /*
198 * No free extents left.
199 */
200 else
201 agf->agf_longest = 0;
202 mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
203 be32_to_cpu(agf->agf_longest);
204 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
205 XFS_AGF_LONGEST);
206 }
207 /*
208 * Is this the root level? If so, we're almost done.
209 */
210 if (level == cur->bc_nlevels - 1) {
211 /*
212 * If this is the root level,
213 * and there's only one entry left,
214 * and it's NOT the leaf level,
215 * then we can get rid of this level.
216 */
217 if (numrecs == 1 && level > 0) {
218 /*
219 * lpp is still set to the first pointer in the block.
220 * Make it the new root of the btree.
221 */
222 bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
223 agf->agf_roots[cur->bc_btnum] = *lpp;
224 be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1);
225 mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
226 /*
227 * Put this buffer/block on the ag's freelist.
228 */
229 error = xfs_alloc_put_freelist(cur->bc_tp,
230 cur->bc_private.a.agbp, NULL, bno, 1);
231 if (error)
232 return error;
233 /*
234 * Since blocks move to the free list without the
235 * coordination used in xfs_bmap_finish, we can't allow
236 * block to be available for reallocation and
237 * non-transaction writing (user data) until we know
238 * that the transaction that moved it to the free list
239 * is permanently on disk. We track the blocks by
240 * declaring these blocks as "busy"; the busy list is
241 * maintained on a per-ag basis and each transaction
242 * records which entries should be removed when the
243 * iclog commits to disk. If a busy block is
244 * allocated, the iclog is pushed up to the LSN
245 * that freed the block.
246 */
247 xfs_alloc_mark_busy(cur->bc_tp,
248 be32_to_cpu(agf->agf_seqno), bno, 1);
249 102
250 xfs_trans_agbtree_delta(cur->bc_tp, -1); 103 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
251 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, 104 *stat = 1;
252 XFS_AGF_ROOTS | XFS_AGF_LEVELS); 105 return 0;
253 /* 106}
254 * Update the cursor so there's one fewer level.
255 */
256 xfs_btree_setbuf(cur, level, NULL);
257 cur->bc_nlevels--;
258 } else if (level > 0 &&
259 (error = xfs_alloc_decrement(cur, level, &i)))
260 return error;
261 *stat = 1;
262 return 0;
263 }
264 /*
265 * If we deleted the leftmost entry in the block, update the
266 * key values above us in the tree.
267 */
268 if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
269 return error;
270 /*
271 * If the number of records remaining in the block is at least
272 * the minimum, we're done.
273 */
274 if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
275 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
276 return error;
277 *stat = 1;
278 return 0;
279 }
280 /*
281 * Otherwise, we have to move some records around to keep the
282 * tree balanced. Look at the left and right sibling blocks to
283 * see if we can re-balance by moving only one record.
284 */
285 rbno = be32_to_cpu(block->bb_rightsib);
286 lbno = be32_to_cpu(block->bb_leftsib);
287 bno = NULLAGBLOCK;
288 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
289 /*
290 * Duplicate the cursor so our btree manipulations here won't
291 * disrupt the next level up.
292 */
293 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
294 return error;
295 /*
296 * If there's a right sibling, see if it's ok to shift an entry
297 * out of it.
298 */
299 if (rbno != NULLAGBLOCK) {
300 /*
301 * Move the temp cursor to the last entry in the next block.
302 * Actually any entry but the first would suffice.
303 */
304 i = xfs_btree_lastrec(tcur, level);
305 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
306 if ((error = xfs_alloc_increment(tcur, level, &i)))
307 goto error0;
308 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
309 i = xfs_btree_lastrec(tcur, level);
310 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
311 /*
312 * Grab a pointer to the block.
313 */
314 rbp = tcur->bc_bufs[level];
315 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
316#ifdef DEBUG
317 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
318 goto error0;
319#endif
320 /*
321 * Grab the current block number, for future use.
322 */
323 bno = be32_to_cpu(right->bb_leftsib);
324 /*
325 * If right block is full enough so that removing one entry
326 * won't make it too empty, and left-shifting an entry out
327 * of right to us works, we're done.
328 */
329 if (be16_to_cpu(right->bb_numrecs) - 1 >=
330 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
331 if ((error = xfs_alloc_lshift(tcur, level, &i)))
332 goto error0;
333 if (i) {
334 ASSERT(be16_to_cpu(block->bb_numrecs) >=
335 XFS_ALLOC_BLOCK_MINRECS(level, cur));
336 xfs_btree_del_cursor(tcur,
337 XFS_BTREE_NOERROR);
338 if (level > 0 &&
339 (error = xfs_alloc_decrement(cur, level,
340 &i)))
341 return error;
342 *stat = 1;
343 return 0;
344 }
345 }
346 /*
347 * Otherwise, grab the number of records in right for
348 * future reference, and fix up the temp cursor to point
349 * to our block again (last record).
350 */
351 rrecs = be16_to_cpu(right->bb_numrecs);
352 if (lbno != NULLAGBLOCK) {
353 i = xfs_btree_firstrec(tcur, level);
354 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
355 if ((error = xfs_alloc_decrement(tcur, level, &i)))
356 goto error0;
357 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
358 }
359 }
360 /*
361 * If there's a left sibling, see if it's ok to shift an entry
362 * out of it.
363 */
364 if (lbno != NULLAGBLOCK) {
365 /*
366 * Move the temp cursor to the first entry in the
367 * previous block.
368 */
369 i = xfs_btree_firstrec(tcur, level);
370 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
371 if ((error = xfs_alloc_decrement(tcur, level, &i)))
372 goto error0;
373 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
374 xfs_btree_firstrec(tcur, level);
375 /*
376 * Grab a pointer to the block.
377 */
378 lbp = tcur->bc_bufs[level];
379 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
380#ifdef DEBUG
381 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
382 goto error0;
383#endif
384 /*
385 * Grab the current block number, for future use.
386 */
387 bno = be32_to_cpu(left->bb_rightsib);
388 /*
389 * If left block is full enough so that removing one entry
390 * won't make it too empty, and right-shifting an entry out
391 * of left to us works, we're done.
392 */
393 if (be16_to_cpu(left->bb_numrecs) - 1 >=
394 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
395 if ((error = xfs_alloc_rshift(tcur, level, &i)))
396 goto error0;
397 if (i) {
398 ASSERT(be16_to_cpu(block->bb_numrecs) >=
399 XFS_ALLOC_BLOCK_MINRECS(level, cur));
400 xfs_btree_del_cursor(tcur,
401 XFS_BTREE_NOERROR);
402 if (level == 0)
403 cur->bc_ptrs[0]++;
404 *stat = 1;
405 return 0;
406 }
407 }
408 /*
409 * Otherwise, grab the number of records in right for
410 * future reference.
411 */
412 lrecs = be16_to_cpu(left->bb_numrecs);
413 }
414 /*
415 * Delete the temp cursor, we're done with it.
416 */
417 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
418 /*
419 * If here, we need to do a join to keep the tree balanced.
420 */
421 ASSERT(bno != NULLAGBLOCK);
422 /*
423 * See if we can join with the left neighbor block.
424 */
425 if (lbno != NULLAGBLOCK &&
426 lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
427 /*
428 * Set "right" to be the starting block,
429 * "left" to be the left neighbor.
430 */
431 rbno = bno;
432 right = block;
433 rrecs = be16_to_cpu(right->bb_numrecs);
434 rbp = bp;
435 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
436 cur->bc_private.a.agno, lbno, 0, &lbp,
437 XFS_ALLOC_BTREE_REF)))
438 return error;
439 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
440 lrecs = be16_to_cpu(left->bb_numrecs);
441 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
442 return error;
443 }
444 /*
445 * If that won't work, see if we can join with the right neighbor block.
446 */
447 else if (rbno != NULLAGBLOCK &&
448 rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
449 /*
450 * Set "left" to be the starting block,
451 * "right" to be the right neighbor.
452 */
453 lbno = bno;
454 left = block;
455 lrecs = be16_to_cpu(left->bb_numrecs);
456 lbp = bp;
457 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
458 cur->bc_private.a.agno, rbno, 0, &rbp,
459 XFS_ALLOC_BTREE_REF)))
460 return error;
461 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
462 rrecs = be16_to_cpu(right->bb_numrecs);
463 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
464 return error;
465 }
466 /*
467 * Otherwise, we can't fix the imbalance.
468 * Just return. This is probably a logic error, but it's not fatal.
469 */
470 else {
471 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
472 return error;
473 *stat = 1;
474 return 0;
475 }
476 /*
477 * We're now going to join "left" and "right" by moving all the stuff
478 * in "right" to "left" and deleting "right".
479 */
480 if (level > 0) {
481 /*
482 * It's a non-leaf. Move keys and pointers.
483 */
484 lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
485 lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
486 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
487 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
488#ifdef DEBUG
489 for (i = 0; i < rrecs; i++) {
490 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
491 return error;
492 }
493#endif
494 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
495 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
496 xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
497 xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
498 } else {
499 /*
500 * It's a leaf. Move records.
501 */
502 lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
503 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
504 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
505 xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
506 }
507 /*
508 * If we joined with the left neighbor, set the buffer in the
509 * cursor to the left block, and fix up the index.
510 */
511 if (bp != lbp) {
512 xfs_btree_setbuf(cur, level, lbp);
513 cur->bc_ptrs[level] += lrecs;
514 }
515 /*
516 * If we joined with the right neighbor and there's a level above
517 * us, increment the cursor at that level.
518 */
519 else if (level + 1 < cur->bc_nlevels &&
520 (error = xfs_alloc_increment(cur, level + 1, &i)))
521 return error;
522 /*
523 * Fix up the number of records in the surviving block.
524 */
525 lrecs += rrecs;
526 left->bb_numrecs = cpu_to_be16(lrecs);
527 /*
528 * Fix up the right block pointer in the surviving block, and log it.
529 */
530 left->bb_rightsib = right->bb_rightsib;
531 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
532 /*
533 * If there is a right sibling now, make it point to the
534 * remaining block.
535 */
536 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
537 xfs_alloc_block_t *rrblock;
538 xfs_buf_t *rrbp;
539
540 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
541 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
542 &rrbp, XFS_ALLOC_BTREE_REF)))
543 return error;
544 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
545 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
546 return error;
547 rrblock->bb_leftsib = cpu_to_be32(lbno);
548 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
549 }
550 /*
551 * Free the deleted block by putting it on the freelist.
552 */
553 error = xfs_alloc_put_freelist(cur->bc_tp,
554 cur->bc_private.a.agbp, NULL, rbno, 1);
555 if (error)
556 return error;
557 /*
558 * Since blocks move to the free list without the coordination
559 * used in xfs_bmap_finish, we can't allow a block to be available
560 * for reallocation and non-transaction writing (user data)
561 * until we know that the transaction that moved it to the free
562 * list is permanently on disk. We track the blocks by declaring
563 * these blocks as "busy"; the busy list is maintained on a
564 * per-ag basis and each transaction records which entries
565 * should be removed when the iclog commits to disk. If a
566 * busy block is allocated, the iclog is pushed up to the
567 * LSN that freed the block.
568 */
569 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
570 xfs_trans_agbtree_delta(cur->bc_tp, -1);
571
572 /*
573 * Adjust the current level's cursor so that we're left referring
574 * to the right node, after we're done.
575 * If this leaves the ptr value 0 our caller will fix it up.
576 */
577 if (level > 0)
578 cur->bc_ptrs[level]--;
579 /*
580 * Return value means the next level up has something to do.
581 */
582 *stat = 2;
583 return 0;
584
585error0:
586 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
587 return error;
588}
589
107
108STATIC int
109xfs_allocbt_free_block(
110 struct xfs_btree_cur *cur,
111 struct xfs_buf *bp)
112{
113 struct xfs_buf *agbp = cur->bc_private.a.agbp;
114 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
115 xfs_agblock_t bno;
116 int error;
117
118 bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
119 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
120 if (error)
121 return error;
122
123 /*
124 * Since blocks move to the free list without the coordination used in
125 * xfs_bmap_finish, we can't allow a block to be available for
126 * reallocation and non-transaction writing (user data) until we know
127 * that the transaction that moved it to the free list is permanently
128 * on disk. We track the blocks by declaring these blocks as "busy";
129 * the busy list is maintained on a per-ag basis and each transaction
130 * records which entries should be removed when the iclog commits to
131 * disk. If a busy block is allocated, the iclog is pushed up to the
132 * LSN that freed the block.
133 */
134 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
135 xfs_trans_agbtree_delta(cur->bc_tp, -1);
136 return 0;
137}
138
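/*
 * Toy model of the rebalancing policy delrec applies once a block drops
 * below the minimum-records threshold: first try to borrow one entry from a
 * sibling that can spare it, otherwise merge with a sibling. A sketch under
 * simplified assumptions (in-memory counts only); none of these names are
 * kernel interfaces.
 */
enum rebalance { BORROW_LEFT, BORROW_RIGHT, MERGE_LEFT, MERGE_RIGHT, NONE };

static enum rebalance pick_rebalance(int nrecs, int lrecs, int rrecs,
				     int minrecs, int maxrecs)
{
	if (nrecs >= minrecs)
		return NONE;			/* block is still legal */
	if (rrecs - 1 >= minrecs)
		return BORROW_RIGHT;		/* left-shift one entry in */
	if (lrecs - 1 >= minrecs)
		return BORROW_LEFT;		/* right-shift one entry in */
	if (lrecs + nrecs <= maxrecs)
		return MERGE_LEFT;		/* join with left sibling */
	if (rrecs + nrecs <= maxrecs)
		return MERGE_RIGHT;		/* join with right sibling */
	return NONE;				/* shouldn't happen */
}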
590/*
591 * Insert one record/level. Return information to the caller
592 * allowing the next level up to proceed if necessary.
593 */
594STATIC int /* error */
595xfs_alloc_insrec(
596 xfs_btree_cur_t *cur, /* btree cursor */
597 int level, /* level to insert record at */
598 xfs_agblock_t *bnop, /* i/o: block number inserted */
599 xfs_alloc_rec_t *recp, /* i/o: record data inserted */
600 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
601 int *stat) /* output: success/failure */
602{
603 xfs_agf_t *agf; /* allocation group freelist header */
604 xfs_alloc_block_t *block; /* btree block record/key lives in */
605 xfs_buf_t *bp; /* buffer for block */
606 int error; /* error return value */
607 int i; /* loop index */
608 xfs_alloc_key_t key; /* key value being inserted */
609 xfs_alloc_key_t *kp; /* pointer to btree keys */
610 xfs_agblock_t nbno; /* block number of allocated block */
611 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
612 xfs_alloc_key_t nkey; /* new key value, from split */
613 xfs_alloc_rec_t nrec; /* new record value, for caller */
614 int numrecs;
615 int optr; /* old ptr value */
616 xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
617 int ptr; /* index in btree block for this rec */
618 xfs_alloc_rec_t *rp; /* pointer to btree records */
619
620 ASSERT(be32_to_cpu(recp->ar_blockcount) > 0);
139/*
140 * Update the longest extent in the AGF
141 */
142STATIC void
143xfs_allocbt_update_lastrec(
144 struct xfs_btree_cur *cur,
145 struct xfs_btree_block *block,
146 union xfs_btree_rec *rec,
147 int ptr,
148 int reason)
149{
150 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
151 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
152 __be32 len;
153 int numrecs;
154
155 ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
156
157 switch (reason) {
158 case LASTREC_UPDATE:
159 /*
160 * If this is the last leaf block and it's the last record,
161 * then update the size of the longest extent in the AG.
162 */
163 if (ptr != xfs_btree_get_numrecs(block))
164 return;
165 len = rec->alloc.ar_blockcount;
166 break;
167 case LASTREC_INSREC:
168 if (be32_to_cpu(rec->alloc.ar_blockcount) <=
169 be32_to_cpu(agf->agf_longest))
170 return;
171 len = rec->alloc.ar_blockcount;
172 break;
173 case LASTREC_DELREC:
174 numrecs = xfs_btree_get_numrecs(block);
175 if (ptr <= numrecs)
176 return;
177 ASSERT(ptr == numrecs + 1);
178
179 if (numrecs) {
180 xfs_alloc_rec_t *rrp;
181
182 rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
183 len = rrp->ar_blockcount;
621
622 /*
623 * GCC doesn't understand the (arguably complex) control flow in
624 * this function and complains about uninitialized structure fields
625 * without this.
626 */
627 memset(&nrec, 0, sizeof(nrec));
628
629 /*
630 * If we made it to the root level, allocate a new root block 183 len = rrp->ar_blockcount;
631 * and we're done.
632 */
633 if (level >= cur->bc_nlevels) {
634 XFS_STATS_INC(xs_abt_insrec);
635 if ((error = xfs_alloc_newroot(cur, &i)))
636 return error;
637 *bnop = NULLAGBLOCK;
638 *stat = i;
639 return 0;
640 }
641 /*
642 * Make a key out of the record data to be inserted, and save it.
643 */
644 key.ar_startblock = recp->ar_startblock;
645 key.ar_blockcount = recp->ar_blockcount;
646 optr = ptr = cur->bc_ptrs[level];
647 /*
648 * If we're off the left edge, return failure.
649 */
650 if (ptr == 0) {
651 *stat = 0;
652 return 0;
653 }
654 XFS_STATS_INC(xs_abt_insrec);
655 /*
656 * Get pointers to the btree buffer and block.
657 */
658 bp = cur->bc_bufs[level];
659 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
660 numrecs = be16_to_cpu(block->bb_numrecs);
661#ifdef DEBUG
662 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
663 return error;
664 /*
665 * Check that the new entry is being inserted in the right place.
666 */
667 if (ptr <= numrecs) {
668 if (level == 0) {
669 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
670 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
184 } else {
185 len = 0;
671 } else {
672 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
673 xfs_btree_check_key(cur->bc_btnum, &key, kp);
674 }
675 }
676#endif
677 nbno = NULLAGBLOCK;
678 ncur = NULL;
679 /*
680 * If the block is full, we can't insert the new entry until we
681 * make the block un-full.
682 */
683 if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
684 /*
685 * First, try shifting an entry to the right neighbor.
686 */
687 if ((error = xfs_alloc_rshift(cur, level, &i)))
688 return error;
689 if (i) {
690 /* nothing */
691 }
692 /*
693 * Next, try shifting an entry to the left neighbor.
694 */
695 else {
696 if ((error = xfs_alloc_lshift(cur, level, &i)))
697 return error;
698 if (i)
699 optr = ptr = cur->bc_ptrs[level];
700 else {
701 /*
702 * Next, try splitting the current block in
703 * half. If this works we have to re-set our
704 * variables because we could be in a
705 * different block now.
706 */
707 if ((error = xfs_alloc_split(cur, level, &nbno,
708 &nkey, &ncur, &i)))
709 return error;
710 if (i) {
711 bp = cur->bc_bufs[level];
712 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
713#ifdef DEBUG
714 if ((error =
715 xfs_btree_check_sblock(cur,
716 block, level, bp)))
717 return error;
718#endif
719 ptr = cur->bc_ptrs[level];
720 nrec.ar_startblock = nkey.ar_startblock;
721 nrec.ar_blockcount = nkey.ar_blockcount;
722 }
723 /*
724 * Otherwise the insert fails.
725 */
726 else {
727 *stat = 0;
728 return 0;
729 }
730 }
731 }
732 }
733 /*
734 * At this point we know there's room for our new entry in the block
735 * we're pointing at.
736 */
737 numrecs = be16_to_cpu(block->bb_numrecs);
738 if (level > 0) {
739 /*
740 * It's a non-leaf entry. Make a hole for the new data
741 * in the key and ptr regions of the block.
742 */
743 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
744 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
745#ifdef DEBUG
746 for (i = numrecs; i >= ptr; i--) {
747 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
748 return error;
749 }
186 }
750#endif
751 memmove(&kp[ptr], &kp[ptr - 1],
752 (numrecs - ptr + 1) * sizeof(*kp));
753 memmove(&pp[ptr], &pp[ptr - 1],
754 (numrecs - ptr + 1) * sizeof(*pp));
755#ifdef DEBUG
756 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
757 return error;
758#endif
759 /*
760 * Now stuff the new data in, bump numrecs and log the new data.
761 */
762 kp[ptr - 1] = key;
763 pp[ptr - 1] = cpu_to_be32(*bnop);
764 numrecs++;
765 block->bb_numrecs = cpu_to_be16(numrecs);
766 xfs_alloc_log_keys(cur, bp, ptr, numrecs);
767 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
768#ifdef DEBUG
769 if (ptr < numrecs)
770 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
771 kp + ptr);
772#endif
773 } else {
774 /*
775 * It's a leaf entry. Make a hole for the new record.
776 */
777 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
778 memmove(&rp[ptr], &rp[ptr - 1],
779 (numrecs - ptr + 1) * sizeof(*rp));
780 /*
781 * Now stuff the new record in, bump numrecs
782 * and log the new data.
783 */
784 rp[ptr - 1] = *recp;
785 numrecs++;
786 block->bb_numrecs = cpu_to_be16(numrecs);
787 xfs_alloc_log_recs(cur, bp, ptr, numrecs);
788#ifdef DEBUG
789 if (ptr < numrecs)
790 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
791 rp + ptr);
792#endif
793 }
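/*
 * Minimal userspace sketch of the insertion step above: open a hole at index
 * ptr-1 with memmove, store the new record, and bump the count. The record
 * type is a stand-in, not the on-disk xfs_alloc_rec_t layout, and the array
 * is assumed to have room for one more entry.
 */
#include <string.h>

struct rec { unsigned start, count; };

static void insert_rec(struct rec *recs, int *numrecs, int ptr,
		       struct rec newrec)
{
	/* slide records ptr..numrecs one slot to the right (ptr is 1-based) */
	memmove(&recs[ptr], &recs[ptr - 1],
		(*numrecs - ptr + 1) * sizeof(recs[0]));
	recs[ptr - 1] = newrec;
	(*numrecs)++;
}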
794 /*
795 * Log the new number of records in the btree header.
796 */
797 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
798 /*
799 * If we inserted at the start of a block, update the parents' keys.
800 */
801 if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
802 return error;
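/*
 * Sketch of the "longest free extent" bookkeeping done just below: the
 * by-size (cnt) btree keeps its largest record last, so the AGF copy only
 * changes when the last record of the last leaf is touched. Hypothetical
 * helper; the real update also logs the AGF buffer and the per-AG cache.
 */
static void update_longest(unsigned *agf_longest, unsigned last_rec_len,
			   int is_last_leaf, int is_last_rec)
{
	if (is_last_leaf && is_last_rec && last_rec_len != *agf_longest)
		*agf_longest = last_rec_len;	/* mirror the largest extent */
}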
803 /*
804 * Look to see if the longest extent in the allocation group
805 * needs to be updated.
806 */
807
808 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
809 if (level == 0 &&
810 cur->bc_btnum == XFS_BTNUM_CNT &&
811 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
812 be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
813 /*
814 * If this is a leaf in the by-size btree and there
815 * is no right sibling block and this block is bigger
816 * than the previous longest block, update it.
817 */
818 agf->agf_longest = recp->ar_blockcount;
819 cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
820 = be32_to_cpu(recp->ar_blockcount);
821 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
822 XFS_AGF_LONGEST);
823 }
187
188 break;
189 default:
190 ASSERT(0);
191 return;
192 }
193
194 agf->agf_longest = len;
195 cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
196 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
824 /*
825 * Return the new block number, if any.
826 * If there is one, give back a record value and a cursor too.
827 */
828 *bnop = nbno;
829 if (nbno != NULLAGBLOCK) {
830 *recp = nrec;
831 *curp = ncur;
832 }
833 *stat = 1;
834 return 0;
835}
836
837/*
838 * Log header fields from a btree block.
839 */
840STATIC void
841xfs_alloc_log_block(
842 xfs_trans_t *tp, /* transaction pointer */
843 xfs_buf_t *bp, /* buffer containing btree block */
844 int fields) /* mask of fields: XFS_BB_... */
845{
846 int first; /* first byte offset logged */
847 int last; /* last byte offset logged */
848 static const short offsets[] = { /* table of offsets */
849 offsetof(xfs_alloc_block_t, bb_magic),
850 offsetof(xfs_alloc_block_t, bb_level),
851 offsetof(xfs_alloc_block_t, bb_numrecs),
852 offsetof(xfs_alloc_block_t, bb_leftsib),
853 offsetof(xfs_alloc_block_t, bb_rightsib),
854 sizeof(xfs_alloc_block_t)
855 };
856
857 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
858 xfs_trans_log_buf(tp, bp, first, last);
859}
860
197}
198
199STATIC int
200xfs_allocbt_get_minrecs(
201 struct xfs_btree_cur *cur,
202 int level)
203{
204 return cur->bc_mp->m_alloc_mnr[level != 0];
205}
206
207STATIC int
208xfs_allocbt_get_maxrecs(
209 struct xfs_btree_cur *cur,
210 int level)
211{
212 return cur->bc_mp->m_alloc_mxr[level != 0];
213}
214
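/*
 * Sketch of how per-level record limits can be derived from the block size,
 * in the spirit of the m_alloc_mnr[]/m_alloc_mxr[] lookups above: leaf
 * blocks hold whole records, interior blocks hold key/pointer pairs, and
 * the minimum is half the maximum. All sizes here are illustrative inputs,
 * not the real mount-time computation.
 */
static int alloc_maxrecs(int blocklen, int hdrlen, int leaf,
			 int recsize, int keysize, int ptrsize)
{
	int avail = blocklen - hdrlen;	/* payload bytes in one block */

	return leaf ? avail / recsize : avail / (keysize + ptrsize);
}

static int alloc_minrecs(int maxrecs)
{
	return maxrecs / 2;		/* classic half-full B-tree rule */
}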
861/*
862 * Log keys from a btree block (nonleaf).
863 */
864STATIC void
865xfs_alloc_log_keys(
866 xfs_btree_cur_t *cur, /* btree cursor */
867 xfs_buf_t *bp, /* buffer containing btree block */
868 int kfirst, /* index of first key to log */
869 int klast) /* index of last key to log */
870{
871 xfs_alloc_block_t *block; /* btree block to log from */
872 int first; /* first byte offset logged */
873 xfs_alloc_key_t *kp; /* key pointer in btree block */
874 int last; /* last byte offset logged */
875
876 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
877 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
878 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
879 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
880 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
881}
882
215STATIC void
216xfs_allocbt_init_key_from_rec(
217 union xfs_btree_key *key,
218 union xfs_btree_rec *rec)
219{
220 ASSERT(rec->alloc.ar_startblock != 0);
221
222 key->alloc.ar_startblock = rec->alloc.ar_startblock;
223 key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
224}
225
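/*
 * Sketch of the byte-range computation the log_keys/log_ptrs helpers above
 * perform: for entries kfirst..klast (1-based) of a fixed-size array inside
 * a block, log the bytes from the start of the first entry through the last
 * byte of the last one. Standalone illustration, not a kernel interface.
 */
#include <stddef.h>

static void log_range(size_t array_off, size_t entsize,
		      int kfirst, int klast, size_t *first, size_t *last)
{
	*first = array_off + (kfirst - 1) * entsize;
	*last  = array_off + klast * entsize - 1;
}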
883/*
884 * Log block pointer fields from a btree block (nonleaf).
885 */
886STATIC void
887xfs_alloc_log_ptrs(
888 xfs_btree_cur_t *cur, /* btree cursor */
889 xfs_buf_t *bp, /* buffer containing btree block */
890 int pfirst, /* index of first pointer to log */
891 int plast) /* index of last pointer to log */
892{
893 xfs_alloc_block_t *block; /* btree block to log from */
894 int first; /* first byte offset logged */
895 int last; /* last byte offset logged */
896 xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */
897
898 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
899 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
900 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
901 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
902 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
903}
904
226STATIC void
227xfs_allocbt_init_rec_from_key(
228 union xfs_btree_key *key,
229 union xfs_btree_rec *rec)
230{
231 ASSERT(key->alloc.ar_startblock != 0);
232
233 rec->alloc.ar_startblock = key->alloc.ar_startblock;
234 rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
235}
236
905/*
906 * Log records from a btree block (leaf).
907 */
908STATIC void
909xfs_alloc_log_recs(
910 xfs_btree_cur_t *cur, /* btree cursor */
911 xfs_buf_t *bp, /* buffer containing btree block */
912 int rfirst, /* index of first record to log */
913 int rlast) /* index of last record to log */
914{
915 xfs_alloc_block_t *block; /* btree block to log from */
916 int first; /* first byte offset logged */
917 int last; /* last byte offset logged */
918 xfs_alloc_rec_t *rp; /* record pointer for btree block */
919
920
921 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
922 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
923#ifdef DEBUG
924 {
925 xfs_agf_t *agf;
926 xfs_alloc_rec_t *p;
927
928 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
929 for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
930 ASSERT(be32_to_cpu(p->ar_startblock) +
931 be32_to_cpu(p->ar_blockcount) <=
932 be32_to_cpu(agf->agf_length));
933 }
934#endif
935 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
936 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
937 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
938}
939
237STATIC void
238xfs_allocbt_init_rec_from_cur(
239 struct xfs_btree_cur *cur,
240 union xfs_btree_rec *rec)
241{
242 ASSERT(cur->bc_rec.a.ar_startblock != 0);
243
244 rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
245 rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
246}
247
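/*
 * Sketch of the cpu/disk endianness discipline visible above: in-core cursor
 * records are host-endian, on-disk records are big-endian, so every store
 * converts with cpu_to_be32() and every load with be32_to_cpu(). This is a
 * portable stand-in for those helpers, assuming a little-endian host where
 * the conversion is a byte swap (on a big-endian host it would be a no-op).
 */
#include <stdint.h>

static uint32_t cpu_to_be32_sketch(uint32_t x)
{
	return ((x & 0xffU) << 24) | ((x & 0xff00U) << 8) |
	       ((x >> 8) & 0xff00U) | (x >> 24);
}

/* the inverse transform is the same byte swap */
#define be32_to_cpu_sketch(x)	cpu_to_be32_sketch(x)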
248STATIC void
249xfs_allocbt_init_ptr_from_cur(
250 struct xfs_btree_cur *cur,
251 union xfs_btree_ptr *ptr)
252{
253 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
940/*
941 * Lookup the record. The cursor is made to point to it, based on dir.
942 * Return 0 if we can't find any such record, 1 for success.
943 */
944STATIC int /* error */
945xfs_alloc_lookup(
946 xfs_btree_cur_t *cur, /* btree cursor */
947 xfs_lookup_t dir, /* <=, ==, or >= */
948 int *stat) /* success/failure */
949{
950 xfs_agblock_t agbno; /* a.g. relative btree block number */
951 xfs_agnumber_t agno; /* allocation group number */
952 xfs_alloc_block_t *block=NULL; /* current btree block */
953 int diff; /* difference for the current key */
954 int error; /* error return value */
955 int keyno=0; /* current key number */
956 int level; /* level in the btree */
957 xfs_mount_t *mp; /* file system mount point */
958
959 XFS_STATS_INC(xs_abt_lookup);
960 /*
961 * Get the allocation group header, and the root block number.
962 */
963 mp = cur->bc_mp;
964
965 {
966 xfs_agf_t *agf; /* a.g. freespace header */
967
968 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
969 agno = be32_to_cpu(agf->agf_seqno);
970 agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
971 }
972 /*
973 * Iterate over each level in the btree, starting at the root.
974 * For each level above the leaves, find the key we need, based
975 * on the lookup record, then follow the corresponding block
976 * pointer down to the next level.
977 */
978 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
979 xfs_buf_t *bp; /* buffer pointer for btree block */
980 xfs_daddr_t d; /* disk address of btree block */
981
982 /*
983 * Get the disk address we're looking for.
984 */
985 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
986 /*
987 * If the old buffer at this level is for a different block,
988 * throw it away, otherwise just use it.
989 */
990 bp = cur->bc_bufs[level];
991 if (bp && XFS_BUF_ADDR(bp) != d)
992 bp = NULL;
993 if (!bp) {
994 /*
995 * Need to get a new buffer. Read it, then
996 * set it in the cursor, releasing the old one.
997 */
998 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
999 agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
1000 return error;
1001 xfs_btree_setbuf(cur, level, bp);
1002 /*
1003 * Point to the btree block, now that we have the buffer
1004 */
1005 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1006 if ((error = xfs_btree_check_sblock(cur, block, level,
1007 bp)))
1008 return error;
1009 } else
1010 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1011 /*
1012 * If we already had a key match at a higher level, we know
1013 * we need to use the first entry in this block.
1014 */
1015 if (diff == 0)
1016 keyno = 1;
1017 /*
1018 * Otherwise we need to search this block. Do a binary search.
1019 */
1020 else {
1021 int high; /* high entry number */
1022 xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
1023 xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
1024 int low; /* low entry number */
1025
1026 /*
1027 * Get a pointer to keys or records.
1028 */
1029 if (level > 0)
1030 kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
1031 else
1032 krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
1033 /*
1034 * Set low and high entry numbers, 1-based.
1035 */
1036 low = 1;
1037 if (!(high = be16_to_cpu(block->bb_numrecs))) {
1038 /*
1039 * If the block is empty, the tree must
1040 * be an empty leaf.
1041 */
1042 ASSERT(level == 0 && cur->bc_nlevels == 1);
1043 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1044 *stat = 0;
1045 return 0;
1046 }
1047 /*
1048 * Binary search the block.
1049 */
1050 while (low <= high) {
1051 xfs_extlen_t blockcount; /* key value */
1052 xfs_agblock_t startblock; /* key value */
1053
1054 XFS_STATS_INC(xs_abt_compare);
1055 /*
1056 * keyno is average of low and high.
1057 */
1058 keyno = (low + high) >> 1;
1059 /*
1060 * Get startblock & blockcount.
1061 */
1062 if (level > 0) {
1063 xfs_alloc_key_t *kkp;
1064
1065 kkp = kkbase + keyno - 1;
1066 startblock = be32_to_cpu(kkp->ar_startblock);
1067 blockcount = be32_to_cpu(kkp->ar_blockcount);
1068 } else {
1069 xfs_alloc_rec_t *krp;
1070
1071 krp = krbase + keyno - 1;
1072 startblock = be32_to_cpu(krp->ar_startblock);
1073 blockcount = be32_to_cpu(krp->ar_blockcount);
1074 }
254
255 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
256 ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
1075 /*
1076 * Compute difference to get next direction.
1077 */
1078 if (cur->bc_btnum == XFS_BTNUM_BNO)
1079 diff = (int)startblock -
1080 (int)cur->bc_rec.a.ar_startblock;
1081 else if (!(diff = (int)blockcount -
1082 (int)cur->bc_rec.a.ar_blockcount))
1083 diff = (int)startblock -
1084 (int)cur->bc_rec.a.ar_startblock;
1085 /*
1086 * Less than, move right.
1087 */
1088 if (diff < 0)
1089 low = keyno + 1;
1090 /*
1091 * Greater than, move left.
1092 */
1093 else if (diff > 0)
1094 high = keyno - 1;
1095 /*
1096 * Equal, we're done.
1097 */
1098 else
1099 break;
1100 }
1101 }
1102 /*
1103 * If there are more levels, set up for the next level
1104 * by getting the block number and filling in the cursor.
1105 */
1106 if (level > 0) {
1107 /*
1108 * If we moved left, need the previous key number,
1109 * unless there isn't one.
1110 */
1111 if (diff > 0 && --keyno < 1)
1112 keyno = 1;
1113 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
1114#ifdef DEBUG
1115 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
1116 return error;
1117#endif
1118 cur->bc_ptrs[level] = keyno;
1119 }
1120 }
1121 /*
1122 * Done with the search.
1123 * See if we need to adjust the results.
1124 */
1125 if (dir != XFS_LOOKUP_LE && diff < 0) {
1126 keyno++;
1127 /*
1128 * If ge search and we went off the end of the block, but it's
1129 * not the last block, we're in the wrong block.
1130 */
1131 if (dir == XFS_LOOKUP_GE &&
1132 keyno > be16_to_cpu(block->bb_numrecs) &&
1133 be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1134 int i;
257
258 ptr->s = agf->agf_roots[cur->bc_btnum];
1135
1136 cur->bc_ptrs[0] = keyno;
1137 if ((error = xfs_alloc_increment(cur, 0, &i)))
1138 return error;
1139 XFS_WANT_CORRUPTED_RETURN(i == 1);
1140 *stat = 1;
1141 return 0;
1142 }
1143 }
1144 else if (dir == XFS_LOOKUP_LE && diff > 0)
1145 keyno--;
1146 cur->bc_ptrs[0] = keyno;
1147 /*
1148 * Return if we succeeded or not.
1149 */
1150 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
1151 *stat = 0;
1152 else
1153 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1154 return 0;
1155}
259}
1156
260
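/*
 * Sketch of the two-level key comparison both trees rely on: the bno tree
 * orders by startblock alone, the cnt tree orders by (blockcount,
 * startblock). Returning a signed difference, as the key_diff callback
 * below does, lets one routine drive both the binary search and the lookup
 * direction. Hypothetical standalone helper.
 */
#include <stdint.h>

static int64_t key_diff_sketch(int by_size,
			       uint32_t k_start, uint32_t k_count,
			       uint32_t r_start, uint32_t r_count)
{
	int64_t diff;

	if (!by_size)
		return (int64_t)k_start - r_start;	/* bno tree */
	diff = (int64_t)k_count - r_count;		/* cnt tree: size first */
	return diff ? diff : (int64_t)k_start - r_start;
}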
261STATIC __int64_t
262xfs_allocbt_key_diff(
263 struct xfs_btree_cur *cur,
264 union xfs_btree_key *key)
265{
266 xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
267 xfs_alloc_key_t *kp = &key->alloc;
268 __int64_t diff;
1157/*
1158 * Move 1 record left from cur/level if possible.
1159 * Update cur to reflect the new path.
1160 */
1161STATIC int /* error */
1162xfs_alloc_lshift(
1163 xfs_btree_cur_t *cur, /* btree cursor */
1164 int level, /* level to shift record on */
1165 int *stat) /* success/failure */
1166{
1167 int error; /* error return value */
1168#ifdef DEBUG
1169 int i; /* loop index */
1170#endif
1171 xfs_alloc_key_t key; /* key value for leaf level upward */
1172 xfs_buf_t *lbp; /* buffer for left neighbor block */
1173 xfs_alloc_block_t *left; /* left neighbor btree block */
1174 int nrec; /* new number of left block entries */
1175 xfs_buf_t *rbp; /* buffer for right (current) block */
1176 xfs_alloc_block_t *right; /* right (current) btree block */
1177 xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */
1178 xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */
1179 xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
1180 xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
269
270 if (cur->bc_btnum == XFS_BTNUM_BNO) {
271 return (__int64_t)be32_to_cpu(kp->ar_startblock) -
272 rec->ar_startblock;
1181 /*
1182 * Set up variables for this block as "right".
1183 */
1184 rbp = cur->bc_bufs[level];
1185 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1186#ifdef DEBUG
1187 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1188 return error;
1189#endif
1190 /*
1191 * If we've got no left sibling then we can't shift an entry left.
1192 */
1193 if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
1194 *stat = 0;
1195 return 0;
1196 }
1197 /*
1198 * If the cursor entry is the one that would be moved, don't
1199 * do it... it's too complicated.
1200 */
1201 if (cur->bc_ptrs[level] <= 1) {
1202 *stat = 0;
1203 return 0;
1204 }
1205 /*
1206 * Set up the left neighbor as "left".
1207 */
1208 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1209 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1210 0, &lbp, XFS_ALLOC_BTREE_REF)))
1211 return error;
1212 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1213 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1214 return error;
1215 /*
1216 * If it's full, it can't take another entry.
1217 */
1218 if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1219 *stat = 0;
1220 return 0;
1221 }
273 }
1222 nrec = be16_to_cpu(left->bb_numrecs) + 1;
1223 /*
1224 * If non-leaf, copy a key and a ptr to the left block.
1225 */
1226 if (level > 0) {
1227 xfs_alloc_key_t *lkp; /* key pointer for left block */
1228 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
274
275 diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
276 if (diff)
277 return diff;
1229
1230 lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
1231 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1232 *lkp = *rkp;
1233 xfs_alloc_log_keys(cur, lbp, nrec, nrec);
1234 lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
1235 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1236#ifdef DEBUG
1237 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1238 return error;
1239#endif
1240 *lpp = *rpp;
1241 xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
1242 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1243 }
1244 /*
1245 * If leaf, copy a record to the left block.
1246 */
1247 else {
1248 xfs_alloc_rec_t *lrp; /* record pointer for left block */
278
279 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
1249
1250 lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
1251 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1252 *lrp = *rrp;
1253 xfs_alloc_log_recs(cur, lbp, nrec, nrec);
1254 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1255 }
1256 /*
1257 * Bump and log left's numrecs, decrement and log right's numrecs.
1258 */
1259 be16_add_cpu(&left->bb_numrecs, 1);
1260 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1261 be16_add_cpu(&right->bb_numrecs, -1);
1262 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1263 /*
1264 * Slide the contents of right down one entry.
1265 */
1266 if (level > 0) {
1267#ifdef DEBUG
1268 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1269 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
1270 level)))
1271 return error;
1272 }
1273#endif
1274 memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1275 memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1276 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1277 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1278 } else {
1279 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1280 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1281 key.ar_startblock = rrp->ar_startblock;
1282 key.ar_blockcount = rrp->ar_blockcount;
1283 rkp = &key;
1284 }
1285 /*
1286 * Update the parent key values of right.
1287 */
1288 if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
1289 return error;
1290 /*
1291 * Slide the cursor value left one.
1292 */
1293 cur->bc_ptrs[level]--;
1294 *stat = 1;
1295 return 0;
1296}
280}
1297
281
282STATIC int
283xfs_allocbt_kill_root(
284 struct xfs_btree_cur *cur,
285 struct xfs_buf *bp,
286 int level,
287 union xfs_btree_ptr *newroot)
288{
289 int error;
1298/*
1299 * Allocate a new root block, fill it in.
1300 */
1301STATIC int /* error */
1302xfs_alloc_newroot(
1303 xfs_btree_cur_t *cur, /* btree cursor */
1304 int *stat) /* success/failure */
1305{
1306 int error; /* error return value */
1307 xfs_agblock_t lbno; /* left block number */
1308 xfs_buf_t *lbp; /* left btree buffer */
1309 xfs_alloc_block_t *left; /* left btree block */
1310 xfs_mount_t *mp; /* mount structure */
1311 xfs_agblock_t nbno; /* new block number */
1312 xfs_buf_t *nbp; /* new (root) buffer */
1313 xfs_alloc_block_t *new; /* new (root) btree block */
1314 int nptr; /* new value for key index, 1 or 2 */
1315 xfs_agblock_t rbno; /* right block number */
1316 xfs_buf_t *rbp; /* right btree buffer */
1317 xfs_alloc_block_t *right; /* right btree block */
1318
1319 mp = cur->bc_mp;
1320
1321 ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
290
291 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
292 XFS_BTREE_STATS_INC(cur, killroot);
1322 /*
1323 * Get a buffer from the freelist blocks, for the new root.
1324 */
1325 error = xfs_alloc_get_freelist(cur->bc_tp,
1326 cur->bc_private.a.agbp, &nbno, 1);
1327 if (error)
1328 return error;
1329 /*
1330 * None available, we fail.
1331 */
1332 if (nbno == NULLAGBLOCK) {
1333 *stat = 0;
1334 return 0;
1335 }
1336 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1337 nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
1338 0);
1339 new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
1340 /*
1341 * Set the root data in the a.g. freespace structure.
1342 */
1343 {
1344 xfs_agf_t *agf; /* a.g. freespace header */
1345 xfs_agnumber_t seqno;
1346
293
1347 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
1348 agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
1349 be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1);
1350 seqno = be32_to_cpu(agf->agf_seqno);
1351 mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
1352 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
1353 XFS_AGF_ROOTS | XFS_AGF_LEVELS);
1354 }
1355 /*
1356 * At the previous root level there are now two blocks: the old
1357 * root, and the new block generated when it was split.
1358 * We don't know which one the cursor is pointing at, so we
1359 * set up variables "left" and "right" for each case.
1360 */
1361 lbp = cur->bc_bufs[cur->bc_nlevels - 1];
1362 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1363#ifdef DEBUG
1364 if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
1365 return error;
1366#endif
294 /*
295 * Update the root pointer, decreasing the level by 1, and then
296 * free the old root.
297 */
298 xfs_allocbt_set_root(cur, newroot, -1);
299 error = xfs_allocbt_free_block(cur, bp);
300 if (error) {
301 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
302 return error;
1367 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
1368 /*
1369 * Our block is left, pick up the right block.
1370 */
1371 lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
1372 rbno = be32_to_cpu(left->bb_rightsib);
1373 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1374 cur->bc_private.a.agno, rbno, 0, &rbp,
1375 XFS_ALLOC_BTREE_REF)))
1376 return error;
1377 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1378 if ((error = xfs_btree_check_sblock(cur, right,
1379 cur->bc_nlevels - 1, rbp)))
1380 return error;
1381 nptr = 1;
1382 } else {
1383 /*
1384 * Our block is right, pick up the left block.
1385 */
1386 rbp = lbp;
1387 right = left;
1388 rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
1389 lbno = be32_to_cpu(right->bb_leftsib);
1390 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1391 cur->bc_private.a.agno, lbno, 0, &lbp,
1392 XFS_ALLOC_BTREE_REF)))
1393 return error;
1394 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1395 if ((error = xfs_btree_check_sblock(cur, left,
1396 cur->bc_nlevels - 1, lbp)))
1397 return error;
1398 nptr = 2;
1399 }
303 }
1400 /*
1401 * Fill in the new block's btree header and log it.
1402 */
1403 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1404 new->bb_level = cpu_to_be16(cur->bc_nlevels);
1405 new->bb_numrecs = cpu_to_be16(2);
1406 new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1407 new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1408 xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
1409 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1410 /*
1411 * Fill in the key data in the new root.
1412 */
1413 {
1414 xfs_alloc_key_t *kp; /* btree key pointer */
1415 304
305 XFS_BTREE_STATS_INC(cur, free);
306
307 xfs_btree_setbuf(cur, level, NULL);
308 cur->bc_nlevels--;
309
310 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1416 kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
1417 if (be16_to_cpu(left->bb_level) > 0) {
1418 kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
1419 kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
1420 } else {
1421 xfs_alloc_rec_t *rp; /* btree record pointer */
1422
1423 rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
1424 kp[0].ar_startblock = rp->ar_startblock; 308 cur->bc_nlevels--;
1425 kp[0].ar_blockcount = rp->ar_blockcount;
1426 rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1427 kp[1].ar_startblock = rp->ar_startblock;
1428 kp[1].ar_blockcount = rp->ar_blockcount;
1429 }
1430 }
1431 xfs_alloc_log_keys(cur, nbp, 1, 2);
1432 /*
1433 * Fill in the pointer data in the new root.
1434 */
1435 {
1436 xfs_alloc_ptr_t *pp; /* btree address pointer */
1437 309
1438 pp = XFS_ALLOC_PTR_ADDR(new, 1, cur); 310 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1439 pp[0] = cpu_to_be32(lbno);
1440 pp[1] = cpu_to_be32(rbno);
1441 }
1442 xfs_alloc_log_ptrs(cur, nbp, 1, 2);
1443 /*
1444 * Fix up the cursor.
1445 */
1446 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1447 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1448 cur->bc_nlevels++;
1449 *stat = 1;
1450 return 0;
1451}
1452
311 return 0;
312}
313
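/*
 * Sketch of what newroot does above: after the old root splits into "left"
 * and "right", a fresh one-level-higher root is written with exactly two
 * entries, the first key/pointer of each child. Plain C stand-ins; the real
 * code also fixes sibling links, logs the buffers, and updates the AGF.
 */
struct skey { unsigned start, count; };
struct snode { int nrecs; struct skey keys[2]; unsigned ptrs[2]; };

static void fill_new_root(struct snode *root, struct skey lkey, unsigned lbno,
			  struct skey rkey, unsigned rbno)
{
	root->nrecs = 2;
	root->keys[0] = lkey;	root->ptrs[0] = lbno;	/* left child */
	root->keys[1] = rkey;	root->ptrs[1] = rbno;	/* right child */
}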
1453/*
1454 * Move 1 record right from cur/level if possible.
1455 * Update cur to reflect the new path.
1456 */
1457STATIC int /* error */
1458xfs_alloc_rshift(
1459 xfs_btree_cur_t *cur, /* btree cursor */
1460 int level, /* level to shift record on */
1461 int *stat) /* success/failure */
1462{
1463 int error; /* error return value */
1464 int i; /* loop index */
1465 xfs_alloc_key_t key; /* key value for leaf level upward */
1466 xfs_buf_t *lbp; /* buffer for left (current) block */
1467 xfs_alloc_block_t *left; /* left (current) btree block */
1468 xfs_buf_t *rbp; /* buffer for right neighbor block */
1469 xfs_alloc_block_t *right; /* right neighbor btree block */
1470 xfs_alloc_key_t *rkp; /* key pointer for right block */
1471 xfs_btree_cur_t *tcur; /* temporary cursor */
1472
1473 /*
1474 * Set up variables for this block as "left".
1475 */
1476 lbp = cur->bc_bufs[level];
1477 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1478#ifdef DEBUG
1479 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1480 return error;
1481#endif
1482 /*
1483 * If we've got no right sibling then we can't shift an entry right.
1484 */
1485 if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
1486 *stat = 0;
1487 return 0;
1488 }
1489 /*
1490 * If the cursor entry is the one that would be moved, don't
1491 * do it... it's too complicated.
1492 */
1493 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1494 *stat = 0;
1495 return 0;
1496 }
1497 /*
1498 * Set up the right neighbor as "right".
1499 */
1500 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1501 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1502 0, &rbp, XFS_ALLOC_BTREE_REF)))
1503 return error;
1504 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1505 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1506 return error;
1507 /*
1508 * If it's full, it can't take another entry.
1509 */
1510 if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1511 *stat = 0;
1512 return 0;
1513 }
1514 /*
1515 * Make a hole at the start of the right neighbor block, then
1516 * copy the last left block entry to the hole.
1517 */
1518 if (level > 0) {
1519 xfs_alloc_key_t *lkp; /* key pointer for left block */
1520 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1521 xfs_alloc_ptr_t *rpp; /* address pointer for right block */
1522
1523 lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1524 lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1525 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1526 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1527#ifdef DEBUG
1528 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1529 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
1530 return error;
1531 }
1532#endif
1533 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1534 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1535#ifdef DEBUG
1536 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
1537 return error;
1538#endif
1539 *rkp = *lkp;
1540 *rpp = *lpp;
1541 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1542 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1543 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1544 } else {
1545 xfs_alloc_rec_t *lrp; /* record pointer for left block */
1546 xfs_alloc_rec_t *rrp; /* record pointer for right block */
1547
1548 lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1549 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1550 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1551 *rrp = *lrp;
1552 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1553 key.ar_startblock = rrp->ar_startblock;
1554 key.ar_blockcount = rrp->ar_blockcount;
1555 rkp = &key;
1556 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1557 }
314#ifdef DEBUG
315STATIC int
316xfs_allocbt_keys_inorder(
317 struct xfs_btree_cur *cur,
318 union xfs_btree_key *k1,
319 union xfs_btree_key *k2)
320{
321 if (cur->bc_btnum == XFS_BTNUM_BNO) {
322 return be32_to_cpu(k1->alloc.ar_startblock) <
323 be32_to_cpu(k2->alloc.ar_startblock);
324 } else {
325 return be32_to_cpu(k1->alloc.ar_blockcount) <
326 be32_to_cpu(k2->alloc.ar_blockcount) ||
327 (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
328 be32_to_cpu(k1->alloc.ar_startblock) <
329 be32_to_cpu(k2->alloc.ar_startblock));
330 }
1558 /*
1559 * Decrement and log left's numrecs, bump and log right's numrecs.
1560 */
1561 be16_add_cpu(&left->bb_numrecs, -1);
1562 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1563 be16_add_cpu(&right->bb_numrecs, 1);
1564 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1565 /*
1566 * Using a temporary cursor, update the parent key values of the
1567 * block on the right.
1568 */
1569 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1570 return error;
1571 i = xfs_btree_lastrec(tcur, level);
1572 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1573 if ((error = xfs_alloc_increment(tcur, level, &i)) ||
1574 (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
1575 goto error0;
1576 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1577 *stat = 1;
1578 return 0;
1579error0:
1580 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1581 return error;
1582}
331}
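/*
 * Sketch of a one-record right shift between siblings as done above: make
 * room at the front of the right block, copy the donor's last entry across,
 * and adjust both counts. Illustrative arrays only, with the right array
 * assumed to have spare capacity; the real code also logs both buffers and
 * updates the parent key.
 */
#include <string.h>

struct rec2 { unsigned start, count; };

static void rshift_one(struct rec2 *left, int *lrecs,
		       struct rec2 *right, int *rrecs)
{
	memmove(&right[1], &right[0], *rrecs * sizeof(right[0]));
	right[0] = left[*lrecs - 1];	/* donor's last record */
	(*lrecs)--;
	(*rrecs)++;
}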
1583
332
333STATIC int
334xfs_allocbt_recs_inorder(
335 struct xfs_btree_cur *cur,
336 union xfs_btree_rec *r1,
337 union xfs_btree_rec *r2)
338{
339 if (cur->bc_btnum == XFS_BTNUM_BNO) {
340 return be32_to_cpu(r1->alloc.ar_startblock) +
341 be32_to_cpu(r1->alloc.ar_blockcount) <=
342 be32_to_cpu(r2->alloc.ar_startblock);
343 } else {
344 return be32_to_cpu(r1->alloc.ar_blockcount) <
345 be32_to_cpu(r2->alloc.ar_blockcount) ||
346 (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
347 be32_to_cpu(r1->alloc.ar_startblock) <
348 be32_to_cpu(r2->alloc.ar_startblock));
1584/*
1585 * Split cur/level block in half.
1586 * Return new block number and its first record (to be inserted into parent).
1587 */
1588STATIC int /* error */
1589xfs_alloc_split(
1590 xfs_btree_cur_t *cur, /* btree cursor */
1591 int level, /* level to split */
1592 xfs_agblock_t *bnop, /* output: block number allocated */
1593 xfs_alloc_key_t *keyp, /* output: first key of new block */
1594 xfs_btree_cur_t **curp, /* output: new cursor */
1595 int *stat) /* success/failure */
1596{
1597 int error; /* error return value */
1598 int i; /* loop index/record number */
1599 xfs_agblock_t lbno; /* left (current) block number */
1600 xfs_buf_t *lbp; /* buffer for left block */
1601 xfs_alloc_block_t *left; /* left (current) btree block */
1602 xfs_agblock_t rbno; /* right (new) block number */
1603 xfs_buf_t *rbp; /* buffer for right block */
1604 xfs_alloc_block_t *right; /* right (new) btree block */
1605
1606 /*
1607 * Allocate the new block from the freelist.
1608 * If we can't do it, we're toast. Give up.
1609 */
1610 error = xfs_alloc_get_freelist(cur->bc_tp,
1611 cur->bc_private.a.agbp, &rbno, 1);
1612 if (error)
1613 return error;
1614 if (rbno == NULLAGBLOCK) {
1615 *stat = 0;
1616 return 0;
1617 }
1618 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1619 rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
1620 rbno, 0);
1621 /*
1622 * Set up the new block as "right".
1623 */
1624 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1625 /*
1626 * "Left" is the current (according to the cursor) block.
1627 */
1628 lbp = cur->bc_bufs[level];
1629 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1630#ifdef DEBUG
1631 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1632 return error;
1633#endif
1634 /*
1635 * Fill in the btree header for the new block.
1636 */
1637 right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1638 right->bb_level = left->bb_level;
1639 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1640 /*
1641 * Make sure that if there's an odd number of entries now, that
1642 * each new block will have the same number of entries.
1643 */
1644 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1645 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1646 be16_add_cpu(&right->bb_numrecs, 1);
1647 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1648 /*
1649 * For non-leaf blocks, copy keys and addresses over to the new block.
1650 */
1651 if (level > 0) {
1652 xfs_alloc_key_t *lkp; /* left btree key pointer */
1653 xfs_alloc_ptr_t *lpp; /* left btree address pointer */
1654 xfs_alloc_key_t *rkp; /* right btree key pointer */
1655 xfs_alloc_ptr_t *rpp; /* right btree address pointer */
1656
1657 lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
1658 lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
1659 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1660 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1661#ifdef DEBUG
1662 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1663 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
1664 return error;
1665 }
1666#endif
1667 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1668 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1669 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1670 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1671 *keyp = *rkp;
1672 }
349 }
350}
351#endif /* DEBUG */
1673 /*
1674 * For leaf blocks, copy records over to the new block.
1675 */
1676 else {
1677 xfs_alloc_rec_t *lrp; /* left btree record pointer */
1678 xfs_alloc_rec_t *rrp; /* right btree record pointer */
1679
352
353#ifdef XFS_BTREE_TRACE
354ktrace_t *xfs_allocbt_trace_buf;
1680 lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
1681 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1682 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1683 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1684 keyp->ar_startblock = rrp->ar_startblock;
1685 keyp->ar_blockcount = rrp->ar_blockcount;
1686 }
1687 /*
1688 * Find the left block number by looking in the buffer.
1689 * Adjust numrecs, sibling pointers.
1690 */
1691 lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
1692 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1693 right->bb_rightsib = left->bb_rightsib;
1694 left->bb_rightsib = cpu_to_be32(rbno);
1695 right->bb_leftsib = cpu_to_be32(lbno);
1696 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
1697 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1698 /*
1699 * If there's a block to the new block's right, make that block
1700 * point back to right instead of to left.
1701 */
1702 if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
1703 xfs_alloc_block_t *rrblock; /* rr btree block */
1704 xfs_buf_t *rrbp; /* buffer for rrblock */
1705
355
1706 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1707 cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0,
1708 &rrbp, XFS_ALLOC_BTREE_REF)))
1709 return error;
1710 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
1711 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1712 return error;
1713 rrblock->bb_leftsib = cpu_to_be32(rbno);
1714 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
1715 }
1716 /*
1717 * If the cursor is really in the right block, move it there.
1718 * If it's just pointing past the last entry in left, then we'll
1719 * insert there, so don't change anything in that case.
1720 */
1721 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1722 xfs_btree_setbuf(cur, level, rbp);
1723 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1724 }
1725 /*
1726 * If there are more levels, we'll need another cursor which refers to
1727 * the right block, no matter where this cursor was.
1728 */
1729 if (level + 1 < cur->bc_nlevels) {
1730 if ((error = xfs_btree_dup_cursor(cur, curp)))
1731 return error;
1732 (*curp)->bc_ptrs[level + 1]++;
1733 }
1734 *bnop = rbno;
1735 *stat = 1;
1736 return 0;
1737}
356STATIC void
357xfs_allocbt_trace_enter(
358 struct xfs_btree_cur *cur,
359 const char *func,
360 char *s,
361 int type,
362 int line,
363 __psunsigned_t a0,
364 __psunsigned_t a1,
365 __psunsigned_t a2,
366 __psunsigned_t a3,
367 __psunsigned_t a4,
368 __psunsigned_t a5,
369 __psunsigned_t a6,
370 __psunsigned_t a7,
371 __psunsigned_t a8,
372 __psunsigned_t a9,
373 __psunsigned_t a10)
374{
375 ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type,
376 (void *)func, (void *)s, NULL, (void *)cur,
377 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
378 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
379 (void *)a8, (void *)a9, (void *)a10);
380}
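/*
 * Sketch of the split arithmetic used above: the new right block takes half
 * the records, rounded so that an odd count still leaves both halves equal
 * after the pending insert lands. Hypothetical helper, not the kernel
 * function; insert_ptr is the 1-based position the caller intends to use.
 */
static int split_count(int numrecs, int insert_ptr)
{
	int rrecs = numrecs / 2;

	/* with an odd count, give the extra record to the insert side */
	if ((numrecs & 1) && insert_ptr <= rrecs + 1)
		rrecs++;
	return rrecs;
}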
1738
1739/*
1740 * Update keys at all levels from here to the root along the cursor's path.
1741 */
1742STATIC int /* error */
1743xfs_alloc_updkey(
1744 xfs_btree_cur_t *cur, /* btree cursor */
1745 xfs_alloc_key_t *keyp, /* new key value to update to */
1746 int level) /* starting level for update */
1747{
1748 int ptr; /* index of key in block */
1749
381
382STATIC void
383xfs_allocbt_trace_cursor(
384 struct xfs_btree_cur *cur,
385 __uint32_t *s0,
386 __uint64_t *l0,
387 __uint64_t *l1)
388{
389 *s0 = cur->bc_private.a.agno;
390 *l0 = cur->bc_rec.a.ar_startblock;
391 *l1 = cur->bc_rec.a.ar_blockcount;
1750 /*
1751 * Go up the tree from this level toward the root.
1752 * At each level, update the key value to the value input.
1753 * Stop when we reach a level where the cursor isn't pointing
1754 * at the first entry in the block.
1755 */
1756 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1757 xfs_alloc_block_t *block; /* btree block */
1758 xfs_buf_t *bp; /* buffer for block */
1759#ifdef DEBUG
1760 int error; /* error return value */
1761#endif
1762 xfs_alloc_key_t *kp; /* ptr to btree block keys */
1763
1764 bp = cur->bc_bufs[level];
1765 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1766#ifdef DEBUG
1767 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1768 return error;
1769#endif
1770 ptr = cur->bc_ptrs[level];
1771 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
1772 *kp = *keyp;
1773 xfs_alloc_log_keys(cur, bp, ptr, ptr);
1774 }
1775 return 0;
1776}
392}
1777
393
394STATIC void
395xfs_allocbt_trace_key(
396 struct xfs_btree_cur *cur,
397 union xfs_btree_key *key,
398 __uint64_t *l0,
399 __uint64_t *l1)
400{
401 *l0 = be32_to_cpu(key->alloc.ar_startblock);
402 *l1 = be32_to_cpu(key->alloc.ar_blockcount);
1778/*
1779 * Externally visible routines.
1780 */
1781
1782/*
1783 * Decrement cursor by one record at the level.
1784 * For nonzero levels the leaf-ward information is untouched.
1785 */
1786int /* error */
1787xfs_alloc_decrement(
1788 xfs_btree_cur_t *cur, /* btree cursor */
1789 int level, /* level in btree, 0 is leaf */
1790 int *stat) /* success/failure */
1791{
1792 xfs_alloc_block_t *block; /* btree block */
1793 int error; /* error return value */
1794 int lev; /* btree level */
1795
1796 ASSERT(level < cur->bc_nlevels);
1797 /*
1798 * Read-ahead to the left at this level.
1799 */
1800 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1801 /*
1802 * Decrement the ptr at this level. If we're still in the block
1803 * then we're done.
1804 */
1805 if (--cur->bc_ptrs[level] > 0) {
1806 *stat = 1;
1807 return 0;
1808 }
1809 /*
1810 * Get a pointer to the btree block.
1811 */
1812 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
1813#ifdef DEBUG
1814 if ((error = xfs_btree_check_sblock(cur, block, level,
1815 cur->bc_bufs[level])))
1816 return error;
1817#endif
1818 /*
1819 * If we just went off the left edge of the tree, return failure.
1820 */
1821 if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
1822 *stat = 0;
1823 return 0;
1824 }
1825 /*
1826 * March up the tree decrementing pointers.
1827 * Stop when we don't go off the left edge of a block.
1828 */
1829 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1830 if (--cur->bc_ptrs[lev] > 0)
1831 break;
1832 /*
1833 * Read-ahead the left block, we're going to read it
1834 * in the next loop.
1835 */
1836 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1837 }
1838 /*
1839 * If we went off the root then we are seriously confused.
1840 */
1841 ASSERT(lev < cur->bc_nlevels);
1842 /*
1843 * Now walk back down the tree, fixing up the cursor's buffer
1844 * pointers and key numbers.
1845 */
1846 for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1847 xfs_agblock_t agbno; /* block number of btree block */
1848 xfs_buf_t *bp; /* buffer pointer for block */
1849
1850 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1851 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1852 cur->bc_private.a.agno, agbno, 0, &bp,
1853 XFS_ALLOC_BTREE_REF)))
1854 return error;
1855 lev--;
1856 xfs_btree_setbuf(cur, lev, bp);
1857 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1858 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1859 return error;
1860 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1861 }
1862 *stat = 1;
1863 return 0;
1864} 403}
1865 404
1866/* 405STATIC void
1867 * Delete the record pointed to by cur. 406xfs_allocbt_trace_record(
1868 * The cursor refers to the place where the record was (could be inserted) 407 struct xfs_btree_cur *cur,
1869 * when the operation returns. 408 union xfs_btree_rec *rec,
1870 */ 409 __uint64_t *l0,
1871int /* error */ 410 __uint64_t *l1,
1872xfs_alloc_delete( 411 __uint64_t *l2)
1873 xfs_btree_cur_t *cur, /* btree cursor */
1874 int *stat) /* success/failure */
1875{ 412{
1876 int error; /* error return value */ 413 *l0 = be32_to_cpu(rec->alloc.ar_startblock);
1877 int i; /* result code */ 414 *l1 = be32_to_cpu(rec->alloc.ar_blockcount);
1878 int level; /* btree level */ 415 *l2 = 0;
1879
1880 /*
1881 * Go up the tree, starting at leaf level.
1882 * If 2 is returned then a join was done; go to the next level.
1883 * Otherwise we are done.
1884 */
1885 for (level = 0, i = 2; i == 2; level++) {
1886 if ((error = xfs_alloc_delrec(cur, level, &i)))
1887 return error;
1888 }
1889 if (i == 0) {
1890 for (level = 1; level < cur->bc_nlevels; level++) {
1891 if (cur->bc_ptrs[level] == 0) {
1892 if ((error = xfs_alloc_decrement(cur, level, &i)))
1893 return error;
1894 break;
1895 }
1896 }
1897 }
1898 *stat = i;
1899 return 0;
1900} 416}
417#endif /* XFS_BTREE_TRACE */
418
419static const struct xfs_btree_ops xfs_allocbt_ops = {
420 .rec_len = sizeof(xfs_alloc_rec_t),
421 .key_len = sizeof(xfs_alloc_key_t),
422
423 .dup_cursor = xfs_allocbt_dup_cursor,
424 .set_root = xfs_allocbt_set_root,
425 .kill_root = xfs_allocbt_kill_root,
426 .alloc_block = xfs_allocbt_alloc_block,
427 .free_block = xfs_allocbt_free_block,
428 .update_lastrec = xfs_allocbt_update_lastrec,
429 .get_minrecs = xfs_allocbt_get_minrecs,
430 .get_maxrecs = xfs_allocbt_get_maxrecs,
431 .init_key_from_rec = xfs_allocbt_init_key_from_rec,
432 .init_rec_from_key = xfs_allocbt_init_rec_from_key,
433 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
434 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
435 .key_diff = xfs_allocbt_key_diff,
1901 436
1902/*
1903 * Get the data from the pointed-to record.
1904 */
1905int /* error */
1906xfs_alloc_get_rec(
1907 xfs_btree_cur_t *cur, /* btree cursor */
1908 xfs_agblock_t *bno, /* output: starting block of extent */
1909 xfs_extlen_t *len, /* output: length of extent */
1910 int *stat) /* output: success/failure */
1911{
1912 xfs_alloc_block_t *block; /* btree block */
1913#ifdef DEBUG 437#ifdef DEBUG
1914 int error; /* error return value */ 438 .keys_inorder = xfs_allocbt_keys_inorder,
439 .recs_inorder = xfs_allocbt_recs_inorder,
1915#endif 440#endif
1916 int ptr; /* record number */
1917 441
1918 ptr = cur->bc_ptrs[0]; 442#ifdef XFS_BTREE_TRACE
1919 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); 443 .trace_enter = xfs_allocbt_trace_enter,
1920#ifdef DEBUG 444 .trace_cursor = xfs_allocbt_trace_cursor,
1921 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) 445 .trace_key = xfs_allocbt_trace_key,
1922 return error; 446 .trace_record = xfs_allocbt_trace_record,
1923#endif 447#endif
1924 /* 448};
1925 * Off the right end or left end, return failure.
1926 */
1927 if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
1928 *stat = 0;
1929 return 0;
1930 }
1931 /*
1932 * Point to the record and extract its data.
1933 */
1934 {
1935 xfs_alloc_rec_t *rec; /* record data */
1936
1937 rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
1938 *bno = be32_to_cpu(rec->ar_startblock);
1939 *len = be32_to_cpu(rec->ar_blockcount);
1940 }
1941 *stat = 1;
1942 return 0;
1943}
1944 449
1945/* 450/*
1946 * Increment cursor by one record at the level. 451 * Allocate a new allocation btree cursor.
1947 * For nonzero levels the leaf-ward information is untouched.
1948 */ 452 */
1949int /* error */ 453struct xfs_btree_cur * /* new alloc btree cursor */
1950xfs_alloc_increment( 454xfs_allocbt_init_cursor(
1951 xfs_btree_cur_t *cur, /* btree cursor */ 455 struct xfs_mount *mp, /* file system mount point */
1952 int level, /* level in btree, 0 is leaf */ 456 struct xfs_trans *tp, /* transaction pointer */
1953 int *stat) /* success/failure */ 457 struct xfs_buf *agbp, /* buffer for agf structure */
458 xfs_agnumber_t agno, /* allocation group number */
459 xfs_btnum_t btnum) /* btree identifier */
1954{ 460{
1955 xfs_alloc_block_t *block; /* btree block */ 461 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
1956 xfs_buf_t *bp; /* tree block buffer */ 462 struct xfs_btree_cur *cur;
1957 int error; /* error return value */
1958 int lev; /* btree level */
1959
1960 ASSERT(level < cur->bc_nlevels);
1961 /*
1962 * Read-ahead to the right at this level.
1963 */
1964 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1965 /*
1966 * Get a pointer to the btree block.
1967 */
1968 bp = cur->bc_bufs[level];
1969 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1970#ifdef DEBUG
1971 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1972 return error;
1973#endif
1974 /*
1975 * Increment the ptr at this level. If we're still in the block
1976 * then we're done.
1977 */
1978 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
1979 *stat = 1;
1980 return 0;
1981 }
1982 /*
1983 * If we just went off the right edge of the tree, return failure.
1984 */
1985 if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
1986 *stat = 0;
1987 return 0;
1988 }
1989 /*
1990 * March up the tree incrementing pointers.
1991 * Stop when we don't go off the right edge of a block.
1992 */
1993 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1994 bp = cur->bc_bufs[lev];
1995 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1996#ifdef DEBUG
1997 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1998 return error;
1999#endif
2000 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
2001 break;
2002 /*
2003 * Read-ahead the right block, we're going to read it
2004 * in the next loop.
2005 */
2006 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2007 }
2008 /*
2009 * If we went off the root then we are seriously confused.
2010 */
2011 ASSERT(lev < cur->bc_nlevels);
2012 /*
2013 * Now walk back down the tree, fixing up the cursor's buffer
2014 * pointers and key numbers.
2015 */
2016 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2017 lev > level; ) {
2018 xfs_agblock_t agbno; /* block number of btree block */
2019 463
2020 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 464 ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
2021 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
2022 cur->bc_private.a.agno, agbno, 0, &bp,
2023 XFS_ALLOC_BTREE_REF)))
2024 return error;
2025 lev--;
2026 xfs_btree_setbuf(cur, lev, bp);
2027 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2028 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
2029 return error;
2030 cur->bc_ptrs[lev] = 1;
2031 }
2032 *stat = 1;
2033 return 0;
2034}
2035 465
2036/* 466 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
2037 * Insert the current record at the point referenced by cur.
2038 * The cursor may be inconsistent on return if splits have been done.
2039 */
2040int /* error */
2041xfs_alloc_insert(
2042 xfs_btree_cur_t *cur, /* btree cursor */
2043 int *stat) /* success/failure */
2044{
2045 int error; /* error return value */
2046 int i; /* result value, 0 for failure */
2047 int level; /* current level number in btree */
2048 xfs_agblock_t nbno; /* new block number (split result) */
2049 xfs_btree_cur_t *ncur; /* new cursor (split result) */
2050 xfs_alloc_rec_t nrec; /* record being inserted this level */
2051 xfs_btree_cur_t *pcur; /* previous level's cursor */
2052 467
2053 level = 0; 468 cur->bc_tp = tp;
2054 nbno = NULLAGBLOCK; 469 cur->bc_mp = mp;
2055 nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); 470 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]);
2056 nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); 471 cur->bc_btnum = btnum;
2057 ncur = NULL; 472 cur->bc_blocklog = mp->m_sb.sb_blocklog;
2058 pcur = cur;
2059 /*
2060 * Loop going up the tree, starting at the leaf level.
2061 * Stop when we don't get a split block, that must mean that
2062 * the insert is finished with this level.
2063 */
2064 do {
2065 /*
2066 * Insert nrec/nbno into this level of the tree.
2067 * Note if we fail, nbno will be null.
2068 */
2069 if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
2070 &i))) {
2071 if (pcur != cur)
2072 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2073 return error;
2074 }
2075 /*
2076 * See if the cursor we just used is trash.
2077 * Can't trash the caller's cursor, but otherwise we should
2078 * if ncur is a new cursor or we're about to be done.
2079 */
2080 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
2081 cur->bc_nlevels = pcur->bc_nlevels;
2082 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2083 }
2084 /*
2085 * If we got a new cursor, switch to it.
2086 */
2087 if (ncur) {
2088 pcur = ncur;
2089 ncur = NULL;
2090 }
2091 } while (nbno != NULLAGBLOCK);
2092 *stat = i;
2093 return 0;
2094}
2095 473
2096/* 474 cur->bc_ops = &xfs_allocbt_ops;
2097 * Lookup the record equal to [bno, len] in the btree given by cur. 475 if (btnum == XFS_BTNUM_CNT)
2098 */ 476 cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
2099int /* error */
2100xfs_alloc_lookup_eq(
2101 xfs_btree_cur_t *cur, /* btree cursor */
2102 xfs_agblock_t bno, /* starting block of extent */
2103 xfs_extlen_t len, /* length of extent */
2104 int *stat) /* success/failure */
2105{
2106 cur->bc_rec.a.ar_startblock = bno;
2107 cur->bc_rec.a.ar_blockcount = len;
2108 return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
2109}
2110 477
2111/* 478 cur->bc_private.a.agbp = agbp;
2112 * Lookup the first record greater than or equal to [bno, len] 479 cur->bc_private.a.agno = agno;
2113 * in the btree given by cur.
2114 */
2115int /* error */
2116xfs_alloc_lookup_ge(
2117 xfs_btree_cur_t *cur, /* btree cursor */
2118 xfs_agblock_t bno, /* starting block of extent */
2119 xfs_extlen_t len, /* length of extent */
2120 int *stat) /* success/failure */
2121{
2122 cur->bc_rec.a.ar_startblock = bno;
2123 cur->bc_rec.a.ar_blockcount = len;
2124 return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
2125}
2126 480
2127/* 481 return cur;
2128 * Lookup the first record less than or equal to [bno, len]
2129 * in the btree given by cur.
2130 */
2131int /* error */
2132xfs_alloc_lookup_le(
2133 xfs_btree_cur_t *cur, /* btree cursor */
2134 xfs_agblock_t bno, /* starting block of extent */
2135 xfs_extlen_t len, /* length of extent */
2136 int *stat) /* success/failure */
2137{
2138 cur->bc_rec.a.ar_startblock = bno;
2139 cur->bc_rec.a.ar_blockcount = len;
2140 return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
2141} 482}
2142 483
2143/* 484/*
2144 * Update the record referred to by cur, to the value given by [bno, len]. 485 * Calculate number of records in an alloc btree block.
2145 * This either works (return 0) or gets an EFSCORRUPTED error.
2146 */ 486 */
2147int /* error */ 487int
2148xfs_alloc_update( 488xfs_allocbt_maxrecs(
2149 xfs_btree_cur_t *cur, /* btree cursor */ 489 struct xfs_mount *mp,
2150 xfs_agblock_t bno, /* starting block of extent */ 490 int blocklen,
2151 xfs_extlen_t len) /* length of extent */ 491 int leaf)
2152{ 492{
2153 xfs_alloc_block_t *block; /* btree block to update */ 493 blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
2154 int error; /* error return value */
2155 int ptr; /* current record number (updating) */
2156 494
2157 ASSERT(len > 0); 495 if (leaf)
2158 /* 496 return blocklen / sizeof(xfs_alloc_rec_t);
2159 * Pick up the a.g. freelist struct and the current block. 497 return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
2160 */
2161 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
2162#ifdef DEBUG
2163 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
2164 return error;
2165#endif
2166 /*
2167 * Get the address of the rec to be updated.
2168 */
2169 ptr = cur->bc_ptrs[0];
2170 {
2171 xfs_alloc_rec_t *rp; /* pointer to updated record */
2172
2173 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
2174 /*
2175 * Fill in the new contents and log them.
2176 */
2177 rp->ar_startblock = cpu_to_be32(bno);
2178 rp->ar_blockcount = cpu_to_be32(len);
2179 xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
2180 }
2181 /*
2182 * If it's the by-size btree and it's the last leaf block and
2183 * it's the last record... then update the size of the longest
2184 * extent in the a.g., which we cache in the a.g. freelist header.
2185 */
2186 if (cur->bc_btnum == XFS_BTNUM_CNT &&
2187 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
2188 ptr == be16_to_cpu(block->bb_numrecs)) {
2189 xfs_agf_t *agf; /* a.g. freespace header */
2190 xfs_agnumber_t seqno;
2191
2192 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
2193 seqno = be32_to_cpu(agf->agf_seqno);
2194 cur->bc_mp->m_perag[seqno].pagf_longest = len;
2195 agf->agf_longest = cpu_to_be32(len);
2196 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
2197 XFS_AGF_LONGEST);
2198 }
2199 /*
2200 * Updating first record in leaf. Pass new key value up to our parent.
2201 */
2202 if (ptr == 1) {
2203 xfs_alloc_key_t key; /* key containing [bno, len] */
2204
2205 key.ar_startblock = cpu_to_be32(bno);
2206 key.ar_blockcount = cpu_to_be32(len);
2207 if ((error = xfs_alloc_updkey(cur, &key, 1)))
2208 return error;
2209 }
2210 return 0;
2211} 498}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 5bd1a2c8bd07..a6caa0022c9b 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -24,7 +24,6 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
@@ -50,16 +49,6 @@ typedef struct xfs_alloc_rec_incore {
 
 /* btree pointer type */
 typedef __be32 xfs_alloc_ptr_t;
-/* btree block header type */
-typedef struct xfs_btree_sblock xfs_alloc_block_t;
-
-#define XFS_BUF_TO_ALLOC_BLOCK(bp)	((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
-
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
-#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
 
 /*
  * Minimum and maximum blocksize and sectorsize.
@@ -83,73 +72,39 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t;
 #define XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
 
 /*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_ALLOC_REC_ADDR(bb,i,cur)	\
-	XFS_BTREE_REC_ADDR(xfs_alloc, bb, i)
-
-#define XFS_ALLOC_KEY_ADDR(bb,i,cur)	\
-	XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i)
-
-#define XFS_ALLOC_PTR_ADDR(bb,i,cur)	\
-	XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur, xfs_agblock_t *bno,
-		xfs_extlen_t *len, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
  */
-extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, int *stat);
+#define XFS_ALLOC_BLOCK_LEN(mp)	XFS_BTREE_SBLOCK_LEN
 
 /*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len);
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+	((xfs_alloc_rec_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+	((xfs_alloc_key_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+	((xfs_alloc_ptr_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 (maxrecs) * sizeof(xfs_alloc_key_t) + \
+		 ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+		struct xfs_trans *, struct xfs_buf *,
+		xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 0b3b5efe848c..53d5e70d1360 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -41,21 +41,36 @@
 #endif
 
 #ifdef XFS_NATIVE_HOST
-#define cpu_to_be16(val)	((__be16)(val))
-#define cpu_to_be32(val)	((__be32)(val))
-#define cpu_to_be64(val)	((__be64)(val))
-#define be16_to_cpu(val)	((__uint16_t)(val))
-#define be32_to_cpu(val)	((__uint32_t)(val))
-#define be64_to_cpu(val)	((__uint64_t)(val))
+#define cpu_to_be16(val)	((__force __be16)(__u16)(val))
+#define cpu_to_be32(val)	((__force __be32)(__u32)(val))
+#define cpu_to_be64(val)	((__force __be64)(__u64)(val))
+#define be16_to_cpu(val)	((__force __u16)(__be16)(val))
+#define be32_to_cpu(val)	((__force __u32)(__be32)(val))
+#define be64_to_cpu(val)	((__force __u64)(__be64)(val))
 #else
-#define cpu_to_be16(val)	(__swab16((__uint16_t)(val)))
-#define cpu_to_be32(val)	(__swab32((__uint32_t)(val)))
-#define cpu_to_be64(val)	(__swab64((__uint64_t)(val)))
-#define be16_to_cpu(val)	(__swab16((__be16)(val)))
-#define be32_to_cpu(val)	(__swab32((__be32)(val)))
-#define be64_to_cpu(val)	(__swab64((__be64)(val)))
+#define cpu_to_be16(val)	((__force __be16)__swab16((__u16)(val)))
+#define cpu_to_be32(val)	((__force __be32)__swab32((__u32)(val)))
+#define cpu_to_be64(val)	((__force __be64)__swab64((__u64)(val)))
+#define be16_to_cpu(val)	(__swab16((__force __u16)(__be16)(val)))
+#define be32_to_cpu(val)	(__swab32((__force __u32)(__be32)(val)))
+#define be64_to_cpu(val)	(__swab64((__force __u64)(__be64)(val)))
 #endif
 
+static inline void be16_add_cpu(__be16 *a, __s16 b)
+{
+	*a = cpu_to_be16(be16_to_cpu(*a) + b);
+}
+
+static inline void be32_add_cpu(__be32 *a, __s32 b)
+{
+	*a = cpu_to_be32(be32_to_cpu(*a) + b);
+}
+
+static inline void be64_add_cpu(__be64 *a, __s64 b)
+{
+	*a = cpu_to_be64(be64_to_cpu(*a) + b);
+}
+
 #endif	/* __KERNEL__ */
 
 /* do we need conversion? */
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 8e0e463dae2d..bca7b243c319 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -61,8 +61,7 @@ static inline int xfs_highbit64(__uint64_t v)
 /* Get low bit set out of 32-bit argument, -1 if none set */
 static inline int xfs_lowbit32(__uint32_t v)
 {
-	unsigned long	t = v;
-	return (v) ? find_first_bit(&t, 32) : -1;
+	return ffs(v) - 1;
 }
 
 /* Get low bit set out of 64-bit argument, -1 if none set */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index a1aab9275d5a..138308e70d14 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -393,8 +393,8 @@ xfs_bmap_count_leaves(
 
 STATIC void
 xfs_bmap_disk_count_leaves(
-	xfs_extnum_t		idx,
-	xfs_bmbt_block_t	*block,
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
 	int			numrecs,
 	int			*count);
 
@@ -402,6 +402,53 @@ xfs_bmap_disk_count_leaves(
  * Bmap internal routines.
  */
 
+STATIC int				/* error */
+xfs_bmbt_lookup_eq(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int				/* error */
+xfs_bmbt_lookup_ge(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	xfs_exntst_t		state)
+{
+	union xfs_btree_rec	rec;
+
+	xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+	return xfs_btree_update(cur, &rec);
+}
+
 /*
  * Called from xfs_bmap_add_attrfork to handle btree format files.
  */
@@ -422,15 +469,14 @@ xfs_bmap_add_attrfork_btree(
 	if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
 		*flags |= XFS_ILOG_DBROOT;
 	else {
-		cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-			XFS_DATA_FORK);
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
 		cur->bc_private.b.flist = flist;
 		cur->bc_private.b.firstblock = *firstblock;
 		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
 			goto error0;
 		/* must be at least one entry */
 		XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
-		if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
+		if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
 			goto error0;
 		if (stat == 0) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -818,10 +864,10 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_blockcount, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -931,7 +977,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1007,7 +1053,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1097,7 +1143,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1152,7 +1198,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1379,16 +1425,16 @@ xfs_bmap_add_extent_unwritten_real(
 					RIGHT.br_blockcount, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1428,10 +1474,10 @@ xfs_bmap_add_extent_unwritten_real(
 					&i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1471,10 +1517,10 @@ xfs_bmap_add_extent_unwritten_real(
 					RIGHT.br_blockcount, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, new->br_startoff,
@@ -1557,7 +1603,7 @@ xfs_bmap_add_extent_unwritten_real(
 				PREV.br_blockcount - new->br_blockcount,
 				oldext)))
 				goto done;
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			if (xfs_bmbt_update(cur, LEFT.br_startoff,
 				LEFT.br_startblock,
@@ -1605,7 +1651,7 @@ xfs_bmap_add_extent_unwritten_real(
 				oldext)))
 				goto done;
 			cur->bc_rec.b = *new;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1647,7 +1693,7 @@ xfs_bmap_add_extent_unwritten_real(
 				PREV.br_blockcount - new->br_blockcount,
 				oldext)))
 				goto done;
-			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+			if ((error = xfs_btree_increment(cur, 0, &i)))
 				goto done;
 			if ((error = xfs_bmbt_update(cur, new->br_startoff,
 				new->br_startblock,
@@ -1695,7 +1741,7 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1743,7 +1789,7 @@ xfs_bmap_add_extent_unwritten_real(
 			cur->bc_rec.b = PREV;
 			cur->bc_rec.b.br_blockcount =
 				new->br_startoff - PREV.br_startoff;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			/*
@@ -1758,7 +1804,7 @@ xfs_bmap_add_extent_unwritten_real(
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			/* new middle extent - newext */
 			cur->bc_rec.b.br_state = new->br_state;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -2106,10 +2152,10 @@ xfs_bmap_add_extent_hole_real(
 					right.br_blockcount, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, left.br_startoff,
@@ -2218,7 +2264,7 @@ xfs_bmap_add_extent_hole_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = new->br_state;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -2996,24 +3042,24 @@ xfs_bmap_btree_to_extents(
 	int			whichfork)  /* data or attr fork */
 {
 	/* REFERENCED */
-	xfs_bmbt_block_t	*cblock;/* child btree block */
+	struct xfs_btree_block	*cblock;/* child btree block */
 	xfs_fsblock_t		cbno;	/* child block number */
 	xfs_buf_t		*cbp;	/* child block's buffer */
 	int			error;	/* error return value */
 	xfs_ifork_t		*ifp;	/* inode fork data */
 	xfs_mount_t		*mp;	/* mount point structure */
 	__be64			*pp;	/* ptr to block address */
-	xfs_bmbt_block_t	*rblock;/* root btree block */
+	struct xfs_btree_block	*rblock;/* root btree block */
 
+	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
 	rblock = ifp->if_broot;
 	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
 	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-	ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
-	mp = ip->i_mount;
-	pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
+	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
 	cbno = be64_to_cpu(*pp);
 	*logflagsp = 0;
 #ifdef DEBUG
@@ -3023,8 +3069,8 @@ xfs_bmap_btree_to_extents(
 	if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
 			XFS_BMAP_BTREE_REF)))
 		return error;
-	cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
-	if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
+	cblock = XFS_BUF_TO_BLOCK(cbp);
+	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
 		return error;
 	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
 	ip->i_d.di_nblocks--;
@@ -3170,7 +3216,7 @@ xfs_bmap_del_extent(
 			flags |= XFS_ILOG_FEXT(whichfork);
 			break;
 		}
-		if ((error = xfs_bmbt_delete(cur, &i)))
+		if ((error = xfs_btree_delete(cur, &i)))
 			goto done;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		break;
@@ -3254,10 +3300,10 @@ xfs_bmap_del_extent(
 				got.br_startblock, temp,
 				got.br_state)))
 				goto done;
-			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+			if ((error = xfs_btree_increment(cur, 0, &i)))
 				goto done;
 			cur->bc_rec.b = new;
-			error = xfs_bmbt_insert(cur, &i);
+			error = xfs_btree_insert(cur, &i);
 			if (error && error != ENOSPC)
 				goto done;
 			/*
@@ -3404,11 +3450,11 @@ xfs_bmap_extents_to_btree(
 	int			*logflagsp, /* inode logging flags */
 	int			whichfork)  /* data or attr fork */
 {
-	xfs_bmbt_block_t	*ablock;   /* allocated (child) bt block */
+	struct xfs_btree_block	*ablock;   /* allocated (child) bt block */
 	xfs_buf_t		*abp;	   /* buffer for ablock */
 	xfs_alloc_arg_t		args;	   /* allocation arguments */
 	xfs_bmbt_rec_t		*arp;	   /* child record pointer */
-	xfs_bmbt_block_t	*block;	   /* btree root block */
+	struct xfs_btree_block	*block;	   /* btree root block */
 	xfs_btree_cur_t		*cur;	   /* bmap btree cursor */
 	xfs_bmbt_rec_host_t	*ep;	   /* extent record pointer */
 	int			error;	   /* error return value */
@@ -3428,6 +3474,7 @@ xfs_bmap_extents_to_btree(
 	 */
 	xfs_iroot_realloc(ip, 1, whichfork);
 	ifp->if_flags |= XFS_IFBROOT;
+
 	/*
 	 * Fill in the root.
 	 */
@@ -3435,14 +3482,14 @@ xfs_bmap_extents_to_btree(
 	block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
 	block->bb_level = cpu_to_be16(1);
 	block->bb_numrecs = cpu_to_be16(1);
-	block->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-	block->bb_rightsib = cpu_to_be64(NULLDFSBNO);
+	block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+	block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+
 	/*
 	 * Need a cursor.  Can't allocate until bb_level is filled in.
 	 */
 	mp = ip->i_mount;
-	cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-		whichfork);
+	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 	cur->bc_private.b.firstblock = *firstblock;
 	cur->bc_private.b.flist = flist;
 	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
@@ -3489,12 +3536,12 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the child block.
 	 */
-	ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
+	ablock = XFS_BUF_TO_BLOCK(abp);
 	ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
 	ablock->bb_level = 0;
-	ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-	ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
-	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+	ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 	for (cnt = i = 0; i < nextents; i++) {
 		ep = xfs_iext_get_ext(ifp, i);
@@ -3505,21 +3552,24 @@ xfs_bmap_extents_to_btree(
 		}
 	}
 	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
-	ablock->bb_numrecs = cpu_to_be16(cnt);
+	xfs_btree_set_numrecs(ablock, cnt);
+
 	/*
 	 * Fill in the root key and pointer.
 	 */
-	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
 	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+						be16_to_cpu(block->bb_level)));
 	*pp = cpu_to_be64(args.fsbno);
+
 	/*
 	 * Do all this logging at the end so that
 	 * the root is at the right level.
 	 */
-	xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
-	xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
 	ASSERT(*curp == NULL);
 	*curp = cur;
 	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
@@ -4176,7 +4226,7 @@ xfs_bmap_compute_maxlevels(
 		maxleafents = MAXAEXTNUM;
 		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
 	}
-	maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
+	maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
 	minleafrecs = mp->m_bmap_dmnr[0];
 	minnoderecs = mp->m_bmap_dmnr[1];
 	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -4242,9 +4292,15 @@ xfs_bmap_finish(
 	 * We have a new transaction, so we should return committed=1,
 	 * even though we're returning an error.
 	 */
-	if (error) {
+	if (error)
 		return error;
-	}
+
+	/*
+	 * transaction commit worked ok so we can drop the extra ticket
+	 * reference that we gained in xfs_trans_dup()
+	 */
+	xfs_log_ticket_put(ntp->t_ticket);
+
 	if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
 			logcount)))
 		return error;
@@ -4474,6 +4530,22 @@ xfs_bmap_one_block(
 	return rval;
 }
 
+STATIC int
+xfs_bmap_sanity_check(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	int			level)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+
+	if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC ||
+	    be16_to_cpu(block->bb_level) != level ||
+	    be16_to_cpu(block->bb_numrecs) == 0 ||
+	    be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+		return 0;
+	return 1;
+}
+
 /*
  * Read in the extents to if_extents.
  * All inode fields are set up by caller, we just traverse the btree
@@ -4486,7 +4558,7 @@ xfs_bmap_read_extents(
 	xfs_inode_t		*ip,	/* incore inode */
 	int			whichfork) /* data or attr fork */
 {
-	xfs_bmbt_block_t	*block;	/* current btree block */
+	struct xfs_btree_block	*block;	/* current btree block */
 	xfs_fsblock_t		bno;	/* block # of "block" */
 	xfs_buf_t		*bp;	/* buffer for "block" */
 	int			error;	/* error return value */
@@ -4510,7 +4582,7 @@ xfs_bmap_read_extents(
 	 */
 	level = be16_to_cpu(block->bb_level);
 	ASSERT(level > 0);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
 	ASSERT(bno != NULLDFSBNO);
 	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -4523,13 +4595,13 @@ xfs_bmap_read_extents(
 		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
 				XFS_BMAP_BTREE_REF)))
 			return error;
-		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		block = XFS_BUF_TO_BLOCK(bp);
 		XFS_WANT_CORRUPTED_GOTO(
-			XFS_BMAP_SANITY_CHECK(mp, block, level),
+			xfs_bmap_sanity_check(mp, bp, level),
 			error0);
 		if (level == 0)
 			break;
-		pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 		bno = be64_to_cpu(*pp);
 		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
 		xfs_trans_brelse(tp, bp);
@@ -4549,7 +4621,7 @@ xfs_bmap_read_extents(
 		xfs_extnum_t	start;
 
 
-		num_recs = be16_to_cpu(block->bb_numrecs);
+		num_recs = xfs_btree_get_numrecs(block);
 		if (unlikely(i + num_recs > room)) {
 			ASSERT(i + num_recs <= room);
 			xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
@@ -4561,18 +4633,18 @@ xfs_bmap_read_extents(
 			goto error0;
 		}
 		XFS_WANT_CORRUPTED_GOTO(
-			XFS_BMAP_SANITY_CHECK(mp, block, 0),
+			xfs_bmap_sanity_check(mp, bp, 0),
 			error0);
 		/*
 		 * Read-ahead the next leaf block, if any.
 		 */
-		nextbno = be64_to_cpu(block->bb_rightsib);
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 		if (nextbno != NULLFSBLOCK)
 			xfs_btree_reada_bufl(mp, nextbno, 1);
 		/*
 		 * Copy records into the extent records.
 		 */
-		frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
 		start = i;
 		for (j = 0; j < num_recs; j++, i++, frp++) {
 			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
@@ -4603,7 +4675,7 @@ xfs_bmap_read_extents(
 		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
 				XFS_BMAP_BTREE_REF)))
 			return error;
-		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		block = XFS_BUF_TO_BLOCK(bp);
 	}
 	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
 	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
@@ -5029,8 +5101,7 @@ xfs_bmapi(
 			if (abno == NULLFSBLOCK)
 				break;
 			if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-				cur = xfs_btree_init_cursor(mp,
-					tp, NULL, 0, XFS_BTNUM_BMAP,
+				cur = xfs_bmbt_init_cursor(mp, tp,
 					ip, whichfork);
 				cur->bc_private.b.firstblock =
 					*firstblock;
@@ -5147,9 +5218,8 @@ xfs_bmapi(
 			 */
 			ASSERT(mval->br_blockcount <= len);
 			if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-				cur = xfs_btree_init_cursor(mp,
-					tp, NULL, 0, XFS_BTNUM_BMAP,
-					ip, whichfork);
+				cur = xfs_bmbt_init_cursor(mp,
+					tp, ip, whichfork);
 				cur->bc_private.b.firstblock =
 					*firstblock;
 				cur->bc_private.b.flist = flist;
@@ -5440,8 +5510,7 @@ xfs_bunmapi(
 	logflags = 0;
 	if (ifp->if_flags & XFS_IFBROOT) {
 		ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-		cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-			whichfork);
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
 		cur->bc_private.b.flist = flist;
 		cur->bc_private.b.flags = 0;
@@ -5742,14 +5811,17 @@ error0:
 STATIC int
 xfs_getbmapx_fix_eof_hole(
 	xfs_inode_t		*ip,		/* xfs incore inode pointer */
-	struct getbmap		*out,		/* output structure */
+	struct getbmapx		*out,		/* output structure */
 	int			prealloced,	/* this is a file with
 						 * preallocated data space */
 	__int64_t		end,		/* last block requested */
 	xfs_fsblock_t		startblock)
 {
 	__int64_t		fixlen;
 	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_extnum_t		lastx;		/* last extent pointer */
+	xfs_fileoff_t		fileblock;
 
 	if (startblock == HOLESTARTBLOCK) {
 		mp = ip->i_mount;
@@ -5763,21 +5835,33 @@ xfs_getbmapx_fix_eof_hole(
 			out->bmv_length = fixlen;
 		}
 	} else {
-		out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+		if (startblock == DELAYSTARTBLOCK)
+			out->bmv_block = -2;
+		else
+			out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+			out->bmv_oflags |= BMV_OF_LAST;
 	}
 
 	return 1;
 }
 
 /*
- * Fcntl interface to xfs_bmapi.
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
  */
 int						/* error code */
 xfs_getbmap(
 	xfs_inode_t		*ip,
-	struct getbmap		*bmv,		/* user bmap structure */
-	void			__user *ap,	/* pointer to user's array */
-	int			interface)	/* interface flags */
+	struct getbmapx		*bmv,		/* user bmap structure */
+	xfs_bmap_format_t	formatter,	/* format to user */
+	void			*arg)		/* formatter arg */
 {
 	__int64_t		bmvend;		/* last block requested */
 	int			error;		/* return value */
@@ -5790,19 +5874,17 @@ xfs_getbmap(
 	int			nexleft;	/* # of user extents left */
 	int			subnex;		/* # of bmapi's can do */
 	int			nmap;		/* number of map entries */
-	struct getbmap		out;		/* output structure */
+	struct getbmapx		out;		/* output structure */
 	int			whichfork;	/* data or attr fork */
 	int			prealloced;	/* this is a file with
 						 * preallocated data space */
-	int			sh_unwritten;	/* true, if unwritten */
-						/* extents listed separately */
+	int			iflags;		/* interface flags */
 	int			bmapi_flags;	/* flags for xfs_bmapi */
-	__int32_t		oflags;		/* getbmapx bmv_oflags field */
 
 	mp = ip->i_mount;
+	iflags = bmv->bmv_iflags;
 
-	whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
-	sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
+	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
 	/* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
 	 * generate a DMAPI read event.  Otherwise, if the DM_EVENT_READ
@@ -5817,7 +5899,7 @@ xfs_getbmap(
 	 * could misinterpret holes in a DMAPI file as true holes,
 	 * when in fact they may represent offline user data.
 	 */
-	if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
+	if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
 	    DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
 	    whichfork == XFS_DATA_FORK) {
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
@@ -5873,8 +5955,9 @@ xfs_getbmap(
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (whichfork == XFS_DATA_FORK &&
-	    (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
+	if (((iflags & BMV_IF_DELALLOC) == 0) &&
+	    (whichfork == XFS_DATA_FORK) &&
+	    (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
 		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
 		error = xfs_flush_pages(ip, (xfs_off_t)0,
 					-1, 0, FI_REMAPF);
@@ -5884,7 +5967,8 @@ xfs_getbmap(
 		}
 	}
 
-	ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
+	ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
+	       ip->i_delayed_blks == 0);
 
 	lock = xfs_ilock_map_shared(ip);
 
@@ -5896,7 +5980,7 @@ xfs_getbmap(
5896 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; 5980 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5897 5981
5898 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | 5982 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
5899 ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE); 5983 ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
5900 5984
5901 /* 5985 /*
5902 * Allocate enough space to handle "subnex" maps at a time. 5986 * Allocate enough space to handle "subnex" maps at a time.
@@ -5906,9 +5990,12 @@ xfs_getbmap(
5906 5990
5907 bmv->bmv_entries = 0; 5991 bmv->bmv_entries = 0;
5908 5992
5909 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) { 5993 if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
5910 error = 0; 5994 if (((iflags & BMV_IF_DELALLOC) == 0) ||
5911 goto unlock_and_return; 5995 whichfork == XFS_ATTR_FORK) {
5996 error = 0;
5997 goto unlock_and_return;
5998 }
5912 } 5999 }
5913 6000
5914 nexleft = nex; 6001 nexleft = nex;
@@ -5924,52 +6011,40 @@ xfs_getbmap(
5924 ASSERT(nmap <= subnex); 6011 ASSERT(nmap <= subnex);
5925 6012
5926 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { 6013 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
5927 nexleft--; 6014 out.bmv_oflags = 0;
5928 oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ? 6015 if (map[i].br_state == XFS_EXT_UNWRITTEN)
5929 BMV_OF_PREALLOC : 0; 6016 out.bmv_oflags |= BMV_OF_PREALLOC;
6017 else if (map[i].br_startblock == DELAYSTARTBLOCK)
6018 out.bmv_oflags |= BMV_OF_DELALLOC;
5930 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); 6019 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
5931 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 6020 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
5932 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); 6021 out.bmv_unused1 = out.bmv_unused2 = 0;
6022 ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
6023 (map[i].br_startblock != DELAYSTARTBLOCK));
5933 if (map[i].br_startblock == HOLESTARTBLOCK && 6024 if (map[i].br_startblock == HOLESTARTBLOCK &&
5934 whichfork == XFS_ATTR_FORK) { 6025 whichfork == XFS_ATTR_FORK) {
5935 /* came to the end of attribute fork */ 6026 /* came to the end of attribute fork */
6027 out.bmv_oflags |= BMV_OF_LAST;
5936 goto unlock_and_return; 6028 goto unlock_and_return;
5937 } else { 6029 } else {
6030 int full = 0; /* user array is full */
6031
5938 if (!xfs_getbmapx_fix_eof_hole(ip, &out, 6032 if (!xfs_getbmapx_fix_eof_hole(ip, &out,
5939 prealloced, bmvend, 6033 prealloced, bmvend,
5940 map[i].br_startblock)) { 6034 map[i].br_startblock)) {
5941 goto unlock_and_return; 6035 goto unlock_and_return;
5942 } 6036 }
5943 6037
5944 /* return either getbmap/getbmapx structure. */ 6038 /* format results & advance arg */
5945 if (interface & BMV_IF_EXTENDED) { 6039 error = formatter(&arg, &out, &full);
5946 struct getbmapx outx; 6040 if (error || full)
5947 6041 goto unlock_and_return;
5948 GETBMAP_CONVERT(out,outx); 6042 nexleft--;
5949 outx.bmv_oflags = oflags;
5950 outx.bmv_unused1 = outx.bmv_unused2 = 0;
5951 if (copy_to_user(ap, &outx,
5952 sizeof(outx))) {
5953 error = XFS_ERROR(EFAULT);
5954 goto unlock_and_return;
5955 }
5956 } else {
5957 if (copy_to_user(ap, &out,
5958 sizeof(out))) {
5959 error = XFS_ERROR(EFAULT);
5960 goto unlock_and_return;
5961 }
5962 }
5963 bmv->bmv_offset = 6043 bmv->bmv_offset =
5964 out.bmv_offset + out.bmv_length; 6044 out.bmv_offset + out.bmv_length;
5965 bmv->bmv_length = MAX((__int64_t)0, 6045 bmv->bmv_length = MAX((__int64_t)0,
5966 (__int64_t)(bmvend - bmv->bmv_offset)); 6046 (__int64_t)(bmvend - bmv->bmv_offset));
5967 bmv->bmv_entries++; 6047 bmv->bmv_entries++;
5968 ap = (interface & BMV_IF_EXTENDED) ?
5969 (void __user *)
5970 ((struct getbmapx __user *)ap + 1) :
5971 (void __user *)
5972 ((struct getbmap __user *)ap + 1);
5973 } 6048 }
5974 } 6049 }
5975 } while (nmap && nexleft && bmv->bmv_length); 6050 } while (nmap && nexleft && bmv->bmv_length);
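/*
 * The rewritten loop above no longer branches on BMV_IF_EXTENDED and
 * calls copy_to_user() itself: each mapping is tagged with output
 * flags (BMV_OF_PREALLOC for unwritten extents, BMV_OF_DELALLOC for
 * delayed allocations) and handed to the caller-supplied formatter,
 * which owns the copy-out and reports through "full" when the
 * destination array is exhausted.  What follows is a minimal sketch
 * of such a formatter for old-style getbmap callers; the helper name
 * is invented for illustration and the usual kernel/XFS context
 * (uaccess.h, xfs_fs.h, XFS_ERROR()) is assumed, so treat it as a
 * sketch rather than a quote of the real ioctl code.
 */
STATIC int
xfs_getbmap_format_sketch(
	void			**ap,	/* cursor into the user array */
	struct getbmapx		*bmv,	/* one formatted mapping */
	int			*full)	/* a formatter with a bounded
					 * buffer would set this; unused */
{
	struct getbmap __user	*base = *ap;

	/*
	 * Old-style callers only want the getbmap prefix; getbmapx
	 * begins with the same fields, so copy just that much.
	 */
	if (copy_to_user(base, bmv, sizeof(struct getbmap)))
		return XFS_ERROR(EFAULT);

	/* advance the opaque cursor one entry for the next mapping */
	*ap = base + 1;
	return 0;
}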
@@ -6131,7 +6206,7 @@ xfs_bmap_get_bp(
6131 6206
6132void 6207void
6133xfs_check_block( 6208xfs_check_block(
6134 xfs_bmbt_block_t *block, 6209 struct xfs_btree_block *block,
6135 xfs_mount_t *mp, 6210 xfs_mount_t *mp,
6136 int root, 6211 int root,
6137 short sz) 6212 short sz)
@@ -6143,36 +6218,29 @@ xfs_check_block(
6143 ASSERT(be16_to_cpu(block->bb_level) > 0); 6218 ASSERT(be16_to_cpu(block->bb_level) > 0);
6144 6219
6145 prevp = NULL; 6220 prevp = NULL;
6146 for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) { 6221 for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
6147 dmxr = mp->m_bmap_dmxr[0]; 6222 dmxr = mp->m_bmap_dmxr[0];
6148 6223 keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
6149 if (root) {
6150 keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
6151 } else {
6152 keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i);
6153 }
6154 6224
6155 if (prevp) { 6225 if (prevp) {
6156 xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp); 6226 ASSERT(be64_to_cpu(prevp->br_startoff) <
6227 be64_to_cpu(keyp->br_startoff));
6157 } 6228 }
6158 prevp = keyp; 6229 prevp = keyp;
6159 6230
6160 /* 6231 /*
6161 * Compare the block numbers to see if there are dups. 6232 * Compare the block numbers to see if there are dups.
6162 */ 6233 */
6234 if (root)
6235 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
6236 else
6237 pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
6163 6238
6164 if (root) {
6165 pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
6166 } else {
6167 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr);
6168 }
6169 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { 6239 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
6170 if (root) { 6240 if (root)
6171 thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); 6241 thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
6172 } else { 6242 else
6173 thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j, 6243 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
6174 dmxr);
6175 }
6176 if (*thispa == *pp) { 6244 if (*thispa == *pp) {
6177 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 6245 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
6178 __func__, j, i, 6246 __func__, j, i,
@@ -6195,7 +6263,7 @@ xfs_bmap_check_leaf_extents(
6195 xfs_inode_t *ip, /* incore inode pointer */ 6263 xfs_inode_t *ip, /* incore inode pointer */
6196 int whichfork) /* data or attr fork */ 6264 int whichfork) /* data or attr fork */
6197{ 6265{
6198 xfs_bmbt_block_t *block; /* current btree block */ 6266 struct xfs_btree_block *block; /* current btree block */
6199 xfs_fsblock_t bno; /* block # of "block" */ 6267 xfs_fsblock_t bno; /* block # of "block" */
6200 xfs_buf_t *bp; /* buffer for "block" */ 6268 xfs_buf_t *bp; /* buffer for "block" */
6201 int error; /* error return value */ 6269 int error; /* error return value */
@@ -6223,7 +6291,7 @@ xfs_bmap_check_leaf_extents(
6223 level = be16_to_cpu(block->bb_level); 6291 level = be16_to_cpu(block->bb_level);
6224 ASSERT(level > 0); 6292 ASSERT(level > 0);
6225 xfs_check_block(block, mp, 1, ifp->if_broot_bytes); 6293 xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
6226 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6294 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
6227 bno = be64_to_cpu(*pp); 6295 bno = be64_to_cpu(*pp);
6228 6296
6229 ASSERT(bno != NULLDFSBNO); 6297 ASSERT(bno != NULLDFSBNO);
@@ -6245,9 +6313,9 @@ xfs_bmap_check_leaf_extents(
6245 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, 6313 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6246 XFS_BMAP_BTREE_REF))) 6314 XFS_BMAP_BTREE_REF)))
6247 goto error_norelse; 6315 goto error_norelse;
6248 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6316 block = XFS_BUF_TO_BLOCK(bp);
6249 XFS_WANT_CORRUPTED_GOTO( 6317 XFS_WANT_CORRUPTED_GOTO(
6250 XFS_BMAP_SANITY_CHECK(mp, block, level), 6318 xfs_bmap_sanity_check(mp, bp, level),
6251 error0); 6319 error0);
6252 if (level == 0) 6320 if (level == 0)
6253 break; 6321 break;
@@ -6258,7 +6326,7 @@ xfs_bmap_check_leaf_extents(
6258 */ 6326 */
6259 6327
6260 xfs_check_block(block, mp, 0, 0); 6328 xfs_check_block(block, mp, 0, 0);
6261 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6329 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
6262 bno = be64_to_cpu(*pp); 6330 bno = be64_to_cpu(*pp);
6263 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 6331 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
6264 if (bp_release) { 6332 if (bp_release) {
@@ -6280,13 +6348,13 @@ xfs_bmap_check_leaf_extents(
6280 xfs_extnum_t num_recs; 6348 xfs_extnum_t num_recs;
6281 6349
6282 6350
6283 num_recs = be16_to_cpu(block->bb_numrecs); 6351 num_recs = xfs_btree_get_numrecs(block);
6284 6352
6285 /* 6353 /*
6286 * Read-ahead the next leaf block, if any. 6354 * Read-ahead the next leaf block, if any.
6287 */ 6355 */
6288 6356
6289 nextbno = be64_to_cpu(block->bb_rightsib); 6357 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6290 6358
6291 /* 6359 /*
6292 * Check all the extents to make sure they are OK. 6360 * Check all the extents to make sure they are OK.
@@ -6294,13 +6362,17 @@ xfs_bmap_check_leaf_extents(
6294 * conform with the first entry in this one. 6362 * conform with the first entry in this one.
6295 */ 6363 */
6296 6364
6297 ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); 6365 ep = XFS_BMBT_REC_ADDR(mp, block, 1);
6298 if (i) { 6366 if (i) {
6299 xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep); 6367 ASSERT(xfs_bmbt_disk_get_startoff(&last) +
6368 xfs_bmbt_disk_get_blockcount(&last) <=
6369 xfs_bmbt_disk_get_startoff(ep));
6300 } 6370 }
6301 for (j = 1; j < num_recs; j++) { 6371 for (j = 1; j < num_recs; j++) {
6302 nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); 6372 nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
6303 xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp); 6373 ASSERT(xfs_bmbt_disk_get_startoff(ep) +
6374 xfs_bmbt_disk_get_blockcount(ep) <=
6375 xfs_bmbt_disk_get_startoff(nextp));
6304 ep = nextp; 6376 ep = nextp;
6305 } 6377 }
6306 6378
@@ -6326,7 +6398,7 @@ xfs_bmap_check_leaf_extents(
6326 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, 6398 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6327 XFS_BMAP_BTREE_REF))) 6399 XFS_BMAP_BTREE_REF)))
6328 goto error_norelse; 6400 goto error_norelse;
6329 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6401 block = XFS_BUF_TO_BLOCK(bp);
6330 } 6402 }
6331 if (bp_release) { 6403 if (bp_release) {
6332 bp_release = 0; 6404 bp_release = 0;
@@ -6356,7 +6428,7 @@ xfs_bmap_count_blocks(
6356 int whichfork, /* data or attr fork */ 6428 int whichfork, /* data or attr fork */
6357 int *count) /* out: count of blocks */ 6429 int *count) /* out: count of blocks */
6358{ 6430{
6359 xfs_bmbt_block_t *block; /* current btree block */ 6431 struct xfs_btree_block *block; /* current btree block */
6360 xfs_fsblock_t bno; /* block # of "block" */ 6432 xfs_fsblock_t bno; /* block # of "block" */
6361 xfs_ifork_t *ifp; /* fork structure */ 6433 xfs_ifork_t *ifp; /* fork structure */
6362 int level; /* btree level, for checking */ 6434 int level; /* btree level, for checking */
@@ -6379,7 +6451,7 @@ xfs_bmap_count_blocks(
6379 block = ifp->if_broot; 6451 block = ifp->if_broot;
6380 level = be16_to_cpu(block->bb_level); 6452 level = be16_to_cpu(block->bb_level);
6381 ASSERT(level > 0); 6453 ASSERT(level > 0);
6382 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6454 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
6383 bno = be64_to_cpu(*pp); 6455 bno = be64_to_cpu(*pp);
6384 ASSERT(bno != NULLDFSBNO); 6456 ASSERT(bno != NULLDFSBNO);
6385 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 6457 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -6413,29 +6485,29 @@ xfs_bmap_count_tree(
6413 __be64 *pp; 6485 __be64 *pp;
6414 xfs_fsblock_t bno = blockno; 6486 xfs_fsblock_t bno = blockno;
6415 xfs_fsblock_t nextbno; 6487 xfs_fsblock_t nextbno;
6416 xfs_bmbt_block_t *block, *nextblock; 6488 struct xfs_btree_block *block, *nextblock;
6417 int numrecs; 6489 int numrecs;
6418 6490
6419 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6491 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
6420 return error; 6492 return error;
6421 *count += 1; 6493 *count += 1;
6422 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6494 block = XFS_BUF_TO_BLOCK(bp);
6423 6495
6424 if (--level) { 6496 if (--level) {
6425 /* Not at node above leaves, count this level of nodes */ 6497 /* Not at node above leaves, count this level of nodes */
6426 nextbno = be64_to_cpu(block->bb_rightsib); 6498 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6427 while (nextbno != NULLFSBLOCK) { 6499 while (nextbno != NULLFSBLOCK) {
6428 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6500 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
6429 0, &nbp, XFS_BMAP_BTREE_REF))) 6501 0, &nbp, XFS_BMAP_BTREE_REF)))
6430 return error; 6502 return error;
6431 *count += 1; 6503 *count += 1;
6432 nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp); 6504 nextblock = XFS_BUF_TO_BLOCK(nbp);
6433 nextbno = be64_to_cpu(nextblock->bb_rightsib); 6505 nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
6434 xfs_trans_brelse(tp, nbp); 6506 xfs_trans_brelse(tp, nbp);
6435 } 6507 }
6436 6508
6437 /* Dive to the next level */ 6509 /* Dive to the next level */
6438 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6510 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
6439 bno = be64_to_cpu(*pp); 6511 bno = be64_to_cpu(*pp);
6440 if (unlikely((error = 6512 if (unlikely((error =
6441 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { 6513 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
@@ -6448,9 +6520,9 @@ xfs_bmap_count_tree(
6448 } else { 6520 } else {
6449 /* count all level 1 nodes and their leaves */ 6521 /* count all level 1 nodes and their leaves */
6450 for (;;) { 6522 for (;;) {
6451 nextbno = be64_to_cpu(block->bb_rightsib); 6523 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6452 numrecs = be16_to_cpu(block->bb_numrecs); 6524 numrecs = be16_to_cpu(block->bb_numrecs);
6453 xfs_bmap_disk_count_leaves(0, block, numrecs, count); 6525 xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
6454 xfs_trans_brelse(tp, bp); 6526 xfs_trans_brelse(tp, bp);
6455 if (nextbno == NULLFSBLOCK) 6527 if (nextbno == NULLFSBLOCK)
6456 break; 6528 break;
@@ -6459,7 +6531,7 @@ xfs_bmap_count_tree(
6459 XFS_BMAP_BTREE_REF))) 6531 XFS_BMAP_BTREE_REF)))
6460 return error; 6532 return error;
6461 *count += 1; 6533 *count += 1;
6462 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6534 block = XFS_BUF_TO_BLOCK(bp);
6463 } 6535 }
6464 } 6536 }
6465 return 0; 6537 return 0;
@@ -6489,8 +6561,8 @@ xfs_bmap_count_leaves(
6489 */ 6561 */
6490STATIC void 6562STATIC void
6491xfs_bmap_disk_count_leaves( 6563xfs_bmap_disk_count_leaves(
6492 xfs_extnum_t idx, 6564 struct xfs_mount *mp,
6493 xfs_bmbt_block_t *block, 6565 struct xfs_btree_block *block,
6494 int numrecs, 6566 int numrecs,
6495 int *count) 6567 int *count)
6496{ 6568{
@@ -6498,7 +6570,7 @@ xfs_bmap_disk_count_leaves(
6498 xfs_bmbt_rec_t *frp; 6570 xfs_bmbt_rec_t *frp;
6499 6571
6500 for (b = 1; b <= numrecs; b++) { 6572 for (b = 1; b <= numrecs; b++) {
6501 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); 6573 frp = XFS_BMBT_REC_ADDR(mp, block, b);
6502 *count += xfs_bmbt_disk_get_blockcount(frp); 6574 *count += xfs_bmbt_disk_get_blockcount(frp);
6503 } 6575 }
6504} 6576}
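/*
 * The mechanical renames running through the hunks above, from
 * xfs_bmbt_block_t to struct xfs_btree_block, XFS_BUF_TO_BMBT_BLOCK()
 * to XFS_BUF_TO_BLOCK(), and bb_rightsib to bb_u.l.bb_rightsib, all
 * fall out of the generic btree rework: every XFS btree now shares a
 * single block header whose sibling pointers live in a union, short
 * (32-bit, per-AG btrees) or long (64-bit, bmap btree) form.  The
 * simplified sketch below conveys the shape; the authoritative
 * definition lives in fs/xfs/xfs_btree.h.
 */
struct xfs_btree_block {
	__be32		bb_magic;	/* magic number for block type */
	__be16		bb_level;	/* 0 is a leaf */
	__be16		bb_numrecs;	/* current # of data records */
	union {
		struct {		/* short form, per-AG btrees */
			__be32	bb_leftsib;
			__be32	bb_rightsib;
		} s;
		struct {		/* long form, bmap btree */
			__be64	bb_leftsib;
			__be64	bb_rightsib;
		} l;
	} bb_u;
};
/*
 * That union is why the walkers above now read
 * be64_to_cpu(block->bb_u.l.bb_rightsib) instead of a bmap-specific
 * sibling field.
 */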
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 9f3e3a836d15..284571c05ed0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -137,9 +137,7 @@ typedef struct xfs_bmalloca {
137 char conv; /* overwriting unwritten extents */ 137 char conv; /* overwriting unwritten extents */
138} xfs_bmalloca_t; 138} xfs_bmalloca_t;
139 139
140#ifdef __KERNEL__ 140#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
141
142#if defined(XFS_BMAP_TRACE)
143/* 141/*
144 * Trace operations for bmap extent tracing 142 * Trace operations for bmap extent tracing
145 */ 143 */
@@ -163,9 +161,12 @@ xfs_bmap_trace_exlist(
163 int whichfork); /* data or attr fork */ 161 int whichfork); /* data or attr fork */
164#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 162#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
165 xfs_bmap_trace_exlist(__func__,ip,c,w) 163 xfs_bmap_trace_exlist(__func__,ip,c,w)
166#else 164
165#else /* __KERNEL__ && XFS_BMAP_TRACE */
166
167#define XFS_BMAP_TRACE_EXLIST(ip,c,w) 167#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
168#endif 168
169#endif /* __KERNEL__ && XFS_BMAP_TRACE */
169 170
170/* 171/*
171 * Convert inode from non-attributed to attributed. 172 * Convert inode from non-attributed to attributed.
@@ -206,20 +207,6 @@ xfs_bmap_compute_maxlevels(
206 int whichfork); /* data or attr fork */ 207 int whichfork); /* data or attr fork */
207 208
208/* 209/*
209 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
210 * caller. Frees all the extents that need freeing, which must be done
211 * last due to locking considerations.
212 *
213 * Return 1 if the given transaction was committed and a new one allocated,
214 * and 0 otherwise.
215 */
216int /* error */
217xfs_bmap_finish(
218 struct xfs_trans **tp, /* transaction pointer addr */
219 xfs_bmap_free_t *flist, /* i/o: list extents to free */
220 int *committed); /* xact committed or not */
221
222/*
223 * Returns the file-relative block number of the first unused block in the file. 210 * Returns the file-relative block number of the first unused block in the file.
224 * This is the lowest-address hole if the file has holes, else the first block 211 * This is the lowest-address hole if the file has holes, else the first block
225 * past the end of file. 212 * past the end of file.
@@ -344,14 +331,43 @@ xfs_bunmapi(
344 int *done); /* set if not done yet */ 331 int *done); /* set if not done yet */
345 332
346/* 333/*
347 * Fcntl interface to xfs_bmapi. 334 * Check an extent list, which has just been read, for
335 * any bit in the extent flag field.
336 */
337int
338xfs_check_nostate_extents(
339 struct xfs_ifork *ifp,
340 xfs_extnum_t idx,
341 xfs_extnum_t num);
342
343#ifdef __KERNEL__
344
345/*
346 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
347 * caller. Frees all the extents that need freeing, which must be done
348 * last due to locking considerations.
349 *
350 * Return 1 if the given transaction was committed and a new one allocated,
351 * and 0 otherwise.
352 */
353int /* error */
354xfs_bmap_finish(
355 struct xfs_trans **tp, /* transaction pointer addr */
356 xfs_bmap_free_t *flist, /* i/o: list extents to free */
357 int *committed); /* xact committed or not */
358
359/* bmap to userspace formatter - copy to user & advance pointer */
360typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
361
362/*
363 * Get inode's extents as described in bmv, and format for output.
348 */ 364 */
349int /* error code */ 365int /* error code */
350xfs_getbmap( 366xfs_getbmap(
351 xfs_inode_t *ip, 367 xfs_inode_t *ip,
352 struct getbmap *bmv, /* user bmap structure */ 368 struct getbmapx *bmv, /* user bmap structure */
353 void __user *ap, /* pointer to user's array */ 369 xfs_bmap_format_t formatter, /* format to user */
354 int iflags); /* interface flags */ 370 void *arg); /* formatter arg */
355 371
356/* 372/*
357 * Check if the endoff is outside the last extent. If so the caller will grow 373 * Check if the endoff is outside the last extent. If so the caller will grow
@@ -375,16 +391,6 @@ xfs_bmap_count_blocks(
375 int *count); 391 int *count);
376 392
377/* 393/*
378 * Check an extent list, which has just been read, for
379 * any bit in the extent flag field.
380 */
381int
382xfs_check_nostate_extents(
383 struct xfs_ifork *ifp,
384 xfs_extnum_t idx,
385 xfs_extnum_t num);
386
387/*
388 * Search the extent records for the entry containing block bno. 394 * Search the extent records for the entry containing block bno.
389 * If bno lies in a hole, point to the next entry. If bno lies 395 * If bno lies in a hole, point to the next entry. If bno lies
390 * past eof, *eofp will be set, and *prevp will contain the last 396 * past eof, *eofp will be set, and *prevp will contain the last
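/*
 * With xfs_bmap_finish(), the formatter typedef and xfs_getbmap()
 * now fenced behind #ifdef __KERNEL__, userspace consumers of this
 * header (xfsprogs shares it) see only declarations that make sense
 * outside the kernel, such as xfs_check_nostate_extents().  Below is
 * a sketch of an ioctl-path caller under the new xfs_getbmap()
 * signature; the demo name is invented, locking, copy-in of the user
 * header and error handling are elided, and the formatter is the
 * illustrative one sketched after the xfs_bmap.c hunks above.
 */
STATIC int
xfs_getbmap_demo(
	xfs_inode_t		*ip,
	struct getbmapx		*bmv,	/* header already copied in */
	struct getbmap __user	*ua)	/* start of the user's array */
{
	/*
	 * xfs_getbmap() keeps the cursor in its "arg" parameter and
	 * passes &arg to the formatter for every mapping it emits.
	 */
	return xfs_getbmap(ip, bmv, xfs_getbmap_format_sketch,
			   (__force void *)ua);
}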
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 23efad29a5cd..8f1ec73725d3 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -37,1406 +37,13 @@
37#include "xfs_inode_item.h" 37#include "xfs_inode_item.h"
38#include "xfs_alloc.h" 38#include "xfs_alloc.h"
39#include "xfs_btree.h" 39#include "xfs_btree.h"
40#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h" 41#include "xfs_ialloc.h"
41#include "xfs_itable.h" 42#include "xfs_itable.h"
42#include "xfs_bmap.h" 43#include "xfs_bmap.h"
43#include "xfs_error.h" 44#include "xfs_error.h"
44#include "xfs_quota.h" 45#include "xfs_quota.h"
45 46
46#if defined(XFS_BMBT_TRACE)
47ktrace_t *xfs_bmbt_trace_buf;
48#endif
49
50/*
51 * Prototypes for internal btree functions.
52 */
53
54
55STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
56STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
57STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
58STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
59STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
60STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
61 __uint64_t *, xfs_btree_cur_t **, int *);
62STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
63
64
65#if defined(XFS_BMBT_TRACE)
66
67static char ARGS[] = "args";
68static char ENTRY[] = "entry";
69static char ERROR[] = "error";
70#undef EXIT
71static char EXIT[] = "exit";
72
73/*
74 * Add a trace buffer entry for the arguments given to the routine,
75 * generic form.
76 */
77STATIC void
78xfs_bmbt_trace_enter(
79 const char *func,
80 xfs_btree_cur_t *cur,
81 char *s,
82 int type,
83 int line,
84 __psunsigned_t a0,
85 __psunsigned_t a1,
86 __psunsigned_t a2,
87 __psunsigned_t a3,
88 __psunsigned_t a4,
89 __psunsigned_t a5,
90 __psunsigned_t a6,
91 __psunsigned_t a7,
92 __psunsigned_t a8,
93 __psunsigned_t a9,
94 __psunsigned_t a10)
95{
96 xfs_inode_t *ip;
97 int whichfork;
98
99 ip = cur->bc_private.b.ip;
100 whichfork = cur->bc_private.b.whichfork;
101 ktrace_enter(xfs_bmbt_trace_buf,
102 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
103 (void *)func, (void *)s, (void *)ip, (void *)cur,
104 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
105 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
106 (void *)a8, (void *)a9, (void *)a10);
107 ASSERT(ip->i_btrace);
108 ktrace_enter(ip->i_btrace,
109 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
110 (void *)func, (void *)s, (void *)ip, (void *)cur,
111 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
112 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
113 (void *)a8, (void *)a9, (void *)a10);
114}
115/*
116 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
117 */
118STATIC void
119xfs_bmbt_trace_argbi(
120 const char *func,
121 xfs_btree_cur_t *cur,
122 xfs_buf_t *b,
123 int i,
124 int line)
125{
126 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
127 (__psunsigned_t)b, i, 0, 0,
128 0, 0, 0, 0,
129 0, 0, 0);
130}
131
132/*
133 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
134 */
135STATIC void
136xfs_bmbt_trace_argbii(
137 const char *func,
138 xfs_btree_cur_t *cur,
139 xfs_buf_t *b,
140 int i0,
141 int i1,
142 int line)
143{
144 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
145 (__psunsigned_t)b, i0, i1, 0,
146 0, 0, 0, 0,
147 0, 0, 0);
148}
149
150/*
151 * Add a trace buffer entry for arguments, for 3 block-length args
152 * and an integer arg.
153 */
154STATIC void
155xfs_bmbt_trace_argfffi(
156 const char *func,
157 xfs_btree_cur_t *cur,
158 xfs_dfiloff_t o,
159 xfs_dfsbno_t b,
160 xfs_dfilblks_t i,
161 int j,
162 int line)
163{
164 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
165 o >> 32, (int)o, b >> 32, (int)b,
166 i >> 32, (int)i, (int)j, 0,
167 0, 0, 0);
168}
169
170/*
171 * Add a trace buffer entry for arguments, for one integer arg.
172 */
173STATIC void
174xfs_bmbt_trace_argi(
175 const char *func,
176 xfs_btree_cur_t *cur,
177 int i,
178 int line)
179{
180 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
181 i, 0, 0, 0,
182 0, 0, 0, 0,
183 0, 0, 0);
184}
185
186/*
187 * Add a trace buffer entry for arguments, for int, fsblock, key.
188 */
189STATIC void
190xfs_bmbt_trace_argifk(
191 const char *func,
192 xfs_btree_cur_t *cur,
193 int i,
194 xfs_fsblock_t f,
195 xfs_dfiloff_t o,
196 int line)
197{
198 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
199 i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
200 (int)o, 0, 0, 0,
201 0, 0, 0);
202}
203
204/*
205 * Add a trace buffer entry for arguments, for int, fsblock, rec.
206 */
207STATIC void
208xfs_bmbt_trace_argifr(
209 const char *func,
210 xfs_btree_cur_t *cur,
211 int i,
212 xfs_fsblock_t f,
213 xfs_bmbt_rec_t *r,
214 int line)
215{
216 xfs_dfsbno_t b;
217 xfs_dfilblks_t c;
218 xfs_dfsbno_t d;
219 xfs_dfiloff_t o;
220 xfs_bmbt_irec_t s;
221
222 d = (xfs_dfsbno_t)f;
223 xfs_bmbt_disk_get_all(r, &s);
224 o = (xfs_dfiloff_t)s.br_startoff;
225 b = (xfs_dfsbno_t)s.br_startblock;
226 c = s.br_blockcount;
227 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
228 i, d >> 32, (int)d, o >> 32,
229 (int)o, b >> 32, (int)b, c >> 32,
230 (int)c, 0, 0);
231}
232
233/*
234 * Add a trace buffer entry for arguments, for int, key.
235 */
236STATIC void
237xfs_bmbt_trace_argik(
238 const char *func,
239 xfs_btree_cur_t *cur,
240 int i,
241 xfs_bmbt_key_t *k,
242 int line)
243{
244 xfs_dfiloff_t o;
245
246 o = be64_to_cpu(k->br_startoff);
247 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
248 i, o >> 32, (int)o, 0,
249 0, 0, 0, 0,
250 0, 0, 0);
251}
252
253/*
254 * Add a trace buffer entry for the cursor/operation.
255 */
256STATIC void
257xfs_bmbt_trace_cursor(
258 const char *func,
259 xfs_btree_cur_t *cur,
260 char *s,
261 int line)
262{
263 xfs_bmbt_rec_host_t r;
264
265 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
266 xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
267 (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
268 cur->bc_private.b.allocated,
269 r.l0 >> 32, (int)r.l0,
270 r.l1 >> 32, (int)r.l1,
271 (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
272 (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
273 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
274 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
275}
276
277#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
278 xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
279#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
280 xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
281#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
282 xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
283#define XFS_BMBT_TRACE_ARGI(c,i) \
284 xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
285#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
286 xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
287#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
288 xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
289#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
290 xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
291#define XFS_BMBT_TRACE_CURSOR(c,s) \
292 xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
293#else
294#define XFS_BMBT_TRACE_ARGBI(c,b,i)
295#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
296#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
297#define XFS_BMBT_TRACE_ARGI(c,i)
298#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
299#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
300#define XFS_BMBT_TRACE_ARGIK(c,i,k)
301#define XFS_BMBT_TRACE_CURSOR(c,s)
302#endif /* XFS_BMBT_TRACE */
303
304
305/*
306 * Internal functions.
307 */
308
309/*
310 * Delete record pointed to by cur/level.
311 */
312STATIC int /* error */
313xfs_bmbt_delrec(
314 xfs_btree_cur_t *cur,
315 int level,
316 int *stat) /* success/failure */
317{
318 xfs_bmbt_block_t *block; /* bmap btree block */
319 xfs_fsblock_t bno; /* fs-relative block number */
320 xfs_buf_t *bp; /* buffer for block */
321 int error; /* error return value */
322 int i; /* loop counter */
323 int j; /* temp state */
324 xfs_bmbt_key_t key; /* bmap btree key */
325 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
326 xfs_fsblock_t lbno; /* left sibling block number */
327 xfs_buf_t *lbp; /* left buffer pointer */
328 xfs_bmbt_block_t *left; /* left btree block */
329 xfs_bmbt_key_t *lkp; /* left btree key */
330 xfs_bmbt_ptr_t *lpp; /* left address pointer */
331 int lrecs=0; /* left record count */
332 xfs_bmbt_rec_t *lrp; /* left record pointer */
333 xfs_mount_t *mp; /* file system mount point */
334 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
335 int ptr; /* key/record index */
336 xfs_fsblock_t rbno; /* right sibling block number */
337 xfs_buf_t *rbp; /* right buffer pointer */
338 xfs_bmbt_block_t *right; /* right btree block */
339 xfs_bmbt_key_t *rkp; /* right btree key */
340 xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */
341 xfs_bmbt_ptr_t *rpp; /* right address pointer */
342 xfs_bmbt_block_t *rrblock; /* right-right btree block */
343 xfs_buf_t *rrbp; /* right-right buffer pointer */
344 int rrecs=0; /* right record count */
345 xfs_bmbt_rec_t *rrp; /* right record pointer */
346 xfs_btree_cur_t *tcur; /* temporary btree cursor */
347 int numrecs; /* temporary numrec count */
348 int numlrecs, numrrecs;
349
350 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
351 XFS_BMBT_TRACE_ARGI(cur, level);
352 ptr = cur->bc_ptrs[level];
353 tcur = NULL;
354 if (ptr == 0) {
355 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
356 *stat = 0;
357 return 0;
358 }
359 block = xfs_bmbt_get_block(cur, level, &bp);
360 numrecs = be16_to_cpu(block->bb_numrecs);
361#ifdef DEBUG
362 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
363 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
364 goto error0;
365 }
366#endif
367 if (ptr > numrecs) {
368 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
369 *stat = 0;
370 return 0;
371 }
372 XFS_STATS_INC(xs_bmbt_delrec);
373 if (level > 0) {
374 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
375 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
376#ifdef DEBUG
377 for (i = ptr; i < numrecs; i++) {
378 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
379 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
380 goto error0;
381 }
382 }
383#endif
384 if (ptr < numrecs) {
385 memmove(&kp[ptr - 1], &kp[ptr],
386 (numrecs - ptr) * sizeof(*kp));
387 memmove(&pp[ptr - 1], &pp[ptr],
388 (numrecs - ptr) * sizeof(*pp));
389 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
390 xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
391 }
392 } else {
393 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
394 if (ptr < numrecs) {
395 memmove(&rp[ptr - 1], &rp[ptr],
396 (numrecs - ptr) * sizeof(*rp));
397 xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
398 }
399 if (ptr == 1) {
400 key.br_startoff =
401 cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
402 kp = &key;
403 }
404 }
405 numrecs--;
406 block->bb_numrecs = cpu_to_be16(numrecs);
407 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
408 /*
409 * We're at the root level.
410 * First, shrink the root block in-memory.
411 * Try to get rid of the next level down.
412 * If we can't then there's nothing left to do.
413 */
414 if (level == cur->bc_nlevels - 1) {
415 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
416 cur->bc_private.b.whichfork);
417 if ((error = xfs_bmbt_killroot(cur))) {
418 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
419 goto error0;
420 }
421 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
422 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
423 goto error0;
424 }
425 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
426 *stat = 1;
427 return 0;
428 }
429 if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
430 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
431 goto error0;
432 }
433 if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
434 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
435 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
436 goto error0;
437 }
438 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
439 *stat = 1;
440 return 0;
441 }
442 rbno = be64_to_cpu(block->bb_rightsib);
443 lbno = be64_to_cpu(block->bb_leftsib);
444 /*
445 * One child of root, need to get a chance to copy its contents
446 * into the root and delete it. Can't go up to next level,
447 * there's nothing to delete there.
448 */
449 if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
450 level == cur->bc_nlevels - 2) {
451 if ((error = xfs_bmbt_killroot(cur))) {
452 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
453 goto error0;
454 }
455 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
456 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
457 goto error0;
458 }
459 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
460 *stat = 1;
461 return 0;
462 }
463 ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
464 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
465 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
466 goto error0;
467 }
468 bno = NULLFSBLOCK;
469 if (rbno != NULLFSBLOCK) {
470 i = xfs_btree_lastrec(tcur, level);
471 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
472 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
473 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
474 goto error0;
475 }
476 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
477 i = xfs_btree_lastrec(tcur, level);
478 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
479 rbp = tcur->bc_bufs[level];
480 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
481#ifdef DEBUG
482 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
483 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
484 goto error0;
485 }
486#endif
487 bno = be64_to_cpu(right->bb_leftsib);
488 if (be16_to_cpu(right->bb_numrecs) - 1 >=
489 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
490 if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
491 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
492 goto error0;
493 }
494 if (i) {
495 ASSERT(be16_to_cpu(block->bb_numrecs) >=
496 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
497 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
498 tcur = NULL;
499 if (level > 0) {
500 if ((error = xfs_bmbt_decrement(cur,
501 level, &i))) {
502 XFS_BMBT_TRACE_CURSOR(cur,
503 ERROR);
504 goto error0;
505 }
506 }
507 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
508 *stat = 1;
509 return 0;
510 }
511 }
512 rrecs = be16_to_cpu(right->bb_numrecs);
513 if (lbno != NULLFSBLOCK) {
514 i = xfs_btree_firstrec(tcur, level);
515 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
516 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
517 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
518 goto error0;
519 }
520 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
521 }
522 }
523 if (lbno != NULLFSBLOCK) {
524 i = xfs_btree_firstrec(tcur, level);
525 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
526 /*
527 * decrement to last in block
528 */
529 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
530 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
531 goto error0;
532 }
533 i = xfs_btree_firstrec(tcur, level);
534 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
535 lbp = tcur->bc_bufs[level];
536 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
537#ifdef DEBUG
538 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
539 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
540 goto error0;
541 }
542#endif
543 bno = be64_to_cpu(left->bb_rightsib);
544 if (be16_to_cpu(left->bb_numrecs) - 1 >=
545 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
546 if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
547 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
548 goto error0;
549 }
550 if (i) {
551 ASSERT(be16_to_cpu(block->bb_numrecs) >=
552 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
553 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
554 tcur = NULL;
555 if (level == 0)
556 cur->bc_ptrs[0]++;
557 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
558 *stat = 1;
559 return 0;
560 }
561 }
562 lrecs = be16_to_cpu(left->bb_numrecs);
563 }
564 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
565 tcur = NULL;
566 mp = cur->bc_mp;
567 ASSERT(bno != NULLFSBLOCK);
568 if (lbno != NULLFSBLOCK &&
569 lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
570 rbno = bno;
571 right = block;
572 rbp = bp;
573 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
574 XFS_BMAP_BTREE_REF))) {
575 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
576 goto error0;
577 }
578 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
579 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
580 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
581 goto error0;
582 }
583 } else if (rbno != NULLFSBLOCK &&
584 rrecs + be16_to_cpu(block->bb_numrecs) <=
585 XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
586 lbno = bno;
587 left = block;
588 lbp = bp;
589 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
590 XFS_BMAP_BTREE_REF))) {
591 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
592 goto error0;
593 }
594 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
595 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
596 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
597 goto error0;
598 }
599 lrecs = be16_to_cpu(left->bb_numrecs);
600 } else {
601 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
602 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
603 goto error0;
604 }
605 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
606 *stat = 1;
607 return 0;
608 }
609 numlrecs = be16_to_cpu(left->bb_numrecs);
610 numrrecs = be16_to_cpu(right->bb_numrecs);
611 if (level > 0) {
612 lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
613 lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
614 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
615 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
616#ifdef DEBUG
617 for (i = 0; i < numrrecs; i++) {
618 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
619 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
620 goto error0;
621 }
622 }
623#endif
624 memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
625 memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
626 xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
627 xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
628 } else {
629 lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
630 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
631 memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
632 xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
633 }
634 be16_add_cpu(&left->bb_numrecs, numrrecs);
635 left->bb_rightsib = right->bb_rightsib;
636 xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
637 if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
638 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
639 be64_to_cpu(left->bb_rightsib),
640 0, &rrbp, XFS_BMAP_BTREE_REF))) {
641 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
642 goto error0;
643 }
644 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
645 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
646 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
647 goto error0;
648 }
649 rrblock->bb_leftsib = cpu_to_be64(lbno);
650 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
651 }
652 xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
653 cur->bc_private.b.flist, mp);
654 cur->bc_private.b.ip->i_d.di_nblocks--;
655 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
656 XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
657 XFS_TRANS_DQ_BCOUNT, -1L);
658 xfs_trans_binval(cur->bc_tp, rbp);
659 if (bp != lbp) {
660 cur->bc_bufs[level] = lbp;
661 cur->bc_ptrs[level] += lrecs;
662 cur->bc_ra[level] = 0;
663 } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
664 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
665 goto error0;
666 }
667 if (level > 0)
668 cur->bc_ptrs[level]--;
669 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
670 *stat = 2;
671 return 0;
672
673error0:
674 if (tcur)
675 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
676 return error;
677}
678
679/*
680 * Insert one record/level. Return information to the caller
681 * allowing the next level up to proceed if necessary.
682 */
683STATIC int /* error */
684xfs_bmbt_insrec(
685 xfs_btree_cur_t *cur,
686 int level,
687 xfs_fsblock_t *bnop,
688 xfs_bmbt_rec_t *recp,
689 xfs_btree_cur_t **curp,
690 int *stat) /* no-go/done/continue */
691{
692 xfs_bmbt_block_t *block; /* bmap btree block */
693 xfs_buf_t *bp; /* buffer for block */
694 int error; /* error return value */
695 int i; /* loop index */
696 xfs_bmbt_key_t key; /* bmap btree key */
697 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
698 int logflags; /* inode logging flags */
699 xfs_fsblock_t nbno; /* new block number */
700 struct xfs_btree_cur *ncur; /* new btree cursor */
701 __uint64_t startoff; /* new btree key value */
702 xfs_bmbt_rec_t nrec; /* new record count */
703 int optr; /* old key/record index */
704 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
705 int ptr; /* key/record index */
706 xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */
707 int numrecs;
708
709 ASSERT(level < cur->bc_nlevels);
710 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
711 XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
712 ncur = NULL;
713 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
714 optr = ptr = cur->bc_ptrs[level];
715 if (ptr == 0) {
716 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
717 *stat = 0;
718 return 0;
719 }
720 XFS_STATS_INC(xs_bmbt_insrec);
721 block = xfs_bmbt_get_block(cur, level, &bp);
722 numrecs = be16_to_cpu(block->bb_numrecs);
723#ifdef DEBUG
724 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
725 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
726 return error;
727 }
728 if (ptr <= numrecs) {
729 if (level == 0) {
730 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
731 xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
732 } else {
733 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
734 xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
735 }
736 }
737#endif
738 nbno = NULLFSBLOCK;
739 if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
740 if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
741 /*
742 * A root block, that can be made bigger.
743 */
744 xfs_iroot_realloc(cur->bc_private.b.ip, 1,
745 cur->bc_private.b.whichfork);
746 block = xfs_bmbt_get_block(cur, level, &bp);
747 } else if (level == cur->bc_nlevels - 1) {
748 if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
749 *stat == 0) {
750 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
751 return error;
752 }
753 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
754 logflags);
755 block = xfs_bmbt_get_block(cur, level, &bp);
756 } else {
757 if ((error = xfs_bmbt_rshift(cur, level, &i))) {
758 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
759 return error;
760 }
761 if (i) {
762 /* nothing */
763 } else {
764 if ((error = xfs_bmbt_lshift(cur, level, &i))) {
765 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
766 return error;
767 }
768 if (i) {
769 optr = ptr = cur->bc_ptrs[level];
770 } else {
771 if ((error = xfs_bmbt_split(cur, level,
772 &nbno, &startoff, &ncur,
773 &i))) {
774 XFS_BMBT_TRACE_CURSOR(cur,
775 ERROR);
776 return error;
777 }
778 if (i) {
779 block = xfs_bmbt_get_block(
780 cur, level, &bp);
781#ifdef DEBUG
782 if ((error =
783 xfs_btree_check_lblock(cur,
784 block, level, bp))) {
785 XFS_BMBT_TRACE_CURSOR(
786 cur, ERROR);
787 return error;
788 }
789#endif
790 ptr = cur->bc_ptrs[level];
791 xfs_bmbt_disk_set_allf(&nrec,
792 startoff, 0, 0,
793 XFS_EXT_NORM);
794 } else {
795 XFS_BMBT_TRACE_CURSOR(cur,
796 EXIT);
797 *stat = 0;
798 return 0;
799 }
800 }
801 }
802 }
803 }
804 numrecs = be16_to_cpu(block->bb_numrecs);
805 if (level > 0) {
806 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
807 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
808#ifdef DEBUG
809 for (i = numrecs; i >= ptr; i--) {
810 if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
811 level))) {
812 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
813 return error;
814 }
815 }
816#endif
817 memmove(&kp[ptr], &kp[ptr - 1],
818 (numrecs - ptr + 1) * sizeof(*kp));
819 memmove(&pp[ptr], &pp[ptr - 1],
820 (numrecs - ptr + 1) * sizeof(*pp));
821#ifdef DEBUG
822 if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
823 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
824 return error;
825 }
826#endif
827 kp[ptr - 1] = key;
828 pp[ptr - 1] = cpu_to_be64(*bnop);
829 numrecs++;
830 block->bb_numrecs = cpu_to_be16(numrecs);
831 xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
832 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
833 } else {
834 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
835 memmove(&rp[ptr], &rp[ptr - 1],
836 (numrecs - ptr + 1) * sizeof(*rp));
837 rp[ptr - 1] = *recp;
838 numrecs++;
839 block->bb_numrecs = cpu_to_be16(numrecs);
840 xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
841 }
842 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
843#ifdef DEBUG
844 if (ptr < numrecs) {
845 if (level == 0)
846 xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
847 rp + ptr);
848 else
849 xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
850 kp + ptr);
851 }
852#endif
853 if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
854 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
855 return error;
856 }
857 *bnop = nbno;
858 if (nbno != NULLFSBLOCK) {
859 *recp = nrec;
860 *curp = ncur;
861 }
862 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
863 *stat = 1;
864 return 0;
865}
866
867STATIC int
868xfs_bmbt_killroot(
869 xfs_btree_cur_t *cur)
870{
871 xfs_bmbt_block_t *block;
872 xfs_bmbt_block_t *cblock;
873 xfs_buf_t *cbp;
874 xfs_bmbt_key_t *ckp;
875 xfs_bmbt_ptr_t *cpp;
876#ifdef DEBUG
877 int error;
878#endif
879 int i;
880 xfs_bmbt_key_t *kp;
881 xfs_inode_t *ip;
882 xfs_ifork_t *ifp;
883 int level;
884 xfs_bmbt_ptr_t *pp;
885
886 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
887 level = cur->bc_nlevels - 1;
888 ASSERT(level >= 1);
889 /*
889 * Don't deal with the case where the root block needs to be a leaf.
891 * We're just going to turn the thing back into extents anyway.
892 */
893 if (level == 1) {
894 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
895 return 0;
896 }
897 block = xfs_bmbt_get_block(cur, level, &cbp);
898 /*
899 * Give up if the root has multiple children.
900 */
901 if (be16_to_cpu(block->bb_numrecs) != 1) {
902 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
903 return 0;
904 }
905 /*
906 * Only do this if the next level will fit.
907 * Then the data must be copied up to the inode;
908 * instead of freeing the root, you free the next level.
909 */
910 cbp = cur->bc_bufs[level - 1];
911 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
912 if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
913 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
914 return 0;
915 }
916 ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO);
917 ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO);
918 ip = cur->bc_private.b.ip;
919 ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
920 ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
921 XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
922 i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
923 if (i) {
924 xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
925 block = ifp->if_broot;
926 }
927 be16_add_cpu(&block->bb_numrecs, i);
928 ASSERT(block->bb_numrecs == cblock->bb_numrecs);
929 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
930 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
931 memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp));
932 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
933 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
934#ifdef DEBUG
935 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
936 if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
937 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
938 return error;
939 }
940 }
941#endif
942 memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp));
943 xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
944 cur->bc_private.b.flist, cur->bc_mp);
945 ip->i_d.di_nblocks--;
946 XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
947 XFS_TRANS_DQ_BCOUNT, -1L);
948 xfs_trans_binval(cur->bc_tp, cbp);
949 cur->bc_bufs[level - 1] = NULL;
950 be16_add_cpu(&block->bb_level, -1);
951 xfs_trans_log_inode(cur->bc_tp, ip,
952 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
953 cur->bc_nlevels--;
954 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
955 return 0;
956}
957
958/*
959 * Log key values from the btree block.
960 */
961STATIC void
962xfs_bmbt_log_keys(
963 xfs_btree_cur_t *cur,
964 xfs_buf_t *bp,
965 int kfirst,
966 int klast)
967{
968 xfs_trans_t *tp;
969
970 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
971 XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
972 tp = cur->bc_tp;
973 if (bp) {
974 xfs_bmbt_block_t *block;
975 int first;
976 xfs_bmbt_key_t *kp;
977 int last;
978
979 block = XFS_BUF_TO_BMBT_BLOCK(bp);
980 kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
981 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
982 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
983 xfs_trans_log_buf(tp, bp, first, last);
984 } else {
985 xfs_inode_t *ip;
986
987 ip = cur->bc_private.b.ip;
988 xfs_trans_log_inode(tp, ip,
989 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
990 }
991 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
992}
993
994/*
995 * Log pointer values from the btree block.
996 */
997STATIC void
998xfs_bmbt_log_ptrs(
999 xfs_btree_cur_t *cur,
1000 xfs_buf_t *bp,
1001 int pfirst,
1002 int plast)
1003{
1004 xfs_trans_t *tp;
1005
1006 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1007 XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
1008 tp = cur->bc_tp;
1009 if (bp) {
1010 xfs_bmbt_block_t *block;
1011 int first;
1012 int last;
1013 xfs_bmbt_ptr_t *pp;
1014
1015 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1016 pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
1017 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
1018 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
1019 xfs_trans_log_buf(tp, bp, first, last);
1020 } else {
1021 xfs_inode_t *ip;
1022
1023 ip = cur->bc_private.b.ip;
1024 xfs_trans_log_inode(tp, ip,
1025 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1026 }
1027 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1028}
1029
1030/*
1031 * Lookup the record. The cursor is made to point to it, based on dir.
1032 */
1033STATIC int /* error */
1034xfs_bmbt_lookup(
1035 xfs_btree_cur_t *cur,
1036 xfs_lookup_t dir,
1037 int *stat) /* success/failure */
1038{
1039 xfs_bmbt_block_t *block=NULL;
1040 xfs_buf_t *bp;
1041 xfs_daddr_t d;
1042 xfs_sfiloff_t diff;
1043 int error; /* error return value */
1044 xfs_fsblock_t fsbno=0;
1045 int high;
1046 int i;
1047 int keyno=0;
1048 xfs_bmbt_key_t *kkbase=NULL;
1049 xfs_bmbt_key_t *kkp;
1050 xfs_bmbt_rec_t *krbase=NULL;
1051 xfs_bmbt_rec_t *krp;
1052 int level;
1053 int low;
1054 xfs_mount_t *mp;
1055 xfs_bmbt_ptr_t *pp;
1056 xfs_bmbt_irec_t *rp;
1057 xfs_fileoff_t startoff;
1058 xfs_trans_t *tp;
1059
1060 XFS_STATS_INC(xs_bmbt_lookup);
1061 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1062 XFS_BMBT_TRACE_ARGI(cur, (int)dir);
1063 tp = cur->bc_tp;
1064 mp = cur->bc_mp;
1065 rp = &cur->bc_rec.b;
1066 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1067 if (level < cur->bc_nlevels - 1) {
1068 d = XFS_FSB_TO_DADDR(mp, fsbno);
1069 bp = cur->bc_bufs[level];
1070 if (bp && XFS_BUF_ADDR(bp) != d)
1071 bp = NULL;
1072 if (!bp) {
1073 if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
1074 0, &bp, XFS_BMAP_BTREE_REF))) {
1075 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1076 return error;
1077 }
1078 xfs_btree_setbuf(cur, level, bp);
1079 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1080 if ((error = xfs_btree_check_lblock(cur, block,
1081 level, bp))) {
1082 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1083 return error;
1084 }
1085 } else
1086 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1087 } else
1088 block = xfs_bmbt_get_block(cur, level, &bp);
1089 if (diff == 0)
1090 keyno = 1;
1091 else {
1092 if (level > 0)
1093 kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
1094 else
1095 krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
1096 low = 1;
1097 if (!(high = be16_to_cpu(block->bb_numrecs))) {
1098 ASSERT(level == 0);
1099 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1100 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1101 *stat = 0;
1102 return 0;
1103 }
1104 while (low <= high) {
1105 XFS_STATS_INC(xs_bmbt_compare);
1106 keyno = (low + high) >> 1;
1107 if (level > 0) {
1108 kkp = kkbase + keyno - 1;
1109 startoff = be64_to_cpu(kkp->br_startoff);
1110 } else {
1111 krp = krbase + keyno - 1;
1112 startoff = xfs_bmbt_disk_get_startoff(krp);
1113 }
1114 diff = (xfs_sfiloff_t)
1115 (startoff - rp->br_startoff);
1116 if (diff < 0)
1117 low = keyno + 1;
1118 else if (diff > 0)
1119 high = keyno - 1;
1120 else
1121 break;
1122 }
1123 }
1124 if (level > 0) {
1125 if (diff > 0 && --keyno < 1)
1126 keyno = 1;
1127 pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
1128 fsbno = be64_to_cpu(*pp);
1129#ifdef DEBUG
1130 if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
1131 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1132 return error;
1133 }
1134#endif
1135 cur->bc_ptrs[level] = keyno;
1136 }
1137 }
1138 if (dir != XFS_LOOKUP_LE && diff < 0) {
1139 keyno++;
1140 /*
1141 * If ge search and we went off the end of the block, but it's
1142 * not the last block, we're in the wrong block.
1143 */
1144 if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) &&
1145 be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) {
1146 cur->bc_ptrs[0] = keyno;
1147 if ((error = xfs_bmbt_increment(cur, 0, &i))) {
1148 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1149 return error;
1150 }
1151 XFS_WANT_CORRUPTED_RETURN(i == 1);
1152 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1153 *stat = 1;
1154 return 0;
1155 }
1156 }
1157 else if (dir == XFS_LOOKUP_LE && diff > 0)
1158 keyno--;
1159 cur->bc_ptrs[0] = keyno;
1160 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) {
1161 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1162 *stat = 0;
1163 } else {
1164 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1165 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1166 }
1167 return 0;
1168}
1169
1170/*
1171 * Move 1 record left from cur/level if possible.
1172 * Update cur to reflect the new path.
1173 */
1174STATIC int /* error */
1175xfs_bmbt_lshift(
1176 xfs_btree_cur_t *cur,
1177 int level,
1178 int *stat) /* success/failure */
1179{
1180 int error; /* error return value */
1181#ifdef DEBUG
1182 int i; /* loop counter */
1183#endif
1184 xfs_bmbt_key_t key; /* bmap btree key */
1185 xfs_buf_t *lbp; /* left buffer pointer */
1186 xfs_bmbt_block_t *left; /* left btree block */
1187 xfs_bmbt_key_t *lkp=NULL; /* left btree key */
1188 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1189 int lrecs; /* left record count */
1190 xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */
1191 xfs_mount_t *mp; /* file system mount point */
1192 xfs_buf_t *rbp; /* right buffer pointer */
1193 xfs_bmbt_block_t *right; /* right btree block */
1194 xfs_bmbt_key_t *rkp=NULL; /* right btree key */
1195 xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */
1196 xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
1197 int rrecs; /* right record count */
1198
1199 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1200 XFS_BMBT_TRACE_ARGI(cur, level);
1201 if (level == cur->bc_nlevels - 1) {
1202 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1203 *stat = 0;
1204 return 0;
1205 }
1206 rbp = cur->bc_bufs[level];
1207 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1208#ifdef DEBUG
1209 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
1210 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1211 return error;
1212 }
1213#endif
1214 if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) {
1215 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1216 *stat = 0;
1217 return 0;
1218 }
1219 if (cur->bc_ptrs[level] <= 1) {
1220 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1221 *stat = 0;
1222 return 0;
1223 }
1224 mp = cur->bc_mp;
1225 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0,
1226 &lbp, XFS_BMAP_BTREE_REF))) {
1227 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1228 return error;
1229 }
1230 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1231 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
1232 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1233 return error;
1234 }
1235 if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
1236 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1237 *stat = 0;
1238 return 0;
1239 }
1240 lrecs = be16_to_cpu(left->bb_numrecs) + 1;
1241 if (level > 0) {
1242 lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
1243 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1244 *lkp = *rkp;
1245 xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
1246 lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
1247 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1248#ifdef DEBUG
1249 if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
1250 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1251 return error;
1252 }
1253#endif
1254 *lpp = *rpp;
1255 xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
1256 } else {
1257 lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
1258 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1259 *lrp = *rrp;
1260 xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
1261 }
1262 left->bb_numrecs = cpu_to_be16(lrecs);
1263 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
1264#ifdef DEBUG
1265 if (level > 0)
1266 xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
1267 else
1268 xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
1269#endif
1270 rrecs = be16_to_cpu(right->bb_numrecs) - 1;
1271 right->bb_numrecs = cpu_to_be16(rrecs);
1272 xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
1273 if (level > 0) {
1274#ifdef DEBUG
1275 for (i = 0; i < rrecs; i++) {
1276 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
1277 level))) {
1278 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1279 return error;
1280 }
1281 }
1282#endif
1283 memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
1284 memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
1285 xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
1286 xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
1287 } else {
1288 memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
1289 xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
1290 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
1291 rkp = &key;
1292 }
1293 if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
1294 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1295 return error;
1296 }
1297 cur->bc_ptrs[level]--;
1298 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1299 *stat = 1;
1300 return 0;
1301}
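
The shift is easier to see stripped of logging, tracing, and on-disk byte order. Below is a minimal sketch of the same move using a hypothetical flat-array node, not the kernel's xfs_bmbt_block_t; the real routine additionally logs the touched ranges and refreshes the parent key via xfs_bmbt_updkey().

	/* Hypothetical toy node; not a kernel structure. */
	struct toy_node {
		int	nrecs;		/* records currently in this node */
		long	recs[64];	/* records in left-to-right key order */
	};

	/* Move one record from the front of 'right' to the end of 'left',
	 * mirroring the *lkp = *rkp / memmove() sequence above. */
	static int toy_lshift(struct toy_node *left, struct toy_node *right,
			      int maxrecs)
	{
		int i;

		if (left->nrecs == maxrecs || right->nrecs == 0)
			return 0;				/* nothing to do */
		left->recs[left->nrecs++] = right->recs[0];	/* *lkp = *rkp */
		for (i = 0; i < right->nrecs - 1; i++)		/* memmove(rkp, rkp + 1, ...) */
			right->recs[i] = right->recs[i + 1];
		right->nrecs--;
		return 1;					/* one record moved */
	}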
1302
1303/*
1304 * Move 1 record right from cur/level if possible.
1305 * Update cur to reflect the new path.
1306 */
1307STATIC int /* error */
1308xfs_bmbt_rshift(
1309 xfs_btree_cur_t *cur,
1310 int level,
1311 int *stat) /* success/failure */
1312{
1313 int error; /* error return value */
1314 int i; /* loop counter */
1315 xfs_bmbt_key_t key; /* bmap btree key */
1316 xfs_buf_t *lbp; /* left buffer pointer */
1317 xfs_bmbt_block_t *left; /* left btree block */
1318 xfs_bmbt_key_t *lkp; /* left btree key */
1319 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1320 xfs_bmbt_rec_t *lrp; /* left record pointer */
1321 xfs_mount_t *mp; /* file system mount point */
1322 xfs_buf_t *rbp; /* right buffer pointer */
1323 xfs_bmbt_block_t *right; /* right btree block */
1324 xfs_bmbt_key_t *rkp; /* right btree key */
1325 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1326 xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
1327 struct xfs_btree_cur *tcur; /* temporary btree cursor */
1328
1329 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1330 XFS_BMBT_TRACE_ARGI(cur, level);
1331 if (level == cur->bc_nlevels - 1) {
1332 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1333 *stat = 0;
1334 return 0;
1335 }
1336 lbp = cur->bc_bufs[level];
1337 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1338#ifdef DEBUG
1339 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
1340 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1341 return error;
1342 }
1343#endif
1344 if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) {
1345 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1346 *stat = 0;
1347 return 0;
1348 }
1349 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1350 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1351 *stat = 0;
1352 return 0;
1353 }
1354 mp = cur->bc_mp;
1355 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0,
1356 &rbp, XFS_BMAP_BTREE_REF))) {
1357 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1358 return error;
1359 }
1360 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1361 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
1362 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1363 return error;
1364 }
1365 if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
1366 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1367 *stat = 0;
1368 return 0;
1369 }
1370 if (level > 0) {
1371 lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1372 lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1373 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1374 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1375#ifdef DEBUG
1376 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1377 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
1378 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1379 return error;
1380 }
1381 }
1382#endif
1383 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1384 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1385#ifdef DEBUG
1386 if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
1387 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1388 return error;
1389 }
1390#endif
1391 *rkp = *lkp;
1392 *rpp = *lpp;
1393 xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1394 xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1395 } else {
1396 lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1397 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1398 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1399 *rrp = *lrp;
1400 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1401 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
1402 rkp = &key;
1403 }
1404 be16_add_cpu(&left->bb_numrecs, -1);
1405 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
1406 be16_add_cpu(&right->bb_numrecs, 1);
1407#ifdef DEBUG
1408 if (level > 0)
1409 xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
1410 else
1411 xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
1412#endif
1413 xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
1414 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
1415 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1416 return error;
1417 }
1418 i = xfs_btree_lastrec(tcur, level);
1419 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1420 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
1421 XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
1422 goto error1;
1423 }
1424 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1425 if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
1426 XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
1427 goto error1;
1428 }
1429 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1430 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1431 *stat = 1;
1432 return 0;
1433error0:
1434 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1435error1:
1436 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1437 return error;
1438}
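
Why both shift routines end by rewriting a parent key is easiest to see with concrete numbers; the values below are purely illustrative, not kernel output.

	/*
	 * before rshift:  left = [10 15 20]  right = [40 50]     parent keys [10 40]
	 * after  rshift:  left = [10 15]     right = [20 40 50]  parent keys [10 20]
	 *
	 * Moving a record across the boundary changes the first record of the
	 * right block, so the separator key stored in the parent must change
	 * too -- hence the xfs_bmbt_updkey() call at level + 1 in both routines.
	 */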
1439
1440/47 /*
1441/48  * Determine the extent state.
1442/49  */
@@ -1453,229 +60,15 @@ xfs_extent_state(
1453/60  return XFS_EXT_NORM;
1454/61 }
1455/62 

1456
1457/*
1458 * Split cur/level block in half.
1459 * Return new block number and its first record (to be inserted into parent).
1460 */
1461STATIC int /* error */
1462xfs_bmbt_split(
1463 xfs_btree_cur_t *cur,
1464 int level,
1465 xfs_fsblock_t *bnop,
1466 __uint64_t *startoff,
1467 xfs_btree_cur_t **curp,
1468 int *stat) /* success/failure */
1469{
1470 xfs_alloc_arg_t args; /* block allocation args */
1471 int error; /* error return value */
1472 int i; /* loop counter */
1473 xfs_fsblock_t lbno; /* left sibling block number */
1474 xfs_buf_t *lbp; /* left buffer pointer */
1475 xfs_bmbt_block_t *left; /* left btree block */
1476 xfs_bmbt_key_t *lkp; /* left btree key */
1477 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1478 xfs_bmbt_rec_t *lrp; /* left record pointer */
1479 xfs_buf_t *rbp; /* right buffer pointer */
1480 xfs_bmbt_block_t *right; /* right btree block */
1481 xfs_bmbt_key_t *rkp; /* right btree key */
1482 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1483 xfs_bmbt_block_t *rrblock; /* right-right btree block */
1484 xfs_buf_t *rrbp; /* right-right buffer pointer */
1485 xfs_bmbt_rec_t *rrp; /* right record pointer */
1486
1487 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1488 XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
1489 args.tp = cur->bc_tp;
1490 args.mp = cur->bc_mp;
1491 lbp = cur->bc_bufs[level];
1492 lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
1493 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1494 args.fsbno = cur->bc_private.b.firstblock;
1495 args.firstblock = args.fsbno;
1496 args.minleft = 0;
1497 if (args.fsbno == NULLFSBLOCK) {
1498 args.fsbno = lbno;
1499 args.type = XFS_ALLOCTYPE_START_BNO;
1500 /*
1501 * Make sure there is sufficient room left in the AG to
1502 * complete a full tree split for an extent insert. If
1503 * we are converting the middle part of an extent then
1504 * we may need space for two tree splits.
1505 *
1506 * We are relying on the caller to make the correct block
1507 * reservation for this operation to succeed. If the
1508 * reservation amount is insufficient then we may fail a
1509 * block allocation here and corrupt the filesystem.
1510 */
1511 args.minleft = xfs_trans_get_block_res(args.tp);
1512 } else if (cur->bc_private.b.flist->xbf_low)
1513 args.type = XFS_ALLOCTYPE_START_BNO;
1514 else
1515 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1516 args.mod = args.alignment = args.total = args.isfl =
1517 args.userdata = args.minalignslop = 0;
1518 args.minlen = args.maxlen = args.prod = 1;
1519 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
1520 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
1521 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1522 return XFS_ERROR(ENOSPC);
1523 }
1524 if ((error = xfs_alloc_vextent(&args))) {
1525 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1526 return error;
1527 }
1528 if (args.fsbno == NULLFSBLOCK && args.minleft) {
1529 /*
1530 * Could not find an AG with enough free space to satisfy
1531 * a full btree split. Try again without minleft and if
1532 * successful activate the lowspace algorithm.
1533 */
1534 args.fsbno = 0;
1535 args.type = XFS_ALLOCTYPE_FIRST_AG;
1536 args.minleft = 0;
1537 if ((error = xfs_alloc_vextent(&args))) {
1538 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1539 return error;
1540 }
1541 cur->bc_private.b.flist->xbf_low = 1;
1542 }
1543 if (args.fsbno == NULLFSBLOCK) {
1544 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1545 *stat = 0;
1546 return 0;
1547 }
1548 ASSERT(args.len == 1);
1549 cur->bc_private.b.firstblock = args.fsbno;
1550 cur->bc_private.b.allocated++;
1551 cur->bc_private.b.ip->i_d.di_nblocks++;
1552 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
1553 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
1554 XFS_TRANS_DQ_BCOUNT, 1L);
1555 rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
1556 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1557#ifdef DEBUG
1558 if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
1559 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1560 return error;
1561 }
1562#endif
1563 right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
1564 right->bb_level = left->bb_level;
1565 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1566 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1567 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1568 be16_add_cpu(&right->bb_numrecs, 1);
1569 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1570 if (level > 0) {
1571 lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
1572 lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
1573 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1574 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1575#ifdef DEBUG
1576 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1577 if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
1578 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1579 return error;
1580 }
1581 }
1582#endif
1583 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1584 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1585 xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1586 xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1587 *startoff = be64_to_cpu(rkp->br_startoff);
1588 } else {
1589 lrp = XFS_BMAP_REC_IADDR(left, i, cur);
1590 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1591 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1592 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1593 *startoff = xfs_bmbt_disk_get_startoff(rrp);
1594 }
1595 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1596 right->bb_rightsib = left->bb_rightsib;
1597 left->bb_rightsib = cpu_to_be64(args.fsbno);
1598 right->bb_leftsib = cpu_to_be64(lbno);
1599 xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
1600 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1601 if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) {
1602 if ((error = xfs_btree_read_bufl(args.mp, args.tp,
1603 be64_to_cpu(right->bb_rightsib), 0, &rrbp,
1604 XFS_BMAP_BTREE_REF))) {
1605 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1606 return error;
1607 }
1608 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
1609 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
1610 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1611 return error;
1612 }
1613 rrblock->bb_leftsib = cpu_to_be64(args.fsbno);
1614 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
1615 }
1616 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1617 xfs_btree_setbuf(cur, level, rbp);
1618 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1619 }
1620 if (level + 1 < cur->bc_nlevels) {
1621 if ((error = xfs_btree_dup_cursor(cur, curp))) {
1622 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1623 return error;
1624 }
1625 (*curp)->bc_ptrs[level + 1]++;
1626 }
1627 *bnop = args.fsbno;
1628 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1629 *stat = 1;
1630 return 0;
1631}
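
The split point chosen above is simply "half the records, rounded toward the side that will receive the insert". A standalone sketch of that computation (hypothetical helper, not a kernel function):

	/* How many records move to the new right block.  'numrecs' is the count
	 * in the old block, 'ptr' the 1-based slot where the insert will land. */
	static int toy_split_count(int numrecs, int ptr)
	{
		int rcount = numrecs / 2;

		/* Odd count: when the insertion point lands in the low (left)
		 * half, push the extra record to the new right block so the
		 * coming insert has room -- the be16_add_cpu(+1) above. */
		if ((numrecs & 1) && ptr <= rcount + 1)
			rcount++;
		return rcount;
	}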
1632
1633
1634/*
1635 * Update keys for the record.
1636 */
1637STATIC int
1638xfs_bmbt_updkey(
1639 xfs_btree_cur_t *cur,
1640 xfs_bmbt_key_t *keyp, /* on-disk format */
1641 int level)
1642{
1643 xfs_bmbt_block_t *block;
1644 xfs_buf_t *bp;
1645#ifdef DEBUG
1646 int error;
1647#endif
1648 xfs_bmbt_key_t *kp;
1649 int ptr;
1650
1651 ASSERT(level >= 1);
1652 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1653 XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
1654 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1655 block = xfs_bmbt_get_block(cur, level, &bp);
1656#ifdef DEBUG
1657 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1658 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1659 return error;
1660 }
1661#endif
1662 ptr = cur->bc_ptrs[level];
1663 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
1664 *kp = *keyp;
1665 xfs_bmbt_log_keys(cur, bp, ptr, ptr);
1666 }
1667 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1668 return 0;
1669}
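
A compact model of the ascent rule used above, with hypothetical arrays standing in for the cursor and blocks: the key is rewritten at each ancestor, but the climb continues only while the updated entry is the first one in its block.

	/* Toy sketch, not the kernel API. */
	static void toy_propagate_key(long *key_at_level, const int *slot_at_level,
				      int nlevels, long newkey)
	{
		int level;

		for (level = 1; level < nlevels; level++) {
			key_at_level[level] = newkey;	/* *kp = *keyp */
			if (slot_at_level[level] != 1)
				break;			/* not slot 1: ancestors unaffected */
		}
	}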
1670
1671/63 /*
1672/64  * Convert on-disk form of btree root to in-memory form.
1673/65  */
1674/66 void
1675/67 xfs_bmdr_to_bmbt(
+68 	struct xfs_mount *mp,
1676/69 	xfs_bmdr_block_t *dblock,
1677/70 	int dblocklen,
-1678 	xfs_bmbt_block_t *rblock,
+71 	struct xfs_btree_block *rblock,
1679/72 	int rblocklen)
1680/73 {
1681/74 	int dmxr;
@@ -1688,129 +81,19 @@ xfs_bmdr_to_bmbt(
1688/81 	rblock->bb_level = dblock->bb_level;
1689/82 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
1690/83 	rblock->bb_numrecs = dblock->bb_numrecs;
-1691 	rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
+84 	rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-1692 	rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
+85 	rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-1693 	dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
+86 	dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
-1694 	fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
+87 	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-1695 	tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
+88 	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-1696 	fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
+89 	fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
-1697 	tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+90 	tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
1698/91 	dmxr = be16_to_cpu(dblock->bb_numrecs);
1699/92 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
1700/93 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
1701/94 }
1702/95 
1703/96 /*
1704 * Decrement cursor by one record at the level.
1705 * For nonzero levels the leaf-ward information is untouched.
1706 */
1707int /* error */
1708xfs_bmbt_decrement(
1709 xfs_btree_cur_t *cur,
1710 int level,
1711 int *stat) /* success/failure */
1712{
1713 xfs_bmbt_block_t *block;
1714 xfs_buf_t *bp;
1715 int error; /* error return value */
1716 xfs_fsblock_t fsbno;
1717 int lev;
1718 xfs_mount_t *mp;
1719 xfs_trans_t *tp;
1720
1721 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1722 XFS_BMBT_TRACE_ARGI(cur, level);
1723 ASSERT(level < cur->bc_nlevels);
1724 if (level < cur->bc_nlevels - 1)
1725 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1726 if (--cur->bc_ptrs[level] > 0) {
1727 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1728 *stat = 1;
1729 return 0;
1730 }
1731 block = xfs_bmbt_get_block(cur, level, &bp);
1732#ifdef DEBUG
1733 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1734 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1735 return error;
1736 }
1737#endif
1738 if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) {
1739 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1740 *stat = 0;
1741 return 0;
1742 }
1743 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1744 if (--cur->bc_ptrs[lev] > 0)
1745 break;
1746 if (lev < cur->bc_nlevels - 1)
1747 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1748 }
1749 if (lev == cur->bc_nlevels) {
1750 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1751 *stat = 0;
1752 return 0;
1753 }
1754 tp = cur->bc_tp;
1755 mp = cur->bc_mp;
1756 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
1757 fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
1758 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
1759 XFS_BMAP_BTREE_REF))) {
1760 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1761 return error;
1762 }
1763 lev--;
1764 xfs_btree_setbuf(cur, lev, bp);
1765 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1766 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
1767 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1768 return error;
1769 }
1770 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1771 }
1772 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1773 *stat = 1;
1774 return 0;
1775}
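
Stripped of buffer handling and verification, the walk above reduces to the classic B-tree predecessor step. A toy model with hypothetical arrays (ptrs[] as 1-based slots per level, nrecs_at[] as each visited block's record count):

	/* Toy sketch, not the kernel API: returns 1 on success, 0 at the far left. */
	static int toy_cursor_decrement(int *ptrs, const int *nrecs_at,
					int nlevels, int level)
	{
		int lev;

		if (--ptrs[level] > 0)
			return 1;			/* stayed within the block */
		for (lev = level + 1; lev < nlevels; lev++)
			if (--ptrs[lev] > 0)
				break;			/* an ancestor can step left */
		if (lev == nlevels)
			return 0;			/* already at the first record */
		while (--lev >= level)
			ptrs[lev] = nrecs_at[lev];	/* descend along rightmost children */
		return 1;
	}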
1776
1777/*
1778 * Delete the record pointed to by cur.
1779 */
1780int /* error */
1781xfs_bmbt_delete(
1782 xfs_btree_cur_t *cur,
1783 int *stat) /* success/failure */
1784{
1785 int error; /* error return value */
1786 int i;
1787 int level;
1788
1789 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1790 for (level = 0, i = 2; i == 2; level++) {
1791 if ((error = xfs_bmbt_delrec(cur, level, &i))) {
1792 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1793 return error;
1794 }
1795 }
1796 if (i == 0) {
1797 for (level = 1; level < cur->bc_nlevels; level++) {
1798 if (cur->bc_ptrs[level] == 0) {
1799 if ((error = xfs_bmbt_decrement(cur, level,
1800 &i))) {
1801 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1802 return error;
1803 }
1804 break;
1805 }
1806 }
1807 }
1808 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1809 *stat = i;
1810 return 0;
1811}
1812
1813/*
1814/97  * Convert a compressed bmap extent record to an uncompressed form.
1815/98  * This code must be in sync with the routines xfs_bmbt_get_startoff,
1816/99  * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
@@ -1864,31 +147,6 @@ xfs_bmbt_get_all(
1864/147 }
1865/148 
1866/149 /*
1867 * Get the block pointer for the given level of the cursor.
1868 * Fill in the buffer pointer, if applicable.
1869 */
1870xfs_bmbt_block_t *
1871xfs_bmbt_get_block(
1872 xfs_btree_cur_t *cur,
1873 int level,
1874 xfs_buf_t **bpp)
1875{
1876 xfs_ifork_t *ifp;
1877 xfs_bmbt_block_t *rval;
1878
1879 if (level < cur->bc_nlevels - 1) {
1880 *bpp = cur->bc_bufs[level];
1881 rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
1882 } else {
1883 *bpp = NULL;
1884 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
1885 cur->bc_private.b.whichfork);
1886 rval = ifp->if_broot;
1887 }
1888 return rval;
1889}
1890
1891/*
1892/150  * Extract the blockcount field from an in memory bmap extent record.
1893/151  */
1894/152 xfs_filblks_t
@@ -1950,7 +208,8 @@ xfs_bmbt_disk_get_all(
1950/208 	xfs_bmbt_rec_t *r,
1951/209 	xfs_bmbt_irec_t *s)
1952/210 {
-1953 	__xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s);
+211 	__xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+212 			get_unaligned_be64(&r->l1), s);
1954/213 }
1955/214 
1956/215 /*
@@ -1974,348 +233,6 @@ xfs_bmbt_disk_get_startoff(
1974/233 	XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
1975/234 }
1976/235 
1977/*
1978 * Increment cursor by one record at the level.
1979 * For nonzero levels the leaf-ward information is untouched.
1980 */
1981int /* error */
1982xfs_bmbt_increment(
1983 xfs_btree_cur_t *cur,
1984 int level,
1985 int *stat) /* success/failure */
1986{
1987 xfs_bmbt_block_t *block;
1988 xfs_buf_t *bp;
1989 int error; /* error return value */
1990 xfs_fsblock_t fsbno;
1991 int lev;
1992 xfs_mount_t *mp;
1993 xfs_trans_t *tp;
1994
1995 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1996 XFS_BMBT_TRACE_ARGI(cur, level);
1997 ASSERT(level < cur->bc_nlevels);
1998 if (level < cur->bc_nlevels - 1)
1999 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
2000 block = xfs_bmbt_get_block(cur, level, &bp);
2001#ifdef DEBUG
2002 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
2003 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2004 return error;
2005 }
2006#endif
2007 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
2008 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2009 *stat = 1;
2010 return 0;
2011 }
2012 if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) {
2013 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2014 *stat = 0;
2015 return 0;
2016 }
2017 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
2018 block = xfs_bmbt_get_block(cur, lev, &bp);
2019#ifdef DEBUG
2020 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2021 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2022 return error;
2023 }
2024#endif
2025 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
2026 break;
2027 if (lev < cur->bc_nlevels - 1)
2028 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2029 }
2030 if (lev == cur->bc_nlevels) {
2031 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2032 *stat = 0;
2033 return 0;
2034 }
2035 tp = cur->bc_tp;
2036 mp = cur->bc_mp;
2037 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
2038 fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
2039 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
2040 XFS_BMAP_BTREE_REF))) {
2041 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2042 return error;
2043 }
2044 lev--;
2045 xfs_btree_setbuf(cur, lev, bp);
2046 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2047 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2048 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2049 return error;
2050 }
2051 cur->bc_ptrs[lev] = 1;
2052 }
2053 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2054 *stat = 1;
2055 return 0;
2056}
2057
2058/*
2059 * Insert the current record at the point referenced by cur.
2060 *
2061 * A multi-level split of the tree on insert will invalidate the original
2062 * cursor. All callers of this function should assume that the cursor is
2063 * no longer valid and revalidate it.
2064 */
2065int /* error */
2066xfs_bmbt_insert(
2067 xfs_btree_cur_t *cur,
2068 int *stat) /* success/failure */
2069{
2070 int error; /* error return value */
2071 int i;
2072 int level;
2073 xfs_fsblock_t nbno;
2074 xfs_btree_cur_t *ncur;
2075 xfs_bmbt_rec_t nrec;
2076 xfs_btree_cur_t *pcur;
2077
2078 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2079 level = 0;
2080 nbno = NULLFSBLOCK;
2081 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
2082 ncur = NULL;
2083 pcur = cur;
2084 do {
2085 if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
2086 &i))) {
2087 if (pcur != cur)
2088 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2089 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2090 return error;
2091 }
2092 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2093 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
2094 cur->bc_nlevels = pcur->bc_nlevels;
2095 cur->bc_private.b.allocated +=
2096 pcur->bc_private.b.allocated;
2097 pcur->bc_private.b.allocated = 0;
2098 ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
2099 XFS_IS_REALTIME_INODE(cur->bc_private.b.ip));
2100 cur->bc_private.b.firstblock =
2101 pcur->bc_private.b.firstblock;
2102 ASSERT(cur->bc_private.b.flist ==
2103 pcur->bc_private.b.flist);
2104 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2105 }
2106 if (ncur) {
2107 pcur = ncur;
2108 ncur = NULL;
2109 }
2110 } while (nbno != NULLFSBLOCK);
2111 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2112 *stat = i;
2113 return 0;
2114error0:
2115 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2116 return error;
2117}
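
A worked trace (hypothetical, not kernel output) of how the do/while loop above carries splits upward:

	/*
	 * level 0: xfs_bmbt_insrec() splits the leaf -> nbno = new block,
	 *          nrec = its first record, ncur = cursor for the new block
	 * level 1: inserting (nrec, nbno) splits again -> fresh nbno/nrec/ncur
	 * level 2: the insert fits -> nbno = NULLFSBLOCK and the loop ends
	 *
	 * Whenever a new cursor was handed back, it becomes pcur for the next
	 * pass and the old one is folded back into 'cur' -- which is why
	 * callers must revalidate the original cursor after a multi-level split.
	 */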
2118
2119/*
2120 * Log fields from the btree block header.
2121 */
2122void
2123xfs_bmbt_log_block(
2124 xfs_btree_cur_t *cur,
2125 xfs_buf_t *bp,
2126 int fields)
2127{
2128 int first;
2129 int last;
2130 xfs_trans_t *tp;
2131 static const short offsets[] = {
2132 offsetof(xfs_bmbt_block_t, bb_magic),
2133 offsetof(xfs_bmbt_block_t, bb_level),
2134 offsetof(xfs_bmbt_block_t, bb_numrecs),
2135 offsetof(xfs_bmbt_block_t, bb_leftsib),
2136 offsetof(xfs_bmbt_block_t, bb_rightsib),
2137 sizeof(xfs_bmbt_block_t)
2138 };
2139
2140 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2141 XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
2142 tp = cur->bc_tp;
2143 if (bp) {
2144 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
2145 &last);
2146 xfs_trans_log_buf(tp, bp, first, last);
2147 } else
2148 xfs_trans_log_inode(tp, cur->bc_private.b.ip,
2149 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
2150 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2151}
2152
2153/*
2154 * Log record values from the btree block.
2155 */
2156void
2157xfs_bmbt_log_recs(
2158 xfs_btree_cur_t *cur,
2159 xfs_buf_t *bp,
2160 int rfirst,
2161 int rlast)
2162{
2163 xfs_bmbt_block_t *block;
2164 int first;
2165 int last;
2166 xfs_bmbt_rec_t *rp;
2167 xfs_trans_t *tp;
2168
2169 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2170 XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
2171 ASSERT(bp);
2172 tp = cur->bc_tp;
2173 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2174 rp = XFS_BMAP_REC_DADDR(block, 1, cur);
2175 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
2176 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
2177 xfs_trans_log_buf(tp, bp, first, last);
2178 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2179}
2180
2181int /* error */
2182xfs_bmbt_lookup_eq(
2183 xfs_btree_cur_t *cur,
2184 xfs_fileoff_t off,
2185 xfs_fsblock_t bno,
2186 xfs_filblks_t len,
2187 int *stat) /* success/failure */
2188{
2189 cur->bc_rec.b.br_startoff = off;
2190 cur->bc_rec.b.br_startblock = bno;
2191 cur->bc_rec.b.br_blockcount = len;
2192 return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
2193}
2194
2195int /* error */
2196xfs_bmbt_lookup_ge(
2197 xfs_btree_cur_t *cur,
2198 xfs_fileoff_t off,
2199 xfs_fsblock_t bno,
2200 xfs_filblks_t len,
2201 int *stat) /* success/failure */
2202{
2203 cur->bc_rec.b.br_startoff = off;
2204 cur->bc_rec.b.br_startblock = bno;
2205 cur->bc_rec.b.br_blockcount = len;
2206 return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
2207}
2208
2209/*
2210 * Give the bmap btree a new root block. Copy the old broot contents
2211 * down into a real block and make the broot point to it.
2212 */
2213int /* error */
2214xfs_bmbt_newroot(
2215 xfs_btree_cur_t *cur, /* btree cursor */
2216 int *logflags, /* logging flags for inode */
2217 int *stat) /* return status - 0 fail */
2218{
2219 xfs_alloc_arg_t args; /* allocation arguments */
2220 xfs_bmbt_block_t *block; /* bmap btree block */
2221 xfs_buf_t *bp; /* buffer for block */
2222 xfs_bmbt_block_t *cblock; /* child btree block */
2223 xfs_bmbt_key_t *ckp; /* child key pointer */
2224 xfs_bmbt_ptr_t *cpp; /* child ptr pointer */
2225 int error; /* error return code */
2226#ifdef DEBUG
2227 int i; /* loop counter */
2228#endif
2229 xfs_bmbt_key_t *kp; /* pointer to bmap btree key */
2230 int level; /* btree level */
2231 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
2232
2233 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2234 level = cur->bc_nlevels - 1;
2235 block = xfs_bmbt_get_block(cur, level, &bp);
2236 /*
2237 * Copy the root into a real block.
2238 */
2239 args.mp = cur->bc_mp;
2240 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
2241 args.tp = cur->bc_tp;
2242 args.fsbno = cur->bc_private.b.firstblock;
2243 args.mod = args.minleft = args.alignment = args.total = args.isfl =
2244 args.userdata = args.minalignslop = 0;
2245 args.minlen = args.maxlen = args.prod = 1;
2246 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
2247 args.firstblock = args.fsbno;
2248 if (args.fsbno == NULLFSBLOCK) {
2249#ifdef DEBUG
2250 if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
2251 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2252 return error;
2253 }
2254#endif
2255 args.fsbno = be64_to_cpu(*pp);
2256 args.type = XFS_ALLOCTYPE_START_BNO;
2257 } else if (cur->bc_private.b.flist->xbf_low)
2258 args.type = XFS_ALLOCTYPE_START_BNO;
2259 else
2260 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2261 if ((error = xfs_alloc_vextent(&args))) {
2262 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2263 return error;
2264 }
2265 if (args.fsbno == NULLFSBLOCK) {
2266 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2267 *stat = 0;
2268 return 0;
2269 }
2270 ASSERT(args.len == 1);
2271 cur->bc_private.b.firstblock = args.fsbno;
2272 cur->bc_private.b.allocated++;
2273 cur->bc_private.b.ip->i_d.di_nblocks++;
2274 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
2275 XFS_TRANS_DQ_BCOUNT, 1L);
2276 bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
2277 cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
2278 *cblock = *block;
2279 be16_add_cpu(&block->bb_level, 1);
2280 block->bb_numrecs = cpu_to_be16(1);
2281 cur->bc_nlevels++;
2282 cur->bc_ptrs[level + 1] = 1;
2283 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
2284 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
2285 memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp));
2286 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
2287#ifdef DEBUG
2288 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2289 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
2290 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2291 return error;
2292 }
2293 }
2294#endif
2295 memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
2296#ifdef DEBUG
2297 if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
2298 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2299 return error;
2300 }
2301#endif
2302 *pp = cpu_to_be64(args.fsbno);
2303 xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
2304 cur->bc_private.b.whichfork);
2305 xfs_btree_setbuf(cur, level, bp);
2306 /*
2307 * Do all this logging at the end so that
2308 * the root is at the right level.
2309 */
2310 xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
2311 xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
2312 xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
2313 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2314 *logflags |=
2315 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
2316 *stat = 1;
2317 return 0;
2318}
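
The transformation performed by xfs_bmbt_newroot() can be pictured as follows (schematic only):

	/*
	 * before:   inode-fork root [ k1 k2 ... kN ]            level L
	 *
	 * after:    inode-fork root [ k1 ]                      level L+1
	 *                             |
	 *           new disk block  [ k1 k2 ... kN ]            level L
	 *
	 * The old in-inode root moves wholesale into a freshly allocated
	 * block, the root shrinks to a single key/pointer pair (the
	 * xfs_iroot_realloc() call), and the tree gains one level at the top.
	 */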
2319/236 
2320/237 /*
2321/238  * Set all the fields in a bmap extent record from the arguments.
@@ -2512,7 +429,8 @@ xfs_bmbt_set_state(
2512/429  */
2513/430 void
2514/431 xfs_bmbt_to_bmdr(
-2515 	xfs_bmbt_block_t *rblock,
+432 	struct xfs_mount *mp,
+433 	struct xfs_btree_block *rblock,
2516/434 	int rblocklen,
2517/435 	xfs_bmdr_block_t *dblock,
2518/436 	int dblocklen)
@@ -2524,67 +442,22 @@ xfs_bmbt_to_bmdr(
2524/442 	__be64 *tpp;
2525/443 
2526/444 	ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
-2527 	ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO);
+445 	ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO);
-2528 	ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO);
+446 	ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO);
2529/447 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
2530/448 	dblock->bb_level = rblock->bb_level;
2531/449 	dblock->bb_numrecs = rblock->bb_numrecs;
-2532 	dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
+450 	dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
-2533 	fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
+451 	fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-2534 	tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
+452 	tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-2535 	fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+453 	fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
-2536 	tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
+454 	tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
2537/455 	dmxr = be16_to_cpu(dblock->bb_numrecs);
2538/456 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
2539/457 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
2540/458 }
2541/459 
2542/460 /*
2543 * Update the record to the passed values.
2544 */
2545int
2546xfs_bmbt_update(
2547 xfs_btree_cur_t *cur,
2548 xfs_fileoff_t off,
2549 xfs_fsblock_t bno,
2550 xfs_filblks_t len,
2551 xfs_exntst_t state)
2552{
2553 xfs_bmbt_block_t *block;
2554 xfs_buf_t *bp;
2555 int error;
2556 xfs_bmbt_key_t key;
2557 int ptr;
2558 xfs_bmbt_rec_t *rp;
2559
2560 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2561 XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
2562 (xfs_dfilblks_t)len, (int)state);
2563 block = xfs_bmbt_get_block(cur, 0, &bp);
2564#ifdef DEBUG
2565 if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
2566 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2567 return error;
2568 }
2569#endif
2570 ptr = cur->bc_ptrs[0];
2571 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
2572 xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
2573 xfs_bmbt_log_recs(cur, bp, ptr, ptr);
2574 if (ptr > 1) {
2575 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2576 return 0;
2577 }
2578 key.br_startoff = cpu_to_be64(off);
2579 if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
2580 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2581 return error;
2582 }
2583 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2584 return 0;
2585}
2586
2587/*
2588/461  * Check extent records, which have just been read, for
2589/462  * any bit in the extent flag field. ASSERT on debug
2590/463  * kernels, as this condition should not occur.
@@ -2608,3 +481,451 @@ xfs_check_nostate_extents(
2608/481 	}
2609/482 	return 0;
2610/483 }
484
485
486STATIC struct xfs_btree_cur *
487xfs_bmbt_dup_cursor(
488 struct xfs_btree_cur *cur)
489{
490 struct xfs_btree_cur *new;
491
492 new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
493 cur->bc_private.b.ip, cur->bc_private.b.whichfork);
494
495 /*
496 * Copy the firstblock, flist, and flags values,
497 * since init cursor doesn't get them.
498 */
499 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
500 new->bc_private.b.flist = cur->bc_private.b.flist;
501 new->bc_private.b.flags = cur->bc_private.b.flags;
502
503 return new;
504}
505
506STATIC void
507xfs_bmbt_update_cursor(
508 struct xfs_btree_cur *src,
509 struct xfs_btree_cur *dst)
510{
511 ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
512 (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
513 ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
514
515 dst->bc_private.b.allocated += src->bc_private.b.allocated;
516 dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
517
518 src->bc_private.b.allocated = 0;
519}
520
521STATIC int
522xfs_bmbt_alloc_block(
523 struct xfs_btree_cur *cur,
524 union xfs_btree_ptr *start,
525 union xfs_btree_ptr *new,
526 int length,
527 int *stat)
528{
529 xfs_alloc_arg_t args; /* block allocation args */
530 int error; /* error return value */
531
532 memset(&args, 0, sizeof(args));
533 args.tp = cur->bc_tp;
534 args.mp = cur->bc_mp;
535 args.fsbno = cur->bc_private.b.firstblock;
536 args.firstblock = args.fsbno;
537
538 if (args.fsbno == NULLFSBLOCK) {
539 args.fsbno = be64_to_cpu(start->l);
540 args.type = XFS_ALLOCTYPE_START_BNO;
541 /*
542 * Make sure there is sufficient room left in the AG to
543 * complete a full tree split for an extent insert. If
544 * we are converting the middle part of an extent then
545 * we may need space for two tree splits.
546 *
547 * We are relying on the caller to make the correct block
548 * reservation for this operation to succeed. If the
549 * reservation amount is insufficient then we may fail a
550 * block allocation here and corrupt the filesystem.
551 */
552 args.minleft = xfs_trans_get_block_res(args.tp);
553 } else if (cur->bc_private.b.flist->xbf_low) {
554 args.type = XFS_ALLOCTYPE_START_BNO;
555 } else {
556 args.type = XFS_ALLOCTYPE_NEAR_BNO;
557 }
558
559 args.minlen = args.maxlen = args.prod = 1;
560 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
561 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
562 error = XFS_ERROR(ENOSPC);
563 goto error0;
564 }
565 error = xfs_alloc_vextent(&args);
566 if (error)
567 goto error0;
568
569 if (args.fsbno == NULLFSBLOCK && args.minleft) {
570 /*
571 * Could not find an AG with enough free space to satisfy
572 * a full btree split. Try again without minleft and if
573 * successful activate the lowspace algorithm.
574 */
575 args.fsbno = 0;
576 args.type = XFS_ALLOCTYPE_FIRST_AG;
577 args.minleft = 0;
578 error = xfs_alloc_vextent(&args);
579 if (error)
580 goto error0;
581 cur->bc_private.b.flist->xbf_low = 1;
582 }
583 if (args.fsbno == NULLFSBLOCK) {
584 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
585 *stat = 0;
586 return 0;
587 }
588 ASSERT(args.len == 1);
589 cur->bc_private.b.firstblock = args.fsbno;
590 cur->bc_private.b.allocated++;
591 cur->bc_private.b.ip->i_d.di_nblocks++;
592 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
593 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
594 XFS_TRANS_DQ_BCOUNT, 1L);
595
596 new->l = cpu_to_be64(args.fsbno);
597
598 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
599 *stat = 1;
600 return 0;
601
602 error0:
603 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
604 return error;
605}
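
A brief walk-through of the two-stage allocation policy above (the reservation number is made up for illustration):

	/*
	 * Suppose the transaction reserved 8 blocks.  The first
	 * xfs_alloc_vextent() call then carries minleft = 8, i.e. it only
	 * accepts an AG that still has 8 free blocks beyond the one being
	 * allocated -- enough for the worst-case tree splits the comment
	 * above describes.  Only if no AG qualifies is the call repeated
	 * with minleft = 0 and XFS_ALLOCTYPE_FIRST_AG, and xbf_low is set
	 * so the rest of the operation stays in low-space mode.
	 */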
606
607STATIC int
608xfs_bmbt_free_block(
609 struct xfs_btree_cur *cur,
610 struct xfs_buf *bp)
611{
612 struct xfs_mount *mp = cur->bc_mp;
613 struct xfs_inode *ip = cur->bc_private.b.ip;
614 struct xfs_trans *tp = cur->bc_tp;
615 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
616
617 xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
618 ip->i_d.di_nblocks--;
619
620 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
621 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
622 xfs_trans_binval(tp, bp);
623 return 0;
624}
625
626STATIC int
627xfs_bmbt_get_minrecs(
628 struct xfs_btree_cur *cur,
629 int level)
630{
631 if (level == cur->bc_nlevels - 1) {
632 struct xfs_ifork *ifp;
633
634 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
635 cur->bc_private.b.whichfork);
636
637 return xfs_bmbt_maxrecs(cur->bc_mp,
638 ifp->if_broot_bytes, level == 0) / 2;
639 }
640
641 return cur->bc_mp->m_bmap_dmnr[level != 0];
642}
643
644int
645xfs_bmbt_get_maxrecs(
646 struct xfs_btree_cur *cur,
647 int level)
648{
649 if (level == cur->bc_nlevels - 1) {
650 struct xfs_ifork *ifp;
651
652 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
653 cur->bc_private.b.whichfork);
654
655 return xfs_bmbt_maxrecs(cur->bc_mp,
656 ifp->if_broot_bytes, level == 0);
657 }
658
659 return cur->bc_mp->m_bmap_dmxr[level != 0];
660
661}
662
663/*
664 * Get the maximum records we could store in the on-disk format.
665 *
666 * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
667 * for the root node this checks the available space in the dinode fork
668 * so that we can resize the in-memory buffer to match it. After a
669 * resize to the maximum size this function returns the same value
670 * as xfs_bmbt_get_maxrecs for the root node, too.
671 */
672STATIC int
673xfs_bmbt_get_dmaxrecs(
674 struct xfs_btree_cur *cur,
675 int level)
676{
677 if (level != cur->bc_nlevels - 1)
678 return cur->bc_mp->m_bmap_dmxr[level != 0];
679 return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
680 level == 0);
681}
682
683STATIC void
684xfs_bmbt_init_key_from_rec(
685 union xfs_btree_key *key,
686 union xfs_btree_rec *rec)
687{
688 key->bmbt.br_startoff =
689 cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
690}
691
692STATIC void
693xfs_bmbt_init_rec_from_key(
694 union xfs_btree_key *key,
695 union xfs_btree_rec *rec)
696{
697 ASSERT(key->bmbt.br_startoff != 0);
698
699 xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
700 0, 0, XFS_EXT_NORM);
701}
702
703STATIC void
704xfs_bmbt_init_rec_from_cur(
705 struct xfs_btree_cur *cur,
706 union xfs_btree_rec *rec)
707{
708 xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
709}
710
711STATIC void
712xfs_bmbt_init_ptr_from_cur(
713 struct xfs_btree_cur *cur,
714 union xfs_btree_ptr *ptr)
715{
716 ptr->l = 0;
717}
718
719STATIC __int64_t
720xfs_bmbt_key_diff(
721 struct xfs_btree_cur *cur,
722 union xfs_btree_key *key)
723{
724 return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
725 cur->bc_rec.b.br_startoff;
726}
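
The sign convention here is all the generic lookup code relies on; for example (hypothetical values):

	/*
	 * If the cursor is seeking br_startoff == 100:
	 *   a key of 120 yields +20  (key is beyond the search value)
	 *   a key of 100 yields   0  (exact match)
	 *   a key of  80 yields -20  (key is before the search value)
	 */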
727
728#ifdef DEBUG
729STATIC int
730xfs_bmbt_keys_inorder(
731 struct xfs_btree_cur *cur,
732 union xfs_btree_key *k1,
733 union xfs_btree_key *k2)
734{
735 return be64_to_cpu(k1->bmbt.br_startoff) <
736 be64_to_cpu(k2->bmbt.br_startoff);
737}
738
739STATIC int
740xfs_bmbt_recs_inorder(
741 struct xfs_btree_cur *cur,
742 union xfs_btree_rec *r1,
743 union xfs_btree_rec *r2)
744{
745 return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
746 xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
747 xfs_bmbt_disk_get_startoff(&r2->bmbt);
748}
749#endif /* DEBUG */
750
751#ifdef XFS_BTREE_TRACE
752ktrace_t *xfs_bmbt_trace_buf;
753
754STATIC void
755xfs_bmbt_trace_enter(
756 struct xfs_btree_cur *cur,
757 const char *func,
758 char *s,
759 int type,
760 int line,
761 __psunsigned_t a0,
762 __psunsigned_t a1,
763 __psunsigned_t a2,
764 __psunsigned_t a3,
765 __psunsigned_t a4,
766 __psunsigned_t a5,
767 __psunsigned_t a6,
768 __psunsigned_t a7,
769 __psunsigned_t a8,
770 __psunsigned_t a9,
771 __psunsigned_t a10)
772{
773 struct xfs_inode *ip = cur->bc_private.b.ip;
774 int whichfork = cur->bc_private.b.whichfork;
775
776 ktrace_enter(xfs_bmbt_trace_buf,
777 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
778 (void *)func, (void *)s, (void *)ip, (void *)cur,
779 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
780 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
781 (void *)a8, (void *)a9, (void *)a10);
782 ktrace_enter(ip->i_btrace,
783 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
784 (void *)func, (void *)s, (void *)ip, (void *)cur,
785 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
786 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
787 (void *)a8, (void *)a9, (void *)a10);
788}
789
790STATIC void
791xfs_bmbt_trace_cursor(
792 struct xfs_btree_cur *cur,
793 __uint32_t *s0,
794 __uint64_t *l0,
795 __uint64_t *l1)
796{
797 struct xfs_bmbt_rec_host r;
798
799 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
800
801 *s0 = (cur->bc_nlevels << 24) |
802 (cur->bc_private.b.flags << 16) |
803 cur->bc_private.b.allocated;
804 *l0 = r.l0;
805 *l1 = r.l1;
806}
807
808STATIC void
809xfs_bmbt_trace_key(
810 struct xfs_btree_cur *cur,
811 union xfs_btree_key *key,
812 __uint64_t *l0,
813 __uint64_t *l1)
814{
815 *l0 = be64_to_cpu(key->bmbt.br_startoff);
816 *l1 = 0;
817}
818
819STATIC void
820xfs_bmbt_trace_record(
821 struct xfs_btree_cur *cur,
822 union xfs_btree_rec *rec,
823 __uint64_t *l0,
824 __uint64_t *l1,
825 __uint64_t *l2)
826{
827 struct xfs_bmbt_irec irec;
828
829 xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
830 *l0 = irec.br_startoff;
831 *l1 = irec.br_startblock;
832 *l2 = irec.br_blockcount;
833}
834#endif /* XFS_BTREE_TRACE */
835
836static const struct xfs_btree_ops xfs_bmbt_ops = {
837 .rec_len = sizeof(xfs_bmbt_rec_t),
838 .key_len = sizeof(xfs_bmbt_key_t),
839
840 .dup_cursor = xfs_bmbt_dup_cursor,
841 .update_cursor = xfs_bmbt_update_cursor,
842 .alloc_block = xfs_bmbt_alloc_block,
843 .free_block = xfs_bmbt_free_block,
844 .get_maxrecs = xfs_bmbt_get_maxrecs,
845 .get_minrecs = xfs_bmbt_get_minrecs,
846 .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
847 .init_key_from_rec = xfs_bmbt_init_key_from_rec,
848 .init_rec_from_key = xfs_bmbt_init_rec_from_key,
849 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
850 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
851 .key_diff = xfs_bmbt_key_diff,
852
853#ifdef DEBUG
854 .keys_inorder = xfs_bmbt_keys_inorder,
855 .recs_inorder = xfs_bmbt_recs_inorder,
856#endif
857
858#ifdef XFS_BTREE_TRACE
859 .trace_enter = xfs_bmbt_trace_enter,
860 .trace_cursor = xfs_bmbt_trace_cursor,
861 .trace_key = xfs_bmbt_trace_key,
862 .trace_record = xfs_bmbt_trace_record,
863#endif
864};
865
866/*
867 * Allocate a new bmap btree cursor.
868 */
869struct xfs_btree_cur * /* new bmap btree cursor */
870xfs_bmbt_init_cursor(
871 struct xfs_mount *mp, /* file system mount point */
872 struct xfs_trans *tp, /* transaction pointer */
873 struct xfs_inode *ip, /* inode owning the btree */
874 int whichfork) /* data or attr fork */
875{
876 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
877 struct xfs_btree_cur *cur;
878
879 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
880
881 cur->bc_tp = tp;
882 cur->bc_mp = mp;
883 cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
884 cur->bc_btnum = XFS_BTNUM_BMAP;
885 cur->bc_blocklog = mp->m_sb.sb_blocklog;
886
887 cur->bc_ops = &xfs_bmbt_ops;
888 cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
889
890 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
891 cur->bc_private.b.ip = ip;
892 cur->bc_private.b.firstblock = NULLFSBLOCK;
893 cur->bc_private.b.flist = NULL;
894 cur->bc_private.b.allocated = 0;
895 cur->bc_private.b.flags = 0;
896 cur->bc_private.b.whichfork = whichfork;
897
898 return cur;
899}
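
A hypothetical call sequence (error handling abbreviated) showing how the cursor is meant to be used: the generic btree entry points such as xfs_btree_lookup() do the tree walking through the xfs_bmbt_ops table wired up above.

	int			error, stat;
	struct xfs_btree_cur	*cur;

	cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
	cur->bc_rec.b.br_startoff = off;	/* record to look for */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);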
900
901/*
902 * Calculate number of records in a bmap btree block.
903 */
904int
905xfs_bmbt_maxrecs(
906 struct xfs_mount *mp,
907 int blocklen,
908 int leaf)
909{
910 blocklen -= XFS_BMBT_BLOCK_LEN(mp);
911
912 if (leaf)
913 return blocklen / sizeof(xfs_bmbt_rec_t);
914 return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
915}
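
Worked numbers (assuming a 4096-byte block and a 24-byte long-form header: magic, level, numrecs and two 64-bit sibling pointers):

	/*
	 * leaf:  (4096 - 24) / sizeof(xfs_bmbt_rec_t)     = 4072 / 16      = 254
	 * node:  (4096 - 24) / (key size + pointer size)  = 4072 / (8 + 8) = 254
	 *
	 * A leaf record is two __be64 words, and an interior entry is one
	 * __be64 key plus one __be64 pointer, so both cases cost 16 bytes.
	 */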
916
917/*
918 * Calculate number of records in a bmap btree inode root.
919 */
920int
921xfs_bmdr_maxrecs(
922 struct xfs_mount *mp,
923 int blocklen,
924 int leaf)
925{
926 blocklen -= sizeof(xfs_bmdr_block_t);
927
928 if (leaf)
929 return blocklen / sizeof(xfs_bmdr_rec_t);
930 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
931}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cd0d4b4bb816..a4555abb6622 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -21,9 +21,10 @@
21/21 #define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
22/22 
23/23 struct xfs_btree_cur;
-24 struct xfs_btree_lblock;
+24 struct xfs_btree_block;
25/25 struct xfs_mount;
26/26 struct xfs_inode;
+27 struct xfs_trans;
27/28 
28/29 /*
29/30  * Bmap root header, on-disk form only.
@@ -145,71 +146,60 @@ typedef struct xfs_bmbt_key {
145/146 /* btree pointer type */
146/147 typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
147/148 
-148 /* btree block header type */
-149 typedef struct xfs_btree_lblock xfs_bmbt_block_t;
-150 
-151 #define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp))
-152 
-153 #define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
-154 #define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
-155 	((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
-156 	(cur)->bc_private.b.whichfork)->if_broot_bytes)
-157 
-158 #define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
-159 	(((lev) == (cur)->bc_nlevels - 1 ? \
-160 	XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
-161 	xfs_bmdr, (lev) == 0) : \
-162 	((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-163 #define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
-164 	(((lev) == (cur)->bc_nlevels - 1 ? \
-165 	XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
-166 	xfs_bmbt, (lev) == 0) : \
-167 	((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-168 
-169 #define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
-170 	(((lev) == (cur)->bc_nlevels - 1 ? \
-171 	XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\
-172 	xfs_bmdr, (lev) == 0) : \
-173 	((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-174 #define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
-175 	(((lev) == (cur)->bc_nlevels - 1 ? \
-176 	XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
-177 	xfs_bmbt, (lev) == 0) : \
-178 	((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-179 
-180 #define XFS_BMAP_REC_DADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-181 
-182 #define XFS_BMAP_REC_IADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-183 
-184 #define XFS_BMAP_KEY_DADDR(bb,i,cur) \
-185 	(XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-186 
-187 #define XFS_BMAP_KEY_IADDR(bb,i,cur) \
-188 	(XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-189 
-190 #define XFS_BMAP_PTR_DADDR(bb,i,cur) \
-191 	(XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
-192 	be16_to_cpu((bb)->bb_level), cur)))
-193 #define XFS_BMAP_PTR_IADDR(bb,i,cur) \
-194 	(XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
-195 	be16_to_cpu((bb)->bb_level), cur)))
+149 /*
+150  * Btree block header size depends on a superblock flag.
+151  *
+152  * (not quite yet, but soon)
+153  */
+154 #define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
+155 
+156 #define XFS_BMBT_REC_ADDR(mp, block, index) \
+157 	((xfs_bmbt_rec_t *) \
+158 	((char *)(block) + \
+159 	XFS_BMBT_BLOCK_LEN(mp) + \
+160 	((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+161 
+162 #define XFS_BMBT_KEY_ADDR(mp, block, index) \
+163 	((xfs_bmbt_key_t *) \
+164 	((char *)(block) + \
+165 	XFS_BMBT_BLOCK_LEN(mp) + \
+166 	((index) - 1) * sizeof(xfs_bmbt_key_t)))
+167 
+168 #define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+169 	((xfs_bmbt_ptr_t *) \
+170 	((char *)(block) + \
+171 	XFS_BMBT_BLOCK_LEN(mp) + \
+172 	(maxrecs) * sizeof(xfs_bmbt_key_t) + \
+173 	((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+174 
+175 #define XFS_BMDR_REC_ADDR(block, index) \
+176 	((xfs_bmdr_rec_t *) \
+177 	((char *)(block) + \
+178 	sizeof(struct xfs_bmdr_block) + \
+179 	((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+180 
+181 #define XFS_BMDR_KEY_ADDR(block, index) \
+182 	((xfs_bmdr_key_t *) \
+183 	((char *)(block) + \
+184 	sizeof(struct xfs_bmdr_block) + \
+185 	((index) - 1) * sizeof(xfs_bmdr_key_t)))
+186 
+187 #define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+188 	((xfs_bmdr_ptr_t *) \
+189 	((char *)(block) + \
+190 	sizeof(struct xfs_bmdr_block) + \
+191 	(maxrecs) * sizeof(xfs_bmdr_key_t) + \
+192 	((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
196/193 
197/194 /*
198/195  * These are to be used when we know the size of the block and
199/196  * we don't have a cursor.
200/197  */
-201 #define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
-202 	(XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i))
-203 #define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
-204 	(XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i))
-205 #define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
-206 	(XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
-207 
-208 #define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs)
-209 #define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+198 #define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+199 	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
210/200 
211/201 #define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
-212 	(int)(sizeof(xfs_bmbt_block_t) + \
+202 	(int)(XFS_BTREE_LBLOCK_LEN + \
213/203 	((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
214/204 
215/205 #define XFS_BMAP_BROOT_SPACE(bb) \
@@ -223,42 +213,12 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t;
223/213  */
224/214 #define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
225/215 
226#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
227 (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \
228 be16_to_cpu((bb)->bb_level) == level && \
229 be16_to_cpu((bb)->bb_numrecs) > 0 && \
230 be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0])
231
232
233#ifdef __KERNEL__
234
235#if defined(XFS_BMBT_TRACE)
236/*
237 * Trace buffer entry types.
238 */
239#define XFS_BMBT_KTRACE_ARGBI 1
240#define XFS_BMBT_KTRACE_ARGBII 2
241#define XFS_BMBT_KTRACE_ARGFFFI 3
242#define XFS_BMBT_KTRACE_ARGI 4
243#define XFS_BMBT_KTRACE_ARGIFK 5
244#define XFS_BMBT_KTRACE_ARGIFR 6
245#define XFS_BMBT_KTRACE_ARGIK 7
246#define XFS_BMBT_KTRACE_CUR 8
247
248#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
249#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
250extern ktrace_t *xfs_bmbt_trace_buf;
251#endif
252
253/216 /*
254/217  * Prototypes for xfs_bmap.c to call.
255/218  */
-256 extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int);
-257 extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *);
-258 extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *);
+219 extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+220 		struct xfs_btree_block *, int);
259/221 extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-260 extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur,
-261 		int, struct xfs_buf **bpp);
262/222 extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
263/223 extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
264/224 extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
@@ -268,22 +228,6 @@ extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
268/228 extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
269/229 extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
270/230 
271extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *);
272extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *);
273extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
274extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int,
275 int);
276extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t,
277 xfs_fsblock_t, xfs_filblks_t, int *);
278extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t,
279 xfs_fsblock_t, xfs_filblks_t, int *);
280
281/*
282 * Give the bmap btree a new root block. Copy the old broot contents
283 * down into a real block and make the broot point to it.
284 */
285extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat);
286
287/231 extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
288/232 extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
289/233 	xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
@@ -296,10 +240,15 @@ extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
296/240 extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
297/241 	xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
298/242 
-299 extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int);
-300 extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t,
-301 	xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t);
+243 extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+244 	xfs_bmdr_block_t *, int);
+245 
+246 extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+247 extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+248 extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+249 
+250 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+251 	struct xfs_trans *, struct xfs_inode *, int);
302/252 
303#endif /* __KERNEL__ */
304 253
305/254 #endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cc593a84c345..7ed59267420d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -34,7 +34,9 @@
34#include "xfs_attr_sf.h" 34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
37#include "xfs_btree.h" 38#include "xfs_btree.h"
39#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
39#include "xfs_error.h" 41#include "xfs_error.h"
40 42
@@ -50,135 +52,33 @@ const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
 	XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
 };
 
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int				/* number of records fitting in block */
-xfs_btree_maxrecs(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_block_t	*block)	/* generic btree block pointer */
-{
-	switch (cur->bc_btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		return (int)XFS_ALLOC_BLOCK_MAXRECS(
-				be16_to_cpu(block->bb_h.bb_level), cur);
-	case XFS_BTNUM_BMAP:
-		return (int)XFS_BMAP_BLOCK_IMAXRECS(
-				be16_to_cpu(block->bb_h.bb_level), cur);
-	case XFS_BTNUM_INO:
-		return (int)XFS_INOBT_BLOCK_MAXRECS(
-				be16_to_cpu(block->bb_h.bb_level), cur);
-	default:
-		ASSERT(0);
-		return 0;
-	}
-}
-
-/*
- * External routines.
- */
-
-#ifdef DEBUG
-/*
- * Debug routine: check that block header is ok.
- */
-void
-xfs_btree_check_block(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_block_t	*block,	/* generic btree block pointer */
-	int			level,	/* level of the btree block */
-	xfs_buf_t		*bp)	/* buffer containing block, if any */
-{
-	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
-		xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
-			bp);
-	else
-		xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
-			bp);
-}
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
-	xfs_btnum_t	btnum,		/* btree identifier */
-	void		*ak1,		/* pointer to left (lower) key */
-	void		*ak2)		/* pointer to right (higher) key */
-{
-	switch (btnum) {
-	case XFS_BTNUM_BNO: {
-		xfs_alloc_key_t	*k1;
-		xfs_alloc_key_t	*k2;
-
-		k1 = ak1;
-		k2 = ak2;
-		ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock));
-		break;
-	}
-	case XFS_BTNUM_CNT: {
-		xfs_alloc_key_t	*k1;
-		xfs_alloc_key_t	*k2;
-
-		k1 = ak1;
-		k2 = ak2;
-		ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) ||
-		       (k1->ar_blockcount == k2->ar_blockcount &&
-			be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)));
-		break;
-	}
-	case XFS_BTNUM_BMAP: {
-		xfs_bmbt_key_t	*k1;
-		xfs_bmbt_key_t	*k2;
-
-		k1 = ak1;
-		k2 = ak2;
-		ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
-		break;
-	}
-	case XFS_BTNUM_INO: {
-		xfs_inobt_key_t	*k1;
-		xfs_inobt_key_t	*k2;
-
-		k1 = ak1;
-		k2 = ak2;
-		ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
-		break;
-	}
-	default:
-		ASSERT(0);
-	}
-}
-#endif	/* DEBUG */
 
-/*
- * Checking routine: check that long form block header is ok.
- */
-/* ARGSUSED */
-int					/* error (0 or EFSCORRUPTED) */
+STATIC int				/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lblock(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_lblock_t	*block,	/* btree long form block pointer */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* btree long form block pointer */
 	int			level,	/* level of the btree block */
-	xfs_buf_t		*bp)	/* buffer for block, if any */
+	struct xfs_buf		*bp)	/* buffer for block, if any */
 {
 	int			lblock_ok; /* block passes checks */
-	xfs_mount_t		*mp;	/* file system mount point */
+	struct xfs_mount	*mp;	/* file system mount point */
 
 	mp = cur->bc_mp;
 	lblock_ok =
 		be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
-			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
-		block->bb_leftsib &&
-		(be64_to_cpu(block->bb_leftsib) == NULLDFSBNO ||
-		 XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) &&
-		block->bb_rightsib &&
-		(be64_to_cpu(block->bb_rightsib) == NULLDFSBNO ||
-		 XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib)));
-	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+			cur->bc_ops->get_maxrecs(cur, level) &&
+		block->bb_u.l.bb_leftsib &&
+		(be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+		block->bb_u.l.bb_rightsib &&
+		(be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_rightsib)));
+	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
 			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
 		if (bp)
 			xfs_buftrace("LBTREE ERROR", bp);
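
The rewritten check above validates a long-form header through the cursor's get_maxrecs callback instead of the removed per-type switch: magic, level, record count and both sibling pointers must all be plausible. A self-contained model of the same sanity test, under made-up names and a made-up sentinel (illustrative only, not kernel code):

/*
 * Illustrative sketch only. A block is accepted when its magic, level,
 * record count and sibling pointers all pass; NULLSIB and all struct
 * names here are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define NULLSIB		((uint64_t)-1)	/* "no sibling" sentinel */

struct toy_block {
	uint32_t	magic;
	uint16_t	level;
	uint16_t	numrecs;
	uint64_t	leftsib;
	uint64_t	rightsib;
};

static int sib_ok(uint64_t sib, uint64_t nblocks)
{
	/* a sibling is either the sentinel or a valid block address */
	return sib == NULLSIB || sib < nblocks;
}

static int toy_check_block(const struct toy_block *b, uint32_t magic,
			   int level, int maxrecs, uint64_t nblocks)
{
	return b->magic == magic &&
	       b->level == level &&
	       b->numrecs <= maxrecs &&
	       sib_ok(b->leftsib, nblocks) &&
	       sib_ok(b->rightsib, nblocks);
}

int main(void)
{
	struct toy_block b = { 0x424d4150, 1, 100, NULLSIB, 42 };

	printf("ok=%d\n", toy_check_block(&b, 0x424d4150, 1, 256, 1024));
	return 0;
}
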
@@ -189,98 +89,15 @@ xfs_btree_check_lblock(
 	return 0;
 }
 
-/*
- * Checking routine: check that (long) pointer is ok.
- */
-int					/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	xfs_dfsbno_t	ptr,		/* btree block disk address */
-	int		level)		/* btree block level */
-{
-	xfs_mount_t	*mp;		/* file system mount point */
-
-	mp = cur->bc_mp;
-	XFS_WANT_CORRUPTED_RETURN(
-		level > 0 &&
-		ptr != NULLDFSBNO &&
-		XFS_FSB_SANITY_CHECK(mp, ptr));
-	return 0;
-}
-
-#ifdef DEBUG
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
-	xfs_btnum_t	btnum,		/* btree identifier */
-	void		*ar1,		/* pointer to left (lower) record */
-	void		*ar2)		/* pointer to right (higher) record */
-{
-	switch (btnum) {
-	case XFS_BTNUM_BNO: {
-		xfs_alloc_rec_t	*r1;
-		xfs_alloc_rec_t	*r2;
-
-		r1 = ar1;
-		r2 = ar2;
-		ASSERT(be32_to_cpu(r1->ar_startblock) +
-		       be32_to_cpu(r1->ar_blockcount) <=
-		       be32_to_cpu(r2->ar_startblock));
-		break;
-	}
-	case XFS_BTNUM_CNT: {
-		xfs_alloc_rec_t	*r1;
-		xfs_alloc_rec_t	*r2;
-
-		r1 = ar1;
-		r2 = ar2;
-		ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) ||
-		       (r1->ar_blockcount == r2->ar_blockcount &&
-			be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock)));
-		break;
-	}
-	case XFS_BTNUM_BMAP: {
-		xfs_bmbt_rec_t	*r1;
-		xfs_bmbt_rec_t	*r2;
-
-		r1 = ar1;
-		r2 = ar2;
-		ASSERT(xfs_bmbt_disk_get_startoff(r1) +
-		       xfs_bmbt_disk_get_blockcount(r1) <=
-		       xfs_bmbt_disk_get_startoff(r2));
-		break;
-	}
-	case XFS_BTNUM_INO: {
-		xfs_inobt_rec_t	*r1;
-		xfs_inobt_rec_t	*r2;
-
-		r1 = ar1;
-		r2 = ar2;
-		ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
-		       be32_to_cpu(r2->ir_startino));
-		break;
-	}
-	default:
-		ASSERT(0);
-	}
-}
-#endif	/* DEBUG */
-
-/*
- * Checking routine: check that block header is ok.
- */
-/* ARGSUSED */
-int					/* error (0 or EFSCORRUPTED) */
+STATIC int				/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_sblock(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_sblock_t	*block,	/* btree short form block pointer */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* btree short form block pointer */
 	int			level,	/* level of the btree block */
-	xfs_buf_t		*bp)	/* buffer containing block */
+	struct xfs_buf		*bp)	/* buffer containing block */
 {
-	xfs_buf_t		*agbp;	/* buffer for ag. freespace struct */
-	xfs_agf_t		*agf;	/* ag. freespace structure */
+	struct xfs_buf		*agbp;	/* buffer for ag. freespace struct */
+	struct xfs_agf		*agf;	/* ag. freespace structure */
 	xfs_agblock_t		agflen; /* native ag. freespace length */
 	int			sblock_ok; /* block passes checks */
 
@@ -291,13 +108,13 @@ xfs_btree_check_sblock(
 		be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
-			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
-		(be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK ||
-		 be32_to_cpu(block->bb_leftsib) < agflen) &&
-		block->bb_leftsib &&
-		(be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK ||
-		 be32_to_cpu(block->bb_rightsib) < agflen) &&
-		block->bb_rightsib;
+			cur->bc_ops->get_maxrecs(cur, level) &&
+		(be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+		block->bb_u.s.bb_leftsib &&
+		(be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+		block->bb_u.s.bb_rightsib;
 	if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
 			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
@@ -311,27 +128,78 @@ xfs_btree_check_sblock(
 }
 
 /*
- * Checking routine: check that (short) pointer is ok.
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer containing block, if any */
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_check_lblock(cur, block, level, bp);
+	else
+		return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
  */
 int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_dfsbno_t		bno,	/* btree block disk address */
+	int			level)	/* btree block level */
+{
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		bno != NULLDFSBNO &&
+		XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_sptr(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	xfs_agblock_t	ptr,		/* btree block disk address */
-	int		level)		/* btree block level */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* btree block disk address */
+	int			level)	/* btree block level */
 {
-	xfs_buf_t	*agbp;		/* buffer for ag. freespace struct */
-	xfs_agf_t	*agf;		/* ag. freespace structure */
+	xfs_agblock_t		agblocks = cur->bc_mp->m_sb.sb_agblocks;
 
-	agbp = cur->bc_private.a.agbp;
-	agf = XFS_BUF_TO_AGF(agbp);
 	XFS_WANT_CORRUPTED_RETURN(
 		level > 0 &&
-		ptr != NULLAGBLOCK && ptr != 0 &&
-		ptr < be32_to_cpu(agf->agf_length));
+		bno != NULLAGBLOCK &&
+		bno != 0 &&
+		bno < agblocks);
 	return 0;
 }
 
 /*
+ * Check that block ptr is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	union xfs_btree_ptr	*ptr,	/* btree block disk address */
+	int			index,	/* offset from ptr to check */
+	int			level)	/* btree block level */
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		return xfs_btree_check_lptr(cur,
+				be64_to_cpu((&ptr->l)[index]), level);
+	} else {
+		return xfs_btree_check_sptr(cur,
+				be32_to_cpu((&ptr->s)[index]), level);
+	}
+}
+#endif
+
+/*
  * Delete the btree cursor.
  */
 void
@@ -387,16 +255,17 @@ xfs_btree_dup_cursor(
 
 	tp = cur->bc_tp;
 	mp = cur->bc_mp;
+
 	/*
 	 * Allocate a new cursor like the old one.
 	 */
-	new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
-		cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
-		cur->bc_private.b.whichfork);
+	new = cur->bc_ops->dup_cursor(cur);
+
 	/*
 	 * Copy the record currently in the cursor.
 	 */
 	new->bc_rec = cur->bc_rec;
+
 	/*
 	 * For each level current, re-get the buffer and copy the ptr value.
 	 */
@@ -416,46 +285,174 @@ xfs_btree_dup_cursor(
 	} else
 		new->bc_bufs[i] = NULL;
 	}
-	/*
-	 * For bmap btrees, copy the firstblock, flist, and flags values,
-	 * since init cursor doesn't get them.
-	 */
-	if (new->bc_btnum == XFS_BTNUM_BMAP) {
-		new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
-		new->bc_private.b.flist = cur->bc_private.b.flist;
-		new->bc_private.b.flags = cur->bc_private.b.flags;
-	}
 	*ncur = new;
 	return 0;
 }
 
 /*
293 * XFS btree block layout and addressing:
294 *
295 * There are two types of blocks in the btree: leaf and non-leaf blocks.
296 *
297 * A leaf block starts with a header followed by records containing
298 * the values. A non-leaf block also starts with the same header, and
299 * then contains lookup keys followed by an equal number of pointers
300 * to the btree blocks at the previous level.
301 *
302 * +--------+-------+-------+-------+-------+-------+-------+
303 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
304 * +--------+-------+-------+-------+-------+-------+-------+
305 *
306 * +--------+-------+-------+-------+-------+-------+-------+
307 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
308 * +--------+-------+-------+-------+-------+-------+-------+
309 *
310 * The header is called struct xfs_btree_block for reasons better left unknown
311 * and comes in different versions for short (32bit) and long (64bit) block
312 * pointers. The record and key structures are defined by the btree instances
313 * and opaque to the btree core. The block pointers are simple disk endian
314 * integers, available in a short (32bit) and long (64bit) variant.
315 *
316 * The helpers below calculate the offset of a given record, key or pointer
317 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
318 * record, key or pointer (xfs_btree_*_addr). Note that all addressing
319 * inside the btree block is done using indices starting at one, not zero!
320 */
321
322/*
323 * Return size of the btree block header for this btree instance.
324 */
325static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
326{
327 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
328 XFS_BTREE_LBLOCK_LEN :
329 XFS_BTREE_SBLOCK_LEN;
330}
331
332/*
333 * Return size of btree block pointers for this btree instance.
334 */
335static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
336{
337 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
338 sizeof(__be64) : sizeof(__be32);
339}
340
341/*
342 * Calculate offset of the n-th record in a btree block.
343 */
344STATIC size_t
345xfs_btree_rec_offset(
346 struct xfs_btree_cur *cur,
347 int n)
348{
349 return xfs_btree_block_len(cur) +
350 (n - 1) * cur->bc_ops->rec_len;
351}
352
353/*
354 * Calculate offset of the n-th key in a btree block.
355 */
356STATIC size_t
357xfs_btree_key_offset(
358 struct xfs_btree_cur *cur,
359 int n)
360{
361 return xfs_btree_block_len(cur) +
362 (n - 1) * cur->bc_ops->key_len;
363}
364
365/*
366 * Calculate offset of the n-th block pointer in a btree block.
367 */
368STATIC size_t
369xfs_btree_ptr_offset(
370 struct xfs_btree_cur *cur,
371 int n,
372 int level)
373{
374 return xfs_btree_block_len(cur) +
375 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
376 (n - 1) * xfs_btree_ptr_len(cur);
377}
378
379/*
380 * Return a pointer to the n-th record in the btree block.
381 */
382STATIC union xfs_btree_rec *
383xfs_btree_rec_addr(
384 struct xfs_btree_cur *cur,
385 int n,
386 struct xfs_btree_block *block)
387{
388 return (union xfs_btree_rec *)
389 ((char *)block + xfs_btree_rec_offset(cur, n));
390}
391
392/*
393 * Return a pointer to the n-th key in the btree block.
394 */
395STATIC union xfs_btree_key *
396xfs_btree_key_addr(
397 struct xfs_btree_cur *cur,
398 int n,
399 struct xfs_btree_block *block)
400{
401 return (union xfs_btree_key *)
402 ((char *)block + xfs_btree_key_offset(cur, n));
403}
404
405/*
406 * Return a pointer to the n-th block pointer in the btree block.
407 */
408STATIC union xfs_btree_ptr *
409xfs_btree_ptr_addr(
410 struct xfs_btree_cur *cur,
411 int n,
412 struct xfs_btree_block *block)
413{
414 int level = xfs_btree_get_level(block);
415
416 ASSERT(block->bb_level != 0);
417
418 return (union xfs_btree_ptr *)
419 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
420}
421
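
To make the 1-based offset arithmetic of the helpers above concrete, here is a toy geometry worked through in plain C. It is a sketch only: the header, key, record and pointer sizes are invented for the example, not taken from the on-disk format.

/*
 * Illustrative sketch only: the rec/key/ptr offset formulas applied to a
 * made-up geometry. Indices are 1-based, and in a non-leaf block all
 * MAXRECS keys precede the block pointers.
 */
#include <stddef.h>
#include <stdio.h>

#define HDR_LEN		24	/* toy header size */
#define KEY_LEN		8
#define REC_LEN		16
#define PTR_LEN		8
#define MAXRECS		4	/* keys (and ptrs) per non-leaf block */

static size_t rec_offset(int n) { return HDR_LEN + (n - 1) * REC_LEN; }
static size_t key_offset(int n) { return HDR_LEN + (n - 1) * KEY_LEN; }
static size_t ptr_offset(int n)
{
	return HDR_LEN + MAXRECS * KEY_LEN + (n - 1) * PTR_LEN;
}

int main(void)
{
	/* rec 1 starts right after the header: offset 24 */
	printf("rec 1 at %zu, rec 2 at %zu\n", rec_offset(1), rec_offset(2));
	/* ptr 1 starts after all 4 keys: 24 + 4*8 = 56 */
	printf("key 3 at %zu, ptr 1 at %zu\n", key_offset(3), ptr_offset(1));
	return 0;
}
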
422/*
423 * Get the root block, which is stored in the inode.
424 *
425 * For now this btree implementation assumes the btree root is always
426 * stored in the if_broot field of an inode fork.
427 */
428STATIC struct xfs_btree_block *
429xfs_btree_get_iroot(
430 struct xfs_btree_cur *cur)
431{
432 struct xfs_ifork *ifp;
433
434 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
435 return (struct xfs_btree_block *)ifp->if_broot;
436}
437
438/*
  * Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
+ * This may be an inode btree root or from a buffer.
  */
-STATIC xfs_btree_block_t *		/* generic btree block pointer */
+STATIC struct xfs_btree_block *		/* generic btree block pointer */
 xfs_btree_get_block(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
 	int			level,	/* level in btree */
-	xfs_buf_t		**bpp)	/* buffer containing the block */
+	struct xfs_buf		**bpp)	/* buffer containing the block */
 {
-	xfs_btree_block_t	*block;	/* return value */
-	xfs_buf_t		*bp;	/* return buffer */
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
-	int			whichfork; /* data or attr fork */
-
-	if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
-		whichfork = cur->bc_private.b.whichfork;
-		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
-		block = (xfs_btree_block_t *)ifp->if_broot;
-		bp = NULL;
-	} else {
-		bp = cur->bc_bufs[level];
-		block = XFS_BUF_TO_BLOCK(bp);
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1)) {
+		*bpp = NULL;
+		return xfs_btree_get_iroot(cur);
 	}
-	ASSERT(block != NULL);
-	*bpp = bp;
-	return block;
+
+	*bpp = cur->bc_bufs[level];
+	return XFS_BUF_TO_BLOCK(*bpp);
 }
 
 /*
@@ -505,97 +502,6 @@ xfs_btree_get_bufs(
 }
 
 /*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B) or inodes (I).
- */
-xfs_btree_cur_t *			/* new btree cursor */
-xfs_btree_init_cursor(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_buf_t	*agbp,		/* (A only) buffer for agf structure */
-					/* (I only) buffer for agi structure */
-	xfs_agnumber_t	agno,		/* (AI only) allocation group number */
-	xfs_btnum_t	btnum,		/* btree identifier */
-	xfs_inode_t	*ip,		/* (B only) inode owning the btree */
-	int		whichfork)	/* (B only) data or attr fork */
-{
-	xfs_agf_t	*agf;		/* (A) allocation group freespace */
-	xfs_agi_t	*agi;		/* (I) allocation group inodespace */
-	xfs_btree_cur_t	*cur;		/* return value */
-	xfs_ifork_t	*ifp;		/* (I) inode fork pointer */
-	int		nlevels=0;	/* number of levels in the btree */
-
-	ASSERT(xfs_btree_cur_zone != NULL);
-	/*
-	 * Allocate a new cursor.
-	 */
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-	/*
-	 * Deduce the number of btree levels from the arguments.
-	 */
-	switch (btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		agf = XFS_BUF_TO_AGF(agbp);
-		nlevels = be32_to_cpu(agf->agf_levels[btnum]);
-		break;
-	case XFS_BTNUM_BMAP:
-		ifp = XFS_IFORK_PTR(ip, whichfork);
-		nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-		break;
-	case XFS_BTNUM_INO:
-		agi = XFS_BUF_TO_AGI(agbp);
-		nlevels = be32_to_cpu(agi->agi_level);
-		break;
-	default:
-		ASSERT(0);
-	}
-	/*
-	 * Fill in the common fields.
-	 */
-	cur->bc_tp = tp;
-	cur->bc_mp = mp;
-	cur->bc_nlevels = nlevels;
-	cur->bc_btnum = btnum;
-	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-	/*
-	 * Fill in private fields.
-	 */
-	switch (btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		/*
-		 * Allocation btree fields.
-		 */
-		cur->bc_private.a.agbp = agbp;
-		cur->bc_private.a.agno = agno;
-		break;
-	case XFS_BTNUM_INO:
-		/*
-		 * Inode allocation btree fields.
-		 */
-		cur->bc_private.a.agbp = agbp;
-		cur->bc_private.a.agno = agno;
-		break;
-	case XFS_BTNUM_BMAP:
-		/*
-		 * Bmap btree fields.
-		 */
-		cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
-		cur->bc_private.b.ip = ip;
-		cur->bc_private.b.firstblock = NULLFSBLOCK;
-		cur->bc_private.b.flist = NULL;
-		cur->bc_private.b.allocated = 0;
-		cur->bc_private.b.flags = 0;
-		cur->bc_private.b.whichfork = whichfork;
-		break;
-	default:
-		ASSERT(0);
-	}
-	return cur;
-}
-
-/*
  * Check for the cursor referring to the last block at the given level.
  */
 int					/* 1=is last block, 0=not last block */
@@ -603,12 +509,12 @@ xfs_btree_islastblock(
 	xfs_btree_cur_t		*cur,	/* btree cursor */
 	int			level)	/* level to check */
 {
-	xfs_btree_block_t	*block;	/* generic btree block pointer */
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
 	xfs_buf_t		*bp;	/* buffer containing block */
 
 	block = xfs_btree_get_block(cur, level, &bp);
 	xfs_btree_check_block(cur, block, level, bp);
-	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 		return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
 	else
 		return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
@@ -618,12 +524,12 @@ xfs_btree_islastblock(
  * Change the cursor to point to the first record at the given level.
  * Other levels are unaffected.
  */
-int					/* success=1, failure=0 */
+STATIC int				/* success=1, failure=0 */
 xfs_btree_firstrec(
 	xfs_btree_cur_t		*cur,	/* btree cursor */
 	int			level)	/* level to change */
 {
-	xfs_btree_block_t	*block;	/* generic btree block pointer */
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
 	xfs_buf_t		*bp;	/* buffer containing block */
 
 	/*
@@ -634,7 +540,7 @@ xfs_btree_firstrec(
 	/*
 	 * It's empty, there is no such record.
 	 */
-	if (!block->bb_h.bb_numrecs)
+	if (!block->bb_numrecs)
 		return 0;
 	/*
 	 * Set the ptr value to 1, that's the first record/key.
@@ -647,12 +553,12 @@ xfs_btree_firstrec(
  * Change the cursor to point to the last record in the current block
  * at the given level.  Other levels are unaffected.
  */
-int					/* success=1, failure=0 */
+STATIC int				/* success=1, failure=0 */
 xfs_btree_lastrec(
 	xfs_btree_cur_t		*cur,	/* btree cursor */
 	int			level)	/* level to change */
 {
-	xfs_btree_block_t	*block;	/* generic btree block pointer */
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
 	xfs_buf_t		*bp;	/* buffer containing block */
 
 	/*
@@ -663,12 +569,12 @@ xfs_btree_lastrec(
 	/*
 	 * It's empty, there is no such record.
 	 */
-	if (!block->bb_h.bb_numrecs)
+	if (!block->bb_numrecs)
 		return 0;
 	/*
 	 * Set the ptr value to numrecs, that's the last record/key.
 	 */
-	cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs);
+	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 	return 1;
 }
 
@@ -817,66 +723,84 @@ xfs_btree_reada_bufs(
 	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 
726STATIC int
727xfs_btree_readahead_lblock(
728 struct xfs_btree_cur *cur,
729 int lr,
730 struct xfs_btree_block *block)
731{
732 int rval = 0;
733 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
734 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
735
736 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
737 xfs_btree_reada_bufl(cur->bc_mp, left, 1);
738 rval++;
739 }
740
741 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
742 xfs_btree_reada_bufl(cur->bc_mp, right, 1);
743 rval++;
744 }
745
746 return rval;
747}
748
749STATIC int
750xfs_btree_readahead_sblock(
751 struct xfs_btree_cur *cur,
752 int lr,
753 struct xfs_btree_block *block)
754{
755 int rval = 0;
756 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
757 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
758
759
760 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
761 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
762 left, 1);
763 rval++;
764 }
765
766 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
767 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
768 right, 1);
769 rval++;
770 }
771
772 return rval;
773}
774
 /*
  * Read-ahead btree blocks, at the given level.
  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
  */
-int
-xfs_btree_readahead_core(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
+STATIC int
+xfs_btree_readahead(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
 	int			lev,	/* level in btree */
 	int			lr)	/* left/right bits */
 {
-	xfs_alloc_block_t	*a;
-	xfs_bmbt_block_t	*b;
-	xfs_inobt_block_t	*i;
-	int			rval = 0;
+	struct xfs_btree_block	*block;
+
+	/*
+	 * No readahead needed if we are at the root level and the
+	 * btree root is stored in the inode.
+	 */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (lev == cur->bc_nlevels - 1))
+		return 0;
+
+	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+		return 0;
 
-	ASSERT(cur->bc_bufs[lev] != NULL);
 	cur->bc_ra[lev] |= lr;
-	switch (cur->bc_btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
-		if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				be32_to_cpu(a->bb_leftsib), 1);
-			rval++;
-		}
-		if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				be32_to_cpu(a->bb_rightsib), 1);
-			rval++;
-		}
-		break;
-	case XFS_BTNUM_BMAP:
-		b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
-		if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) {
-			xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1);
-			rval++;
-		}
-		if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) {
-			xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1);
-			rval++;
-		}
-		break;
-	case XFS_BTNUM_INO:
-		i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
-		if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				be32_to_cpu(i->bb_leftsib), 1);
-			rval++;
-		}
-		if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				be32_to_cpu(i->bb_rightsib), 1);
-			rval++;
-		}
-		break;
-	default:
-		ASSERT(0);
-	}
-	return rval;
+	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_readahead_lblock(cur, lr, block);
+	return xfs_btree_readahead_sblock(cur, lr, block);
 }
 
 /*
@@ -889,7 +813,7 @@ xfs_btree_setbuf(
 	int			lev,	/* level in btree */
 	xfs_buf_t		*bp)	/* new buffer to set */
 {
-	xfs_btree_block_t	*b;	/* btree block */
+	struct xfs_btree_block	*b;	/* btree block */
 	xfs_buf_t		*obp;	/* old buffer pointer */
 
 	obp = cur->bc_bufs[lev];
@@ -900,7 +824,7 @@ xfs_btree_setbuf(
 	if (!bp)
 		return;
 	b = XFS_BUF_TO_BLOCK(bp);
-	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 		if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
 			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
 		if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
@@ -912,3 +836,2855 @@ xfs_btree_setbuf(
 			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
 	}
 }
839
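
Both xfs_btree_readahead() and xfs_btree_setbuf() above keep a small per-level bitmask of readahead already issued, and the test (ra | lr) == ra is true exactly when every requested bit is already set, so duplicate requests are filtered out cheaply. A tiny stand-alone model of that idiom (sketch only; the names and bit values are invented):

/*
 * Illustrative sketch only: per-level readahead bookkeeping.
 * "(*ra | lr) == *ra" holds when nothing new is being requested.
 */
#include <stdio.h>

#define LEFTRA	0x1
#define RIGHTRA	0x2

static int issue_readahead(unsigned char *ra, int lr)
{
	if ((*ra | lr) == *ra)
		return 0;	/* nothing new requested at this level */
	*ra |= lr;
	return 1;		/* caller would start the actual I/O here */
}

int main(void)
{
	unsigned char ra = 0;

	printf("%d\n", issue_readahead(&ra, RIGHTRA));		/* 1: new */
	printf("%d\n", issue_readahead(&ra, RIGHTRA));		/* 0: already done */
	printf("%d\n", issue_readahead(&ra, LEFTRA | RIGHTRA));	/* 1: left is new */
	return 0;
}
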
840STATIC int
841xfs_btree_ptr_is_null(
842 struct xfs_btree_cur *cur,
843 union xfs_btree_ptr *ptr)
844{
845 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
846 return be64_to_cpu(ptr->l) == NULLFSBLOCK;
847 else
848 return be32_to_cpu(ptr->s) == NULLAGBLOCK;
849}
850
851STATIC void
852xfs_btree_set_ptr_null(
853 struct xfs_btree_cur *cur,
854 union xfs_btree_ptr *ptr)
855{
856 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
857 ptr->l = cpu_to_be64(NULLFSBLOCK);
858 else
859 ptr->s = cpu_to_be32(NULLAGBLOCK);
860}
861
862/*
863 * Get/set/init sibling pointers
864 */
865STATIC void
866xfs_btree_get_sibling(
867 struct xfs_btree_cur *cur,
868 struct xfs_btree_block *block,
869 union xfs_btree_ptr *ptr,
870 int lr)
871{
872 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
873
874 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
875 if (lr == XFS_BB_RIGHTSIB)
876 ptr->l = block->bb_u.l.bb_rightsib;
877 else
878 ptr->l = block->bb_u.l.bb_leftsib;
879 } else {
880 if (lr == XFS_BB_RIGHTSIB)
881 ptr->s = block->bb_u.s.bb_rightsib;
882 else
883 ptr->s = block->bb_u.s.bb_leftsib;
884 }
885}
886
887STATIC void
888xfs_btree_set_sibling(
889 struct xfs_btree_cur *cur,
890 struct xfs_btree_block *block,
891 union xfs_btree_ptr *ptr,
892 int lr)
893{
894 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
895
896 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
897 if (lr == XFS_BB_RIGHTSIB)
898 block->bb_u.l.bb_rightsib = ptr->l;
899 else
900 block->bb_u.l.bb_leftsib = ptr->l;
901 } else {
902 if (lr == XFS_BB_RIGHTSIB)
903 block->bb_u.s.bb_rightsib = ptr->s;
904 else
905 block->bb_u.s.bb_leftsib = ptr->s;
906 }
907}
908
909STATIC void
910xfs_btree_init_block(
911 struct xfs_btree_cur *cur,
912 int level,
913 int numrecs,
914 struct xfs_btree_block *new) /* new block */
915{
916 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
917 new->bb_level = cpu_to_be16(level);
918 new->bb_numrecs = cpu_to_be16(numrecs);
919
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
921 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
922 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
923 } else {
924 new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
925 new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
926 }
927}
928
929/*
930 * Return true if ptr is the last record in the btree and
931 * we need to track updates to this record. The decision
932 * will be further refined in the update_lastrec method.
933 */
934STATIC int
935xfs_btree_is_lastrec(
936 struct xfs_btree_cur *cur,
937 struct xfs_btree_block *block,
938 int level)
939{
940 union xfs_btree_ptr ptr;
941
942 if (level > 0)
943 return 0;
944 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
945 return 0;
946
947 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
948 if (!xfs_btree_ptr_is_null(cur, &ptr))
949 return 0;
950 return 1;
951}
952
953STATIC void
954xfs_btree_buf_to_ptr(
955 struct xfs_btree_cur *cur,
956 struct xfs_buf *bp,
957 union xfs_btree_ptr *ptr)
958{
959 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
960 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
961 XFS_BUF_ADDR(bp)));
962 else {
963 ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
964 XFS_BUF_ADDR(bp)));
965 }
966}
967
968STATIC xfs_daddr_t
969xfs_btree_ptr_to_daddr(
970 struct xfs_btree_cur *cur,
971 union xfs_btree_ptr *ptr)
972{
973 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
974 ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
975
976 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
977 } else {
978 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
979 ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK);
980
981 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
982 be32_to_cpu(ptr->s));
983 }
984}
985
986STATIC void
987xfs_btree_set_refs(
988 struct xfs_btree_cur *cur,
989 struct xfs_buf *bp)
990{
991 switch (cur->bc_btnum) {
992 case XFS_BTNUM_BNO:
993 case XFS_BTNUM_CNT:
994 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
995 break;
996 case XFS_BTNUM_INO:
997 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
998 break;
999 case XFS_BTNUM_BMAP:
1000 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
1001 break;
1002 default:
1003 ASSERT(0);
1004 }
1005}
1006
1007STATIC int
1008xfs_btree_get_buf_block(
1009 struct xfs_btree_cur *cur,
1010 union xfs_btree_ptr *ptr,
1011 int flags,
1012 struct xfs_btree_block **block,
1013 struct xfs_buf **bpp)
1014{
1015 struct xfs_mount *mp = cur->bc_mp;
1016 xfs_daddr_t d;
1017
1018 /* need to sort out how callers deal with failures first */
1019 ASSERT(!(flags & XFS_BUF_TRYLOCK));
1020
1021 d = xfs_btree_ptr_to_daddr(cur, ptr);
1022 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1023 mp->m_bsize, flags);
1024
1025 ASSERT(*bpp);
1026 ASSERT(!XFS_BUF_GETERROR(*bpp));
1027
1028 *block = XFS_BUF_TO_BLOCK(*bpp);
1029 return 0;
1030}
1031
1032/*
1033 * Read in the buffer at the given ptr and return the buffer and
1034 * the block pointer within the buffer.
1035 */
1036STATIC int
1037xfs_btree_read_buf_block(
1038 struct xfs_btree_cur *cur,
1039 union xfs_btree_ptr *ptr,
1040 int level,
1041 int flags,
1042 struct xfs_btree_block **block,
1043 struct xfs_buf **bpp)
1044{
1045 struct xfs_mount *mp = cur->bc_mp;
1046 xfs_daddr_t d;
1047 int error;
1048
1049 /* need to sort out how callers deal with failures first */
1050 ASSERT(!(flags & XFS_BUF_TRYLOCK));
1051
1052 d = xfs_btree_ptr_to_daddr(cur, ptr);
1053 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1054 mp->m_bsize, flags, bpp);
1055 if (error)
1056 return error;
1057
1058 ASSERT(*bpp != NULL);
1059 ASSERT(!XFS_BUF_GETERROR(*bpp));
1060
1061 xfs_btree_set_refs(cur, *bpp);
1062 *block = XFS_BUF_TO_BLOCK(*bpp);
1063
1064 error = xfs_btree_check_block(cur, *block, level, *bpp);
1065 if (error)
1066 xfs_trans_brelse(cur->bc_tp, *bpp);
1067 return error;
1068}
1069
1070/*
1071 * Copy keys from one btree block to another.
1072 */
1073STATIC void
1074xfs_btree_copy_keys(
1075 struct xfs_btree_cur *cur,
1076 union xfs_btree_key *dst_key,
1077 union xfs_btree_key *src_key,
1078 int numkeys)
1079{
1080 ASSERT(numkeys >= 0);
1081 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1082}
1083
1084/*
1085 * Copy records from one btree block to another.
1086 */
1087STATIC void
1088xfs_btree_copy_recs(
1089 struct xfs_btree_cur *cur,
1090 union xfs_btree_rec *dst_rec,
1091 union xfs_btree_rec *src_rec,
1092 int numrecs)
1093{
1094 ASSERT(numrecs >= 0);
1095 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1096}
1097
1098/*
1099 * Copy block pointers from one btree block to another.
1100 */
1101STATIC void
1102xfs_btree_copy_ptrs(
1103 struct xfs_btree_cur *cur,
1104 union xfs_btree_ptr *dst_ptr,
1105 union xfs_btree_ptr *src_ptr,
1106 int numptrs)
1107{
1108 ASSERT(numptrs >= 0);
1109 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1110}
1111
1112/*
1113 * Shift keys one index left/right inside a single btree block.
1114 */
1115STATIC void
1116xfs_btree_shift_keys(
1117 struct xfs_btree_cur *cur,
1118 union xfs_btree_key *key,
1119 int dir,
1120 int numkeys)
1121{
1122 char *dst_key;
1123
1124 ASSERT(numkeys >= 0);
1125 ASSERT(dir == 1 || dir == -1);
1126
1127 dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1128 memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1129}
1130
1131/*
1132 * Shift records one index left/right inside a single btree block.
1133 */
1134STATIC void
1135xfs_btree_shift_recs(
1136 struct xfs_btree_cur *cur,
1137 union xfs_btree_rec *rec,
1138 int dir,
1139 int numrecs)
1140{
1141 char *dst_rec;
1142
1143 ASSERT(numrecs >= 0);
1144 ASSERT(dir == 1 || dir == -1);
1145
1146 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1147 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1148}
1149
1150/*
1151 * Shift block pointers one index left/right inside a single btree block.
1152 */
1153STATIC void
1154xfs_btree_shift_ptrs(
1155 struct xfs_btree_cur *cur,
1156 union xfs_btree_ptr *ptr,
1157 int dir,
1158 int numptrs)
1159{
1160 char *dst_ptr;
1161
1162 ASSERT(numptrs >= 0);
1163 ASSERT(dir == 1 || dir == -1);
1164
1165 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1166 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1167}
1168
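
The three shift helpers above all reduce to one overlapping memmove by plus or minus one slot width. A user-space sketch of the same operation on a flat record array (sizes and contents are invented; memmove, not memcpy, is required because source and destination overlap):

/*
 * Illustrative sketch only: shift fixed-size records one slot left or
 * right inside a block. dir is +1 to open a hole, -1 to close one.
 */
#include <stdio.h>
#include <string.h>

#define REC_LEN	4

static void shift_recs(char *rec, int dir, int numrecs)
{
	memmove(rec + dir * REC_LEN, rec, (size_t)numrecs * REC_LEN);
}

int main(void)
{
	/* three 4-byte records: "aaa", "bbb", "ccc" (NUL-padded) */
	char block[4 * REC_LEN] = "aaa\0bbb\0ccc\0";

	/* shift the first two records right by one slot to make room */
	shift_recs(block, 1, 2);
	memcpy(block, "xxx", 4);	/* insert the new first record */

	printf("%s %s %s\n", block, block + 4, block + 8);	/* xxx aaa bbb */
	return 0;
}
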
1169/*
1170 * Log key values from the btree block.
1171 */
1172STATIC void
1173xfs_btree_log_keys(
1174 struct xfs_btree_cur *cur,
1175 struct xfs_buf *bp,
1176 int first,
1177 int last)
1178{
1179 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1180 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1181
1182 if (bp) {
1183 xfs_trans_log_buf(cur->bc_tp, bp,
1184 xfs_btree_key_offset(cur, first),
1185 xfs_btree_key_offset(cur, last + 1) - 1);
1186 } else {
1187 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1188 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1189 }
1190
1191 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1192}
1193
1194/*
1195 * Log record values from the btree block.
1196 */
1197void
1198xfs_btree_log_recs(
1199 struct xfs_btree_cur *cur,
1200 struct xfs_buf *bp,
1201 int first,
1202 int last)
1203{
1204 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1205 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1206
1207 xfs_trans_log_buf(cur->bc_tp, bp,
1208 xfs_btree_rec_offset(cur, first),
1209 xfs_btree_rec_offset(cur, last + 1) - 1);
1210
1211 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1212}
1213
1214/*
1215 * Log block pointer fields from a btree block (nonleaf).
1216 */
1217STATIC void
1218xfs_btree_log_ptrs(
1219 struct xfs_btree_cur *cur, /* btree cursor */
1220 struct xfs_buf *bp, /* buffer containing btree block */
1221 int first, /* index of first pointer to log */
1222 int last) /* index of last pointer to log */
1223{
1224 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1225 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1226
1227 if (bp) {
1228 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
1229 int level = xfs_btree_get_level(block);
1230
1231 xfs_trans_log_buf(cur->bc_tp, bp,
1232 xfs_btree_ptr_offset(cur, first, level),
1233 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1234 } else {
1235 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1236 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1237 }
1238
1239 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1240}
1241
1242/*
1243 * Log fields from a btree block header.
1244 */
1245void
1246xfs_btree_log_block(
1247 struct xfs_btree_cur *cur, /* btree cursor */
1248 struct xfs_buf *bp, /* buffer containing btree block */
1249 int fields) /* mask of fields: XFS_BB_... */
1250{
1251 int first; /* first byte offset logged */
1252 int last; /* last byte offset logged */
1253 static const short soffsets[] = { /* table of offsets (short) */
1254 offsetof(struct xfs_btree_block, bb_magic),
1255 offsetof(struct xfs_btree_block, bb_level),
1256 offsetof(struct xfs_btree_block, bb_numrecs),
1257 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1258 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1259 XFS_BTREE_SBLOCK_LEN
1260 };
1261 static const short loffsets[] = { /* table of offsets (long) */
1262 offsetof(struct xfs_btree_block, bb_magic),
1263 offsetof(struct xfs_btree_block, bb_level),
1264 offsetof(struct xfs_btree_block, bb_numrecs),
1265 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1266 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1267 XFS_BTREE_LBLOCK_LEN
1268 };
1269
1270 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1271 XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1272
1273 if (bp) {
1274 xfs_btree_offsets(fields,
1275 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1276 loffsets : soffsets,
1277 XFS_BB_NUM_BITS, &first, &last);
1278 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1279 } else {
1280 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1281 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1282 }
1283
1284 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1285}
1286
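
xfs_btree_log_block() above maps a bitmask of dirtied header fields to one contiguous byte range using an offsets table with a final end-of-struct entry, which is what the soffsets/loffsets arrays feed into xfs_btree_offsets() for. A stand-alone model of that table technique (sketch only; toy struct, and a nonempty mask is assumed):

/*
 * Illustrative sketch only: with one offsets[] entry per field plus an
 * end marker, the byte range for any mask of fields (bits numbered in
 * struct layout order) is [offsets[first_bit], offsets[last_bit+1] - 1].
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_hdr {
	uint32_t	magic;		/* bit 0 */
	uint16_t	level;		/* bit 1 */
	uint16_t	numrecs;	/* bit 2 */
	uint32_t	leftsib;	/* bit 3 */
	uint32_t	rightsib;	/* bit 4 */
};

static const size_t offsets[] = {
	offsetof(struct toy_hdr, magic),
	offsetof(struct toy_hdr, level),
	offsetof(struct toy_hdr, numrecs),
	offsetof(struct toy_hdr, leftsib),
	offsetof(struct toy_hdr, rightsib),
	sizeof(struct toy_hdr),		/* end marker */
};

static void mask_to_range(unsigned mask, size_t *first, size_t *last)
{
	int i, lo = -1, hi = -1;

	for (i = 0; i < 5; i++) {	/* mask must be nonempty */
		if (mask & (1u << i)) {
			if (lo < 0)
				lo = i;
			hi = i;
		}
	}
	*first = offsets[lo];
	*last = offsets[hi + 1] - 1;
}

int main(void)
{
	size_t first, last;

	mask_to_range(0x18, &first, &last);	/* leftsib | rightsib */
	printf("log bytes %zu..%zu\n", first, last);	/* 8..15 */
	return 0;
}
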
1287/*
1288 * Increment cursor by one record at the level.
1289 * For nonzero levels the leaf-ward information is untouched.
1290 */
1291int /* error */
1292xfs_btree_increment(
1293 struct xfs_btree_cur *cur,
1294 int level,
1295 int *stat) /* success/failure */
1296{
1297 struct xfs_btree_block *block;
1298 union xfs_btree_ptr ptr;
1299 struct xfs_buf *bp;
1300 int error; /* error return value */
1301 int lev;
1302
1303 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1304 XFS_BTREE_TRACE_ARGI(cur, level);
1305
1306 ASSERT(level < cur->bc_nlevels);
1307
1308 /* Read-ahead to the right at this level. */
1309 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1310
1311 /* Get a pointer to the btree block. */
1312 block = xfs_btree_get_block(cur, level, &bp);
1313
1314#ifdef DEBUG
1315 error = xfs_btree_check_block(cur, block, level, bp);
1316 if (error)
1317 goto error0;
1318#endif
1319
1320 /* We're done if we remain in the block after the increment. */
1321 if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1322 goto out1;
1323
1324 /* Fail if we just went off the right edge of the tree. */
1325 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1326 if (xfs_btree_ptr_is_null(cur, &ptr))
1327 goto out0;
1328
1329 XFS_BTREE_STATS_INC(cur, increment);
1330
1331 /*
1332 * March up the tree incrementing pointers.
1333 * Stop when we don't go off the right edge of a block.
1334 */
1335 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1336 block = xfs_btree_get_block(cur, lev, &bp);
1337
1338#ifdef DEBUG
1339 error = xfs_btree_check_block(cur, block, lev, bp);
1340 if (error)
1341 goto error0;
1342#endif
1343
1344 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1345 break;
1346
1347 /* Read-ahead the right block for the next loop. */
1348 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1349 }
1350
1351 /*
1352 * If we went off the root then we are either seriously
1353 * confused or have the tree root in an inode.
1354 */
1355 if (lev == cur->bc_nlevels) {
1356 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1357 goto out0;
1358 ASSERT(0);
1359 error = EFSCORRUPTED;
1360 goto error0;
1361 }
1362 ASSERT(lev < cur->bc_nlevels);
1363
1364 /*
1365 * Now walk back down the tree, fixing up the cursor's buffer
1366 * pointers and key numbers.
1367 */
1368 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1369 union xfs_btree_ptr *ptrp;
1370
1371 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1372 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1373 0, &block, &bp);
1374 if (error)
1375 goto error0;
1376
1377 xfs_btree_setbuf(cur, lev, bp);
1378 cur->bc_ptrs[lev] = 1;
1379 }
1380out1:
1381 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1382 *stat = 1;
1383 return 0;
1384
1385out0:
1386 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1387 *stat = 0;
1388 return 0;
1389
1390error0:
1391 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1392 return error;
1393}
1394
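
Stripped of buffers and error handling, the increment above behaves like a mixed-radix counter: bump the leaf index; if that falls off the right edge, march up until a level with room remains; then walk back down pointing each lower level at its first entry. A sketch under that simplification (uniform entry counts, no real child-block reads; all names invented):

/*
 * Illustrative sketch only: the up-then-down walk of a btree cursor
 * increment. ptrs[lev] is the 1-based entry index at each level and
 * nrecs[lev] the entry count of the current block at that level.
 */
#include <stdio.h>

#define NLEVELS	3

static int increment(int *ptrs, const int *nrecs)
{
	int lev;

	/* done if we remain inside the leaf block */
	if (++ptrs[0] <= nrecs[0])
		return 1;

	/* march up until we no longer fall off the right edge */
	for (lev = 1; lev < NLEVELS; lev++)
		if (++ptrs[lev] <= nrecs[lev])
			break;
	if (lev == NLEVELS)
		return 0;	/* off the right edge of the whole tree */

	/* walk back down, pointing at the first entry of each child */
	for (lev--; lev >= 0; lev--)
		ptrs[lev] = 1;
	return 1;
}

int main(void)
{
	int nrecs[NLEVELS] = { 2, 2, 2 };	/* 2 entries per block */
	int ptrs[NLEVELS] = { 2, 1, 1 };	/* leaf at its last entry */
	int ok = increment(ptrs, nrecs);

	/* prints: ok=1 ptrs={1,2,1} -- stepped into the next leaf */
	printf("ok=%d ptrs={%d,%d,%d}\n", ok, ptrs[0], ptrs[1], ptrs[2]);
	return 0;
}
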
1395/*
1396 * Decrement cursor by one record at the level.
1397 * For nonzero levels the leaf-ward information is untouched.
1398 */
1399int /* error */
1400xfs_btree_decrement(
1401 struct xfs_btree_cur *cur,
1402 int level,
1403 int *stat) /* success/failure */
1404{
1405 struct xfs_btree_block *block;
1406 xfs_buf_t *bp;
1407 int error; /* error return value */
1408 int lev;
1409 union xfs_btree_ptr ptr;
1410
1411 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1412 XFS_BTREE_TRACE_ARGI(cur, level);
1413
1414 ASSERT(level < cur->bc_nlevels);
1415
1416 /* Read-ahead to the left at this level. */
1417 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1418
1419 /* We're done if we remain in the block after the decrement. */
1420 if (--cur->bc_ptrs[level] > 0)
1421 goto out1;
1422
1423 /* Get a pointer to the btree block. */
1424 block = xfs_btree_get_block(cur, level, &bp);
1425
1426#ifdef DEBUG
1427 error = xfs_btree_check_block(cur, block, level, bp);
1428 if (error)
1429 goto error0;
1430#endif
1431
1432 /* Fail if we just went off the left edge of the tree. */
1433 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1434 if (xfs_btree_ptr_is_null(cur, &ptr))
1435 goto out0;
1436
1437 XFS_BTREE_STATS_INC(cur, decrement);
1438
1439 /*
1440 * March up the tree decrementing pointers.
1441 * Stop when we don't go off the left edge of a block.
1442 */
1443 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1444 if (--cur->bc_ptrs[lev] > 0)
1445 break;
1446 /* Read-ahead the left block for the next loop. */
1447 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1448 }
1449
1450 /*
1451 * If we went off the root then we are either seriously
1452 * confused or have the tree root in an inode.
1453 */
1454 if (lev == cur->bc_nlevels) {
1455 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1456 goto out0;
1457 ASSERT(0);
1458 error = EFSCORRUPTED;
1459 goto error0;
1460 }
1461 ASSERT(lev < cur->bc_nlevels);
1462
1463 /*
1464 * Now walk back down the tree, fixing up the cursor's buffer
1465 * pointers and key numbers.
1466 */
1467 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1468 union xfs_btree_ptr *ptrp;
1469
1470 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1471 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1472 0, &block, &bp);
1473 if (error)
1474 goto error0;
1475 xfs_btree_setbuf(cur, lev, bp);
1476 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1477 }
1478out1:
1479 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1480 *stat = 1;
1481 return 0;
1482
1483out0:
1484 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1485 *stat = 0;
1486 return 0;
1487
1488error0:
1489 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1490 return error;
1491}
1492
1493STATIC int
1494xfs_btree_lookup_get_block(
1495 struct xfs_btree_cur *cur, /* btree cursor */
1496 int level, /* level in the btree */
1497 union xfs_btree_ptr *pp, /* ptr to btree block */
1498 struct xfs_btree_block **blkp) /* return btree block */
1499{
1500 struct xfs_buf *bp; /* buffer pointer for btree block */
1501 int error = 0;
1502
1503 /* special case the root block if in an inode */
1504 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1505 (level == cur->bc_nlevels - 1)) {
1506 *blkp = xfs_btree_get_iroot(cur);
1507 return 0;
1508 }
1509
1510 /*
1511 * If the old buffer at this level is for the disk address we are
1512 * looking for, re-use it.
1513 *
1514 * Otherwise throw it away and get a new one.
1515 */
1516 bp = cur->bc_bufs[level];
1517 if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1518 *blkp = XFS_BUF_TO_BLOCK(bp);
1519 return 0;
1520 }
1521
1522 error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
1523 if (error)
1524 return error;
1525
1526 xfs_btree_setbuf(cur, level, bp);
1527 return 0;
1528}
1529
1530/*
1531 * Get current search key. For level 0 we don't actually have a key
1532 * structure so we make one up from the record. For all other levels
1533 * we just return the right key.
1534 */
1535STATIC union xfs_btree_key *
1536xfs_lookup_get_search_key(
1537 struct xfs_btree_cur *cur,
1538 int level,
1539 int keyno,
1540 struct xfs_btree_block *block,
1541 union xfs_btree_key *kp)
1542{
1543 if (level == 0) {
1544 cur->bc_ops->init_key_from_rec(kp,
1545 xfs_btree_rec_addr(cur, keyno, block));
1546 return kp;
1547 }
1548
1549 return xfs_btree_key_addr(cur, keyno, block);
1550}
1551
1552/*
1553 * Lookup the record. The cursor is made to point to it, based on dir.
1554 * Return 0 if can't find any such record, 1 for success.
1555 */
1556int /* error */
1557xfs_btree_lookup(
1558 struct xfs_btree_cur *cur, /* btree cursor */
1559 xfs_lookup_t dir, /* <=, ==, or >= */
1560 int *stat) /* success/failure */
1561{
1562 struct xfs_btree_block *block; /* current btree block */
1563 __int64_t diff; /* difference for the current key */
1564 int error; /* error return value */
1565 int keyno; /* current key number */
1566 int level; /* level in the btree */
1567 union xfs_btree_ptr *pp; /* ptr to btree block */
1568 union xfs_btree_ptr ptr; /* ptr to btree block */
1569
1570 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1571 XFS_BTREE_TRACE_ARGI(cur, dir);
1572
1573 XFS_BTREE_STATS_INC(cur, lookup);
1574
1575 block = NULL;
1576 keyno = 0;
1577
1578 /* initialise start pointer from cursor */
1579 cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1580 pp = &ptr;
1581
1582 /*
1583 * Iterate over each level in the btree, starting at the root.
1584 * For each level above the leaves, find the key we need, based
1585 * on the lookup record, then follow the corresponding block
1586 * pointer down to the next level.
1587 */
1588 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1589 /* Get the block we need to do the lookup on. */
1590 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1591 if (error)
1592 goto error0;
1593
1594 if (diff == 0) {
1595 /*
1596 * If we already had a key match at a higher level, we
1597 * know we need to use the first entry in this block.
1598 */
1599 keyno = 1;
1600 } else {
1601 /* Otherwise search this block. Do a binary search. */
1602
1603 int high; /* high entry number */
1604 int low; /* low entry number */
1605
1606 /* Set low and high entry numbers, 1-based. */
1607 low = 1;
1608 high = xfs_btree_get_numrecs(block);
1609 if (!high) {
1610 /* Block is empty, must be an empty leaf. */
1611 ASSERT(level == 0 && cur->bc_nlevels == 1);
1612
1613 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1614 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1615 *stat = 0;
1616 return 0;
1617 }
1618
1619 /* Binary search the block. */
1620 while (low <= high) {
1621 union xfs_btree_key key;
1622 union xfs_btree_key *kp;
1623
1624 XFS_BTREE_STATS_INC(cur, compare);
1625
1626 /* keyno is average of low and high. */
1627 keyno = (low + high) >> 1;
1628
1629 /* Get current search key */
1630 kp = xfs_lookup_get_search_key(cur, level,
1631 keyno, block, &key);
1632
1633 /*
1634 * Compute difference to get next direction:
1635 * - less than, move right
1636 * - greater than, move left
1637 * - equal, we're done
1638 */
1639 diff = cur->bc_ops->key_diff(cur, kp);
1640 if (diff < 0)
1641 low = keyno + 1;
1642 else if (diff > 0)
1643 high = keyno - 1;
1644 else
1645 break;
1646 }
1647 }
1648
1649 /*
1650 * If there are more levels, set up for the next level
1651 * by getting the block number and filling in the cursor.
1652 */
1653 if (level > 0) {
1654 /*
1655 * If we moved left, need the previous key number,
1656 * unless there isn't one.
1657 */
1658 if (diff > 0 && --keyno < 1)
1659 keyno = 1;
1660 pp = xfs_btree_ptr_addr(cur, keyno, block);
1661
1662#ifdef DEBUG
1663 error = xfs_btree_check_ptr(cur, pp, 0, level);
1664 if (error)
1665 goto error0;
1666#endif
1667 cur->bc_ptrs[level] = keyno;
1668 }
1669 }
1670
1671 /* Done with the search. See if we need to adjust the results. */
1672 if (dir != XFS_LOOKUP_LE && diff < 0) {
1673 keyno++;
1674 /*
1675 * If ge search and we went off the end of the block, but it's
1676 * not the last block, we're in the wrong block.
1677 */
1678 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1679 if (dir == XFS_LOOKUP_GE &&
1680 keyno > xfs_btree_get_numrecs(block) &&
1681 !xfs_btree_ptr_is_null(cur, &ptr)) {
1682 int i;
1683
1684 cur->bc_ptrs[0] = keyno;
1685 error = xfs_btree_increment(cur, 0, &i);
1686 if (error)
1687 goto error0;
1688 XFS_WANT_CORRUPTED_RETURN(i == 1);
1689 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1690 *stat = 1;
1691 return 0;
1692 }
1693 } else if (dir == XFS_LOOKUP_LE && diff > 0)
1694 keyno--;
1695 cur->bc_ptrs[0] = keyno;
1696
1697 /* Return if we succeeded or not. */
1698 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1699 *stat = 0;
1700 else if (dir != XFS_LOOKUP_EQ || diff == 0)
1701 *stat = 1;
1702 else
1703 *stat = 0;
1704 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1705 return 0;
1706
1707error0:
1708 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1709 return error;
1710}
1711
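
The core of xfs_btree_lookup() above is a 1-based binary search driven by the key_diff callback, followed by the LE/GE adjustment of keyno. A stand-alone model on a sorted int array follows; it is a sketch only and omits the cross-block fix-up where a GE lookup steps into the right sibling:

/*
 * Illustrative sketch only: 1-based binary search with diff semantics.
 * diff < 0 means the probed key is below the search key (move right),
 * diff > 0 above it (move left); keyno is then adjusted for LE/GE.
 */
#include <stdio.h>

enum lookup_dir { LOOKUP_LE, LOOKUP_EQ, LOOKUP_GE };

static int lookup(const int *keys, int nrecs, int want,
		  enum lookup_dir dir, int *keyno)
{
	int low = 1, high = nrecs, kn = 0;
	long diff = 1;

	while (low <= high) {
		kn = (low + high) >> 1;
		diff = (long)keys[kn - 1] - want;	/* key_diff() stand-in */
		if (diff < 0)
			low = kn + 1;
		else if (diff > 0)
			high = kn - 1;
		else
			break;
	}
	if (dir != LOOKUP_LE && diff < 0)
		kn++;		/* GE/EQ: step past the smaller key */
	else if (dir == LOOKUP_LE && diff > 0)
		kn--;		/* LE: step back before the larger key */
	*keyno = kn;

	if (kn == 0 || kn > nrecs)
		return 0;			/* ran off an end */
	return dir != LOOKUP_EQ || diff == 0;
}

int main(void)
{
	int keys[] = { 10, 20, 30, 40 };
	int kn;

	printf("GE 25 -> found=%d keyno=%d\n",
	       lookup(keys, 4, 25, LOOKUP_GE, &kn), kn);	/* keyno=3 (30) */
	printf("LE 25 -> found=%d keyno=%d\n",
	       lookup(keys, 4, 25, LOOKUP_LE, &kn), kn);	/* keyno=2 (20) */
	return 0;
}
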
1712/*
1713 * Update keys at all levels from here to the root along the cursor's path.
1714 */
1715STATIC int
1716xfs_btree_updkey(
1717 struct xfs_btree_cur *cur,
1718 union xfs_btree_key *keyp,
1719 int level)
1720{
1721 struct xfs_btree_block *block;
1722 struct xfs_buf *bp;
1723 union xfs_btree_key *kp;
1724 int ptr;
1725
1726 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1727 XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1728
1729 ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1730
1731 /*
1732 * Go up the tree from this level toward the root.
1733 * At each level, update the key value to the value input.
1734 * Stop when we reach a level where the cursor isn't pointing
1735 * at the first entry in the block.
1736 */
1737 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1738#ifdef DEBUG
1739 int error;
1740#endif
1741 block = xfs_btree_get_block(cur, level, &bp);
1742#ifdef DEBUG
1743 error = xfs_btree_check_block(cur, block, level, bp);
1744 if (error) {
1745 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1746 return error;
1747 }
1748#endif
1749 ptr = cur->bc_ptrs[level];
1750 kp = xfs_btree_key_addr(cur, ptr, block);
1751 xfs_btree_copy_keys(cur, kp, keyp, 1);
1752 xfs_btree_log_keys(cur, bp, ptr, ptr);
1753 }
1754
1755 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1756 return 0;
1757}
1758
1759/*
1760 * Update the record referred to by cur to the value in the
1761 * given record. This either works (return 0) or gets an
1762 * EFSCORRUPTED error.
1763 */
1764int
1765xfs_btree_update(
1766 struct xfs_btree_cur *cur,
1767 union xfs_btree_rec *rec)
1768{
1769 struct xfs_btree_block *block;
1770 struct xfs_buf *bp;
1771 int error;
1772 int ptr;
1773 union xfs_btree_rec *rp;
1774
1775 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1776 XFS_BTREE_TRACE_ARGR(cur, rec);
1777
1778 /* Pick up the current block. */
1779 block = xfs_btree_get_block(cur, 0, &bp);
1780
1781#ifdef DEBUG
1782 error = xfs_btree_check_block(cur, block, 0, bp);
1783 if (error)
1784 goto error0;
1785#endif
1786 /* Get the address of the rec to be updated. */
1787 ptr = cur->bc_ptrs[0];
1788 rp = xfs_btree_rec_addr(cur, ptr, block);
1789
1790 /* Fill in the new contents and log them. */
1791 xfs_btree_copy_recs(cur, rp, rec, 1);
1792 xfs_btree_log_recs(cur, bp, ptr, ptr);
1793
1794 /*
1795 * If we are tracking the last record in the tree and
1796 * we are at the far right edge of the tree, update it.
1797 */
1798 if (xfs_btree_is_lastrec(cur, block, 0)) {
1799 cur->bc_ops->update_lastrec(cur, block, rec,
1800 ptr, LASTREC_UPDATE);
1801 }
1802
1803 /* Updating first rec in leaf. Pass new key value up to our parent. */
1804 if (ptr == 1) {
1805 union xfs_btree_key key;
1806
1807 cur->bc_ops->init_key_from_rec(&key, rec);
1808 error = xfs_btree_updkey(cur, &key, 1);
1809 if (error)
1810 goto error0;
1811 }
1812
1813 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1814 return 0;
1815
1816error0:
1817 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1818 return error;
1819}
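/*
 * Editor's note -- usage sketch, not part of the original source:
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
 *	if (!error && stat)
 *		error = xfs_btree_update(cur, &rec);
 *
 * The ptr == 1 case above matters because rewriting the first record of a
 * leaf changes the key the parent uses to describe this block.
 */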
1820
1821/*
1822 * Move 1 record left from cur/level if possible.
1823 * Update cur to reflect the new path.
1824 */
1825STATIC int /* error */
1826xfs_btree_lshift(
1827 struct xfs_btree_cur *cur,
1828 int level,
1829 int *stat) /* success/failure */
1830{
1831 union xfs_btree_key key; /* btree key */
1832 struct xfs_buf *lbp; /* left buffer pointer */
1833 struct xfs_btree_block *left; /* left btree block */
1834 int lrecs; /* left record count */
1835 struct xfs_buf *rbp; /* right buffer pointer */
1836 struct xfs_btree_block *right; /* right btree block */
1837 int rrecs; /* right record count */
1838 union xfs_btree_ptr lptr; /* left btree pointer */
1839 union xfs_btree_key *rkp = NULL; /* right btree key */
1840 union xfs_btree_ptr *rpp = NULL; /* right address pointer */
1841 union xfs_btree_rec *rrp = NULL; /* right record pointer */
1842 int error; /* error return value */
1843
1844 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1845 XFS_BTREE_TRACE_ARGI(cur, level);
1846
1847 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1848 level == cur->bc_nlevels - 1)
1849 goto out0;
1850
1851 /* Set up variables for this block as "right". */
1852 right = xfs_btree_get_block(cur, level, &rbp);
1853
1854#ifdef DEBUG
1855 error = xfs_btree_check_block(cur, right, level, rbp);
1856 if (error)
1857 goto error0;
1858#endif
1859
1860 /* If we've got no left sibling then we can't shift an entry left. */
1861 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
1862 if (xfs_btree_ptr_is_null(cur, &lptr))
1863 goto out0;
1864
1865 /*
1866 * If the cursor entry is the one that would be moved, don't
1867 * do it... it's too complicated.
1868 */
1869 if (cur->bc_ptrs[level] <= 1)
1870 goto out0;
1871
1872 /* Set up the left neighbor as "left". */
1873 error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
1874 if (error)
1875 goto error0;
1876
1877 /* If it's full, it can't take another entry. */
1878 lrecs = xfs_btree_get_numrecs(left);
1879 if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
1880 goto out0;
1881
1882 rrecs = xfs_btree_get_numrecs(right);
1883
1884 /*
1885	 * We add one entry to the left side and remove one from the right side.
1886	 * Account for it here; the changes will be written to disk and logged
1887 * later.
1888 */
1889 lrecs++;
1890 rrecs--;
1891
1892 XFS_BTREE_STATS_INC(cur, lshift);
1893 XFS_BTREE_STATS_ADD(cur, moves, 1);
1894
1895 /*
1896 * If non-leaf, copy a key and a ptr to the left block.
1897 * Log the changes to the left block.
1898 */
1899 if (level > 0) {
1900 /* It's a non-leaf. Move keys and pointers. */
1901 union xfs_btree_key *lkp; /* left btree key */
1902 union xfs_btree_ptr *lpp; /* left address pointer */
1903
1904 lkp = xfs_btree_key_addr(cur, lrecs, left);
1905 rkp = xfs_btree_key_addr(cur, 1, right);
1906
1907 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
1908 rpp = xfs_btree_ptr_addr(cur, 1, right);
1909#ifdef DEBUG
1910 error = xfs_btree_check_ptr(cur, rpp, 0, level);
1911 if (error)
1912 goto error0;
1913#endif
1914 xfs_btree_copy_keys(cur, lkp, rkp, 1);
1915 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
1916
1917 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
1918 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
1919
1920 ASSERT(cur->bc_ops->keys_inorder(cur,
1921 xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
1922 } else {
1923 /* It's a leaf. Move records. */
1924 union xfs_btree_rec *lrp; /* left record pointer */
1925
1926 lrp = xfs_btree_rec_addr(cur, lrecs, left);
1927 rrp = xfs_btree_rec_addr(cur, 1, right);
1928
1929 xfs_btree_copy_recs(cur, lrp, rrp, 1);
1930 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
1931
1932 ASSERT(cur->bc_ops->recs_inorder(cur,
1933 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
1934 }
1935
1936 xfs_btree_set_numrecs(left, lrecs);
1937 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
1938
1939 xfs_btree_set_numrecs(right, rrecs);
1940 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
1941
1942 /*
1943 * Slide the contents of right down one entry.
1944 */
1945 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
1946 if (level > 0) {
1947		/* It's a nonleaf; operate on keys and ptrs */
1948#ifdef DEBUG
1949 int i; /* loop index */
1950
1951 for (i = 0; i < rrecs; i++) {
1952 error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
1953 if (error)
1954 goto error0;
1955 }
1956#endif
1957 xfs_btree_shift_keys(cur,
1958 xfs_btree_key_addr(cur, 2, right),
1959 -1, rrecs);
1960 xfs_btree_shift_ptrs(cur,
1961 xfs_btree_ptr_addr(cur, 2, right),
1962 -1, rrecs);
1963
1964 xfs_btree_log_keys(cur, rbp, 1, rrecs);
1965 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
1966 } else {
1967		/* It's a leaf; operate on records */
1968 xfs_btree_shift_recs(cur,
1969 xfs_btree_rec_addr(cur, 2, right),
1970 -1, rrecs);
1971 xfs_btree_log_recs(cur, rbp, 1, rrecs);
1972
1973 /*
1974 * If it's the first record in the block, we'll need a key
1975 * structure to pass up to the next level (updkey).
1976 */
1977 cur->bc_ops->init_key_from_rec(&key,
1978 xfs_btree_rec_addr(cur, 1, right));
1979 rkp = &key;
1980 }
1981
1982 /* Update the parent key values of right. */
1983 error = xfs_btree_updkey(cur, rkp, level + 1);
1984 if (error)
1985 goto error0;
1986
1987 /* Slide the cursor value left one. */
1988 cur->bc_ptrs[level]--;
1989
1990 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1991 *stat = 1;
1992 return 0;
1993
1994out0:
1995 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1996 *stat = 0;
1997 return 0;
1998
1999error0:
2000 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2001 return error;
2002}
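/*
 * Editor's note, not part of the original source: whichever branch ran
 * above, rkp ends up naming the new first entry of "right" -- either the
 * key now at index 1, or a key rebuilt from the first remaining record --
 * and xfs_btree_updkey() pushes it to the parent, since shifting one entry
 * out of "right" changed what that block starts with.
 */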
2003
2004/*
2005 * Move 1 record right from cur/level if possible.
2006 * Update cur to reflect the new path.
2007 */
2008STATIC int /* error */
2009xfs_btree_rshift(
2010 struct xfs_btree_cur *cur,
2011 int level,
2012 int *stat) /* success/failure */
2013{
2014 union xfs_btree_key key; /* btree key */
2015 struct xfs_buf *lbp; /* left buffer pointer */
2016 struct xfs_btree_block *left; /* left btree block */
2017 struct xfs_buf *rbp; /* right buffer pointer */
2018 struct xfs_btree_block *right; /* right btree block */
2019 struct xfs_btree_cur *tcur; /* temporary btree cursor */
2020 union xfs_btree_ptr rptr; /* right block pointer */
2021 union xfs_btree_key *rkp; /* right btree key */
2022 int rrecs; /* right record count */
2023 int lrecs; /* left record count */
2024 int error; /* error return value */
2025 int i; /* loop counter */
2026
2027 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2028 XFS_BTREE_TRACE_ARGI(cur, level);
2029
2030 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2031 (level == cur->bc_nlevels - 1))
2032 goto out0;
2033
2034 /* Set up variables for this block as "left". */
2035 left = xfs_btree_get_block(cur, level, &lbp);
2036
2037#ifdef DEBUG
2038 error = xfs_btree_check_block(cur, left, level, lbp);
2039 if (error)
2040 goto error0;
2041#endif
2042
2043 /* If we've got no right sibling then we can't shift an entry right. */
2044 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2045 if (xfs_btree_ptr_is_null(cur, &rptr))
2046 goto out0;
2047
2048 /*
2049 * If the cursor entry is the one that would be moved, don't
2050 * do it... it's too complicated.
2051 */
2052 lrecs = xfs_btree_get_numrecs(left);
2053 if (cur->bc_ptrs[level] >= lrecs)
2054 goto out0;
2055
2056 /* Set up the right neighbor as "right". */
2057 error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
2058 if (error)
2059 goto error0;
2060
2061 /* If it's full, it can't take another entry. */
2062 rrecs = xfs_btree_get_numrecs(right);
2063 if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2064 goto out0;
2065
2066 XFS_BTREE_STATS_INC(cur, rshift);
2067 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2068
2069 /*
2070 * Make a hole at the start of the right neighbor block, then
2071 * copy the last left block entry to the hole.
2072 */
2073 if (level > 0) {
2074		/* It's a nonleaf; make a hole in the keys and ptrs */
2075 union xfs_btree_key *lkp;
2076 union xfs_btree_ptr *lpp;
2077 union xfs_btree_ptr *rpp;
2078
2079 lkp = xfs_btree_key_addr(cur, lrecs, left);
2080 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2081 rkp = xfs_btree_key_addr(cur, 1, right);
2082 rpp = xfs_btree_ptr_addr(cur, 1, right);
2083
2084#ifdef DEBUG
2085 for (i = rrecs - 1; i >= 0; i--) {
2086 error = xfs_btree_check_ptr(cur, rpp, i, level);
2087 if (error)
2088 goto error0;
2089 }
2090#endif
2091
2092 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2093 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2094
2095#ifdef DEBUG
2096 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2097 if (error)
2098 goto error0;
2099#endif
2100
2101 /* Now put the new data in, and log it. */
2102 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2103 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2104
2105 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2106 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2107
2108 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2109 xfs_btree_key_addr(cur, 2, right)));
2110 } else {
2111		/* It's a leaf; make a hole in the records */
2112 union xfs_btree_rec *lrp;
2113 union xfs_btree_rec *rrp;
2114
2115 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2116 rrp = xfs_btree_rec_addr(cur, 1, right);
2117
2118 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2119
2120 /* Now put the new data in, and log it. */
2121 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2122 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2123
2124 cur->bc_ops->init_key_from_rec(&key, rrp);
2125 rkp = &key;
2126
2127 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2128 xfs_btree_rec_addr(cur, 2, right)));
2129 }
2130
2131 /*
2132 * Decrement and log left's numrecs, bump and log right's numrecs.
2133 */
2134 xfs_btree_set_numrecs(left, --lrecs);
2135 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2136
2137 xfs_btree_set_numrecs(right, ++rrecs);
2138 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2139
2140 /*
2141 * Using a temporary cursor, update the parent key values of the
2142 * block on the right.
2143 */
2144 error = xfs_btree_dup_cursor(cur, &tcur);
2145 if (error)
2146 goto error0;
2147 i = xfs_btree_lastrec(tcur, level);
2148 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2149
2150 error = xfs_btree_increment(tcur, level, &i);
2151 if (error)
2152 goto error1;
2153
2154 error = xfs_btree_updkey(tcur, rkp, level + 1);
2155 if (error)
2156 goto error1;
2157
2158 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2159
2160 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2161 *stat = 1;
2162 return 0;
2163
2164out0:
2165 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2166 *stat = 0;
2167 return 0;
2168
2169error0:
2170 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2171 return error;
2172
2173error1:
2174 XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2175 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2176 return error;
2177}
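/*
 * Editor's note, not part of the original source: the temporary cursor
 * dance above exists because the block whose parent key changed is the
 * right sibling, which is not on this cursor's path. Duplicating the
 * cursor, stepping the copy one entry right, and running xfs_btree_updkey()
 * on it updates the sibling's parent key without disturbing cur.
 */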
2178
2179/*
2180 * Split cur/level block in half.
2181 * Return new block number and the key to its first
2182 * record (to be inserted into parent).
2183 */
2184STATIC int /* error */
2185xfs_btree_split(
2186 struct xfs_btree_cur *cur,
2187 int level,
2188 union xfs_btree_ptr *ptrp,
2189 union xfs_btree_key *key,
2190 struct xfs_btree_cur **curp,
2191 int *stat) /* success/failure */
2192{
2193 union xfs_btree_ptr lptr; /* left sibling block ptr */
2194 struct xfs_buf *lbp; /* left buffer pointer */
2195 struct xfs_btree_block *left; /* left btree block */
2196 union xfs_btree_ptr rptr; /* right sibling block ptr */
2197 struct xfs_buf *rbp; /* right buffer pointer */
2198 struct xfs_btree_block *right; /* right btree block */
2199 union xfs_btree_ptr rrptr; /* right-right sibling ptr */
2200 struct xfs_buf *rrbp; /* right-right buffer pointer */
2201 struct xfs_btree_block *rrblock; /* right-right btree block */
2202 int lrecs;
2203 int rrecs;
2204 int src_index;
2205 int error; /* error return value */
2206#ifdef DEBUG
2207 int i;
2208#endif
2209
2210 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2211 XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2212
2213 XFS_BTREE_STATS_INC(cur, split);
2214
2215 /* Set up left block (current one). */
2216 left = xfs_btree_get_block(cur, level, &lbp);
2217
2218#ifdef DEBUG
2219 error = xfs_btree_check_block(cur, left, level, lbp);
2220 if (error)
2221 goto error0;
2222#endif
2223
2224 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2225
2226 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2227 error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
2228 if (error)
2229 goto error0;
2230 if (*stat == 0)
2231 goto out0;
2232 XFS_BTREE_STATS_INC(cur, alloc);
2233
2234 /* Set up the new block as "right". */
2235 error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2236 if (error)
2237 goto error0;
2238
2239 /* Fill in the btree header for the new right block. */
2240 xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
2241
2242 /*
2243 * Split the entries between the old and the new block evenly.
2244	 * If there's an odd number of entries now, the extra one goes to
2245	 * the block the cursor is not in, so the pending insert evens them out.
2246 */
2247 lrecs = xfs_btree_get_numrecs(left);
2248 rrecs = lrecs / 2;
2249 if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2250 rrecs++;
2251 src_index = (lrecs - rrecs + 1);
2252
2253 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2254
2255 /*
2256 * Copy btree block entries from the left block over to the
2257 * new block, the right. Update the right block and log the
2258 * changes.
2259 */
2260 if (level > 0) {
2261 /* It's a non-leaf. Move keys and pointers. */
2262 union xfs_btree_key *lkp; /* left btree key */
2263 union xfs_btree_ptr *lpp; /* left address pointer */
2264 union xfs_btree_key *rkp; /* right btree key */
2265 union xfs_btree_ptr *rpp; /* right address pointer */
2266
2267 lkp = xfs_btree_key_addr(cur, src_index, left);
2268 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2269 rkp = xfs_btree_key_addr(cur, 1, right);
2270 rpp = xfs_btree_ptr_addr(cur, 1, right);
2271
2272#ifdef DEBUG
2273		for (i = 0; i < rrecs; i++) {
2274 error = xfs_btree_check_ptr(cur, lpp, i, level);
2275 if (error)
2276 goto error0;
2277 }
2278#endif
2279
2280 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2281 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2282
2283 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2284 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2285
2286 /* Grab the keys to the entries moved to the right block */
2287 xfs_btree_copy_keys(cur, key, rkp, 1);
2288 } else {
2289 /* It's a leaf. Move records. */
2290 union xfs_btree_rec *lrp; /* left record pointer */
2291 union xfs_btree_rec *rrp; /* right record pointer */
2292
2293 lrp = xfs_btree_rec_addr(cur, src_index, left);
2294 rrp = xfs_btree_rec_addr(cur, 1, right);
2295
2296 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2297 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2298
2299 cur->bc_ops->init_key_from_rec(key,
2300 xfs_btree_rec_addr(cur, 1, right));
2301 }
2302
2303
2304 /*
2305 * Find the left block number by looking in the buffer.
2306 * Adjust numrecs, sibling pointers.
2307 */
2308 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2309 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2310 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2311 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2312
2313 lrecs -= rrecs;
2314 xfs_btree_set_numrecs(left, lrecs);
2315 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2316
2317 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2318 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2319
2320 /*
2321 * If there's a block to the new block's right, make that block
2322 * point back to right instead of to left.
2323 */
2324 if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2325 error = xfs_btree_read_buf_block(cur, &rrptr, level,
2326 0, &rrblock, &rrbp);
2327 if (error)
2328 goto error0;
2329 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2330 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2331 }
2332 /*
2333 * If the cursor is really in the right block, move it there.
2334 * If it's just pointing past the last entry in left, then we'll
2335 * insert there, so don't change anything in that case.
2336 */
2337 if (cur->bc_ptrs[level] > lrecs + 1) {
2338 xfs_btree_setbuf(cur, level, rbp);
2339 cur->bc_ptrs[level] -= lrecs;
2340 }
2341 /*
2342 * If there are more levels, we'll need another cursor which refers
2343	 * to the right block, no matter where this cursor was.
2344 */
2345 if (level + 1 < cur->bc_nlevels) {
2346 error = xfs_btree_dup_cursor(cur, curp);
2347 if (error)
2348 goto error0;
2349 (*curp)->bc_ptrs[level + 1]++;
2350 }
2351 *ptrp = rptr;
2352 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2353 *stat = 1;
2354 return 0;
2355out0:
2356 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2357 *stat = 0;
2358 return 0;
2359
2360error0:
2361 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2362 return error;
2363}
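/*
 * Editor's note -- worked example, not part of the original source:
 * with lrecs = 7 and the cursor at entry 3, rrecs starts as 7 / 2 = 3;
 * lrecs is odd and bc_ptrs[level] <= rrecs + 1, so rrecs becomes 4 and
 * src_index = 7 - 4 + 1 = 4. Entries 4..7 move to the new right block,
 * the left block keeps entries 1..3, and the cursor stays put because
 * 3 is not greater than the new lrecs + 1 = 4. The pending insert then
 * brings both blocks to four entries each.
 */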
2364
2365/*
2366 * Copy the old inode root contents into a real block and make the
2367 * broot point to it.
2368 */
2369int /* error */
2370xfs_btree_new_iroot(
2371 struct xfs_btree_cur *cur, /* btree cursor */
2372 int *logflags, /* logging flags for inode */
2373 int *stat) /* return status - 0 fail */
2374{
2375 struct xfs_buf *cbp; /* buffer for cblock */
2376 struct xfs_btree_block *block; /* btree block */
2377 struct xfs_btree_block *cblock; /* child btree block */
2378 union xfs_btree_key *ckp; /* child key pointer */
2379 union xfs_btree_ptr *cpp; /* child ptr pointer */
2380 union xfs_btree_key *kp; /* pointer to btree key */
2381 union xfs_btree_ptr *pp; /* pointer to block addr */
2382 union xfs_btree_ptr nptr; /* new block addr */
2383 int level; /* btree level */
2384 int error; /* error return code */
2385#ifdef DEBUG
2386 int i; /* loop counter */
2387#endif
2388
2389 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2390 XFS_BTREE_STATS_INC(cur, newroot);
2391
2392 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2393
2394 level = cur->bc_nlevels - 1;
2395
2396 block = xfs_btree_get_iroot(cur);
2397 pp = xfs_btree_ptr_addr(cur, 1, block);
2398
2399 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2400 error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
2401 if (error)
2402 goto error0;
2403 if (*stat == 0) {
2404 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2405 return 0;
2406 }
2407 XFS_BTREE_STATS_INC(cur, alloc);
2408
2409 /* Copy the root into a real block. */
2410 error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2411 if (error)
2412 goto error0;
2413
2414 memcpy(cblock, block, xfs_btree_block_len(cur));
2415
2416 be16_add_cpu(&block->bb_level, 1);
2417 xfs_btree_set_numrecs(block, 1);
2418 cur->bc_nlevels++;
2419 cur->bc_ptrs[level + 1] = 1;
2420
2421 kp = xfs_btree_key_addr(cur, 1, block);
2422 ckp = xfs_btree_key_addr(cur, 1, cblock);
2423 xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2424
2425 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2426#ifdef DEBUG
2427 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2428 error = xfs_btree_check_ptr(cur, pp, i, level);
2429 if (error)
2430 goto error0;
2431 }
2432#endif
2433 xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2434
2435#ifdef DEBUG
2436 error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2437 if (error)
2438 goto error0;
2439#endif
2440 xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2441
2442 xfs_iroot_realloc(cur->bc_private.b.ip,
2443 1 - xfs_btree_get_numrecs(cblock),
2444 cur->bc_private.b.whichfork);
2445
2446 xfs_btree_setbuf(cur, level, cbp);
2447
2448 /*
2449 * Do all this logging at the end so that
2450 * the root is at the right level.
2451 */
2452 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2453 xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2454 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2455
2456 *logflags |=
2457 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
2458 *stat = 1;
2459 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2460 return 0;
2461error0:
2462 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2463 return error;
2464}
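/*
 * Editor's note, not part of the original source: this is the inode-rooted
 * counterpart of xfs_btree_new_root() below. The old root kept inside the
 * inode fork is copied into a newly allocated block (cblock), and the
 * in-inode root shrinks to a single key/ptr pair pointing at it -- hence
 * bb_level is bumped and numrecs is forced to 1.
 */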
2465
2466/*
2467 * Allocate a new root block, fill it in.
2468 */
2469STATIC int /* error */
2470xfs_btree_new_root(
2471 struct xfs_btree_cur *cur, /* btree cursor */
2472 int *stat) /* success/failure */
2473{
2474 struct xfs_btree_block *block; /* one half of the old root block */
2475 struct xfs_buf *bp; /* buffer containing block */
2476 int error; /* error return value */
2477 struct xfs_buf *lbp; /* left buffer pointer */
2478 struct xfs_btree_block *left; /* left btree block */
2479 struct xfs_buf *nbp; /* new (root) buffer */
2480 struct xfs_btree_block *new; /* new (root) btree block */
2481 int nptr; /* new value for key index, 1 or 2 */
2482 struct xfs_buf *rbp; /* right buffer pointer */
2483 struct xfs_btree_block *right; /* right btree block */
2484 union xfs_btree_ptr rptr;
2485 union xfs_btree_ptr lptr;
2486
2487 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2488 XFS_BTREE_STATS_INC(cur, newroot);
2489
2490 /* initialise our start point from the cursor */
2491 cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2492
2493 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2494 error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
2495 if (error)
2496 goto error0;
2497 if (*stat == 0)
2498 goto out0;
2499 XFS_BTREE_STATS_INC(cur, alloc);
2500
2501 /* Set up the new block. */
2502 error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2503 if (error)
2504 goto error0;
2505
2506 /* Set the root in the holding structure increasing the level by 1. */
2507 cur->bc_ops->set_root(cur, &lptr, 1);
2508
2509 /*
2510 * At the previous root level there are now two blocks: the old root,
2511 * and the new block generated when it was split. We don't know which
2512 * one the cursor is pointing at, so we set up variables "left" and
2513 * "right" for each case.
2514 */
2515 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2516
2517#ifdef DEBUG
2518 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2519 if (error)
2520 goto error0;
2521#endif
2522
2523 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2524 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2525 /* Our block is left, pick up the right block. */
2526 lbp = bp;
2527 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2528 left = block;
2529 error = xfs_btree_read_buf_block(cur, &rptr,
2530 cur->bc_nlevels - 1, 0, &right, &rbp);
2531 if (error)
2532 goto error0;
2533 bp = rbp;
2534 nptr = 1;
2535 } else {
2536 /* Our block is right, pick up the left block. */
2537 rbp = bp;
2538 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2539 right = block;
2540 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2541 error = xfs_btree_read_buf_block(cur, &lptr,
2542 cur->bc_nlevels - 1, 0, &left, &lbp);
2543 if (error)
2544 goto error0;
2545 bp = lbp;
2546 nptr = 2;
2547 }
2548 /* Fill in the new block's btree header and log it. */
2549 xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
2550 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2551 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2552 !xfs_btree_ptr_is_null(cur, &rptr));
2553
2554 /* Fill in the key data in the new root. */
2555 if (xfs_btree_get_level(left) > 0) {
2556 xfs_btree_copy_keys(cur,
2557 xfs_btree_key_addr(cur, 1, new),
2558 xfs_btree_key_addr(cur, 1, left), 1);
2559 xfs_btree_copy_keys(cur,
2560 xfs_btree_key_addr(cur, 2, new),
2561 xfs_btree_key_addr(cur, 1, right), 1);
2562 } else {
2563 cur->bc_ops->init_key_from_rec(
2564 xfs_btree_key_addr(cur, 1, new),
2565 xfs_btree_rec_addr(cur, 1, left));
2566 cur->bc_ops->init_key_from_rec(
2567 xfs_btree_key_addr(cur, 2, new),
2568 xfs_btree_rec_addr(cur, 1, right));
2569 }
2570 xfs_btree_log_keys(cur, nbp, 1, 2);
2571
2572 /* Fill in the pointer data in the new root. */
2573 xfs_btree_copy_ptrs(cur,
2574 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2575 xfs_btree_copy_ptrs(cur,
2576 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2577 xfs_btree_log_ptrs(cur, nbp, 1, 2);
2578
2579 /* Fix up the cursor. */
2580 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2581 cur->bc_ptrs[cur->bc_nlevels] = nptr;
2582 cur->bc_nlevels++;
2583 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2584 *stat = 1;
2585 return 0;
2586error0:
2587 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2588 return error;
2589out0:
2590 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2591 *stat = 0;
2592 return 0;
2593}
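/*
 * Editor's note, not part of the original source: nptr records which of
 * the two children the cursor had been in (1 = left, 2 = right), so the
 * final fixup leaves cur->bc_ptrs at the new top level pointing back at
 * the child block the caller was working in.
 */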
2594
2595STATIC int
2596xfs_btree_make_block_unfull(
2597 struct xfs_btree_cur *cur, /* btree cursor */
2598 int level, /* btree level */
2599 int numrecs,/* # of recs in block */
2600 int *oindex,/* old tree index */
2601 int *index, /* new tree index */
2602 union xfs_btree_ptr *nptr, /* new btree ptr */
2603 struct xfs_btree_cur **ncur, /* new btree cursor */
2604 union xfs_btree_rec *nrec, /* new record */
2605 int *stat)
2606{
2607 union xfs_btree_key key; /* new btree key value */
2608 int error = 0;
2609
2610 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2611 level == cur->bc_nlevels - 1) {
2612 struct xfs_inode *ip = cur->bc_private.b.ip;
2613
2614 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2615 /* A root block that can be made bigger. */
2616
2617 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2618 } else {
2619 /* A root block that needs replacing */
2620 int logflags = 0;
2621
2622 error = xfs_btree_new_iroot(cur, &logflags, stat);
2623 if (error || *stat == 0)
2624 return error;
2625
2626 xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2627 }
2628
2629 return 0;
2630 }
2631
2632 /* First, try shifting an entry to the right neighbor. */
2633 error = xfs_btree_rshift(cur, level, stat);
2634 if (error || *stat)
2635 return error;
2636
2637 /* Next, try shifting an entry to the left neighbor. */
2638 error = xfs_btree_lshift(cur, level, stat);
2639 if (error)
2640 return error;
2641
2642 if (*stat) {
2643 *oindex = *index = cur->bc_ptrs[level];
2644 return 0;
2645 }
2646
2647 /*
2648 * Next, try splitting the current block in half.
2649 *
2650 * If this works we have to re-set our variables because we
2651 * could be in a different block now.
2652 */
2653 error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2654 if (error || *stat == 0)
2655 return error;
2656
2657
2658 *index = cur->bc_ptrs[level];
2659 cur->bc_ops->init_rec_from_key(&key, nrec);
2660 return 0;
2661}
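/*
 * Editor's note, not part of the original source: the order above is
 * deliberate -- grow or replace an inode root, then shift right, then
 * shift left, and only then split -- because shifts rebalance without
 * allocating, while a split allocates a new block and hands back
 * nptr/ncur/nrec so the caller can insert at the next level up.
 */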
2662
2663/*
2664 * Insert one record/level. Return information to the caller
2665 * allowing the next level up to proceed if necessary.
2666 */
2667STATIC int
2668xfs_btree_insrec(
2669 struct xfs_btree_cur *cur, /* btree cursor */
2670 int level, /* level to insert record at */
2671 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
2672 union xfs_btree_rec *recp, /* i/o: record data inserted */
2673 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
2674 int *stat) /* success/failure */
2675{
2676 struct xfs_btree_block *block; /* btree block */
2677 struct xfs_buf *bp; /* buffer for block */
2678 union xfs_btree_key key; /* btree key */
2679 union xfs_btree_ptr nptr; /* new block ptr */
2680 struct xfs_btree_cur *ncur; /* new btree cursor */
2681	union xfs_btree_rec	nrec;	/* new record to insert */
2682 int optr; /* old key/record index */
2683 int ptr; /* key/record index */
2684 int numrecs;/* number of records */
2685 int error; /* error return value */
2686#ifdef DEBUG
2687 int i;
2688#endif
2689
2690 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2691 XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2692
2693 ncur = NULL;
2694
2695 /*
2696 * If we have an external root pointer, and we've made it to the
2697 * root level, allocate a new root block and we're done.
2698 */
2699 if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2700 (level >= cur->bc_nlevels)) {
2701 error = xfs_btree_new_root(cur, stat);
2702 xfs_btree_set_ptr_null(cur, ptrp);
2703
2704 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2705 return error;
2706 }
2707
2708 /* If we're off the left edge, return failure. */
2709 ptr = cur->bc_ptrs[level];
2710 if (ptr == 0) {
2711 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2712 *stat = 0;
2713 return 0;
2714 }
2715
2716 /* Make a key out of the record data to be inserted, and save it. */
2717 cur->bc_ops->init_key_from_rec(&key, recp);
2718
2719 optr = ptr;
2720
2721 XFS_BTREE_STATS_INC(cur, insrec);
2722
2723 /* Get pointers to the btree buffer and block. */
2724 block = xfs_btree_get_block(cur, level, &bp);
2725 numrecs = xfs_btree_get_numrecs(block);
2726
2727#ifdef DEBUG
2728 error = xfs_btree_check_block(cur, block, level, bp);
2729 if (error)
2730 goto error0;
2731
2732 /* Check that the new entry is being inserted in the right place. */
2733 if (ptr <= numrecs) {
2734 if (level == 0) {
2735 ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2736 xfs_btree_rec_addr(cur, ptr, block)));
2737 } else {
2738 ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2739 xfs_btree_key_addr(cur, ptr, block)));
2740 }
2741 }
2742#endif
2743
2744 /*
2745 * If the block is full, we can't insert the new entry until we
2746 * make the block un-full.
2747 */
2748 xfs_btree_set_ptr_null(cur, &nptr);
2749 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2750 error = xfs_btree_make_block_unfull(cur, level, numrecs,
2751 &optr, &ptr, &nptr, &ncur, &nrec, stat);
2752 if (error || *stat == 0)
2753 goto error0;
2754 }
2755
2756 /*
2757 * The current block may have changed if the block was
2758 * previously full and we have just made space in it.
2759 */
2760 block = xfs_btree_get_block(cur, level, &bp);
2761 numrecs = xfs_btree_get_numrecs(block);
2762
2763#ifdef DEBUG
2764 error = xfs_btree_check_block(cur, block, level, bp);
2765 if (error)
2766 return error;
2767#endif
2768
2769 /*
2770 * At this point we know there's room for our new entry in the block
2771 * we're pointing at.
2772 */
2773 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
2774
2775 if (level > 0) {
2776		/* It's a nonleaf; make a hole in the keys and ptrs */
2777 union xfs_btree_key *kp;
2778 union xfs_btree_ptr *pp;
2779
2780 kp = xfs_btree_key_addr(cur, ptr, block);
2781 pp = xfs_btree_ptr_addr(cur, ptr, block);
2782
2783#ifdef DEBUG
2784 for (i = numrecs - ptr; i >= 0; i--) {
2785 error = xfs_btree_check_ptr(cur, pp, i, level);
2786 if (error)
2787 return error;
2788 }
2789#endif
2790
2791 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
2792 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
2793
2794#ifdef DEBUG
2795 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
2796 if (error)
2797 goto error0;
2798#endif
2799
2800 /* Now put the new data in, bump numrecs and log it. */
2801 xfs_btree_copy_keys(cur, kp, &key, 1);
2802 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
2803 numrecs++;
2804 xfs_btree_set_numrecs(block, numrecs);
2805 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
2806 xfs_btree_log_keys(cur, bp, ptr, numrecs);
2807#ifdef DEBUG
2808 if (ptr < numrecs) {
2809 ASSERT(cur->bc_ops->keys_inorder(cur, kp,
2810 xfs_btree_key_addr(cur, ptr + 1, block)));
2811 }
2812#endif
2813 } else {
2814		/* It's a leaf; make a hole in the records */
2815 union xfs_btree_rec *rp;
2816
2817 rp = xfs_btree_rec_addr(cur, ptr, block);
2818
2819 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
2820
2821 /* Now put the new data in, bump numrecs and log it. */
2822 xfs_btree_copy_recs(cur, rp, recp, 1);
2823 xfs_btree_set_numrecs(block, ++numrecs);
2824 xfs_btree_log_recs(cur, bp, ptr, numrecs);
2825#ifdef DEBUG
2826 if (ptr < numrecs) {
2827 ASSERT(cur->bc_ops->recs_inorder(cur, rp,
2828 xfs_btree_rec_addr(cur, ptr + 1, block)));
2829 }
2830#endif
2831 }
2832
2833 /* Log the new number of records in the btree header. */
2834 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
2835
2836 /* If we inserted at the start of a block, update the parents' keys. */
2837 if (optr == 1) {
2838 error = xfs_btree_updkey(cur, &key, level + 1);
2839 if (error)
2840 goto error0;
2841 }
2842
2843 /*
2844 * If we are tracking the last record in the tree and
2845 * we are at the far right edge of the tree, update it.
2846 */
2847 if (xfs_btree_is_lastrec(cur, block, level)) {
2848 cur->bc_ops->update_lastrec(cur, block, recp,
2849 ptr, LASTREC_INSREC);
2850 }
2851
2852 /*
2853 * Return the new block number, if any.
2854 * If there is one, give back a record value and a cursor too.
2855 */
2856 *ptrp = nptr;
2857 if (!xfs_btree_ptr_is_null(cur, &nptr)) {
2858 *recp = nrec;
2859 *curp = ncur;
2860 }
2861
2862 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2863 *stat = 1;
2864 return 0;
2865
2866error0:
2867 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2868 return error;
2869}
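/*
 * Editor's note, not part of the original source: on return *ptrp is null
 * unless this level split, in which case ptrp/recp/curp describe the new
 * block's address, its first key expressed as a record, and a cursor the
 * caller can use to continue the insert one level up.
 */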
2870
2871/*
2872 * Insert the record at the point referenced by cur.
2873 *
2874 * A multi-level split of the tree on insert will invalidate the original
2875 * cursor. All callers of this function should assume that the cursor is
2876 * no longer valid and revalidate it.
2877 */
2878int
2879xfs_btree_insert(
2880 struct xfs_btree_cur *cur,
2881 int *stat)
2882{
2883 int error; /* error return value */
2884 int i; /* result value, 0 for failure */
2885 int level; /* current level number in btree */
2886 union xfs_btree_ptr nptr; /* new block number (split result) */
2887 struct xfs_btree_cur *ncur; /* new cursor (split result) */
2888 struct xfs_btree_cur *pcur; /* previous level's cursor */
2889 union xfs_btree_rec rec; /* record to insert */
2890
2891 level = 0;
2892 ncur = NULL;
2893 pcur = cur;
2894
2895 xfs_btree_set_ptr_null(cur, &nptr);
2896 cur->bc_ops->init_rec_from_cur(cur, &rec);
2897
2898 /*
2899 * Loop going up the tree, starting at the leaf level.
2900 * Stop when we don't get a split block, that must mean that
2901 * the insert is finished with this level.
2902 */
2903 do {
2904 /*
2905 * Insert nrec/nptr into this level of the tree.
2906 * Note if we fail, nptr will be null.
2907 */
2908 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
2909 if (error) {
2910 if (pcur != cur)
2911 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2912 goto error0;
2913 }
2914
2915 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2916 level++;
2917
2918 /*
2919		 * See if the cursor we just used should be discarded.
2920		 * We can't discard the caller's cursor, but otherwise we should
2921 * if ncur is a new cursor or we're about to be done.
2922 */
2923 if (pcur != cur &&
2924 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
2925 /* Save the state from the cursor before we trash it */
2926 if (cur->bc_ops->update_cursor)
2927 cur->bc_ops->update_cursor(pcur, cur);
2928 cur->bc_nlevels = pcur->bc_nlevels;
2929 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2930 }
2931 /* If we got a new cursor, switch to it. */
2932 if (ncur) {
2933 pcur = ncur;
2934 ncur = NULL;
2935 }
2936 } while (!xfs_btree_ptr_is_null(cur, &nptr));
2937
2938 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2939 *stat = i;
2940 return 0;
2941error0:
2942 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2943 return error;
2944}
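/*
 * Editor's note -- usage sketch, not part of the original source:
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
 *	if (!error && stat == 0)
 *		error = xfs_btree_insert(cur, &stat);
 *
 * and, per the comment above, the cursor must be revalidated afterwards
 * because a multi-level split invalidates it.
 */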
2945
2946/*
2947 * Try to merge a non-leaf block back into the inode root.
2948 *
2949 * Note: the killroot name comes from the fact that we're effectively
2950 * killing the old root block. But because we can't just delete the
2951 * inode we have to copy the single block it was pointing to into the
2952 * inode.
2953 */
2954int
2955xfs_btree_kill_iroot(
2956 struct xfs_btree_cur *cur)
2957{
2958 int whichfork = cur->bc_private.b.whichfork;
2959 struct xfs_inode *ip = cur->bc_private.b.ip;
2960 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
2961 struct xfs_btree_block *block;
2962 struct xfs_btree_block *cblock;
2963 union xfs_btree_key *kp;
2964 union xfs_btree_key *ckp;
2965 union xfs_btree_ptr *pp;
2966 union xfs_btree_ptr *cpp;
2967 struct xfs_buf *cbp;
2968 int level;
2969 int index;
2970 int numrecs;
2971#ifdef DEBUG
2972 union xfs_btree_ptr ptr;
2973 int i;
2974#endif
2975
2976 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2977
2978 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2979 ASSERT(cur->bc_nlevels > 1);
2980
2981 /*
2982	 * Don't deal with the case where the root block needs to be a leaf.
2983 * We're just going to turn the thing back into extents anyway.
2984 */
2985 level = cur->bc_nlevels - 1;
2986 if (level == 1)
2987 goto out0;
2988
2989 /*
2990 * Give up if the root has multiple children.
2991 */
2992 block = xfs_btree_get_iroot(cur);
2993 if (xfs_btree_get_numrecs(block) != 1)
2994 goto out0;
2995
2996 cblock = xfs_btree_get_block(cur, level - 1, &cbp);
2997 numrecs = xfs_btree_get_numrecs(cblock);
2998
2999 /*
3000	 * Only do this if the next level's entries will fit in the inode
3001	 * root. The child's data is then copied up into the inode, and it
3002	 * is the child block, not the root, that gets freed.
3003 */
3004 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3005 goto out0;
3006
3007 XFS_BTREE_STATS_INC(cur, killroot);
3008
3009#ifdef DEBUG
3010 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3011 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3012 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3013 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3014#endif
3015
3016 index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3017 if (index) {
3018 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3019 cur->bc_private.b.whichfork);
3020 block = ifp->if_broot;
3021 }
3022
3023 be16_add_cpu(&block->bb_numrecs, index);
3024 ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3025
3026 kp = xfs_btree_key_addr(cur, 1, block);
3027 ckp = xfs_btree_key_addr(cur, 1, cblock);
3028 xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3029
3030 pp = xfs_btree_ptr_addr(cur, 1, block);
3031 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3032#ifdef DEBUG
3033 for (i = 0; i < numrecs; i++) {
3034 int error;
3035
3036 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3037 if (error) {
3038 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3039 return error;
3040 }
3041 }
3042#endif
3043 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3044
3045 cur->bc_ops->free_block(cur, cbp);
3046 XFS_BTREE_STATS_INC(cur, free);
3047
3048 cur->bc_bufs[level - 1] = NULL;
3049 be16_add_cpu(&block->bb_level, -1);
3050 xfs_trans_log_inode(cur->bc_tp, ip,
3051 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
3052 cur->bc_nlevels--;
3053out0:
3054 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3055 return 0;
3056}
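/*
 * Editor's note, not part of the original source: the inverse of
 * xfs_btree_new_iroot() -- when the root in the inode has a single child
 * whose entries fit in the fork, the child's keys and pointers are copied
 * up, the child block is freed, and the tree loses a level.
 */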
3057
3058STATIC int
3059xfs_btree_dec_cursor(
3060 struct xfs_btree_cur *cur,
3061 int level,
3062 int *stat)
3063{
3064 int error;
3065 int i;
3066
3067 if (level > 0) {
3068 error = xfs_btree_decrement(cur, level, &i);
3069 if (error)
3070 return error;
3071 }
3072
3073 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3074 *stat = 1;
3075 return 0;
3076}
3077
3078/*
3079 * Single level of the btree record deletion routine.
3080 * Delete record pointed to by cur/level.
3081 * Remove the record from its block then rebalance the tree.
3082 * Set *stat to 0 for failure, 1 for done, 2 to go on to the next level.
3083 */
3084STATIC int /* error */
3085xfs_btree_delrec(
3086 struct xfs_btree_cur *cur, /* btree cursor */
3087 int level, /* level removing record from */
3088 int *stat) /* fail/done/go-on */
3089{
3090 struct xfs_btree_block *block; /* btree block */
3091 union xfs_btree_ptr cptr; /* current block ptr */
3092 struct xfs_buf *bp; /* buffer for block */
3093 int error; /* error return value */
3094 int i; /* loop counter */
3095 union xfs_btree_key key; /* storage for keyp */
3096 union xfs_btree_key *keyp = &key; /* passed to the next level */
3097 union xfs_btree_ptr lptr; /* left sibling block ptr */
3098 struct xfs_buf *lbp; /* left buffer pointer */
3099 struct xfs_btree_block *left; /* left btree block */
3100 int lrecs = 0; /* left record count */
3101 int ptr; /* key/record index */
3102 union xfs_btree_ptr rptr; /* right sibling block ptr */
3103 struct xfs_buf *rbp; /* right buffer pointer */
3104 struct xfs_btree_block *right; /* right btree block */
3105 struct xfs_btree_block *rrblock; /* right-right btree block */
3106 struct xfs_buf *rrbp; /* right-right buffer pointer */
3107 int rrecs = 0; /* right record count */
3108 struct xfs_btree_cur *tcur; /* temporary btree cursor */
3109 int numrecs; /* temporary numrec count */
3110
3111 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3112 XFS_BTREE_TRACE_ARGI(cur, level);
3113
3114 tcur = NULL;
3115
3116 /* Get the index of the entry being deleted, check for nothing there. */
3117 ptr = cur->bc_ptrs[level];
3118 if (ptr == 0) {
3119 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3120 *stat = 0;
3121 return 0;
3122 }
3123
3124 /* Get the buffer & block containing the record or key/ptr. */
3125 block = xfs_btree_get_block(cur, level, &bp);
3126 numrecs = xfs_btree_get_numrecs(block);
3127
3128#ifdef DEBUG
3129 error = xfs_btree_check_block(cur, block, level, bp);
3130 if (error)
3131 goto error0;
3132#endif
3133
3134 /* Fail if we're off the end of the block. */
3135 if (ptr > numrecs) {
3136 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3137 *stat = 0;
3138 return 0;
3139 }
3140
3141 XFS_BTREE_STATS_INC(cur, delrec);
3142 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3143
3144 /* Excise the entries being deleted. */
3145 if (level > 0) {
3146		/* It's a nonleaf; operate on keys and ptrs */
3147 union xfs_btree_key *lkp;
3148 union xfs_btree_ptr *lpp;
3149
3150 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3151 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3152
3153#ifdef DEBUG
3154 for (i = 0; i < numrecs - ptr; i++) {
3155 error = xfs_btree_check_ptr(cur, lpp, i, level);
3156 if (error)
3157 goto error0;
3158 }
3159#endif
3160
3161 if (ptr < numrecs) {
3162 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3163 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3164 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3165 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3166 }
3167
3168 /*
3169 * If it's the first record in the block, we'll need to pass a
3170 * key up to the next level (updkey).
3171 */
3172 if (ptr == 1)
3173 keyp = xfs_btree_key_addr(cur, 1, block);
3174 } else {
3175		/* It's a leaf; operate on records */
3176 if (ptr < numrecs) {
3177 xfs_btree_shift_recs(cur,
3178 xfs_btree_rec_addr(cur, ptr + 1, block),
3179 -1, numrecs - ptr);
3180 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3181 }
3182
3183 /*
3184 * If it's the first record in the block, we'll need a key
3185 * structure to pass up to the next level (updkey).
3186 */
3187 if (ptr == 1) {
3188 cur->bc_ops->init_key_from_rec(&key,
3189 xfs_btree_rec_addr(cur, 1, block));
3190 keyp = &key;
3191 }
3192 }
3193
3194 /*
3195 * Decrement and log the number of entries in the block.
3196 */
3197 xfs_btree_set_numrecs(block, --numrecs);
3198 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3199
3200 /*
3201 * If we are tracking the last record in the tree and
3202 * we are at the far right edge of the tree, update it.
3203 */
3204 if (xfs_btree_is_lastrec(cur, block, level)) {
3205 cur->bc_ops->update_lastrec(cur, block, NULL,
3206 ptr, LASTREC_DELREC);
3207 }
3208
3209 /*
3210 * We're at the root level. First, shrink the root block in-memory.
3211 * Try to get rid of the next level down. If we can't then there's
3212 * nothing left to do.
3213 */
3214 if (level == cur->bc_nlevels - 1) {
3215 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3216 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3217 cur->bc_private.b.whichfork);
3218
3219 error = xfs_btree_kill_iroot(cur);
3220 if (error)
3221 goto error0;
3222
3223 error = xfs_btree_dec_cursor(cur, level, stat);
3224 if (error)
3225 goto error0;
3226 *stat = 1;
3227 return 0;
3228 }
3229
3230 /*
3231 * If this is the root level, and there's only one entry left,
3232 * and it's NOT the leaf level, then we can get rid of this
3233 * level.
3234 */
3235 if (numrecs == 1 && level > 0) {
3236 union xfs_btree_ptr *pp;
3237 /*
3238 * pp is still set to the first pointer in the block.
3239 * Make it the new root of the btree.
3240 */
3241 pp = xfs_btree_ptr_addr(cur, 1, block);
3242 error = cur->bc_ops->kill_root(cur, bp, level, pp);
3243 if (error)
3244 goto error0;
3245 } else if (level > 0) {
3246 error = xfs_btree_dec_cursor(cur, level, stat);
3247 if (error)
3248 goto error0;
3249 }
3250 *stat = 1;
3251 return 0;
3252 }
3253
3254 /*
3255 * If we deleted the leftmost entry in the block, update the
3256 * key values above us in the tree.
3257 */
3258 if (ptr == 1) {
3259 error = xfs_btree_updkey(cur, keyp, level + 1);
3260 if (error)
3261 goto error0;
3262 }
3263
3264 /*
3265 * If the number of records remaining in the block is at least
3266 * the minimum, we're done.
3267 */
3268 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3269 error = xfs_btree_dec_cursor(cur, level, stat);
3270 if (error)
3271 goto error0;
3272 return 0;
3273 }
3274
3275 /*
3276 * Otherwise, we have to move some records around to keep the
3277 * tree balanced. Look at the left and right sibling blocks to
3278 * see if we can re-balance by moving only one record.
3279 */
3280 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3281 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3282
3283 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3284 /*
3285		 * The root has one child; we need a chance to copy its
3286		 * contents into the root and delete the child. We can't go up
3287		 * to the next level; there's nothing to delete there.
3288 */
3289 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3290 xfs_btree_ptr_is_null(cur, &lptr) &&
3291 level == cur->bc_nlevels - 2) {
3292 error = xfs_btree_kill_iroot(cur);
3293 if (!error)
3294 error = xfs_btree_dec_cursor(cur, level, stat);
3295 if (error)
3296 goto error0;
3297 return 0;
3298 }
3299 }
3300
3301 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3302 !xfs_btree_ptr_is_null(cur, &lptr));
3303
3304 /*
3305 * Duplicate the cursor so our btree manipulations here won't
3306 * disrupt the next level up.
3307 */
3308 error = xfs_btree_dup_cursor(cur, &tcur);
3309 if (error)
3310 goto error0;
3311
3312 /*
3313 * If there's a right sibling, see if it's ok to shift an entry
3314 * out of it.
3315 */
3316 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3317 /*
3318 * Move the temp cursor to the last entry in the next block.
3319 * Actually any entry but the first would suffice.
3320 */
3321 i = xfs_btree_lastrec(tcur, level);
3322 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3323
3324 error = xfs_btree_increment(tcur, level, &i);
3325 if (error)
3326 goto error0;
3327 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3328
3329 i = xfs_btree_lastrec(tcur, level);
3330 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3331
3332 /* Grab a pointer to the block. */
3333 right = xfs_btree_get_block(tcur, level, &rbp);
3334#ifdef DEBUG
3335 error = xfs_btree_check_block(tcur, right, level, rbp);
3336 if (error)
3337 goto error0;
3338#endif
3339 /* Grab the current block number, for future use. */
3340 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3341
3342 /*
3343 * If right block is full enough so that removing one entry
3344 * won't make it too empty, and left-shifting an entry out
3345 * of right to us works, we're done.
3346 */
3347 if (xfs_btree_get_numrecs(right) - 1 >=
3348 cur->bc_ops->get_minrecs(tcur, level)) {
3349 error = xfs_btree_lshift(tcur, level, &i);
3350 if (error)
3351 goto error0;
3352 if (i) {
3353 ASSERT(xfs_btree_get_numrecs(block) >=
3354 cur->bc_ops->get_minrecs(tcur, level));
3355
3356 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3357 tcur = NULL;
3358
3359 error = xfs_btree_dec_cursor(cur, level, stat);
3360 if (error)
3361 goto error0;
3362 return 0;
3363 }
3364 }
3365
3366 /*
3367 * Otherwise, grab the number of records in right for
3368 * future reference, and fix up the temp cursor to point
3369 * to our block again (last record).
3370 */
3371 rrecs = xfs_btree_get_numrecs(right);
3372 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3373 i = xfs_btree_firstrec(tcur, level);
3374 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3375
3376 error = xfs_btree_decrement(tcur, level, &i);
3377 if (error)
3378 goto error0;
3379 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3380 }
3381 }
3382
3383 /*
3384 * If there's a left sibling, see if it's ok to shift an entry
3385 * out of it.
3386 */
3387 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3388 /*
3389 * Move the temp cursor to the first entry in the
3390 * previous block.
3391 */
3392 i = xfs_btree_firstrec(tcur, level);
3393 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3394
3395 error = xfs_btree_decrement(tcur, level, &i);
3396 if (error)
3397 goto error0;
3398 i = xfs_btree_firstrec(tcur, level);
3399 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3400
3401 /* Grab a pointer to the block. */
3402 left = xfs_btree_get_block(tcur, level, &lbp);
3403#ifdef DEBUG
3404 error = xfs_btree_check_block(cur, left, level, lbp);
3405 if (error)
3406 goto error0;
3407#endif
3408 /* Grab the current block number, for future use. */
3409 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3410
3411 /*
3412 * If left block is full enough so that removing one entry
3413 * won't make it too empty, and right-shifting an entry out
3414 * of left to us works, we're done.
3415 */
3416 if (xfs_btree_get_numrecs(left) - 1 >=
3417 cur->bc_ops->get_minrecs(tcur, level)) {
3418 error = xfs_btree_rshift(tcur, level, &i);
3419 if (error)
3420 goto error0;
3421 if (i) {
3422 ASSERT(xfs_btree_get_numrecs(block) >=
3423 cur->bc_ops->get_minrecs(tcur, level));
3424 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3425 tcur = NULL;
3426 if (level == 0)
3427 cur->bc_ptrs[0]++;
3428 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3429 *stat = 1;
3430 return 0;
3431 }
3432 }
3433
3434 /*
3435		 * Otherwise, grab the number of records in left for
3436 * future reference.
3437 */
3438 lrecs = xfs_btree_get_numrecs(left);
3439 }
3440
3441 /* Delete the temp cursor, we're done with it. */
3442 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3443 tcur = NULL;
3444
3445 /* If here, we need to do a join to keep the tree balanced. */
3446 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3447
3448 if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3449 lrecs + xfs_btree_get_numrecs(block) <=
3450 cur->bc_ops->get_maxrecs(cur, level)) {
3451 /*
3452 * Set "right" to be the starting block,
3453 * "left" to be the left neighbor.
3454 */
3455 rptr = cptr;
3456 right = block;
3457 rbp = bp;
3458 error = xfs_btree_read_buf_block(cur, &lptr, level,
3459 0, &left, &lbp);
3460 if (error)
3461 goto error0;
3462
3463 /*
3464 * If that won't work, see if we can join with the right neighbor block.
3465 */
3466 } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3467 rrecs + xfs_btree_get_numrecs(block) <=
3468 cur->bc_ops->get_maxrecs(cur, level)) {
3469 /*
3470 * Set "left" to be the starting block,
3471 * "right" to be the right neighbor.
3472 */
3473 lptr = cptr;
3474 left = block;
3475 lbp = bp;
3476 error = xfs_btree_read_buf_block(cur, &rptr, level,
3477 0, &right, &rbp);
3478 if (error)
3479 goto error0;
3480
3481 /*
3482 * Otherwise, we can't fix the imbalance.
3483 * Just return. This is probably a logic error, but it's not fatal.
3484 */
3485 } else {
3486 error = xfs_btree_dec_cursor(cur, level, stat);
3487 if (error)
3488 goto error0;
3489 return 0;
3490 }
3491
3492 rrecs = xfs_btree_get_numrecs(right);
3493 lrecs = xfs_btree_get_numrecs(left);
3494
3495 /*
3496 * We're now going to join "left" and "right" by moving all the stuff
3497 * in "right" to "left" and deleting "right".
3498 */
3499 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3500 if (level > 0) {
3501 /* It's a non-leaf. Move keys and pointers. */
3502 union xfs_btree_key *lkp; /* left btree key */
3503 union xfs_btree_ptr *lpp; /* left address pointer */
3504 union xfs_btree_key *rkp; /* right btree key */
3505 union xfs_btree_ptr *rpp; /* right address pointer */
3506
3507 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3508 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3509 rkp = xfs_btree_key_addr(cur, 1, right);
3510 rpp = xfs_btree_ptr_addr(cur, 1, right);
3511#ifdef DEBUG
3512 for (i = 1; i < rrecs; i++) {
3513 error = xfs_btree_check_ptr(cur, rpp, i, level);
3514 if (error)
3515 goto error0;
3516 }
3517#endif
3518 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3519 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3520
3521 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3522 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3523 } else {
3524 /* It's a leaf. Move records. */
3525 union xfs_btree_rec *lrp; /* left record pointer */
3526 union xfs_btree_rec *rrp; /* right record pointer */
3527
3528 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3529 rrp = xfs_btree_rec_addr(cur, 1, right);
3530
3531 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3532 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3533 }
3534
3535 XFS_BTREE_STATS_INC(cur, join);
3536
3537 /*
3538 * Fix up the number of records and right block pointer in the
3539 * surviving block, and log it.
3540 */
3541 xfs_btree_set_numrecs(left, lrecs + rrecs);
3542	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
3543 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3544 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3545
3546 /* If there is a right sibling, point it to the remaining block. */
3547 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3548 if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3549 error = xfs_btree_read_buf_block(cur, &cptr, level,
3550 0, &rrblock, &rrbp);
3551 if (error)
3552 goto error0;
3553 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3554 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3555 }
3556
3557 /* Free the deleted block. */
3558 error = cur->bc_ops->free_block(cur, rbp);
3559 if (error)
3560 goto error0;
3561 XFS_BTREE_STATS_INC(cur, free);
3562
3563 /*
3564 * If we joined with the left neighbor, set the buffer in the
3565 * cursor to the left block, and fix up the index.
3566 */
3567 if (bp != lbp) {
3568 cur->bc_bufs[level] = lbp;
3569 cur->bc_ptrs[level] += lrecs;
3570 cur->bc_ra[level] = 0;
3571 }
3572 /*
3573 * If we joined with the right neighbor and there's a level above
3574 * us, increment the cursor at that level.
3575 */
3576 else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3577 (level + 1 < cur->bc_nlevels)) {
3578 error = xfs_btree_increment(cur, level + 1, &i);
3579 if (error)
3580 goto error0;
3581 }
3582
3583 /*
3584 * Readjust the ptr at this level if it's not a leaf, since it's
3585 * still pointing at the deletion point, which makes the cursor
3586 * inconsistent. If this makes the ptr 0, the caller fixes it up.
3587 * We can't use decrement because it would change the next level up.
3588 */
3589 if (level > 0)
3590 cur->bc_ptrs[level]--;
3591
3592 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3593 /* Return value means the next level up has something to do. */
3594 *stat = 2;
3595 return 0;
3596
3597error0:
3598 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3599 if (tcur)
3600 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3601 return error;
3602}
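/*
 * Editor's note, not part of the original source: the rebalancing above
 * tries, in order, to borrow one entry from the right sibling (lshift via
 * tcur), then from the left sibling (rshift via tcur), and only then
 * merges with whichever sibling fits under get_maxrecs(). Only the merge
 * path sets *stat = 2, telling xfs_btree_delete() to run the deletion
 * again one level up for the now-dead key/ptr.
 */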
3603
3604/*
3605 * Delete the record pointed to by cur.
3606 * The cursor refers to the place where the record was (could be inserted)
3607 * when the operation returns.
3608 */
3609int /* error */
3610xfs_btree_delete(
3611 struct xfs_btree_cur *cur,
3612 int *stat) /* success/failure */
3613{
3614 int error; /* error return value */
3615 int level;
3616 int i;
3617
3618 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3619
3620 /*
3621 * Go up the tree, starting at leaf level.
3622 *
3623 * If 2 is returned then a join was done; go to the next level.
3624 * Otherwise we are done.
3625 */
3626 for (level = 0, i = 2; i == 2; level++) {
3627 error = xfs_btree_delrec(cur, level, &i);
3628 if (error)
3629 goto error0;
3630 }
3631
3632 if (i == 0) {
3633 for (level = 1; level < cur->bc_nlevels; level++) {
3634 if (cur->bc_ptrs[level] == 0) {
3635 error = xfs_btree_decrement(cur, level, &i);
3636 if (error)
3637 goto error0;
3638 break;
3639 }
3640 }
3641 }
3642
3643 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3644 *stat = i;
3645 return 0;
3646error0:
3647 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3648 return error;
3649}
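
A rough sketch of the usual caller pattern for the delete entry point above, assuming a cursor "cur" already built by the btree-specific code; the lookup mode and out-parameters are the ones declared in this patch, error handling is abbreviated and the "out_error" label is assumed context:

	int	stat;	/* 1 = found/succeeded, 0 = not found/failed */
	int	error;

	/* position the cursor at the record to remove */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
	if (error || stat == 0)
		goto out_error;

	/* the generic code deletes the record and rebalances upward */
	error = xfs_btree_delete(cur, &stat);
	if (error || stat == 0)
		goto out_error;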
3650
3651/*
3652 * Get the data from the pointed-to record.
3653 */
3654int /* error */
3655xfs_btree_get_rec(
3656 struct xfs_btree_cur *cur, /* btree cursor */
3657 union xfs_btree_rec **recp, /* output: btree record */
3658 int *stat) /* output: success/failure */
3659{
3660 struct xfs_btree_block *block; /* btree block */
3661 struct xfs_buf *bp; /* buffer pointer */
3662 int ptr; /* record number */
3663#ifdef DEBUG
3664 int error; /* error return value */
3665#endif
3666
3667 ptr = cur->bc_ptrs[0];
3668 block = xfs_btree_get_block(cur, 0, &bp);
3669
3670#ifdef DEBUG
3671 error = xfs_btree_check_block(cur, block, 0, bp);
3672 if (error)
3673 return error;
3674#endif
3675
3676 /*
3677 * Off the right end or left end, return failure.
3678 */
3679 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3680 *stat = 0;
3681 return 0;
3682 }
3683
3684 /*
3685 * Point to the record and extract its data.
3686 */
3687 *recp = xfs_btree_rec_addr(cur, ptr, block);
3688 *stat = 1;
3689 return 0;
3690}
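
A sketch of the record-walk idiom xfs_btree_get_rec() enables, again assuming an initialized cursor; decoding the returned union is left to the btree-specific caller:

	union xfs_btree_rec	*rec;
	int			stat;
	int			error;

	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &stat);
	while (!error && stat) {
		error = xfs_btree_get_rec(cur, &rec, &stat);
		if (error || !stat)
			break;
		/* interpret *rec via the matching union member here */
		error = xfs_btree_increment(cur, 0, &stat);
	}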
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 1f528a2a3754..789fffdf8b2f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -39,39 +39,19 @@ extern kmem_zone_t *xfs_btree_cur_zone;
39#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) 39#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
40 40
41/* 41/*
42 * Short form header: space allocation btrees. 42 * Generic btree header.
43 */ 43 *
44typedef struct xfs_btree_sblock { 44 * This is a combination of the actual format used on disk for short and long
45 __be32 bb_magic; /* magic number for block type */ 45 * format btrees. The first three fields are shared by both formats, but
46 __be16 bb_level; /* 0 is a leaf */ 46 * the pointers are different and should be used with care.
47 __be16 bb_numrecs; /* current # of data records */ 47 *
48 __be32 bb_leftsib; /* left sibling block or NULLAGBLOCK */ 48 * To get the size of the actual short or long form headers please use
49 __be32 bb_rightsib; /* right sibling block or NULLAGBLOCK */ 49 * the size macros below. Never use sizeof(xfs_btree_block).
50} xfs_btree_sblock_t;
51
52/*
53 * Long form header: bmap btrees.
54 */
55typedef struct xfs_btree_lblock {
56 __be32 bb_magic; /* magic number for block type */
57 __be16 bb_level; /* 0 is a leaf */
58 __be16 bb_numrecs; /* current # of data records */
59 __be64 bb_leftsib; /* left sibling block or NULLDFSBNO */
60 __be64 bb_rightsib; /* right sibling block or NULLDFSBNO */
61} xfs_btree_lblock_t;
62
63/*
64 * Combined header and structure, used by common code.
65 */ 50 */
66typedef struct xfs_btree_hdr 51struct xfs_btree_block {
67{
68 __be32 bb_magic; /* magic number for block type */ 52 __be32 bb_magic; /* magic number for block type */
69 __be16 bb_level; /* 0 is a leaf */ 53 __be16 bb_level; /* 0 is a leaf */
70 __be16 bb_numrecs; /* current # of data records */ 54 __be16 bb_numrecs; /* current # of data records */
71} xfs_btree_hdr_t;
72
73typedef struct xfs_btree_block {
74 xfs_btree_hdr_t bb_h; /* header */
75 union { 55 union {
76 struct { 56 struct {
77 __be32 bb_leftsib; 57 __be32 bb_leftsib;
@@ -82,7 +62,36 @@ typedef struct xfs_btree_block {
82 __be64 bb_rightsib; 62 __be64 bb_rightsib;
83 } l; /* long form pointers */ 63 } l; /* long form pointers */
84 } bb_u; /* rest */ 64 } bb_u; /* rest */
85} xfs_btree_block_t; 65};
66
67#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
68#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
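
The point of these length macros: struct xfs_btree_block always contains the long-form pointer union, so sizeof() overstates a short-form header. A hypothetical helper (name invented here for illustration) would pick the header length from the cursor's XFS_BTREE_LONG_PTRS flag, defined further below:

static inline size_t
xfs_btree_block_len_sketch(struct xfs_btree_cur *cur)
{
	/* 24 bytes with 64-bit sibling pointers, 16 bytes with 32-bit */
	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
			XFS_BTREE_LBLOCK_LEN : XFS_BTREE_SBLOCK_LEN;
}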
69
70
71/*
72 * Generic key, ptr and record wrapper structures.
73 *
74 * These are disk format structures, and are converted where necessary
75 * by the btree specific code that needs to interpret them.
76 */
77union xfs_btree_ptr {
78 __be32 s; /* short form ptr */
79 __be64 l; /* long form ptr */
80};
81
82union xfs_btree_key {
83 xfs_bmbt_key_t bmbt;
84 xfs_bmdr_key_t bmbr; /* bmbt root block */
85 xfs_alloc_key_t alloc;
86 xfs_inobt_key_t inobt;
87};
88
89union xfs_btree_rec {
90 xfs_bmbt_rec_t bmbt;
91 xfs_bmdr_rec_t bmbr; /* bmbt root block */
92 xfs_alloc_rec_t alloc;
93 xfs_inobt_rec_t inobt;
94};
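
A sketch of how a btree implementation picks its member out of these unions, modeled loosely on the allocation btree (illustrative, not this patch's actual code):

STATIC void
xfs_allocbt_init_key_from_rec_sketch(
	union xfs_btree_key	*key,
	union xfs_btree_rec	*rec)
{
	/* both sides are big-endian disk format, so copy fields verbatim */
	key->alloc.ar_startblock = rec->alloc.ar_startblock;
	key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
}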
86 95
87/* 96/*
88 * For logging record fields. 97 * For logging record fields.
@@ -96,46 +105,131 @@ typedef struct xfs_btree_block {
96#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) 105#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
97 106
98/* 107/*
99 * Boolean to select which form of xfs_btree_block_t.bb_u to use.
100 */
101#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
102
103/*
104 * Magic numbers for btree blocks. 108 * Magic numbers for btree blocks.
105 */ 109 */
106extern const __uint32_t xfs_magics[]; 110extern const __uint32_t xfs_magics[];
107 111
108/* 112/*
109 * Maximum and minimum records in a btree block. 113 * Generic stats interface
110 * Given block size, type prefix, and leaf flag (0 or 1). 114 */
111 * The divisor below is equivalent to lf ? (e1) : (e2) but that produces 115#define __XFS_BTREE_STATS_INC(type, stat) \
112 * compiler warnings. 116 XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
113 */ 117#define XFS_BTREE_STATS_INC(cur, stat) \
114#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \ 118do { \
115 ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \ 119 switch (cur->bc_btnum) { \
116 (((lf) * (uint)sizeof(t ## _rec_t)) + \ 120 case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
117 ((1 - (lf)) * \ 121 case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
118 ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t)))))) 122 case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
119#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \ 123 case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
120 (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2) 124 case XFS_BTNUM_MAX: ASSERT(0); /* silence gcc */ break; \
121 125 } \
122/* 126} while (0)
123 * Record, key, and pointer address calculation macros. 127
124 * Given block size, type prefix, block pointer, and index of requested entry 128#define __XFS_BTREE_STATS_ADD(type, stat, val) \
125 * (first entry numbered 1). 129 XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
126 */ 130#define XFS_BTREE_STATS_ADD(cur, stat, val) \
127#define XFS_BTREE_REC_ADDR(t,bb,i) \ 131do { \
128 ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 132 switch (cur->bc_btnum) { \
129 ((i) - 1) * sizeof(t ## _rec_t))) 133 case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
130#define XFS_BTREE_KEY_ADDR(t,bb,i) \ 134 case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
131 ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 135 case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
132 ((i) - 1) * sizeof(t ## _key_t))) 136 case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
133#define XFS_BTREE_PTR_ADDR(t,bb,i,mxr) \ 137 case XFS_BTNUM_MAX: ASSERT(0); /* silence gcc */ break; \
134 ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 138 } \
135 (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) 139} while (0)
136 140
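
Typical call sites in the generic code look like the following; "insrec" and "moves" are existing per-btree stat names:

	XFS_BTREE_STATS_INC(cur, insrec);	/* one record inserted */
	XFS_BTREE_STATS_ADD(cur, moves, rrecs);	/* rrecs records moved */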
137#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ 141#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
138 142
143struct xfs_btree_ops {
144 /* size of the key and record structures */
145 size_t key_len;
146 size_t rec_len;
147
148 /* cursor operations */
149 struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
150 void (*update_cursor)(struct xfs_btree_cur *src,
151 struct xfs_btree_cur *dst);
152
153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158
159 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur,
161 union xfs_btree_ptr *start_bno,
162 union xfs_btree_ptr *new_bno,
163 int length, int *stat);
164 int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
165
166 /* update last record information */
167 void (*update_lastrec)(struct xfs_btree_cur *cur,
168 struct xfs_btree_block *block,
169 union xfs_btree_rec *rec,
170 int ptr, int reason);
171
172 /* records in block/level */
173 int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
174 int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
175
176 /* records on disk; matters for the root-in-inode case. */
177 int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
178
179 /* init values of btree structures */
180 void (*init_key_from_rec)(union xfs_btree_key *key,
181 union xfs_btree_rec *rec);
182 void (*init_rec_from_key)(union xfs_btree_key *key,
183 union xfs_btree_rec *rec);
184 void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
185 union xfs_btree_rec *rec);
186 void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
187 union xfs_btree_ptr *ptr);
188
189 /* difference between key value and cursor value */
190 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
191 union xfs_btree_key *key);
192
193#ifdef DEBUG
194 /* check that k1 is lower than k2 */
195 int (*keys_inorder)(struct xfs_btree_cur *cur,
196 union xfs_btree_key *k1,
197 union xfs_btree_key *k2);
198
199 /* check that r1 is lower than r2 */
200 int (*recs_inorder)(struct xfs_btree_cur *cur,
201 union xfs_btree_rec *r1,
202 union xfs_btree_rec *r2);
203#endif
204
205 /* btree tracing */
206#ifdef XFS_BTREE_TRACE
207 void (*trace_enter)(struct xfs_btree_cur *, const char *,
208 char *, int, int, __psunsigned_t,
209 __psunsigned_t, __psunsigned_t,
210 __psunsigned_t, __psunsigned_t,
211 __psunsigned_t, __psunsigned_t,
212 __psunsigned_t, __psunsigned_t,
213 __psunsigned_t, __psunsigned_t);
214 void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *,
215 __uint64_t *, __uint64_t *);
216 void (*trace_key)(struct xfs_btree_cur *,
217 union xfs_btree_key *, __uint64_t *,
218 __uint64_t *);
219 void (*trace_record)(struct xfs_btree_cur *,
220 union xfs_btree_rec *, __uint64_t *,
221 __uint64_t *, __uint64_t *);
222#endif
223};
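
A sketch of a per-btree ops table; the names below are invented and only a few hooks are shown, whereas a real table supplies every operation the generic code can call:

static const struct xfs_btree_ops xfs_examplebt_ops_sketch = {
	.key_len		= sizeof(xfs_alloc_key_t),
	.rec_len		= sizeof(xfs_alloc_rec_t),

	.dup_cursor		= xfs_examplebt_dup_cursor,	/* invented */
	.init_key_from_rec	= xfs_allocbt_init_key_from_rec_sketch,
	/* ... remaining hooks elided ... */
};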
224
225/*
226 * Reasons for the update_lastrec method to be called.
227 */
228#define LASTREC_UPDATE 0
229#define LASTREC_INSREC 1
230#define LASTREC_DELREC 2
231
232
139/* 233/*
140 * Btree cursor structure. 234 * Btree cursor structure.
141 * This collects all information needed by the btree code in one place. 235 * This collects all information needed by the btree code in one place.
@@ -144,6 +238,8 @@ typedef struct xfs_btree_cur
144{ 238{
145 struct xfs_trans *bc_tp; /* transaction we're in, if any */ 239 struct xfs_trans *bc_tp; /* transaction we're in, if any */
146 struct xfs_mount *bc_mp; /* file system mount struct */ 240 struct xfs_mount *bc_mp; /* file system mount struct */
241 const struct xfs_btree_ops *bc_ops;
242 uint bc_flags; /* btree features - below */
147 union { 243 union {
148 xfs_alloc_rec_incore_t a; 244 xfs_alloc_rec_incore_t a;
149 xfs_bmbt_irec_t b; 245 xfs_bmbt_irec_t b;
@@ -175,94 +271,40 @@ typedef struct xfs_btree_cur
175 } bc_private; /* per-btree type data */ 271 } bc_private; /* per-btree type data */
176} xfs_btree_cur_t; 272} xfs_btree_cur_t;
177 273
274/* cursor flags */
275#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
276#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
277#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
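
Generic code tests these feature bits instead of switching on the btree type, roughly like this sketch:

	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1) {
		/* root block lives in the inode fork, not in a buffer */
	}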
278
279
178#define XFS_BTREE_NOERROR 0 280#define XFS_BTREE_NOERROR 0
179#define XFS_BTREE_ERROR 1 281#define XFS_BTREE_ERROR 1
180 282
181/* 283/*
182 * Convert from buffer to btree block header. 284 * Convert from buffer to btree block header.
183 */ 285 */
184#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)XFS_BUF_PTR(bp)) 286#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp))
185#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp))
186#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp))
187 287
188 288
189#ifdef __KERNEL__
190
191#ifdef DEBUG
192/* 289/*
193 * Debug routine: check that block header is ok. 290 * Check that block header is ok.
194 */ 291 */
195void 292int
196xfs_btree_check_block( 293xfs_btree_check_block(
197 xfs_btree_cur_t *cur, /* btree cursor */ 294 struct xfs_btree_cur *cur, /* btree cursor */
198 xfs_btree_block_t *block, /* generic btree block pointer */ 295 struct xfs_btree_block *block, /* generic btree block pointer */
199 int level, /* level of the btree block */
200 struct xfs_buf *bp); /* buffer containing block, if any */
201
202/*
203 * Debug routine: check that keys are in the right order.
204 */
205void
206xfs_btree_check_key(
207 xfs_btnum_t btnum, /* btree identifier */
208 void *ak1, /* pointer to left (lower) key */
209 void *ak2); /* pointer to right (higher) key */
210
211/*
212 * Debug routine: check that records are in the right order.
213 */
214void
215xfs_btree_check_rec(
216 xfs_btnum_t btnum, /* btree identifier */
217 void *ar1, /* pointer to left (lower) record */
218 void *ar2); /* pointer to right (higher) record */
219#else
220#define xfs_btree_check_block(a,b,c,d)
221#define xfs_btree_check_key(a,b,c)
222#define xfs_btree_check_rec(a,b,c)
223#endif /* DEBUG */
224
225/*
226 * Checking routine: check that long form block header is ok.
227 */
228int /* error (0 or EFSCORRUPTED) */
229xfs_btree_check_lblock(
230 xfs_btree_cur_t *cur, /* btree cursor */
231 xfs_btree_lblock_t *block, /* btree long form block pointer */
232 int level, /* level of the btree block */ 296 int level, /* level of the btree block */
233 struct xfs_buf *bp); /* buffer containing block, if any */ 297 struct xfs_buf *bp); /* buffer containing block, if any */
234 298
235/* 299/*
236 * Checking routine: check that (long) pointer is ok. 300 * Check that (long) pointer is ok.
237 */ 301 */
238int /* error (0 or EFSCORRUPTED) */ 302int /* error (0 or EFSCORRUPTED) */
239xfs_btree_check_lptr( 303xfs_btree_check_lptr(
240 xfs_btree_cur_t *cur, /* btree cursor */ 304 struct xfs_btree_cur *cur, /* btree cursor */
241 xfs_dfsbno_t ptr, /* btree block disk address */ 305 xfs_dfsbno_t ptr, /* btree block disk address */
242 int level); /* btree block level */ 306 int level); /* btree block level */
243 307
244#define xfs_btree_check_lptr_disk(cur, ptr, level) \
245 xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
246
247/*
248 * Checking routine: check that short form block header is ok.
249 */
250int /* error (0 or EFSCORRUPTED) */
251xfs_btree_check_sblock(
252 xfs_btree_cur_t *cur, /* btree cursor */
253 xfs_btree_sblock_t *block, /* btree short form block pointer */
254 int level, /* level of the btree block */
255 struct xfs_buf *bp); /* buffer containing block */
256
257/*
258 * Checking routine: check that (short) pointer is ok.
259 */
260int /* error (0 or EFSCORRUPTED) */
261xfs_btree_check_sptr(
262 xfs_btree_cur_t *cur, /* btree cursor */
263 xfs_agblock_t ptr, /* btree block disk address */
264 int level); /* btree block level */
265
266/* 308/*
267 * Delete the btree cursor. 309 * Delete the btree cursor.
268 */ 310 */
@@ -281,15 +323,6 @@ xfs_btree_dup_cursor(
281 xfs_btree_cur_t **ncur);/* output cursor */ 323 xfs_btree_cur_t **ncur);/* output cursor */
282 324
283/* 325/*
284 * Change the cursor to point to the first record in the current block
285 * at the given level. Other levels are unaffected.
286 */
287int /* success=1, failure=0 */
288xfs_btree_firstrec(
289 xfs_btree_cur_t *cur, /* btree cursor */
290 int level); /* level to change */
291
292/*
293 * Get a buffer for the block, return it with no data read. 326 * Get a buffer for the block, return it with no data read.
294 * Long-form addressing. 327 * Long-form addressing.
295 */ 328 */
@@ -313,20 +346,6 @@ xfs_btree_get_bufs(
313 uint lock); /* lock flags for get_buf */ 346 uint lock); /* lock flags for get_buf */
314 347
315/* 348/*
316 * Allocate a new btree cursor.
317 * The cursor is either for allocation (A) or bmap (B).
318 */
319xfs_btree_cur_t * /* new btree cursor */
320xfs_btree_init_cursor(
321 struct xfs_mount *mp, /* file system mount point */
322 struct xfs_trans *tp, /* transaction pointer */
323 struct xfs_buf *agbp, /* (A only) buffer for agf structure */
324 xfs_agnumber_t agno, /* (A only) allocation group number */
325 xfs_btnum_t btnum, /* btree identifier */
326 struct xfs_inode *ip, /* (B only) inode owning the btree */
327 int whichfork); /* (B only) data/attr fork */
328
329/*
330 * Check for the cursor referring to the last block at the given level. 349 * Check for the cursor referring to the last block at the given level.
331 */ 350 */
332int /* 1=is last block, 0=not last block */ 351int /* 1=is last block, 0=not last block */
@@ -335,15 +354,6 @@ xfs_btree_islastblock(
335 int level); /* level to check */ 354 int level); /* level to check */
336 355
337/* 356/*
338 * Change the cursor to point to the last record in the current block
339 * at the given level. Other levels are unaffected.
340 */
341int /* success=1, failure=0 */
342xfs_btree_lastrec(
343 xfs_btree_cur_t *cur, /* btree cursor */
344 int level); /* level to change */
345
346/*
347 * Compute first and last byte offsets for the fields given. 357 * Compute first and last byte offsets for the fields given.
348 * Interprets the offsets table, which contains struct field offsets. 358 * Interprets the offsets table, which contains struct field offsets.
349 */ 359 */
@@ -404,39 +414,53 @@ xfs_btree_reada_bufs(
404 xfs_extlen_t count); /* count of filesystem blocks */ 414 xfs_extlen_t count); /* count of filesystem blocks */
405 415
406/* 416/*
407 * Read-ahead btree blocks, at the given level. 417 * Set the buffer for level "lev" in the cursor to bp, releasing
408 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 418 * any previous buffer.
409 */ 419 */
410int /* readahead block count */ 420void
411xfs_btree_readahead_core( 421xfs_btree_setbuf(
412 xfs_btree_cur_t *cur, /* btree cursor */ 422 xfs_btree_cur_t *cur, /* btree cursor */
413 int lev, /* level in btree */ 423 int lev, /* level in btree */
414 int lr); /* left/right bits */ 424 struct xfs_buf *bp); /* new buffer to set */
415 425
416static inline int /* readahead block count */
417xfs_btree_readahead(
418 xfs_btree_cur_t *cur, /* btree cursor */
419 int lev, /* level in btree */
420 int lr) /* left/right bits */
421{
422 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
423 return 0;
424 426
425 return xfs_btree_readahead_core(cur, lev, lr); 427/*
426} 428 * Common btree core entry points.
429 */
430int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
431int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
432int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
433int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
434int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
435int xfs_btree_kill_iroot(struct xfs_btree_cur *);
436int xfs_btree_insert(struct xfs_btree_cur *, int *);
437int xfs_btree_delete(struct xfs_btree_cur *, int *);
438int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
427 439
440/*
441 * Internal btree helpers also used by xfs_bmap.c.
442 */
443void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
444void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
428 445
429/* 446/*
430 * Set the buffer for level "lev" in the cursor to bp, releasing 447 * Helpers.
431 * any previous buffer.
432 */ 448 */
433void 449static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
434xfs_btree_setbuf( 450{
435 xfs_btree_cur_t *cur, /* btree cursor */ 451 return be16_to_cpu(block->bb_numrecs);
436 int lev, /* level in btree */ 452}
437 struct xfs_buf *bp); /* new buffer to set */ 453
454static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
455 __uint16_t numrecs)
456{
457 block->bb_numrecs = cpu_to_be16(numrecs);
458}
438 459
439#endif /* __KERNEL__ */ 460static inline int xfs_btree_get_level(struct xfs_btree_block *block)
461{
462 return be16_to_cpu(block->bb_level);
463}
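
These helpers keep the endian conversion in one place; a typical insert-path fragment might read (sketch):

	int	numrecs = xfs_btree_get_numrecs(block);

	/* the new record has been copied in; bump the on-disk count */
	xfs_btree_set_numrecs(block, numrecs + 1);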
440 464
441 465
442/* 466/*
diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c
new file mode 100644
index 000000000000..44ff942a0fda
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.c
@@ -0,0 +1,249 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_types.h"
20#include "xfs_inum.h"
21#include "xfs_bmap_btree.h"
22#include "xfs_alloc_btree.h"
23#include "xfs_ialloc_btree.h"
24#include "xfs_inode.h"
25#include "xfs_btree.h"
26#include "xfs_btree_trace.h"
27
28STATIC void
29xfs_btree_trace_ptr(
30 struct xfs_btree_cur *cur,
31 union xfs_btree_ptr ptr,
32 __psunsigned_t *high,
33 __psunsigned_t *low)
34{
35 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
36 __u64 val = be64_to_cpu(ptr.l);
37 *high = val >> 32;
38 *low = (int)val;
39 } else {
40 *high = 0;
41 *low = be32_to_cpu(ptr.s);
42 }
43}
44
45/*
46 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
47 */
48void
49xfs_btree_trace_argbi(
50 const char *func,
51 struct xfs_btree_cur *cur,
52 struct xfs_buf *b,
53 int i,
54 int line)
55{
56 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI,
57 line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0,
58 0, 0, 0, 0);
59}
60
61/*
62 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
63 */
64void
65xfs_btree_trace_argbii(
66 const char *func,
67 struct xfs_btree_cur *cur,
68 struct xfs_buf *b,
69 int i0,
70 int i1,
71 int line)
72{
73 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII,
74 line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0,
75 0, 0, 0, 0);
76}
77
78/*
79 * Add a trace buffer entry for arguments, for 3 block-length args
80 * and an integer arg.
81 */
82void
83xfs_btree_trace_argfffi(
84 const char *func,
85 struct xfs_btree_cur *cur,
86 xfs_dfiloff_t o,
87 xfs_dfsbno_t b,
88 xfs_dfilblks_t i,
89 int j,
90 int line)
91{
92 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI,
93 line,
94 o >> 32, (int)o,
95 b >> 32, (int)b,
96 i >> 32, (int)i,
97 (int)j, 0, 0, 0, 0);
98}
99
100/*
101 * Add a trace buffer entry for arguments, for one integer arg.
102 */
103void
104xfs_btree_trace_argi(
105 const char *func,
106 struct xfs_btree_cur *cur,
107 int i,
108 int line)
109{
110 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI,
111 line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
112}
113
114/*
115 * Add a trace buffer entry for arguments, for int, fsblock, key.
116 */
117void
118xfs_btree_trace_argipk(
119 const char *func,
120 struct xfs_btree_cur *cur,
121 int i,
122 union xfs_btree_ptr ptr,
123 union xfs_btree_key *key,
124 int line)
125{
126 __psunsigned_t high, low;
127 __uint64_t l0, l1;
128
129 xfs_btree_trace_ptr(cur, ptr, &high, &low);
130 cur->bc_ops->trace_key(cur, key, &l0, &l1);
131 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK,
132 line, i, high, low,
133 l0 >> 32, (int)l0,
134 l1 >> 32, (int)l1,
135 0, 0, 0, 0);
136}
137
138/*
139 * Add a trace buffer entry for arguments, for int, fsblock, rec.
140 */
141void
142xfs_btree_trace_argipr(
143 const char *func,
144 struct xfs_btree_cur *cur,
145 int i,
146 union xfs_btree_ptr ptr,
147 union xfs_btree_rec *rec,
148 int line)
149{
150 __psunsigned_t high, low;
151 __uint64_t l0, l1, l2;
152
153 xfs_btree_trace_ptr(cur, ptr, &high, &low);
154 cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
155 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR,
156 line, i,
157 high, low,
158 l0 >> 32, (int)l0,
159 l1 >> 32, (int)l1,
160 l2 >> 32, (int)l2,
161 0, 0);
162}
163
164/*
165 * Add a trace buffer entry for arguments, for int, key.
166 */
167void
168xfs_btree_trace_argik(
169 const char *func,
170 struct xfs_btree_cur *cur,
171 int i,
172 union xfs_btree_key *key,
173 int line)
174{
175 __uint64_t l0, l1;
176
177 cur->bc_ops->trace_key(cur, key, &l0, &l1);
178 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK,
179 line, i,
180 l0 >> 32, (int)l0,
181 l1 >> 32, (int)l1,
182 0, 0, 0, 0, 0, 0);
183}
184
185/*
186 * Add a trace buffer entry for arguments, for record.
187 */
188void
189xfs_btree_trace_argr(
190 const char *func,
191 struct xfs_btree_cur *cur,
192 union xfs_btree_rec *rec,
193 int line)
194{
195 __uint64_t l0, l1, l2;
196
197 cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
198 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR,
199 line,
200 l0 >> 32, (int)l0,
201 l1 >> 32, (int)l1,
202 l2 >> 32, (int)l2,
203 0, 0, 0, 0, 0);
204}
205
206/*
207 * Add a trace buffer entry for the cursor/operation.
208 */
209void
210xfs_btree_trace_cursor(
211 const char *func,
212 struct xfs_btree_cur *cur,
213 int type,
214 int line)
215{
216 __uint32_t s0;
217 __uint64_t l0, l1;
218 char *s;
219
220 switch (type) {
221 case XBT_ARGS:
222 s = "args";
223 break;
224 case XBT_ENTRY:
225 s = "entry";
226 break;
227 case XBT_ERROR:
228 s = "error";
229 break;
230 case XBT_EXIT:
231 s = "exit";
232 break;
233 default:
234 s = "unknown";
235 break;
236 }
237
238 cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1);
239 cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line,
240 s0,
241 l0 >> 32, (int)l0,
242 l1 >> 32, (int)l1,
243 (__psunsigned_t)cur->bc_bufs[0],
244 (__psunsigned_t)cur->bc_bufs[1],
245 (__psunsigned_t)cur->bc_bufs[2],
246 (__psunsigned_t)cur->bc_bufs[3],
247 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
248 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
249}
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
new file mode 100644
index 000000000000..b3f5eb3c3c6c
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BTREE_TRACE_H__
19#define __XFS_BTREE_TRACE_H__
20
21struct xfs_btree_cur;
22struct xfs_buf;
23
24
25/*
26 * Trace hooks.
27 * i,j = integer (32 bit)
28 * b = btree block buffer (xfs_buf_t)
29 * p = btree ptr
30 * r = btree record
31 * k = btree key
32 */
33
34#ifdef XFS_BTREE_TRACE
35
36/*
37 * Trace buffer entry types.
38 */
39#define XFS_BTREE_KTRACE_ARGBI 1
40#define XFS_BTREE_KTRACE_ARGBII 2
41#define XFS_BTREE_KTRACE_ARGFFFI 3
42#define XFS_BTREE_KTRACE_ARGI 4
43#define XFS_BTREE_KTRACE_ARGIPK 5
44#define XFS_BTREE_KTRACE_ARGIPR 6
45#define XFS_BTREE_KTRACE_ARGIK 7
46#define XFS_BTREE_KTRACE_ARGR 8
47#define XFS_BTREE_KTRACE_CUR 9
48
49/*
50 * Sub-types for cursor traces.
51 */
52#define XBT_ARGS 0
53#define XBT_ENTRY 1
54#define XBT_ERROR 2
55#define XBT_EXIT 3
56
57void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
58 struct xfs_buf *, int, int);
59void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
60 struct xfs_buf *, int, int, int);
61void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
62 xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
63void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
64void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
65 union xfs_btree_ptr, union xfs_btree_key *, int);
66void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int,
67 union xfs_btree_ptr, union xfs_btree_rec *, int);
68void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int,
69 union xfs_btree_key *, int);
70void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
71 union xfs_btree_rec *, int);
72void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
73
74
75#define XFS_ALLOCBT_TRACE_SIZE 4096 /* size of global trace buffer */
76extern ktrace_t *xfs_allocbt_trace_buf;
77
78#define XFS_INOBT_TRACE_SIZE 4096 /* size of global trace buffer */
79extern ktrace_t *xfs_inobt_trace_buf;
80
81#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
82#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
83extern ktrace_t *xfs_bmbt_trace_buf;
84
85
86#define XFS_BTREE_TRACE_ARGBI(c, b, i) \
87 xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
88#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \
89 xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
90#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) \
91 xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
92#define XFS_BTREE_TRACE_ARGI(c, i) \
93 xfs_btree_trace_argi(__func__, c, i, __LINE__)
94#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \
95 xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__)
96#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \
97 xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__)
98#define XFS_BTREE_TRACE_ARGIK(c, i, k) \
99 xfs_btree_trace_argik(__func__, c, i, k, __LINE__)
100#define XFS_BTREE_TRACE_ARGR(c, r) \
101 xfs_btree_trace_argr(__func__, c, r, __LINE__)
102#define XFS_BTREE_TRACE_CURSOR(c, t) \
103 xfs_btree_trace_cursor(__func__, c, t, __LINE__)
104#else
105#define XFS_BTREE_TRACE_ARGBI(c, b, i)
106#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
107#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
108#define XFS_BTREE_TRACE_ARGI(c, i)
109#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
110#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
111#define XFS_BTREE_TRACE_ARGIK(c, i, k)
112#define XFS_BTREE_TRACE_ARGR(c, r)
113#define XFS_BTREE_TRACE_CURSOR(c, t)
114#endif /* XFS_BTREE_TRACE */
115
116#endif /* __XFS_BTREE_TRACE_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 002fc2617c8e..92af4098c7e8 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -375,7 +375,7 @@ xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 375 xfs_buf_log_item_t *bip,
376 int stale) 376 int stale)
377{ 377{
378 xfs_mount_t *mp; 378 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 379 xfs_buf_t *bp;
380 int freed; 380 int freed;
381 381
@@ -387,7 +387,7 @@ xfs_buf_item_unpin(
387 xfs_buftrace("XFS_UNPIN", bp); 387 xfs_buftrace("XFS_UNPIN", bp);
388 388
389 freed = atomic_dec_and_test(&bip->bli_refcount); 389 freed = atomic_dec_and_test(&bip->bli_refcount);
390 mp = bip->bli_item.li_mountp; 390 ailp = bip->bli_item.li_ailp;
391 xfs_bunpin(bp); 391 xfs_bunpin(bp);
392 if (freed && stale) { 392 if (freed && stale) {
393 ASSERT(bip->bli_flags & XFS_BLI_STALE); 393 ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -399,17 +399,17 @@ xfs_buf_item_unpin(
399 xfs_buftrace("XFS_UNPIN STALE", bp); 399 xfs_buftrace("XFS_UNPIN STALE", bp);
400 /* 400 /*
401 * If we get called here because of an IO error, we may 401 * If we get called here because of an IO error, we may
402 * or may not have the item on the AIL. xfs_trans_delete_ail() 402 * or may not have the item on the AIL. xfs_trans_ail_delete()
403 * will take care of that situation. 403 * will take care of that situation.
404 * xfs_trans_delete_ail() drops the AIL lock. 404 * xfs_trans_ail_delete() drops the AIL lock.
405 */ 405 */
406 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 406 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
408 XFS_BUF_SET_FSPRIVATE(bp, NULL); 408 XFS_BUF_SET_FSPRIVATE(bp, NULL);
409 XFS_BUF_CLR_IODONE_FUNC(bp); 409 XFS_BUF_CLR_IODONE_FUNC(bp);
410 } else { 410 } else {
411 spin_lock(&mp->m_ail_lock); 411 spin_lock(&ailp->xa_lock);
412 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 412 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
413 xfs_buf_item_relse(bp); 413 xfs_buf_item_relse(bp);
414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); 414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
415 } 415 }
@@ -707,8 +707,8 @@ xfs_buf_item_init(
707 * the first. If we do already have one, there is 707 * the first. If we do already have one, there is
708 * nothing to do here so return. 708 * nothing to do here so return.
709 */ 709 */
710 if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) 710 if (bp->b_mount != mp)
711 XFS_BUF_SET_FSPRIVATE3(bp, mp); 711 bp->b_mount = mp;
712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -731,6 +731,7 @@ xfs_buf_item_init(
731 bip->bli_item.li_type = XFS_LI_BUF; 731 bip->bli_item.li_type = XFS_LI_BUF;
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_item.li_ailp = mp->m_ail;
734 bip->bli_buf = bp; 735 bip->bli_buf = bp;
735 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
736 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
@@ -997,21 +998,7 @@ xfs_buf_iodone_callbacks(
997 xfs_buf_do_callbacks(bp, lip); 998 xfs_buf_do_callbacks(bp, lip);
998 XFS_BUF_SET_FSPRIVATE(bp, NULL); 999 XFS_BUF_SET_FSPRIVATE(bp, NULL);
999 XFS_BUF_CLR_IODONE_FUNC(bp); 1000 XFS_BUF_CLR_IODONE_FUNC(bp);
1000 1001 xfs_biodone(bp);
1001 /*
1002 * XFS_SHUT flag gets set when we go thru the
1003 * entire buffer cache and deliberately start
1004 * throwing away delayed write buffers.
1005 * Since there's no biowait done on those,
1006 * we should just brelse them.
1007 */
1008 if (XFS_BUF_ISSHUT(bp)) {
1009 XFS_BUF_UNSHUT(bp);
1010 xfs_buf_relse(bp);
1011 } else {
1012 xfs_biodone(bp);
1013 }
1014
1015 return; 1002 return;
1016 } 1003 }
1017 1004
@@ -1122,27 +1109,23 @@ xfs_buf_iodone(
1122 xfs_buf_t *bp, 1109 xfs_buf_t *bp,
1123 xfs_buf_log_item_t *bip) 1110 xfs_buf_log_item_t *bip)
1124{ 1111{
1125 struct xfs_mount *mp; 1112 struct xfs_ail *ailp = bip->bli_item.li_ailp;
1126 1113
1127 ASSERT(bip->bli_buf == bp); 1114 ASSERT(bip->bli_buf == bp);
1128 1115
1129 xfs_buf_rele(bp); 1116 xfs_buf_rele(bp);
1130 mp = bip->bli_item.li_mountp;
1131 1117
1132 /* 1118 /*
1133 * If we are forcibly shutting down, this may well be 1119 * If we are forcibly shutting down, this may well be
1134 * off the AIL already. That's because we simulate the 1120 * off the AIL already. That's because we simulate the
1135 * log-committed callbacks to unpin these buffers. Or we may never 1121 * log-committed callbacks to unpin these buffers. Or we may never
1136 * have put this item on the AIL because the transaction was 1122 * have put this item on the AIL because the transaction was
1137 * aborted forcibly. xfs_trans_delete_ail() takes care of these. 1123 * aborted forcibly. xfs_trans_ail_delete() takes care of these.
1138 * 1124 *
1139 * Either way, AIL is useless if we're forcing a shutdown. 1125 * Either way, AIL is useless if we're forcing a shutdown.
1140 */ 1126 */
1141 spin_lock(&mp->m_ail_lock); 1127 spin_lock(&ailp->xa_lock);
1142 /* 1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1143 * xfs_trans_delete_ail() drops the AIL lock.
1144 */
1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
1146 xfs_buf_item_free(bip); 1129 xfs_buf_item_free(bip);
1147} 1130}
1148 1131
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
deleted file mode 100644
index d2ce5dd70d87..000000000000
--- a/fs/xfs/xfs_clnt.h
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CLNT_H__
19#define __XFS_CLNT_H__
20
21/*
22 * XFS arguments structure, constructed from the arguments we
23 * are passed via the mount system call.
24 *
25 * NOTE: The mount system call is handled differently between
26 * Linux and IRIX. In IRIX we worked work with a binary data
27 * structure coming in across the syscall interface from user
28 * space (the mount userspace knows about each filesystem type
29 * and the set of valid options for it, and converts the users
30 * argument string into a binary structure _before_ making the
31 * system call), and the ABI issues that this implies.
32 *
33 * In Linux, we are passed a comma separated set of options;
34 * ie. a NULL terminated string of characters. Userspace mount
35 * code does not have any knowledge of mount options expected by
36 * each filesystem type and so each filesystem parses its mount
37 * options in kernel space.
38 *
39 * For the Linux port, we kept this structure pretty much intact
40 * and use it internally (because the existing code groks it).
41 */
42struct xfs_mount_args {
43 int flags; /* flags -> see XFSMNT_... macros below */
44 int flags2; /* flags -> see XFSMNT2_... macros below */
45 int logbufs; /* Number of log buffers, -1 to default */
46 int logbufsize; /* Size of log buffers, -1 to default */
47 char fsname[MAXNAMELEN+1]; /* data device name */
48 char rtname[MAXNAMELEN+1]; /* realtime device filename */
49 char logname[MAXNAMELEN+1]; /* journal device filename */
50 char mtpt[MAXNAMELEN+1]; /* filesystem mount point */
51 int sunit; /* stripe unit (BBs) */
52 int swidth; /* stripe width (BBs), multiple of sunit */
53 uchar_t iosizelog; /* log2 of the preferred I/O size */
54 int ihashsize; /* inode hash table size (buckets) */
55};
56
57/*
58 * XFS mount option flags -- args->flags1
59 */
60#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */
61#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
62 * compatible */
63#define XFSMNT_INO64 0x00000004 /* move inode numbers up
64 * past 2^32 */
65#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */
66#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */
67#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit
68 * enforcement */
69#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit
70 * enforcement */
71#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */
72#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at
73 * stripe boundaries*/
74#define XFSMNT_RETERR 0x00000400 /* return error to user */
75#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies
76 * read-only mount */
77#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */
78#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
79#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
80 /* (osyncisdsync is default) */
81#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */
82#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
83 * bits of address space */
84#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
85#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit
86 * enforcement */
87#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */
88#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */
89#define XFSMNT_BARRIER 0x04000000 /* use write barriers */
90#define XFSMNT_IKEEP 0x08000000 /* inode cluster delete */
91#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
92 * allocation */
93#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
94 * symlink,mkdir,rmdir,mknod */
95#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */
96
97/*
98 * XFS mount option flags -- args->flags2
99 */
100#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred
101 * I/O size in stat(2) */
102#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams
103 * allocator */
104
105#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 8be0b00ede9a..70b710c1792d 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -72,27 +72,7 @@ typedef struct xfs_da_intnode {
72typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; 72typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
73typedef struct xfs_da_node_entry xfs_da_node_entry_t; 73typedef struct xfs_da_node_entry xfs_da_node_entry_t;
74 74
75#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
76
77#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize 75#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
78#define XFS_LBLOG(mp) (mp)->m_sb.sb_blocklog
79
80#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
81 (((bno) << (mp)->m_dircook_elog) | (entry))
82#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
83 (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
84#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)cookie)
85#define XFS_DA_COOKIE_BNO(mp,cookie) \
86 ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
87 (xfs_dablk_t)0 : \
88 (xfs_dablk_t)((xfs_off_t)(cookie) >> \
89 ((mp)->m_dircook_elog + 32))))
90#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
91 ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
92 (xfs_dablk_t)0 : \
93 (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
94 ((1 << (mp)->m_dircook_elog) - 1))))
95
96 76
97/*======================================================================== 77/*========================================================================
98 * Btree searching and modification structure definitions. 78 * Btree searching and modification structure definitions.
@@ -226,9 +206,8 @@ struct xfs_nameops {
226}; 206};
227 207
228 208
229#ifdef __KERNEL__
230/*======================================================================== 209/*========================================================================
231 * Function prototypes for the kernel. 210 * Function prototypes.
232 *========================================================================*/ 211 *========================================================================*/
233 212
234/* 213/*
@@ -289,6 +268,5 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
289 268
290extern struct kmem_zone *xfs_da_state_zone; 269extern struct kmem_zone *xfs_da_state_zone;
291extern struct kmem_zone *xfs_dabuf_zone; 270extern struct kmem_zone *xfs_dabuf_zone;
292#endif /* __KERNEL__ */
293 271
294#endif /* __XFS_DA_BTREE_H__ */ 272#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 75b0cd4da0ea..b4c1ee713492 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -49,9 +49,8 @@
49 */ 49 */
50int 50int
51xfs_swapext( 51xfs_swapext(
52 xfs_swapext_t __user *sxu) 52 xfs_swapext_t *sxp)
53{ 53{
54 xfs_swapext_t *sxp;
55 xfs_inode_t *ip, *tip; 54 xfs_inode_t *ip, *tip;
56 struct file *file, *target_file; 55 struct file *file, *target_file;
57 int error = 0; 56 int error = 0;
@@ -62,11 +61,6 @@ xfs_swapext(
62 goto out; 61 goto out;
63 } 62 }
64 63
65 if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) {
66 error = XFS_ERROR(EFAULT);
67 goto out_free_sxp;
68 }
69
70 /* Pull information for the target fd */ 64 /* Pull information for the target fd */
71 file = fget((int)sxp->sx_fdtarget); 65 file = fget((int)sxp->sx_fdtarget);
72 if (!file) { 66 if (!file) {
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index da178205be68..4f55a6306558 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -46,7 +46,7 @@ typedef struct xfs_swapext
46/* 46/*
47 * Syscall interface for xfs_swapext 47 * Syscall interface for xfs_swapext
48 */ 48 */
49int xfs_swapext(struct xfs_swapext __user *sx); 49int xfs_swapext(struct xfs_swapext *sx);
50 50
51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, 51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
52 struct xfs_swapext *sxp); 52 struct xfs_swapext *sxp);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index c9065eaf2a4d..162e8726df5e 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -18,32 +18,29 @@
18#ifndef __XFS_DINODE_H__ 18#ifndef __XFS_DINODE_H__
19#define __XFS_DINODE_H__ 19#define __XFS_DINODE_H__
20 20
21struct xfs_buf; 21#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
22struct xfs_mount; 22#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2))
23 23
24#define XFS_DINODE_VERSION_1 1
25#define XFS_DINODE_VERSION_2 2
26#define XFS_DINODE_GOOD_VERSION(v) \
27 (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2))
28#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
29
30/*
31 * Disk inode structure.
32 * This is just the header; the inode is expanded to fill a variable size
33 * with the last field expanding. It is split into the core and "other"
34 * because we only need the core part in the in-core inode.
35 */
36typedef struct xfs_timestamp { 24typedef struct xfs_timestamp {
37 __be32 t_sec; /* timestamp seconds */ 25 __be32 t_sec; /* timestamp seconds */
38 __be32 t_nsec; /* timestamp nanoseconds */ 26 __be32 t_nsec; /* timestamp nanoseconds */
39} xfs_timestamp_t; 27} xfs_timestamp_t;
40 28
41/* 29/*
42 * Note: Coordinate changes to this structure with the XFS_DI_* #defines 30 * On-disk inode structure.
43 * below, the offsets table in xfs_ialloc_log_di() and struct xfs_icdinode 31 *
44 * in xfs_inode.h. 32 * This is just the header or "dinode core", the inode is expanded to fill a
33 * variable size, with the leftover area split into a data and an attribute fork.
34 * The format of the data and attribute fork depends on the format of the
35 * inode as indicated by di_format and di_aformat. To access the data and
36 * attribute forks use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
37 * below.
38 *
39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian.
45 */ 42 */
46typedef struct xfs_dinode_core { 43typedef struct xfs_dinode {
47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 44 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
48 __be16 di_mode; /* mode and type of file */ 45 __be16 di_mode; /* mode and type of file */
49 __u8 di_version; /* inode version */ 46 __u8 di_version; /* inode version */
@@ -69,34 +66,12 @@ typedef struct xfs_dinode_core {
69 __be16 di_dmstate; /* DMIG state info */ 66 __be16 di_dmstate; /* DMIG state info */
70 __be16 di_flags; /* random flags, XFS_DIFLAG_... */ 67 __be16 di_flags; /* random flags, XFS_DIFLAG_... */
71 __be32 di_gen; /* generation number */ 68 __be32 di_gen; /* generation number */
72} xfs_dinode_core_t;
73 69
74#define DI_MAX_FLUSH 0xffff 70 /* di_next_unlinked is the only non-core field in the old dinode */
71 __be32 di_next_unlinked;/* agi unlinked list ptr */
72} __attribute__((packed)) xfs_dinode_t;
75 73
76typedef struct xfs_dinode 74#define DI_MAX_FLUSH 0xffff
77{
78 xfs_dinode_core_t di_core;
79 /*
80 * In adding anything between the core and the union, be
81 * sure to update the macros like XFS_LITINO below and
82 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
83 */
84 __be32 di_next_unlinked;/* agi unlinked list ptr */
85 union {
86 xfs_bmdr_block_t di_bmbt; /* btree root block */
87 xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */
88 xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */
89 char di_c[1]; /* local contents */
90 __be32 di_dev; /* device for S_IFCHR/S_IFBLK */
91 uuid_t di_muuid; /* mount point value */
92 char di_symlink[1]; /* local symbolic link */
93 } di_u;
94 union {
95 xfs_bmdr_block_t di_abmbt; /* btree root block */
96 xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */
97 xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
98 } di_a;
99} xfs_dinode_t;
100 75
101/* 76/*
102 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. 77 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
@@ -107,50 +82,14 @@ typedef struct xfs_dinode
107#define XFS_MAXLINK_1 65535U 82#define XFS_MAXLINK_1 65535U
108 83
109/* 84/*
110 * Bit names for logging disk inodes only
111 */
112#define XFS_DI_MAGIC 0x0000001
113#define XFS_DI_MODE 0x0000002
114#define XFS_DI_VERSION 0x0000004
115#define XFS_DI_FORMAT 0x0000008
116#define XFS_DI_ONLINK 0x0000010
117#define XFS_DI_UID 0x0000020
118#define XFS_DI_GID 0x0000040
119#define XFS_DI_NLINK 0x0000080
120#define XFS_DI_PROJID 0x0000100
121#define XFS_DI_PAD 0x0000200
122#define XFS_DI_ATIME 0x0000400
123#define XFS_DI_MTIME 0x0000800
124#define XFS_DI_CTIME 0x0001000
125#define XFS_DI_SIZE 0x0002000
126#define XFS_DI_NBLOCKS 0x0004000
127#define XFS_DI_EXTSIZE 0x0008000
128#define XFS_DI_NEXTENTS 0x0010000
129#define XFS_DI_NAEXTENTS 0x0020000
130#define XFS_DI_FORKOFF 0x0040000
131#define XFS_DI_AFORMAT 0x0080000
132#define XFS_DI_DMEVMASK 0x0100000
133#define XFS_DI_DMSTATE 0x0200000
134#define XFS_DI_FLAGS 0x0400000
135#define XFS_DI_GEN 0x0800000
136#define XFS_DI_NEXT_UNLINKED 0x1000000
137#define XFS_DI_U 0x2000000
138#define XFS_DI_A 0x4000000
139#define XFS_DI_NUM_BITS 27
140#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1)
141#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
142
143/*
144 * Values for di_format 85 * Values for di_format
145 */ 86 */
146typedef enum xfs_dinode_fmt 87typedef enum xfs_dinode_fmt {
147{ 88 XFS_DINODE_FMT_DEV, /* xfs_dev_t */
148 XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */ 89 XFS_DINODE_FMT_LOCAL, /* bulk data */
149 XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */ 90 XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
150 /* LNK: di_symlink */ 91 XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
151 XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */ 92 XFS_DINODE_FMT_UUID /* uuid_t */
152 XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */
153 XFS_DINODE_FMT_UUID /* MNT: di_uuid */
154} xfs_dinode_fmt_t; 93} xfs_dinode_fmt_t;
155 94
156/* 95/*
@@ -166,13 +105,13 @@ typedef enum xfs_dinode_fmt
166 */ 105 */
167#define XFS_LITINO(mp) ((mp)->m_litino) 106#define XFS_LITINO(mp) ((mp)->m_litino)
168#define XFS_BROOT_SIZE_ADJ \ 107#define XFS_BROOT_SIZE_ADJ \
169 (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t)) 108 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
170 109
171/* 110/*
172 * Inode data & attribute fork sizes, per inode. 111 * Inode data & attribute fork sizes, per inode.
173 */ 112 */
174#define XFS_DFORK_Q(dip) ((dip)->di_core.di_forkoff != 0) 113#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
175#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_core.di_forkoff << 3)) 114#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
176 115
177#define XFS_DFORK_DSIZE(dip,mp) \ 116#define XFS_DFORK_DSIZE(dip,mp) \
178 (XFS_DFORK_Q(dip) ? \ 117 (XFS_DFORK_Q(dip) ? \
@@ -187,23 +126,42 @@ typedef enum xfs_dinode_fmt
187 XFS_DFORK_DSIZE(dip, mp) : \ 126 XFS_DFORK_DSIZE(dip, mp) : \
188 XFS_DFORK_ASIZE(dip, mp)) 127 XFS_DFORK_ASIZE(dip, mp))
189 128
190#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) 129/*
130 * Return pointers to the data or attribute forks.
131 */
132#define XFS_DFORK_DPTR(dip) \
133 ((char *)(dip) + sizeof(struct xfs_dinode))
191#define XFS_DFORK_APTR(dip) \ 134#define XFS_DFORK_APTR(dip) \
192 ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip)) 135 (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
193#define XFS_DFORK_PTR(dip,w) \ 136#define XFS_DFORK_PTR(dip,w) \
194 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) 137 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
138
195#define XFS_DFORK_FORMAT(dip,w) \ 139#define XFS_DFORK_FORMAT(dip,w) \
196 ((w) == XFS_DATA_FORK ? \ 140 ((w) == XFS_DATA_FORK ? \
197 (dip)->di_core.di_format : \ 141 (dip)->di_format : \
198 (dip)->di_core.di_aformat) 142 (dip)->di_aformat)
199#define XFS_DFORK_NEXTENTS(dip,w) \ 143#define XFS_DFORK_NEXTENTS(dip,w) \
200 ((w) == XFS_DATA_FORK ? \ 144 ((w) == XFS_DATA_FORK ? \
201 be32_to_cpu((dip)->di_core.di_nextents) : \ 145 be32_to_cpu((dip)->di_nextents) : \
202 be16_to_cpu((dip)->di_core.di_anextents)) 146 be16_to_cpu((dip)->di_anextents))
203 147
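
With di_u/di_a gone, fork contents are addressed purely by offset. A sketch of reading local-format (shortform) data such as a symlink target, assuming "dip" points at a validated dinode and "buf" is large enough:

	if (dip->di_format == XFS_DINODE_FMT_LOCAL) {
		/* local data starts immediately after the dinode core */
		memcpy(buf, XFS_DFORK_DPTR(dip),
		       be64_to_cpu(dip->di_size));
	}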
204#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) 148#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp))
205 149
206/* 150/*
151 * For block and character special files the 32bit dev_t is stored at the
152 * beginning of the data fork.
153 */
154static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
155{
156 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
157}
158
159static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
160{
161 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
162}
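
Usage sketch for the new accessors, assuming a block or character special inode:

	xfs_dev_t	rdev = 0;

	if (dip->di_format == XFS_DINODE_FMT_DEV)
		rdev = xfs_dinode_get_rdev(dip);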
163
164/*
207 * Values for di_flags 165 * Values for di_flags
208 * There should be a one-to-one correspondence between these flags and the 166 * There should be a one-to-one correspondence between these flags and the
209 * XFS_XFLAG_s. 167 * XFS_XFLAG_s.
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
index deecc9d238f8..6ac44b550d39 100644
--- a/fs/xfs/xfs_dir2_sf.h
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -34,13 +34,6 @@ struct xfs_mount;
34struct xfs_trans; 34struct xfs_trans;
35 35
36/* 36/*
37 * Maximum size of a shortform directory.
38 */
39#define XFS_DIR2_SF_MAX_SIZE \
40 (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
41 (uint)sizeof(xfs_agino_t))
42
43/*
44 * Inode number stored as 8 8-bit values. 37 * Inode number stored as 8 8-bit values.
45 */ 38 */
46typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; 39typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
index a1e55fb9d5dd..e71e2581c0c3 100644
--- a/fs/xfs/xfs_dmops.c
+++ b/fs/xfs/xfs_dmops.c
@@ -25,7 +25,6 @@
25#include "xfs_inum.h" 25#include "xfs_inum.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_clnt.h"
29 28
30 29
31static struct xfs_dmops xfs_dmcore_stub = { 30static struct xfs_dmops xfs_dmcore_stub = {
@@ -38,9 +37,9 @@ static struct xfs_dmops xfs_dmcore_stub = {
38}; 37};
39 38
40int 39int
41xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) 40xfs_dmops_get(struct xfs_mount *mp)
42{ 41{
43 if (args->flags & XFSMNT_DMAPI) { 42 if (mp->m_flags & XFS_MOUNT_DMAPI) {
44 cmn_err(CE_WARN, 43 cmn_err(CE_WARN,
45 "XFS: dmapi support not available in this kernel."); 44 "XFS: dmapi support not available in this kernel.");
46 return EINVAL; 45 return EINVAL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index f227ecd1a294..92d5cd5bf4f2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -153,21 +153,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
153} 153}
154#endif /* DEBUG */ 154#endif /* DEBUG */
155 155
156static void
157xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
158{
159 if (mp != NULL) {
160 char *newfmt;
161 int len = 16 + mp->m_fsname_len + strlen(fmt);
162
163 newfmt = kmem_alloc(len, KM_SLEEP);
164 sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
165 icmn_err(level, newfmt, ap);
166 kmem_free(newfmt);
167 } else {
168 icmn_err(level, fmt, ap);
169 }
170}
171 156
172void 157void
173xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) 158xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 11543f10b0c6..0c93051c4651 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -159,11 +159,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
160 160
161struct xfs_mount; 161struct xfs_mount;
162/* PRINTFLIKE4 */ 162
163extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
164 char *fmt, va_list ap)
165 __attribute__ ((format (printf, 3, 0)));
163extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, 166extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
164 char *fmt, ...); 167 char *fmt, ...)
165/* PRINTFLIKE3 */ 168 __attribute__ ((format (printf, 4, 5)));
166extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); 169extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
170 __attribute__ ((format (printf, 3, 4)));
167 171
168extern void xfs_hex_dump(void *p, int length); 172extern void xfs_hex_dump(void *p, int length);
169 173
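The format attributes added above let the compiler type-check the variadic arguments against the format string at every call site. A small self-contained demonstration of the same annotation (hypothetical log_err(), not an XFS function):

#include <stdarg.h>
#include <stdio.h>

static void log_err(int level, const char *fmt, ...)
	__attribute__ ((format (printf, 2, 3)));

static void log_err(int level, const char *fmt, ...)
{
	va_list ap;

	fprintf(stderr, "level %d: ", level);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fputc('\n', stderr);
}

int main(void)
{
	log_err(1, "agno %d", 42);	/* OK: %d matches the int argument */
	/* log_err(1, "agno %d", "x");   would draw a -Wformat warning */
	return 0;
}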
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 8aa28f751b2a..05a4bdd4be39 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -108,19 +108,16 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
110{ 110{
111 xfs_mount_t *mp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
113 mp = efip->efi_item.li_mountp; 113 spin_lock(&ailp->xa_lock);
114 spin_lock(&mp->m_ail_lock);
115 if (efip->efi_flags & XFS_EFI_CANCELED) { 114 if (efip->efi_flags & XFS_EFI_CANCELED) {
116 /* 115 /* xfs_trans_ail_delete() drops the AIL lock. */
117 * xfs_trans_delete_ail() drops the AIL lock. 116 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
118 */
119 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
120 xfs_efi_item_free(efip); 117 xfs_efi_item_free(efip);
121 } else { 118 } else {
122 efip->efi_flags |= XFS_EFI_COMMITTED; 119 efip->efi_flags |= XFS_EFI_COMMITTED;
123 spin_unlock(&mp->m_ail_lock); 120 spin_unlock(&ailp->xa_lock);
124 } 121 }
125} 122}
126 123
@@ -134,26 +131,23 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
134STATIC void 131STATIC void
135xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) 132xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
136{ 133{
137 xfs_mount_t *mp; 134 struct xfs_ail *ailp = efip->efi_item.li_ailp;
138 xfs_log_item_desc_t *lidp; 135 xfs_log_item_desc_t *lidp;
139 136
140 mp = efip->efi_item.li_mountp; 137 spin_lock(&ailp->xa_lock);
141 spin_lock(&mp->m_ail_lock);
142 if (efip->efi_flags & XFS_EFI_CANCELED) { 138 if (efip->efi_flags & XFS_EFI_CANCELED) {
143 /* 139 /*
 144 * free the transaction descriptor pointing to this item 140 * free the transaction descriptor pointing to this item
145 */ 141 */
146 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); 142 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
147 xfs_trans_free_item(tp, lidp); 143 xfs_trans_free_item(tp, lidp);
148 /* 144
149 * pull the item off the AIL. 145 /* xfs_trans_ail_delete() drops the AIL lock. */
150 * xfs_trans_delete_ail() drops the AIL lock. 146 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
151 */
152 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
153 xfs_efi_item_free(efip); 147 xfs_efi_item_free(efip);
154 } else { 148 } else {
155 efip->efi_flags |= XFS_EFI_COMMITTED; 149 efip->efi_flags |= XFS_EFI_COMMITTED;
156 spin_unlock(&mp->m_ail_lock); 150 spin_unlock(&ailp->xa_lock);
157 } 151 }
158} 152}
159 153
@@ -268,6 +262,7 @@ xfs_efi_init(xfs_mount_t *mp,
268 efip->efi_item.li_type = XFS_LI_EFI; 262 efip->efi_item.li_type = XFS_LI_EFI;
269 efip->efi_item.li_ops = &xfs_efi_item_ops; 263 efip->efi_item.li_ops = &xfs_efi_item_ops;
270 efip->efi_item.li_mountp = mp; 264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
271 efip->efi_format.efi_nextents = nextents; 266 efip->efi_format.efi_nextents = nextents;
272 efip->efi_format.efi_id = (__psint_t)(void*)efip; 267 efip->efi_format.efi_id = (__psint_t)(void*)efip;
273 268
@@ -345,25 +340,22 @@ void
345xfs_efi_release(xfs_efi_log_item_t *efip, 340xfs_efi_release(xfs_efi_log_item_t *efip,
346 uint nextents) 341 uint nextents)
347{ 342{
348 xfs_mount_t *mp; 343 struct xfs_ail *ailp = efip->efi_item.li_ailp;
349 int extents_left; 344 int extents_left;
350 345
351 mp = efip->efi_item.li_mountp;
352 ASSERT(efip->efi_next_extent > 0); 346 ASSERT(efip->efi_next_extent > 0);
353 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); 347 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
354 348
355 spin_lock(&mp->m_ail_lock); 349 spin_lock(&ailp->xa_lock);
356 ASSERT(efip->efi_next_extent >= nextents); 350 ASSERT(efip->efi_next_extent >= nextents);
357 efip->efi_next_extent -= nextents; 351 efip->efi_next_extent -= nextents;
358 extents_left = efip->efi_next_extent; 352 extents_left = efip->efi_next_extent;
359 if (extents_left == 0) { 353 if (extents_left == 0) {
360 /* 354 /* xfs_trans_ail_delete() drops the AIL lock. */
361 * xfs_trans_delete_ail() drops the AIL lock. 355 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
362 */
363 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
364 xfs_efi_item_free(efip); 356 xfs_efi_item_free(efip);
365 } else { 357 } else {
366 spin_unlock(&mp->m_ail_lock); 358 spin_unlock(&ailp->xa_lock);
367 } 359 }
368} 360}
369 361
@@ -565,6 +557,7 @@ xfs_efd_init(xfs_mount_t *mp,
565 efdp->efd_item.li_type = XFS_LI_EFD; 557 efdp->efd_item.li_type = XFS_LI_EFD;
566 efdp->efd_item.li_ops = &xfs_efd_item_ops; 558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
567 efdp->efd_item.li_mountp = mp; 559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
568 efdp->efd_efip = efip; 561 efdp->efd_efip = efip;
569 efdp->efd_format.efd_nextents = nextents; 562 efdp->efd_format.efd_nextents = nextents;
570 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
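The pattern above, where xfs_trans_ail_delete() is documented to drop the AIL lock itself, guarantees exactly one unlock on every path. A rough userspace sketch of that hand-off (pthreads stand-in, not the XFS locking code):

#include <pthread.h>

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;

/* Like xfs_trans_ail_delete(): called with the lock held, drops it. */
static void ail_delete_locked(void)
{
	/* ... unlink the item from the AIL ... */
	pthread_mutex_unlock(&ail_lock);
}

static void item_unpin(int cancelled)
{
	pthread_mutex_lock(&ail_lock);
	if (cancelled)
		ail_delete_locked();		/* lock released inside */
	else
		pthread_mutex_unlock(&ail_lock);
}

int main(void)
{
	item_unpin(1);
	item_unpin(0);
	return 0;
}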
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 01c0cc88d3f3..f7c06fac8229 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -113,22 +113,14 @@ struct getbmapx {
113#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ 113#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ 114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ 115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
116#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC) 116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
117#ifdef __KERNEL__ 117#define BMV_IF_VALID \
 118#define BMV_IF_EXTENDED 0x40000000 /* getbmapx if set */ 118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
119#endif
120 119
 121/* bmv_oflags values - returned for each non-header segment */ 120/* bmv_oflags values - returned for each non-header segment */
122#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
123 122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
124/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */ 123#define BMV_OF_LAST 0x4 /* segment is the last in the file */
125#define GETBMAP_CONVERT(p1,p2) { \
126 p2.bmv_offset = p1.bmv_offset; \
127 p2.bmv_block = p1.bmv_block; \
128 p2.bmv_length = p1.bmv_length; \
129 p2.bmv_count = p1.bmv_count; \
130 p2.bmv_entries = p1.bmv_entries; }
131
132 124
133/* 125/*
134 * Structure for XFS_IOC_FSSETDM. 126 * Structure for XFS_IOC_FSSETDM.
@@ -426,10 +418,6 @@ typedef struct xfs_handle {
426#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS 418#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
427#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS 419#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
428#define XFS_IOC_GETVERSION FS_IOC_GETVERSION 420#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
429/* 32-bit compat counterparts */
430#define XFS_IOC32_GETXFLAGS FS_IOC32_GETFLAGS
431#define XFS_IOC32_SETXFLAGS FS_IOC32_SETFLAGS
432#define XFS_IOC32_GETVERSION FS_IOC32_GETVERSION
433 421
434/* 422/*
435 * ioctl commands that replace IRIX fcntl()'s 423 * ioctl commands that replace IRIX fcntl()'s
@@ -477,8 +465,8 @@ typedef struct xfs_handle {
477#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) 465#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
478#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 466#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
479/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 467/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
480#define XFS_IOC_FREEZE _IOWR('X', 119, int) 468/* XFS_IOC_FREEZE -- FIFREEZE 119 */
481#define XFS_IOC_THAW _IOWR('X', 120, int) 469/* XFS_IOC_THAW -- FITHAW 120 */
482#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 470#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
483#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 471#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
484#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) 472#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
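With BMV_IF_DELALLOC folded into BMV_IF_VALID, a caller passing any unknown input bit can be rejected with a single mask test. A standalone sketch of that validation (flag values mirror the header above; the checking function is illustrative):

#include <stdio.h>

#define BMV_IF_ATTRFORK		0x1
#define BMV_IF_NO_DMAPI_READ	0x2
#define BMV_IF_PREALLOC		0x4
#define BMV_IF_DELALLOC		0x8
#define BMV_IF_VALID \
	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)

static int check_iflags(unsigned int bmv_iflags)
{
	if (bmv_iflags & ~BMV_IF_VALID)
		return -1;	/* the kernel would return XFS_ERROR(EINVAL) */
	return 0;
}

int main(void)
{
	printf("%d %d\n", check_iflags(BMV_IF_DELALLOC), check_iflags(0x10));
	return 0;
}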
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 84583cf73db3..680d0e0ec932 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -126,7 +126,7 @@ xfs_growfs_data_private(
126 xfs_extlen_t agsize; 126 xfs_extlen_t agsize;
127 xfs_extlen_t tmpsize; 127 xfs_extlen_t tmpsize;
128 xfs_alloc_rec_t *arec; 128 xfs_alloc_rec_t *arec;
129 xfs_btree_sblock_t *block; 129 struct xfs_btree_block *block;
130 xfs_buf_t *bp; 130 xfs_buf_t *bp;
131 int bucket; 131 int bucket;
132 int dpct; 132 int dpct;
@@ -251,14 +251,14 @@ xfs_growfs_data_private(
251 bp = xfs_buf_get(mp->m_ddev_targp, 251 bp = xfs_buf_get(mp->m_ddev_targp,
252 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 252 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
253 BTOBB(mp->m_sb.sb_blocksize), 0); 253 BTOBB(mp->m_sb.sb_blocksize), 0);
254 block = XFS_BUF_TO_SBLOCK(bp); 254 block = XFS_BUF_TO_BLOCK(bp);
255 memset(block, 0, mp->m_sb.sb_blocksize); 255 memset(block, 0, mp->m_sb.sb_blocksize);
256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
257 block->bb_level = 0; 257 block->bb_level = 0;
258 block->bb_numrecs = cpu_to_be16(1); 258 block->bb_numrecs = cpu_to_be16(1);
259 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 259 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
260 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 260 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
261 arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); 261 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
263 arec->ar_blockcount = cpu_to_be32( 263 arec->ar_blockcount = cpu_to_be32(
264 agsize - be32_to_cpu(arec->ar_startblock)); 264 agsize - be32_to_cpu(arec->ar_startblock));
@@ -272,14 +272,14 @@ xfs_growfs_data_private(
272 bp = xfs_buf_get(mp->m_ddev_targp, 272 bp = xfs_buf_get(mp->m_ddev_targp,
273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
274 BTOBB(mp->m_sb.sb_blocksize), 0); 274 BTOBB(mp->m_sb.sb_blocksize), 0);
275 block = XFS_BUF_TO_SBLOCK(bp); 275 block = XFS_BUF_TO_BLOCK(bp);
276 memset(block, 0, mp->m_sb.sb_blocksize); 276 memset(block, 0, mp->m_sb.sb_blocksize);
277 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 277 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
278 block->bb_level = 0; 278 block->bb_level = 0;
279 block->bb_numrecs = cpu_to_be16(1); 279 block->bb_numrecs = cpu_to_be16(1);
280 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 280 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
281 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 281 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
282 arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); 282 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
283 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 283 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
284 arec->ar_blockcount = cpu_to_be32( 284 arec->ar_blockcount = cpu_to_be32(
285 agsize - be32_to_cpu(arec->ar_startblock)); 285 agsize - be32_to_cpu(arec->ar_startblock));
@@ -294,13 +294,13 @@ xfs_growfs_data_private(
294 bp = xfs_buf_get(mp->m_ddev_targp, 294 bp = xfs_buf_get(mp->m_ddev_targp,
295 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 295 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
296 BTOBB(mp->m_sb.sb_blocksize), 0); 296 BTOBB(mp->m_sb.sb_blocksize), 0);
297 block = XFS_BUF_TO_SBLOCK(bp); 297 block = XFS_BUF_TO_BLOCK(bp);
298 memset(block, 0, mp->m_sb.sb_blocksize); 298 memset(block, 0, mp->m_sb.sb_blocksize);
299 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 299 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
300 block->bb_level = 0; 300 block->bb_level = 0;
301 block->bb_numrecs = 0; 301 block->bb_numrecs = 0;
302 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 302 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
303 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 303 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
304 error = xfs_bwrite(mp, bp); 304 error = xfs_bwrite(mp, bp);
305 if (error) { 305 if (error) {
306 goto error0; 306 goto error0;
@@ -435,6 +435,9 @@ xfs_growfs_data(
435 xfs_growfs_data_t *in) 435 xfs_growfs_data_t *in)
436{ 436{
437 int error; 437 int error;
438
439 if (!capable(CAP_SYS_ADMIN))
440 return XFS_ERROR(EPERM);
438 if (!mutex_trylock(&mp->m_growlock)) 441 if (!mutex_trylock(&mp->m_growlock))
439 return XFS_ERROR(EWOULDBLOCK); 442 return XFS_ERROR(EWOULDBLOCK);
440 error = xfs_growfs_data_private(mp, in); 443 error = xfs_growfs_data_private(mp, in);
@@ -448,6 +451,9 @@ xfs_growfs_log(
448 xfs_growfs_log_t *in) 451 xfs_growfs_log_t *in)
449{ 452{
450 int error; 453 int error;
454
455 if (!capable(CAP_SYS_ADMIN))
456 return XFS_ERROR(EPERM);
451 if (!mutex_trylock(&mp->m_growlock)) 457 if (!mutex_trylock(&mp->m_growlock))
452 return XFS_ERROR(EWOULDBLOCK); 458 return XFS_ERROR(EWOULDBLOCK);
453 error = xfs_growfs_log_private(mp, in); 459 error = xfs_growfs_log_private(mp, in);
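The hunks above add the privilege check before the trylock, so an unprivileged caller is refused cheaply and never touches m_growlock. A userspace sketch of the same ordering (geteuid() standing in for capable(CAP_SYS_ADMIN), pthreads for mutex_trylock; names are illustrative):

#include <errno.h>
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t growlock = PTHREAD_MUTEX_INITIALIZER;

static int growfs_sketch(void)
{
	int error = 0;

	if (geteuid() != 0)			/* permission check first */
		return EPERM;
	if (pthread_mutex_trylock(&growlock))	/* nonzero: already held */
		return EWOULDBLOCK;
	/* ... grow the filesystem ... */
	pthread_mutex_unlock(&growlock);
	return error;
}

int main(void)
{
	return growfs_sketch();
}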
@@ -589,17 +595,19 @@ out:
589 return 0; 595 return 0;
590} 596}
591 597
592void 598int
593xfs_fs_log_dummy( 599xfs_fs_log_dummy(
594 xfs_mount_t *mp) 600 xfs_mount_t *mp)
595{ 601{
596 xfs_trans_t *tp; 602 xfs_trans_t *tp;
597 xfs_inode_t *ip; 603 xfs_inode_t *ip;
604 int error;
598 605
599 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); 606 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
600 if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { 607 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
608 if (error) {
601 xfs_trans_cancel(tp, 0); 609 xfs_trans_cancel(tp, 0);
602 return; 610 return error;
603 } 611 }
604 612
605 ip = mp->m_rootip; 613 ip = mp->m_rootip;
@@ -609,9 +617,10 @@ xfs_fs_log_dummy(
609 xfs_trans_ihold(tp, ip); 617 xfs_trans_ihold(tp, ip);
610 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 618 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
611 xfs_trans_set_sync(tp); 619 xfs_trans_set_sync(tp);
612 xfs_trans_commit(tp, 0); 620 error = xfs_trans_commit(tp, 0);
613 621
614 xfs_iunlock(ip, XFS_ILOCK_EXCL); 622 xfs_iunlock(ip, XFS_ILOCK_EXCL);
623 return error;
615} 624}
616 625
617int 626int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 300d0c9d61ad..88435e0a77c9 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern void xfs_fs_log_dummy(xfs_mount_t *mp); 28extern int xfs_fs_log_dummy(xfs_mount_t *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index aad8c5da38af..e6ebbaeb4dc6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -41,68 +41,6 @@
41#include "xfs_error.h" 41#include "xfs_error.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43 43
44/*
45 * Log specified fields for the inode given by bp and off.
46 */
47STATIC void
48xfs_ialloc_log_di(
49 xfs_trans_t *tp, /* transaction pointer */
50 xfs_buf_t *bp, /* inode buffer */
51 int off, /* index of inode in buffer */
52 int fields) /* bitmask of fields to log */
53{
54 int first; /* first byte number */
55 int ioffset; /* off in bytes */
56 int last; /* last byte number */
57 xfs_mount_t *mp; /* mount point structure */
58 static const short offsets[] = { /* field offsets */
59 /* keep in sync with bits */
60 offsetof(xfs_dinode_core_t, di_magic),
61 offsetof(xfs_dinode_core_t, di_mode),
62 offsetof(xfs_dinode_core_t, di_version),
63 offsetof(xfs_dinode_core_t, di_format),
64 offsetof(xfs_dinode_core_t, di_onlink),
65 offsetof(xfs_dinode_core_t, di_uid),
66 offsetof(xfs_dinode_core_t, di_gid),
67 offsetof(xfs_dinode_core_t, di_nlink),
68 offsetof(xfs_dinode_core_t, di_projid),
69 offsetof(xfs_dinode_core_t, di_pad),
70 offsetof(xfs_dinode_core_t, di_atime),
71 offsetof(xfs_dinode_core_t, di_mtime),
72 offsetof(xfs_dinode_core_t, di_ctime),
73 offsetof(xfs_dinode_core_t, di_size),
74 offsetof(xfs_dinode_core_t, di_nblocks),
75 offsetof(xfs_dinode_core_t, di_extsize),
76 offsetof(xfs_dinode_core_t, di_nextents),
77 offsetof(xfs_dinode_core_t, di_anextents),
78 offsetof(xfs_dinode_core_t, di_forkoff),
79 offsetof(xfs_dinode_core_t, di_aformat),
80 offsetof(xfs_dinode_core_t, di_dmevmask),
81 offsetof(xfs_dinode_core_t, di_dmstate),
82 offsetof(xfs_dinode_core_t, di_flags),
83 offsetof(xfs_dinode_core_t, di_gen),
84 offsetof(xfs_dinode_t, di_next_unlinked),
85 offsetof(xfs_dinode_t, di_u),
86 offsetof(xfs_dinode_t, di_a),
87 sizeof(xfs_dinode_t)
88 };
89
90
91 ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
92 ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
93 mp = tp->t_mountp;
94 /*
95 * Get the inode-relative first and last bytes for these fields
96 */
97 xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
98 /*
99 * Convert to buffer offsets and log it.
100 */
101 ioffset = off << mp->m_sb.sb_inodelog;
102 first += ioffset;
103 last += ioffset;
104 xfs_trans_log_buf(tp, bp, first, last);
105}
106 44
107/* 45/*
108 * Allocation group level functions. 46 * Allocation group level functions.
@@ -119,6 +57,102 @@ xfs_ialloc_cluster_alignment(
119} 57}
120 58
121/* 59/*
60 * Lookup the record equal to ino in the btree given by cur.
61 */
62STATIC int /* error */
63xfs_inobt_lookup_eq(
64 struct xfs_btree_cur *cur, /* btree cursor */
65 xfs_agino_t ino, /* starting inode of chunk */
66 __int32_t fcnt, /* free inode count */
67 xfs_inofree_t free, /* free inode mask */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = fcnt;
72 cur->bc_rec.i.ir_free = free;
73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
74}
75
76/*
77 * Lookup the first record greater than or equal to ino
78 * in the btree given by cur.
79 */
80int /* error */
81xfs_inobt_lookup_ge(
82 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_agino_t ino, /* starting inode of chunk */
84 __int32_t fcnt, /* free inode count */
85 xfs_inofree_t free, /* free inode mask */
86 int *stat) /* success/failure */
87{
88 cur->bc_rec.i.ir_startino = ino;
89 cur->bc_rec.i.ir_freecount = fcnt;
90 cur->bc_rec.i.ir_free = free;
91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
92}
93
94/*
95 * Lookup the first record less than or equal to ino
96 * in the btree given by cur.
97 */
98int /* error */
99xfs_inobt_lookup_le(
100 struct xfs_btree_cur *cur, /* btree cursor */
101 xfs_agino_t ino, /* starting inode of chunk */
102 __int32_t fcnt, /* free inode count */
103 xfs_inofree_t free, /* free inode mask */
104 int *stat) /* success/failure */
105{
106 cur->bc_rec.i.ir_startino = ino;
107 cur->bc_rec.i.ir_freecount = fcnt;
108 cur->bc_rec.i.ir_free = free;
109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
110}
111
112/*
113 * Update the record referred to by cur to the value given
114 * by [ino, fcnt, free].
115 * This either works (return 0) or gets an EFSCORRUPTED error.
116 */
117STATIC int /* error */
118xfs_inobt_update(
119 struct xfs_btree_cur *cur, /* btree cursor */
120 xfs_agino_t ino, /* starting inode of chunk */
121 __int32_t fcnt, /* free inode count */
122 xfs_inofree_t free) /* free inode mask */
123{
124 union xfs_btree_rec rec;
125
126 rec.inobt.ir_startino = cpu_to_be32(ino);
127 rec.inobt.ir_freecount = cpu_to_be32(fcnt);
128 rec.inobt.ir_free = cpu_to_be64(free);
129 return xfs_btree_update(cur, &rec);
130}
131
132/*
133 * Get the data from the pointed-to record.
134 */
135int /* error */
136xfs_inobt_get_rec(
137 struct xfs_btree_cur *cur, /* btree cursor */
138 xfs_agino_t *ino, /* output: starting inode of chunk */
139 __int32_t *fcnt, /* output: number of free inodes */
140 xfs_inofree_t *free, /* output: free inode mask */
141 int *stat) /* output: success/failure */
142{
143 union xfs_btree_rec *rec;
144 int error;
145
146 error = xfs_btree_get_rec(cur, &rec, stat);
147 if (!error && *stat == 1) {
148 *ino = be32_to_cpu(rec->inobt.ir_startino);
149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
150 *free = be64_to_cpu(rec->inobt.ir_free);
151 }
152 return error;
153}
154
155/*
122 * Allocate new inodes in the allocation group specified by agbp. 156 * Allocate new inodes in the allocation group specified by agbp.
123 * Return 0 for success, else error code. 157 * Return 0 for success, else error code.
124 */ 158 */
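All of the helpers above follow the btree convention where the return value carries only I/O or corruption errors, while the *stat out-parameter says whether a matching record was found. A self-contained mock of that convention (toy lookup over a sorted array, not the XFS cursor code):

#include <stdio.h>

/* Find the first key >= want; *stat is 1 on a hit, 0 on a clean miss. */
static int lookup_ge(const int *keys, int n, int want, int *stat)
{
	int i;

	*stat = 0;
	for (i = 0; i < n; i++) {
		if (keys[i] >= want) {
			*stat = 1;
			break;
		}
	}
	return 0;	/* no error, even when nothing matched */
}

int main(void)
{
	int keys[] = { 0, 64, 128 };
	int stat;

	if (lookup_ge(keys, 3, 200, &stat))
		return 1;		/* a real error would go here */
	printf("found=%d\n", stat);	/* prints 0: a miss, but no error */
	return 0;
}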
@@ -287,9 +321,9 @@ xfs_ialloc_ag_alloc(
287 * able to use the file system. 321 * able to use the file system.
288 */ 322 */
289 if (xfs_sb_version_hasnlink(&args.mp->m_sb)) 323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
290 version = XFS_DINODE_VERSION_2; 324 version = 2;
291 else 325 else
292 version = XFS_DINODE_VERSION_1; 326 version = 1;
293 327
294 /* 328 /*
295 * Seed the new inode cluster with a random generation number. This 329 * Seed the new inode cluster with a random generation number. This
@@ -310,18 +344,25 @@ xfs_ialloc_ag_alloc(
310 XFS_BUF_LOCK); 344 XFS_BUF_LOCK);
311 ASSERT(fbuf); 345 ASSERT(fbuf);
312 ASSERT(!XFS_BUF_GETERROR(fbuf)); 346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347
313 /* 348 /*
314 * Set initial values for the inodes in this buffer. 349 * Initialize all inodes in this buffer and then log them.
350 *
351 * XXX: It would be much better if we had just one transaction to
 352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic.
315 */ 354 */
316 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); 355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
317 for (i = 0; i < ninodes; i++) { 356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
318 free = XFS_MAKE_IPTR(args.mp, fbuf, i); 360 free = XFS_MAKE_IPTR(args.mp, fbuf, i);
319 free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
320 free->di_core.di_version = version; 362 free->di_version = version;
321 free->di_core.di_gen = cpu_to_be32(gen); 363 free->di_gen = cpu_to_be32(gen);
322 free->di_next_unlinked = cpu_to_be32(NULLAGINO); 364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
323 xfs_ialloc_log_di(tp, fbuf, i, 365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
324 XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
325 } 366 }
326 xfs_trans_inode_alloc_buf(tp, fbuf); 367 xfs_trans_inode_alloc_buf(tp, fbuf);
327 } 368 }
@@ -335,8 +376,7 @@ xfs_ialloc_ag_alloc(
335 /* 376 /*
336 * Insert records describing the new inode chunk into the btree. 377 * Insert records describing the new inode chunk into the btree.
337 */ 378 */
338 cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno, 379 cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
339 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
340 for (thisino = newino; 380 for (thisino = newino;
341 thisino < newino + newlen; 381 thisino < newino + newlen;
342 thisino += XFS_INODES_PER_CHUNK) { 382 thisino += XFS_INODES_PER_CHUNK) {
@@ -346,7 +386,7 @@ xfs_ialloc_ag_alloc(
346 return error; 386 return error;
347 } 387 }
348 ASSERT(i == 0); 388 ASSERT(i == 0);
349 if ((error = xfs_inobt_insert(cur, &i))) { 389 if ((error = xfs_btree_insert(cur, &i))) {
350 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
351 return error; 391 return error;
352 } 392 }
@@ -676,8 +716,7 @@ nextag:
676 */ 716 */
677 agno = tagno; 717 agno = tagno;
678 *IO_agbp = NULL; 718 *IO_agbp = NULL;
679 cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno), 719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
680 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
681 /* 720 /*
682 * If pagino is 0 (this is the root inode allocation) use newino. 721 * If pagino is 0 (this is the root inode allocation) use newino.
683 * This must work because we've just allocated some. 722 * This must work because we've just allocated some.
@@ -697,7 +736,7 @@ nextag:
697 goto error0; 736 goto error0;
698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
699 freecount += rec.ir_freecount; 738 freecount += rec.ir_freecount;
700 if ((error = xfs_inobt_increment(cur, 0, &i))) 739 if ((error = xfs_btree_increment(cur, 0, &i)))
701 goto error0; 740 goto error0;
702 } while (i == 1); 741 } while (i == 1);
703 742
@@ -741,7 +780,7 @@ nextag:
741 /* 780 /*
742 * Search left with tcur, back up 1 record. 781 * Search left with tcur, back up 1 record.
743 */ 782 */
744 if ((error = xfs_inobt_decrement(tcur, 0, &i))) 783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
745 goto error1; 784 goto error1;
746 doneleft = !i; 785 doneleft = !i;
747 if (!doneleft) { 786 if (!doneleft) {
@@ -755,7 +794,7 @@ nextag:
755 /* 794 /*
756 * Search right with cur, go forward 1 record. 795 * Search right with cur, go forward 1 record.
757 */ 796 */
758 if ((error = xfs_inobt_increment(cur, 0, &i))) 797 if ((error = xfs_btree_increment(cur, 0, &i)))
759 goto error1; 798 goto error1;
760 doneright = !i; 799 doneright = !i;
761 if (!doneright) { 800 if (!doneright) {
@@ -817,7 +856,7 @@ nextag:
817 * further left. 856 * further left.
818 */ 857 */
819 if (useleft) { 858 if (useleft) {
820 if ((error = xfs_inobt_decrement(tcur, 0, 859 if ((error = xfs_btree_decrement(tcur, 0,
821 &i))) 860 &i)))
822 goto error1; 861 goto error1;
823 doneleft = !i; 862 doneleft = !i;
@@ -837,7 +876,7 @@ nextag:
837 * further right. 876 * further right.
838 */ 877 */
839 else { 878 else {
840 if ((error = xfs_inobt_increment(cur, 0, 879 if ((error = xfs_btree_increment(cur, 0,
841 &i))) 880 &i)))
842 goto error1; 881 goto error1;
843 doneright = !i; 882 doneright = !i;
@@ -892,7 +931,7 @@ nextag:
892 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
893 if (rec.ir_freecount > 0) 932 if (rec.ir_freecount > 0)
894 break; 933 break;
895 if ((error = xfs_inobt_increment(cur, 0, &i))) 934 if ((error = xfs_btree_increment(cur, 0, &i)))
896 goto error0; 935 goto error0;
897 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
898 } 937 }
@@ -926,7 +965,7 @@ nextag:
926 goto error0; 965 goto error0;
927 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
928 freecount += rec.ir_freecount; 967 freecount += rec.ir_freecount;
929 if ((error = xfs_inobt_increment(cur, 0, &i))) 968 if ((error = xfs_btree_increment(cur, 0, &i)))
930 goto error0; 969 goto error0;
931 } while (i == 1); 970 } while (i == 1);
932 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || 971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
@@ -1022,8 +1061,7 @@ xfs_difree(
1022 /* 1061 /*
1023 * Initialize the cursor. 1062 * Initialize the cursor.
1024 */ 1063 */
1025 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, 1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1026 (xfs_inode_t *)0, 0);
1027#ifdef DEBUG 1065#ifdef DEBUG
1028 if (cur->bc_nlevels == 1) { 1066 if (cur->bc_nlevels == 1) {
1029 int freecount = 0; 1067 int freecount = 0;
@@ -1036,7 +1074,7 @@ xfs_difree(
1036 goto error0; 1074 goto error0;
1037 if (i) { 1075 if (i) {
1038 freecount += rec.ir_freecount; 1076 freecount += rec.ir_freecount;
1039 if ((error = xfs_inobt_increment(cur, 0, &i))) 1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1040 goto error0; 1078 goto error0;
1041 } 1079 }
1042 } while (i == 1); 1080 } while (i == 1);
@@ -1098,8 +1136,8 @@ xfs_difree(
1098 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1136 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1099 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1137 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1100 1138
1101 if ((error = xfs_inobt_delete(cur, &i))) { 1139 if ((error = xfs_btree_delete(cur, &i))) {
1102 cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n", 1140 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
1103 error, mp->m_fsname); 1141 error, mp->m_fsname);
1104 goto error0; 1142 goto error0;
1105 } 1143 }
@@ -1141,7 +1179,7 @@ xfs_difree(
1141 goto error0; 1179 goto error0;
1142 if (i) { 1180 if (i) {
1143 freecount += rec.ir_freecount; 1181 freecount += rec.ir_freecount;
1144 if ((error = xfs_inobt_increment(cur, 0, &i))) 1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1145 goto error0; 1183 goto error0;
1146 } 1184 }
1147 } while (i == 1); 1185 } while (i == 1);
@@ -1158,36 +1196,28 @@ error0:
1158} 1196}
1159 1197
1160/* 1198/*
1161 * Return the location of the inode in bno/off, for mapping it into a buffer. 1199 * Return the location of the inode in imap, for mapping it into a buffer.
1162 */ 1200 */
1163/*ARGSUSED*/
1164int 1201int
1165xfs_dilocate( 1202xfs_imap(
1166 xfs_mount_t *mp, /* file system mount structure */ 1203 xfs_mount_t *mp, /* file system mount structure */
1167 xfs_trans_t *tp, /* transaction pointer */ 1204 xfs_trans_t *tp, /* transaction pointer */
1168 xfs_ino_t ino, /* inode to locate */ 1205 xfs_ino_t ino, /* inode to locate */
1169 xfs_fsblock_t *bno, /* output: block containing inode */ 1206 struct xfs_imap *imap, /* location map structure */
1170 int *len, /* output: num blocks in inode cluster */ 1207 uint flags) /* flags for inode btree lookup */
1171 int *off, /* output: index in block of inode */
1172 uint flags) /* flags concerning inode lookup */
1173{ 1208{
1174 xfs_agblock_t agbno; /* block number of inode in the alloc group */ 1209 xfs_agblock_t agbno; /* block number of inode in the alloc group */
1175 xfs_buf_t *agbp; /* agi buffer */
1176 xfs_agino_t agino; /* inode number within alloc group */ 1210 xfs_agino_t agino; /* inode number within alloc group */
1177 xfs_agnumber_t agno; /* allocation group number */ 1211 xfs_agnumber_t agno; /* allocation group number */
1178 int blks_per_cluster; /* num blocks per inode cluster */ 1212 int blks_per_cluster; /* num blocks per inode cluster */
1179 xfs_agblock_t chunk_agbno; /* first block in inode chunk */ 1213 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
1180 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1181 __int32_t chunk_cnt; /* count of free inodes in chunk */
1182 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1183 xfs_agblock_t cluster_agbno; /* first block in inode cluster */ 1214 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1184 xfs_btree_cur_t *cur; /* inode btree cursor */
1185 int error; /* error code */ 1215 int error; /* error code */
1186 int i; /* temp state */
1187 int offset; /* index of inode in its buffer */ 1216 int offset; /* index of inode in its buffer */
1188 int offset_agbno; /* blks from chunk start to inode */ 1217 int offset_agbno; /* blks from chunk start to inode */
1189 1218
1190 ASSERT(ino != NULLFSINO); 1219 ASSERT(ino != NULLFSINO);
1220
1191 /* 1221 /*
1192 * Split up the inode number into its parts. 1222 * Split up the inode number into its parts.
1193 */ 1223 */
@@ -1198,24 +1228,24 @@ xfs_dilocate(
1198 ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1228 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1199#ifdef DEBUG 1229#ifdef DEBUG
1200 /* no diagnostics for bulkstat, ino comes from userspace */ 1230 /* no diagnostics for bulkstat, ino comes from userspace */
1201 if (flags & XFS_IMAP_BULKSTAT) 1231 if (flags & XFS_IGET_BULKSTAT)
1202 return XFS_ERROR(EINVAL); 1232 return XFS_ERROR(EINVAL);
1203 if (agno >= mp->m_sb.sb_agcount) { 1233 if (agno >= mp->m_sb.sb_agcount) {
1204 xfs_fs_cmn_err(CE_ALERT, mp, 1234 xfs_fs_cmn_err(CE_ALERT, mp,
1205 "xfs_dilocate: agno (%d) >= " 1235 "xfs_imap: agno (%d) >= "
1206 "mp->m_sb.sb_agcount (%d)", 1236 "mp->m_sb.sb_agcount (%d)",
1207 agno, mp->m_sb.sb_agcount); 1237 agno, mp->m_sb.sb_agcount);
1208 } 1238 }
1209 if (agbno >= mp->m_sb.sb_agblocks) { 1239 if (agbno >= mp->m_sb.sb_agblocks) {
1210 xfs_fs_cmn_err(CE_ALERT, mp, 1240 xfs_fs_cmn_err(CE_ALERT, mp,
1211 "xfs_dilocate: agbno (0x%llx) >= " 1241 "xfs_imap: agbno (0x%llx) >= "
1212 "mp->m_sb.sb_agblocks (0x%lx)", 1242 "mp->m_sb.sb_agblocks (0x%lx)",
1213 (unsigned long long) agbno, 1243 (unsigned long long) agbno,
1214 (unsigned long) mp->m_sb.sb_agblocks); 1244 (unsigned long) mp->m_sb.sb_agblocks);
1215 } 1245 }
1216 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1246 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1217 xfs_fs_cmn_err(CE_ALERT, mp, 1247 xfs_fs_cmn_err(CE_ALERT, mp,
1218 "xfs_dilocate: ino (0x%llx) != " 1248 "xfs_imap: ino (0x%llx) != "
1219 "XFS_AGINO_TO_INO(mp, agno, agino) " 1249 "XFS_AGINO_TO_INO(mp, agno, agino) "
1220 "(0x%llx)", 1250 "(0x%llx)",
1221 ino, XFS_AGINO_TO_INO(mp, agno, agino)); 1251 ino, XFS_AGINO_TO_INO(mp, agno, agino));
@@ -1224,65 +1254,89 @@ xfs_dilocate(
1224#endif /* DEBUG */ 1254#endif /* DEBUG */
1225 return XFS_ERROR(EINVAL); 1255 return XFS_ERROR(EINVAL);
1226 } 1256 }
1227 if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) || 1257
1228 !(flags & XFS_IMAP_LOOKUP)) { 1258 /*
 1259 * If the inode cluster size is the same as or smaller than the
 1260 * blocksize, we get to the buffer by simple arithmetic.
1261 */
1262 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
1229 offset = XFS_INO_TO_OFFSET(mp, ino); 1263 offset = XFS_INO_TO_OFFSET(mp, ino);
1230 ASSERT(offset < mp->m_sb.sb_inopblock); 1264 ASSERT(offset < mp->m_sb.sb_inopblock);
1231 *bno = XFS_AGB_TO_FSB(mp, agno, agbno); 1265
1232 *off = offset; 1266 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
1233 *len = 1; 1267 imap->im_len = XFS_FSB_TO_BB(mp, 1);
1268 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1234 return 0; 1269 return 0;
1235 } 1270 }
1271
1236 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1272 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
1237 if (*bno != NULLFSBLOCK) { 1273
1274 /*
1275 * If we get a block number passed from bulkstat we can use it to
1276 * find the buffer easily.
1277 */
1278 if (imap->im_blkno) {
1238 offset = XFS_INO_TO_OFFSET(mp, ino); 1279 offset = XFS_INO_TO_OFFSET(mp, ino);
1239 ASSERT(offset < mp->m_sb.sb_inopblock); 1280 ASSERT(offset < mp->m_sb.sb_inopblock);
1240 cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno); 1281
1241 *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1282 cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
1242 offset; 1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
1243 *len = blks_per_cluster; 1284
1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1286 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1244 return 0; 1287 return 0;
1245 } 1288 }
1289
1290 /*
 1291 * If the inode chunks are aligned then use simple arithmetic to
 1292 * find the location. Otherwise we have to do a btree
 1293 * lookup.
1294 */
1246 if (mp->m_inoalign_mask) { 1295 if (mp->m_inoalign_mask) {
1247 offset_agbno = agbno & mp->m_inoalign_mask; 1296 offset_agbno = agbno & mp->m_inoalign_mask;
1248 chunk_agbno = agbno - offset_agbno; 1297 chunk_agbno = agbno - offset_agbno;
1249 } else { 1298 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */
1305
1250 down_read(&mp->m_peraglock); 1306 down_read(&mp->m_peraglock);
1251 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1307 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1252 up_read(&mp->m_peraglock); 1308 up_read(&mp->m_peraglock);
1253 if (error) { 1309 if (error) {
1254#ifdef DEBUG 1310 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1255 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1256 "xfs_ialloc_read_agi() returned " 1311 "xfs_ialloc_read_agi() returned "
1257 "error %d, agno %d", 1312 "error %d, agno %d",
1258 error, agno); 1313 error, agno);
1259#endif /* DEBUG */
1260 return error; 1314 return error;
1261 } 1315 }
1262 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, 1316
1263 (xfs_inode_t *)0, 0); 1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1264 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
1265#ifdef DEBUG 1319 if (error) {
1266 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1267 "xfs_inobt_lookup_le() failed"); 1321 "xfs_inobt_lookup_le() failed");
1268#endif /* DEBUG */
1269 goto error0; 1322 goto error0;
1270 } 1323 }
1271 if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1324
1272 &chunk_free, &i))) { 1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
1273#ifdef DEBUG 1326 &chunk_free, &i);
1274 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1327 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1275 "xfs_inobt_get_rec() failed"); 1329 "xfs_inobt_get_rec() failed");
1276#endif /* DEBUG */
1277 goto error0; 1330 goto error0;
1278 } 1331 }
1279 if (i == 0) { 1332 if (i == 0) {
1280#ifdef DEBUG 1333#ifdef DEBUG
1281 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1334 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1282 "xfs_inobt_get_rec() failed"); 1335 "xfs_inobt_get_rec() failed");
1283#endif /* DEBUG */ 1336#endif /* DEBUG */
1284 error = XFS_ERROR(EINVAL); 1337 error = XFS_ERROR(EINVAL);
1285 } 1338 }
1339 error0:
1286 xfs_trans_brelse(tp, agbp); 1340 xfs_trans_brelse(tp, agbp);
1287 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1288 if (error) 1342 if (error)
@@ -1290,19 +1344,35 @@ xfs_dilocate(
1290 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
1291 offset_agbno = agbno - chunk_agbno; 1345 offset_agbno = agbno - chunk_agbno;
1292 } 1346 }
1347
1293 ASSERT(agbno >= chunk_agbno); 1348 ASSERT(agbno >= chunk_agbno);
1294 cluster_agbno = chunk_agbno + 1349 cluster_agbno = chunk_agbno +
1295 ((offset_agbno / blks_per_cluster) * blks_per_cluster); 1350 ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1296 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1351 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1297 XFS_INO_TO_OFFSET(mp, ino); 1352 XFS_INO_TO_OFFSET(mp, ino);
1298 *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno); 1353
1299 *off = offset; 1354 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
1300 *len = blks_per_cluster; 1355 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1356 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1357
1358 /*
1359 * If the inode number maps to a block outside the bounds
 1360 * of the file system then return an error rather than calling
 1361 * read_buf and panicking when we get an error from the
1362 * driver.
1363 */
1364 if ((imap->im_blkno + imap->im_len) >
1365 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1366 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1367 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
1368 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
1369 (unsigned long long) imap->im_blkno,
1370 (unsigned long long) imap->im_len,
1371 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1372 return XFS_ERROR(EINVAL);
1373 }
1374
1301 return 0; 1375 return 0;
1302error0:
1303 xfs_trans_brelse(tp, agbp);
1304 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1305 return error;
1306} 1376}
1307 1377
1308/* 1378/*
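The cluster arithmetic in xfs_imap() above reduces to shifts and integer division once the geometry is known. A worked standalone example with made-up but plausible numbers (4k blocks, 256-byte inodes, 8k inode clusters; the agbno/chunk values are invented for illustration):

#include <stdio.h>

int main(void)
{
	int sb_blocklog = 12, sb_inodelog = 8;
	int sb_inopblock = 1 << (sb_blocklog - sb_inodelog);	/* 16 inodes/block */
	int blks_per_cluster = 8192 >> sb_blocklog;		/* 2 blocks/cluster */

	int agbno = 103, chunk_agbno = 100;	/* as found via alignment/btree */
	int offset_agbno = agbno - chunk_agbno;			/* 3 */
	int cluster_agbno = chunk_agbno +
		(offset_agbno / blks_per_cluster) * blks_per_cluster;	/* 102 */
	int ino_in_blk = 5;			/* what XFS_INO_TO_OFFSET() yields */
	int offset = (agbno - cluster_agbno) * sb_inopblock + ino_in_blk;

	/* im_boffset: byte offset of the inode within the cluster buffer */
	printf("boffset = %d\n", offset << sb_inodelog);	/* 21 * 256 = 5376 */
	return 0;
}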
@@ -1370,70 +1440,95 @@ xfs_ialloc_log_agi(
1370 xfs_trans_log_buf(tp, bp, first, last); 1440 xfs_trans_log_buf(tp, bp, first, last);
1371} 1441}
1372 1442
1443#ifdef DEBUG
1444STATIC void
1445xfs_check_agi_unlinked(
1446 struct xfs_agi *agi)
1447{
1448 int i;
1449
1450 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1451 ASSERT(agi->agi_unlinked[i]);
1452}
1453#else
1454#define xfs_check_agi_unlinked(agi)
1455#endif
1456
1373/* 1457/*
1374 * Read in the allocation group header (inode allocation section) 1458 * Read in the allocation group header (inode allocation section)
1375 */ 1459 */
1376int 1460int
1377xfs_ialloc_read_agi( 1461xfs_read_agi(
1378 xfs_mount_t *mp, /* file system mount structure */ 1462 struct xfs_mount *mp, /* file system mount structure */
1379 xfs_trans_t *tp, /* transaction pointer */ 1463 struct xfs_trans *tp, /* transaction pointer */
1380 xfs_agnumber_t agno, /* allocation group number */ 1464 xfs_agnumber_t agno, /* allocation group number */
1381 xfs_buf_t **bpp) /* allocation group hdr buf */ 1465 struct xfs_buf **bpp) /* allocation group hdr buf */
1382{ 1466{
1383 xfs_agi_t *agi; /* allocation group header */ 1467 struct xfs_agi *agi; /* allocation group header */
1384 int agi_ok; /* agi is consistent */ 1468 int agi_ok; /* agi is consistent */
1385 xfs_buf_t *bp; /* allocation group hdr buf */ 1469 int error;
1386 xfs_perag_t *pag; /* per allocation group data */
1387 int error;
1388 1470
1389 ASSERT(agno != NULLAGNUMBER); 1471 ASSERT(agno != NULLAGNUMBER);
1390 error = xfs_trans_read_buf( 1472
1391 mp, tp, mp->m_ddev_targp, 1473 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
1392 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 1474 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1393 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1475 XFS_FSS_TO_BB(mp, 1), 0, bpp);
1394 if (error) 1476 if (error)
1395 return error; 1477 return error;
1396 ASSERT(bp && !XFS_BUF_GETERROR(bp)); 1478
1479 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp));
1480 agi = XFS_BUF_TO_AGI(*bpp);
1397 1481
1398 /* 1482 /*
1399 * Validate the magic number of the agi block. 1483 * Validate the magic number of the agi block.
1400 */ 1484 */
1401 agi = XFS_BUF_TO_AGI(bp); 1485 agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1402 agi_ok = 1486 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
1403 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1487 be32_to_cpu(agi->agi_seqno) == agno;
1404 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1405 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, 1488 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1406 XFS_RANDOM_IALLOC_READ_AGI))) { 1489 XFS_RANDOM_IALLOC_READ_AGI))) {
1407 XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW, 1490 XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
1408 mp, agi); 1491 mp, agi);
1409 xfs_trans_brelse(tp, bp); 1492 xfs_trans_brelse(tp, *bpp);
1410 return XFS_ERROR(EFSCORRUPTED); 1493 return XFS_ERROR(EFSCORRUPTED);
1411 } 1494 }
1495
1496 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
1497
1498 xfs_check_agi_unlinked(agi);
1499 return 0;
1500}
1501
1502int
1503xfs_ialloc_read_agi(
1504 struct xfs_mount *mp, /* file system mount structure */
1505 struct xfs_trans *tp, /* transaction pointer */
1506 xfs_agnumber_t agno, /* allocation group number */
1507 struct xfs_buf **bpp) /* allocation group hdr buf */
1508{
1509 struct xfs_agi *agi; /* allocation group header */
1510 struct xfs_perag *pag; /* per allocation group data */
1511 int error;
1512
1513 error = xfs_read_agi(mp, tp, agno, bpp);
1514 if (error)
1515 return error;
1516
1517 agi = XFS_BUF_TO_AGI(*bpp);
1412 pag = &mp->m_perag[agno]; 1518 pag = &mp->m_perag[agno];
1519
1413 if (!pag->pagi_init) { 1520 if (!pag->pagi_init) {
1414 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1521 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1415 pag->pagi_count = be32_to_cpu(agi->agi_count); 1522 pag->pagi_count = be32_to_cpu(agi->agi_count);
1416 pag->pagi_init = 1; 1523 pag->pagi_init = 1;
1417 } else {
1418 /*
1419 * It's possible for these to be out of sync if
1420 * we are in the middle of a forced shutdown.
1421 */
1422 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1423 XFS_FORCED_SHUTDOWN(mp));
1424 } 1524 }
1425 1525
1426#ifdef DEBUG 1526 /*
1427 { 1527 * It's possible for these to be out of sync if
1428 int i; 1528 * we are in the middle of a forced shutdown.
1429 1529 */
1430 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) 1530 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1431 ASSERT(agi->agi_unlinked[i]); 1531 XFS_FORCED_SHUTDOWN(mp));
1432 }
1433#endif
1434
1435 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
1436 *bpp = bp;
1437 return 0; 1532 return 0;
1438} 1533}
1439 1534
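The split above separates reading and validating the AGI from seeding the per-AG cache, so other callers can reuse the validated read alone. A mocked sketch of that layering (toy structures, not the on-disk format):

#include <stdint.h>
#include <stdio.h>

#define AGI_MAGIC 0x58414749u	/* 'XAGI' */

struct agi { uint32_t magic, seqno, freecount; };
struct pag { int init; uint32_t freecount; };

static int read_agi(const struct agi *agi, uint32_t agno)
{
	if (agi->magic != AGI_MAGIC || agi->seqno != agno)
		return -1;	/* the kernel returns EFSCORRUPTED here */
	return 0;
}

static int ialloc_read_agi(const struct agi *agi, uint32_t agno,
			   struct pag *pag)
{
	int error = read_agi(agi, agno);	/* validate first */

	if (error)
		return error;
	if (!pag->init) {			/* then seed the per-AG cache */
		pag->freecount = agi->freecount;
		pag->init = 1;
	}
	return 0;
}

int main(void)
{
	struct agi agi = { AGI_MAGIC, 0, 5 };
	struct pag pag = { 0, 0 };

	printf("%d %u\n", ialloc_read_agi(&agi, 0, &pag), pag.freecount);
	return 0;
}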
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 4e30ec1d13bc..50f558a4e0a8 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -20,6 +20,7 @@
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_dinode; 22struct xfs_dinode;
23struct xfs_imap;
23struct xfs_mount; 24struct xfs_mount;
24struct xfs_trans; 25struct xfs_trans;
25 26
@@ -56,7 +57,6 @@ static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
56} 57}
57 58
58 59
59#ifdef __KERNEL__
60/* 60/*
61 * Allocate an inode on disk. 61 * Allocate an inode on disk.
62 * Mode is used to tell whether the new inode will need space, and whether 62 * Mode is used to tell whether the new inode will need space, and whether
@@ -105,17 +105,14 @@ xfs_difree(
105 xfs_ino_t *first_ino); /* first inode in deleted cluster */ 105 xfs_ino_t *first_ino); /* first inode in deleted cluster */
106 106
107/* 107/*
108 * Return the location of the inode in bno/len/off, 108 * Return the location of the inode in imap, for mapping it into a buffer.
109 * for mapping it into a buffer.
110 */ 109 */
111int 110int
112xfs_dilocate( 111xfs_imap(
113 struct xfs_mount *mp, /* file system mount structure */ 112 struct xfs_mount *mp, /* file system mount structure */
114 struct xfs_trans *tp, /* transaction pointer */ 113 struct xfs_trans *tp, /* transaction pointer */
115 xfs_ino_t ino, /* inode to locate */ 114 xfs_ino_t ino, /* inode to locate */
116 xfs_fsblock_t *bno, /* output: block containing inode */ 115 struct xfs_imap *imap, /* location map structure */
117 int *len, /* output: num blocks in cluster*/
118 int *off, /* output: index in block of inode */
119 uint flags); /* flags for inode btree lookup */ 116 uint flags); /* flags for inode btree lookup */
120 117
121/* 118/*
@@ -154,6 +151,24 @@ xfs_ialloc_pagi_init(
154 struct xfs_trans *tp, /* transaction pointer */ 151 struct xfs_trans *tp, /* transaction pointer */
155 xfs_agnumber_t agno); /* allocation group number */ 152 xfs_agnumber_t agno); /* allocation group number */
156 153
157#endif /* __KERNEL__ */ 154/*
155 * Lookup the first record greater than or equal to ino
156 * in the btree given by cur.
157 */
158int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
159 __int32_t fcnt, xfs_inofree_t free, int *stat);
160
161/*
162 * Lookup the first record less than or equal to ino
163 * in the btree given by cur.
164 */
165int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
166 __int32_t fcnt, xfs_inofree_t free, int *stat);
167
168/*
169 * Get the data from the pointed-to record.
170 */
171extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
172 __int32_t *fcnt, xfs_inofree_t *free, int *stat);
158 173
159#endif /* __XFS_IALLOC_H__ */ 174#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 83502f3edef0..99f2408e8d8e 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -35,2044 +35,349 @@
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 40#include "xfs_alloc.h"
40#include "xfs_error.h" 41#include "xfs_error.h"
41 42
42STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
43STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
44STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
45STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
46STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
47STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
48STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
49STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
50 xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
51STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
52 43
53/* 44STATIC int
54 * Single level of the xfs_inobt_delete record deletion routine. 45xfs_inobt_get_minrecs(
55 * Delete record pointed to by cur/level. 46 struct xfs_btree_cur *cur,
56 * Remove the record from its block then rebalance the tree. 47 int level)
57 * Return 0 for error, 1 for done, 2 to go on to the next level.
58 */
59STATIC int /* error */
60xfs_inobt_delrec(
61 xfs_btree_cur_t *cur, /* btree cursor */
62 int level, /* level removing record from */
63 int *stat) /* fail/done/go-on */
64{ 48{
65 xfs_buf_t *agbp; /* buffer for a.g. inode header */ 49 return cur->bc_mp->m_inobt_mnr[level != 0];
66 xfs_mount_t *mp; /* mount structure */ 50}
67 xfs_agi_t *agi; /* allocation group inode header */
68 xfs_inobt_block_t *block; /* btree block record/key lives in */
69 xfs_agblock_t bno; /* btree block number */
70 xfs_buf_t *bp; /* buffer for block */
71 int error; /* error return value */
72 int i; /* loop index */
73 xfs_inobt_key_t key; /* kp points here if block is level 0 */
74 xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */
75 xfs_agblock_t lbno; /* left block's block number */
76 xfs_buf_t *lbp; /* left block's buffer pointer */
77 xfs_inobt_block_t *left; /* left btree block */
78 xfs_inobt_key_t *lkp; /* left block key pointer */
79 xfs_inobt_ptr_t *lpp; /* left block address pointer */
80 int lrecs = 0; /* number of records in left block */
81 xfs_inobt_rec_t *lrp; /* left block record pointer */
82 xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */
83 int ptr; /* index in btree block for this rec */
84 xfs_agblock_t rbno; /* right block's block number */
85 xfs_buf_t *rbp; /* right block's buffer pointer */
86 xfs_inobt_block_t *right; /* right btree block */
87 xfs_inobt_key_t *rkp; /* right block key pointer */
88 xfs_inobt_rec_t *rp; /* pointer to btree records */
89 xfs_inobt_ptr_t *rpp; /* right block address pointer */
90 int rrecs = 0; /* number of records in right block */
91 int numrecs;
92 xfs_inobt_rec_t *rrp; /* right block record pointer */
93 xfs_btree_cur_t *tcur; /* temporary btree cursor */
94
95 mp = cur->bc_mp;
96
97 /*
98 * Get the index of the entry being deleted, check for nothing there.
99 */
100 ptr = cur->bc_ptrs[level];
101 if (ptr == 0) {
102 *stat = 0;
103 return 0;
104 }
105
106 /*
107 * Get the buffer & block containing the record or key/ptr.
108 */
109 bp = cur->bc_bufs[level];
110 block = XFS_BUF_TO_INOBT_BLOCK(bp);
111#ifdef DEBUG
112 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
113 return error;
114#endif
115 /*
116 * Fail if we're off the end of the block.
117 */
118 51
119 numrecs = be16_to_cpu(block->bb_numrecs); 52STATIC struct xfs_btree_cur *
120 if (ptr > numrecs) { 53xfs_inobt_dup_cursor(
121 *stat = 0; 54 struct xfs_btree_cur *cur)
122 return 0; 55{
123 } 56 return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
124 /* 57 cur->bc_private.a.agbp, cur->bc_private.a.agno);
125 * It's a nonleaf. Excise the key and ptr being deleted, by 58}
126 * sliding the entries past them down one.
127 * Log the changed areas of the block.
128 */
129 if (level > 0) {
130 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
131 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
132#ifdef DEBUG
133 for (i = ptr; i < numrecs; i++) {
134 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level)))
135 return error;
136 }
137#endif
138 if (ptr < numrecs) {
139 memmove(&kp[ptr - 1], &kp[ptr],
140 (numrecs - ptr) * sizeof(*kp));
141 memmove(&pp[ptr - 1], &pp[ptr],
142 (numrecs - ptr) * sizeof(*kp));
143 xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
144 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
145 }
146 }
147 /*
148 * It's a leaf. Excise the record being deleted, by sliding the
149 * entries past it down one. Log the changed areas of the block.
150 */
151 else {
152 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
153 if (ptr < numrecs) {
154 memmove(&rp[ptr - 1], &rp[ptr],
155 (numrecs - ptr) * sizeof(*rp));
156 xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
157 }
158 /*
159 * If it's the first record in the block, we'll need a key
160 * structure to pass up to the next level (updkey).
161 */
162 if (ptr == 1) {
163 key.ir_startino = rp->ir_startino;
164 kp = &key;
165 }
166 }
167 /*
168 * Decrement and log the number of entries in the block.
169 */
170 numrecs--;
171 block->bb_numrecs = cpu_to_be16(numrecs);
172 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
173 /*
174 * Is this the root level? If so, we're almost done.
175 */
176 if (level == cur->bc_nlevels - 1) {
177 /*
178 * If this is the root level,
179 * and there's only one entry left,
180 * and it's NOT the leaf level,
181 * then we can get rid of this level.
182 */
183 if (numrecs == 1 && level > 0) {
184 agbp = cur->bc_private.a.agbp;
185 agi = XFS_BUF_TO_AGI(agbp);
186 /*
187 * pp is still set to the first pointer in the block.
188 * Make it the new root of the btree.
189 */
190 bno = be32_to_cpu(agi->agi_root);
191 agi->agi_root = *pp;
192 be32_add_cpu(&agi->agi_level, -1);
193 /*
194 * Free the block.
195 */
196 if ((error = xfs_free_extent(cur->bc_tp,
197 XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
198 return error;
199 xfs_trans_binval(cur->bc_tp, bp);
200 xfs_ialloc_log_agi(cur->bc_tp, agbp,
201 XFS_AGI_ROOT | XFS_AGI_LEVEL);
202 /*
203 * Update the cursor so there's one fewer level.
204 */
205 cur->bc_bufs[level] = NULL;
206 cur->bc_nlevels--;
207 } else if (level > 0 &&
208 (error = xfs_inobt_decrement(cur, level, &i)))
209 return error;
210 *stat = 1;
211 return 0;
212 }
213 /*
214 * If we deleted the leftmost entry in the block, update the
215 * key values above us in the tree.
216 */
217 if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
218 return error;
219 /*
220 * If the number of records remaining in the block is at least
221 * the minimum, we're done.
222 */
223 if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
224 if (level > 0 &&
225 (error = xfs_inobt_decrement(cur, level, &i)))
226 return error;
227 *stat = 1;
228 return 0;
229 }
230 /*
231 * Otherwise, we have to move some records around to keep the
232 * tree balanced. Look at the left and right sibling blocks to
233 * see if we can re-balance by moving only one record.
234 */
235 rbno = be32_to_cpu(block->bb_rightsib);
236 lbno = be32_to_cpu(block->bb_leftsib);
237 bno = NULLAGBLOCK;
238 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
239 /*
240 * Duplicate the cursor so our btree manipulations here won't
241 * disrupt the next level up.
242 */
243 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
244 return error;
245 /*
246 * If there's a right sibling, see if it's ok to shift an entry
247 * out of it.
248 */
249 if (rbno != NULLAGBLOCK) {
250 /*
251 * Move the temp cursor to the last entry in the next block.
252 * Actually any entry but the first would suffice.
253 */
254 i = xfs_btree_lastrec(tcur, level);
255 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
256 if ((error = xfs_inobt_increment(tcur, level, &i)))
257 goto error0;
258 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
259 i = xfs_btree_lastrec(tcur, level);
260 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
261 /*
262 * Grab a pointer to the block.
263 */
264 rbp = tcur->bc_bufs[level];
265 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
266#ifdef DEBUG
267 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
268 goto error0;
269#endif
270 /*
271 * Grab the current block number, for future use.
272 */
273 bno = be32_to_cpu(right->bb_leftsib);
274 /*
275 * If right block is full enough so that removing one entry
276 * won't make it too empty, and left-shifting an entry out
277 * of right to us works, we're done.
278 */
279 if (be16_to_cpu(right->bb_numrecs) - 1 >=
280 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
281 if ((error = xfs_inobt_lshift(tcur, level, &i)))
282 goto error0;
283 if (i) {
284 ASSERT(be16_to_cpu(block->bb_numrecs) >=
285 XFS_INOBT_BLOCK_MINRECS(level, cur));
286 xfs_btree_del_cursor(tcur,
287 XFS_BTREE_NOERROR);
288 if (level > 0 &&
289 (error = xfs_inobt_decrement(cur, level,
290 &i)))
291 return error;
292 *stat = 1;
293 return 0;
294 }
295 }
296 /*
297 * Otherwise, grab the number of records in right for
298 * future reference, and fix up the temp cursor to point
299 * to our block again (last record).
300 */
301 rrecs = be16_to_cpu(right->bb_numrecs);
302 if (lbno != NULLAGBLOCK) {
303 xfs_btree_firstrec(tcur, level);
304 if ((error = xfs_inobt_decrement(tcur, level, &i)))
305 goto error0;
306 }
307 }
308 /*
309 * If there's a left sibling, see if it's ok to shift an entry
310 * out of it.
311 */
312 if (lbno != NULLAGBLOCK) {
313 /*
314 * Move the temp cursor to the first entry in the
315 * previous block.
316 */
317 xfs_btree_firstrec(tcur, level);
318 if ((error = xfs_inobt_decrement(tcur, level, &i)))
319 goto error0;
320 xfs_btree_firstrec(tcur, level);
321 /*
322 * Grab a pointer to the block.
323 */
324 lbp = tcur->bc_bufs[level];
325 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
326#ifdef DEBUG
327 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
328 goto error0;
329#endif
330 /*
331 * Grab the current block number, for future use.
332 */
333 bno = be32_to_cpu(left->bb_rightsib);
334 /*
335 * If left block is full enough so that removing one entry
336 * won't make it too empty, and right-shifting an entry out
337 * of left to us works, we're done.
338 */
339 if (be16_to_cpu(left->bb_numrecs) - 1 >=
340 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
341 if ((error = xfs_inobt_rshift(tcur, level, &i)))
342 goto error0;
343 if (i) {
344 ASSERT(be16_to_cpu(block->bb_numrecs) >=
345 XFS_INOBT_BLOCK_MINRECS(level, cur));
346 xfs_btree_del_cursor(tcur,
347 XFS_BTREE_NOERROR);
348 if (level == 0)
349 cur->bc_ptrs[0]++;
350 *stat = 1;
351 return 0;
352 }
353 }
354 /*
355	 * Otherwise, grab the number of records in left for
356 * future reference.
357 */
358 lrecs = be16_to_cpu(left->bb_numrecs);
359 }
360 /*
361 * Delete the temp cursor, we're done with it.
362 */
363 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
364 /*
365 * If here, we need to do a join to keep the tree balanced.
366 */
367 ASSERT(bno != NULLAGBLOCK);
368 /*
369 * See if we can join with the left neighbor block.
370 */
371 if (lbno != NULLAGBLOCK &&
372 lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
373 /*
374 * Set "right" to be the starting block,
375 * "left" to be the left neighbor.
376 */
377 rbno = bno;
378 right = block;
379 rrecs = be16_to_cpu(right->bb_numrecs);
380 rbp = bp;
381 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
382 cur->bc_private.a.agno, lbno, 0, &lbp,
383 XFS_INO_BTREE_REF)))
384 return error;
385 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
386 lrecs = be16_to_cpu(left->bb_numrecs);
387 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
388 return error;
389 }
390 /*
391 * If that won't work, see if we can join with the right neighbor block.
392 */
393 else if (rbno != NULLAGBLOCK &&
394 rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
395 /*
396 * Set "left" to be the starting block,
397 * "right" to be the right neighbor.
398 */
399 lbno = bno;
400 left = block;
401 lrecs = be16_to_cpu(left->bb_numrecs);
402 lbp = bp;
403 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
404 cur->bc_private.a.agno, rbno, 0, &rbp,
405 XFS_INO_BTREE_REF)))
406 return error;
407 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
408 rrecs = be16_to_cpu(right->bb_numrecs);
409 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
410 return error;
411 }
412 /*
413 * Otherwise, we can't fix the imbalance.
414 * Just return. This is probably a logic error, but it's not fatal.
415 */
416 else {
417 if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
418 return error;
419 *stat = 1;
420 return 0;
421 }
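/*
 * Editor's sketch (not from the patch): the rebalancing decision made
 * above, reduced to arithmetic. minrecs/maxrecs stand in for
 * XFS_INOBT_BLOCK_MINRECS/MAXRECS; all names are hypothetical. Note
 * the preference order: borrowing one record is cheaper than merging,
 * and merging is only legal when the combined records fit one block.
 */
enum fixup { BORROW_RIGHT, BORROW_LEFT, MERGE_LEFT, MERGE_RIGHT, GIVE_UP };

static enum fixup
pick_fixup(int numrecs, int lrecs, int rrecs, int minrecs, int maxrecs,
	   int have_left, int have_right)
{
	if (have_right && rrecs - 1 >= minrecs)
		return BORROW_RIGHT;		/* lshift from the right sibling */
	if (have_left && lrecs - 1 >= minrecs)
		return BORROW_LEFT;		/* rshift from the left sibling */
	if (have_left && lrecs + numrecs <= maxrecs)
		return MERGE_LEFT;
	if (have_right && rrecs + numrecs <= maxrecs)
		return MERGE_RIGHT;
	return GIVE_UP;				/* "probably a logic error" */
}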
422 /*
423 * We're now going to join "left" and "right" by moving all the stuff
424 * in "right" to "left" and deleting "right".
425 */
426 if (level > 0) {
427 /*
428 * It's a non-leaf. Move keys and pointers.
429 */
430 lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
431 lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
432 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
433 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
434#ifdef DEBUG
435 for (i = 0; i < rrecs; i++) {
436 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
437 return error;
438 }
439#endif
440 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
441 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
442 xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
443 xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
444 } else {
445 /*
446 * It's a leaf. Move records.
447 */
448 lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
449 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
450 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
451 xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
452 }
453 /*
454 * If we joined with the left neighbor, set the buffer in the
455 * cursor to the left block, and fix up the index.
456 */
457 if (bp != lbp) {
458 xfs_btree_setbuf(cur, level, lbp);
459 cur->bc_ptrs[level] += lrecs;
460 }
461 /*
462 * If we joined with the right neighbor and there's a level above
463 * us, increment the cursor at that level.
464 */
465 else if (level + 1 < cur->bc_nlevels &&
466		 (error = xfs_inobt_increment(cur, level + 1, &i)))
467 return error;
468 /*
469 * Fix up the number of records in the surviving block.
470 */
471 lrecs += rrecs;
472 left->bb_numrecs = cpu_to_be16(lrecs);
473 /*
474 * Fix up the right block pointer in the surviving block, and log it.
475 */
476 left->bb_rightsib = right->bb_rightsib;
477 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
478 /*
479 * If there is a right sibling now, make it point to the
480 * remaining block.
481 */
482 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
483 xfs_inobt_block_t *rrblock;
484 xfs_buf_t *rrbp;
485		59
486		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,	60	STATIC void
487			cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,	61	xfs_inobt_set_root(
488			&rrbp, XFS_INO_BTREE_REF)))	62		struct xfs_btree_cur *cur,
489			return error;	63		union xfs_btree_ptr *nptr,
490		rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);	64		int inc)	/* level change */
491		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))	65	{
492			return error;	66		struct xfs_buf *agbp = cur->bc_private.a.agbp;
493		rrblock->bb_leftsib = cpu_to_be32(lbno);	67		struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
494 xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
495 }
496 /*
497	 * Free the deleted block.
498 */
499 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
500 cur->bc_private.a.agno, rbno), 1)))
501 return error;
502 xfs_trans_binval(cur->bc_tp, rbp);
503 /*
504 * Readjust the ptr at this level if it's not a leaf, since it's
505 * still pointing at the deletion point, which makes the cursor
506 * inconsistent. If this makes the ptr 0, the caller fixes it up.
507 * We can't use decrement because it would change the next level up.
508 */
509 if (level > 0)
510 cur->bc_ptrs[level]--;
511 /*
512 * Return value means the next level up has something to do.
513 */
514 *stat = 2;
515 return 0;
516 68
517	error0:	69		agi->agi_root = nptr->s;
518		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);	70		be32_add_cpu(&agi->agi_level, inc);
519		return error;	71		xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
520	}	72	}
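/*
 * Editor's note (not from the patch): the right-hand column above is
 * the first of the small per-btree callbacks that this commit trades
 * the open-coded routines on the left for. The shape of that refactor,
 * as a toy: one generic engine calling through a per-tree table of
 * function pointers. Every name below is hypothetical.
 */
struct toy_ops {
	void (*set_root)(void *tree, int newroot, int inc);
	int  (*alloc_block)(void *tree, int *bno);
};

static int				/* 0 on success, else error */
toy_grow_root(void *tree, const struct toy_ops *ops)
{
	int	bno, error;

	error = ops->alloc_block(tree, &bno);	/* tree-specific policy */
	if (error)
		return error;
	ops->set_root(tree, bno, 1);		/* tree grows one level */
	return 0;
}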
521 73
522	/*	74	STATIC int
523	 * Insert one record/level. Return information to the caller	75	xfs_inobt_alloc_block(
524	 * allowing the next level up to proceed if necessary.	76		struct xfs_btree_cur *cur,
525	 */	77		union xfs_btree_ptr *start,
526	STATIC int /* error */	78		union xfs_btree_ptr *new,
527	xfs_inobt_insrec(	79		int length,
528		xfs_btree_cur_t *cur,	/* btree cursor */	80		int *stat)
529		int level,	/* level to insert record at */
530		xfs_agblock_t *bnop,	/* i/o: block number inserted */
531		xfs_inobt_rec_t *recp,	/* i/o: record data inserted */
532		xfs_btree_cur_t **curp,	/* output: new cursor replacing cur */
533		int *stat)	/* success/failure */
534	{	81	{
535 xfs_inobt_block_t *block; /* btree block record/key lives in */ 82 xfs_alloc_arg_t args; /* block allocation args */
536 xfs_buf_t *bp; /* buffer for block */ 83 int error; /* error return value */
537 int error; /* error return value */ 84 xfs_agblock_t sbno = be32_to_cpu(start->s);
538 int i; /* loop index */
539 xfs_inobt_key_t key; /* key value being inserted */
540 xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */
541 xfs_agblock_t nbno; /* block number of allocated block */
542 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
543 xfs_inobt_key_t nkey; /* new key value, from split */
544 xfs_inobt_rec_t nrec; /* new record value, for caller */
545 int numrecs;
546 int optr; /* old ptr value */
547 xfs_inobt_ptr_t *pp; /* pointer to btree addresses */
548 int ptr; /* index in btree block for this rec */
549 xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */
550 85
551 /* 86 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
552 * GCC doesn't understand the (arguably complex) control flow in
553 * this function and complains about uninitialized structure fields
554 * without this.
555 */
556 memset(&nrec, 0, sizeof(nrec));
557 87
558 /* 88 memset(&args, 0, sizeof(args));
559 * If we made it to the root level, allocate a new root block 89 args.tp = cur->bc_tp;
560 * and we're done. 90 args.mp = cur->bc_mp;
561 */ 91 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
562 if (level >= cur->bc_nlevels) { 92 args.minlen = 1;
563 error = xfs_inobt_newroot(cur, &i); 93 args.maxlen = 1;
564 *bnop = NULLAGBLOCK; 94 args.prod = 1;
565 *stat = i; 95 args.type = XFS_ALLOCTYPE_NEAR_BNO;
96
97 error = xfs_alloc_vextent(&args);
98 if (error) {
99 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
566 return error; 100 return error;
567 } 101 }
568 /* 102 if (args.fsbno == NULLFSBLOCK) {
569 * Make a key out of the record data to be inserted, and save it. 103 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
570 */
571 key.ir_startino = recp->ir_startino;
572 optr = ptr = cur->bc_ptrs[level];
573 /*
574 * If we're off the left edge, return failure.
575 */
576 if (ptr == 0) {
577 *stat = 0; 104 *stat = 0;
578 return 0; 105 return 0;
579 } 106 }
580 /* 107 ASSERT(args.len == 1);
581 * Get pointers to the btree buffer and block. 108 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
582 */ 109
583 bp = cur->bc_bufs[level]; 110 new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
584 block = XFS_BUF_TO_INOBT_BLOCK(bp);
585 numrecs = be16_to_cpu(block->bb_numrecs);
586#ifdef DEBUG
587 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
588 return error;
589 /*
590 * Check that the new entry is being inserted in the right place.
591 */
592 if (ptr <= numrecs) {
593 if (level == 0) {
594 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
595 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
596 } else {
597 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
598 xfs_btree_check_key(cur->bc_btnum, &key, kp);
599 }
600 }
601#endif
602 nbno = NULLAGBLOCK;
603 ncur = NULL;
604 /*
605 * If the block is full, we can't insert the new entry until we
606 * make the block un-full.
607 */
608 if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
609 /*
610 * First, try shifting an entry to the right neighbor.
611 */
612 if ((error = xfs_inobt_rshift(cur, level, &i)))
613 return error;
614 if (i) {
615 /* nothing */
616 }
617 /*
618 * Next, try shifting an entry to the left neighbor.
619 */
620 else {
621 if ((error = xfs_inobt_lshift(cur, level, &i)))
622 return error;
623 if (i) {
624 optr = ptr = cur->bc_ptrs[level];
625 } else {
626 /*
627 * Next, try splitting the current block
628 * in half. If this works we have to
629 * re-set our variables because
630 * we could be in a different block now.
631 */
632 if ((error = xfs_inobt_split(cur, level, &nbno,
633 &nkey, &ncur, &i)))
634 return error;
635 if (i) {
636 bp = cur->bc_bufs[level];
637 block = XFS_BUF_TO_INOBT_BLOCK(bp);
638#ifdef DEBUG
639 if ((error = xfs_btree_check_sblock(cur,
640 block, level, bp)))
641 return error;
642#endif
643 ptr = cur->bc_ptrs[level];
644 nrec.ir_startino = nkey.ir_startino;
645 } else {
646 /*
647 * Otherwise the insert fails.
648 */
649 *stat = 0;
650 return 0;
651 }
652 }
653 }
654 }
655 /*
656 * At this point we know there's room for our new entry in the block
657 * we're pointing at.
658 */
659 numrecs = be16_to_cpu(block->bb_numrecs);
660 if (level > 0) {
661 /*
662 * It's a non-leaf entry. Make a hole for the new data
663 * in the key and ptr regions of the block.
664 */
665 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
666 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
667#ifdef DEBUG
668 for (i = numrecs; i >= ptr; i--) {
669 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
670 return error;
671 }
672#endif
673 memmove(&kp[ptr], &kp[ptr - 1],
674 (numrecs - ptr + 1) * sizeof(*kp));
675 memmove(&pp[ptr], &pp[ptr - 1],
676 (numrecs - ptr + 1) * sizeof(*pp));
677 /*
678 * Now stuff the new data in, bump numrecs and log the new data.
679 */
680#ifdef DEBUG
681 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
682 return error;
683#endif
684 kp[ptr - 1] = key;
685 pp[ptr - 1] = cpu_to_be32(*bnop);
686 numrecs++;
687 block->bb_numrecs = cpu_to_be16(numrecs);
688 xfs_inobt_log_keys(cur, bp, ptr, numrecs);
689 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
690 } else {
691 /*
692 * It's a leaf entry. Make a hole for the new record.
693 */
694 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
695 memmove(&rp[ptr], &rp[ptr - 1],
696 (numrecs - ptr + 1) * sizeof(*rp));
697 /*
698 * Now stuff the new record in, bump numrecs
699 * and log the new data.
700 */
701 rp[ptr - 1] = *recp;
702 numrecs++;
703 block->bb_numrecs = cpu_to_be16(numrecs);
704 xfs_inobt_log_recs(cur, bp, ptr, numrecs);
705 }
706 /*
707 * Log the new number of records in the btree header.
708 */
709 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
710#ifdef DEBUG
711 /*
712 * Check that the key/record is in the right place, now.
713 */
714 if (ptr < numrecs) {
715 if (level == 0)
716 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
717 rp + ptr);
718 else
719 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
720 kp + ptr);
721 }
722#endif
723 /*
724 * If we inserted at the start of a block, update the parents' keys.
725 */
726 if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
727 return error;
728 /*
729 * Return the new block number, if any.
730 * If there is one, give back a record value and a cursor too.
731 */
732 *bnop = nbno;
733 if (nbno != NULLAGBLOCK) {
734 *recp = nrec;
735 *curp = ncur;
736 }
737 *stat = 1; 111 *stat = 1;
738 return 0; 112 return 0;
739} 113}
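/*
 * Editor's sketch (not from the patch): the order of escape hatches
 * xfs_inobt_insrec() tries above when the target block is full. The
 * stubs stand in for xfs_inobt_rshift/lshift/split and are
 * hypothetical; each returns 1 once it has freed up a slot.
 */
static int try_rshift(void) { return 0; }	/* push last entry right */
static int try_lshift(void) { return 0; }	/* pull first entry left */
static int try_split(void)  { return 1; }	/* halve the block */

static int				/* 1 if a slot is now available */
make_room(void)
{
	/*
	 * Shifting moves one entry and dirties two existing blocks;
	 * splitting allocates a third block and is the only step that
	 * can still fail (no free space), so it comes last.
	 */
	if (try_rshift())
		return 1;
	if (try_lshift())
		return 1;
	return try_split();
}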
740 114
741/* 115STATIC int
742 * Log header fields from a btree block. 116xfs_inobt_free_block(
743 */ 117 struct xfs_btree_cur *cur,
744STATIC void 118 struct xfs_buf *bp)
745xfs_inobt_log_block(
746 xfs_trans_t *tp, /* transaction pointer */
747 xfs_buf_t *bp, /* buffer containing btree block */
748 int fields) /* mask of fields: XFS_BB_... */
749{ 119{
750 int first; /* first byte offset logged */ 120 xfs_fsblock_t fsbno;
751 int last; /* last byte offset logged */ 121 int error;
752 static const short offsets[] = { /* table of offsets */
753 offsetof(xfs_inobt_block_t, bb_magic),
754 offsetof(xfs_inobt_block_t, bb_level),
755 offsetof(xfs_inobt_block_t, bb_numrecs),
756 offsetof(xfs_inobt_block_t, bb_leftsib),
757 offsetof(xfs_inobt_block_t, bb_rightsib),
758 sizeof(xfs_inobt_block_t)
759 };
760 122
761 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); 123 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
762 xfs_trans_log_buf(tp, bp, first, last); 124 error = xfs_free_extent(cur->bc_tp, fsbno, 1);
125 if (error)
126 return error;
127
128 xfs_trans_binval(cur->bc_tp, bp);
129 return error;
763} 130}
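/*
 * Editor's sketch (not from the patch): how a field bitmask plus an
 * offsetof() table becomes the single [first, last] byte range that
 * xfs_trans_log_buf() wants, in the style of the xfs_btree_offsets()
 * call above. hdr_t and mask_to_range() are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct {
	uint32_t	magic;
	uint16_t	level;
	uint16_t	numrecs;
	uint32_t	leftsib;
	uint32_t	rightsib;
} hdr_t;

static const size_t offsets[] = {
	offsetof(hdr_t, magic),
	offsetof(hdr_t, level),
	offsetof(hdr_t, numrecs),
	offsetof(hdr_t, leftsib),
	offsetof(hdr_t, rightsib),
	sizeof(hdr_t)			/* sentinel: end of the last field */
};

static void
mask_to_range(int fields, size_t *first, size_t *last)
{
	int	i;

	*first = sizeof(hdr_t);
	*last = 0;
	for (i = 0; i < 5; i++) {	/* one bit per field, low to high */
		if (!(fields & (1 << i)))
			continue;
		if (offsets[i] < *first)
			*first = offsets[i];
		if (offsets[i + 1] - 1 > *last)
			*last = offsets[i + 1] - 1;
	}
}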
764 131
765/* 132STATIC int
766 * Log keys from a btree block (nonleaf). 133xfs_inobt_get_maxrecs(
767 */ 134 struct xfs_btree_cur *cur,
768STATIC void 135 int level)
769xfs_inobt_log_keys(
770 xfs_btree_cur_t *cur, /* btree cursor */
771 xfs_buf_t *bp, /* buffer containing btree block */
772 int kfirst, /* index of first key to log */
773 int klast) /* index of last key to log */
774{ 136{
775 xfs_inobt_block_t *block; /* btree block to log from */ 137 return cur->bc_mp->m_inobt_mxr[level != 0];
776 int first; /* first byte offset logged */
777 xfs_inobt_key_t *kp; /* key pointer in btree block */
778 int last; /* last byte offset logged */
779
780 block = XFS_BUF_TO_INOBT_BLOCK(bp);
781 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
782 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
783 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
784 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
785} 138}
786 139
787/*
788 * Log block pointer fields from a btree block (nonleaf).
789 */
790STATIC void 140STATIC void
791xfs_inobt_log_ptrs( 141xfs_inobt_init_key_from_rec(
792 xfs_btree_cur_t *cur, /* btree cursor */ 142 union xfs_btree_key *key,
793 xfs_buf_t *bp, /* buffer containing btree block */ 143 union xfs_btree_rec *rec)
794 int pfirst, /* index of first pointer to log */
795 int plast) /* index of last pointer to log */
796{ 144{
797 xfs_inobt_block_t *block; /* btree block to log from */ 145 key->inobt.ir_startino = rec->inobt.ir_startino;
798 int first; /* first byte offset logged */
799 int last; /* last byte offset logged */
800 xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */
801
802 block = XFS_BUF_TO_INOBT_BLOCK(bp);
803 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
804 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
805 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
806 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
807} 146}
808 147
809/*
810 * Log records from a btree block (leaf).
811 */
812STATIC void 148STATIC void
813xfs_inobt_log_recs( 149xfs_inobt_init_rec_from_key(
814 xfs_btree_cur_t *cur, /* btree cursor */ 150 union xfs_btree_key *key,
815 xfs_buf_t *bp, /* buffer containing btree block */ 151 union xfs_btree_rec *rec)
816 int rfirst, /* index of first record to log */
817 int rlast) /* index of last record to log */
818{ 152{
819 xfs_inobt_block_t *block; /* btree block to log from */ 153 rec->inobt.ir_startino = key->inobt.ir_startino;
820 int first; /* first byte offset logged */ 154}
821 int last; /* last byte offset logged */
822 xfs_inobt_rec_t *rp; /* record pointer for btree block */
823 155
824 block = XFS_BUF_TO_INOBT_BLOCK(bp); 156STATIC void
825 rp = XFS_INOBT_REC_ADDR(block, 1, cur); 157xfs_inobt_init_rec_from_cur(
826 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); 158 struct xfs_btree_cur *cur,
827 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); 159 union xfs_btree_rec *rec)
828 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 160{
161 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
162 rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
163 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
829} 164}
830 165
831/* 166/*
832 * Lookup the record. The cursor is made to point to it, based on dir.	167 * initial value of ptr for lookup
833 * Sets *stat to 0 if no such record is found, 1 for success.
834 */ 168 */
835STATIC int /* error */ 169STATIC void
836xfs_inobt_lookup( 170xfs_inobt_init_ptr_from_cur(
837 xfs_btree_cur_t *cur, /* btree cursor */ 171 struct xfs_btree_cur *cur,
838 xfs_lookup_t dir, /* <=, ==, or >= */ 172 union xfs_btree_ptr *ptr)
839 int *stat) /* success/failure */
840{ 173{
841 xfs_agblock_t agbno; /* a.g. relative btree block number */ 174 struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
842 xfs_agnumber_t agno; /* allocation group number */
843 xfs_inobt_block_t *block=NULL; /* current btree block */
844 __int64_t diff; /* difference for the current key */
845 int error; /* error return value */
846 int keyno=0; /* current key number */
847 int level; /* level in the btree */
848 xfs_mount_t *mp; /* file system mount point */
849
850 /*
851 * Get the allocation group header, and the root block number.
852 */
853 mp = cur->bc_mp;
854 {
855 xfs_agi_t *agi; /* a.g. inode header */
856
857 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
858 agno = be32_to_cpu(agi->agi_seqno);
859 agbno = be32_to_cpu(agi->agi_root);
860 }
861 /*
862 * Iterate over each level in the btree, starting at the root.
863 * For each level above the leaves, find the key we need, based
864 * on the lookup record, then follow the corresponding block
865 * pointer down to the next level.
866 */
867 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
868 xfs_buf_t *bp; /* buffer pointer for btree block */
869 xfs_daddr_t d; /* disk address of btree block */
870
871 /*
872 * Get the disk address we're looking for.
873 */
874 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
875 /*
876 * If the old buffer at this level is for a different block,
877 * throw it away, otherwise just use it.
878 */
879 bp = cur->bc_bufs[level];
880 if (bp && XFS_BUF_ADDR(bp) != d)
881 bp = NULL;
882 if (!bp) {
883 /*
884 * Need to get a new buffer. Read it, then
885 * set it in the cursor, releasing the old one.
886 */
887 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
888 agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
889 return error;
890 xfs_btree_setbuf(cur, level, bp);
891 /*
892 * Point to the btree block, now that we have the buffer
893 */
894 block = XFS_BUF_TO_INOBT_BLOCK(bp);
895 if ((error = xfs_btree_check_sblock(cur, block, level,
896 bp)))
897 return error;
898 } else
899 block = XFS_BUF_TO_INOBT_BLOCK(bp);
900 /*
901 * If we already had a key match at a higher level, we know
902 * we need to use the first entry in this block.
903 */
904 if (diff == 0)
905 keyno = 1;
906 /*
907 * Otherwise we need to search this block. Do a binary search.
908 */
909 else {
910 int high; /* high entry number */
911 xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
912 xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
913 int low; /* low entry number */
914 175
915 /* 176 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
916 * Get a pointer to keys or records.
917 */
918 if (level > 0)
919 kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
920 else
921 krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
922 /*
923 * Set low and high entry numbers, 1-based.
924 */
925 low = 1;
926 if (!(high = be16_to_cpu(block->bb_numrecs))) {
927 /*
928 * If the block is empty, the tree must
929 * be an empty leaf.
930 */
931 ASSERT(level == 0 && cur->bc_nlevels == 1);
932 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
933 *stat = 0;
934 return 0;
935 }
936 /*
937 * Binary search the block.
938 */
939 while (low <= high) {
940 xfs_agino_t startino; /* key value */
941
942 /*
943 * keyno is average of low and high.
944 */
945 keyno = (low + high) >> 1;
946 /*
947 * Get startino.
948 */
949 if (level > 0) {
950 xfs_inobt_key_t *kkp;
951
952 kkp = kkbase + keyno - 1;
953 startino = be32_to_cpu(kkp->ir_startino);
954 } else {
955 xfs_inobt_rec_t *krp;
956
957 krp = krbase + keyno - 1;
958 startino = be32_to_cpu(krp->ir_startino);
959 }
960 /*
961 * Compute difference to get next direction.
962 */
963 diff = (__int64_t)
964 startino - cur->bc_rec.i.ir_startino;
965 /*
966 * Less than, move right.
967 */
968 if (diff < 0)
969 low = keyno + 1;
970 /*
971 * Greater than, move left.
972 */
973 else if (diff > 0)
974 high = keyno - 1;
975 /*
976 * Equal, we're done.
977 */
978 else
979 break;
980 }
981 }
982 /*
983 * If there are more levels, set up for the next level
984 * by getting the block number and filling in the cursor.
985 */
986 if (level > 0) {
987 /*
988 * If we moved left, need the previous key number,
989 * unless there isn't one.
990 */
991 if (diff > 0 && --keyno < 1)
992 keyno = 1;
993 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur));
994#ifdef DEBUG
995 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
996 return error;
997#endif
998 cur->bc_ptrs[level] = keyno;
999 }
1000 }
1001 /*
1002 * Done with the search.
1003 * See if we need to adjust the results.
1004 */
1005 if (dir != XFS_LOOKUP_LE && diff < 0) {
1006 keyno++;
1007 /*
1008 * If ge search and we went off the end of the block, but it's
1009 * not the last block, we're in the wrong block.
1010 */
1011 if (dir == XFS_LOOKUP_GE &&
1012 keyno > be16_to_cpu(block->bb_numrecs) &&
1013 be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1014 int i;
1015 177
1016 cur->bc_ptrs[0] = keyno; 178 ptr->s = agi->agi_root;
1017 if ((error = xfs_inobt_increment(cur, 0, &i)))
1018 return error;
1019 ASSERT(i == 1);
1020 *stat = 1;
1021 return 0;
1022 }
1023 }
1024 else if (dir == XFS_LOOKUP_LE && diff > 0)
1025 keyno--;
1026 cur->bc_ptrs[0] = keyno;
1027 /*
1028 * Return if we succeeded or not.
1029 */
1030 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
1031 *stat = 0;
1032 else
1033 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1034 return 0;
1035} 179}
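/*
 * Editor's sketch (not from the patch): the 1-based binary search at
 * the heart of xfs_inobt_lookup() above, lifted out of the btree. The
 * sign of the final "diff" is what the <=/==/>= adjustment afterwards
 * keys off; bsearch_1() is a hypothetical name.
 */
#include <stdint.h>

static int				/* 1-based index probed last; 0 if n == 0 */
bsearch_1(const uint32_t *keys, int n, uint32_t want, int64_t *diff)
{
	int	low = 1, high = n, keyno = 0;

	*diff = 1;
	while (low <= high) {
		keyno = (low + high) >> 1;
		*diff = (int64_t)keys[keyno - 1] - (int64_t)want;
		if (*diff < 0)
			low = keyno + 1;	/* probe was too small */
		else if (*diff > 0)
			high = keyno - 1;	/* probe was too large */
		else
			break;			/* exact hit */
	}
	return keyno;
}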
1036 180
1037/* 181STATIC __int64_t
1038 * Move 1 record left from cur/level if possible. 182xfs_inobt_key_diff(
1039 * Update cur to reflect the new path. 183 struct xfs_btree_cur *cur,
1040 */ 184 union xfs_btree_key *key)
1041STATIC int /* error */
1042xfs_inobt_lshift(
1043 xfs_btree_cur_t *cur, /* btree cursor */
1044 int level, /* level to shift record on */
1045 int *stat) /* success/failure */
1046{ 185{
1047 int error; /* error return value */ 186 return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
1048#ifdef DEBUG 187 cur->bc_rec.i.ir_startino;
1049 int i; /* loop index */
1050#endif
1051 xfs_inobt_key_t key; /* key value for leaf level upward */
1052 xfs_buf_t *lbp; /* buffer for left neighbor block */
1053 xfs_inobt_block_t *left; /* left neighbor btree block */
1054 xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */
1055 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1056 xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */
1057 int nrec; /* new number of left block entries */
1058 xfs_buf_t *rbp; /* buffer for right (current) block */
1059 xfs_inobt_block_t *right; /* right (current) btree block */
1060 xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */
1061 xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */
1062 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1063
1064 /*
1065 * Set up variables for this block as "right".
1066 */
1067 rbp = cur->bc_bufs[level];
1068 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1069#ifdef DEBUG
1070 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1071 return error;
1072#endif
1073 /*
1074 * If we've got no left sibling then we can't shift an entry left.
1075 */
1076 if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
1077 *stat = 0;
1078 return 0;
1079 }
1080 /*
1081 * If the cursor entry is the one that would be moved, don't
1082 * do it... it's too complicated.
1083 */
1084 if (cur->bc_ptrs[level] <= 1) {
1085 *stat = 0;
1086 return 0;
1087 }
1088 /*
1089 * Set up the left neighbor as "left".
1090 */
1091 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1092 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1093 0, &lbp, XFS_INO_BTREE_REF)))
1094 return error;
1095 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1096 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1097 return error;
1098 /*
1099 * If it's full, it can't take another entry.
1100 */
1101 if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1102 *stat = 0;
1103 return 0;
1104 }
1105 nrec = be16_to_cpu(left->bb_numrecs) + 1;
1106 /*
1107 * If non-leaf, copy a key and a ptr to the left block.
1108 */
1109 if (level > 0) {
1110 lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
1111 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1112 *lkp = *rkp;
1113 xfs_inobt_log_keys(cur, lbp, nrec, nrec);
1114 lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
1115 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1116#ifdef DEBUG
1117 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1118 return error;
1119#endif
1120 *lpp = *rpp;
1121 xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
1122 }
1123 /*
1124 * If leaf, copy a record to the left block.
1125 */
1126 else {
1127 lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
1128 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1129 *lrp = *rrp;
1130 xfs_inobt_log_recs(cur, lbp, nrec, nrec);
1131 }
1132 /*
1133 * Bump and log left's numrecs, decrement and log right's numrecs.
1134 */
1135 be16_add_cpu(&left->bb_numrecs, 1);
1136 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1137#ifdef DEBUG
1138 if (level > 0)
1139 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1140 else
1141 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1142#endif
1143 be16_add_cpu(&right->bb_numrecs, -1);
1144 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1145 /*
1146 * Slide the contents of right down one entry.
1147 */
1148 if (level > 0) {
1149#ifdef DEBUG
1150 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1151 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
1152 level)))
1153 return error;
1154 }
1155#endif
1156 memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1157 memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1158 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1159 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1160 } else {
1161 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1162 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1163 key.ir_startino = rrp->ir_startino;
1164 rkp = &key;
1165 }
1166 /*
1167 * Update the parent key values of right.
1168 */
1169 if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
1170 return error;
1171 /*
1172 * Slide the cursor value left one.
1173 */
1174 cur->bc_ptrs[level]--;
1175 *stat = 1;
1176 return 0;
1177} 188}
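/*
 * Editor's sketch (not from the patch): the data movement of
 * xfs_inobt_lshift() above, leaf case, on bare arrays (0-based here).
 * The caller has already checked that the donor can spare a record;
 * the returned value is the donor's new first key, which must then be
 * re-published in the parent via updkey. Names are hypothetical.
 */
#include <string.h>

typedef struct { unsigned int startino; } rec_t;

static unsigned int
lshift_leaf(rec_t *left, int *lrecs, rec_t *right, int *rrecs)
{
	left[(*lrecs)++] = right[0];	/* borrow the donor's first record */
	(*rrecs)--;
	memmove(&right[0], &right[1], *rrecs * sizeof(*right));
	return right[0].startino;	/* donor's first key changed */
}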
1178 189
1179/* 190STATIC int
1180 * Allocate a new root block, fill it in. 191xfs_inobt_kill_root(
1181 */ 192 struct xfs_btree_cur *cur,
1182STATIC int /* error */ 193 struct xfs_buf *bp,
1183xfs_inobt_newroot( 194 int level,
1184 xfs_btree_cur_t *cur, /* btree cursor */ 195 union xfs_btree_ptr *newroot)
1185 int *stat) /* success/failure */
1186{ 196{
1187 xfs_agi_t *agi; /* a.g. inode header */ 197 int error;
1188 xfs_alloc_arg_t args; /* allocation argument structure */
1189 xfs_inobt_block_t *block; /* one half of the old root block */
1190 xfs_buf_t *bp; /* buffer containing block */
1191 int error; /* error return value */
1192 xfs_inobt_key_t *kp; /* btree key pointer */
1193 xfs_agblock_t lbno; /* left block number */
1194 xfs_buf_t *lbp; /* left buffer pointer */
1195 xfs_inobt_block_t *left; /* left btree block */
1196 xfs_buf_t *nbp; /* new (root) buffer */
1197 xfs_inobt_block_t *new; /* new (root) btree block */
1198 int nptr; /* new value for key index, 1 or 2 */
1199 xfs_inobt_ptr_t *pp; /* btree address pointer */
1200 xfs_agblock_t rbno; /* right block number */
1201 xfs_buf_t *rbp; /* right buffer pointer */
1202 xfs_inobt_block_t *right; /* right btree block */
1203 xfs_inobt_rec_t *rp; /* btree record pointer */
1204 198
1205 ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp)); 199 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
200 XFS_BTREE_STATS_INC(cur, killroot);
1206 201
1207 /* 202 /*
1208 * Get a block & a buffer. 203 * Update the root pointer, decreasing the level by 1 and then
204 * free the old root.
1209 */ 205 */
1210 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); 206 xfs_inobt_set_root(cur, newroot, -1);
1211 args.tp = cur->bc_tp; 207 error = xfs_inobt_free_block(cur, bp);
1212 args.mp = cur->bc_mp; 208 if (error) {
1213 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, 209 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1214 be32_to_cpu(agi->agi_root));
1215 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1216 args.isfl = args.userdata = args.minalignslop = 0;
1217 args.minlen = args.maxlen = args.prod = 1;
1218 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1219 if ((error = xfs_alloc_vextent(&args)))
1220 return error; 210 return error;
1221 /*
1222 * None available, we fail.
1223 */
1224 if (args.fsbno == NULLFSBLOCK) {
1225 *stat = 0;
1226 return 0;
1227 }
1228 ASSERT(args.len == 1);
1229 nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1230 new = XFS_BUF_TO_INOBT_BLOCK(nbp);
1231 /*
1232 * Set the root data in the a.g. inode structure.
1233 */
1234 agi->agi_root = cpu_to_be32(args.agbno);
1235 be32_add_cpu(&agi->agi_level, 1);
1236 xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
1237 XFS_AGI_ROOT | XFS_AGI_LEVEL);
1238 /*
1239 * At the previous root level there are now two blocks: the old
1240 * root, and the new block generated when it was split.
1241 * We don't know which one the cursor is pointing at, so we
1242 * set up variables "left" and "right" for each case.
1243 */
1244 bp = cur->bc_bufs[cur->bc_nlevels - 1];
1245 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1246#ifdef DEBUG
1247 if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
1248 return error;
1249#endif
1250 if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1251 /*
1252 * Our block is left, pick up the right block.
1253 */
1254 lbp = bp;
1255 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1256 left = block;
1257 rbno = be32_to_cpu(left->bb_rightsib);
1258 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1259 rbno, 0, &rbp, XFS_INO_BTREE_REF)))
1260 return error;
1261 bp = rbp;
1262 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1263 if ((error = xfs_btree_check_sblock(cur, right,
1264 cur->bc_nlevels - 1, rbp)))
1265 return error;
1266 nptr = 1;
1267 } else {
1268 /*
1269 * Our block is right, pick up the left block.
1270 */
1271 rbp = bp;
1272 rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
1273 right = block;
1274 lbno = be32_to_cpu(right->bb_leftsib);
1275 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1276 lbno, 0, &lbp, XFS_INO_BTREE_REF)))
1277 return error;
1278 bp = lbp;
1279 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1280 if ((error = xfs_btree_check_sblock(cur, left,
1281 cur->bc_nlevels - 1, lbp)))
1282 return error;
1283 nptr = 2;
1284 }
1285 /*
1286 * Fill in the new block's btree header and log it.
1287 */
1288 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1289 new->bb_level = cpu_to_be16(cur->bc_nlevels);
1290 new->bb_numrecs = cpu_to_be16(2);
1291 new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1292 new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1293 xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
1294 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1295 /*
1296 * Fill in the key data in the new root.
1297 */
1298 kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
1299 if (be16_to_cpu(left->bb_level) > 0) {
1300 kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
1301 kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
1302 } else {
1303 rp = XFS_INOBT_REC_ADDR(left, 1, cur);
1304 kp[0].ir_startino = rp->ir_startino;
1305 rp = XFS_INOBT_REC_ADDR(right, 1, cur);
1306 kp[1].ir_startino = rp->ir_startino;
1307 } 211 }
1308 xfs_inobt_log_keys(cur, nbp, 1, 2);
1309 /*
1310 * Fill in the pointer data in the new root.
1311 */
1312 pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
1313 pp[0] = cpu_to_be32(lbno);
1314 pp[1] = cpu_to_be32(rbno);
1315 xfs_inobt_log_ptrs(cur, nbp, 1, 2);
1316 /*
1317 * Fix up the cursor.
1318 */
1319 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1320 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1321 cur->bc_nlevels++;
1322 *stat = 1;
1323 return 0;
1324}
1325 212
1326/* 213 XFS_BTREE_STATS_INC(cur, free);
1327 * Move 1 record right from cur/level if possible.
1328 * Update cur to reflect the new path.
1329 */
1330STATIC int /* error */
1331xfs_inobt_rshift(
1332 xfs_btree_cur_t *cur, /* btree cursor */
1333 int level, /* level to shift record on */
1334 int *stat) /* success/failure */
1335{
1336 int error; /* error return value */
1337 int i; /* loop index */
1338 xfs_inobt_key_t key; /* key value for leaf level upward */
1339 xfs_buf_t *lbp; /* buffer for left (current) block */
1340 xfs_inobt_block_t *left; /* left (current) btree block */
1341 xfs_inobt_key_t *lkp; /* key pointer for left block */
1342 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1343 xfs_inobt_rec_t *lrp; /* record pointer for left block */
1344 xfs_buf_t *rbp; /* buffer for right neighbor block */
1345 xfs_inobt_block_t *right; /* right neighbor btree block */
1346 xfs_inobt_key_t *rkp; /* key pointer for right block */
1347 xfs_inobt_ptr_t *rpp; /* address pointer for right block */
1348 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1349 xfs_btree_cur_t *tcur; /* temporary cursor */
1350 214
1351 /* 215 cur->bc_bufs[level] = NULL;
1352 * Set up variables for this block as "left". 216 cur->bc_nlevels--;
1353 */ 217
1354 lbp = cur->bc_bufs[level]; 218 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1355 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1356#ifdef DEBUG
1357 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1358 return error;
1359#endif
1360 /*
1361 * If we've got no right sibling then we can't shift an entry right.
1362 */
1363 if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
1364 *stat = 0;
1365 return 0;
1366 }
1367 /*
1368 * If the cursor entry is the one that would be moved, don't
1369 * do it... it's too complicated.
1370 */
1371 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1372 *stat = 0;
1373 return 0;
1374 }
1375 /*
1376 * Set up the right neighbor as "right".
1377 */
1378 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1379 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1380 0, &rbp, XFS_INO_BTREE_REF)))
1381 return error;
1382 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1383 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1384 return error;
1385 /*
1386 * If it's full, it can't take another entry.
1387 */
1388 if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1389 *stat = 0;
1390 return 0;
1391 }
1392 /*
1393 * Make a hole at the start of the right neighbor block, then
1394 * copy the last left block entry to the hole.
1395 */
1396 if (level > 0) {
1397 lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1398 lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1399 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1400 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1401#ifdef DEBUG
1402 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1403 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
1404 return error;
1405 }
1406#endif
1407 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1408 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1409#ifdef DEBUG
1410 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
1411 return error;
1412#endif
1413 *rkp = *lkp;
1414 *rpp = *lpp;
1415 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1416 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1417 } else {
1418 lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1419 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1420 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1421 *rrp = *lrp;
1422 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1423 key.ir_startino = rrp->ir_startino;
1424 rkp = &key;
1425 }
1426 /*
1427 * Decrement and log left's numrecs, bump and log right's numrecs.
1428 */
1429 be16_add_cpu(&left->bb_numrecs, -1);
1430 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1431 be16_add_cpu(&right->bb_numrecs, 1);
1432#ifdef DEBUG
1433 if (level > 0)
1434 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1435 else
1436 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1437#endif
1438 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1439 /*
1440 * Using a temporary cursor, update the parent key values of the
1441 * block on the right.
1442 */
1443 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1444 return error;
1445 xfs_btree_lastrec(tcur, level);
1446 if ((error = xfs_inobt_increment(tcur, level, &i)) ||
1447 (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
1448 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1449 return error;
1450 }
1451 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1452 *stat = 1;
1453 return 0; 219 return 0;
1454} 220}
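/*
 * Editor's sketch (not from the patch): the mirror-image move of
 * xfs_inobt_rshift() above, leaf case (0-based arrays). Opening the
 * hole with memmove() before copying is what makes the in-place shift
 * safe; as with lshift, the receiver's new first key must be pushed
 * up to the parent afterwards. Names are hypothetical.
 */
#include <string.h>

typedef struct { unsigned int startino; } rec_t;

static unsigned int
rshift_leaf(rec_t *left, int *lrecs, rec_t *right, int *rrecs)
{
	memmove(&right[1], &right[0], *rrecs * sizeof(*right));
	right[0] = left[--(*lrecs)];	/* donate our last record */
	(*rrecs)++;
	return right[0].startino;	/* receiver's first key changed */
}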
1455 221
1456/*
1457 * Split cur/level block in half.
1458 * Return new block number and its first record (to be inserted into parent).
1459 */
1460STATIC int /* error */
1461xfs_inobt_split(
1462 xfs_btree_cur_t *cur, /* btree cursor */
1463 int level, /* level to split */
1464 xfs_agblock_t *bnop, /* output: block number allocated */
1465 xfs_inobt_key_t *keyp, /* output: first key of new block */
1466 xfs_btree_cur_t **curp, /* output: new cursor */
1467 int *stat) /* success/failure */
1468{
1469 xfs_alloc_arg_t args; /* allocation argument structure */
1470 int error; /* error return value */
1471 int i; /* loop index/record number */
1472 xfs_agblock_t lbno; /* left (current) block number */
1473 xfs_buf_t *lbp; /* buffer for left block */
1474 xfs_inobt_block_t *left; /* left (current) btree block */
1475 xfs_inobt_key_t *lkp; /* left btree key pointer */
1476 xfs_inobt_ptr_t *lpp; /* left btree address pointer */
1477 xfs_inobt_rec_t *lrp; /* left btree record pointer */
1478 xfs_buf_t *rbp; /* buffer for right block */
1479 xfs_inobt_block_t *right; /* right (new) btree block */
1480 xfs_inobt_key_t *rkp; /* right btree key pointer */
1481 xfs_inobt_ptr_t *rpp; /* right btree address pointer */
1482 xfs_inobt_rec_t *rrp; /* right btree record pointer */
1483
1484 /*
1485 * Set up left block (current one).
1486 */
1487 lbp = cur->bc_bufs[level];
1488 args.tp = cur->bc_tp;
1489 args.mp = cur->bc_mp;
1490 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1491 /*
1492 * Allocate the new block.
1493 * If we can't do it, we're toast. Give up.
1494 */
1495 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
1496 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1497 args.isfl = args.userdata = args.minalignslop = 0;
1498 args.minlen = args.maxlen = args.prod = 1;
1499 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1500 if ((error = xfs_alloc_vextent(&args)))
1501 return error;
1502 if (args.fsbno == NULLFSBLOCK) {
1503 *stat = 0;
1504 return 0;
1505 }
1506 ASSERT(args.len == 1);
1507 rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1508 /*
1509 * Set up the new block as "right".
1510 */
1511 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1512 /*
1513 * "Left" is the current (according to the cursor) block.
1514 */
1515 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1516#ifdef DEBUG 222#ifdef DEBUG
1517 if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) 223STATIC int
1518 return error; 224xfs_inobt_keys_inorder(
1519#endif 225 struct xfs_btree_cur *cur,
1520 /* 226 union xfs_btree_key *k1,
1521 * Fill in the btree header for the new block. 227 union xfs_btree_key *k2)
1522 */ 228{
1523 right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 229 return be32_to_cpu(k1->inobt.ir_startino) <
1524 right->bb_level = left->bb_level; 230 be32_to_cpu(k2->inobt.ir_startino);
1525 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1526 /*
1527 * Make sure that if there's an odd number of entries now, that
1528 * each new block will have the same number of entries.
1529 */
1530 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1531 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1532 be16_add_cpu(&right->bb_numrecs, 1);
1533 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1534 /*
1535 * For non-leaf blocks, copy keys and addresses over to the new block.
1536 */
1537 if (level > 0) {
1538 lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
1539 lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
1540 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1541 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1542#ifdef DEBUG
1543 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1544 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
1545 return error;
1546 }
1547#endif
1548 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1549 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1550 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1551 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1552 *keyp = *rkp;
1553 }
1554 /*
1555 * For leaf blocks, copy records over to the new block.
1556 */
1557 else {
1558 lrp = XFS_INOBT_REC_ADDR(left, i, cur);
1559 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1560 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1561 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1562 keyp->ir_startino = rrp->ir_startino;
1563 }
1564 /*
1565 * Find the left block number by looking in the buffer.
1566 * Adjust numrecs, sibling pointers.
1567 */
1568 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1569 right->bb_rightsib = left->bb_rightsib;
1570 left->bb_rightsib = cpu_to_be32(args.agbno);
1571 right->bb_leftsib = cpu_to_be32(lbno);
1572 xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
1573 xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1574 /*
1575 * If there's a block to the new block's right, make that block
1576 * point back to right instead of to left.
1577 */
1578 if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
1579 xfs_inobt_block_t *rrblock; /* rr btree block */
1580 xfs_buf_t *rrbp; /* buffer for rrblock */
1581
1582 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1583 be32_to_cpu(right->bb_rightsib), 0, &rrbp,
1584 XFS_INO_BTREE_REF)))
1585 return error;
1586 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
1587 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1588 return error;
1589 rrblock->bb_leftsib = cpu_to_be32(args.agbno);
1590 xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
1591 }
1592 /*
1593 * If the cursor is really in the right block, move it there.
1594 * If it's just pointing past the last entry in left, then we'll
1595 * insert there, so don't change anything in that case.
1596 */
1597 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1598 xfs_btree_setbuf(cur, level, rbp);
1599 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1600 }
1601 /*
1602 * If there are more levels, we'll need another cursor which refers
1603 * the right block, no matter where this cursor was.
1604 */
1605 if (level + 1 < cur->bc_nlevels) {
1606 if ((error = xfs_btree_dup_cursor(cur, curp)))
1607 return error;
1608 (*curp)->bc_ptrs[level + 1]++;
1609 }
1610 *bnop = args.agbno;
1611 *stat = 1;
1612 return 0;
1613} 231}
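/*
 * Editor's sketch (not from the patch): the split-point arithmetic of
 * xfs_inobt_split() above. split_count() is a hypothetical name; it
 * returns how many records move to the new right block.
 */
static int
split_count(int lrecs, int ins_ptr)
{
	int	rrecs = lrecs / 2;

	/*
	 * With an odd count, move one extra record right when the
	 * pending insert will land in the left half, so both halves
	 * are equally full once the insert actually happens.
	 */
	if ((lrecs & 1) && ins_ptr <= rrecs + 1)
		rrecs++;
	return rrecs;
}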
1614 232
1615/* 233STATIC int
1616 * Update keys at all levels from here to the root along the cursor's path. 234xfs_inobt_recs_inorder(
1617 */ 235 struct xfs_btree_cur *cur,
1618STATIC int /* error */ 236 union xfs_btree_rec *r1,
1619xfs_inobt_updkey( 237 union xfs_btree_rec *r2)
1620 xfs_btree_cur_t *cur, /* btree cursor */
1621 xfs_inobt_key_t *keyp, /* new key value to update to */
1622 int level) /* starting level for update */
1623{ 238{
1624 int ptr; /* index of key in block */ 239 return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
1625 240 be32_to_cpu(r2->inobt.ir_startino);
1626 /*
1627 * Go up the tree from this level toward the root.
1628 * At each level, update the key value to the value input.
1629 * Stop when we reach a level where the cursor isn't pointing
1630 * at the first entry in the block.
1631 */
1632 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1633 xfs_buf_t *bp; /* buffer for block */
1634 xfs_inobt_block_t *block; /* btree block */
1635#ifdef DEBUG
1636 int error; /* error return value */
1637#endif
1638 xfs_inobt_key_t *kp; /* ptr to btree block keys */
1639
1640 bp = cur->bc_bufs[level];
1641 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1642#ifdef DEBUG
1643 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1644 return error;
1645#endif
1646 ptr = cur->bc_ptrs[level];
1647 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
1648 *kp = *keyp;
1649 xfs_inobt_log_keys(cur, bp, ptr, ptr);
1650 }
1651 return 0;
1652} 241}
242#endif /* DEBUG */
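/*
 * Editor's sketch (not from the patch): the propagation loop of
 * xfs_inobt_updkey() above, on flat arrays. A new leftmost key is
 * written into the parent entry at every level, but only keeps
 * climbing while the cursor sits on entry 1 (1-based) of each block;
 * all names are hypothetical.
 */
static void
updkey(unsigned int keys[][64], const int ptrs[], int nlevels,
       int level, unsigned int newkey)
{
	int	ptr;

	for (ptr = 1; ptr == 1 && level < nlevels; level++) {
		ptr = ptrs[level];		/* cursor slot at this level */
		keys[level][ptr - 1] = newkey;
	}
}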
1653 243
1654/* 244#ifdef XFS_BTREE_TRACE
1655 * Externally visible routines. 245ktrace_t *xfs_inobt_trace_buf;
1656 */
1657 246
1658/* 247STATIC void
1659 * Decrement cursor by one record at the level. 248xfs_inobt_trace_enter(
1660 * For nonzero levels the leaf-ward information is untouched. 249 struct xfs_btree_cur *cur,
1661 */ 250 const char *func,
1662int /* error */ 251 char *s,
1663xfs_inobt_decrement( 252 int type,
1664 xfs_btree_cur_t *cur, /* btree cursor */ 253 int line,
1665 int level, /* level in btree, 0 is leaf */ 254 __psunsigned_t a0,
1666 int *stat) /* success/failure */ 255 __psunsigned_t a1,
256 __psunsigned_t a2,
257 __psunsigned_t a3,
258 __psunsigned_t a4,
259 __psunsigned_t a5,
260 __psunsigned_t a6,
261 __psunsigned_t a7,
262 __psunsigned_t a8,
263 __psunsigned_t a9,
264 __psunsigned_t a10)
1667{ 265{
1668 xfs_inobt_block_t *block; /* btree block */ 266 ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type,
1669 int error; 267 (void *)func, (void *)s, NULL, (void *)cur,
1670 int lev; /* btree level */ 268 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
1671 269 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
1672 ASSERT(level < cur->bc_nlevels); 270 (void *)a8, (void *)a9, (void *)a10);
1673 /*
1674 * Read-ahead to the left at this level.
1675 */
1676 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1677 /*
1678 * Decrement the ptr at this level. If we're still in the block
1679 * then we're done.
1680 */
1681 if (--cur->bc_ptrs[level] > 0) {
1682 *stat = 1;
1683 return 0;
1684 }
1685 /*
1686 * Get a pointer to the btree block.
1687 */
1688 block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
1689#ifdef DEBUG
1690 if ((error = xfs_btree_check_sblock(cur, block, level,
1691 cur->bc_bufs[level])))
1692 return error;
1693#endif
1694 /*
1695 * If we just went off the left edge of the tree, return failure.
1696 */
1697 if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
1698 *stat = 0;
1699 return 0;
1700 }
1701 /*
1702 * March up the tree decrementing pointers.
1703 * Stop when we don't go off the left edge of a block.
1704 */
1705 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1706 if (--cur->bc_ptrs[lev] > 0)
1707 break;
1708 /*
1709 * Read-ahead the left block, we're going to read it
1710 * in the next loop.
1711 */
1712 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1713 }
1714 /*
1715 * If we went off the root then we are seriously confused.
1716 */
1717 ASSERT(lev < cur->bc_nlevels);
1718 /*
1719 * Now walk back down the tree, fixing up the cursor's buffer
1720 * pointers and key numbers.
1721 */
1722 for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1723 xfs_agblock_t agbno; /* block number of btree block */
1724 xfs_buf_t *bp; /* buffer containing btree block */
1725
1726 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1727 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1728 cur->bc_private.a.agno, agbno, 0, &bp,
1729 XFS_INO_BTREE_REF)))
1730 return error;
1731 lev--;
1732 xfs_btree_setbuf(cur, lev, bp);
1733 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1734 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1735 return error;
1736 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1737 }
1738 *stat = 1;
1739 return 0;
1740} 271}
1741 272
1742/* 273STATIC void
1743 * Delete the record pointed to by cur. 274xfs_inobt_trace_cursor(
1744 * On return, the cursor refers to the place where the record was	275	struct xfs_btree_cur *cur,
1745 * (i.e., where it could be re-inserted).	276	__uint32_t *s0,
1746 */ 277 __uint64_t *l0,
1747int /* error */ 278 __uint64_t *l1)
1748xfs_inobt_delete(
1749 xfs_btree_cur_t *cur, /* btree cursor */
1750 int *stat) /* success/failure */
1751{ 279{
1752 int error; 280 *s0 = cur->bc_private.a.agno;
1753 int i; /* result code */ 281 *l0 = cur->bc_rec.i.ir_startino;
1754 int level; /* btree level */ 282 *l1 = cur->bc_rec.i.ir_free;
1755
1756 /*
1757 * Go up the tree, starting at leaf level.
1758 * If 2 is returned then a join was done; go to the next level.
1759 * Otherwise we are done.
1760 */
1761 for (level = 0, i = 2; i == 2; level++) {
1762 if ((error = xfs_inobt_delrec(cur, level, &i)))
1763 return error;
1764 }
1765 if (i == 0) {
1766 for (level = 1; level < cur->bc_nlevels; level++) {
1767 if (cur->bc_ptrs[level] == 0) {
1768 if ((error = xfs_inobt_decrement(cur, level, &i)))
1769 return error;
1770 break;
1771 }
1772 }
1773 }
1774 *stat = i;
1775 return 0;
1776} 283}
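/*
 * Editor's note (not from the patch): the three-valued *stat protocol
 * that the xfs_inobt_delete() loop above is built around. The kernel
 * code uses bare 0/1/2; the enum names are hypothetical.
 */
enum delrec_stat {
	DELREC_NOTFOUND	= 0,	/* no record at the cursor; report failure */
	DELREC_DONE	= 1,	/* record removed, tree still balanced; stop */
	DELREC_JOINED	= 2,	/* two blocks merged, so the key/ptr entry
				 * for the freed block must be deleted from
				 * the level above: run delrec there too */
};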
1777 284
1778 285STATIC void
1779/* 286xfs_inobt_trace_key(
1780 * Get the data from the pointed-to record. 287 struct xfs_btree_cur *cur,
1781 */ 288 union xfs_btree_key *key,
1782int /* error */ 289 __uint64_t *l0,
1783xfs_inobt_get_rec( 290 __uint64_t *l1)
1784 xfs_btree_cur_t *cur, /* btree cursor */
1785 xfs_agino_t *ino, /* output: starting inode of chunk */
1786 __int32_t *fcnt, /* output: number of free inodes */
1787 xfs_inofree_t *free, /* output: free inode mask */
1788 int *stat) /* output: success/failure */
1789{ 291{
1790 xfs_inobt_block_t *block; /* btree block */ 292 *l0 = be32_to_cpu(key->inobt.ir_startino);
1791 xfs_buf_t *bp; /* buffer containing btree block */ 293 *l1 = 0;
1792#ifdef DEBUG
1793 int error; /* error return value */
1794#endif
1795 int ptr; /* record number */
1796 xfs_inobt_rec_t *rec; /* record data */
1797
1798 bp = cur->bc_bufs[0];
1799 ptr = cur->bc_ptrs[0];
1800 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1801#ifdef DEBUG
1802 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
1803 return error;
1804#endif
1805 /*
1806 * Off the right end or left end, return failure.
1807 */
1808 if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
1809 *stat = 0;
1810 return 0;
1811 }
1812 /*
1813 * Point to the record and extract its data.
1814 */
1815 rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
1816 *ino = be32_to_cpu(rec->ir_startino);
1817 *fcnt = be32_to_cpu(rec->ir_freecount);
1818 *free = be64_to_cpu(rec->ir_free);
1819 *stat = 1;
1820 return 0;
1821} 294}
1822 295
1823/* 296STATIC void
1824 * Increment cursor by one record at the level. 297xfs_inobt_trace_record(
1825 * For nonzero levels the leaf-ward information is untouched. 298 struct xfs_btree_cur *cur,
1826 */ 299 union xfs_btree_rec *rec,
1827int /* error */ 300 __uint64_t *l0,
1828xfs_inobt_increment( 301 __uint64_t *l1,
1829 xfs_btree_cur_t *cur, /* btree cursor */ 302 __uint64_t *l2)
1830 int level, /* level in btree, 0 is leaf */
1831 int *stat) /* success/failure */
1832{ 303{
1833 xfs_inobt_block_t *block; /* btree block */ 304 *l0 = be32_to_cpu(rec->inobt.ir_startino);
1834 xfs_buf_t *bp; /* buffer containing btree block */ 305 *l1 = be32_to_cpu(rec->inobt.ir_freecount);
1835 int error; /* error return value */ 306 *l2 = be64_to_cpu(rec->inobt.ir_free);
1836 int lev; /* btree level */ 307}
308#endif /* XFS_BTREE_TRACE */
309
310static const struct xfs_btree_ops xfs_inobt_ops = {
311 .rec_len = sizeof(xfs_inobt_rec_t),
312 .key_len = sizeof(xfs_inobt_key_t),
313
314 .dup_cursor = xfs_inobt_dup_cursor,
315 .set_root = xfs_inobt_set_root,
316 .kill_root = xfs_inobt_kill_root,
317 .alloc_block = xfs_inobt_alloc_block,
318 .free_block = xfs_inobt_free_block,
319 .get_minrecs = xfs_inobt_get_minrecs,
320 .get_maxrecs = xfs_inobt_get_maxrecs,
321 .init_key_from_rec = xfs_inobt_init_key_from_rec,
322 .init_rec_from_key = xfs_inobt_init_rec_from_key,
323 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
324 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
325 .key_diff = xfs_inobt_key_diff,
1837 326
1838 ASSERT(level < cur->bc_nlevels);
1839 /*
1840 * Read-ahead to the right at this level.
1841 */
1842 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1843 /*
1844 * Get a pointer to the btree block.
1845 */
1846 bp = cur->bc_bufs[level];
1847 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1848#ifdef DEBUG
1849 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1850 return error;
1851#endif
1852 /*
1853 * Increment the ptr at this level. If we're still in the block
1854 * then we're done.
1855 */
1856 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
1857 *stat = 1;
1858 return 0;
1859 }
1860 /*
1861 * If we just went off the right edge of the tree, return failure.
1862 */
1863 if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
1864 *stat = 0;
1865 return 0;
1866 }
1867 /*
1868 * March up the tree incrementing pointers.
1869 * Stop when we don't go off the right edge of a block.
1870 */
1871 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1872 bp = cur->bc_bufs[lev];
1873 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1874#ifdef DEBUG 327#ifdef DEBUG
1875 if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) 328 .keys_inorder = xfs_inobt_keys_inorder,
1876 return error; 329 .recs_inorder = xfs_inobt_recs_inorder,
1877#endif 330#endif
1878 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
1879 break;
1880 /*
1881 * Read-ahead the right block, we're going to read it
1882 * in the next loop.
1883 */
1884 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1885 }
1886 /*
1887 * If we went off the root then we are seriously confused.
1888 */
1889 ASSERT(lev < cur->bc_nlevels);
1890 /*
1891 * Now walk back down the tree, fixing up the cursor's buffer
1892 * pointers and key numbers.
1893 */
1894 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
1895 lev > level; ) {
1896 xfs_agblock_t agbno; /* block number of btree block */
1897 331
1898 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 332#ifdef XFS_BTREE_TRACE
1899 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 333 .trace_enter = xfs_inobt_trace_enter,
1900 cur->bc_private.a.agno, agbno, 0, &bp, 334 .trace_cursor = xfs_inobt_trace_cursor,
1901 XFS_INO_BTREE_REF))) 335 .trace_key = xfs_inobt_trace_key,
1902 return error; 336 .trace_record = xfs_inobt_trace_record,
1903 lev--; 337#endif
1904 xfs_btree_setbuf(cur, lev, bp); 338};
1905 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1906 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1907 return error;
1908 cur->bc_ptrs[lev] = 1;
1909 }
1910 *stat = 1;
1911 return 0;
1912}
1913 339
1914/* 340/*
1915 * Insert the current record at the point referenced by cur. 341 * Allocate a new inode btree cursor.
1916 * The cursor may be inconsistent on return if splits have been done.
1917 */ 342 */
1918int /* error */ 343struct xfs_btree_cur * /* new inode btree cursor */
1919xfs_inobt_insert( 344xfs_inobt_init_cursor(
1920 xfs_btree_cur_t *cur, /* btree cursor */ 345 struct xfs_mount *mp, /* file system mount point */
1921 int *stat) /* success/failure */ 346 struct xfs_trans *tp, /* transaction pointer */
347 struct xfs_buf *agbp, /* buffer for agi structure */
348 xfs_agnumber_t agno) /* allocation group number */
1922{ 349{
1923 int error; /* error return value */ 350 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
1924 int i; /* result value, 0 for failure */ 351 struct xfs_btree_cur *cur;
1925 int level; /* current level number in btree */
1926 xfs_agblock_t nbno; /* new block number (split result) */
1927 xfs_btree_cur_t *ncur; /* new cursor (split result) */
1928 xfs_inobt_rec_t nrec; /* record being inserted this level */
1929 xfs_btree_cur_t *pcur; /* previous level's cursor */
1930 352
1931 level = 0; 353 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
1932 nbno = NULLAGBLOCK;
1933 nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
1934 nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
1935 nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
1936 ncur = NULL;
1937 pcur = cur;
1938 /*
1939 * Loop going up the tree, starting at the leaf level.
1940 * Stop when we don't get a split block, that must mean that
1941 * the insert is finished with this level.
1942 */
1943 do {
1944 /*
1945 * Insert nrec/nbno into this level of the tree.
1946 * Note if we fail, nbno will be null.
1947 */
1948 if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
1949 &i))) {
1950 if (pcur != cur)
1951 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
1952 return error;
1953 }
1954 /*
1955 * See if the cursor we just used is trash.
1956 * Can't trash the caller's cursor, but otherwise we should
1957 * if ncur is a new cursor or we're about to be done.
1958 */
1959 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
1960 cur->bc_nlevels = pcur->bc_nlevels;
1961 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
1962 }
1963 /*
1964 * If we got a new cursor, switch to it.
1965 */
1966 if (ncur) {
1967 pcur = ncur;
1968 ncur = NULL;
1969 }
1970 } while (nbno != NULLAGBLOCK);
1971 *stat = i;
1972 return 0;
1973}
1974 354
1975/* 355 cur->bc_tp = tp;
1976 * Lookup the record equal to ino in the btree given by cur. 356 cur->bc_mp = mp;
1977 */ 357 cur->bc_nlevels = be32_to_cpu(agi->agi_level);
1978int /* error */ 358 cur->bc_btnum = XFS_BTNUM_INO;
1979xfs_inobt_lookup_eq( 359 cur->bc_blocklog = mp->m_sb.sb_blocklog;
1980 xfs_btree_cur_t *cur, /* btree cursor */
1981 xfs_agino_t ino, /* starting inode of chunk */
1982 __int32_t fcnt, /* free inode count */
1983 xfs_inofree_t free, /* free inode mask */
1984 int *stat) /* success/failure */
1985{
1986 cur->bc_rec.i.ir_startino = ino;
1987 cur->bc_rec.i.ir_freecount = fcnt;
1988 cur->bc_rec.i.ir_free = free;
1989 return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
1990}
1991 360
1992/* 361 cur->bc_ops = &xfs_inobt_ops;
1993 * Lookup the first record greater than or equal to ino
1994 * in the btree given by cur.
1995 */
1996int /* error */
1997xfs_inobt_lookup_ge(
1998 xfs_btree_cur_t *cur, /* btree cursor */
1999 xfs_agino_t ino, /* starting inode of chunk */
2000 __int32_t fcnt, /* free inode count */
2001 xfs_inofree_t free, /* free inode mask */
2002 int *stat) /* success/failure */
2003{
2004 cur->bc_rec.i.ir_startino = ino;
2005 cur->bc_rec.i.ir_freecount = fcnt;
2006 cur->bc_rec.i.ir_free = free;
2007 return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
2008}
2009 362
2010/* 363 cur->bc_private.a.agbp = agbp;
2011 * Lookup the first record less than or equal to ino 364 cur->bc_private.a.agno = agno;
2012 * in the btree given by cur. 365
2013 */ 366 return cur;
2014int /* error */
2015xfs_inobt_lookup_le(
2016 xfs_btree_cur_t *cur, /* btree cursor */
2017 xfs_agino_t ino, /* starting inode of chunk */
2018 __int32_t fcnt, /* free inode count */
2019 xfs_inofree_t free, /* free inode mask */
2020 int *stat) /* success/failure */
2021{
2022 cur->bc_rec.i.ir_startino = ino;
2023 cur->bc_rec.i.ir_freecount = fcnt;
2024 cur->bc_rec.i.ir_free = free;
2025 return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
2026} 367}
2027 368
2028/* 369/*
2029 * Update the record referred to by cur, to the value given 370 * Calculate number of records in an inobt btree block.
2030 * by [ino, fcnt, free].
2031 * This either works (return 0) or gets an EFSCORRUPTED error.
2032 */ 371 */
2033int /* error */ 372int
2034xfs_inobt_update( 373xfs_inobt_maxrecs(
2035 xfs_btree_cur_t *cur, /* btree cursor */ 374 struct xfs_mount *mp,
2036 xfs_agino_t ino, /* starting inode of chunk */ 375 int blocklen,
2037 __int32_t fcnt, /* free inode count */ 376 int leaf)
2038 xfs_inofree_t free) /* free inode mask */
2039{ 377{
2040 xfs_inobt_block_t *block; /* btree block to update */ 378 blocklen -= XFS_INOBT_BLOCK_LEN(mp);
2041 xfs_buf_t *bp; /* buffer containing btree block */
2042 int error; /* error return value */
2043 int ptr; /* current record number (updating) */
2044 xfs_inobt_rec_t *rp; /* pointer to updated record */
2045 379
2046 /* 380 if (leaf)
2047 * Pick up the current block. 381 return blocklen / sizeof(xfs_inobt_rec_t);
2048 */ 382 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
2049 bp = cur->bc_bufs[0];
2050 block = XFS_BUF_TO_INOBT_BLOCK(bp);
2051#ifdef DEBUG
2052 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
2053 return error;
2054#endif
2055 /*
2056 * Get the address of the rec to be updated.
2057 */
2058 ptr = cur->bc_ptrs[0];
2059 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
2060 /*
2061 * Fill in the new contents and log them.
2062 */
2063 rp->ir_startino = cpu_to_be32(ino);
2064 rp->ir_freecount = cpu_to_be32(fcnt);
2065 rp->ir_free = cpu_to_be64(free);
2066 xfs_inobt_log_recs(cur, bp, ptr, ptr);
2067 /*
2068 * Updating first record in leaf. Pass new key value up to our parent.
2069 */
2070 if (ptr == 1) {
2071 xfs_inobt_key_t key; /* key containing [ino] */
2072
2073 key.ir_startino = cpu_to_be32(ino);
2074 if ((error = xfs_inobt_updkey(cur, &key, 1)))
2075 return error;
2076 }
2077 return 0;
2078} 383}
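The removed per-tree functions above (get_rec, increment, insert, the lookups, update) are not lost: they are subsumed by the generic fs/xfs/xfs_btree.c code, which dispatches through the xfs_inobt_ops table. What follows is a minimal userspace sketch of that ops-table pattern only; all names, sizes, and the single callback are illustrative, not the kernel API.

#include <stdio.h>

struct cur;				/* generic cursor, forward declared */

struct btree_ops {
	int	rec_len;		/* size of one on-disk record */
	int	(*get_maxrecs)(struct cur *cur, int level);
};

struct cur {
	const struct btree_ops	*ops;	/* per-tree-flavour vtable */
	int			blocklen;
};

/* inode-btree flavour: fixed-size records fill the block */
static int inobt_get_maxrecs(struct cur *cur, int level)
{
	return cur->blocklen / cur->ops->rec_len;
}

static const struct btree_ops inobt_ops = {
	.rec_len	= 16,
	.get_maxrecs	= inobt_get_maxrecs,
};

int main(void)
{
	struct cur cur = { .ops = &inobt_ops, .blocklen = 4096 };

	/* generic code only ever calls through cur->ops */
	printf("max records per block: %d\n",
	       cur.ops->get_maxrecs(&cur, 0));
	return 0;
}

The design choice mirrored here is that one generic walk/insert/delete implementation can serve every btree flavour once record length and the few format-specific operations are parameterised.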
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 8efc4a5b8b92..37e5dd01a577 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -24,7 +24,6 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
@@ -70,11 +69,6 @@ typedef struct xfs_inobt_key {
 /* btree pointer type */
 typedef __be32 xfs_inobt_ptr_t;
 
-/* btree block header type */
-typedef struct xfs_btree_sblock xfs_inobt_block_t;
-
-#define XFS_BUF_TO_INOBT_BLOCK(bp)	((xfs_inobt_block_t *)XFS_BUF_PTR(bp))
-
 /*
  * Bit manipulations for ir_free.
  */
@@ -85,14 +79,6 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
 #define	XFS_INOBT_CLR_FREE(rp,i)	((rp)->ir_free &= ~XFS_INOBT_MASK(i))
 
 /*
- * Real block structures have a size equal to the disk block size.
- */
-#define	XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0])
-#define	XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0])
-#define	XFS_INOBT_IS_LAST_REC(cur)	\
-	((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs))
-
-/*
  * Maximum number of inode btree levels.
  */
 #define	XFS_IN_MAXLEVELS(mp)		((mp)->m_in_maxlevels)
@@ -104,75 +90,38 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
 #define	XFS_PREALLOC_BLOCKS(mp)	((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
 
 /*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_INOBT_REC_ADDR(bb,i,cur) \
-	(XFS_BTREE_REC_ADDR(xfs_inobt, bb, i))
-
-#define XFS_INOBT_KEY_ADDR(bb,i,cur) \
-	(XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i))
-
-#define XFS_INOBT_PTR_ADDR(bb,i,cur) \
-	(XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \
-				i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
-			     __int32_t *fcnt, xfs_inofree_t *free, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
  */
-extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt, xfs_inofree_t free, int *stat);
+#define XFS_INOBT_BLOCK_LEN(mp)	XFS_BTREE_SBLOCK_LEN
 
 /*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
  */
-extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt, xfs_inofree_t free);
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+	((xfs_inobt_rec_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+	((xfs_inobt_key_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+	((xfs_inobt_ptr_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 (maxrecs) * sizeof(xfs_inobt_key_t) + \
+		 ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
 #endif	/* __XFS_IALLOC_BTREE_H__ */
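The new address macros above locate entries by plain byte arithmetic from the end of the block header: leaf blocks pack records back to back, and node blocks pack maxrecs keys before the pointer array. A small compilable sketch of the same arithmetic follows; the header length and record layout here are invented for illustration, not taken from the on-disk format.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_HDR_LEN	16	/* stand-in for XFS_INOBT_BLOCK_LEN(mp) */

struct rec { uint32_t startino; uint32_t freecount; uint64_t free; };
struct key { uint32_t startino; };
typedef uint32_t ptr_t;

/* 1-based index, as in the kernel macros */
static struct rec *rec_addr(void *block, int index)
{
	return (struct rec *)((char *)block + BLOCK_HDR_LEN +
			      (index - 1) * sizeof(struct rec));
}

/* pointers live after the full key array in a node block */
static ptr_t *ptr_addr(void *block, int index, int maxrecs)
{
	return (ptr_t *)((char *)block + BLOCK_HDR_LEN +
			 maxrecs * sizeof(struct key) +
			 (index - 1) * sizeof(ptr_t));
}

int main(void)
{
	char block[4096] = { 0 };

	printf("rec 1 at offset %td, rec 2 at %td, ptr 1 at %td\n",
	       (char *)rec_addr(block, 1) - block,
	       (char *)rec_addr(block, 2) - block,
	       (char *)ptr_addr(block, 1, 100) - block);
	return 0;
}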
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e229e9e001c2..e2fb6210d4c5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -38,281 +38,283 @@
 #include "xfs_ialloc.h"
 #include "xfs_quota.h"
 #include "xfs_utils.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_btree_trace.h"
+#include "xfs_dir2_trace.h"
+
 
 /*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, attach it to the provided
- * vnode.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and attach the provided vnode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *		 for xfs_ilock() for a list of valid values.
- * bno -- the block number starting the buffer containing the inode,
- *	  if known (as by bulkstat), else 0.
+ * Allocate and initialise an xfs_inode.
  */
-STATIC int
-xfs_iget_core(
-	struct inode	*inode,
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	uint		flags,
-	uint		lock_flags,
-	xfs_inode_t	**ipp,
-	xfs_daddr_t	bno)
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
 {
-	struct inode	*old_inode;
-	xfs_inode_t	*ip;
-	xfs_inode_t	*iq;
-	int		error;
-	unsigned long	first_index, mask;
-	xfs_perag_t	*pag;
-	xfs_agino_t	agino;
+	struct xfs_inode	*ip;
 
-	/* the radix tree exists only in inode capable AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
-		return EINVAL;
+	/*
+	 * if this didn't occur in transactions, we could use
+	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+	 * code up to do this anyway.
+	 */
+	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	if (!ip)
+		return NULL;
 
-	/* get the perag structure and ensure that it's inode capable */
-	pag = xfs_get_perag(mp, ino);
-	if (!pag->pagi_inodeok)
-		return EINVAL;
-	ASSERT(pag->pag_ici_init);
-	agino = XFS_INO_TO_AGINO(mp, ino);
+	ASSERT(atomic_read(&ip->i_iocount) == 0);
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(completion_done(&ip->i_flush));
 
-again:
-	read_lock(&pag->pag_ici_lock);
-	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+	/*
+	 * initialise the VFS inode here to get failures
+	 * out of the way early.
+	 */
+	if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
+
+	/* initialise the xfs inode */
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+	ip->i_afp = NULL;
+	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+	ip->i_flags = 0;
+	ip->i_update_core = 0;
+	ip->i_update_size = 0;
+	ip->i_delayed_blks = 0;
+	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+	ip->i_size = 0;
+	ip->i_new_size = 0;
+
+	/*
+	 * Initialize inode's trace buffers.
+	 */
+#ifdef	XFS_INODE_TRACE
+	ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_BMAP_TRACE
+	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_BTREE_TRACE
+	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_RW_TRACE
+	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
+#endif
+
+	return ip;
+}
+
+/*
+ * Check the validity of the inode we just found it the cache
+ */
+static int
+xfs_iget_cache_hit(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	int			flags,
+	int			lock_flags) __releases(pag->pag_ici_lock)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error = EAGAIN;
+
+	/*
+	 * If INEW is set this inode is being set up
+	 * If IRECLAIM is set this inode is being torn down
+	 * Pause and try again.
+	 */
+	if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+		XFS_STATS_INC(xs_ig_frecycle);
+		goto out_error;
+	}
+
+	/* If IRECLAIMABLE is set, we've torn down the vfs inode part */
+	if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
 
-	if (ip != NULL) {
 		/*
-		 * If INEW is set this inode is being set up
-		 * we need to pause and try again.
+		 * If lookup is racing with unlink, then we should return an
+		 * error immediately so we don't remove it from the reclaim
+		 * list and potentially leak the inode.
 		 */
-		if (xfs_iflags_test(ip, XFS_INEW)) {
-			read_unlock(&pag->pag_ici_lock);
-			delay(1);
-			XFS_STATS_INC(xs_ig_frecycle);
-
-			goto again;
+		if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+			error = ENOENT;
+			goto out_error;
 		}
 
-		old_inode = ip->i_vnode;
-		if (old_inode == NULL) {
-			/*
-			 * If IRECLAIM is set this inode is
-			 * on its way out of the system,
-			 * we need to pause and try again.
-			 */
-			if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-				read_unlock(&pag->pag_ici_lock);
-				delay(1);
-				XFS_STATS_INC(xs_ig_frecycle);
-
-				goto again;
-			}
-			ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-
-			/*
-			 * If lookup is racing with unlink, then we
-			 * should return an error immediately so we
-			 * don't remove it from the reclaim list and
-			 * potentially leak the inode.
-			 */
-			if ((ip->i_d.di_mode == 0) &&
-			    !(flags & XFS_IGET_CREATE)) {
-				read_unlock(&pag->pag_ici_lock);
-				xfs_put_perag(mp, pag);
-				return ENOENT;
-			}
-
-			xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
-
-			XFS_STATS_INC(xs_ig_found);
-			xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-			read_unlock(&pag->pag_ici_lock);
-
-			XFS_MOUNT_ILOCK(mp);
-			list_del_init(&ip->i_reclaim);
-			XFS_MOUNT_IUNLOCK(mp);
-
-			goto finish_inode;
-
-		} else if (inode != old_inode) {
-			/* The inode is being torn down, pause and
-			 * try again.
-			 */
-			if (old_inode->i_state & (I_FREEING | I_CLEAR)) {
-				read_unlock(&pag->pag_ici_lock);
-				delay(1);
-				XFS_STATS_INC(xs_ig_frecycle);
-
-				goto again;
-			}
-/* Chances are the other vnode (the one in the inode) is being torn
-* down right now, and we landed on top of it. Question is, what do
-* we do? Unhook the old inode and hook up the new one?
-*/
-			cmn_err(CE_PANIC,
-		"xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
-					old_inode, inode);
-		}
+		xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
 		/*
-		 * Inode cache hit
+		 * We need to re-initialise the VFS inode as it has been
+		 * 'freed' by the VFS. Do this here so we can deal with
+		 * errors cleanly, then tag it so it can be set up correctly
+		 * later.
 		 */
-		read_unlock(&pag->pag_ici_lock);
-		XFS_STATS_INC(xs_ig_found);
-
-finish_inode:
-		if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-			xfs_put_perag(mp, pag);
-			return ENOENT;
+		if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+			error = ENOMEM;
+			goto out_error;
 		}
 
-		if (lock_flags != 0)
-			xfs_ilock(ip, lock_flags);
+		/*
+		 * We must set the XFS_INEW flag before clearing the
+		 * XFS_IRECLAIMABLE flag so that if a racing lookup does
+		 * not find the XFS_IRECLAIMABLE above but has the igrab()
+		 * below succeed we can safely check XFS_INEW to detect
+		 * that this inode is still being initialised.
+		 */
+		xfs_iflags_set(ip, XFS_INEW);
+		xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+
+		/* clear the radix tree reclaim flag as well. */
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+	} else if (!igrab(VFS_I(ip))) {
+		/* If the VFS inode is being torn down, pause and try again. */
+		XFS_STATS_INC(xs_ig_frecycle);
+		goto out_error;
+	} else if (xfs_iflags_test(ip, XFS_INEW)) {
+		/*
+		 * We are racing with another cache hit that is
+		 * currently recycling this inode out of the XFS_IRECLAIMABLE
+		 * state. Wait for the initialisation to complete before
+		 * continuing.
+		 */
+		wait_on_inode(VFS_I(ip));
+	}
 
-		xfs_iflags_clear(ip, XFS_ISTALE);
-		xfs_itrace_exit_tag(ip, "xfs_iget.found");
-		goto return_ip;
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		iput(VFS_I(ip));
+		goto out_error;
 	}
 
-	/*
-	 * Inode cache miss
-	 */
+	/* We've got a live one. */
 	read_unlock(&pag->pag_ici_lock);
-	XFS_STATS_INC(xs_ig_missed);
 
-	/*
-	 * Read the disk inode attributes into a new inode structure and get
-	 * a new vnode for it. This should also initialize i_ino and i_mount.
-	 */
-	error = xfs_iread(mp, tp, ino, &ip, bno,
-			  (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
-	if (error) {
-		xfs_put_perag(mp, pag);
-		return error;
-	}
+	if (lock_flags != 0)
+		xfs_ilock(ip, lock_flags);
 
-	xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
+	xfs_iflags_clear(ip, XFS_ISTALE);
+	xfs_itrace_exit_tag(ip, "xfs_iget.found");
+	XFS_STATS_INC(xs_ig_found);
+	return 0;
+
+out_error:
+	read_unlock(&pag->pag_ici_lock);
+	return error;
+}
 
 
-	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-		     "xfsino", ip->i_ino);
-	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-	init_waitqueue_head(&ip->i_ipin_wait);
-	atomic_set(&ip->i_pincount, 0);
+static int
+xfs_iget_cache_miss(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	struct xfs_inode	**ipp,
+	xfs_daddr_t		bno,
+	int			flags,
+	int			lock_flags) __releases(pag->pag_ici_lock)
+{
+	struct xfs_inode	*ip;
+	int			error;
+	unsigned long		first_index, mask;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
 
-	/*
-	 * Because we want to use a counting completion, complete
-	 * the flush completion once to allow a single access to
-	 * the flush completion without blocking.
-	 */
-	init_completion(&ip->i_flush);
-	complete(&ip->i_flush);
+	ip = xfs_inode_alloc(mp, ino);
+	if (!ip)
+		return ENOMEM;
 
-	if (lock_flags)
-		xfs_ilock(ip, lock_flags);
+	error = xfs_iread(mp, tp, ip, bno, flags);
+	if (error)
+		goto out_destroy;
+
+	xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
 	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-		xfs_idestroy(ip);
-		xfs_put_perag(mp, pag);
-		return ENOENT;
+		error = ENOENT;
+		goto out_destroy;
 	}
 
+	if (lock_flags)
+		xfs_ilock(ip, lock_flags);
+
 	/*
 	 * Preload the radix tree so we can insert safely under the
-	 * write spinlock.
+	 * write spinlock. Note that we cannot sleep inside the preload
+	 * region.
 	 */
 	if (radix_tree_preload(GFP_KERNEL)) {
-		xfs_idestroy(ip);
-		delay(1);
-		goto again;
+		error = EAGAIN;
+		goto out_unlock;
 	}
+
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = agino & mask;
 	write_lock(&pag->pag_ici_lock);
-	/*
-	 * insert the new inode
-	 */
+
+	/* insert the new inode */
 	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 	if (unlikely(error)) {
-		BUG_ON(error != -EEXIST);
-		write_unlock(&pag->pag_ici_lock);
-		radix_tree_preload_end();
-		xfs_idestroy(ip);
+		WARN_ON(error != -EEXIST);
 		XFS_STATS_INC(xs_ig_dup);
-		goto again;
+		error = EAGAIN;
+		goto out_preload_end;
 	}
 
-	/*
-	 * These values _must_ be set before releasing the radix tree lock!
-	 */
+	/* These values _must_ be set before releasing the radix tree lock! */
 	ip->i_udquot = ip->i_gdquot = NULL;
 	xfs_iflags_set(ip, XFS_INEW);
 
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
-
-	/*
-	 * Link ip to its mount and thread it on the mount's inode list.
-	 */
-	XFS_MOUNT_ILOCK(mp);
-	if ((iq = mp->m_inodes)) {
-		ASSERT(iq->i_mprev->i_mnext == iq);
-		ip->i_mprev = iq->i_mprev;
-		iq->i_mprev->i_mnext = ip;
-		iq->i_mprev = ip;
-		ip->i_mnext = iq;
-	} else {
-		ip->i_mnext = ip;
-		ip->i_mprev = ip;
-	}
-	mp->m_inodes = ip;
-
-	XFS_MOUNT_IUNLOCK(mp);
-	xfs_put_perag(mp, pag);
-
- return_ip:
-	ASSERT(ip->i_df.if_ext_max ==
-	       XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
-
-	xfs_iflags_set(ip, XFS_IMODIFIED);
 	*ipp = ip;
-
-	/*
-	 * Set up the Linux with the Linux inode.
-	 */
-	ip->i_vnode = inode;
-	inode->i_private = ip;
-
-	/*
-	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
-	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
-	 */
-	if (ip->i_d.di_mode != 0)
-		xfs_setup_inode(ip);
 	return 0;
-}
 
+out_preload_end:
+	write_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+out_unlock:
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+out_destroy:
+	xfs_destroy_inode(ip);
+	return error;
+}
 
 /*
- * The 'normal' internal xfs_iget, if needed it will
- * 'allocate', or 'get', the vnode.
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ * bno -- the block number starting the buffer containing the inode,
+ *	  if known (as by bulkstat), else 0.
  */
 int
 xfs_iget(
@@ -324,61 +326,64 @@ xfs_iget(
 	xfs_inode_t	**ipp,
 	xfs_daddr_t	bno)
 {
-	struct inode	*inode;
 	xfs_inode_t	*ip;
 	int		error;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
 
-	XFS_STATS_INC(xs_ig_attempts);
+	/* the radix tree exists only in inode capable AGs */
+	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+		return EINVAL;
 
-retry:
-	inode = iget_locked(mp->m_super, ino);
-	if (!inode)
-		/* If we got no inode we are out of memory */
-		return ENOMEM;
+	/* get the perag structure and ensure that it's inode capable */
+	pag = xfs_get_perag(mp, ino);
+	if (!pag->pagi_inodeok)
+		return EINVAL;
+	ASSERT(pag->pag_ici_init);
+	agino = XFS_INO_TO_AGINO(mp, ino);
 
-	if (inode->i_state & I_NEW) {
-		XFS_STATS_INC(vn_active);
-		XFS_STATS_INC(vn_alloc);
-
-		error = xfs_iget_core(inode, mp, tp, ino, flags,
-				lock_flags, ipp, bno);
-		if (error) {
-			make_bad_inode(inode);
-			if (inode->i_state & I_NEW)
-				unlock_new_inode(inode);
-			iput(inode);
-		}
-		return error;
+again:
+	error = 0;
+	read_lock(&pag->pag_ici_lock);
+	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+	if (ip) {
+		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	} else {
+		read_unlock(&pag->pag_ici_lock);
+		XFS_STATS_INC(xs_ig_missed);
+
+		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
+							flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
 	}
+	xfs_put_perag(mp, pag);
 
+	*ipp = ip;
+
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
 	/*
-	 * If the inode is not fully constructed due to
-	 * filehandle mismatches wait for the inode to go
-	 * away and try again.
-	 *
-	 * iget_locked will call __wait_on_freeing_inode
-	 * to wait for the inode to go away.
+	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
+	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
 	 */
-	if (is_bad_inode(inode)) {
-		iput(inode);
-		delay(1);
-		goto retry;
-	}
+	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+		xfs_setup_inode(ip);
+	return 0;
 
-	ip = XFS_I(inode);
-	if (!ip) {
-		iput(inode);
+out_error_or_again:
+	if (error == EAGAIN) {
 		delay(1);
-		goto retry;
+		goto again;
 	}
-
-	if (lock_flags != 0)
-		xfs_ilock(ip, lock_flags);
-	XFS_STATS_INC(xs_ig_found);
-	*ipp = ip;
-	return 0;
+	xfs_put_perag(mp, pag);
+	return error;
 }
 
+
 /*
  * Look for the inode corresponding to the given ino in the hash table.
  * If it is there and its i_transp pointer matches tp, return it.
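The rewritten xfs_iget above funnels all transient races (inode mid-setup or mid-teardown, a lost radix-tree insert race) into a single EAGAIN return that the caller turns into delay-and-retry, instead of each path open-coding its own delay-and-goto. A hedged userspace sketch of that convention follows, with stand-in functions rather than the kernel helpers.

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static int attempts;

/* pretend the first two lookups race with a state transition */
static int cache_lookup(int ino)
{
	if (attempts++ < 2)
		return EAGAIN;
	return 0;
}

static int iget(int ino)
{
	int error;
again:
	error = cache_lookup(ino);
	if (error == EAGAIN) {
		usleep(1000);	/* the kernel code uses delay(1) */
		goto again;
	}
	return error;
}

int main(void)
{
	printf("iget -> %d after %d attempts\n", iget(42), attempts);
	return 0;
}

The point of the refactor is that only one function owns the back-off policy; the helpers merely report "try again".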
@@ -444,99 +449,109 @@ xfs_iput_new(
 	IRELE(ip);
 }
 
-
 /*
- * This routine embodies the part of the reclaim code that pulls
- * the inode from the inode hash table and the mount structure's
- * inode list.
- * This should only be called from xfs_reclaim().
+ * This is called free all the memory associated with an inode.
+ * It must free the inode itself and any buffers allocated for
+ * if_extents/if_data and if_broot. It must also free the lock
+ * associated with the inode.
+ *
+ * Note: because we don't initialise everything on reallocation out
+ * of the zone, we must ensure we nullify everything correctly before
+ * freeing the structure.
  */
 void
-xfs_ireclaim(xfs_inode_t *ip)
+xfs_ireclaim(
+	struct xfs_inode	*ip)
 {
-	/*
-	 * Remove from old hash list and mount list.
-	 */
-	XFS_STATS_INC(xs_ig_reclaims);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
 
-	xfs_iextract(ip);
-
-	/*
-	 * Here we do a spurious inode lock in order to coordinate with
-	 * xfs_sync().	This is because xfs_sync() references the inodes
-	 * in the mount list without taking references on the corresponding
-	 * vnodes.  We make that OK here by ensuring that we wait until
-	 * the inode is unlocked in xfs_sync() before we go ahead and
-	 * free it.  We get both the regular lock and the io lock because
-	 * the xfs_sync() code may need to drop the regular one but will
-	 * still hold the io lock.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-	/*
-	 * Release dquots (and their references) if any. An inode may escape
-	 * xfs_inactive and get here via vn_alloc->vn_reclaim path.
-	 */
-	XFS_QM_DQDETACH(ip->i_mount, ip);
-
-	/*
-	 * Pull our behavior descriptor from the vnode chain.
-	 */
-	if (ip->i_vnode) {
-		ip->i_vnode->i_private = NULL;
-		ip->i_vnode = NULL;
-	}
+	XFS_STATS_INC(xs_ig_reclaims);
 
 	/*
-	 * Free all memory associated with the inode.
+	 * Remove the inode from the per-AG radix tree.  It doesn't matter
+	 * if it was never added to it because radix_tree_delete can deal
+	 * with that case just fine.
 	 */
-	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_idestroy(ip);
-}
-
-/*
- * This routine removes an about-to-be-destroyed inode from
- * all of the lists in which it is located with the exception
- * of the behavior chain.
- */
-void
-xfs_iextract(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
-	xfs_inode_t	*iq;
-
+	pag = xfs_get_perag(mp, ip->i_ino);
 	write_lock(&pag->pag_ici_lock);
 	radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
 	write_unlock(&pag->pag_ici_lock);
 	xfs_put_perag(mp, pag);
 
 	/*
-	 * Remove from mount's inode list.
+	 * Here we do an (almost) spurious inode lock in order to coordinate
+	 * with inode cache radix tree lookups.  This is because the lookup
+	 * can reference the inodes in the cache without taking references.
+	 *
+	 * We make that OK here by ensuring that we wait until the inode is
+	 * unlocked after the lookup before we go ahead and free it.  We get
+	 * both the ilock and the iolock because the code may need to drop the
+	 * ilock one but will still hold the iolock.
 	 */
-	XFS_MOUNT_ILOCK(mp);
-	ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
-	iq = ip->i_mnext;
-	iq->i_mprev = ip->i_mprev;
-	ip->i_mprev->i_mnext = iq;
-
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 	/*
-	 * Fix up the head pointer if it points to the inode being deleted.
+	 * Release dquots (and their references) if any.
 	 */
-	if (mp->m_inodes == ip) {
-		if (ip == iq) {
-			mp->m_inodes = NULL;
-		} else {
-			mp->m_inodes = iq;
-		}
+	XFS_QM_DQDETACH(ip->i_mount, ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
 	}
 
-	/* Deal with the deleted inodes list */
-	list_del_init(&ip->i_reclaim);
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
-	mp->m_ireclaims++;
-	XFS_MOUNT_IUNLOCK(mp);
+#ifdef XFS_INODE_TRACE
+	ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+	ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+	ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ktrace_free(ip->i_dir_trace);
+#endif
+	if (ip->i_itemp) {
+		/*
+		 * Only if we are shutting down the fs will we see an
+		 * inode still in the AIL. If it is there, we should remove
+		 * it to prevent a use-after-free from occurring.
+		 */
+		xfs_log_item_t	*lip = &ip->i_itemp->ili_item;
+		struct xfs_ail	*ailp = lip->li_ailp;
+
+		ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+		       XFS_FORCED_SHUTDOWN(ip->i_mount));
+		if (lip->li_flags & XFS_LI_IN_AIL) {
+			spin_lock(&ailp->xa_lock);
+			if (lip->li_flags & XFS_LI_IN_AIL)
+				xfs_trans_ail_delete(ailp, lip);
+			else
+				spin_unlock(&ailp->xa_lock);
+		}
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_iocount) == 0);
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(completion_done(&ip->i_flush));
+	kmem_zone_free(xfs_inode_zone, ip);
 }
 
 /*
@@ -737,7 +752,7 @@ xfs_iunlock(
 	 * it is in the AIL and anyone is waiting on it.  Don't do
 	 * this if the caller has asked us not to.
 	 */
-		xfs_trans_unlocked_item(ip->i_mount,
+		xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
 					(xfs_log_item_t*)(ip->i_itemp));
 	}
 	xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
@@ -790,3 +805,51 @@ xfs_isilocked(
 }
 #endif
 
+#ifdef	XFS_INODE_TRACE
+
+#define KTRACE_ENTER(ip, vk, s, line, ra)			\
+	ktrace_enter((ip)->i_trace,				\
+/*  0 */		(void *)(__psint_t)(vk),		\
+/*  1 */		(void *)(s),				\
+/*  2 */		(void *)(__psint_t) line,		\
+/*  3 */		(void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
+/*  4 */		(void *)(ra),				\
+/*  5 */		NULL,					\
+/*  6 */		(void *)(__psint_t)current_cpu(),	\
+/*  7 */		(void *)(__psint_t)current_pid(),	\
+/*  8 */		(void *)__return_address,		\
+/*  9 */		NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+
+/*
+ * Vnode tracing code.
+ */
+void
+_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
+}
+
+void
+_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
+}
+
+void
+xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
+}
+
+void
+_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
+}
+
+void
+xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
+}
+#endif	/* XFS_INODE_TRACE */
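The KTRACE_ENTER macro added above logs each event into a fixed-size per-inode ktrace buffer that can later be dumped from a debugger. As an illustration only, a trace ring of this general shape can be sketched in a few lines of C; the size and fields here are invented, not the ktrace layout.

#include <stdio.h>

#define TRACE_SIZE	8	/* stand-in for INODE_TRACE_SIZE */

struct trace_entry { const char *func; int line; };

static struct trace_entry ring[TRACE_SIZE];
static unsigned int ring_index;

/* store events in a circular buffer; old entries are overwritten */
static void trace_enter(const char *func, int line)
{
	ring[ring_index++ % TRACE_SIZE] = (struct trace_entry){ func, line };
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		trace_enter("xfs_iget", i);

	/* after wrapping, the ring holds only the most recent events */
	for (unsigned int i = 0; i < TRACE_SIZE; i++)
		printf("%s:%d\n", ring[i].func, ring[i].line);
	return 0;
}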
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
deleted file mode 100644
index d36450003983..000000000000
--- a/fs/xfs/xfs_imap.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_IMAP_H__
-#define __XFS_IMAP_H__
-
-/*
- * This is the structure passed to xfs_imap() to map
- * an inode number to its on disk location.
- */
-typedef struct xfs_imap {
-	xfs_daddr_t	im_blkno;	/* starting BB of inode chunk */
-	uint		im_len;		/* length in BBs of inode chunk */
-	xfs_agblock_t	im_agblkno;	/* logical block of inode chunk in ag */
-	ushort		im_ioffset;	/* inode offset in block in "inodes" */
-	ushort		im_boffset;	/* inode offset in block in bytes */
-} xfs_imap_t;
-
-#ifdef __KERNEL__
-struct xfs_mount;
-struct xfs_trans;
-int	xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-		 xfs_imap_t *, uint);
-#endif
-
-#endif	/* __XFS_IMAP_H__ */
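The deleted header's xfs_imap structure recorded where an inode lives on disk; after this change the same information travels with the inode itself as ip->i_imap. As a rough illustration of what such a map holds, here is a sketch with a made-up fixed geometry; real XFS derives the mapping from the superblock and inode clusters, not from constants like these.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE	4096	/* illustrative, not sb_blocksize */
#define INODE_SIZE	256	/* illustrative, not sb_inodesize */
#define INODES_PER_BLOCK (BLOCK_SIZE / INODE_SIZE)

struct imap {
	uint64_t	im_blkno;	/* block holding the inode */
	uint32_t	im_boffset;	/* byte offset within that block */
};

/* map an inode number to its (pretend) on-disk location */
static struct imap imap_lookup(uint64_t ino)
{
	struct imap imap;

	imap.im_blkno = ino / INODES_PER_BLOCK;
	imap.im_boffset = (ino % INODES_PER_BLOCK) * INODE_SIZE;
	return imap;
}

int main(void)
{
	struct imap imap = imap_lookup(35);

	printf("ino 35 -> block %llu offset %u\n",
	       (unsigned long long)imap.im_blkno, imap.im_boffset);
	return 0;
}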
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a391b955df01..5a5e035e5d38 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,7 +23,6 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_imap.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
@@ -41,6 +40,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
@@ -133,10 +133,10 @@ STATIC int
 xfs_imap_to_bp(
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
-	xfs_imap_t	*imap,
+	struct xfs_imap	*imap,
 	xfs_buf_t	**bpp,
 	uint		buf_flags,
-	uint		imap_flags)
+	uint		iget_flags)
 {
 	int		error;
 	int		i;
@@ -173,12 +173,12 @@ xfs_imap_to_bp(
 
 		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 					(i << mp->m_sb.sb_inodelog));
-		di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-			XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+		di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
+			XFS_DINODE_GOOD_VERSION(dip->di_version);
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 						XFS_ERRTAG_ITOBP_INOTOBP,
 						XFS_RANDOM_ITOBP_INOTOBP))) {
-			if (imap_flags & XFS_IMAP_BULKSTAT) {
+			if (iget_flags & XFS_IGET_BULKSTAT) {
 				xfs_trans_brelse(tp, bp);
 				return XFS_ERROR(EINVAL);
 			}
@@ -190,7 +190,7 @@ xfs_imap_to_bp(
190 "daddr %lld #%d (magic=%x)", 190 "daddr %lld #%d (magic=%x)",
191 XFS_BUFTARG_NAME(mp->m_ddev_targp), 191 XFS_BUFTARG_NAME(mp->m_ddev_targp),
192 (unsigned long long)imap->im_blkno, i, 192 (unsigned long long)imap->im_blkno, i,
193 be16_to_cpu(dip->di_core.di_magic)); 193 be16_to_cpu(dip->di_magic));
194#endif 194#endif
195 xfs_trans_brelse(tp, bp); 195 xfs_trans_brelse(tp, bp);
196 return XFS_ERROR(EFSCORRUPTED); 196 return XFS_ERROR(EFSCORRUPTED);
@@ -221,25 +221,26 @@ xfs_imap_to_bp(
  * Use xfs_imap() to determine the size and location of the
  * buffer to read from disk.
  */
-STATIC int
+int
 xfs_inotobp(
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
 	xfs_ino_t	ino,
 	xfs_dinode_t	**dipp,
 	xfs_buf_t	**bpp,
-	int		*offset)
+	int		*offset,
+	uint		imap_flags)
 {
-	xfs_imap_t	imap;
+	struct xfs_imap	imap;
 	xfs_buf_t	*bp;
 	int		error;
 
 	imap.im_blkno = 0;
-	error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
+	error = xfs_imap(mp, tp, ino, &imap, imap_flags);
 	if (error)
 		return error;
 
-	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
 	if (error)
 		return error;
 
@@ -260,15 +261,11 @@ xfs_inotobp(
  * If a non-zero error is returned, then the contents of bpp and
  * dipp are undefined.
  *
- * If the inode is new and has not yet been initialized, use xfs_imap()
- * to determine the size and location of the buffer to read from disk.
- * If the inode has already been mapped to its buffer and read in once,
- * then use the mapping information stored in the inode rather than
- * calling xfs_imap().  This allows us to avoid the overhead of looking
- * at the inode btree for small block file systems (see xfs_dilocate()).
- * We can tell whether the inode has been mapped in before by comparing
- * its disk block address to 0. Only uninitialized inodes will have
- * 0 for the disk block address.
+ * The inode is expected to already been mapped to its buffer and read
+ * in once, thus we can use the mapping information stored in the inode
+ * rather than calling xfs_imap().  This allows us to avoid the overhead
+ * of looking at the inode btree for small block file systems
+ * (see xfs_imap()).
  */
 int
 xfs_itobp(
@@ -277,40 +274,14 @@ xfs_itobp(
 	xfs_inode_t	*ip,
 	xfs_dinode_t	**dipp,
 	xfs_buf_t	**bpp,
-	xfs_daddr_t	bno,
-	uint		imap_flags,
 	uint		buf_flags)
 {
-	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
 	int		error;
 
-	if (ip->i_blkno == (xfs_daddr_t)0) {
-		imap.im_blkno = bno;
-		error = xfs_imap(mp, tp, ip->i_ino, &imap,
-				 XFS_IMAP_LOOKUP | imap_flags);
-		if (error)
-			return error;
+	ASSERT(ip->i_imap.im_blkno != 0);
 
-		/*
-		 * Fill in the fields in the inode that will be used to
-		 * map the inode to its buffer from now on.
-		 */
-		ip->i_blkno = imap.im_blkno;
-		ip->i_len = imap.im_len;
-		ip->i_boffset = imap.im_boffset;
-	} else {
-		/*
-		 * We've already mapped the inode once, so just use the
-		 * mapping that we saved the first time.
-		 */
-		imap.im_blkno = ip->i_blkno;
-		imap.im_len = ip->i_len;
-		imap.im_boffset = ip->i_boffset;
-	}
-	ASSERT(bno == 0 || bno == imap.im_blkno);
-
-	error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
 	if (error)
 		return error;
 
@@ -321,7 +292,7 @@ xfs_itobp(
 		return EAGAIN;
 	}
 
-	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 	*bpp = bp;
 	return 0;
 }
@@ -348,26 +319,26 @@ xfs_iformat(
 		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	error = 0;
 
-	if (unlikely(be32_to_cpu(dip->di_core.di_nextents) +
-		     be16_to_cpu(dip->di_core.di_anextents) >
-		     be64_to_cpu(dip->di_core.di_nblocks))) {
+	if (unlikely(be32_to_cpu(dip->di_nextents) +
+		     be16_to_cpu(dip->di_anextents) >
+		     be64_to_cpu(dip->di_nblocks))) {
 		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
 			(unsigned long long)ip->i_ino,
-			(int)(be32_to_cpu(dip->di_core.di_nextents) +
-			    be16_to_cpu(dip->di_core.di_anextents)),
+			(int)(be32_to_cpu(dip->di_nextents) +
+			    be16_to_cpu(dip->di_anextents)),
 			(unsigned long long)
-				be64_to_cpu(dip->di_core.di_nblocks));
+				be64_to_cpu(dip->di_nblocks));
 		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
-	if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
 		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 			"corrupt dinode %Lu, forkoff = 0x%x.",
 			(unsigned long long)ip->i_ino,
-			dip->di_core.di_forkoff);
+			dip->di_forkoff);
 		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
 		return XFS_ERROR(EFSCORRUPTED);
@@ -378,25 +349,25 @@ xfs_iformat(
 	case S_IFCHR:
 	case S_IFBLK:
 	case S_IFSOCK:
-		if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) {
+		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
 			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 					      ip->i_mount, dip);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 		ip->i_d.di_size = 0;
 		ip->i_size = 0;
-		ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev);
+		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
 		break;
 
 	case S_IFREG:
 	case S_IFLNK:
 	case S_IFDIR:
-		switch (dip->di_core.di_format) {
+		switch (dip->di_format) {
 		case XFS_DINODE_FMT_LOCAL:
 			/*
 			 * no local regular files yet
 			 */
-			if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) {
+			if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
 				xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 					"corrupt inode %Lu "
 					"(local format for regular file).",
@@ -407,7 +378,7 @@ xfs_iformat(
 				return XFS_ERROR(EFSCORRUPTED);
 			}
 
-			di_size = be64_to_cpu(dip->di_core.di_size);
+			di_size = be64_to_cpu(dip->di_size);
 			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
 				xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 					"corrupt inode %Lu "
@@ -449,7 +420,7 @@ xfs_iformat(
 	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
 	ip->i_afp->if_ext_max =
 		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
-	switch (dip->di_core.di_aformat) {
+	switch (dip->di_aformat) {
 	case XFS_DINODE_FMT_LOCAL:
 		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 		size = be16_to_cpu(atp->hdr.totsize);
@@ -621,7 +592,7 @@ xfs_iformat_btree(
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 	size = XFS_BMAP_BROOT_SPACE(dfp);
-	nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
+	nrecs = be16_to_cpu(dfp->bb_numrecs);
 
 	/*
 	 * blow out if -- fork has less extents than can fit in
@@ -649,8 +620,9 @@ xfs_iformat_btree(
 	 * Copy and convert from the on-disk structure
 	 * to the in-memory structure.
 	 */
-	xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
-		ifp->if_broot, size);
+	xfs_bmdr_to_bmbt(ip->i_mount, dfp,
+			 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+			 ifp->if_broot, size);
 	ifp->if_flags &= ~XFS_IFEXTENTS;
 	ifp->if_flags |= XFS_IFBROOT;
 
@@ -660,7 +632,7 @@ xfs_iformat_btree(
 void
 xfs_dinode_from_disk(
 	xfs_icdinode_t		*to,
-	xfs_dinode_core_t	*from)
+	xfs_dinode_t		*from)
 {
 	to->di_magic = be16_to_cpu(from->di_magic);
 	to->di_mode = be16_to_cpu(from->di_mode);
@@ -694,7 +666,7 @@ xfs_dinode_from_disk(
 
 void
 xfs_dinode_to_disk(
-	xfs_dinode_core_t	*to,
+	xfs_dinode_t		*to,
 	xfs_icdinode_t		*from)
 {
 	to->di_magic = cpu_to_be16(from->di_magic);
@@ -781,93 +753,57 @@ uint
 xfs_dic2xflags(
 	xfs_dinode_t		*dip)
 {
-	xfs_dinode_core_t	*dic = &dip->di_core;
-
-	return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) |
+	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
 		(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 }
 
 /*
- * Given a mount structure and an inode number, return a pointer
- * to a newly allocated in-core inode corresponding to the given
- * inode number.
- *
- * Initialize the inode's attributes and extent pointers if it
- * already has them (it will not if the inode has no links).
+ * Read the disk inode attributes into the in-core inode structure.
  */
 int
 xfs_iread(
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	xfs_inode_t	**ipp,
+	xfs_inode_t	*ip,
 	xfs_daddr_t	bno,
-	uint		imap_flags)
+	uint		iget_flags)
 {
 	xfs_buf_t	*bp;
 	xfs_dinode_t	*dip;
-	xfs_inode_t	*ip;
 	int		error;
 
-	ASSERT(xfs_inode_zone != NULL);
-
-	ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
-	ip->i_ino = ino;
-	ip->i_mount = mp;
-	atomic_set(&ip->i_iocount, 0);
-	spin_lock_init(&ip->i_flags_lock);
-
 	/*
-	 * Get pointer's to the on-disk inode and the buffer containing it.
-	 * If the inode number refers to a block outside the file system
-	 * then xfs_itobp() will return NULL.  In this case we should
-	 * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
-	 * know that this is a new incore inode.
+	 * Fill in the location information in the in-core inode.
 	 */
-	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
-	if (error) {
-		kmem_zone_free(xfs_inode_zone, ip);
+	ip->i_imap.im_blkno = bno;
+	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+	if (error)
 		return error;
-	}
+	ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
 
 	/*
-	 * Initialize inode's trace buffers.
-	 * Do this before xfs_iformat in case it adds entries.
+	 * Get pointers to the on-disk inode and the buffer containing it.
 	 */
-#ifdef	XFS_INODE_TRACE
-	ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMBT_TRACE
-	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_RW_TRACE
-	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_ILOCK_TRACE
-	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_DIR2_TRACE
-	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
-#endif
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
+			       XFS_BUF_LOCK, iget_flags);
+	if (error)
+		return error;
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
 	/*
 	 * If we got something that isn't an inode it means someone
 	 * (nfs or dmi) has a stale handle.
 	 */
-	if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) {
-		kmem_zone_free(xfs_inode_zone, ip);
-		xfs_trans_brelse(tp, bp);
+	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
 #ifdef DEBUG
 		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
-				"dip->di_core.di_magic (0x%x) != "
+				"dip->di_magic (0x%x) != "
 				"XFS_DINODE_MAGIC (0x%x)",
-				be16_to_cpu(dip->di_core.di_magic),
+				be16_to_cpu(dip->di_magic),
 				XFS_DINODE_MAGIC);
 #endif /* DEBUG */
-		return XFS_ERROR(EINVAL);
+		error = XFS_ERROR(EINVAL);
+		goto out_brelse;
 	}
 
 	/*
@@ -877,24 +813,22 @@ xfs_iread(
877 * specific information. 813 * specific information.
878 * Otherwise, just get the truly permanent information. 814 * Otherwise, just get the truly permanent information.
879 */ 815 */
880 if (dip->di_core.di_mode) { 816 if (dip->di_mode) {
881 xfs_dinode_from_disk(&ip->i_d, &dip->di_core); 817 xfs_dinode_from_disk(&ip->i_d, dip);
882 error = xfs_iformat(ip, dip); 818 error = xfs_iformat(ip, dip);
883 if (error) { 819 if (error) {
884 kmem_zone_free(xfs_inode_zone, ip);
885 xfs_trans_brelse(tp, bp);
886#ifdef DEBUG 820#ifdef DEBUG
887 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 821 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
888 "xfs_iformat() returned error %d", 822 "xfs_iformat() returned error %d",
889 error); 823 error);
890#endif /* DEBUG */ 824#endif /* DEBUG */
891 return error; 825 goto out_brelse;
892 } 826 }
893 } else { 827 } else {
894 ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic); 828 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
895 ip->i_d.di_version = dip->di_core.di_version; 829 ip->i_d.di_version = dip->di_version;
896 ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen); 830 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
897 ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter); 831 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
898 /* 832 /*
899 * Make sure to pull in the mode here as well in 833 * Make sure to pull in the mode here as well in
900 * case the inode is released without being used. 834 * case the inode is released without being used.
@@ -911,8 +845,6 @@ xfs_iread(
911 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 845 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
912 } 846 }
913 847
914 INIT_LIST_HEAD(&ip->i_reclaim);
915
916 /* 848 /*
917 * The inode format changed when we moved the link count and 849 * The inode format changed when we moved the link count and
918 * made it 32 bits long. If this is an old format inode, 850 * made it 32 bits long. If this is an old format inode,
@@ -924,7 +856,7 @@ xfs_iread(
924 * the new format. We don't change the version number so that we 856 * the new format. We don't change the version number so that we
925 * can distinguish this from a real new format inode. 857 * can distinguish this from a real new format inode.
926 */ 858 */
927 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 859 if (ip->i_d.di_version == 1) {
928 ip->i_d.di_nlink = ip->i_d.di_onlink; 860 ip->i_d.di_nlink = ip->i_d.di_onlink;
929 ip->i_d.di_onlink = 0; 861 ip->i_d.di_onlink = 0;
930 ip->i_d.di_projid = 0; 862 ip->i_d.di_projid = 0;
@@ -938,7 +870,7 @@ xfs_iread(
938 * around for a while. This helps to keep recently accessed 870 * around for a while. This helps to keep recently accessed
939 * meta-data in-core longer. 871 * meta-data in-core longer.
940 */ 872 */
941 XFS_BUF_SET_REF(bp, XFS_INO_REF); 873 XFS_BUF_SET_REF(bp, XFS_INO_REF);
942 874
943 /* 875 /*
944 * Use xfs_trans_brelse() to release the buffer containing the 876 * Use xfs_trans_brelse() to release the buffer containing the
@@ -953,9 +885,9 @@ xfs_iread(
953 * to worry about the inode being changed just because we released 885 * to worry about the inode being changed just because we released
954 * the buffer. 886 * the buffer.
955 */ 887 */
888 out_brelse:
956 xfs_trans_brelse(tp, bp); 889 xfs_trans_brelse(tp, bp);
957 *ipp = ip; 890 return error;
958 return 0;
959} 891}
960 892
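With the zone allocation, trace-buffer setup and error-path frees gone, xfs_iread() no longer owns the inode's lifetime. A hedged sketch of the new calling convention; the allocating helper named here (xfs_inode_alloc) lives in the inode cache outside this diff, so treat the exact names as assumptions:

    /* caller allocates; xfs_iread() only fills in the in-core inode */
    ip = xfs_inode_alloc(mp, ino);		/* assumed cache-side helper */
    if (!ip)
    	return ENOMEM;
    error = xfs_iread(mp, tp, ip, bno, iget_flags);
    if (error) {
    	/* teardown is the caller's job now; see xfs_destroy_inode() below */
    	xfs_destroy_inode(ip);
    	return error;
    }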
961/* 893/*
@@ -1049,6 +981,7 @@ xfs_ialloc(
1049 uint flags; 981 uint flags;
1050 int error; 982 int error;
1051 timespec_t tv; 983 timespec_t tv;
984 int filestreams = 0;
1052 985
1053 /* 986 /*
1054 * Call the space management code to pick 987 * Call the space management code to pick
@@ -1056,9 +989,8 @@ xfs_ialloc(
1056 */ 989 */
1057 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, 990 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
1058 ialloc_context, call_again, &ino); 991 ialloc_context, call_again, &ino);
1059 if (error != 0) { 992 if (error)
1060 return error; 993 return error;
1061 }
1062 if (*call_again || ino == NULLFSINO) { 994 if (*call_again || ino == NULLFSINO) {
1063 *ipp = NULL; 995 *ipp = NULL;
1064 return 0; 996 return 0;
@@ -1072,9 +1004,8 @@ xfs_ialloc(
1072 */ 1004 */
1073 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1005 error = xfs_trans_iget(tp->t_mountp, tp, ino,
1074 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1006 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1075 if (error != 0) { 1007 if (error)
1076 return error; 1008 return error;
1077 }
1078 ASSERT(ip != NULL); 1009 ASSERT(ip != NULL);
1079 1010
1080 ip->i_d.di_mode = (__uint16_t)mode; 1011 ip->i_d.di_mode = (__uint16_t)mode;
@@ -1093,8 +1024,8 @@ xfs_ialloc(
1093 * here rather than here and in the flush/logging code. 1024 * here rather than here and in the flush/logging code.
1094 */ 1025 */
1095 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && 1026 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
1096 ip->i_d.di_version == XFS_DINODE_VERSION_1) { 1027 ip->i_d.di_version == 1) {
1097 ip->i_d.di_version = XFS_DINODE_VERSION_2; 1028 ip->i_d.di_version = 2;
1098 /* 1029 /*
1099 * We've already zeroed the old link count, the projid field, 1030 * We've already zeroed the old link count, the projid field,
1100 * and the pad field. 1031 * and the pad field.
@@ -1104,7 +1035,7 @@ xfs_ialloc(
1104 /* 1035 /*
1105 * Project ids won't be stored on disk if we are using a version 1 inode. 1036 * Project ids won't be stored on disk if we are using a version 1 inode.
1106 */ 1037 */
1107 if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) 1038 if ((prid != 0) && (ip->i_d.di_version == 1))
1108 xfs_bump_ino_vers2(tp, ip); 1039 xfs_bump_ino_vers2(tp, ip);
1109 1040
1110 if (pip && XFS_INHERIT_GID(pip)) { 1041 if (pip && XFS_INHERIT_GID(pip)) {
@@ -1155,13 +1086,12 @@ xfs_ialloc(
1155 flags |= XFS_ILOG_DEV; 1086 flags |= XFS_ILOG_DEV;
1156 break; 1087 break;
1157 case S_IFREG: 1088 case S_IFREG:
1158 if (pip && xfs_inode_is_filestream(pip)) { 1089 /*
1159 error = xfs_filestream_associate(pip, ip); 1090 * we can't set up filestreams until after the VFS inode
1160 if (error < 0) 1091 * is set up properly.
1161 return -error; 1092 */
1162 if (!error) 1093 if (pip && xfs_inode_is_filestream(pip))
1163 xfs_iflags_set(ip, XFS_IFILESTREAM); 1094 filestreams = 1;
1164 }
1165 /* fall through */ 1095 /* fall through */
1166 case S_IFDIR: 1096 case S_IFDIR:
1167 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1097 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
@@ -1227,6 +1157,15 @@ xfs_ialloc(
1227 /* now that we have an i_mode we can setup inode ops and unlock */ 1157 /* now that we have an i_mode we can setup inode ops and unlock */
1228 xfs_setup_inode(ip); 1158 xfs_setup_inode(ip);
1229 1159
1160 /* now we have set up the vfs inode we can associate the filestream */
1161 if (filestreams) {
1162 error = xfs_filestream_associate(pip, ip);
1163 if (error < 0)
1164 return -error;
1165 if (!error)
1166 xfs_iflags_set(ip, XFS_IFILESTREAM);
1167 }
1168
1230 *ipp = ip; 1169 *ipp = ip;
1231 return 0; 1170 return 0;
1232} 1171}
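A plausible reading of why the association had to move: filestream tracking pins the inodes involved through the VFS, which is only safe once xfs_setup_inode() has initialized the embedded VFS inode. The IHOLD() macro from the header changes further down asserts exactly that precondition:

    IHOLD(ip);	/* ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) before atomic_inc() */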
@@ -1383,8 +1322,8 @@ xfs_itrunc_trace(
1383 * direct I/O with the truncate operation. Also, because we hold 1322 * direct I/O with the truncate operation. Also, because we hold
1384 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being 1323 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
1385 * started until the truncate completes and drops the lock. Essentially, 1324 * started until the truncate completes and drops the lock. Essentially,
1386 * the vn_iowait() call forms an I/O barrier that provides strict ordering 1325 * the xfs_ioend_wait() call forms an I/O barrier that provides strict
1387 * between direct I/Os and the truncate operation. 1326 * ordering between direct I/Os and the truncate operation.
1388 * 1327 *
1389 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE 1328 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1390 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used 1329 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
@@ -1415,7 +1354,7 @@ xfs_itruncate_start(
1415 1354
1416 /* wait for the completion of any pending DIOs */ 1355 /* wait for the completion of any pending DIOs */
1417 if (new_size == 0 || new_size < ip->i_size) 1356 if (new_size == 0 || new_size < ip->i_size)
1418 vn_iowait(ip); 1357 xfs_ioend_wait(ip);
1419 1358
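xfs_ioend_wait() is defined outside this diff. A minimal sketch of the barrier it provides, assuming a wait queue reachable from the inode (the real helper may instead hash inodes onto a shared wait-queue table):

    static inline void ioend_wait_sketch(struct xfs_inode *ip,
    				     wait_queue_head_t *wq)
    {
    	/* sleep until every outstanding ioend drops i_iocount to zero */
    	wait_event(*wq, atomic_read(&ip->i_iocount) == 0);
    }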
1420 /* 1359 /*
1421 * Call toss_pages or flushinval_pages to get rid of pages 1360 * Call toss_pages or flushinval_pages to get rid of pages
@@ -1726,8 +1665,14 @@ xfs_itruncate_finish(
1726 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1665 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1727 xfs_trans_ihold(ntp, ip); 1666 xfs_trans_ihold(ntp, ip);
1728 1667
1729 if (!error) 1668 if (error)
1730 error = xfs_trans_reserve(ntp, 0, 1669 return error;
1670 /*
1671 * transaction commit worked ok so we can drop the extra ticket
1672 * reference that we gained in xfs_trans_dup()
1673 */
1674 xfs_log_ticket_put(ntp->t_ticket);
1675 error = xfs_trans_reserve(ntp, 0,
1731 XFS_ITRUNCATE_LOG_RES(mp), 0, 1676 XFS_ITRUNCATE_LOG_RES(mp), 0,
1732 XFS_TRANS_PERM_LOG_RES, 1677 XFS_TRANS_PERM_LOG_RES,
1733 XFS_ITRUNCATE_LOG_COUNT); 1678 XFS_ITRUNCATE_LOG_COUNT);
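Context for the new xfs_log_ticket_put() call: this sits in the transaction-roll idiom, where xfs_trans_dup() takes an extra reference on the log ticket so the permanent log reservation survives the commit. A condensed sketch of the surrounding flow (simplified, not verbatim):

    ntp = xfs_trans_dup(tp);		/* gains an extra ticket reference */
    error = xfs_trans_commit(tp, 0);
    /* ... rejoin and re-hold the inode, as in the hunk above ... */
    if (error)
    	return error;
    xfs_log_ticket_put(ntp->t_ticket);	/* drop the xfs_trans_dup() reference */
    error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
    			  XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT);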
@@ -1781,13 +1726,10 @@ xfs_iunlink(
1781 xfs_dinode_t *dip; 1726 xfs_dinode_t *dip;
1782 xfs_buf_t *agibp; 1727 xfs_buf_t *agibp;
1783 xfs_buf_t *ibp; 1728 xfs_buf_t *ibp;
1784 xfs_agnumber_t agno;
1785 xfs_daddr_t agdaddr;
1786 xfs_agino_t agino; 1729 xfs_agino_t agino;
1787 short bucket_index; 1730 short bucket_index;
1788 int offset; 1731 int offset;
1789 int error; 1732 int error;
1790 int agi_ok;
1791 1733
1792 ASSERT(ip->i_d.di_nlink == 0); 1734 ASSERT(ip->i_d.di_nlink == 0);
1793 ASSERT(ip->i_d.di_mode != 0); 1735 ASSERT(ip->i_d.di_mode != 0);
@@ -1795,31 +1737,15 @@ xfs_iunlink(
1795 1737
1796 mp = tp->t_mountp; 1738 mp = tp->t_mountp;
1797 1739
1798 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1799 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1800
1801 /* 1740 /*
1802 * Get the agi buffer first. It ensures lock ordering 1741 * Get the agi buffer first. It ensures lock ordering
1803 * on the list. 1742 * on the list.
1804 */ 1743 */
1805 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1744 error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1806 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1807 if (error) 1745 if (error)
1808 return error; 1746 return error;
1809
1810 /*
1811 * Validate the magic number of the agi block.
1812 */
1813 agi = XFS_BUF_TO_AGI(agibp); 1747 agi = XFS_BUF_TO_AGI(agibp);
1814 agi_ok = 1748
1815 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1816 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1817 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1818 XFS_RANDOM_IUNLINK))) {
1819 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1820 xfs_trans_brelse(tp, agibp);
1821 return XFS_ERROR(EFSCORRUPTED);
1822 }
1823 /* 1749 /*
1824 * Get the index into the agi hash table for the 1750 * Get the index into the agi hash table for the
1825 * list this inode will go on. 1751 * list this inode will go on.
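The open-coded AGI read and validation deleted above is what xfs_read_agi() now encapsulates. A sketch assembled from the removed lines (the real helper lives in xfs_ialloc.c, outside this diff):

    int
    xfs_read_agi(xfs_mount_t *mp, xfs_trans_t *tp, xfs_agnumber_t agno,
    	     xfs_buf_t **bpp)
    {
    	xfs_agi_t	*agi;
    	int		error;

    	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
    			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
    			XFS_FSS_TO_BB(mp, 1), 0, bpp);
    	if (error)
    		return error;

    	/* validate the magic number and version of the agi block */
    	agi = XFS_BUF_TO_AGI(*bpp);
    	if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC ||
    	    !XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) {
    		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
    		xfs_trans_brelse(tp, *bpp);
    		return XFS_ERROR(EFSCORRUPTED);
    	}
    	return 0;
    }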
@@ -1837,14 +1763,14 @@ xfs_iunlink(
1837 * Here we put the head pointer into our next pointer, 1763 * Here we put the head pointer into our next pointer,
1838 * and then we fall through to point the head at us. 1764 * and then we fall through to point the head at us.
1839 */ 1765 */
1840 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1766 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1841 if (error) 1767 if (error)
1842 return error; 1768 return error;
1843 1769
1844 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1770 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
1845 /* both on-disk, don't endian flip twice */ 1771 /* both on-disk, don't endian flip twice */
1846 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1772 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1847 offset = ip->i_boffset + 1773 offset = ip->i_imap.im_boffset +
1848 offsetof(xfs_dinode_t, di_next_unlinked); 1774 offsetof(xfs_dinode_t, di_next_unlinked);
1849 xfs_trans_inode_buf(tp, ibp); 1775 xfs_trans_inode_buf(tp, ibp);
1850 xfs_trans_log_buf(tp, ibp, offset, 1776 xfs_trans_log_buf(tp, ibp, offset,
@@ -1879,7 +1805,6 @@ xfs_iunlink_remove(
1879 xfs_buf_t *agibp; 1805 xfs_buf_t *agibp;
1880 xfs_buf_t *ibp; 1806 xfs_buf_t *ibp;
1881 xfs_agnumber_t agno; 1807 xfs_agnumber_t agno;
1882 xfs_daddr_t agdaddr;
1883 xfs_agino_t agino; 1808 xfs_agino_t agino;
1884 xfs_agino_t next_agino; 1809 xfs_agino_t next_agino;
1885 xfs_buf_t *last_ibp; 1810 xfs_buf_t *last_ibp;
@@ -1887,45 +1812,20 @@ xfs_iunlink_remove(
1887 short bucket_index; 1812 short bucket_index;
1888 int offset, last_offset = 0; 1813 int offset, last_offset = 0;
1889 int error; 1814 int error;
1890 int agi_ok;
1891 1815
1892 /*
1893 * First pull the on-disk inode from the AGI unlinked list.
1894 */
1895 mp = tp->t_mountp; 1816 mp = tp->t_mountp;
1896
1897 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1817 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1898 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1899 1818
1900 /* 1819 /*
1901 * Get the agi buffer first. It ensures lock ordering 1820 * Get the agi buffer first. It ensures lock ordering
1902 * on the list. 1821 * on the list.
1903 */ 1822 */
1904 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1823 error = xfs_read_agi(mp, tp, agno, &agibp);
1905 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1824 if (error)
1906 if (error) {
1907 cmn_err(CE_WARN,
1908 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.",
1909 error, mp->m_fsname);
1910 return error; 1825 return error;
1911 } 1826
1912 /*
1913 * Validate the magic number of the agi block.
1914 */
1915 agi = XFS_BUF_TO_AGI(agibp); 1827 agi = XFS_BUF_TO_AGI(agibp);
1916 agi_ok = 1828
1917 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1918 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1919 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
1920 XFS_RANDOM_IUNLINK_REMOVE))) {
1921 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
1922 mp, agi);
1923 xfs_trans_brelse(tp, agibp);
1924 cmn_err(CE_WARN,
1925 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.",
1926 mp->m_fsname);
1927 return XFS_ERROR(EFSCORRUPTED);
1928 }
1929 /* 1829 /*
1930 * Get the index into the agi hash table for the 1830 * Get the index into the agi hash table for the
1931 * list this inode will go on. 1831 * list this inode will go on.
@@ -1945,7 +1845,7 @@ xfs_iunlink_remove(
1945 * of dealing with the buffer when there is no need to 1845 * of dealing with the buffer when there is no need to
1946 * change it. 1846 * change it.
1947 */ 1847 */
1948 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1848 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1949 if (error) { 1849 if (error) {
1950 cmn_err(CE_WARN, 1850 cmn_err(CE_WARN,
1951 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1851 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1956,7 +1856,7 @@ xfs_iunlink_remove(
1956 ASSERT(next_agino != 0); 1856 ASSERT(next_agino != 0);
1957 if (next_agino != NULLAGINO) { 1857 if (next_agino != NULLAGINO) {
1958 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1858 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1959 offset = ip->i_boffset + 1859 offset = ip->i_imap.im_boffset +
1960 offsetof(xfs_dinode_t, di_next_unlinked); 1860 offsetof(xfs_dinode_t, di_next_unlinked);
1961 xfs_trans_inode_buf(tp, ibp); 1861 xfs_trans_inode_buf(tp, ibp);
1962 xfs_trans_log_buf(tp, ibp, offset, 1862 xfs_trans_log_buf(tp, ibp, offset,
@@ -1992,7 +1892,7 @@ xfs_iunlink_remove(
1992 } 1892 }
1993 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 1893 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1994 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1894 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1995 &last_ibp, &last_offset); 1895 &last_ibp, &last_offset, 0);
1996 if (error) { 1896 if (error) {
1997 cmn_err(CE_WARN, 1897 cmn_err(CE_WARN,
1998 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1898 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.",
@@ -2007,7 +1907,7 @@ xfs_iunlink_remove(
2007 * Now last_ibp points to the buffer previous to us on 1907 * Now last_ibp points to the buffer previous to us on
2008 * the unlinked list. Pull us from the list. 1908 * the unlinked list. Pull us from the list.
2009 */ 1909 */
2010 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1910 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
2011 if (error) { 1911 if (error) {
2012 cmn_err(CE_WARN, 1912 cmn_err(CE_WARN,
2013 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1913 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2019,7 +1919,7 @@ xfs_iunlink_remove(
2019 ASSERT(next_agino != agino); 1919 ASSERT(next_agino != agino);
2020 if (next_agino != NULLAGINO) { 1920 if (next_agino != NULLAGINO) {
2021 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1921 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2022 offset = ip->i_boffset + 1922 offset = ip->i_imap.im_boffset +
2023 offsetof(xfs_dinode_t, di_next_unlinked); 1923 offsetof(xfs_dinode_t, di_next_unlinked);
2024 xfs_trans_inode_buf(tp, ibp); 1924 xfs_trans_inode_buf(tp, ibp);
2025 xfs_trans_log_buf(tp, ibp, offset, 1925 xfs_trans_log_buf(tp, ibp, offset,
@@ -2160,9 +2060,9 @@ xfs_ifree_cluster(
2160 iip = (xfs_inode_log_item_t *)lip; 2060 iip = (xfs_inode_log_item_t *)lip;
2161 ASSERT(iip->ili_logged == 1); 2061 ASSERT(iip->ili_logged == 1);
2162 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; 2062 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2163 spin_lock(&mp->m_ail_lock); 2063 xfs_trans_ail_copy_lsn(mp->m_ail,
2164 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2064 &iip->ili_flush_lsn,
2165 spin_unlock(&mp->m_ail_lock); 2065 &iip->ili_item.li_lsn);
2166 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 2066 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2167 pre_flushed++; 2067 pre_flushed++;
2168 } 2068 }
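Both LSN copies in this function now go through xfs_trans_ail_copy_lsn(), defined with the AIL code outside this diff. Roughly the following, with the lock moved from mp->m_ail_lock into the AIL structure itself (field names inferred from the call site; 64-bit builds can skip the lock, since an aligned 8-byte store is atomic there):

    void
    xfs_trans_ail_copy_lsn(struct xfs_ail *ailp, xfs_lsn_t *dst, xfs_lsn_t *src)
    {
    	ASSERT(sizeof(xfs_lsn_t) == 8);	/* don't lock if it shrinks */
    	spin_lock(&ailp->xa_lock);
    	*dst = *src;
    	spin_unlock(&ailp->xa_lock);
    }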
@@ -2183,9 +2083,8 @@ xfs_ifree_cluster(
2183 iip->ili_last_fields = iip->ili_format.ilf_fields; 2083 iip->ili_last_fields = iip->ili_format.ilf_fields;
2184 iip->ili_format.ilf_fields = 0; 2084 iip->ili_format.ilf_fields = 0;
2185 iip->ili_logged = 1; 2085 iip->ili_logged = 1;
2186 spin_lock(&mp->m_ail_lock); 2086 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2187 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2087 &iip->ili_item.li_lsn);
2188 spin_unlock(&mp->m_ail_lock);
2189 2088
2190 xfs_buf_attach_iodone(bp, 2089 xfs_buf_attach_iodone(bp,
2191 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2090 (void(*)(xfs_buf_t*,xfs_log_item_t*))
@@ -2263,7 +2162,7 @@ xfs_ifree(
2263 2162
2264 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2163 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2265 2164
2266 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 2165 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
2267 if (error) 2166 if (error)
2268 return error; 2167 return error;
2269 2168
@@ -2279,7 +2178,7 @@ xfs_ifree(
2279 * This is a temporary hack that would require a proper fix 2178 * This is a temporary hack that would require a proper fix
2280 * in the future. 2179 * in the future.
2281 */ 2180 */
2282 dip->di_core.di_mode = 0; 2181 dip->di_mode = 0;
2283 2182
2284 if (delete) { 2183 if (delete) {
2285 xfs_ifree_cluster(ip, tp, first_ino); 2184 xfs_ifree_cluster(ip, tp, first_ino);
@@ -2312,9 +2211,10 @@ xfs_iroot_realloc(
2312 int rec_diff, 2211 int rec_diff,
2313 int whichfork) 2212 int whichfork)
2314{ 2213{
2214 struct xfs_mount *mp = ip->i_mount;
2315 int cur_max; 2215 int cur_max;
2316 xfs_ifork_t *ifp; 2216 xfs_ifork_t *ifp;
2317 xfs_bmbt_block_t *new_broot; 2217 struct xfs_btree_block *new_broot;
2318 int new_max; 2218 int new_max;
2319 size_t new_size; 2219 size_t new_size;
2320 char *np; 2220 char *np;
@@ -2335,8 +2235,7 @@ xfs_iroot_realloc(
2335 */ 2235 */
2336 if (ifp->if_broot_bytes == 0) { 2236 if (ifp->if_broot_bytes == 0) {
2337 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 2237 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2338 ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, 2238 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
2339 KM_SLEEP);
2340 ifp->if_broot_bytes = (int)new_size; 2239 ifp->if_broot_bytes = (int)new_size;
2341 return; 2240 return;
2342 } 2241 }
@@ -2347,18 +2246,16 @@ xfs_iroot_realloc(
2347 * location. The records don't change location because 2246 * location. The records don't change location because
2348 * they are kept butted up against the btree block header. 2247 * they are kept butted up against the btree block header.
2349 */ 2248 */
2350 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2249 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2351 new_max = cur_max + rec_diff; 2250 new_max = cur_max + rec_diff;
2352 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2251 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2353 ifp->if_broot = (xfs_bmbt_block_t *) 2252 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2354 kmem_realloc(ifp->if_broot,
2355 new_size,
2356 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2253 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2357 KM_SLEEP); 2254 KM_SLEEP);
2358 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2255 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2359 ifp->if_broot_bytes); 2256 ifp->if_broot_bytes);
2360 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2257 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2361 (int)new_size); 2258 (int)new_size);
2362 ifp->if_broot_bytes = (int)new_size; 2259 ifp->if_broot_bytes = (int)new_size;
2363 ASSERT(ifp->if_broot_bytes <= 2260 ASSERT(ifp->if_broot_bytes <=
2364 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2261 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
@@ -2372,7 +2269,7 @@ xfs_iroot_realloc(
2372 * records, just get rid of the root and clear the status bit. 2269 * records, just get rid of the root and clear the status bit.
2373 */ 2270 */
2374 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2271 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2375 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2272 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2376 new_max = cur_max + rec_diff; 2273 new_max = cur_max + rec_diff;
2377 ASSERT(new_max >= 0); 2274 ASSERT(new_max >= 0);
2378 if (new_max > 0) 2275 if (new_max > 0)
@@ -2380,11 +2277,11 @@ xfs_iroot_realloc(
2380 else 2277 else
2381 new_size = 0; 2278 new_size = 0;
2382 if (new_size > 0) { 2279 if (new_size > 0) {
2383 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); 2280 new_broot = kmem_alloc(new_size, KM_SLEEP);
2384 /* 2281 /*
2385 * First copy over the btree block header. 2282 * First copy over the btree block header.
2386 */ 2283 */
2387 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); 2284 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
2388 } else { 2285 } else {
2389 new_broot = NULL; 2286 new_broot = NULL;
2390 ifp->if_flags &= ~XFS_IFBROOT; 2287 ifp->if_flags &= ~XFS_IFBROOT;
@@ -2397,18 +2294,16 @@ xfs_iroot_realloc(
2397 /* 2294 /*
2398 * First copy the records. 2295 * First copy the records.
2399 */ 2296 */
2400 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, 2297 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2401 ifp->if_broot_bytes); 2298 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2402 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2403 (int)new_size);
2404 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2299 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2405 2300
2406 /* 2301 /*
2407 * Then copy the pointers. 2302 * Then copy the pointers.
2408 */ 2303 */
2409 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2304 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2410 ifp->if_broot_bytes); 2305 ifp->if_broot_bytes);
2411 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, 2306 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2412 (int)new_size); 2307 (int)new_size);
2413 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2308 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2414 } 2309 }
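XFS_BMAP_BROOT_MAXRECS was a pure macro over the buffer size; its replacement xfs_bmbt_maxrecs() (in xfs_bmap_btree.c, outside this diff) also distinguishes leaf from node blocks. A sketch for the long-format blocks used here, with the header length taken from the XFS_BTREE_LBLOCK_LEN memcpy above:

    int
    xfs_bmbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf)
    {
    	blocklen -= XFS_BTREE_LBLOCK_LEN;	/* skip the block header */

    	if (leaf)
    		return blocklen / sizeof(xfs_bmbt_rec_t);
    	/* interior (node) blocks hold key/pointer pairs instead */
    	return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
    }

Both call sites above pass leaf == 0, sizing the incore broot as a node block.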
@@ -2511,64 +2406,6 @@ xfs_idata_realloc(
2511 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2406 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2512} 2407}
2513 2408
2514
2515
2516
2517/*
2518 * Map inode to disk block and offset.
2519 *
2520 * mp -- the mount point structure for the current file system
2521 * tp -- the current transaction
2522 * ino -- the inode number of the inode to be located
2523 * imap -- this structure is filled in with the information necessary
2524 * to retrieve the given inode from disk
2525 * flags -- flags to pass to xfs_dilocate indicating whether or not
2526 * lookups in the inode btree were OK or not
2527 */
2528int
2529xfs_imap(
2530 xfs_mount_t *mp,
2531 xfs_trans_t *tp,
2532 xfs_ino_t ino,
2533 xfs_imap_t *imap,
2534 uint flags)
2535{
2536 xfs_fsblock_t fsbno;
2537 int len;
2538 int off;
2539 int error;
2540
2541 fsbno = imap->im_blkno ?
2542 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2543 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2544 if (error)
2545 return error;
2546
2547 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2548 imap->im_len = XFS_FSB_TO_BB(mp, len);
2549 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2550 imap->im_ioffset = (ushort)off;
2551 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2552
2553 /*
2554 * If the inode number maps to a block outside the bounds
2555 * of the file system then return NULL rather than calling
2556 * read_buf and panicing when we get an error from the
2557 * driver.
2558 */
2559 if ((imap->im_blkno + imap->im_len) >
2560 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2561 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
2562 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
2563 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
2564 (unsigned long long) imap->im_blkno,
2565 (unsigned long long) imap->im_len,
2566 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2567 return EINVAL;
2568 }
2569 return 0;
2570}
2571
2572void 2409void
2573xfs_idestroy_fork( 2410xfs_idestroy_fork(
2574 xfs_inode_t *ip, 2411 xfs_inode_t *ip,
@@ -2613,70 +2450,6 @@ xfs_idestroy_fork(
2613} 2450}
2614 2451
2615/* 2452/*
2616 * This is called to free all the memory associated with an inode.
2617 * It must free the inode itself and any buffers allocated for
2618 * if_extents/if_data and if_broot. It must also free the lock
2619 * associated with the inode.
2620 */
2621void
2622xfs_idestroy(
2623 xfs_inode_t *ip)
2624{
2625 switch (ip->i_d.di_mode & S_IFMT) {
2626 case S_IFREG:
2627 case S_IFDIR:
2628 case S_IFLNK:
2629 xfs_idestroy_fork(ip, XFS_DATA_FORK);
2630 break;
2631 }
2632 if (ip->i_afp)
2633 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2634 mrfree(&ip->i_lock);
2635 mrfree(&ip->i_iolock);
2636
2637#ifdef XFS_INODE_TRACE
2638 ktrace_free(ip->i_trace);
2639#endif
2640#ifdef XFS_BMAP_TRACE
2641 ktrace_free(ip->i_xtrace);
2642#endif
2643#ifdef XFS_BMBT_TRACE
2644 ktrace_free(ip->i_btrace);
2645#endif
2646#ifdef XFS_RW_TRACE
2647 ktrace_free(ip->i_rwtrace);
2648#endif
2649#ifdef XFS_ILOCK_TRACE
2650 ktrace_free(ip->i_lock_trace);
2651#endif
2652#ifdef XFS_DIR2_TRACE
2653 ktrace_free(ip->i_dir_trace);
2654#endif
2655 if (ip->i_itemp) {
2656 /*
2657 * Only if we are shutting down the fs will we see an
2658 * inode still in the AIL. If it is there, we should remove
2659 * it to prevent a use-after-free from occurring.
2660 */
2661 xfs_mount_t *mp = ip->i_mount;
2662 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
2663
2664 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
2665 XFS_FORCED_SHUTDOWN(ip->i_mount));
2666 if (lip->li_flags & XFS_LI_IN_AIL) {
2667 spin_lock(&mp->m_ail_lock);
2668 if (lip->li_flags & XFS_LI_IN_AIL)
2669 xfs_trans_delete_ail(mp, lip);
2670 else
2671 spin_unlock(&mp->m_ail_lock);
2672 }
2673 xfs_inode_item_destroy(ip);
2674 }
2675 kmem_zone_free(xfs_inode_zone, ip);
2676}
2677
2678
2679/* 2452/*
2680 * Increment the pin count of the given inode. 2453 * Increment the pin count of the given inode.
2681 * This value is protected by ipinlock spinlock in the mount structure. 2454 * This value is protected by ipinlock spinlock in the mount structure.
2682 */ 2455 */
@@ -2880,7 +2653,7 @@ xfs_iflush_fork(
2880 ASSERT(ifp->if_broot_bytes <= 2653 ASSERT(ifp->if_broot_bytes <=
2881 (XFS_IFORK_SIZE(ip, whichfork) + 2654 (XFS_IFORK_SIZE(ip, whichfork) +
2882 XFS_BROOT_SIZE_ADJ)); 2655 XFS_BROOT_SIZE_ADJ));
2883 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, 2656 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2884 (xfs_bmdr_block_t *)cp, 2657 (xfs_bmdr_block_t *)cp,
2885 XFS_DFORK_SIZE(dip, mp, whichfork)); 2658 XFS_DFORK_SIZE(dip, mp, whichfork));
2886 } 2659 }
@@ -2889,15 +2662,16 @@ xfs_iflush_fork(
2889 case XFS_DINODE_FMT_DEV: 2662 case XFS_DINODE_FMT_DEV:
2890 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2663 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
2891 ASSERT(whichfork == XFS_DATA_FORK); 2664 ASSERT(whichfork == XFS_DATA_FORK);
2892 dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev); 2665 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2893 } 2666 }
2894 break; 2667 break;
2895 2668
2896 case XFS_DINODE_FMT_UUID: 2669 case XFS_DINODE_FMT_UUID:
2897 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2670 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
2898 ASSERT(whichfork == XFS_DATA_FORK); 2671 ASSERT(whichfork == XFS_DATA_FORK);
2899 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, 2672 memcpy(XFS_DFORK_DPTR(dip),
2900 sizeof(uuid_t)); 2673 &ip->i_df.if_u2.if_uuid,
2674 sizeof(uuid_t));
2901 } 2675 }
2902 break; 2676 break;
2903 2677
@@ -3030,7 +2804,6 @@ cluster_corrupt_out:
3030 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 2804 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3031 XFS_BUF_UNDONE(bp); 2805 XFS_BUF_UNDONE(bp);
3032 XFS_BUF_STALE(bp); 2806 XFS_BUF_STALE(bp);
3033 XFS_BUF_SHUT(bp);
3034 XFS_BUF_ERROR(bp,EIO); 2807 XFS_BUF_ERROR(bp,EIO);
3035 xfs_biodone(bp); 2808 xfs_biodone(bp);
3036 } else { 2809 } else {
@@ -3172,7 +2945,7 @@ xfs_iflush(
3172 /* 2945 /*
3173 * Get the buffer containing the on-disk inode. 2946 * Get the buffer containing the on-disk inode.
3174 */ 2947 */
3175 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, 2948 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
3176 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2949 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
3177 if (error || !bp) { 2950 if (error || !bp) {
3178 xfs_ifunlock(ip); 2951 xfs_ifunlock(ip);
@@ -3253,7 +3026,7 @@ xfs_iflush_int(
3253 } 3026 }
3254 3027
3255 /* set *dip = inode's place in the buffer */ 3028 /* set *dip = inode's place in the buffer */
3256 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); 3029 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3257 3030
3258 /* 3031 /*
3259 * Clear i_update_core before copying out the data. 3032 * Clear i_update_core before copying out the data.
@@ -3275,11 +3048,11 @@ xfs_iflush_int(
3275 */ 3048 */
3276 xfs_synchronize_atime(ip); 3049 xfs_synchronize_atime(ip);
3277 3050
3278 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC, 3051 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
3279 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3052 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3280 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3053 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3281 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 3054 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3282 ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip); 3055 ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3283 goto corrupt_out; 3056 goto corrupt_out;
3284 } 3057 }
3285 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 3058 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
@@ -3342,7 +3115,7 @@ xfs_iflush_int(
3342 * because if the inode is dirty at all the core must 3115 * because if the inode is dirty at all the core must
3343 * be. 3116 * be.
3344 */ 3117 */
3345 xfs_dinode_to_disk(&dip->di_core, &ip->i_d); 3118 xfs_dinode_to_disk(dip, &ip->i_d);
3346 3119
3347 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3120 /* Wrap, we never let the log put out DI_MAX_FLUSH */
3348 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3121 if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3354,28 +3127,27 @@ xfs_iflush_int(
3354 * convert back to the old inode format. If the superblock version 3127 * convert back to the old inode format. If the superblock version
3355 * has been updated, then make the conversion permanent. 3128 * has been updated, then make the conversion permanent.
3356 */ 3129 */
3357 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 3130 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
3358 xfs_sb_version_hasnlink(&mp->m_sb)); 3131 if (ip->i_d.di_version == 1) {
3359 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3360 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 3132 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
3361 /* 3133 /*
3362 * Convert it back. 3134 * Convert it back.
3363 */ 3135 */
3364 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3136 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3365 dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink); 3137 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
3366 } else { 3138 } else {
3367 /* 3139 /*
3368 * The superblock version has already been bumped, 3140 * The superblock version has already been bumped,
3369 * so just make the conversion to the new inode 3141 * so just make the conversion to the new inode
3370 * format permanent. 3142 * format permanent.
3371 */ 3143 */
3372 ip->i_d.di_version = XFS_DINODE_VERSION_2; 3144 ip->i_d.di_version = 2;
3373 dip->di_core.di_version = XFS_DINODE_VERSION_2; 3145 dip->di_version = 2;
3374 ip->i_d.di_onlink = 0; 3146 ip->i_d.di_onlink = 0;
3375 dip->di_core.di_onlink = 0; 3147 dip->di_onlink = 0;
3376 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3148 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3377 memset(&(dip->di_core.di_pad[0]), 0, 3149 memset(&(dip->di_pad[0]), 0,
3378 sizeof(dip->di_core.di_pad)); 3150 sizeof(dip->di_pad));
3379 ASSERT(ip->i_d.di_projid == 0); 3151 ASSERT(ip->i_d.di_projid == 0);
3380 } 3152 }
3381 } 3153 }
@@ -3418,10 +3190,8 @@ xfs_iflush_int(
3418 iip->ili_format.ilf_fields = 0; 3190 iip->ili_format.ilf_fields = 0;
3419 iip->ili_logged = 1; 3191 iip->ili_logged = 1;
3420 3192
3421 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ 3193 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3422 spin_lock(&mp->m_ail_lock); 3194 &iip->ili_item.li_lsn);
3423 iip->ili_flush_lsn = iip->ili_item.li_lsn;
3424 spin_unlock(&mp->m_ail_lock);
3425 3195
3426 /* 3196 /*
3427 * Attach the function xfs_iflush_done to the inode's 3197 * Attach the function xfs_iflush_done to the inode's
@@ -3459,45 +3229,8 @@ corrupt_out:
3459} 3229}
3460 3230
3461 3231
3462/*
3463 * Flush all inactive inodes in mp.
3464 */
3465void
3466xfs_iflush_all(
3467 xfs_mount_t *mp)
3468{
3469 xfs_inode_t *ip;
3470
3471 again:
3472 XFS_MOUNT_ILOCK(mp);
3473 ip = mp->m_inodes;
3474 if (ip == NULL)
3475 goto out;
3476
3477 do {
3478 /* Make sure we skip markers inserted by sync */
3479 if (ip->i_mount == NULL) {
3480 ip = ip->i_mnext;
3481 continue;
3482 }
3483
3484 if (!VFS_I(ip)) {
3485 XFS_MOUNT_IUNLOCK(mp);
3486 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3487 goto again;
3488 }
3489
3490 ASSERT(vn_count(VFS_I(ip)) == 0);
3491
3492 ip = ip->i_mnext;
3493 } while (ip != mp->m_inodes);
3494 out:
3495 XFS_MOUNT_IUNLOCK(mp);
3496}
3497 3232
3498#ifdef XFS_ILOCK_TRACE 3233#ifdef XFS_ILOCK_TRACE
3499ktrace_t *xfs_ilock_trace_buf;
3500
3501void 3234void
3502xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) 3235xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3503{ 3236{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6be310d41daf..1f175fa34b22 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -19,8 +19,7 @@
19#define __XFS_INODE_H__ 19#define __XFS_INODE_H__
20 20
21struct xfs_dinode; 21struct xfs_dinode;
22struct xfs_dinode_core; 22struct xfs_inode;
23
24 23
25/* 24/*
26 * Fork identifiers. 25 * Fork identifiers.
@@ -63,7 +62,7 @@ typedef struct xfs_ext_irec {
63typedef struct xfs_ifork { 62typedef struct xfs_ifork {
64 int if_bytes; /* bytes in if_u1 */ 63 int if_bytes; /* bytes in if_u1 */
65 int if_real_bytes; /* bytes allocated in if_u1 */ 64 int if_real_bytes; /* bytes allocated in if_u1 */
66 xfs_bmbt_block_t *if_broot; /* file's incore btree root */ 65 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */ 66 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 67 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 68 unsigned char if_ext_max; /* max # of extent records */
@@ -84,52 +83,14 @@ typedef struct xfs_ifork {
84} xfs_ifork_t; 83} xfs_ifork_t;
85 84
86/* 85/*
87 * Flags for xfs_ichgtime(). 86 * Inode location information. Stored in the inode and passed to
87 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
88 */ 88 */
89#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ 89struct xfs_imap {
90#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ 90 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
91 91 ushort im_len; /* length in BBs of inode chunk */
92/* 92 ushort im_boffset; /* inode offset in block in bytes */
93 * Per-fork incore inode flags. 93};
94 */
95#define XFS_IFINLINE 0x01 /* Inline data is read in */
96#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
97#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
98#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
99
100/*
101 * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate().
102 */
103#define XFS_IMAP_LOOKUP 0x1
104#define XFS_IMAP_BULKSTAT 0x2
105
106#ifdef __KERNEL__
107struct bhv_desc;
108struct cred;
109struct ktrace;
110struct xfs_buf;
111struct xfs_bmap_free;
112struct xfs_bmbt_irec;
113struct xfs_bmbt_block;
114struct xfs_inode;
115struct xfs_inode_log_item;
116struct xfs_mount;
117struct xfs_trans;
118struct xfs_dquot;
119
120#if defined(XFS_ILOCK_TRACE)
121#define XFS_ILOCK_KTRACE_SIZE 32
122extern ktrace_t *xfs_ilock_trace_buf;
123extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
124#else
125#define xfs_ilock_trace(i,n,f,ra)
126#endif
127
128typedef struct dm_attrs_s {
129 __uint32_t da_dmevmask; /* DMIG event mask */
130 __uint16_t da_dmstate; /* DMIG state info */
131 __uint16_t da_pad; /* DMIG extra padding */
132} dm_attrs_t;
133 94
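Usage of the new location cache, condensed from the xfs_iread() hunks earlier in this diff (a sketch, not a verbatim excerpt):

    struct xfs_imap	*imap = &ip->i_imap;

    imap->im_blkno = bno;		/* 0 == location not yet known */
    error = xfs_imap(mp, tp, ip->i_ino, imap, iget_flags);
    if (error)
    	return error;

    error = xfs_imap_to_bp(mp, tp, imap, &bp, XFS_BUF_LOCK, iget_flags);
    if (error)
    	return error;
    dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap->im_boffset);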
134/* 95/*
135 * This is the xfs in-core inode structure. 96 * This is the xfs in-core inode structure.
@@ -160,7 +121,7 @@ typedef struct xfs_ictimestamp {
160} xfs_ictimestamp_t; 121} xfs_ictimestamp_t;
161 122
162/* 123/*
163 * NOTE: This structure must be kept identical to struct xfs_dinode_core 124 * NOTE: This structure must be kept identical to struct xfs_dinode
164 * in xfs_dinode.h except for the endianness annotations. 125 * in xfs_dinode.h except for the endianness annotations.
165 */ 126 */
166typedef struct xfs_icdinode { 127typedef struct xfs_icdinode {
@@ -191,27 +152,97 @@ typedef struct xfs_icdinode {
191 __uint32_t di_gen; /* generation number */ 152 __uint32_t di_gen; /* generation number */
192} xfs_icdinode_t; 153} xfs_icdinode_t;
193 154
194typedef struct { 155/*
195 struct xfs_inode *ip_mnext; /* next inode in mount list */ 156 * Flags for xfs_ichgtime().
196 struct xfs_inode *ip_mprev; /* ptr to prev inode */ 157 */
197 struct xfs_mount *ip_mount; /* fs mount struct ptr */ 158#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
198} xfs_iptr_t; 159#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
160
161/*
162 * Per-fork incore inode flags.
163 */
164#define XFS_IFINLINE 0x01 /* Inline data is read in */
165#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
166#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
167#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
168
169/*
170 * Fork handling.
171 */
172
173#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
174#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
175
176#define XFS_IFORK_PTR(ip,w) \
177 ((w) == XFS_DATA_FORK ? \
178 &(ip)->i_df : \
179 (ip)->i_afp)
180#define XFS_IFORK_DSIZE(ip) \
181 (XFS_IFORK_Q(ip) ? \
182 XFS_IFORK_BOFF(ip) : \
183 XFS_LITINO((ip)->i_mount))
184#define XFS_IFORK_ASIZE(ip) \
185 (XFS_IFORK_Q(ip) ? \
186 XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
187 0)
188#define XFS_IFORK_SIZE(ip,w) \
189 ((w) == XFS_DATA_FORK ? \
190 XFS_IFORK_DSIZE(ip) : \
191 XFS_IFORK_ASIZE(ip))
192#define XFS_IFORK_FORMAT(ip,w) \
193 ((w) == XFS_DATA_FORK ? \
194 (ip)->i_d.di_format : \
195 (ip)->i_d.di_aformat)
196#define XFS_IFORK_FMT_SET(ip,w,n) \
197 ((w) == XFS_DATA_FORK ? \
198 ((ip)->i_d.di_format = (n)) : \
199 ((ip)->i_d.di_aformat = (n)))
200#define XFS_IFORK_NEXTENTS(ip,w) \
201 ((w) == XFS_DATA_FORK ? \
202 (ip)->i_d.di_nextents : \
203 (ip)->i_d.di_anextents)
204#define XFS_IFORK_NEXT_SET(ip,w,n) \
205 ((w) == XFS_DATA_FORK ? \
206 ((ip)->i_d.di_nextents = (n)) : \
207 ((ip)->i_d.di_anextents = (n)))
208
209
210
211#ifdef __KERNEL__
212
213struct bhv_desc;
214struct cred;
215struct ktrace;
216struct xfs_buf;
217struct xfs_bmap_free;
218struct xfs_bmbt_irec;
219struct xfs_inode_log_item;
220struct xfs_mount;
221struct xfs_trans;
222struct xfs_dquot;
223
224#if defined(XFS_ILOCK_TRACE)
225#define XFS_ILOCK_KTRACE_SIZE 32
226extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
227#else
228#define xfs_ilock_trace(i,n,f,ra)
229#endif
230
231typedef struct dm_attrs_s {
232 __uint32_t da_dmevmask; /* DMIG event mask */
233 __uint16_t da_dmstate; /* DMIG state info */
234 __uint16_t da_pad; /* DMIG extra padding */
235} dm_attrs_t;
199 236
200typedef struct xfs_inode { 237typedef struct xfs_inode {
201 /* Inode linking and identification information. */ 238 /* Inode linking and identification information. */
202 struct xfs_inode *i_mnext; /* next inode in mount list */
203 struct xfs_inode *i_mprev; /* ptr to prev inode */
204 struct xfs_mount *i_mount; /* fs mount struct ptr */ 239 struct xfs_mount *i_mount; /* fs mount struct ptr */
205 struct list_head i_reclaim; /* reclaim list */
206 struct inode *i_vnode; /* vnode backpointer */
207 struct xfs_dquot *i_udquot; /* user dquot */ 240 struct xfs_dquot *i_udquot; /* user dquot */
208 struct xfs_dquot *i_gdquot; /* group dquot */ 241 struct xfs_dquot *i_gdquot; /* group dquot */
209 242
210 /* Inode location stuff */ 243 /* Inode location stuff */
211 xfs_ino_t i_ino; /* inode number (agno/agino)*/ 244 xfs_ino_t i_ino; /* inode number (agno/agino)*/
212 xfs_daddr_t i_blkno; /* blkno of inode buffer */ 245 struct xfs_imap i_imap; /* location for xfs_imap() */
213 ushort i_len; /* len of inode buffer */
214 ushort i_boffset; /* off of inode in buffer */
215 246
216 /* Extent information. */ 247 /* Extent information. */
217 xfs_ifork_t *i_afp; /* attribute fork pointer */ 248 xfs_ifork_t *i_afp; /* attribute fork pointer */
@@ -230,7 +261,6 @@ typedef struct xfs_inode {
230 unsigned short i_flags; /* see defined flags below */ 261 unsigned short i_flags; /* see defined flags below */
231 unsigned char i_update_core; /* timestamps/size is dirty */ 262 unsigned char i_update_core; /* timestamps/size is dirty */
232 unsigned char i_update_size; /* di_size field is dirty */ 263 unsigned char i_update_size; /* di_size field is dirty */
233 unsigned int i_gen; /* generation count */
234 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
235 265
236 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -238,6 +268,10 @@ typedef struct xfs_inode {
238 xfs_fsize_t i_size; /* in-memory size */ 268 xfs_fsize_t i_size; /* in-memory size */
239 xfs_fsize_t i_new_size; /* size when write completes */ 269 xfs_fsize_t i_new_size; /* size when write completes */
240 atomic_t i_iocount; /* outstanding I/O count */ 270 atomic_t i_iocount; /* outstanding I/O count */
271
272 /* VFS inode */
273 struct inode i_vnode; /* embedded VFS inode */
274
241 /* Trace buffers per inode. */ 275 /* Trace buffers per inode. */
242#ifdef XFS_INODE_TRACE 276#ifdef XFS_INODE_TRACE
243 struct ktrace *i_trace; /* general inode trace */ 277 struct ktrace *i_trace; /* general inode trace */
@@ -245,7 +279,7 @@ typedef struct xfs_inode {
245#ifdef XFS_BMAP_TRACE 279#ifdef XFS_BMAP_TRACE
246 struct ktrace *i_xtrace; /* inode extent list trace */ 280 struct ktrace *i_xtrace; /* inode extent list trace */
247#endif 281#endif
248#ifdef XFS_BMBT_TRACE 282#ifdef XFS_BTREE_TRACE
249 struct ktrace *i_btrace; /* inode bmap btree trace */ 283 struct ktrace *i_btrace; /* inode bmap btree trace */
250#endif 284#endif
251#ifdef XFS_RW_TRACE 285#ifdef XFS_RW_TRACE
@@ -265,13 +299,30 @@ typedef struct xfs_inode {
265/* Convert from vfs inode to xfs inode */ 299/* Convert from vfs inode to xfs inode */
266static inline struct xfs_inode *XFS_I(struct inode *inode) 300static inline struct xfs_inode *XFS_I(struct inode *inode)
267{ 301{
268 return (struct xfs_inode *)inode->i_private; 302 return container_of(inode, struct xfs_inode, i_vnode);
269} 303}
270 304
271/* convert from xfs inode to vfs inode */ 305/* convert from xfs inode to vfs inode */
272static inline struct inode *VFS_I(struct xfs_inode *ip) 306static inline struct inode *VFS_I(struct xfs_inode *ip)
273{ 307{
274 return (struct inode *)ip->i_vnode; 308 return &ip->i_vnode;
309}
310
311/*
312 * Get rid of a partially initialized inode.
313 *
314 * We have to go through destroy_inode to make sure allocations
315 * from inode_init_always like the security data are undone.
316 *
317 * We mark the inode bad so that it takes the short cut in
318 * the reclaim path instead of going through the flush path
319 * which doesn't make sense for an inode that has never seen the
320 * light of day.
321 */
322static inline void xfs_destroy_inode(struct xfs_inode *ip)
323{
324 make_bad_inode(VFS_I(ip));
325 return destroy_inode(VFS_I(ip));
275} 326}
276 327
277/* 328/*
@@ -327,65 +378,36 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
327 spin_unlock(&ip->i_flags_lock); 378 spin_unlock(&ip->i_flags_lock);
328 return ret; 379 return ret;
329} 380}
330#endif /* __KERNEL__ */
331
332 381
333/* 382/*
334 * Fork handling. 383 * Manage the i_flush queue embedded in the inode. This completion
384 * queue synchronizes processes attempting to flush the in-core
385 * inode back to disk.
335 */ 386 */
387static inline void xfs_iflock(xfs_inode_t *ip)
388{
389 wait_for_completion(&ip->i_flush);
390}
336 391
337#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) 392static inline int xfs_iflock_nowait(xfs_inode_t *ip)
338#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) 393{
339 394 return try_wait_for_completion(&ip->i_flush);
340#define XFS_IFORK_PTR(ip,w) \ 395}
341 ((w) == XFS_DATA_FORK ? \
342 &(ip)->i_df : \
343 (ip)->i_afp)
344#define XFS_IFORK_DSIZE(ip) \
345 (XFS_IFORK_Q(ip) ? \
346 XFS_IFORK_BOFF(ip) : \
347 XFS_LITINO((ip)->i_mount))
348#define XFS_IFORK_ASIZE(ip) \
349 (XFS_IFORK_Q(ip) ? \
350 XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
351 0)
352#define XFS_IFORK_SIZE(ip,w) \
353 ((w) == XFS_DATA_FORK ? \
354 XFS_IFORK_DSIZE(ip) : \
355 XFS_IFORK_ASIZE(ip))
356#define XFS_IFORK_FORMAT(ip,w) \
357 ((w) == XFS_DATA_FORK ? \
358 (ip)->i_d.di_format : \
359 (ip)->i_d.di_aformat)
360#define XFS_IFORK_FMT_SET(ip,w,n) \
361 ((w) == XFS_DATA_FORK ? \
362 ((ip)->i_d.di_format = (n)) : \
363 ((ip)->i_d.di_aformat = (n)))
364#define XFS_IFORK_NEXTENTS(ip,w) \
365 ((w) == XFS_DATA_FORK ? \
366 (ip)->i_d.di_nextents : \
367 (ip)->i_d.di_anextents)
368#define XFS_IFORK_NEXT_SET(ip,w,n) \
369 ((w) == XFS_DATA_FORK ? \
370 ((ip)->i_d.di_nextents = (n)) : \
371 ((ip)->i_d.di_anextents = (n)))
372 396
373#ifdef __KERNEL__ 397static inline void xfs_ifunlock(xfs_inode_t *ip)
398{
399 complete(&ip->i_flush);
400}
374 401
375/* 402/*
376 * In-core inode flags. 403 * In-core inode flags.
377 */ 404 */
378#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */ 405#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */
379#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */ 406#define XFS_ISTALE 0x0002 /* inode has been staled */
380#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */ 407#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
381#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */ 408#define XFS_INEW 0x0008 /* inode has just been allocated */
382#define XFS_ISTALE 0x0010 /* inode has been staled */ 409#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
383#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ 410#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
384#define XFS_INEW 0x0040
385#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */
386#define XFS_IMODIFIED 0x0100 /* XFS inode state possibly differs */
387 /* to the Linux inode state. */
388#define XFS_ITRUNCATED 0x0200 /* truncated down so flush-on-close */
389 411
390/* 412/*
391 * Flags for inode locking. 413 * Flags for inode locking.
@@ -460,16 +482,8 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
460 ((pip)->i_d.di_mode & S_ISGID)) 482 ((pip)->i_d.di_mode & S_ISGID))
461 483
462/* 484/*
463 * Flags for xfs_iget()
464 */
465#define XFS_IGET_CREATE 0x1
466#define XFS_IGET_BULKSTAT 0x2
467
468/*
469 * xfs_iget.c prototypes. 485 * xfs_iget.c prototypes.
470 */ 486 */
471void xfs_ihash_init(struct xfs_mount *);
472void xfs_ihash_free(struct xfs_mount *);
473xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, 487xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
474 struct xfs_trans *); 488 struct xfs_trans *);
475int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 489int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
@@ -484,25 +498,13 @@ int xfs_isilocked(xfs_inode_t *, uint);
484uint xfs_ilock_map_shared(xfs_inode_t *); 498uint xfs_ilock_map_shared(xfs_inode_t *);
485void xfs_iunlock_map_shared(xfs_inode_t *, uint); 499void xfs_iunlock_map_shared(xfs_inode_t *, uint);
486void xfs_ireclaim(xfs_inode_t *); 500void xfs_ireclaim(xfs_inode_t *);
487int xfs_finish_reclaim(xfs_inode_t *, int, int);
488int xfs_finish_reclaim_all(struct xfs_mount *, int);
489 501
490/* 502/*
491 * xfs_inode.c prototypes. 503 * xfs_inode.c prototypes.
492 */ 504 */
493int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
494 xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
495 xfs_daddr_t, uint, uint);
496int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
497 xfs_inode_t **, xfs_daddr_t, uint);
498int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
499int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 505int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
500 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, 506 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
501 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 507 int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
502void xfs_dinode_from_disk(struct xfs_icdinode *,
503 struct xfs_dinode_core *);
504void xfs_dinode_to_disk(struct xfs_dinode_core *,
505 struct xfs_icdinode *);
506 508
507uint xfs_ip2xflags(struct xfs_inode *); 509uint xfs_ip2xflags(struct xfs_inode *);
508uint xfs_dic2xflags(struct xfs_dinode *); 510uint xfs_dic2xflags(struct xfs_dinode *);
@@ -513,17 +515,10 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
513 xfs_fsize_t, int, int); 515 xfs_fsize_t, int, int);
514int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 516int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
515 517
516void xfs_idestroy_fork(xfs_inode_t *, int);
517void xfs_idestroy(xfs_inode_t *);
518void xfs_idata_realloc(xfs_inode_t *, int, int);
519void xfs_iextract(xfs_inode_t *);
520void xfs_iext_realloc(xfs_inode_t *, int, int); 518void xfs_iext_realloc(xfs_inode_t *, int, int);
521void xfs_iroot_realloc(xfs_inode_t *, int, int);
522void xfs_ipin(xfs_inode_t *); 519void xfs_ipin(xfs_inode_t *);
523void xfs_iunpin(xfs_inode_t *); 520void xfs_iunpin(xfs_inode_t *);
524int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
525int xfs_iflush(xfs_inode_t *, uint); 521int xfs_iflush(xfs_inode_t *, uint);
526void xfs_iflush_all(struct xfs_mount *);
527void xfs_ichgtime(xfs_inode_t *, int); 522void xfs_ichgtime(xfs_inode_t *, int);
528xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); 523xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
529void xfs_lock_inodes(xfs_inode_t **, int, uint); 524void xfs_lock_inodes(xfs_inode_t **, int, uint);
@@ -532,6 +527,77 @@ void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
532void xfs_synchronize_atime(xfs_inode_t *); 527void xfs_synchronize_atime(xfs_inode_t *);
533void xfs_mark_inode_dirty_sync(xfs_inode_t *); 528void xfs_mark_inode_dirty_sync(xfs_inode_t *);
534 529
530#if defined(XFS_INODE_TRACE)
531
532#define INODE_TRACE_SIZE 16 /* number of trace entries */
533#define INODE_KTRACE_ENTRY 1
534#define INODE_KTRACE_EXIT 2
535#define INODE_KTRACE_HOLD 3
536#define INODE_KTRACE_REF 4
537#define INODE_KTRACE_RELE 5
538
539extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
540extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
541extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
542extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
543extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
544#define xfs_itrace_entry(ip) \
545 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
546#define xfs_itrace_exit(ip) \
547 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
548#define xfs_itrace_exit_tag(ip, tag) \
549 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
550#define xfs_itrace_ref(ip) \
551 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
552
553#else
554#define xfs_itrace_entry(a)
555#define xfs_itrace_exit(a)
556#define xfs_itrace_exit_tag(a, b)
557#define xfs_itrace_hold(a, b, c, d)
558#define xfs_itrace_ref(a)
559#define xfs_itrace_rele(a, b, c, d)
560#endif
561
562#define IHOLD(ip) \
563do { \
564 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
565 atomic_inc(&(VFS_I(ip)->i_count)); \
566 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
567} while (0)
568
569#define IRELE(ip) \
570do { \
571 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
572 iput(VFS_I(ip)); \
573} while (0)
574
575#endif /* __KERNEL__ */
576
577/*
578 * Flags for xfs_iget()
579 */
580#define XFS_IGET_CREATE 0x1
581#define XFS_IGET_BULKSTAT 0x2
582
583int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
584 xfs_ino_t, struct xfs_dinode **,
585 struct xfs_buf **, int *, uint);
586int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
587 struct xfs_inode *, struct xfs_dinode **,
588 struct xfs_buf **, uint);
589int xfs_iread(struct xfs_mount *, struct xfs_trans *,
590 struct xfs_inode *, xfs_daddr_t, uint);
591void xfs_dinode_from_disk(struct xfs_icdinode *,
592 struct xfs_dinode *);
593void xfs_dinode_to_disk(struct xfs_dinode *,
594 struct xfs_icdinode *);
595void xfs_idestroy_fork(struct xfs_inode *, int);
596void xfs_idata_realloc(struct xfs_inode *, int, int);
597void xfs_iroot_realloc(struct xfs_inode *, int, int);
598int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
599int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
600
535xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); 601xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
536void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, 602void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
537 xfs_bmbt_irec_t *); 603 xfs_bmbt_irec_t *);
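
[Editor's note on the hunk above] The new IHOLD()/IRELE() macros pin an XFS inode by taking a reference on the embedded Linux inode's i_count rather than keeping a private counter, and the ASSERT guards against "resurrecting" an inode whose count has already hit zero. A minimal user-space model of that pattern (every type and name below is a toy stand-in, not the kernel's):

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdio.h>

	/* Toy stand-ins for struct inode / xfs_inode; not the kernel types. */
	struct vfs_inode { atomic_int i_count; };
	struct toy_xfs_inode { struct vfs_inode vnode; };

	#define VFS_I(ip) (&(ip)->vnode)

	/* Model of IHOLD: only legal on an inode that already has a reference. */
	static void ihold(struct toy_xfs_inode *ip)
	{
		assert(atomic_load(&VFS_I(ip)->i_count) > 0);
		atomic_fetch_add(&VFS_I(ip)->i_count, 1);
	}

	/* Model of IRELE: drop the reference taken by ihold(). */
	static void irele(struct toy_xfs_inode *ip)
	{
		atomic_fetch_sub(&VFS_I(ip)->i_count, 1);
	}

	int main(void)
	{
		struct toy_xfs_inode ino = { .vnode = { .i_count = 1 } };

		ihold(&ino);
		irele(&ino);
		printf("refcount back to %d\n", atomic_load(&ino.vnode.i_count));
		return 0;
	}
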
@@ -561,7 +627,8 @@ void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
561#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 627#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
562 628
563#ifdef DEBUG 629#ifdef DEBUG
564void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t); 630void xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
631 xfs_fsize_t);
565#else /* DEBUG */ 632#else /* DEBUG */
566#define xfs_isize_check(mp, ip, isize) 633#define xfs_isize_check(mp, ip, isize)
567#endif /* DEBUG */ 634#endif /* DEBUG */
@@ -576,26 +643,4 @@ extern struct kmem_zone *xfs_ifork_zone;
576extern struct kmem_zone *xfs_inode_zone; 643extern struct kmem_zone *xfs_inode_zone;
577extern struct kmem_zone *xfs_ili_zone; 644extern struct kmem_zone *xfs_ili_zone;
578 645
579/*
580 * Manage the i_flush queue embedded in the inode. This completion
581 * queue synchronizes processes attempting to flush the in-core
582 * inode back to disk.
583 */
584static inline void xfs_iflock(xfs_inode_t *ip)
585{
586 wait_for_completion(&ip->i_flush);
587}
588
589static inline int xfs_iflock_nowait(xfs_inode_t *ip)
590{
591 return try_wait_for_completion(&ip->i_flush);
592}
593
594static inline void xfs_ifunlock(xfs_inode_t *ip)
595{
596 complete(&ip->i_flush);
597}
598
599#endif /* __KERNEL__ */
600
601#endif /* __XFS_INODE_H__ */ 646#endif /* __XFS_INODE_H__ */
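
[Editor's note on the xfs_inode.h diff above] The deleted xfs_iflock()/xfs_iflock_nowait()/xfs_ifunlock() helpers treated a completion as the inode flush lock: lock by waiting on the completion, trylock via try_wait_for_completion(), unlock by completing it. They are presumably relocated rather than dropped, since xfs_iflush() still depends on the flush lock. A rough user-space analogue of the completion-as-lock idiom, using a binary semaphore (illustrative only):

	#include <semaphore.h>
	#include <stdio.h>

	/* A completion that starts "done" behaves like a free binary lock. */
	static sem_t i_flush;

	static void iflock(void)        { sem_wait(&i_flush); }
	static int  iflock_nowait(void) { return sem_trywait(&i_flush) == 0; }
	static void ifunlock(void)      { sem_post(&i_flush); }

	int main(void)
	{
		sem_init(&i_flush, 0, 1);	/* initially complete == unlocked */
		iflock();			/* take the flush lock */
		printf("nowait while held: %d\n", iflock_nowait()); /* 0: busy */
		ifunlock();
		sem_destroy(&i_flush);
		return 0;
	}
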
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 97c7452e2620..977c4aec587e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -281,7 +281,7 @@ xfs_inode_item_format(
281 xfs_mark_inode_dirty_sync(ip); 281 xfs_mark_inode_dirty_sync(ip);
282 282
283 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 283 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
284 vecp->i_len = sizeof(xfs_dinode_core_t); 284 vecp->i_len = sizeof(struct xfs_icdinode);
285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
286 vecp++; 286 vecp++;
287 nvecs++; 287 nvecs++;
@@ -296,9 +296,8 @@ xfs_inode_item_format(
296 * has a new version number, then we don't bother converting back. 296 * has a new version number, then we don't bother converting back.
297 */ 297 */
298 mp = ip->i_mount; 298 mp = ip->i_mount;
299 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 299 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
300 xfs_sb_version_hasnlink(&mp->m_sb)); 300 if (ip->i_d.di_version == 1) {
301 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
302 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 301 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
303 /* 302 /*
304 * Convert it back. 303 * Convert it back.
@@ -311,7 +310,7 @@ xfs_inode_item_format(
311 * so just make the conversion to the new inode 310 * so just make the conversion to the new inode
312 * format permanent. 311 * format permanent.
313 */ 312 */
314 ip->i_d.di_version = XFS_DINODE_VERSION_2; 313 ip->i_d.di_version = 2;
315 ip->i_d.di_onlink = 0; 314 ip->i_d.di_onlink = 0;
316 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 315 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
317 } 316 }
@@ -932,6 +931,7 @@ xfs_inode_item_init(
932 iip->ili_item.li_type = XFS_LI_INODE; 931 iip->ili_item.li_type = XFS_LI_INODE;
933 iip->ili_item.li_ops = &xfs_inode_item_ops; 932 iip->ili_item.li_ops = &xfs_inode_item_ops;
934 iip->ili_item.li_mountp = mp; 933 iip->ili_item.li_mountp = mp;
934 iip->ili_item.li_ailp = mp->m_ail;
935 iip->ili_inode = ip; 935 iip->ili_inode = ip;
936 936
937 /* 937 /*
@@ -942,9 +942,9 @@ xfs_inode_item_init(
942 942
943 iip->ili_format.ilf_type = XFS_LI_INODE; 943 iip->ili_format.ilf_type = XFS_LI_INODE;
944 iip->ili_format.ilf_ino = ip->i_ino; 944 iip->ili_format.ilf_ino = ip->i_ino;
945 iip->ili_format.ilf_blkno = ip->i_blkno; 945 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
946 iip->ili_format.ilf_len = ip->i_len; 946 iip->ili_format.ilf_len = ip->i_imap.im_len;
947 iip->ili_format.ilf_boffset = ip->i_boffset; 947 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
948} 948}
949 949
950/* 950/*
@@ -976,9 +976,8 @@ xfs_iflush_done(
976 xfs_buf_t *bp, 976 xfs_buf_t *bp,
977 xfs_inode_log_item_t *iip) 977 xfs_inode_log_item_t *iip)
978{ 978{
979 xfs_inode_t *ip; 979 xfs_inode_t *ip = iip->ili_inode;
980 980 struct xfs_ail *ailp = iip->ili_item.li_ailp;
981 ip = iip->ili_inode;
982 981
983 /* 982 /*
984 * We only want to pull the item from the AIL if it is 983 * We only want to pull the item from the AIL if it is
@@ -991,15 +990,12 @@ xfs_iflush_done(
991 */ 990 */
992 if (iip->ili_logged && 991 if (iip->ili_logged &&
993 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { 992 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
994 spin_lock(&ip->i_mount->m_ail_lock); 993 spin_lock(&ailp->xa_lock);
995 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { 994 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
996 /* 995 /* xfs_trans_ail_delete() drops the AIL lock. */
997 * xfs_trans_delete_ail() drops the AIL lock. 996 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
998 */
999 xfs_trans_delete_ail(ip->i_mount,
1000 (xfs_log_item_t*)iip);
1001 } else { 997 } else {
1002 spin_unlock(&ip->i_mount->m_ail_lock); 998 spin_unlock(&ailp->xa_lock);
1003 } 999 }
1004 } 1000 }
1005 1001
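
[Editor's note on the hunk above] xfs_iflush_done() follows a classic double-checked pattern against the new per-AIL lock: test the LSN unlocked, take ailp->xa_lock, re-test under the lock, and let the delete helper drop the lock on the success path. A compact model of the locking protocol only (no real AIL, one flag stands in for the item):

	#include <pthread.h>
	#include <stdio.h>

	typedef long lsn_t;

	struct toy_ail { pthread_mutex_t xa_lock; };

	/* Model of xfs_trans_ail_delete(): removes the item and DROPS xa_lock. */
	static void ail_delete(struct toy_ail *ailp, lsn_t *item_lsn)
	{
		*item_lsn = 0;			/* "remove" the item */
		pthread_mutex_unlock(&ailp->xa_lock);
	}

	static void iflush_done(struct toy_ail *ailp, lsn_t *item_lsn, lsn_t flush_lsn)
	{
		if (*item_lsn != flush_lsn)	/* cheap unlocked pre-check */
			return;
		pthread_mutex_lock(&ailp->xa_lock);
		if (*item_lsn == flush_lsn)	/* re-check under the lock */
			ail_delete(ailp, item_lsn);	/* drops xa_lock */
		else
			pthread_mutex_unlock(&ailp->xa_lock);
	}

	int main(void)
	{
		struct toy_ail ail = { PTHREAD_MUTEX_INITIALIZER };
		lsn_t lsn = 42;

		iflush_done(&ail, &lsn, 42);
		printf("item lsn after flush: %ld\n", lsn);
		return 0;
	}
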
@@ -1031,21 +1027,20 @@ void
1031xfs_iflush_abort( 1027xfs_iflush_abort(
1032 xfs_inode_t *ip) 1028 xfs_inode_t *ip)
1033{ 1029{
1034 xfs_inode_log_item_t *iip; 1030 xfs_inode_log_item_t *iip = ip->i_itemp;
1035 xfs_mount_t *mp; 1031 xfs_mount_t *mp;
1036 1032
1037 iip = ip->i_itemp; 1033 iip = ip->i_itemp;
1038 mp = ip->i_mount; 1034 mp = ip->i_mount;
1039 if (iip) { 1035 if (iip) {
1036 struct xfs_ail *ailp = iip->ili_item.li_ailp;
1040 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1037 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1041 spin_lock(&mp->m_ail_lock); 1038 spin_lock(&ailp->xa_lock);
1042 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1039 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1043 /* 1040 /* xfs_trans_ail_delete() drops the AIL lock. */
1044 * xfs_trans_delete_ail() drops the AIL lock. 1041 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
1045 */
1046 xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip);
1047 } else 1042 } else
1048 spin_unlock(&mp->m_ail_lock); 1043 spin_unlock(&ailp->xa_lock);
1049 } 1044 }
1050 iip->ili_logged = 0; 1045 iip->ili_logged = 0;
1051 /* 1046 /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 40513077ab36..1ff04cc323ad 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -112,6 +112,24 @@ typedef struct xfs_inode_log_format_64 {
112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) 112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
113 113
114 114
115#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
116static inline int xfs_ilog_fbroot(int w)
117{
118 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
119}
120
121#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
122static inline int xfs_ilog_fext(int w)
123{
124 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
125}
126
127#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
128static inline int xfs_ilog_fdata(int w)
129{
130 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
131}
132
115#ifdef __KERNEL__ 133#ifdef __KERNEL__
116 134
117struct xfs_buf; 135struct xfs_buf;
@@ -148,26 +166,6 @@ typedef struct xfs_inode_log_item {
148} xfs_inode_log_item_t; 166} xfs_inode_log_item_t;
149 167
150 168
151#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
152static inline int xfs_ilog_fdata(int w)
153{
154 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
155}
156
157#endif /* __KERNEL__ */
158
159#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
160static inline int xfs_ilog_fbroot(int w)
161{
162 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
163}
164
165#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
166static inline int xfs_ilog_fext(int w)
167{
168 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
169}
170
171static inline int xfs_inode_clean(xfs_inode_t *ip) 169static inline int xfs_inode_clean(xfs_inode_t *ip)
172{ 170{
173 return (!ip->i_itemp || 171 return (!ip->i_itemp ||
@@ -175,9 +173,6 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
175 !ip->i_update_core; 173 !ip->i_update_core;
176} 174}
177 175
178
179#ifdef __KERNEL__
180
181extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); 176extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
182extern void xfs_inode_item_destroy(struct xfs_inode *); 177extern void xfs_inode_item_destroy(struct xfs_inode *);
183extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); 178extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
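
[Editor's note on the xfs_inode_item.h diff above] The relocated xfs_ilog_f*() helpers all share one shape: pick the data-fork or attr-fork variant of a log flag based on which fork is being logged, and hoisting them out of the #ifdef __KERNEL__ region makes them usable by non-kernel consumers of this header. A self-contained rendering of the pattern (flag values invented for the demo, not the real XFS_ILOG_* constants):

	#include <stdio.h>

	/* Demo flag values; the real XFS_ILOG_* constants live in the header. */
	#define DEMO_DATA_FORK	0
	#define DEMO_ATTR_FORK	1
	#define DEMO_ILOG_DDATA	0x040
	#define DEMO_ILOG_ADATA	0x200

	/* Same shape as xfs_ilog_fdata(): select the per-fork flag. */
	static inline int demo_ilog_fdata(int w)
	{
		return w == DEMO_DATA_FORK ? DEMO_ILOG_DDATA : DEMO_ILOG_ADATA;
	}

	int main(void)
	{
		printf("data fork flag: 0x%x\n", demo_ilog_fdata(DEMO_DATA_FORK));
		printf("attr fork flag: 0x%x\n", demo_ilog_fdata(DEMO_ATTR_FORK));
		return 0;
	}
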
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67f22b2b44b3..911062cf73a6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -290,7 +290,6 @@ STATIC int
290xfs_iomap_eof_align_last_fsb( 290xfs_iomap_eof_align_last_fsb(
291 xfs_mount_t *mp, 291 xfs_mount_t *mp,
292 xfs_inode_t *ip, 292 xfs_inode_t *ip,
293 xfs_fsize_t isize,
294 xfs_extlen_t extsize, 293 xfs_extlen_t extsize,
295 xfs_fileoff_t *last_fsb) 294 xfs_fileoff_t *last_fsb)
296{ 295{
@@ -306,14 +305,14 @@ xfs_iomap_eof_align_last_fsb(
306 * stripe width and we are allocating past the allocation eof. 305 * stripe width and we are allocating past the allocation eof.
307 */ 306 */
308 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && 307 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
309 (isize >= XFS_FSB_TO_B(mp, mp->m_swidth))) 308 (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
310 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); 309 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
311 /* 310 /*
312 * Round up the allocation request to a stripe unit (m_dalign) boundary 311 * Round up the allocation request to a stripe unit (m_dalign) boundary
313 * if the file size is >= stripe unit size, and we are allocating past 312 * if the file size is >= stripe unit size, and we are allocating past
314 * the allocation eof. 313 * the allocation eof.
315 */ 314 */
316 else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign))) 315 else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
317 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); 316 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
318 317
319 /* 318 /*
@@ -403,7 +402,6 @@ xfs_iomap_write_direct(
403 xfs_filblks_t count_fsb, resaligned; 402 xfs_filblks_t count_fsb, resaligned;
404 xfs_fsblock_t firstfsb; 403 xfs_fsblock_t firstfsb;
405 xfs_extlen_t extsz, temp; 404 xfs_extlen_t extsz, temp;
406 xfs_fsize_t isize;
407 int nimaps; 405 int nimaps;
408 int bmapi_flag; 406 int bmapi_flag;
409 int quota_flag; 407 int quota_flag;
@@ -426,15 +424,10 @@ xfs_iomap_write_direct(
426 rt = XFS_IS_REALTIME_INODE(ip); 424 rt = XFS_IS_REALTIME_INODE(ip);
427 extsz = xfs_get_extsz_hint(ip); 425 extsz = xfs_get_extsz_hint(ip);
428 426
429 isize = ip->i_size;
430 if (ip->i_new_size > isize)
431 isize = ip->i_new_size;
432
433 offset_fsb = XFS_B_TO_FSBT(mp, offset); 427 offset_fsb = XFS_B_TO_FSBT(mp, offset);
434 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 428 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
435 if ((offset + count) > isize) { 429 if ((offset + count) > ip->i_size) {
436 error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, 430 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
437 &last_fsb);
438 if (error) 431 if (error)
439 goto error_out; 432 goto error_out;
440 } else { 433 } else {
@@ -559,7 +552,6 @@ STATIC int
559xfs_iomap_eof_want_preallocate( 552xfs_iomap_eof_want_preallocate(
560 xfs_mount_t *mp, 553 xfs_mount_t *mp,
561 xfs_inode_t *ip, 554 xfs_inode_t *ip,
562 xfs_fsize_t isize,
563 xfs_off_t offset, 555 xfs_off_t offset,
564 size_t count, 556 size_t count,
565 int ioflag, 557 int ioflag,
@@ -573,7 +565,7 @@ xfs_iomap_eof_want_preallocate(
573 int n, error, imaps; 565 int n, error, imaps;
574 566
575 *prealloc = 0; 567 *prealloc = 0;
576 if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize) 568 if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
577 return 0; 569 return 0;
578 570
579 /* 571 /*
@@ -617,7 +609,6 @@ xfs_iomap_write_delay(
617 xfs_fileoff_t ioalign; 609 xfs_fileoff_t ioalign;
618 xfs_fsblock_t firstblock; 610 xfs_fsblock_t firstblock;
619 xfs_extlen_t extsz; 611 xfs_extlen_t extsz;
620 xfs_fsize_t isize;
621 int nimaps; 612 int nimaps;
622 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 613 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
623 int prealloc, fsynced = 0; 614 int prealloc, fsynced = 0;
@@ -637,11 +628,7 @@ xfs_iomap_write_delay(
637 offset_fsb = XFS_B_TO_FSBT(mp, offset); 628 offset_fsb = XFS_B_TO_FSBT(mp, offset);
638 629
639retry: 630retry:
640 isize = ip->i_size; 631 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
641 if (ip->i_new_size > isize)
642 isize = ip->i_new_size;
643
644 error = xfs_iomap_eof_want_preallocate(mp, ip, isize, offset, count,
645 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 632 ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
646 if (error) 633 if (error)
647 return error; 634 return error;
@@ -655,8 +642,7 @@ retry:
655 } 642 }
656 643
657 if (prealloc || extsz) { 644 if (prealloc || extsz) {
658 error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, 645 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
659 &last_fsb);
660 if (error) 646 if (error)
661 return error; 647 return error;
662 } 648 }
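
[Editor's note on the xfs_iomap.c diff above] The isize parameter is dropped from xfs_iomap_eof_align_last_fsb() and xfs_iomap_eof_want_preallocate(); both now read ip->i_size directly instead of the caller-computed max of i_size and i_new_size. The alignment itself is a plain round-up of the last block to the stripe unit or stripe width once the file is at least that large. A sketch of just the arithmetic (plain integers, not the XFS types):

	#include <stdio.h>

	typedef unsigned long long u64;

	/* Round x up to the next multiple of y (y > 0), like roundup_64(). */
	static u64 roundup_u64(u64 x, u64 y)
	{
		return ((x + y - 1) / y) * y;
	}

	/* Align an EOF allocation: only kicks in past a size threshold. */
	static u64 align_last_fsb(u64 last_fsb, u64 file_size,
				  u64 stripe_fsb, u64 stripe_bytes)
	{
		if (stripe_fsb && file_size >= stripe_bytes)
			return roundup_u64(last_fsb, stripe_fsb);
		return last_fsb;
	}

	int main(void)
	{
		/* e.g. stripe unit of 16 blocks, file already past one stripe */
		printf("%llu\n", align_last_fsb(103, 1 << 20, 16, 64 * 1024));
		return 0;
	}
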
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf6754a3c5b3..e19d0a8d5618 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -69,7 +69,7 @@ xfs_bulkstat_one_iget(
69 } 69 }
70 70
71 ASSERT(ip != NULL); 71 ASSERT(ip != NULL);
72 ASSERT(ip->i_blkno != (xfs_daddr_t)0); 72 ASSERT(ip->i_imap.im_blkno != 0);
73 73
74 dic = &ip->i_d; 74 dic = &ip->i_d;
75 75
@@ -125,13 +125,9 @@ STATIC void
125xfs_bulkstat_one_dinode( 125xfs_bulkstat_one_dinode(
126 xfs_mount_t *mp, /* mount point for filesystem */ 126 xfs_mount_t *mp, /* mount point for filesystem */
127 xfs_ino_t ino, /* inode number to get data for */ 127 xfs_ino_t ino, /* inode number to get data for */
128 xfs_dinode_t *dip, /* dinode inode pointer */ 128 xfs_dinode_t *dic, /* dinode inode pointer */
129 xfs_bstat_t *buf) /* return buffer */ 129 xfs_bstat_t *buf) /* return buffer */
130{ 130{
131 xfs_dinode_core_t *dic; /* dinode core info pointer */
132
133 dic = &dip->di_core;
134
135 /* 131 /*
136 * The inode format changed when we moved the link count and 132 * The inode format changed when we moved the link count and
137 * made it 32 bits long. If this is an old format inode, 133 * made it 32 bits long. If this is an old format inode,
@@ -143,7 +139,7 @@ xfs_bulkstat_one_dinode(
143 * the new format. We don't change the version number so that we 139 * the new format. We don't change the version number so that we
144 * can distinguish this from a real new format inode. 140 * can distinguish this from a real new format inode.
145 */ 141 */
146 if (dic->di_version == XFS_DINODE_VERSION_1) { 142 if (dic->di_version == 1) {
147 buf->bs_nlink = be16_to_cpu(dic->di_onlink); 143 buf->bs_nlink = be16_to_cpu(dic->di_onlink);
148 buf->bs_projid = 0; 144 buf->bs_projid = 0;
149 } else { 145 } else {
@@ -162,7 +158,7 @@ xfs_bulkstat_one_dinode(
162 buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec); 158 buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
163 buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec); 159 buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
164 buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec); 160 buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
165 buf->bs_xflags = xfs_dic2xflags(dip); 161 buf->bs_xflags = xfs_dic2xflags(dic);
166 buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog; 162 buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
167 buf->bs_extents = be32_to_cpu(dic->di_nextents); 163 buf->bs_extents = be32_to_cpu(dic->di_nextents);
168 buf->bs_gen = be32_to_cpu(dic->di_gen); 164 buf->bs_gen = be32_to_cpu(dic->di_gen);
@@ -173,7 +169,7 @@ xfs_bulkstat_one_dinode(
173 169
174 switch (dic->di_format) { 170 switch (dic->di_format) {
175 case XFS_DINODE_FMT_DEV: 171 case XFS_DINODE_FMT_DEV:
176 buf->bs_rdev = be32_to_cpu(dip->di_u.di_dev); 172 buf->bs_rdev = xfs_dinode_get_rdev(dic);
177 buf->bs_blksize = BLKDEV_IOSIZE; 173 buf->bs_blksize = BLKDEV_IOSIZE;
178 buf->bs_blocks = 0; 174 buf->bs_blocks = 0;
179 break; 175 break;
@@ -192,27 +188,34 @@ xfs_bulkstat_one_dinode(
192 } 188 }
193} 189}
194 190
191/* Return 0 on success or positive error */
195STATIC int 192STATIC int
196xfs_bulkstat_one_fmt( 193xfs_bulkstat_one_fmt(
197 void __user *ubuffer, 194 void __user *ubuffer,
195 int ubsize,
196 int *ubused,
198 const xfs_bstat_t *buffer) 197 const xfs_bstat_t *buffer)
199{ 198{
199 if (ubsize < sizeof(*buffer))
200 return XFS_ERROR(ENOMEM);
200 if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) 201 if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
201 return -EFAULT; 202 return XFS_ERROR(EFAULT);
202 return sizeof(*buffer); 203 if (ubused)
204 *ubused = sizeof(*buffer);
205 return 0;
203} 206}
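
[Editor's note on the hunk above] The reworked xfs_bulkstat_one_fmt() pins down the formatter contract: verify the remaining user buffer space first (ENOMEM if full), report EFAULT on a failed copy, record the bytes consumed through *ubused, and return 0 on success rather than a byte count. A user-space sketch of a formatter obeying that contract, with memcpy standing in for copy_to_user():

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	struct demo_bstat { long ino; long size; };	/* toy xfs_bstat_t */

	/* Formatter contract: 0 on success, positive errno on failure. */
	static int demo_bulkstat_fmt(void *ubuffer, int ubsize, int *ubused,
				     const struct demo_bstat *buffer)
	{
		if (ubsize < (int)sizeof(*buffer))
			return ENOMEM;			/* caller's buffer is full */
		memcpy(ubuffer, buffer, sizeof(*buffer)); /* copy_to_user() in-kernel */
		if (ubused)
			*ubused = sizeof(*buffer);	/* bytes consumed */
		return 0;
	}

	int main(void)
	{
		struct demo_bstat in = { 128, 4096 }, out;
		int used = 0;

		if (demo_bulkstat_fmt(&out, sizeof(out), &used, &in) == 0)
			printf("ino %ld, %d bytes used\n", out.ino, used);
		return 0;
	}
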
204 207
205/* 208/*
206 * Return stat information for one inode. 209 * Return stat information for one inode.
207 * Return 0 if ok, else errno. 210 * Return 0 if ok, else errno.
208 */ 211 */
209int /* error status */ 212int /* error status */
210xfs_bulkstat_one( 213xfs_bulkstat_one_int(
211 xfs_mount_t *mp, /* mount point for filesystem */ 214 xfs_mount_t *mp, /* mount point for filesystem */
212 xfs_ino_t ino, /* inode number to get data for */ 215 xfs_ino_t ino, /* inode number to get data for */
213 void __user *buffer, /* buffer to place output in */ 216 void __user *buffer, /* buffer to place output in */
214 int ubsize, /* size of buffer */ 217 int ubsize, /* size of buffer */
215 void *private_data, /* my private data */ 218 bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
216 xfs_daddr_t bno, /* starting bno of inode cluster */ 219 xfs_daddr_t bno, /* starting bno of inode cluster */
217 int *ubused, /* bytes used by me */ 220 int *ubused, /* bytes used by me */
218 void *dibuff, /* on-disk inode buffer */ 221 void *dibuff, /* on-disk inode buffer */
@@ -221,15 +224,12 @@ xfs_bulkstat_one(
221 xfs_bstat_t *buf; /* return buffer */ 224 xfs_bstat_t *buf; /* return buffer */
222 int error = 0; /* error value */ 225 int error = 0; /* error value */
223 xfs_dinode_t *dip; /* dinode inode pointer */ 226 xfs_dinode_t *dip; /* dinode inode pointer */
224 bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt;
225 227
226 dip = (xfs_dinode_t *)dibuff; 228 dip = (xfs_dinode_t *)dibuff;
227 *stat = BULKSTAT_RV_NOTHING; 229 *stat = BULKSTAT_RV_NOTHING;
228 230
229 if (!buffer || xfs_internal_inum(mp, ino)) 231 if (!buffer || xfs_internal_inum(mp, ino))
230 return XFS_ERROR(EINVAL); 232 return XFS_ERROR(EINVAL);
231 if (ubsize < sizeof(*buf))
232 return XFS_ERROR(ENOMEM);
233 233
234 buf = kmem_alloc(sizeof(*buf), KM_SLEEP); 234 buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
235 235
@@ -244,21 +244,34 @@ xfs_bulkstat_one(
244 xfs_bulkstat_one_dinode(mp, ino, dip, buf); 244 xfs_bulkstat_one_dinode(mp, ino, dip, buf);
245 } 245 }
246 246
247 error = formatter(buffer, buf); 247 error = formatter(buffer, ubsize, ubused, buf);
248 if (error < 0) { 248 if (error)
249 error = EFAULT;
250 goto out_free; 249 goto out_free;
251 }
252 250
253 *stat = BULKSTAT_RV_DIDONE; 251 *stat = BULKSTAT_RV_DIDONE;
254 if (ubused)
255 *ubused = error;
256 252
257 out_free: 253 out_free:
258 kmem_free(buf); 254 kmem_free(buf);
259 return error; 255 return error;
260} 256}
261 257
258int
259xfs_bulkstat_one(
260 xfs_mount_t *mp, /* mount point for filesystem */
261 xfs_ino_t ino, /* inode number to get data for */
262 void __user *buffer, /* buffer to place output in */
263 int ubsize, /* size of buffer */
264 void *private_data, /* my private data */
265 xfs_daddr_t bno, /* starting bno of inode cluster */
266 int *ubused, /* bytes used by me */
267 void *dibuff, /* on-disk inode buffer */
268 int *stat) /* BULKSTAT_RV_... */
269{
270 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
271 xfs_bulkstat_one_fmt, bno,
272 ubused, dibuff, stat);
273}
274
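
[Editor's note on the hunk above] xfs_bulkstat_one() becomes a thin shim that forwards to xfs_bulkstat_one_int() with the default formatter bound, presumably so other callers (the compat ioctl layer, for one) can supply their own formatter. The shape is ordinary callback currying; a toy version:

	#include <stdio.h>

	typedef int (*fmt_pf)(char *dst, int n, const char *src);

	static int default_fmt(char *dst, int n, const char *src)
	{
		return snprintf(dst, n, "[%s]", src) < n ? 0 : -1;
	}

	/* The generic worker takes the formatter explicitly... */
	static int stat_one_int(char *dst, int n, const char *src, fmt_pf fmt)
	{
		return fmt(dst, n, src);
	}

	/* ...and the historical entry point just binds the default. */
	static int stat_one(char *dst, int n, const char *src)
	{
		return stat_one_int(dst, n, src, default_fmt);
	}

	int main(void)
	{
		char buf[32];

		if (stat_one(buf, sizeof(buf), "ino 128") == 0)
			puts(buf);
		return 0;
	}
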
262/* 275/*
263 * Test to see whether we can use the ondisk inode directly, based 276 * Test to see whether we can use the ondisk inode directly, based
264 * on the given bulkstat flags, filling in dipp accordingly. 277 * on the given bulkstat flags, filling in dipp accordingly.
@@ -287,19 +300,19 @@ xfs_bulkstat_use_dinode(
287 * to disk yet. This is a temporary hack that would require a proper 300 * to disk yet. This is a temporary hack that would require a proper
288 * fix in the future. 301 * fix in the future.
289 */ 302 */
290 if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || 303 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
291 !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) || 304 !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
292 !dip->di_core.di_mode) 305 !dip->di_mode)
293 return 0; 306 return 0;
294 if (flags & BULKSTAT_FG_QUICK) { 307 if (flags & BULKSTAT_FG_QUICK) {
295 *dipp = dip; 308 *dipp = dip;
296 return 1; 309 return 1;
297 } 310 }
298 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ 311 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
299 aformat = dip->di_core.di_aformat; 312 aformat = dip->di_aformat;
300 if ((XFS_DFORK_Q(dip) == 0) || 313 if ((XFS_DFORK_Q(dip) == 0) ||
301 (aformat == XFS_DINODE_FMT_LOCAL) || 314 (aformat == XFS_DINODE_FMT_LOCAL) ||
302 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) { 315 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
303 *dipp = dip; 316 *dipp = dip;
304 return 1; 317 return 1;
305 } 318 }
@@ -359,7 +372,6 @@ xfs_bulkstat(
359 int ubused; /* bytes used by formatter */ 372 int ubused; /* bytes used by formatter */
360 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ 373 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
361 xfs_dinode_t *dip; /* ptr into bp for specific inode */ 374 xfs_dinode_t *dip; /* ptr into bp for specific inode */
362 xfs_inode_t *ip; /* ptr to in-core inode struct */
363 375
364 /* 376 /*
365 * Get the last inode value, see if there's nothing to do. 377 * Get the last inode value, see if there's nothing to do.
@@ -416,8 +428,7 @@ xfs_bulkstat(
416 /* 428 /*
417 * Allocate and initialize a btree cursor for ialloc btree. 429 * Allocate and initialize a btree cursor for ialloc btree.
418 */ 430 */
419 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, 431 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
420 (xfs_inode_t *)0, 0);
421 irbp = irbuf; 432 irbp = irbuf;
422 irbufend = irbuf + nirbuf; 433 irbufend = irbuf + nirbuf;
423 end_of_ag = 0; 434 end_of_ag = 0;
@@ -472,7 +483,7 @@ xfs_bulkstat(
472 * In any case, increment to the next record. 483 * In any case, increment to the next record.
473 */ 484 */
474 if (!error) 485 if (!error)
475 error = xfs_inobt_increment(cur, 0, &tmp); 486 error = xfs_btree_increment(cur, 0, &tmp);
476 } else { 487 } else {
477 /* 488 /*
478 * Start of ag. Lookup the first inode chunk. 489 * Start of ag. Lookup the first inode chunk.
@@ -539,7 +550,7 @@ xfs_bulkstat(
539 * Set agino to after this chunk and bump the cursor. 550 * Set agino to after this chunk and bump the cursor.
540 */ 551 */
541 agino = gino + XFS_INODES_PER_CHUNK; 552 agino = gino + XFS_INODES_PER_CHUNK;
542 error = xfs_inobt_increment(cur, 0, &tmp); 553 error = xfs_btree_increment(cur, 0, &tmp);
543 cond_resched(); 554 cond_resched();
544 } 555 }
545 /* 556 /*
@@ -586,6 +597,8 @@ xfs_bulkstat(
586 597
587 if (flags & (BULKSTAT_FG_QUICK | 598 if (flags & (BULKSTAT_FG_QUICK |
588 BULKSTAT_FG_INLINE)) { 599 BULKSTAT_FG_INLINE)) {
600 int offset;
601
589 ino = XFS_AGINO_TO_INO(mp, agno, 602 ino = XFS_AGINO_TO_INO(mp, agno,
590 agino); 603 agino);
591 bno = XFS_AGB_TO_DADDR(mp, agno, 604 bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -594,21 +607,15 @@ xfs_bulkstat(
594 /* 607 /*
595 * Get the inode cluster buffer 608 * Get the inode cluster buffer
596 */ 609 */
597 ASSERT(xfs_inode_zone != NULL);
598 ip = kmem_zone_zalloc(xfs_inode_zone,
599 KM_SLEEP);
600 ip->i_ino = ino;
601 ip->i_mount = mp;
602 spin_lock_init(&ip->i_flags_lock);
603 if (bp) 610 if (bp)
604 xfs_buf_relse(bp); 611 xfs_buf_relse(bp);
605 error = xfs_itobp(mp, NULL, ip, 612
606 &dip, &bp, bno, 613 error = xfs_inotobp(mp, NULL, ino, &dip,
607 XFS_IMAP_BULKSTAT, 614 &bp, &offset,
608 XFS_BUF_LOCK); 615 XFS_IGET_BULKSTAT);
616
609 if (!error) 617 if (!error)
610 clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; 618 clustidx = offset / mp->m_sb.sb_inodesize;
611 kmem_zone_free(xfs_inode_zone, ip);
612 if (XFS_TEST_ERROR(error != 0, 619 if (XFS_TEST_ERROR(error != 0,
613 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, 620 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
614 XFS_RANDOM_BULKSTAT_READ_CHUNK)) { 621 XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
@@ -842,8 +849,7 @@ xfs_inumbers(
842 agino = 0; 849 agino = 0;
843 continue; 850 continue;
844 } 851 }
845 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, 852 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
846 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
847 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); 853 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
848 if (error) { 854 if (error) {
849 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 855 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -887,7 +893,7 @@ xfs_inumbers(
887 bufidx = 0; 893 bufidx = 0;
888 } 894 }
889 if (left) { 895 if (left) {
890 error = xfs_inobt_increment(cur, 0, &tmp); 896 error = xfs_btree_increment(cur, 0, &tmp);
891 if (error) { 897 if (error) {
892 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 898 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
893 cur = NULL; 899 cur = NULL;
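
[Editor's note on the xfs_itable.c diff above] The cursor calls switch to the generic btree API: xfs_inobt_init_cursor() drops the unused inode/whichfork arguments of the old xfs_btree_init_cursor(), and xfs_btree_increment() replaces xfs_inobt_increment(). The resulting iteration keeps the usual cursor shape (lookup, consume record, increment until *stat goes to zero). A toy model of that loop over an array instead of a btree:

	#include <stdio.h>

	/* Toy cursor over an array, standing in for a btree cursor. */
	struct toy_cur { const int *recs; int nrecs; int pos; };

	static struct toy_cur cur_init(const int *recs, int nrecs)
	{
		return (struct toy_cur){ .recs = recs, .nrecs = nrecs, .pos = 0 };
	}

	/* Like xfs_btree_increment(): *stat = 1 while a record remains. */
	static int cur_increment(struct toy_cur *cur, int *stat)
	{
		*stat = ++cur->pos < cur->nrecs;
		return 0;			/* 0 == no error */
	}

	int main(void)
	{
		const int chunks[] = { 64, 128, 192 };
		struct toy_cur cur = cur_init(chunks, 3);
		int stat = 1;			/* as if the lookup succeeded */

		while (stat) {
			printf("chunk at agino %d\n", cur.recs[cur.pos]);
			if (cur_increment(&cur, &stat))
				break;
		}
		return 0;
	}
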
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index a1f18fce9b70..1fb04e7deb61 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -71,9 +71,23 @@ xfs_bulkstat_single(
71 71
72typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ 72typedef int (*bulkstat_one_fmt_pf)( /* returns 0 or positive error */
73 void __user *ubuffer, /* buffer to write to */ 73 void __user *ubuffer, /* buffer to write to */
74 int ubsize, /* remaining user buffer sz */
75 int *ubused, /* bytes used by formatter */
74 const xfs_bstat_t *buffer); /* buffer to read from */ 76 const xfs_bstat_t *buffer); /* buffer to read from */
75 77
76int 78int
79xfs_bulkstat_one_int(
80 xfs_mount_t *mp,
81 xfs_ino_t ino,
82 void __user *buffer,
83 int ubsize,
84 bulkstat_one_fmt_pf formatter,
85 xfs_daddr_t bno,
86 int *ubused,
87 void *dibuff,
88 int *stat);
89
90int
77xfs_bulkstat_one( 91xfs_bulkstat_one(
78 xfs_mount_t *mp, 92 xfs_mount_t *mp,
79 xfs_ino_t ino, 93 xfs_ino_t ino,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3608a0f0a5f6..f4726f702a9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -100,12 +100,11 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
100 100
101 101
102/* local ticket functions */ 102/* local ticket functions */
103STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, 103STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
104 int unit_bytes, 104 int unit_bytes,
105 int count, 105 int count,
106 char clientid, 106 char clientid,
107 uint flags); 107 uint flags);
108STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
109 108
110#if defined(DEBUG) 109#if defined(DEBUG)
111STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 110STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
@@ -360,7 +359,7 @@ xfs_log_done(xfs_mount_t *mp,
360 */ 359 */
361 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); 360 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
362 xlog_ungrant_log_space(log, ticket); 361 xlog_ungrant_log_space(log, ticket);
363 xlog_ticket_put(log, ticket); 362 xfs_log_ticket_put(ticket);
364 } else { 363 } else {
365 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 364 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
366 xlog_regrant_reserve_log_space(log, ticket); 365 xlog_regrant_reserve_log_space(log, ticket);
@@ -514,7 +513,7 @@ xfs_log_reserve(xfs_mount_t *mp,
514 retval = xlog_regrant_write_log_space(log, internal_ticket); 513 retval = xlog_regrant_write_log_space(log, internal_ticket);
515 } else { 514 } else {
516 /* may sleep if need to allocate more tickets */ 515 /* may sleep if need to allocate more tickets */
517 internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, 516 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
518 client, flags); 517 client, flags);
519 if (!internal_ticket) 518 if (!internal_ticket)
520 return XFS_ERROR(ENOMEM); 519 return XFS_ERROR(ENOMEM);
@@ -572,12 +571,12 @@ xfs_log_mount(
572 /* 571 /*
573 * Initialize the AIL now we have a log. 572 * Initialize the AIL now we have a log.
574 */ 573 */
575 spin_lock_init(&mp->m_ail_lock);
576 error = xfs_trans_ail_init(mp); 574 error = xfs_trans_ail_init(mp);
577 if (error) { 575 if (error) {
578 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
579 goto error; 577 goto error;
580 } 578 }
579 mp->m_log->l_ailp = mp->m_ail;
581 580
582 /* 581 /*
583 * skip log recovery on a norecovery mount. pretend it all 582 * skip log recovery on a norecovery mount. pretend it all
@@ -730,8 +729,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
730 spin_lock(&log->l_icloglock); 729 spin_lock(&log->l_icloglock);
731 iclog = log->l_iclog; 730 iclog = log->l_iclog;
732 atomic_inc(&iclog->ic_refcnt); 731 atomic_inc(&iclog->ic_refcnt);
733 spin_unlock(&log->l_icloglock);
734 xlog_state_want_sync(log, iclog); 732 xlog_state_want_sync(log, iclog);
733 spin_unlock(&log->l_icloglock);
735 error = xlog_state_release_iclog(log, iclog); 734 error = xlog_state_release_iclog(log, iclog);
736 735
737 spin_lock(&log->l_icloglock); 736 spin_lock(&log->l_icloglock);
@@ -749,7 +748,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
749 if (tic) { 748 if (tic) {
750 xlog_trace_loggrant(log, tic, "unmount rec"); 749 xlog_trace_loggrant(log, tic, "unmount rec");
751 xlog_ungrant_log_space(log, tic); 750 xlog_ungrant_log_space(log, tic);
752 xlog_ticket_put(log, tic); 751 xfs_log_ticket_put(tic);
753 } 752 }
754 } else { 753 } else {
755 /* 754 /*
@@ -768,9 +767,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
768 spin_lock(&log->l_icloglock); 767 spin_lock(&log->l_icloglock);
769 iclog = log->l_iclog; 768 iclog = log->l_iclog;
770 atomic_inc(&iclog->ic_refcnt); 769 atomic_inc(&iclog->ic_refcnt);
771 spin_unlock(&log->l_icloglock);
772 770
773 xlog_state_want_sync(log, iclog); 771 xlog_state_want_sync(log, iclog);
772 spin_unlock(&log->l_icloglock);
774 error = xlog_state_release_iclog(log, iclog); 773 error = xlog_state_release_iclog(log, iclog);
775 774
776 spin_lock(&log->l_icloglock); 775 spin_lock(&log->l_icloglock);
@@ -906,7 +905,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
906int 905int
907xfs_log_need_covered(xfs_mount_t *mp) 906xfs_log_need_covered(xfs_mount_t *mp)
908{ 907{
909 int needed = 0, gen; 908 int needed = 0;
910 xlog_t *log = mp->m_log; 909 xlog_t *log = mp->m_log;
911 910
912 if (!xfs_fs_writable(mp)) 911 if (!xfs_fs_writable(mp))
@@ -915,7 +914,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
915 spin_lock(&log->l_icloglock); 914 spin_lock(&log->l_icloglock);
916 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 915 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
917 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 916 (log->l_covered_state == XLOG_STATE_COVER_NEED2))
918 && !xfs_trans_first_ail(mp, &gen) 917 && !xfs_trans_ail_tail(log->l_ailp)
919 && xlog_iclogs_empty(log)) { 918 && xlog_iclogs_empty(log)) {
920 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 919 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
921 log->l_covered_state = XLOG_STATE_COVER_DONE; 920 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -952,7 +951,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
952 xfs_lsn_t tail_lsn; 951 xfs_lsn_t tail_lsn;
953 xlog_t *log = mp->m_log; 952 xlog_t *log = mp->m_log;
954 953
955 tail_lsn = xfs_trans_tail_ail(mp); 954 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
956 spin_lock(&log->l_grant_lock); 955 spin_lock(&log->l_grant_lock);
957 if (tail_lsn != 0) { 956 if (tail_lsn != 0) {
958 log->l_tail_lsn = tail_lsn; 957 log->l_tail_lsn = tail_lsn;
@@ -1030,12 +1029,6 @@ xlog_iodone(xfs_buf_t *bp)
1030 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); 1029 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
1031 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1030 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1032 aborted = 0; 1031 aborted = 0;
1033
1034 /*
1035 * Some versions of cpp barf on the recursive definition of
1036 * ic_log -> hic_fields.ic_log and expand ic_log twice when
1037 * it is passed through two macros. Workaround broken cpp.
1038 */
1039 l = iclog->ic_log; 1032 l = iclog->ic_log;
1040 1033
1041 /* 1034 /*
@@ -1302,7 +1295,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1302 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); 1295 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1303 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1296 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1304 iclog->ic_bp = bp; 1297 iclog->ic_bp = bp;
1305 iclog->hic_data = bp->b_addr; 1298 iclog->ic_data = bp->b_addr;
1306#ifdef DEBUG 1299#ifdef DEBUG
1307 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); 1300 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1308#endif 1301#endif
@@ -1322,7 +1315,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1322 atomic_set(&iclog->ic_refcnt, 0); 1315 atomic_set(&iclog->ic_refcnt, 0);
1323 spin_lock_init(&iclog->ic_callback_lock); 1316 spin_lock_init(&iclog->ic_callback_lock);
1324 iclog->ic_callback_tail = &(iclog->ic_callback); 1317 iclog->ic_callback_tail = &(iclog->ic_callback);
1325 iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; 1318 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1326 1319
1327 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1320 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1328 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1321 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
@@ -1446,7 +1439,7 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1446 */ 1439 */
1447 if (threshold_lsn && 1440 if (threshold_lsn &&
1448 !XLOG_FORCED_SHUTDOWN(log)) 1441 !XLOG_FORCED_SHUTDOWN(log))
1449 xfs_trans_push_ail(mp, threshold_lsn); 1442 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1450} /* xlog_grant_push_ail */ 1443} /* xlog_grant_push_ail */
1451 1444
1452 1445
@@ -1991,7 +1984,9 @@ xlog_write(xfs_mount_t * mp,
1991 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { 1984 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1992 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 1985 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1993 record_cnt = data_cnt = 0; 1986 record_cnt = data_cnt = 0;
1987 spin_lock(&log->l_icloglock);
1994 xlog_state_want_sync(log, iclog); 1988 xlog_state_want_sync(log, iclog);
1989 spin_unlock(&log->l_icloglock);
1995 if (commit_iclog) { 1990 if (commit_iclog) {
1996 ASSERT(flags & XLOG_COMMIT_TRANS); 1991 ASSERT(flags & XLOG_COMMIT_TRANS);
1997 *commit_iclog = iclog; 1992 *commit_iclog = iclog;
@@ -3200,7 +3195,7 @@ try_again:
3200STATIC void 3195STATIC void
3201xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) 3196xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3202{ 3197{
3203 spin_lock(&log->l_icloglock); 3198 ASSERT(spin_is_locked(&log->l_icloglock));
3204 3199
3205 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3200 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3206 xlog_state_switch_iclogs(log, iclog, 0); 3201 xlog_state_switch_iclogs(log, iclog, 0);
@@ -3208,10 +3203,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3208 ASSERT(iclog->ic_state & 3203 ASSERT(iclog->ic_state &
3209 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); 3204 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
3210 } 3205 }
3211 3206}
3212 spin_unlock(&log->l_icloglock);
3213} /* xlog_state_want_sync */
3214
3215 3207
3216 3208
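
[Editor's note on the hunks above] xlog_state_want_sync() no longer takes and drops l_icloglock itself; every caller now acquires the lock around the call, and the helper merely asserts ownership. That is the standard fix when a caller needs the state change to be atomic with surrounding locked work. A minimal rendering of the new discipline (the trylock-based assert is just a user-space stand-in for spin_is_locked()):

	#include <assert.h>
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t icloglock = PTHREAD_MUTEX_INITIALIZER;
	static int ic_state;			/* 0 = ACTIVE, 1 = WANT_SYNC */

	/* After the change: caller owns the lock; we only assert and mutate. */
	static void want_sync_locked(void)
	{
		/* trylock on a held mutex returns EBUSY: lock is owned */
		assert(pthread_mutex_trylock(&icloglock) != 0);
		if (ic_state == 0)
			ic_state = 1;
	}

	int main(void)
	{
		pthread_mutex_lock(&icloglock);
		want_sync_locked();		/* atomic with other locked work */
		pthread_mutex_unlock(&icloglock);
		printf("state: %d\n", ic_state);
		return 0;
	}
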
3217/***************************************************************************** 3209/*****************************************************************************
@@ -3222,22 +3214,33 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3222 */ 3214 */
3223 3215
3224/* 3216/*
3225 * Free a used ticket. 3217 * Free a used ticket when its refcount falls to zero.
3226 */ 3218 */
3227STATIC void 3219void
3228xlog_ticket_put(xlog_t *log, 3220xfs_log_ticket_put(
3229 xlog_ticket_t *ticket) 3221 xlog_ticket_t *ticket)
3230{ 3222{
3231 sv_destroy(&ticket->t_wait); 3223 ASSERT(atomic_read(&ticket->t_ref) > 0);
3232 kmem_zone_free(xfs_log_ticket_zone, ticket); 3224 if (atomic_dec_and_test(&ticket->t_ref)) {
3233} /* xlog_ticket_put */ 3225 sv_destroy(&ticket->t_wait);
3226 kmem_zone_free(xfs_log_ticket_zone, ticket);
3227 }
3228}
3234 3229
3230xlog_ticket_t *
3231xfs_log_ticket_get(
3232 xlog_ticket_t *ticket)
3233{
3234 ASSERT(atomic_read(&ticket->t_ref) > 0);
3235 atomic_inc(&ticket->t_ref);
3236 return ticket;
3237}
3235 3238
3236/* 3239/*
3237 * Allocate and initialise a new log ticket. 3240 * Allocate and initialise a new log ticket.
3238 */ 3241 */
3239STATIC xlog_ticket_t * 3242STATIC xlog_ticket_t *
3240xlog_ticket_get(xlog_t *log, 3243xlog_ticket_alloc(xlog_t *log,
3241 int unit_bytes, 3244 int unit_bytes,
3242 int cnt, 3245 int cnt,
3243 char client, 3246 char client,
@@ -3308,6 +3311,7 @@ xlog_ticket_get(xlog_t *log,
3308 unit_bytes += 2*BBSIZE; 3311 unit_bytes += 2*BBSIZE;
3309 } 3312 }
3310 3313
3314 atomic_set(&tic->t_ref, 1);
3311 tic->t_unit_res = unit_bytes; 3315 tic->t_unit_res = unit_bytes;
3312 tic->t_curr_res = unit_bytes; 3316 tic->t_curr_res = unit_bytes;
3313 tic->t_cnt = cnt; 3317 tic->t_cnt = cnt;
@@ -3323,7 +3327,7 @@ xlog_ticket_get(xlog_t *log,
3323 xlog_tic_reset_res(tic); 3327 xlog_tic_reset_res(tic);
3324 3328
3325 return tic; 3329 return tic;
3326} /* xlog_ticket_get */ 3330}
3327 3331
3328 3332
3329/****************************************************************************** 3333/******************************************************************************
@@ -3452,7 +3456,7 @@ xlog_verify_iclog(xlog_t *log,
3452 ptr = iclog->ic_datap; 3456 ptr = iclog->ic_datap;
3453 base_ptr = ptr; 3457 base_ptr = ptr;
3454 ophead = (xlog_op_header_t *)ptr; 3458 ophead = (xlog_op_header_t *)ptr;
3455 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3459 xhdr = iclog->ic_data;
3456 for (i = 0; i < len; i++) { 3460 for (i = 0; i < len; i++) {
3457 ophead = (xlog_op_header_t *)ptr; 3461 ophead = (xlog_op_header_t *)ptr;
3458 3462
@@ -3558,7 +3562,8 @@ xfs_log_force_umount(
3558 if (!log || 3562 if (!log ||
3559 log->l_flags & XLOG_ACTIVE_RECOVERY) { 3563 log->l_flags & XLOG_ACTIVE_RECOVERY) {
3560 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3564 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3561 XFS_BUF_DONE(mp->m_sb_bp); 3565 if (mp->m_sb_bp)
3566 XFS_BUF_DONE(mp->m_sb_bp);
3562 return 0; 3567 return 0;
3563 } 3568 }
3564 3569
@@ -3579,7 +3584,9 @@ xfs_log_force_umount(
3579 spin_lock(&log->l_icloglock); 3584 spin_lock(&log->l_icloglock);
3580 spin_lock(&log->l_grant_lock); 3585 spin_lock(&log->l_grant_lock);
3581 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3586 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3582 XFS_BUF_DONE(mp->m_sb_bp); 3587 if (mp->m_sb_bp)
3588 XFS_BUF_DONE(mp->m_sb_bp);
3589
3583 /* 3590 /*
3584 * This flag is sort of redundant because of the mount flag, but 3591 * This flag is sort of redundant because of the mount flag, but
3585 * it's good to maintain the separation between the log and the rest 3592 * it's good to maintain the separation between the log and the rest
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d47b91f10822..8a3e84e900a3 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -134,6 +134,7 @@ typedef struct xfs_log_callback {
134#ifdef __KERNEL__ 134#ifdef __KERNEL__
135/* Log manager interfaces */ 135/* Log manager interfaces */
136struct xfs_mount; 136struct xfs_mount;
137struct xlog_ticket;
137xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 138xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
138 xfs_log_ticket_t ticket, 139 xfs_log_ticket_t ticket,
139 void **iclog, 140 void **iclog,
@@ -177,6 +178,9 @@ int xfs_log_need_covered(struct xfs_mount *mp);
177 178
178void xlog_iodone(struct xfs_buf *); 179void xlog_iodone(struct xfs_buf *);
179 180
181struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
182void xfs_log_ticket_put(struct xlog_ticket *ticket);
183
180#endif 184#endif
181 185
182 186
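
[Editor's note on the xfs_log.c/xfs_log.h diffs above] The new t_ref field and the xfs_log_ticket_get()/xfs_log_ticket_put() pair turn log tickets into refcounted objects: xlog_ticket_alloc() starts the count at one, get bumps it, and the final put destroys the ticket. A user-space model of the same lifetime rule:

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct toy_ticket { atomic_int t_ref; int unit_res; };

	static struct toy_ticket *ticket_alloc(int unit_res)
	{
		struct toy_ticket *tic = malloc(sizeof(*tic));

		if (!tic)
			abort();
		atomic_init(&tic->t_ref, 1);	/* allocation owns one reference */
		tic->unit_res = unit_res;
		return tic;
	}

	static struct toy_ticket *ticket_get(struct toy_ticket *tic)
	{
		assert(atomic_load(&tic->t_ref) > 0);
		atomic_fetch_add(&tic->t_ref, 1);
		return tic;
	}

	static void ticket_put(struct toy_ticket *tic)
	{
		assert(atomic_load(&tic->t_ref) > 0);
		if (atomic_fetch_sub(&tic->t_ref, 1) == 1)
			free(tic);		/* last reference: destroy */
	}

	int main(void)
	{
		struct toy_ticket *tic = ticket_alloc(4096);

		ticket_get(tic);	/* e.g. a permanent-reservation holder */
		ticket_put(tic);
		ticket_put(tic);	/* final put frees */
		puts("ticket released");
		return 0;
	}
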
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index e7d8f84443fa..654167be0efb 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -245,6 +245,7 @@ typedef struct xlog_ticket {
245 struct xlog_ticket *t_next; /* :4|8 */ 245 struct xlog_ticket *t_next; /* :4|8 */
246 struct xlog_ticket *t_prev; /* :4|8 */ 246 struct xlog_ticket *t_prev; /* :4|8 */
247 xlog_tid_t t_tid; /* transaction identifier : 4 */ 247 xlog_tid_t t_tid; /* transaction identifier : 4 */
248 atomic_t t_ref; /* ticket reference count : 4 */
248 int t_curr_res; /* current reservation in bytes : 4 */ 249 int t_curr_res; /* current reservation in bytes : 4 */
249 int t_unit_res; /* unit reservation in bytes : 4 */ 250 int t_unit_res; /* unit reservation in bytes : 4 */
250 char t_ocnt; /* original count : 1 */ 251 char t_ocnt; /* original count : 1 */
@@ -309,6 +310,16 @@ typedef struct xlog_rec_ext_header {
309} xlog_rec_ext_header_t; 310} xlog_rec_ext_header_t;
310 311
311#ifdef __KERNEL__ 312#ifdef __KERNEL__
313
314/*
315 * Quite misnamed, because this union lays out the actual on-disk log buffer.
316 */
317typedef union xlog_in_core2 {
318 xlog_rec_header_t hic_header;
319 xlog_rec_ext_header_t hic_xheader;
320 char hic_sector[XLOG_HEADER_SIZE];
321} xlog_in_core_2_t;
322
312/* 323/*
313 * - A log record header is 512 bytes. There is plenty of room to grow the 324 * - A log record header is 512 bytes. There is plenty of room to grow the
314 * xlog_rec_header_t into the reserved space. 325 * xlog_rec_header_t into the reserved space.
@@ -338,7 +349,7 @@ typedef struct xlog_rec_ext_header {
338 * We'll put all the read-only and l_icloglock fields in the first cacheline, 349 * We'll put all the read-only and l_icloglock fields in the first cacheline,
339 * and move everything else out to subsequent cachelines. 350 * and move everything else out to subsequent cachelines.
340 */ 351 */
341typedef struct xlog_iclog_fields { 352typedef struct xlog_in_core {
342 sv_t ic_force_wait; 353 sv_t ic_force_wait;
343 sv_t ic_write_wait; 354 sv_t ic_write_wait;
344 struct xlog_in_core *ic_next; 355 struct xlog_in_core *ic_next;
@@ -361,41 +372,11 @@ typedef struct xlog_iclog_fields {
361 372
362 /* reference counts need their own cacheline */ 373 /* reference counts need their own cacheline */
363 atomic_t ic_refcnt ____cacheline_aligned_in_smp; 374 atomic_t ic_refcnt ____cacheline_aligned_in_smp;
364} xlog_iclog_fields_t; 375 xlog_in_core_2_t *ic_data;
365 376#define ic_header ic_data->hic_header
366typedef union xlog_in_core2 {
367 xlog_rec_header_t hic_header;
368 xlog_rec_ext_header_t hic_xheader;
369 char hic_sector[XLOG_HEADER_SIZE];
370} xlog_in_core_2_t;
371
372typedef struct xlog_in_core {
373 xlog_iclog_fields_t hic_fields;
374 xlog_in_core_2_t *hic_data;
375} xlog_in_core_t; 377} xlog_in_core_t;
376 378
377/* 379/*
378 * Defines to save our code from this glop.
379 */
380#define ic_force_wait hic_fields.ic_force_wait
381#define ic_write_wait hic_fields.ic_write_wait
382#define ic_next hic_fields.ic_next
383#define ic_prev hic_fields.ic_prev
384#define ic_bp hic_fields.ic_bp
385#define ic_log hic_fields.ic_log
386#define ic_callback hic_fields.ic_callback
387#define ic_callback_lock hic_fields.ic_callback_lock
388#define ic_callback_tail hic_fields.ic_callback_tail
389#define ic_trace hic_fields.ic_trace
390#define ic_size hic_fields.ic_size
391#define ic_offset hic_fields.ic_offset
392#define ic_refcnt hic_fields.ic_refcnt
393#define ic_bwritecnt hic_fields.ic_bwritecnt
394#define ic_state hic_fields.ic_state
395#define ic_datap hic_fields.ic_datap
396#define ic_header hic_data->hic_header
397
398/*
399 * The reservation head lsn is not made up of a cycle number and block number. 380 * The reservation head lsn is not made up of a cycle number and block number.
400 * Instead, it uses a cycle number and byte number. Logs don't expect to 381 * Instead, it uses a cycle number and byte number. Logs don't expect to
401 * overflow 31 bits worth of byte offset, so using a byte number will mean 382 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -404,6 +385,7 @@ typedef struct xlog_in_core {
404typedef struct log { 385typedef struct log {
405 /* The following fields don't need locking */ 386 /* The following fields don't need locking */
406 struct xfs_mount *l_mp; /* mount point */ 387 struct xfs_mount *l_mp; /* mount point */
388 struct xfs_ail *l_ailp; /* AIL log is working with */
407 struct xfs_buf *l_xbuf; /* extra buffer for log 389 struct xfs_buf *l_xbuf; /* extra buffer for log
408 * wrapping */ 390 * wrapping */
409 struct xfs_buftarg *l_targ; /* buftarg of log */ 391 struct xfs_buftarg *l_targ; /* buftarg of log */
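
[Editor's note on the xfs_log_priv.h diff above] The xlog_in_core rework folds the old hic_fields indirection away: the iclog struct now carries its fields directly plus an ic_data pointer to the (admittedly misnamed) xlog_in_core2 union, which is simply the on-disk layout of one log buffer sector, and the ic_header accessor macro survives so existing code keeps compiling. The union-as-wire-format idiom in isolation (sizes and field names invented for the demo):

	#include <stdint.h>
	#include <stdio.h>

	#define DEMO_SECTOR_SIZE 512

	struct demo_rec_header  { uint32_t magic; uint32_t cycle; };
	struct demo_rec_xheader { uint32_t cycle_data[8]; };

	/* One union spans the whole sector, whichever header is in use. */
	typedef union demo_in_core2 {
		struct demo_rec_header  hic_header;
		struct demo_rec_xheader hic_xheader;
		char                    hic_sector[DEMO_SECTOR_SIZE];
	} demo_in_core2_t;

	struct demo_iclog {
		demo_in_core2_t *ic_data;
	/* same trick as the kernel header: accessor macro for old users */
	#define ic_header ic_data->hic_header
	};

	int main(void)
	{
		static demo_in_core2_t buf;
		struct demo_iclog iclog = { .ic_data = &buf };

		iclog.ic_header.cycle = 7;	/* via the accessor macro */
		printf("sector bytes: %zu, cycle: %u\n",
		       sizeof(buf.hic_sector), (unsigned)iclog.ic_header.cycle);
		return 0;
	}
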
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 70e3ba32e6be..35cca98bd94c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -36,7 +36,6 @@
36#include "xfs_dinode.h" 36#include "xfs_dinode.h"
37#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h" 38#include "xfs_inode_item.h"
39#include "xfs_imap.h"
40#include "xfs_alloc.h" 39#include "xfs_alloc.h"
41#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
42#include "xfs_log_priv.h" 41#include "xfs_log_priv.h"
@@ -54,10 +53,8 @@ STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
54 xlog_recover_item_t *item); 53 xlog_recover_item_t *item);
55#if defined(DEBUG) 54#if defined(DEBUG)
56STATIC void xlog_recover_check_summary(xlog_t *); 55STATIC void xlog_recover_check_summary(xlog_t *);
57STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
58#else 56#else
59#define xlog_recover_check_summary(log) 57#define xlog_recover_check_summary(log)
60#define xlog_recover_check_ail(mp, lip, gen)
61#endif 58#endif
62 59
63 60
@@ -270,21 +267,16 @@ STATIC void
270xlog_recover_iodone( 267xlog_recover_iodone(
271 struct xfs_buf *bp) 268 struct xfs_buf *bp)
272{ 269{
273 xfs_mount_t *mp;
274
275 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
276
277 if (XFS_BUF_GETERROR(bp)) { 270 if (XFS_BUF_GETERROR(bp)) {
278 /* 271 /*
279 * We're not going to bother about retrying 272 * We're not going to bother about retrying
280 * this during recovery. One strike! 273 * this during recovery. One strike!
281 */ 274 */
282 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
283 xfs_ioerror_alert("xlog_recover_iodone", 275 xfs_ioerror_alert("xlog_recover_iodone",
284 mp, bp, XFS_BUF_ADDR(bp)); 276 bp->b_mount, bp, XFS_BUF_ADDR(bp));
285 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 277 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
286 } 278 }
287 XFS_BUF_SET_FSPRIVATE(bp, NULL); 279 bp->b_mount = NULL;
288 XFS_BUF_CLR_IODONE_FUNC(bp); 280 XFS_BUF_CLR_IODONE_FUNC(bp);
289 xfs_biodone(bp); 281 xfs_biodone(bp);
290} 282}
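
[Editor's note on the hunk above] xlog_recover_iodone() now reads the mount from the buffer's b_mount field instead of the opaque fsprivate slot, and the recovery policy is deliberately blunt: any I/O error during replay alerts and shuts the filesystem down, one strike. A sketch of that shape (stub types; the real buffer and shutdown calls are XFS internals):

	#include <stdio.h>

	struct toy_mount { const char *name; };
	struct toy_buf { int b_error; struct toy_mount *b_mount; };

	static void force_shutdown(struct toy_mount *mp)
	{
		fprintf(stderr, "shutting down %s after replay I/O error\n",
			mp->name);
	}

	/* Model of xlog_recover_iodone(): no retries during recovery. */
	static void recover_iodone(struct toy_buf *bp)
	{
		if (bp->b_error)
			force_shutdown(bp->b_mount);	/* one strike */
		bp->b_mount = NULL;			/* drop the back-pointer */
	}

	int main(void)
	{
		struct toy_mount mp = { "demo-fs" };
		struct toy_buf bp = { .b_error = 5 /* EIO */, .b_mount = &mp };

		recover_iodone(&bp);
		return 0;
	}
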
@@ -2228,9 +2220,8 @@ xlog_recover_do_buffer_trans(
2228 XFS_BUF_STALE(bp); 2220 XFS_BUF_STALE(bp);
2229 error = xfs_bwrite(mp, bp); 2221 error = xfs_bwrite(mp, bp);
2230 } else { 2222 } else {
2231 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2223 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2232 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2224 bp->b_mount = mp;
2233 XFS_BUF_SET_FSPRIVATE(bp, mp);
2234 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2225 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2235 xfs_bdwrite(mp, bp); 2226 xfs_bdwrite(mp, bp);
2236 } 2227 }
@@ -2247,7 +2238,6 @@ xlog_recover_do_inode_trans(
2247 xfs_inode_log_format_t *in_f; 2238 xfs_inode_log_format_t *in_f;
2248 xfs_mount_t *mp; 2239 xfs_mount_t *mp;
2249 xfs_buf_t *bp; 2240 xfs_buf_t *bp;
2250 xfs_imap_t imap;
2251 xfs_dinode_t *dip; 2241 xfs_dinode_t *dip;
2252 xfs_ino_t ino; 2242 xfs_ino_t ino;
2253 int len; 2243 int len;
@@ -2275,54 +2265,35 @@ xlog_recover_do_inode_trans(
2275 } 2265 }
2276 ino = in_f->ilf_ino; 2266 ino = in_f->ilf_ino;
2277 mp = log->l_mp; 2267 mp = log->l_mp;
2278 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2279 imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2280 imap.im_len = in_f->ilf_len;
2281 imap.im_boffset = in_f->ilf_boffset;
2282 } else {
2283 /*
2284 * It's an old inode format record. We don't know where
2285 * its cluster is located on disk, and we can't allow
2286 * xfs_imap() to figure it out because the inode btrees
2287 * are not ready to be used. Therefore do not pass the
2288 * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give
2289 * us only the single block in which the inode lives
2290 * rather than its cluster, so we must make sure to
2291 * invalidate the buffer when we write it out below.
2292 */
2293 imap.im_blkno = 0;
2294 error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2295 if (error)
2296 goto error;
2297 }
2298 2268
2299 /* 2269 /*
2300 * Inode buffers can be freed, look out for it, 2270 * Inode buffers can be freed, look out for it,
2301 * and do not replay the inode. 2271 * and do not replay the inode.
2302 */ 2272 */
2303 if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) { 2273 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2274 in_f->ilf_len, 0)) {
2304 error = 0; 2275 error = 0;
2305 goto error; 2276 goto error;
2306 } 2277 }
2307 2278
2308 bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len, 2279 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
2309 XFS_BUF_LOCK); 2280 in_f->ilf_len, XFS_BUF_LOCK);
2310 if (XFS_BUF_ISERROR(bp)) { 2281 if (XFS_BUF_ISERROR(bp)) {
2311 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2282 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2312 bp, imap.im_blkno); 2283 bp, in_f->ilf_blkno);
2313 error = XFS_BUF_GETERROR(bp); 2284 error = XFS_BUF_GETERROR(bp);
2314 xfs_buf_relse(bp); 2285 xfs_buf_relse(bp);
2315 goto error; 2286 goto error;
2316 } 2287 }
2317 error = 0; 2288 error = 0;
2318 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2289 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2319 dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 2290 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2320 2291
2321 /* 2292 /*
2322 * Make sure the place we're flushing out to really looks 2293 * Make sure the place we're flushing out to really looks
2323 * like an inode! 2294 * like an inode!
2324 */ 2295 */
2325 if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) { 2296 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2326 xfs_buf_relse(bp); 2297 xfs_buf_relse(bp);
2327 xfs_fs_cmn_err(CE_ALERT, mp, 2298 xfs_fs_cmn_err(CE_ALERT, mp,
2328 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2299 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
@@ -2345,12 +2316,12 @@ xlog_recover_do_inode_trans(
2345 } 2316 }
2346 2317
2347 /* Skip replay when the on disk inode is newer than the log one */ 2318 /* Skip replay when the on disk inode is newer than the log one */
2348 if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) { 2319 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2349 /* 2320 /*
2350 * Deal with the wrap case: a flushiter of DI_MAX_FLUSH 2321 * Deal with the wrap case: a flushiter of DI_MAX_FLUSH
2351 * compares less than smaller, post-wrap values 2322 * compares less than smaller, post-wrap values
2352 */ 2323 */
2353 if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH && 2324 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2354 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2325 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2355 /* do nothing */ 2326 /* do nothing */
2356 } else { 2327 } else {
@@ -2410,7 +2381,7 @@ xlog_recover_do_inode_trans(
2410 error = EFSCORRUPTED; 2381 error = EFSCORRUPTED;
2411 goto error; 2382 goto error;
2412 } 2383 }
2413 if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) { 2384 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2414 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2385 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2415 XFS_ERRLEVEL_LOW, mp, dicp); 2386 XFS_ERRLEVEL_LOW, mp, dicp);
2416 xfs_buf_relse(bp); 2387 xfs_buf_relse(bp);
@@ -2422,23 +2393,24 @@ xlog_recover_do_inode_trans(
2422 } 2393 }
2423 2394
2424 /* The core is in in-core format */ 2395 /* The core is in in-core format */
2425 xfs_dinode_to_disk(&dip->di_core, 2396 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2426 (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2427 2397
2428 /* the rest is in on-disk format */ 2398 /* the rest is in on-disk format */
2429 if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { 2399 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2430 memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t), 2400 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2431 item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t), 2401 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2432 item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t)); 2402 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2433 } 2403 }
2434 2404
2435 fields = in_f->ilf_fields; 2405 fields = in_f->ilf_fields;
2436 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 2406 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2437 case XFS_ILOG_DEV: 2407 case XFS_ILOG_DEV:
2438 dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev); 2408 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2439 break; 2409 break;
2440 case XFS_ILOG_UUID: 2410 case XFS_ILOG_UUID:
2441 dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid; 2411 memcpy(XFS_DFORK_DPTR(dip),
2412 &in_f->ilf_u.ilfu_uuid,
2413 sizeof(uuid_t));
2442 break; 2414 break;
2443 } 2415 }
2444 2416
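
The di_u union accesses above give way to XFS_DFORK_DPTR() because, once the inode core is flattened into struct xfs_dinode, fork data simply starts right behind the fixed core. A hedged sketch of that layout arithmetic (the toy struct below is illustrative, not the real on-disk layout):

	#include <stddef.h>

	/* Toy on-disk inode: a fixed core followed by variable fork data. */
	struct toy_dinode {
		unsigned short	di_magic;
		unsigned char	di_version;
		unsigned char	di_pad[93];	/* rest of the fixed core */
	};

	/* The data fork begins immediately after the fixed core, which is
	 * essentially what XFS_DFORK_DPTR() computes in the patched code. */
	static void *dfork_ptr(struct toy_dinode *dip)
	{
		return (char *)dip + sizeof(*dip);
	}
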
@@ -2454,12 +2426,12 @@ xlog_recover_do_inode_trans(
2454 switch (fields & XFS_ILOG_DFORK) { 2426 switch (fields & XFS_ILOG_DFORK) {
2455 case XFS_ILOG_DDATA: 2427 case XFS_ILOG_DDATA:
2456 case XFS_ILOG_DEXT: 2428 case XFS_ILOG_DEXT:
2457 memcpy(&dip->di_u, src, len); 2429 memcpy(XFS_DFORK_DPTR(dip), src, len);
2458 break; 2430 break;
2459 2431
2460 case XFS_ILOG_DBROOT: 2432 case XFS_ILOG_DBROOT:
2461 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, 2433 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2462 &(dip->di_u.di_bmbt), 2434 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2463 XFS_DFORK_DSIZE(dip, mp)); 2435 XFS_DFORK_DSIZE(dip, mp));
2464 break; 2436 break;
2465 2437
@@ -2496,8 +2468,8 @@ xlog_recover_do_inode_trans(
2496 2468
2497 case XFS_ILOG_ABROOT: 2469 case XFS_ILOG_ABROOT:
2498 dest = XFS_DFORK_APTR(dip); 2470 dest = XFS_DFORK_APTR(dip);
2499 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, 2471 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2500 (xfs_bmdr_block_t*)dest, 2472 len, (xfs_bmdr_block_t*)dest,
2501 XFS_DFORK_ASIZE(dip, mp)); 2473 XFS_DFORK_ASIZE(dip, mp));
2502 break; 2474 break;
2503 2475
@@ -2512,9 +2484,8 @@ xlog_recover_do_inode_trans(
2512 2484
2513write_inode_buffer: 2485write_inode_buffer:
2514 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2486 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2515 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2487 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2516 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2488 bp->b_mount = mp;
2517 XFS_BUF_SET_FSPRIVATE(bp, mp);
2518 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2489 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2519 xfs_bdwrite(mp, bp); 2490 xfs_bdwrite(mp, bp);
2520 } else { 2491 } else {
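
The XFS_BUF_FSPRIVATE() conversion above is the mechanical half of a type cleanup: an untyped private slot read back through casting macros becomes a plain typed back pointer. Reduced to its essentials (hypothetical, simplified structures):

	struct mount;			/* opaque for the sketch */

	/* Before: an anonymous slot, recovered via a casting macro. */
	struct buf_old {
		void	*b_fspriv;
	};
	#define BUF_FSPRIVATE(bp, type)	((type)(bp)->b_fspriv)

	/* After: a typed field; no casts, and misuse fails to compile. */
	struct buf_new {
		struct mount	*b_mount;
	};
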
@@ -2645,9 +2616,8 @@ xlog_recover_do_dquot_trans(
2645 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2616 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2646 2617
2647 ASSERT(dq_f->qlf_size == 2); 2618 ASSERT(dq_f->qlf_size == 2);
2648 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2619 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2649 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2620 bp->b_mount = mp;
2650 XFS_BUF_SET_FSPRIVATE(bp, mp);
2651 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2621 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2652 xfs_bdwrite(mp, bp); 2622 xfs_bdwrite(mp, bp);
2653 2623
@@ -2689,11 +2659,11 @@ xlog_recover_do_efi_trans(
2689 efip->efi_next_extent = efi_formatp->efi_nextents; 2659 efip->efi_next_extent = efi_formatp->efi_nextents;
2690 efip->efi_flags |= XFS_EFI_COMMITTED; 2660 efip->efi_flags |= XFS_EFI_COMMITTED;
2691 2661
2692 spin_lock(&mp->m_ail_lock); 2662 spin_lock(&log->l_ailp->xa_lock);
2693 /* 2663 /*
2694 * xfs_trans_update_ail() drops the AIL lock. 2664 * xfs_trans_ail_update() drops the AIL lock.
2695 */ 2665 */
2696 xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn); 2666 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2697 return 0; 2667 return 0;
2698} 2668}
2699 2669
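
xfs_trans_ail_update() files the recovered EFI into the AIL at its commit LSN. A userspace toy of that ordered insert, assuming nothing more than a singly linked list kept sorted by LSN:

	#include <stddef.h>

	/* Toy AIL: items sorted by ascending LSN, so the list head is the
	 * oldest item and defines the tail of the log. */
	struct toy_item {
		unsigned long long	lsn;
		struct toy_item		*next;
	};

	static void toy_ail_insert(struct toy_item **head, struct toy_item *item)
	{
		while (*head && (*head)->lsn <= item->lsn)
			head = &(*head)->next;
		item->next = *head;
		*head = item;
	}
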
@@ -2712,12 +2682,12 @@ xlog_recover_do_efd_trans(
2712 xlog_recover_item_t *item, 2682 xlog_recover_item_t *item,
2713 int pass) 2683 int pass)
2714{ 2684{
2715 xfs_mount_t *mp;
2716 xfs_efd_log_format_t *efd_formatp; 2685 xfs_efd_log_format_t *efd_formatp;
2717 xfs_efi_log_item_t *efip = NULL; 2686 xfs_efi_log_item_t *efip = NULL;
2718 xfs_log_item_t *lip; 2687 xfs_log_item_t *lip;
2719 int gen;
2720 __uint64_t efi_id; 2688 __uint64_t efi_id;
2689 struct xfs_ail_cursor cur;
2690 struct xfs_ail *ailp = log->l_ailp;
2721 2691
2722 if (pass == XLOG_RECOVER_PASS1) { 2692 if (pass == XLOG_RECOVER_PASS1) {
2723 return; 2693 return;
@@ -2734,25 +2704,26 @@ xlog_recover_do_efd_trans(
2734 * Search for the efi with the id in the efd format structure 2704 * Search for the efi with the id in the efd format structure
2735 * in the AIL. 2705 * in the AIL.
2736 */ 2706 */
2737 mp = log->l_mp; 2707 spin_lock(&ailp->xa_lock);
2738 spin_lock(&mp->m_ail_lock); 2708 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2739 lip = xfs_trans_first_ail(mp, &gen);
2740 while (lip != NULL) { 2709 while (lip != NULL) {
2741 if (lip->li_type == XFS_LI_EFI) { 2710 if (lip->li_type == XFS_LI_EFI) {
2742 efip = (xfs_efi_log_item_t *)lip; 2711 efip = (xfs_efi_log_item_t *)lip;
2743 if (efip->efi_format.efi_id == efi_id) { 2712 if (efip->efi_format.efi_id == efi_id) {
2744 /* 2713 /*
2745 * xfs_trans_delete_ail() drops the 2714 * xfs_trans_ail_delete() drops the
2746 * AIL lock. 2715 * AIL lock.
2747 */ 2716 */
2748 xfs_trans_delete_ail(mp, lip); 2717 xfs_trans_ail_delete(ailp, lip);
2749 xfs_efi_item_free(efip); 2718 xfs_efi_item_free(efip);
2750 return; 2719 spin_lock(&ailp->xa_lock);
2720 break;
2751 } 2721 }
2752 } 2722 }
2753 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 2723 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2754 } 2724 }
2755 spin_unlock(&mp->m_ail_lock); 2725 xfs_trans_ail_cursor_done(ailp, &cur);
2726 spin_unlock(&ailp->xa_lock);
2756} 2727}
2757 2728
2758/* 2729/*
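
The EFD search above is the first user of the cursor walk that replaces the old (mp, gen) generation-counted traversal. The calling pattern, assembled from the calls in this patch (an outline of usage, not the cursor implementation):

	spin_lock(&ailp->xa_lock);
	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
	while (lip != NULL) {
		/* inspect lip; a delete drops and re-takes xa_lock */
		lip = xfs_trans_ail_cursor_next(ailp, &cur);
	}
	xfs_trans_ail_cursor_done(ailp, &cur);
	spin_unlock(&ailp->xa_lock);
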
@@ -3036,33 +3007,6 @@ abort_error:
3036} 3007}
3037 3008
3038/* 3009/*
3039 * Verify that once we've encountered something other than an EFI
3040 * in the AIL that there are no more EFIs in the AIL.
3041 */
3042#if defined(DEBUG)
3043STATIC void
3044xlog_recover_check_ail(
3045 xfs_mount_t *mp,
3046 xfs_log_item_t *lip,
3047 int gen)
3048{
3049 int orig_gen = gen;
3050
3051 do {
3052 ASSERT(lip->li_type != XFS_LI_EFI);
3053 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3054 /*
3055 * The check will be bogus if we restart from the
3056 * beginning of the AIL, so ASSERT that we don't.
3057 * We never should since we're holding the AIL lock
3058 * the entire time.
3059 */
3060 ASSERT(gen == orig_gen);
3061 } while (lip != NULL);
3062}
3063#endif /* DEBUG */
3064
3065/*
3066 * When this is called, all of the EFIs which did not have 3010 * When this is called, all of the EFIs which did not have
3067 * corresponding EFDs should be in the AIL. What we do now 3011 * corresponding EFDs should be in the AIL. What we do now
3068 * is free the extents associated with each one. 3012 * is free the extents associated with each one.
@@ -3086,20 +3030,23 @@ xlog_recover_process_efis(
3086{ 3030{
3087 xfs_log_item_t *lip; 3031 xfs_log_item_t *lip;
3088 xfs_efi_log_item_t *efip; 3032 xfs_efi_log_item_t *efip;
3089 int gen;
3090 xfs_mount_t *mp;
3091 int error = 0; 3033 int error = 0;
3034 struct xfs_ail_cursor cur;
3035 struct xfs_ail *ailp;
3092 3036
3093 mp = log->l_mp; 3037 ailp = log->l_ailp;
3094 spin_lock(&mp->m_ail_lock); 3038 spin_lock(&ailp->xa_lock);
3095 3039 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3096 lip = xfs_trans_first_ail(mp, &gen);
3097 while (lip != NULL) { 3040 while (lip != NULL) {
3098 /* 3041 /*
3099 * We're done when we see something other than an EFI. 3042 * We're done when we see something other than an EFI.
3043 * There should be no EFIs left in the AIL now.
3100 */ 3044 */
3101 if (lip->li_type != XFS_LI_EFI) { 3045 if (lip->li_type != XFS_LI_EFI) {
3102 xlog_recover_check_ail(mp, lip, gen); 3046#ifdef DEBUG
3047 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3048 ASSERT(lip->li_type != XFS_LI_EFI);
3049#endif
3103 break; 3050 break;
3104 } 3051 }
3105 3052
@@ -3108,18 +3055,20 @@ xlog_recover_process_efis(
3108 */ 3055 */
3109 efip = (xfs_efi_log_item_t *)lip; 3056 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 3057 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3111 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 3058 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 3059 continue;
3113 } 3060 }
3114 3061
3115 spin_unlock(&mp->m_ail_lock); 3062 spin_unlock(&ailp->xa_lock);
3116 error = xlog_recover_process_efi(mp, efip); 3063 error = xlog_recover_process_efi(log->l_mp, efip);
3064 spin_lock(&ailp->xa_lock);
3117 if (error) 3065 if (error)
3118 return error; 3066 goto out;
3119 spin_lock(&mp->m_ail_lock); 3067 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3120 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3121 } 3068 }
3122 spin_unlock(&mp->m_ail_lock); 3069out:
3070 xfs_trans_ail_cursor_done(ailp, &cur);
3071 spin_unlock(&ailp->xa_lock);
3123 return error; 3072 return error;
3124} 3073}
3125 3074
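
The cursor is also what lets this loop drop xa_lock around the transaction-running xlog_recover_process_efi() call and resume the walk afterwards. The generic shape of that idiom, sketched with hypothetical names:

	lock(l);
	it = cursor_first(l, &cur);
	while (it != NULL) {
		unlock(l);		/* cannot sleep under a spinlock */
		err = process(it);	/* may sleep */
		lock(l);
		if (err)
			break;		/* cursor preserves the position */
		it = cursor_next(l, &cur);
	}
	cursor_done(l, &cur);
	unlock(l);
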
@@ -3140,19 +3089,16 @@ xlog_recover_clear_agi_bucket(
3140 int error; 3089 int error;
3141 3090
3142 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3091 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3143 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); 3092 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3144 if (!error) 3093 0, 0, 0);
3145 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3146 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3147 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3148 if (error) 3094 if (error)
3149 goto out_abort; 3095 goto out_abort;
3150 3096
3151 error = EINVAL; 3097 error = xfs_read_agi(mp, tp, agno, &agibp);
3152 agi = XFS_BUF_TO_AGI(agibp); 3098 if (error)
3153 if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
3154 goto out_abort; 3099 goto out_abort;
3155 3100
3101 agi = XFS_BUF_TO_AGI(agibp);
3156 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3102 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3157 offset = offsetof(xfs_agi_t, agi_unlinked) + 3103 offset = offsetof(xfs_agi_t, agi_unlinked) +
3158 (sizeof(xfs_agino_t) * bucket); 3104 (sizeof(xfs_agino_t) * bucket);
@@ -3172,6 +3118,62 @@ out_error:
3172 return; 3118 return;
3173} 3119}
3174 3120
3121STATIC xfs_agino_t
3122xlog_recover_process_one_iunlink(
3123 struct xfs_mount *mp,
3124 xfs_agnumber_t agno,
3125 xfs_agino_t agino,
3126 int bucket)
3127{
3128 struct xfs_buf *ibp;
3129 struct xfs_dinode *dip;
3130 struct xfs_inode *ip;
3131 xfs_ino_t ino;
3132 int error;
3133
3134 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3135 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3136 if (error)
3137 goto fail;
3138
3139 /*
3140 * Get the on disk inode to find the next inode in the bucket.
3141 */
3142 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
3143 if (error)
3144 goto fail_iput;
3145
3146 ASSERT(ip->i_d.di_nlink == 0);
3147 ASSERT(ip->i_d.di_mode != 0);
3148
3149 /* setup for the next pass */
3150 agino = be32_to_cpu(dip->di_next_unlinked);
3151 xfs_buf_relse(ibp);
3152
3153 /*
3154 * Prevent any DMAPI event from being sent when the reference on
3155 * the inode is dropped.
3156 */
3157 ip->i_d.di_dmevmask = 0;
3158
3159 IRELE(ip);
3160 return agino;
3161
3162 fail_iput:
3163 IRELE(ip);
3164 fail:
3165 /*
3166 * We can't read in the inode this bucket points to, or this inode
3167 * is messed up. Just ditch this bucket of inodes. We will lose
3168 * some inodes and space, but at least we won't hang.
3169 *
3170 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3171 * clear the inode pointer in the bucket.
3172 */
3173 xlog_recover_clear_agi_bucket(mp, agno, bucket);
3174 return NULLAGINO;
3175}
3176
3175/* 3177/*
3176 * xlog_iunlink_recover 3178 * xlog_iunlink_recover
3177 * 3179 *
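
The helper factored out above walks one AGI hash bucket: each on-disk inode's di_next_unlinked field names the next unlinked inode, and NULLAGINO terminates the chain. A toy model of draining one bucket, with an array standing in for on-disk inodes:

	#include <stdint.h>

	#define TOY_NULLAGINO	((uint32_t)-1)	/* stands in for NULLAGINO */

	/* next_unlinked[i] plays the role of inode i's di_next_unlinked. */
	static uint32_t next_unlinked[64];

	static void drain_bucket(uint32_t head)
	{
		uint32_t agino = head;

		while (agino != TOY_NULLAGINO) {
			/* "process" inode agino, then step to its successor,
			 * as xlog_recover_process_one_iunlink() does */
			agino = next_unlinked[agino];
		}
	}
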
@@ -3192,11 +3194,7 @@ xlog_recover_process_iunlinks(
3192 xfs_agnumber_t agno; 3194 xfs_agnumber_t agno;
3193 xfs_agi_t *agi; 3195 xfs_agi_t *agi;
3194 xfs_buf_t *agibp; 3196 xfs_buf_t *agibp;
3195 xfs_buf_t *ibp;
3196 xfs_dinode_t *dip;
3197 xfs_inode_t *ip;
3198 xfs_agino_t agino; 3197 xfs_agino_t agino;
3199 xfs_ino_t ino;
3200 int bucket; 3198 int bucket;
3201 int error; 3199 int error;
3202 uint mp_dmevmask; 3200 uint mp_dmevmask;
@@ -3213,22 +3211,21 @@ xlog_recover_process_iunlinks(
3213 /* 3211 /*
3214 * Find the agi for this ag. 3212 * Find the agi for this ag.
3215 */ 3213 */
3216 agibp = xfs_buf_read(mp->m_ddev_targp, 3214 error = xfs_read_agi(mp, NULL, agno, &agibp);
3217 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 3215 if (error) {
3218 XFS_FSS_TO_BB(mp, 1), 0); 3216 /*
3219 if (XFS_BUF_ISERROR(agibp)) { 3217 * AGI is corrupt. Don't process it.
3220 xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)", 3218 *
3221 log->l_mp, agibp, 3219 * We should probably mark the filesystem as corrupt
3222 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp))); 3220 * after we've recovered all the AGs we can.
3221 */
3222 continue;
3223 } 3223 }
3224 agi = XFS_BUF_TO_AGI(agibp); 3224 agi = XFS_BUF_TO_AGI(agibp);
3225 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
3226 3225
3227 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3226 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3228
3229 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3227 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3230 while (agino != NULLAGINO) { 3228 while (agino != NULLAGINO) {
3231
3232 /* 3229 /*
3233 * Release the agi buffer so that it can 3230 * Release the agi buffer so that it can
3234 * be acquired in the normal course of the 3231 * be acquired in the normal course of the
@@ -3236,87 +3233,17 @@ xlog_recover_process_iunlinks(
3236 */ 3233 */
3237 xfs_buf_relse(agibp); 3234 xfs_buf_relse(agibp);
3238 3235
3239 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3236 agino = xlog_recover_process_one_iunlink(mp,
3240 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); 3237 agno, agino, bucket);
3241 ASSERT(error || (ip != NULL));
3242
3243 if (!error) {
3244 /*
3245 * Get the on disk inode to find the
3246 * next inode in the bucket.
3247 */
3248 error = xfs_itobp(mp, NULL, ip, &dip,
3249 &ibp, 0, 0,
3250 XFS_BUF_LOCK);
3251 ASSERT(error || (dip != NULL));
3252 }
3253
3254 if (!error) {
3255 ASSERT(ip->i_d.di_nlink == 0);
3256
3257 /* setup for the next pass */
3258 agino = be32_to_cpu(
3259 dip->di_next_unlinked);
3260 xfs_buf_relse(ibp);
3261 /*
3262 * Prevent any DMAPI event from
3263 * being sent when the
3264 * reference on the inode is
3265 * dropped.
3266 */
3267 ip->i_d.di_dmevmask = 0;
3268
3269 /*
3270 * If this is a new inode, handle
3271 * it specially. Otherwise,
3272 * just drop our reference to the
3273 * inode. If there are no
3274 * other references, this will
3275 * send the inode to
3276 * xfs_inactive() which will
3277 * truncate the file and free
3278 * the inode.
3279 */
3280 if (ip->i_d.di_mode == 0)
3281 xfs_iput_new(ip, 0);
3282 else
3283 IRELE(ip);
3284 } else {
3285 /*
3286 * We can't read in the inode
3287 * this bucket points to, or
3288 * this inode is messed up. Just
3289 * ditch this bucket of inodes. We
3290 * will lose some inodes and space,
3291 * but at least we won't hang. Call
3292 * xlog_recover_clear_agi_bucket()
3293 * to perform a transaction to clear
3294 * the inode pointer in the bucket.
3295 */
3296 xlog_recover_clear_agi_bucket(mp, agno,
3297 bucket);
3298
3299 agino = NULLAGINO;
3300 }
3301 3238
3302 /* 3239 /*
3303 * Reacquire the AGI buffer and continue around 3240 * Reacquire the AGI buffer and continue around
3304 * the loop. 3241 * the loop. This should never fail as we know
3242 * the buffer was good earlier on.
3305 */ 3243 */
3306 agibp = xfs_buf_read(mp->m_ddev_targp, 3244 error = xfs_read_agi(mp, NULL, agno, &agibp);
3307 XFS_AG_DADDR(mp, agno, 3245 ASSERT(error == 0);
3308 XFS_AGI_DADDR(mp)),
3309 XFS_FSS_TO_BB(mp, 1), 0);
3310 if (XFS_BUF_ISERROR(agibp)) {
3311 xfs_ioerror_alert(
3312 "xlog_recover_process_iunlinks(#2)",
3313 log->l_mp, agibp,
3314 XFS_AG_DADDR(mp, agno,
3315 XFS_AGI_DADDR(mp)));
3316 }
3317 agi = XFS_BUF_TO_AGI(agibp); 3246 agi = XFS_BUF_TO_AGI(agibp);
3318 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
3319 agi->agi_magicnum));
3320 } 3247 }
3321 } 3248 }
3322 3249
@@ -3367,7 +3294,6 @@ xlog_pack_data(
3367 int size = iclog->ic_offset + roundoff; 3294 int size = iclog->ic_offset + roundoff;
3368 __be32 cycle_lsn; 3295 __be32 cycle_lsn;
3369 xfs_caddr_t dp; 3296 xfs_caddr_t dp;
3370 xlog_in_core_2_t *xhdr;
3371 3297
3372 xlog_pack_data_checksum(log, iclog, size); 3298 xlog_pack_data_checksum(log, iclog, size);
3373 3299
@@ -3382,7 +3308,8 @@ xlog_pack_data(
3382 } 3308 }
3383 3309
3384 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3310 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3385 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3311 xlog_in_core_2_t *xhdr = iclog->ic_data;
3312
3386 for ( ; i < BTOBB(size); i++) { 3313 for ( ; i < BTOBB(size); i++) {
3387 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3314 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3388 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3315 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3440,7 +3367,6 @@ xlog_unpack_data(
3440 xlog_t *log) 3367 xlog_t *log)
3441{ 3368{
3442 int i, j, k; 3369 int i, j, k;
3443 xlog_in_core_2_t *xhdr;
3444 3370
3445 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3371 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3446 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3372 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3449,7 +3375,7 @@ xlog_unpack_data(
3449 } 3375 }
3450 3376
3451 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3377 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3452 xhdr = (xlog_in_core_2_t *)rhead; 3378 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3453 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 3379 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3454 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3380 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3455 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3381 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
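
Both xlog_pack_data() and xlog_unpack_data() map a basic-block index onto an extended-header slot with the same j/k arithmetic. With the real constants (BBSIZE is 512 and XLOG_HEADER_CYCLE_SIZE is 32768), each extended header covers 64 blocks:

	#define BBSIZE			512	/* basic block size, bytes */
	#define XLOG_HEADER_CYCLE_SIZE	32768	/* cycle data per log header */

	/* For v2 logs, block i's cycle word lives in header j, slot k. */
	static void cycle_slot(int i, int *j, int *k)
	{
		*j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);	/* header */
		*k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);	/* slot */
	}
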
@@ -4003,11 +3929,8 @@ xlog_recover_check_summary(
4003{ 3929{
4004 xfs_mount_t *mp; 3930 xfs_mount_t *mp;
4005 xfs_agf_t *agfp; 3931 xfs_agf_t *agfp;
4006 xfs_agi_t *agip;
4007 xfs_buf_t *agfbp; 3932 xfs_buf_t *agfbp;
4008 xfs_buf_t *agibp; 3933 xfs_buf_t *agibp;
4009 xfs_daddr_t agfdaddr;
4010 xfs_daddr_t agidaddr;
4011 xfs_buf_t *sbbp; 3934 xfs_buf_t *sbbp;
4012#ifdef XFS_LOUD_RECOVERY 3935#ifdef XFS_LOUD_RECOVERY
4013 xfs_sb_t *sbp; 3936 xfs_sb_t *sbp;
@@ -4016,6 +3939,7 @@ xlog_recover_check_summary(
4016 __uint64_t freeblks; 3939 __uint64_t freeblks;
4017 __uint64_t itotal; 3940 __uint64_t itotal;
4018 __uint64_t ifree; 3941 __uint64_t ifree;
3942 int error;
4019 3943
4020 mp = log->l_mp; 3944 mp = log->l_mp;
4021 3945
@@ -4023,37 +3947,27 @@ xlog_recover_check_summary(
4023 itotal = 0LL; 3947 itotal = 0LL;
4024 ifree = 0LL; 3948 ifree = 0LL;
4025 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3949 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4026 agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)); 3950 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
4027 agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, 3951 if (error) {
4028 XFS_FSS_TO_BB(mp, 1), 0); 3952 xfs_fs_cmn_err(CE_ALERT, mp,
4029 if (XFS_BUF_ISERROR(agfbp)) { 3953 "xlog_recover_check_summary(agf)"
4030 xfs_ioerror_alert("xlog_recover_check_summary(agf)", 3954 "agf read failed agno %d error %d",
4031 mp, agfbp, agfdaddr); 3955 agno, error);
4032 } 3956 } else {
4033 agfp = XFS_BUF_TO_AGF(agfbp); 3957 agfp = XFS_BUF_TO_AGF(agfbp);
4034 ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum)); 3958 freeblks += be32_to_cpu(agfp->agf_freeblks) +
4035 ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum))); 3959 be32_to_cpu(agfp->agf_flcount);
4036 ASSERT(be32_to_cpu(agfp->agf_seqno) == agno); 3960 xfs_buf_relse(agfbp);
4037
4038 freeblks += be32_to_cpu(agfp->agf_freeblks) +
4039 be32_to_cpu(agfp->agf_flcount);
4040 xfs_buf_relse(agfbp);
4041
4042 agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
4043 agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
4044 XFS_FSS_TO_BB(mp, 1), 0);
4045 if (XFS_BUF_ISERROR(agibp)) {
4046 xfs_ioerror_alert("xlog_recover_check_summary(agi)",
4047 mp, agibp, agidaddr);
4048 } 3961 }
4049 agip = XFS_BUF_TO_AGI(agibp);
4050 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
4051 ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
4052 ASSERT(be32_to_cpu(agip->agi_seqno) == agno);
4053 3962
4054 itotal += be32_to_cpu(agip->agi_count); 3963 error = xfs_read_agi(mp, NULL, agno, &agibp);
4055 ifree += be32_to_cpu(agip->agi_freecount); 3964 if (!error) {
4056 xfs_buf_relse(agibp); 3965 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3966
3967 itotal += be32_to_cpu(agi->agi_count);
3968 ifree += be32_to_cpu(agi->agi_freecount);
3969 xfs_buf_relse(agibp);
3970 }
4057 } 3971 }
4058 3972
4059 sbbp = xfs_getsb(mp, 0); 3973 sbbp = xfs_getsb(mp, 0);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 15f5dd22fbb2..3c97c6463a4e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -567,8 +567,6 @@ xfs_readsb(xfs_mount_t *mp, int flags)
567STATIC void 567STATIC void
568xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) 568xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
569{ 569{
570 int i;
571
572 mp->m_agfrotor = mp->m_agirotor = 0; 570 mp->m_agfrotor = mp->m_agirotor = 0;
573 spin_lock_init(&mp->m_agirotor_lock); 571 spin_lock_init(&mp->m_agirotor_lock);
574 mp->m_maxagi = mp->m_sb.sb_agcount; 572 mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -577,12 +575,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
577 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; 575 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
578 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; 576 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
579 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; 577 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
580 mp->m_litino = sbp->sb_inodesize - 578 mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
581 ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
582 mp->m_blockmask = sbp->sb_blocksize - 1; 579 mp->m_blockmask = sbp->sb_blocksize - 1;
583 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; 580 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
584 mp->m_blockwmask = mp->m_blockwsize - 1; 581 mp->m_blockwmask = mp->m_blockwsize - 1;
585 INIT_LIST_HEAD(&mp->m_del_inodes);
586 582
587 /* 583 /*
588 * Setup for attributes, in case they get created. 584 * Setup for attributes, in case they get created.
@@ -605,24 +601,20 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
605 } 601 }
606 ASSERT(mp->m_attroffset < XFS_LITINO(mp)); 602 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
607 603
608 for (i = 0; i < 2; i++) { 604 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
609 mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 605 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
610 xfs_alloc, i == 0); 606 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
611 mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, 607 mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
612 xfs_alloc, i == 0); 608
613 } 609 mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
614 for (i = 0; i < 2; i++) { 610 mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
615 mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 611 mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
616 xfs_bmbt, i == 0); 612 mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
617 mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, 613
618 xfs_bmbt, i == 0); 614 mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
619 } 615 mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
620 for (i = 0; i < 2; i++) { 616 mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
621 mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 617 mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
622 xfs_inobt, i == 0);
623 mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
624 xfs_inobt, i == 0);
625 }
626 618
627 mp->m_bsize = XFS_FSB_TO_BB(mp, 1); 619 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
628 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, 620 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
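
The unrolled assignments above all follow one convention: index 0 holds the leaf-block record limit, index 1 the node-block limit, and the minimum is always half the maximum (the half-full b-tree invariant). As a sketch, with maxrecs standing in for the per-btree helpers such as xfs_allocbt_maxrecs():

	static void fill_limits(unsigned int blocksize,
				unsigned int (*maxrecs)(unsigned int, int),
				unsigned int mxr[2], unsigned int mnr[2])
	{
		mxr[0] = maxrecs(blocksize, 1);	/* leaf blocks */
		mxr[1] = maxrecs(blocksize, 0);	/* node blocks */
		mnr[0] = mxr[0] / 2;		/* keep blocks half full */
		mnr[1] = mxr[1] / 2;
	}
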
@@ -1228,6 +1220,16 @@ xfs_unmountfs(
1228 __uint64_t resblks; 1220 __uint64_t resblks;
1229 int error; 1221 int error;
1230 1222
1223 /*
1224 * Release the dquots that the root inode, rbmino and rsumino might be holding,
1225 * and release the quota inodes.
1226 */
1227 XFS_QM_UNMOUNT(mp);
1228
1229 if (mp->m_rbmip)
1230 IRELE(mp->m_rbmip);
1231 if (mp->m_rsumip)
1232 IRELE(mp->m_rsumip);
1231 IRELE(mp->m_rootip); 1233 IRELE(mp->m_rootip);
1232 1234
1233 /* 1235 /*
@@ -1241,7 +1243,7 @@ xfs_unmountfs(
1241 * need to force the log first. 1243 * need to force the log first.
1242 */ 1244 */
1243 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1245 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1244 xfs_iflush_all(mp); 1246 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
1245 1247
1246 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 1248 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
1247 1249
@@ -1288,11 +1290,6 @@ xfs_unmountfs(
1288 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1290 xfs_unmountfs_wait(mp); /* wait for async bufs */
1289 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1291 xfs_log_unmount(mp); /* Done! No more fs ops. */
1290 1292
1291 /*
1292 * All inodes from this mount point should be freed.
1293 */
1294 ASSERT(mp->m_inodes == NULL);
1295
1296 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
1297 uuid_table_remove(&mp->m_sb.sb_uuid); 1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1298 1295
@@ -1365,24 +1362,6 @@ xfs_log_sbcount(
1365 return error; 1362 return error;
1366} 1363}
1367 1364
1368STATIC void
1369xfs_mark_shared_ro(
1370 xfs_mount_t *mp,
1371 xfs_buf_t *bp)
1372{
1373 xfs_dsb_t *sb = XFS_BUF_TO_SBP(bp);
1374 __uint16_t version;
1375
1376 if (!(sb->sb_flags & XFS_SBF_READONLY))
1377 sb->sb_flags |= XFS_SBF_READONLY;
1378
1379 version = be16_to_cpu(sb->sb_versionnum);
1380 if ((version & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4 ||
1381 !(version & XFS_SB_VERSION_SHAREDBIT))
1382 version |= XFS_SB_VERSION_SHAREDBIT;
1383 sb->sb_versionnum = cpu_to_be16(version);
1384}
1385
1386int 1365int
1387xfs_unmountfs_writesb(xfs_mount_t *mp) 1366xfs_unmountfs_writesb(xfs_mount_t *mp)
1388{ 1367{
@@ -1398,12 +1377,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1398 1377
1399 sbp = xfs_getsb(mp, 0); 1378 sbp = xfs_getsb(mp, 0);
1400 1379
1401 /*
1402 * mark shared-readonly if desired
1403 */
1404 if (mp->m_mk_sharedro)
1405 xfs_mark_shared_ro(mp, sbp);
1406
1407 XFS_BUF_UNDONE(sbp); 1380 XFS_BUF_UNDONE(sbp);
1408 XFS_BUF_UNREAD(sbp); 1381 XFS_BUF_UNREAD(sbp);
1409 XFS_BUF_UNDELAYWRITE(sbp); 1382 XFS_BUF_UNDELAYWRITE(sbp);
@@ -1415,8 +1388,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1415 if (error) 1388 if (error)
1416 xfs_ioerror_alert("xfs_unmountfs_writesb", 1389 xfs_ioerror_alert("xfs_unmountfs_writesb",
1417 mp, sbp, XFS_BUF_ADDR(sbp)); 1390 mp, sbp, XFS_BUF_ADDR(sbp));
1418 if (error && mp->m_mk_sharedro)
1419 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1420 xfs_buf_relse(sbp); 1391 xfs_buf_relse(sbp);
1421 } 1392 }
1422 return error; 1393 return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f3c1024b1241..c1e028467327 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21
22typedef struct xfs_trans_reservations { 21typedef struct xfs_trans_reservations {
23 uint tr_write; /* extent alloc trans */ 22 uint tr_write; /* extent alloc trans */
24 uint tr_itruncate; /* truncate trans */ 23 uint tr_itruncate; /* truncate trans */
@@ -44,14 +43,16 @@ typedef struct xfs_trans_reservations {
44} xfs_trans_reservations_t; 43} xfs_trans_reservations_t;
45 44
46#ifndef __KERNEL__ 45#ifndef __KERNEL__
47/* 46
48 * Moved here from xfs_ag.h to avoid reordering header files
49 */
50#define XFS_DADDR_TO_AGNO(mp,d) \ 47#define XFS_DADDR_TO_AGNO(mp,d) \
51 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) 48 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
52#define XFS_DADDR_TO_AGBNO(mp,d) \ 49#define XFS_DADDR_TO_AGBNO(mp,d) \
53 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) 50 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
54#else 51
52#else /* __KERNEL__ */
53
54#include "xfs_sync.h"
55
55struct cred; 56struct cred;
56struct log; 57struct log;
57struct xfs_mount_args; 58struct xfs_mount_args;
@@ -62,6 +63,7 @@ struct xfs_extdelta;
62struct xfs_swapext; 63struct xfs_swapext;
63struct xfs_mru_cache; 64struct xfs_mru_cache;
64struct xfs_nameops; 65struct xfs_nameops;
66struct xfs_ail;
65 67
66/* 68/*
67 * Prototypes and functions for the Data Migration subsystem. 69 * Prototypes and functions for the Data Migration subsystem.
@@ -115,7 +117,7 @@ struct xfs_quotainfo;
115 117
116typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); 118typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
117typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint); 119typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
118typedef int (*xfs_qmunmount_t)(struct xfs_mount *); 120typedef void (*xfs_qmunmount_t)(struct xfs_mount *);
119typedef void (*xfs_qmdone_t)(struct xfs_mount *); 121typedef void (*xfs_qmdone_t)(struct xfs_mount *);
120typedef void (*xfs_dqrele_t)(struct xfs_dquot *); 122typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
121typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint); 123typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint);
@@ -132,7 +134,7 @@ typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
132 struct xfs_dquot **, struct xfs_dquot *); 134 struct xfs_dquot **, struct xfs_dquot *);
133typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, 135typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
134 struct xfs_dquot *, struct xfs_dquot *, uint); 136 struct xfs_dquot *, struct xfs_dquot *, uint);
135typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *); 137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
136typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); 138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
137typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t); 139typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
138 140
@@ -223,18 +225,10 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
223#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 225#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
224#endif 226#endif
225 227
226typedef struct xfs_ail {
227 struct list_head xa_ail;
228 uint xa_gen;
229 struct task_struct *xa_task;
230 xfs_lsn_t xa_target;
231} xfs_ail_t;
232
233typedef struct xfs_mount { 228typedef struct xfs_mount {
234 struct super_block *m_super; 229 struct super_block *m_super;
235 xfs_tid_t m_tid; /* next unused tid for fs */ 230 xfs_tid_t m_tid; /* next unused tid for fs */
236 spinlock_t m_ail_lock; /* fs AIL mutex */ 231 struct xfs_ail *m_ail; /* fs active log item list */
237 xfs_ail_t m_ail; /* fs active log item list */
238 xfs_sb_t m_sb; /* copy of fs superblock */ 232 xfs_sb_t m_sb; /* copy of fs superblock */
239 spinlock_t m_sb_lock; /* sb counter lock */ 233 spinlock_t m_sb_lock; /* sb counter lock */
240 struct xfs_buf *m_sb_bp; /* buffer for superblock */ 234 struct xfs_buf *m_sb_bp; /* buffer for superblock */
@@ -247,10 +241,6 @@ typedef struct xfs_mount {
247 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ 241 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
248 spinlock_t m_agirotor_lock;/* .. and lock protecting it */ 242 spinlock_t m_agirotor_lock;/* .. and lock protecting it */
249 xfs_agnumber_t m_maxagi; /* highest inode alloc group */ 243 xfs_agnumber_t m_maxagi; /* highest inode alloc group */
250 struct xfs_inode *m_inodes; /* active inode list */
251 struct list_head m_del_inodes; /* inodes to reclaim */
252 mutex_t m_ilock; /* inode list mutex */
253 uint m_ireclaims; /* count of calls to reclaim*/
254 uint m_readio_log; /* min read size log bytes */ 244 uint m_readio_log; /* min read size log bytes */
255 uint m_readio_blocks; /* min read size blocks */ 245 uint m_readio_blocks; /* min read size blocks */
256 uint m_writeio_log; /* min write size log bytes */ 246 uint m_writeio_log; /* min write size log bytes */
@@ -267,7 +257,6 @@ typedef struct xfs_mount {
267 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ 257 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
268 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ 258 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
269 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ 259 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
270 __uint8_t m_dircook_elog; /* log d-cookie entry bits */
271 __uint8_t m_blkbit_log; /* blocklog + NBBY */ 260 __uint8_t m_blkbit_log; /* blocklog + NBBY */
272 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ 261 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
273 __uint8_t m_agno_log; /* log #ag's */ 262 __uint8_t m_agno_log; /* log #ag's */
@@ -276,12 +265,12 @@ typedef struct xfs_mount {
276 uint m_blockmask; /* sb_blocksize-1 */ 265 uint m_blockmask; /* sb_blocksize-1 */
277 uint m_blockwsize; /* sb_blocksize in words */ 266 uint m_blockwsize; /* sb_blocksize in words */
278 uint m_blockwmask; /* blockwsize-1 */ 267 uint m_blockwmask; /* blockwsize-1 */
279 uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */ 268 uint m_alloc_mxr[2]; /* max alloc btree records */
280 uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */ 269 uint m_alloc_mnr[2]; /* min alloc btree records */
281 uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */ 270 uint m_bmap_dmxr[2]; /* max bmap btree records */
282 uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */ 271 uint m_bmap_dmnr[2]; /* min bmap btree records */
283 uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */ 272 uint m_inobt_mxr[2]; /* max inobt btree records */
284 uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */ 273 uint m_inobt_mnr[2]; /* min inobt btree records */
285 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 274 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
286 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 275 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
287 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ 276 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */
@@ -312,9 +301,6 @@ typedef struct xfs_mount {
312 int m_sinoalign; /* stripe unit inode alignment */ 301 int m_sinoalign; /* stripe unit inode alignment */
313 int m_attr_magicpct;/* 37% of the blocksize */ 302 int m_attr_magicpct;/* 37% of the blocksize */
314 int m_dir_magicpct; /* 37% of the dir blocksize */ 303 int m_dir_magicpct; /* 37% of the dir blocksize */
315 __uint8_t m_mk_sharedro; /* mark shared ro on unmount */
316 __uint8_t m_inode_quiesce;/* call quiesce on new inodes.
317 field governed by m_ilock */
318 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ 304 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
319 const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ 305 const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
320 int m_dirblksize; /* directory block sz--bytes */ 306 int m_dirblksize; /* directory block sz--bytes */
@@ -362,7 +348,6 @@ typedef struct xfs_mount {
362#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ 348#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
363#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ 349#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
364#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ 350#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
365#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */
366#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ 351#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
367#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ 352#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
368 /* osyncisdsync is now default*/ 353 /* osyncisdsync is now default*/
@@ -439,6 +424,16 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
439#define xfs_force_shutdown(m,f) \ 424#define xfs_force_shutdown(m,f) \
440 xfs_do_force_shutdown(m, f, __FILE__, __LINE__) 425 xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
441 426
427#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
428#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
429#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
430#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
431#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
432#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
433
434#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
435#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
436
442/* 437/*
443 * Flags for xfs_mountfs 438 * Flags for xfs_mountfs
444 */ 439 */
@@ -508,14 +503,12 @@ typedef struct xfs_mod_sb {
508#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) 503#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
509#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) 504#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
510 505
511extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
512extern int xfs_log_sbcount(xfs_mount_t *, uint); 506extern int xfs_log_sbcount(xfs_mount_t *, uint);
513extern int xfs_mountfs(xfs_mount_t *mp); 507extern int xfs_mountfs(xfs_mount_t *mp);
514extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 508extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
515 509
516extern void xfs_unmountfs(xfs_mount_t *); 510extern void xfs_unmountfs(xfs_mount_t *);
517extern int xfs_unmountfs_writesb(xfs_mount_t *); 511extern int xfs_unmountfs_writesb(xfs_mount_t *);
518extern int xfs_unmount_flush(xfs_mount_t *, int);
519extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 512extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
520extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, 513extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
521 int64_t, int); 514 int64_t, int);
@@ -525,20 +518,20 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
525extern int xfs_readsb(xfs_mount_t *, int); 518extern int xfs_readsb(xfs_mount_t *, int);
526extern void xfs_freesb(xfs_mount_t *); 519extern void xfs_freesb(xfs_mount_t *);
527extern int xfs_fs_writable(xfs_mount_t *); 520extern int xfs_fs_writable(xfs_mount_t *);
528extern int xfs_syncsub(xfs_mount_t *, int, int *);
529extern int xfs_sync_inodes(xfs_mount_t *, int, int *);
530extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
531extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
532extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
533extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); 521extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
534 522
535extern int xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *); 523extern int xfs_dmops_get(struct xfs_mount *);
536extern void xfs_dmops_put(struct xfs_mount *); 524extern void xfs_dmops_put(struct xfs_mount *);
537extern int xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *); 525extern int xfs_qmops_get(struct xfs_mount *);
538extern void xfs_qmops_put(struct xfs_mount *); 526extern void xfs_qmops_put(struct xfs_mount *);
539 527
540extern struct xfs_dmops xfs_dmcore_xfs; 528extern struct xfs_dmops xfs_dmcore_xfs;
541 529
542#endif /* __KERNEL__ */ 530#endif /* __KERNEL__ */
543 531
532extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
533extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
534extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
535extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
536
544#endif /* __XFS_MOUNT_H__ */ 537#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index a294e58db8dd..27f80581520a 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -28,7 +28,6 @@
28#include "xfs_mount.h" 28#include "xfs_mount.h"
29#include "xfs_quota.h" 29#include "xfs_quota.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_clnt.h"
32 31
33 32
34STATIC struct xfs_dquot * 33STATIC struct xfs_dquot *
@@ -131,9 +130,9 @@ static struct xfs_qmops xfs_qmcore_stub = {
131}; 130};
132 131
133int 132int
134xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) 133xfs_qmops_get(struct xfs_mount *mp)
135{ 134{
136 if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) { 135 if (XFS_IS_QUOTA_RUNNING(mp)) {
137#ifdef CONFIG_XFS_QUOTA 136#ifdef CONFIG_XFS_QUOTA
138 mp->m_qm_ops = &xfs_qmcore_xfs; 137 mp->m_qm_ops = &xfs_qmcore_xfs;
139#else 138#else
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 12c4ec775af8..48965ecaa155 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -84,11 +84,9 @@ typedef struct xfs_dqblk {
84#define XFS_DQ_USER 0x0001 /* a user quota */ 84#define XFS_DQ_USER 0x0001 /* a user quota */
85#define XFS_DQ_PROJ 0x0002 /* project quota */ 85#define XFS_DQ_PROJ 0x0002 /* project quota */
86#define XFS_DQ_GROUP 0x0004 /* a group quota */ 86#define XFS_DQ_GROUP 0x0004 /* a group quota */
87#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */ 87#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
88#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */ 88#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */
89#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */ 89#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */
90#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */
91#define XFS_DQ_MARKER 0x0080 /* sentinel */
92 90
93#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) 91#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
94 92
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index c903130be7fd..86471bb40fd4 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -42,31 +42,6 @@
42 42
43 43
44/* 44/*
45 * Given an array of up to 4 inode pointers, unlock the pointed to inodes.
46 * If there are fewer than 4 entries in the array, the empty entries will
47 * be at the end and will have NULL pointers in them.
48 */
49STATIC void
50xfs_rename_unlock4(
51 xfs_inode_t **i_tab,
52 uint lock_mode)
53{
54 int i;
55
56 xfs_iunlock(i_tab[0], lock_mode);
57 for (i = 1; i < 4; i++) {
58 if (i_tab[i] == NULL)
59 break;
60
61 /*
62 * Watch out for duplicate entries in the table.
63 */
64 if (i_tab[i] != i_tab[i-1])
65 xfs_iunlock(i_tab[i], lock_mode);
66 }
67}
68
69/*
70 * Enter all inodes for a rename transaction into a sorted array. 45 * Enter all inodes for a rename transaction into a sorted array.
71 */ 46 */
72STATIC void 47STATIC void
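
Sorting the rename inodes before locking them is a deadlock-avoidance device: if every task takes inode locks in ascending inode-number order, two concurrent renames can never each hold a lock the other needs. A userspace illustration of the ordering step (toy types; the kernel sorts up to four xfs_inode pointers the same way):

	#include <stdlib.h>

	struct toy_inode {
		unsigned long long	ino;
	};

	/* qsort comparator: ascending inode number fixes the lock order. */
	static int by_ino(const void *a, const void *b)
	{
		const struct toy_inode *x = *(const struct toy_inode * const *)a;
		const struct toy_inode *y = *(const struct toy_inode * const *)b;

		return (x->ino > y->ino) - (x->ino < y->ino);
	}

	/* usage: qsort(i_tab, num, sizeof(i_tab[0]), by_ino); then take
	 * the locks in array order. */
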
@@ -205,19 +180,6 @@ xfs_rename(
205 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 180 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
206 181
207 /* 182 /*
208 * If we are using project inheritance, we only allow renames
209 * into our tree when the project IDs are the same; else the
210 * tree quota mechanism would be circumvented.
211 */
212 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
213 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
214 error = XFS_ERROR(EXDEV);
215 xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
216 xfs_trans_cancel(tp, cancel_flags);
217 goto std_return;
218 }
219
220 /*
221 * Join all the inodes to the transaction. From this point on, 183 * Join all the inodes to the transaction. From this point on,
222 * we can rely on either trans_commit or trans_cancel to unlock 184 * we can rely on either trans_commit or trans_cancel to unlock
223 * them. Note that we need to add a vnode reference to the 185 * them. Note that we need to add a vnode reference to the
@@ -242,6 +204,17 @@ xfs_rename(
242 } 204 }
243 205
244 /* 206 /*
207 * If we are using project inheritance, we only allow renames
208 * into our tree when the project IDs are the same; else the
209 * tree quota mechanism would be circumvented.
210 */
211 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
212 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
213 error = XFS_ERROR(EXDEV);
214 goto error_return;
215 }
216
217 /*
245 * Set up the target. 218 * Set up the target.
246 */ 219 */
247 if (target_ip == NULL) { 220 if (target_ip == NULL) {
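
Moving the project-ID check below the join is what makes the shorter error path legal: once the inodes are joined to the transaction, xfs_trans_cancel() unlocks them, so the hand-rolled unlock helper can go. The two shapes, taken from the hunks above:

	/* before the move: unlock by hand, then cancel */
	error = XFS_ERROR(EXDEV);
	xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
	xfs_trans_cancel(tp, cancel_flags);
	goto std_return;

	/* after the move: cancel unlocks the joined inodes */
	error = XFS_ERROR(EXDEV);
	goto error_return;
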
@@ -367,19 +340,11 @@ xfs_rename(
367 &first_block, &free_list, spaceres); 340 &first_block, &free_list, spaceres);
368 if (error) 341 if (error)
369 goto abort_return; 342 goto abort_return;
370 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
371 343
372 /* 344 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
373 * Update the generation counts on all the directory inodes
374 * that we're modifying.
375 */
376 src_dp->i_gen++;
377 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 345 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
378 346 if (new_parent)
379 if (new_parent) {
380 target_dp->i_gen++;
381 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 347 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
382 }
383 348
384 /* 349 /*
385 * If this is a synchronous mount, make sure that the 350 * If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e2f68de16159..edf12c7b834c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -85,7 +85,6 @@ xfs_growfs_rt_alloc(
85{ 85{
86 xfs_fileoff_t bno; /* block number in file */ 86 xfs_fileoff_t bno; /* block number in file */
87 xfs_buf_t *bp; /* temporary buffer for zeroing */ 87 xfs_buf_t *bp; /* temporary buffer for zeroing */
88 int cancelflags; /* flags for xfs_trans_cancel */
89 int committed; /* transaction committed flag */ 88 int committed; /* transaction committed flag */
90 xfs_daddr_t d; /* disk block address */ 89 xfs_daddr_t d; /* disk block address */
91 int error; /* error return value */ 90 int error; /* error return value */
@@ -96,15 +95,16 @@ xfs_growfs_rt_alloc(
96 xfs_bmbt_irec_t map; /* block map output */ 95 xfs_bmbt_irec_t map; /* block map output */
97 int nmap; /* number of block maps */ 96 int nmap; /* number of block maps */
98 int resblks; /* space reservation */ 97 int resblks; /* space reservation */
99 xfs_trans_t *tp; /* transaction pointer */
100 98
101 /* 99 /*
102 * Allocate space to the file, as necessary. 100 * Allocate space to the file, as necessary.
103 */ 101 */
104 while (oblocks < nblocks) { 102 while (oblocks < nblocks) {
103 int cancelflags = 0;
104 xfs_trans_t *tp;
105
105 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); 106 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
106 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); 107 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
107 cancelflags = 0;
108 /* 108 /*
109 * Reserve space & log for one extent added to the file. 109 * Reserve space & log for one extent added to the file.
110 */ 110 */
@@ -171,7 +171,9 @@ xfs_growfs_rt_alloc(
171 mp->m_bsize, 0); 171 mp->m_bsize, 0);
172 if (bp == NULL) { 172 if (bp == NULL) {
173 error = XFS_ERROR(EIO); 173 error = XFS_ERROR(EIO);
174 goto error_cancel; 174error_cancel:
175 xfs_trans_cancel(tp, cancelflags);
176 goto error;
175 } 177 }
176 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); 178 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
177 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); 179 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
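
The error_cancel label above lands inside the if body, which is legal C: later failure sites jump back into that block so the cancellation runs exactly once. The shape of the idiom, with hypothetical helpers:

	static int step1(void), step2(void);	/* hypothetical */
	static void cancel(void);		/* hypothetical */

	static int do_work(void)
	{
		int error;

		if (step1() != 0) {
			error = -1;
	error_cancel:
			cancel();		/* single cancellation point */
			return error;
		}
		if (step2() != 0) {
			error = -2;
			goto error_cancel;	/* jumps back into the block */
		}
		return 0;
	}
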
@@ -188,8 +190,6 @@ xfs_growfs_rt_alloc(
188 oblocks = map.br_startoff + map.br_blockcount; 190 oblocks = map.br_startoff + map.br_blockcount;
189 } 191 }
190 return 0; 192 return 0;
191error_cancel:
192 xfs_trans_cancel(tp, cancelflags);
193error: 193error:
194 return error; 194 return error;
195} 195}
@@ -1856,7 +1856,6 @@ xfs_growfs_rt(
1856{ 1856{
1857 xfs_rtblock_t bmbno; /* bitmap block number */ 1857 xfs_rtblock_t bmbno; /* bitmap block number */
1858 xfs_buf_t *bp; /* temporary buffer */ 1858 xfs_buf_t *bp; /* temporary buffer */
1859 int cancelflags; /* flags for xfs_trans_cancel */
1860 int error; /* error return value */ 1859 int error; /* error return value */
1861 xfs_inode_t *ip; /* bitmap inode, used as lock */ 1860 xfs_inode_t *ip; /* bitmap inode, used as lock */
1862 xfs_mount_t *nmp; /* new (fake) mount structure */ 1861 xfs_mount_t *nmp; /* new (fake) mount structure */
@@ -1872,13 +1871,13 @@ xfs_growfs_rt(
1872 xfs_extlen_t rsumblocks; /* current number of rt summary blks */ 1871 xfs_extlen_t rsumblocks; /* current number of rt summary blks */
1873 xfs_sb_t *sbp; /* old superblock */ 1872 xfs_sb_t *sbp; /* old superblock */
1874 xfs_fsblock_t sumbno; /* summary block number */ 1873 xfs_fsblock_t sumbno; /* summary block number */
1875 xfs_trans_t *tp; /* transaction pointer */
1876 1874
1877 sbp = &mp->m_sb; 1875 sbp = &mp->m_sb;
1878 cancelflags = 0;
1879 /* 1876 /*
1880 * Initial error checking. 1877 * Initial error checking.
1881 */ 1878 */
1879 if (!capable(CAP_SYS_ADMIN))
1880 return XFS_ERROR(EPERM);
1882 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || 1881 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
1883 (nrblocks = in->newblocks) <= sbp->sb_rblocks || 1882 (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
1884 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) 1883 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
@@ -1942,6 +1941,9 @@ xfs_growfs_rt(
1942 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); 1941 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
1943 bmbno < nrbmblocks; 1942 bmbno < nrbmblocks;
1944 bmbno++) { 1943 bmbno++) {
1944 xfs_trans_t *tp;
1945 int cancelflags = 0;
1946
1945 *nmp = *mp; 1947 *nmp = *mp;
1946 nsbp = &nmp->m_sb; 1948 nsbp = &nmp->m_sb;
1947 /* 1949 /*
@@ -1967,16 +1969,15 @@ xfs_growfs_rt(
1967 * Start a transaction, get the log reservation. 1969 * Start a transaction, get the log reservation.
1968 */ 1970 */
1969 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); 1971 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
1970 cancelflags = 0;
1971 if ((error = xfs_trans_reserve(tp, 0, 1972 if ((error = xfs_trans_reserve(tp, 0,
1972 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) 1973 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
1973 break; 1974 goto error_cancel;
1974 /* 1975 /*
1975 * Lock out other callers by grabbing the bitmap inode lock. 1976 * Lock out other callers by grabbing the bitmap inode lock.
1976 */ 1977 */
1977 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1978 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
1978 XFS_ILOCK_EXCL, &ip))) 1979 XFS_ILOCK_EXCL, &ip)))
1979 break; 1980 goto error_cancel;
1980 ASSERT(ip == mp->m_rbmip); 1981 ASSERT(ip == mp->m_rbmip);
1981 /* 1982 /*
1982 * Update the bitmap inode's size. 1983 * Update the bitmap inode's size.
@@ -1990,7 +1991,7 @@ xfs_growfs_rt(
1990 */ 1991 */
1991 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1992 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
1992 XFS_ILOCK_EXCL, &ip))) 1993 XFS_ILOCK_EXCL, &ip)))
1993 break; 1994 goto error_cancel;
1994 ASSERT(ip == mp->m_rsumip); 1995 ASSERT(ip == mp->m_rsumip);
1995 /* 1996 /*
1996 * Update the summary inode's size. 1997 * Update the summary inode's size.
@@ -2005,7 +2006,7 @@ xfs_growfs_rt(
2005 mp->m_rsumlevels != nmp->m_rsumlevels) { 2006 mp->m_rsumlevels != nmp->m_rsumlevels) {
2006 error = xfs_rtcopy_summary(mp, nmp, tp); 2007 error = xfs_rtcopy_summary(mp, nmp, tp);
2007 if (error) 2008 if (error)
2008 break; 2009 goto error_cancel;
2009 } 2010 }
2010 /* 2011 /*
2011 * Update superblock fields. 2012 * Update superblock fields.
@@ -2031,8 +2032,11 @@ xfs_growfs_rt(
2031 bp = NULL; 2032 bp = NULL;
2032 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, 2033 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
2033 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); 2034 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
2034 if (error) 2035 if (error) {
2036error_cancel:
2037 xfs_trans_cancel(tp, cancelflags);
2035 break; 2038 break;
2039 }
2036 /* 2040 /*
2037 * Mark more blocks free in the superblock. 2041 * Mark more blocks free in the superblock.
2038 */ 2042 */
@@ -2045,15 +2049,10 @@ xfs_growfs_rt(
2045 mp->m_rsumsize = nrsumsize; 2049 mp->m_rsumsize = nrsumsize;
2046 2050
2047 error = xfs_trans_commit(tp, 0); 2051 error = xfs_trans_commit(tp, 0);
2048 if (error) { 2052 if (error)
2049 tp = NULL;
2050 break; 2053 break;
2051 }
2052 } 2054 }
2053 2055
2054 if (error && tp)
2055 xfs_trans_cancel(tp, cancelflags);
2056
2057 /* 2056 /*
2058 * Free the fake mp structure. 2057 * Free the fake mp structure.
2059 */ 2058 */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 3a82576dde9a..36f3a21c54d2 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -406,7 +406,7 @@ xfs_bwrite(
406 * XXXsup how does this work for quotas. 406 * XXXsup how does this work for quotas.
407 */ 407 */
408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
409 XFS_BUF_SET_FSPRIVATE3(bp, mp); 409 bp->b_mount = mp;
410 XFS_BUF_WRITE(bp); 410 XFS_BUF_WRITE(bp);
411 411
412 if ((error = XFS_bwrite(bp))) { 412 if ((error = XFS_bwrite(bp))) {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 3f8cf1587f4c..1ed71916e4c9 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -79,6 +79,7 @@ struct xfs_mount;
79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ 79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
82 83
83#define XFS_SB_VERSION2_OKREALFBITS \ 84#define XFS_SB_VERSION2_OKREALFBITS \
84 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -296,30 +297,34 @@ typedef enum {
296 297
297#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) 298#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
298 299
299#ifdef __KERNEL__
300static inline int xfs_sb_good_version(xfs_sb_t *sbp) 300static inline int xfs_sb_good_version(xfs_sb_t *sbp)
301{ 301{
302 return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ 302 /* We always support versions 1-3 */
303 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ 303 if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
304 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 304 sbp->sb_versionnum <= XFS_SB_VERSION_3)
305 !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ 305 return 1;
306 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ 306
307 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ 307 /* We support version 4 if all feature bits are supported */
308 (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))); 308 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
309} 309 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
310 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
311 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
312 return 0;
313
314#ifdef __KERNEL__
315 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
316 return 0;
310#else 317#else
311static inline int xfs_sb_good_version(xfs_sb_t *sbp) 318 if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
312{ 319 sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
313 return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ 320 return 0;
314 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ 321#endif
315 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 322
316 !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ 323 return 1;
317 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ 324 }
318 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ 325
319 (!(sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \ 326 return 0;
320 (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))));
321} 327}
322#endif /* __KERNEL__ */
323 328
324/* 329/*
325 * Detect a mismatched features2 field. Older kernels read/wrote 330 * Detect a mismatched features2 field. Older kernels read/wrote
@@ -332,123 +337,127 @@ static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
332 337
333static inline unsigned xfs_sb_version_tonew(unsigned v) 338static inline unsigned xfs_sb_version_tonew(unsigned v)
334{ 339{
335 return ((((v) == XFS_SB_VERSION_1) ? \ 340 if (v == XFS_SB_VERSION_1)
336 0 : \ 341 return XFS_SB_VERSION_4;
337 (((v) == XFS_SB_VERSION_2) ? \ 342
338 XFS_SB_VERSION_ATTRBIT : \ 343 if (v == XFS_SB_VERSION_2)
339 (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \ 344 return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
340 XFS_SB_VERSION_4); 345
346 return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
347 XFS_SB_VERSION_NLINKBIT;
341} 348}
342 349
343static inline unsigned xfs_sb_version_toold(unsigned v) 350static inline unsigned xfs_sb_version_toold(unsigned v)
344{ 351{
345 return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \ 352 if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
346 0 : \ 353 return 0;
347 (((v) & XFS_SB_VERSION_NLINKBIT) ? \ 354 if (v & XFS_SB_VERSION_NLINKBIT)
348 XFS_SB_VERSION_3 : \ 355 return XFS_SB_VERSION_3;
349 (((v) & XFS_SB_VERSION_ATTRBIT) ? \ 356 if (v & XFS_SB_VERSION_ATTRBIT)
350 XFS_SB_VERSION_2 : \ 357 return XFS_SB_VERSION_2;
351 XFS_SB_VERSION_1))); 358 return XFS_SB_VERSION_1;
352} 359}
353 360
354static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) 361static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
355{ 362{
356 return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \ 363 return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
357 ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ 364 sbp->sb_versionnum == XFS_SB_VERSION_3 ||
358 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 365 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
359 ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); 366 (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
360} 367}
361 368
362static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) 369static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
363{ 370{
364 (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \ 371 if (sbp->sb_versionnum == XFS_SB_VERSION_1)
365 XFS_SB_VERSION_2 : \ 372 sbp->sb_versionnum = XFS_SB_VERSION_2;
366 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \ 373 else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
367 ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \ 374 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
368 (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))); 375 else
376 sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
369} 377}
370 378
371static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) 379static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
372{ 380{
373 return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ 381 return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
374 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 382 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
375 ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); 383 (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
376} 384}
377 385
378static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) 386static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
379{ 387{
380 (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \ 388 if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
381 XFS_SB_VERSION_3 : \ 389 sbp->sb_versionnum = XFS_SB_VERSION_3;
382 ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)); 390 else
391 sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
383} 392}
384 393
385static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) 394static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
386{ 395{
387 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 396 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
388 ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT); 397 (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
389} 398}
390 399
391static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) 400static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
392{ 401{
393 (sbp)->sb_versionnum = \ 402 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
394 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \ 403 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
395 ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \ 404 else
396 (xfs_sb_version_tonew((sbp)->sb_versionnum) | \ 405 sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
397 XFS_SB_VERSION_QUOTABIT)); 406 XFS_SB_VERSION_QUOTABIT;
398} 407}
399 408
400static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) 409static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
401{ 410{
402 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 411 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
403 ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); 412 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
404} 413}
405 414
406static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) 415static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
407{ 416{
408 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 417 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
409 ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); 418 (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
410} 419}
411 420
412static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) 421static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
413{ 422{
414 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 423 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
415 ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); 424 (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
416} 425}
417 426
418static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) 427static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
419{ 428{
420 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 429 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
421 ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); 430 (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
422} 431}
423 432
424static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) 433static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
425{ 434{
426 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 435 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
427 ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); 436 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
428} 437}
429 438
430static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) 439static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
431{ 440{
432 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 441 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
433 ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); 442 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
434} 443}
435 444
436static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) 445static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
437{ 446{
438 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 447 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
439 ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); 448 (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
440} 449}
441 450
442static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) 451static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
443{ 452{
444 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 453 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
445 (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); 454 (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
446} 455}
447 456
448static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) 457static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
449{ 458{
450 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 459 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
451 ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); 460 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
452} 461}
453 462
454/* 463/*
@@ -463,22 +472,20 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
463 472
464static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) 473static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
465{ 474{
466 return (xfs_sb_version_hasmorebits(sbp) && \ 475 return xfs_sb_version_hasmorebits(sbp) &&
467 ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); 476 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
468} 477}
469 478
470static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) 479static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
471{ 480{
472 return (xfs_sb_version_hasmorebits(sbp)) && \ 481 return xfs_sb_version_hasmorebits(sbp) &&
473 ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); 482 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
474} 483}
475 484
476static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) 485static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
477{ 486{
478 ((sbp)->sb_versionnum = \ 487 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
479 ((sbp)->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT), \ 488 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
480 ((sbp)->sb_features2 = \
481 ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT)));
482} 489}
483 490
484static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) 491static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
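The rewritten predicates in the hunk above all share one shape: confirm the superblock is a v4 superblock, then test a feature bit. Below is a minimal user-space sketch of that shape; the constants and struct are simplified stand-ins, not the real xfs_sb_t layout or bit values.

#include <stdio.h>

/* Illustrative stand-ins only -- not the real XFS values or struct. */
#define SB_VERSION_NUMBITS	0x000f
#define SB_VERSION_4		4
#define SB_VERSION_QUOTABIT	0x0040

struct sb {
	unsigned short	sb_versionnum;
};

static int sb_version_num(const struct sb *sbp)
{
	return sbp->sb_versionnum & SB_VERSION_NUMBITS;
}

/* Same shape as the rewritten xfs_sb_version_hasquota(): v4 plus a bit. */
static int sb_version_hasquota(const struct sb *sbp)
{
	return sb_version_num(sbp) == SB_VERSION_4 &&
	       (sbp->sb_versionnum & SB_VERSION_QUOTABIT);
}

int main(void)
{
	struct sb sb = { .sb_versionnum = SB_VERSION_4 | SB_VERSION_QUOTABIT };

	printf("hasquota: %d\n", sb_version_hasquota(&sb) != 0);
	return 0;
}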
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 4e1c22a23be5..8570b826fedd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -290,7 +290,7 @@ xfs_trans_dup(
290 ASSERT(tp->t_ticket != NULL); 290 ASSERT(tp->t_ticket != NULL);
291 291
292 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); 292 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
293 ntp->t_ticket = tp->t_ticket; 293 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
294 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; 294 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
295 tp->t_blk_res = tp->t_blk_res_used; 295 tp->t_blk_res = tp->t_blk_res_used;
296 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; 296 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
@@ -1260,6 +1260,13 @@ xfs_trans_roll(
1260 trans = *tpp; 1260 trans = *tpp;
1261 1261
1262 /* 1262 /*
1263 * transaction commit worked ok so we can drop the extra ticket
1264 * reference that we gained in xfs_trans_dup()
1265 */
1266 xfs_log_ticket_put(trans->t_ticket);
1267
1268
1269 /*
 1263 * Reserve space in the log for the next transaction. 1270 * Reserve space in the log for the next transaction.
1264 * This also pushes items in the "AIL", the list of logged items, 1271 * This also pushes items in the "AIL", the list of logged items,
1265 * out to disk if they are taking up space at the tail of the log 1272 * out to disk if they are taking up space at the tail of the log
@@ -1383,11 +1390,12 @@ xfs_trans_chunk_committed(
1383 xfs_log_item_desc_t *lidp; 1390 xfs_log_item_desc_t *lidp;
1384 xfs_log_item_t *lip; 1391 xfs_log_item_t *lip;
1385 xfs_lsn_t item_lsn; 1392 xfs_lsn_t item_lsn;
1386 struct xfs_mount *mp;
1387 int i; 1393 int i;
1388 1394
1389 lidp = licp->lic_descs; 1395 lidp = licp->lic_descs;
1390 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1396 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1397 struct xfs_ail *ailp;
1398
1391 if (xfs_lic_isfree(licp, i)) { 1399 if (xfs_lic_isfree(licp, i)) {
1392 continue; 1400 continue;
1393 } 1401 }
@@ -1424,19 +1432,19 @@ xfs_trans_chunk_committed(
1424 * This would cause the earlier transaction to fail 1432 * This would cause the earlier transaction to fail
1425 * the test below. 1433 * the test below.
1426 */ 1434 */
1427 mp = lip->li_mountp; 1435 ailp = lip->li_ailp;
1428 spin_lock(&mp->m_ail_lock); 1436 spin_lock(&ailp->xa_lock);
1429 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { 1437 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1430 /* 1438 /*
1431 * This will set the item's lsn to item_lsn 1439 * This will set the item's lsn to item_lsn
1432 * and update the position of the item in 1440 * and update the position of the item in
1433 * the AIL. 1441 * the AIL.
1434 * 1442 *
1435 * xfs_trans_update_ail() drops the AIL lock. 1443 * xfs_trans_ail_update() drops the AIL lock.
1436 */ 1444 */
1437 xfs_trans_update_ail(mp, lip, item_lsn); 1445 xfs_trans_ail_update(ailp, lip, item_lsn);
1438 } else { 1446 } else {
1439 spin_unlock(&mp->m_ail_lock); 1447 spin_unlock(&ailp->xa_lock);
1440 } 1448 }
1441 1449
1442 /* 1450 /*
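The ticket hunks above replace a bare pointer copy with a counted reference: xfs_trans_dup() now takes a reference via xfs_log_ticket_get(), and xfs_trans_roll() drops it once the commit of the old transaction has succeeded. A toy sketch of that get/put discipline follows; the names are hypothetical and a plain int stands in for the kernel's atomic ticket refcount.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for the log ticket: just a refcount, no atomics. */
struct ticket {
	int	t_ref;
};

static struct ticket *ticket_get(struct ticket *t)
{
	t->t_ref++;		/* analogue of xfs_log_ticket_get() */
	return t;
}

static void ticket_put(struct ticket *t)
{
	if (--t->t_ref == 0)	/* analogue of xfs_log_ticket_put() */
		free(t);
}

int main(void)
{
	struct ticket *tic = malloc(sizeof(*tic));

	if (!tic)
		return 1;
	tic->t_ref = 1;			/* owned by the original transaction */

	/* xfs_trans_dup(): the duplicate takes its own reference... */
	struct ticket *dup = ticket_get(tic);

	/* ...so committing the original can drop one reference without
	 * freeing the ticket out from under the duplicate. */
	ticket_put(tic);
	printf("refs held after old commit: %d\n", dup->t_ref);

	/* xfs_trans_roll()/xfs_dir_ialloc(): drop the extra reference
	 * gained in the dup once the commit is known to have worked. */
	ticket_put(dup);
	return 0;
}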
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 74c80bd2b0ec..d6fe4a88d79f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_TRANS_H__ 18#ifndef __XFS_TRANS_H__
19#define __XFS_TRANS_H__ 19#define __XFS_TRANS_H__
20 20
21struct xfs_log_item;
22
21/* 23/*
22 * This is the structure written in the log at the head of 24 * This is the structure written in the log at the head of
23 * every transaction. It identifies the type and id of the 25 * every transaction. It identifies the type and id of the
@@ -98,76 +100,6 @@ typedef struct xfs_trans_header {
98#define XFS_TRANS_TYPE_MAX 41 100#define XFS_TRANS_TYPE_MAX 41
99/* new transaction types need to be reflected in xfs_logprint(8) */ 101/* new transaction types need to be reflected in xfs_logprint(8) */
100 102
101
102#ifdef __KERNEL__
103struct xfs_buf;
104struct xfs_buftarg;
105struct xfs_efd_log_item;
106struct xfs_efi_log_item;
107struct xfs_inode;
108struct xfs_item_ops;
109struct xfs_log_iovec;
110struct xfs_log_item;
111struct xfs_log_item_desc;
112struct xfs_mount;
113struct xfs_trans;
114struct xfs_dquot_acct;
115
116typedef struct xfs_log_item {
117 struct list_head li_ail; /* AIL pointers */
118 xfs_lsn_t li_lsn; /* last on-disk lsn */
119 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
120 struct xfs_mount *li_mountp; /* ptr to fs mount */
121 uint li_type; /* item type */
122 uint li_flags; /* misc flags */
123 struct xfs_log_item *li_bio_list; /* buffer item list */
124 void (*li_cb)(struct xfs_buf *,
125 struct xfs_log_item *);
126 /* buffer item iodone */
127 /* callback func */
128 struct xfs_item_ops *li_ops; /* function list */
129} xfs_log_item_t;
130
131#define XFS_LI_IN_AIL 0x1
132#define XFS_LI_ABORTED 0x2
133
134typedef struct xfs_item_ops {
135 uint (*iop_size)(xfs_log_item_t *);
136 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
137 void (*iop_pin)(xfs_log_item_t *);
138 void (*iop_unpin)(xfs_log_item_t *, int);
139 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
140 uint (*iop_trylock)(xfs_log_item_t *);
141 void (*iop_unlock)(xfs_log_item_t *);
142 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
143 void (*iop_push)(xfs_log_item_t *);
144 void (*iop_pushbuf)(xfs_log_item_t *);
145 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
146} xfs_item_ops_t;
147
148#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
149#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
150#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
151#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
152#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
153#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
154#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
155#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
156#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
157#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
158#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
159
160/*
161 * Return values for the IOP_TRYLOCK() routines.
162 */
163#define XFS_ITEM_SUCCESS 0
164#define XFS_ITEM_PINNED 1
165#define XFS_ITEM_LOCKED 2
166#define XFS_ITEM_FLUSHING 3
167#define XFS_ITEM_PUSHBUF 4
168
169#endif /* __KERNEL__ */
170
171/* 103/*
172 * This structure is used to track log items associated with 104 * This structure is used to track log items associated with
173 * a transaction. It points to the log item and keeps some 105 * a transaction. It points to the log item and keeps some
@@ -176,7 +108,7 @@ typedef struct xfs_item_ops {
176 * once we get to commit processing (see xfs_trans_commit()). 108 * once we get to commit processing (see xfs_trans_commit()).
177 */ 109 */
178typedef struct xfs_log_item_desc { 110typedef struct xfs_log_item_desc {
179 xfs_log_item_t *lid_item; 111 struct xfs_log_item *lid_item;
180 ushort lid_size; 112 ushort lid_size;
181 unsigned char lid_flags; 113 unsigned char lid_flags;
182 unsigned char lid_index; 114 unsigned char lid_index;
@@ -276,94 +208,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
276 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs)); 208 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
277} 209}
278 210
279#ifdef __KERNEL__
280/*
281 * This structure is used to maintain a list of block ranges that have been
282 * freed in the transaction. The ranges are listed in the perag[] busy list
283 * between when they're freed and the transaction is committed to disk.
284 */
285
286typedef struct xfs_log_busy_slot {
287 xfs_agnumber_t lbc_ag;
288 ushort lbc_idx; /* index in perag.busy[] */
289} xfs_log_busy_slot_t;
290
291#define XFS_LBC_NUM_SLOTS 31
292typedef struct xfs_log_busy_chunk {
293 struct xfs_log_busy_chunk *lbc_next;
294 uint lbc_free; /* free slots bitmask */
295 ushort lbc_unused; /* first unused */
296 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
297} xfs_log_busy_chunk_t;
298
299#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
300#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
301
302#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
303#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
304#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
305#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
306#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
307
308/*
309 * This is the type of function which can be given to xfs_trans_callback()
310 * to be called upon the transaction's commit to disk.
311 */
312typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
313
314/*
315 * This is the structure maintained for every active transaction.
316 */
317typedef struct xfs_trans {
318 unsigned int t_magic; /* magic number */
319 xfs_log_callback_t t_logcb; /* log callback struct */
320 unsigned int t_type; /* transaction type */
321 unsigned int t_log_res; /* amt of log space resvd */
322 unsigned int t_log_count; /* count for perm log res */
323 unsigned int t_blk_res; /* # of blocks resvd */
324 unsigned int t_blk_res_used; /* # of resvd blocks used */
325 unsigned int t_rtx_res; /* # of rt extents resvd */
326 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
327 xfs_log_ticket_t t_ticket; /* log mgr ticket */
328 xfs_lsn_t t_lsn; /* log seq num of start of
329 * transaction. */
330 xfs_lsn_t t_commit_lsn; /* log seq num of end of
331 * transaction. */
332 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
333 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
334 xfs_trans_callback_t t_callback; /* transaction callback */
335 void *t_callarg; /* callback arg */
336 unsigned int t_flags; /* misc flags */
337 int64_t t_icount_delta; /* superblock icount change */
338 int64_t t_ifree_delta; /* superblock ifree change */
339 int64_t t_fdblocks_delta; /* superblock fdblocks chg */
340 int64_t t_res_fdblocks_delta; /* on-disk only chg */
341 int64_t t_frextents_delta;/* superblock freextents chg*/
342 int64_t t_res_frextents_delta; /* on-disk only chg */
343#ifdef DEBUG
344 int64_t t_ag_freeblks_delta; /* debugging counter */
345 int64_t t_ag_flist_delta; /* debugging counter */
346 int64_t t_ag_btree_delta; /* debugging counter */
347#endif
348 int64_t t_dblocks_delta;/* superblock dblocks change */
349 int64_t t_agcount_delta;/* superblock agcount change */
350 int64_t t_imaxpct_delta;/* superblock imaxpct change */
351 int64_t t_rextsize_delta;/* superblock rextsize chg */
352 int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
353 int64_t t_rblocks_delta;/* superblock rblocks change */
354 int64_t t_rextents_delta;/* superblocks rextents chg */
355 int64_t t_rextslog_delta;/* superblocks rextslog chg */
356 unsigned int t_items_free; /* log item descs free */
357 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
358 xfs_trans_header_t t_header; /* header for in-log trans */
359 unsigned int t_busy_free; /* busy descs free */
360 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
361 unsigned long t_pflags; /* saved process flags state */
362} xfs_trans_t;
363
364#endif /* __KERNEL__ */
365
366
367#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ 211#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
368/* 212/*
369 * Values for t_flags. 213 * Values for t_flags.
@@ -906,6 +750,157 @@ typedef struct xfs_trans {
906#define XFS_DQUOT_REF 1 750#define XFS_DQUOT_REF 1
907 751
908#ifdef __KERNEL__ 752#ifdef __KERNEL__
753
754struct xfs_buf;
755struct xfs_buftarg;
756struct xfs_efd_log_item;
757struct xfs_efi_log_item;
758struct xfs_inode;
759struct xfs_item_ops;
760struct xfs_log_iovec;
761struct xfs_log_item_desc;
762struct xfs_mount;
763struct xfs_trans;
764struct xfs_dquot_acct;
765
766typedef struct xfs_log_item {
767 struct list_head li_ail; /* AIL pointers */
768 xfs_lsn_t li_lsn; /* last on-disk lsn */
769 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
770 struct xfs_mount *li_mountp; /* ptr to fs mount */
771 struct xfs_ail *li_ailp; /* ptr to AIL */
772 uint li_type; /* item type */
773 uint li_flags; /* misc flags */
774 struct xfs_log_item *li_bio_list; /* buffer item list */
775 void (*li_cb)(struct xfs_buf *,
776 struct xfs_log_item *);
777 /* buffer item iodone */
778 /* callback func */
779 struct xfs_item_ops *li_ops; /* function list */
780} xfs_log_item_t;
781
782#define XFS_LI_IN_AIL 0x1
783#define XFS_LI_ABORTED 0x2
784
785typedef struct xfs_item_ops {
786 uint (*iop_size)(xfs_log_item_t *);
787 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
788 void (*iop_pin)(xfs_log_item_t *);
789 void (*iop_unpin)(xfs_log_item_t *, int);
790 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
791 uint (*iop_trylock)(xfs_log_item_t *);
792 void (*iop_unlock)(xfs_log_item_t *);
793 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
794 void (*iop_push)(xfs_log_item_t *);
795 void (*iop_pushbuf)(xfs_log_item_t *);
796 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
797} xfs_item_ops_t;
798
799#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
800#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
801#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
802#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
803#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
804#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
805#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
806#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
807#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
808#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
809#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
810
811/*
812 * Return values for the IOP_TRYLOCK() routines.
813 */
814#define XFS_ITEM_SUCCESS 0
815#define XFS_ITEM_PINNED 1
816#define XFS_ITEM_LOCKED 2
817#define XFS_ITEM_FLUSHING 3
818#define XFS_ITEM_PUSHBUF 4
819
820/*
821 * This structure is used to maintain a list of block ranges that have been
822 * freed in the transaction. The ranges are listed in the perag[] busy list
823 * between when they're freed and the transaction is committed to disk.
824 */
825
826typedef struct xfs_log_busy_slot {
827 xfs_agnumber_t lbc_ag;
828 ushort lbc_idx; /* index in perag.busy[] */
829} xfs_log_busy_slot_t;
830
831#define XFS_LBC_NUM_SLOTS 31
832typedef struct xfs_log_busy_chunk {
833 struct xfs_log_busy_chunk *lbc_next;
834 uint lbc_free; /* free slots bitmask */
835 ushort lbc_unused; /* first unused */
836 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
837} xfs_log_busy_chunk_t;
838
839#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
840#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
841
842#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
843#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
844#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
845#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
846#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
847
848/*
849 * This is the type of function which can be given to xfs_trans_callback()
850 * to be called upon the transaction's commit to disk.
851 */
852typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
853
854/*
855 * This is the structure maintained for every active transaction.
856 */
857typedef struct xfs_trans {
858 unsigned int t_magic; /* magic number */
859 xfs_log_callback_t t_logcb; /* log callback struct */
860 unsigned int t_type; /* transaction type */
861 unsigned int t_log_res; /* amt of log space resvd */
862 unsigned int t_log_count; /* count for perm log res */
863 unsigned int t_blk_res; /* # of blocks resvd */
864 unsigned int t_blk_res_used; /* # of resvd blocks used */
865 unsigned int t_rtx_res; /* # of rt extents resvd */
866 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
867 xfs_log_ticket_t t_ticket; /* log mgr ticket */
868 xfs_lsn_t t_lsn; /* log seq num of start of
869 * transaction. */
870 xfs_lsn_t t_commit_lsn; /* log seq num of end of
871 * transaction. */
872 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
873 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
874 xfs_trans_callback_t t_callback; /* transaction callback */
875 void *t_callarg; /* callback arg */
876 unsigned int t_flags; /* misc flags */
877 int64_t t_icount_delta; /* superblock icount change */
878 int64_t t_ifree_delta; /* superblock ifree change */
879 int64_t t_fdblocks_delta; /* superblock fdblocks chg */
880 int64_t t_res_fdblocks_delta; /* on-disk only chg */
881 int64_t t_frextents_delta;/* superblock freextents chg*/
882 int64_t t_res_frextents_delta; /* on-disk only chg */
883#ifdef DEBUG
884 int64_t t_ag_freeblks_delta; /* debugging counter */
885 int64_t t_ag_flist_delta; /* debugging counter */
886 int64_t t_ag_btree_delta; /* debugging counter */
887#endif
888 int64_t t_dblocks_delta;/* superblock dblocks change */
889 int64_t t_agcount_delta;/* superblock agcount change */
890 int64_t t_imaxpct_delta;/* superblock imaxpct change */
891 int64_t t_rextsize_delta;/* superblock rextsize chg */
892 int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
893 int64_t t_rblocks_delta;/* superblock rblocks change */
894 int64_t t_rextents_delta;/* superblocks rextents chg */
895 int64_t t_rextslog_delta;/* superblocks rextslog chg */
896 unsigned int t_items_free; /* log item descs free */
897 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
898 xfs_trans_header_t t_header; /* header for in-log trans */
899 unsigned int t_busy_free; /* busy descs free */
900 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
901 unsigned long t_pflags; /* saved process flags state */
902} xfs_trans_t;
903
909/* 904/*
910 * XFS transaction mechanism exported interfaces that are 905 * XFS transaction mechanism exported interfaces that are
911 * actually macros. 906 * actually macros.
@@ -928,7 +923,6 @@ typedef struct xfs_trans {
928/* 923/*
929 * XFS transaction mechanism exported interfaces. 924 * XFS transaction mechanism exported interfaces.
930 */ 925 */
931void xfs_trans_init(struct xfs_mount *);
932xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); 926xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
933xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); 927xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
934xfs_trans_t *xfs_trans_dup(xfs_trans_t *); 928xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
@@ -975,13 +969,8 @@ int _xfs_trans_commit(xfs_trans_t *,
975 int *); 969 int *);
976#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) 970#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
977void xfs_trans_cancel(xfs_trans_t *, int); 971void xfs_trans_cancel(xfs_trans_t *, int);
978int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
979int xfs_trans_ail_init(struct xfs_mount *); 972int xfs_trans_ail_init(struct xfs_mount *);
980void xfs_trans_ail_destroy(struct xfs_mount *); 973void xfs_trans_ail_destroy(struct xfs_mount *);
981void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
982xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *);
983void xfs_trans_unlocked_item(struct xfs_mount *,
984 xfs_log_item_t *);
985xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, 974xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
986 xfs_agnumber_t ag, 975 xfs_agnumber_t ag,
987 xfs_extlen_t idx); 976 xfs_extlen_t idx);
@@ -990,4 +979,7 @@ extern kmem_zone_t *xfs_trans_zone;
990 979
991#endif /* __KERNEL__ */ 980#endif /* __KERNEL__ */
992 981
982void xfs_trans_init(struct xfs_mount *);
983int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
984
993#endif /* __XFS_TRANS_H__ */ 985#endif /* __XFS_TRANS_H__ */
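Most of the xfs_trans.h churn above is motion rather than new logic: the kernel-only log item and transaction definitions move below the shared on-disk definitions, and the two prototypes userspace tools also need, xfs_trans_init() and xfs_trans_roll(), move outside the #ifdef __KERNEL__ guard. A compilable toy of that layout convention, with hypothetical names (build with -D__KERNEL__ for the kernel-side view):

#include <stdio.h>

/* 1. Shared, on-disk style definitions: visible to kernel and userspace. */
struct toy_header {
	unsigned int	magic;
};

#ifdef __KERNEL__
/* 2. Kernel-only runtime state stays inside the guard. */
struct toy_runtime {
	struct toy_header	hdr;
	void			*private_data;
};
#endif /* __KERNEL__ */

/* 3. Helpers needed on both sides sit outside the guard. */
static void toy_init(struct toy_header *h)
{
	h->magic = 0x5452414e;	/* 'TRAN', as in XFS_TRANS_MAGIC */
}

int main(void)
{
	struct toy_header h;

	toy_init(&h);
	printf("magic: 0x%x\n", h.magic);
	return 0;
}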
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1f77c00af566..2d47f10f8bed 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2008 Dave Chinner
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -28,13 +29,13 @@
28#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
29#include "xfs_error.h" 30#include "xfs_error.h"
30 31
31STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *); 32STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
32STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *); 33STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *); 34STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *); 35STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 36
36#ifdef DEBUG 37#ifdef DEBUG
37STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); 38STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *);
38#else 39#else
39#define xfs_ail_check(a,l) 40#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 41#endif /* DEBUG */
@@ -50,20 +51,20 @@ STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
50 * lsn of the last item in the AIL. 51 * lsn of the last item in the AIL.
51 */ 52 */
52xfs_lsn_t 53xfs_lsn_t
53xfs_trans_tail_ail( 54xfs_trans_ail_tail(
54 xfs_mount_t *mp) 55 struct xfs_ail *ailp)
55{ 56{
56 xfs_lsn_t lsn; 57 xfs_lsn_t lsn;
57 xfs_log_item_t *lip; 58 xfs_log_item_t *lip;
58 59
59 spin_lock(&mp->m_ail_lock); 60 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(&mp->m_ail); 61 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 62 if (lip == NULL) {
62 lsn = (xfs_lsn_t)0; 63 lsn = (xfs_lsn_t)0;
63 } else { 64 } else {
64 lsn = lip->li_lsn; 65 lsn = lip->li_lsn;
65 } 66 }
66 spin_unlock(&mp->m_ail_lock); 67 spin_unlock(&ailp->xa_lock);
67 68
68 return lsn; 69 return lsn;
69} 70}
@@ -85,16 +86,125 @@ xfs_trans_tail_ail(
85 * any of the objects, so the lock is not needed. 86 * any of the objects, so the lock is not needed.
86 */ 87 */
87void 88void
88xfs_trans_push_ail( 89xfs_trans_ail_push(
89 xfs_mount_t *mp, 90 struct xfs_ail *ailp,
90 xfs_lsn_t threshold_lsn) 91 xfs_lsn_t threshold_lsn)
91{ 92{
92 xfs_log_item_t *lip; 93 xfs_log_item_t *lip;
94
95 lip = xfs_ail_min(ailp);
96 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
97 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0)
98 xfsaild_wakeup(ailp, threshold_lsn);
99 }
100}
101
102/*
103 * AIL traversal cursor initialisation.
104 *
105 * The cursor keeps track of where our current traversal is up
 106 * to by tracking the next item in the list for us. However, for
107 * this to be safe, removing an object from the AIL needs to invalidate
 108 * any cursor that points to it. Hence the traversal cursor needs to
109 * be linked to the struct xfs_ail so that deletion can search all the
110 * active cursors for invalidation.
111 *
112 * We don't link the push cursor because it is embedded in the struct
113 * xfs_ail and hence easily findable.
114 */
115STATIC void
116xfs_trans_ail_cursor_init(
117 struct xfs_ail *ailp,
118 struct xfs_ail_cursor *cur)
119{
120 cur->item = NULL;
121 if (cur == &ailp->xa_cursors)
122 return;
123
124 cur->next = ailp->xa_cursors.next;
125 ailp->xa_cursors.next = cur;
126}
127
128/*
129 * Set the cursor to the next item, because when we look
130 * up the cursor the current item may have been freed.
131 */
132STATIC void
133xfs_trans_ail_cursor_set(
134 struct xfs_ail *ailp,
135 struct xfs_ail_cursor *cur,
136 struct xfs_log_item *lip)
137{
138 if (lip)
139 cur->item = xfs_ail_next(ailp, lip);
140}
141
142/*
143 * Get the next item in the traversal and advance the cursor.
 144 * If the cursor was invalidated (indicated by a lip of 1),
145 * restart the traversal.
146 */
147struct xfs_log_item *
148xfs_trans_ail_cursor_next(
149 struct xfs_ail *ailp,
150 struct xfs_ail_cursor *cur)
151{
152 struct xfs_log_item *lip = cur->item;
153
154 if ((__psint_t)lip & 1)
155 lip = xfs_ail_min(ailp);
156 xfs_trans_ail_cursor_set(ailp, cur, lip);
157 return lip;
158}
159
160/*
161 * Now that the traversal is complete, we need to remove the cursor
162 * from the list of traversing cursors. Avoid removing the embedded
 163 * push cursor, but use the fact it is always present to make the
164 * list deletion simple.
165 */
166void
167xfs_trans_ail_cursor_done(
168 struct xfs_ail *ailp,
169 struct xfs_ail_cursor *done)
170{
171 struct xfs_ail_cursor *prev = NULL;
172 struct xfs_ail_cursor *cur;
173
174 done->item = NULL;
175 if (done == &ailp->xa_cursors)
176 return;
177 prev = &ailp->xa_cursors;
178 for (cur = prev->next; cur; prev = cur, cur = prev->next) {
179 if (cur == done) {
180 prev->next = cur->next;
181 break;
182 }
183 }
184 ASSERT(cur);
185}
186
187/*
188 * Invalidate any cursor that is pointing to this item. This is
189 * called when an item is removed from the AIL. Any cursor pointing
190 * to this object is now invalid and the traversal needs to be
191 * terminated so it doesn't reference a freed object. We set the
192 * cursor item to a value of 1 so we can distinguish between an
193 * invalidation and the end of the list when getting the next item
194 * from the cursor.
195 */
196STATIC void
197xfs_trans_ail_cursor_clear(
198 struct xfs_ail *ailp,
199 struct xfs_log_item *lip)
200{
201 struct xfs_ail_cursor *cur;
93 202
94 lip = xfs_ail_min(&mp->m_ail); 203 /* need to search all cursors */
95 if (lip && !XFS_FORCED_SHUTDOWN(mp)) { 204 for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
96 if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) 205 if (cur->item == lip)
97 xfsaild_wakeup(mp, threshold_lsn); 206 cur->item = (struct xfs_log_item *)
207 ((__psint_t)cur->item | 1);
98 } 208 }
99} 209}
100 210
@@ -103,25 +213,27 @@ xfs_trans_push_ail(
103 * Return the current tree generation number for use 213 * Return the current tree generation number for use
104 * in calls to xfs_trans_next_ail(). 214 * in calls to xfs_trans_next_ail().
105 */ 215 */
106STATIC xfs_log_item_t * 216xfs_log_item_t *
107xfs_trans_first_push_ail( 217xfs_trans_ail_cursor_first(
108 xfs_mount_t *mp, 218 struct xfs_ail *ailp,
109 int *gen, 219 struct xfs_ail_cursor *cur,
110 xfs_lsn_t lsn) 220 xfs_lsn_t lsn)
111{ 221{
112 xfs_log_item_t *lip; 222 xfs_log_item_t *lip;
113 223
114 lip = xfs_ail_min(&mp->m_ail); 224 xfs_trans_ail_cursor_init(ailp, cur);
115 *gen = (int)mp->m_ail.xa_gen; 225 lip = xfs_ail_min(ailp);
116 if (lsn == 0) 226 if (lsn == 0)
117 return lip; 227 goto out;
118 228
119 list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) { 229 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
120 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) 230 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
121 return lip; 231 goto out;
122 } 232 }
123 233 lip = NULL;
124 return NULL; 234out:
235 xfs_trans_ail_cursor_set(ailp, cur, lip);
236 return lip;
125} 237}
126 238
127/* 239/*
@@ -129,29 +241,29 @@ xfs_trans_first_push_ail(
129 */ 241 */
130long 242long
131xfsaild_push( 243xfsaild_push(
132 xfs_mount_t *mp, 244 struct xfs_ail *ailp,
133 xfs_lsn_t *last_lsn) 245 xfs_lsn_t *last_lsn)
134{ 246{
135 long tout = 1000; /* milliseconds */ 247 long tout = 1000; /* milliseconds */
136 xfs_lsn_t last_pushed_lsn = *last_lsn; 248 xfs_lsn_t last_pushed_lsn = *last_lsn;
137 xfs_lsn_t target = mp->m_ail.xa_target; 249 xfs_lsn_t target = ailp->xa_target;
138 xfs_lsn_t lsn; 250 xfs_lsn_t lsn;
139 xfs_log_item_t *lip; 251 xfs_log_item_t *lip;
140 int gen;
141 int restarts;
142 int flush_log, count, stuck; 252 int flush_log, count, stuck;
253 xfs_mount_t *mp = ailp->xa_mount;
254 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
143 255
144#define XFS_TRANS_PUSH_AIL_RESTARTS 10 256 spin_lock(&ailp->xa_lock);
145 257 xfs_trans_ail_cursor_init(ailp, cur);
146 spin_lock(&mp->m_ail_lock); 258 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn);
147 lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn);
148 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 259 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
149 /* 260 /*
150 * AIL is empty or our push has reached the end. 261 * AIL is empty or our push has reached the end.
151 */ 262 */
152 spin_unlock(&mp->m_ail_lock); 263 xfs_trans_ail_cursor_done(ailp, cur);
264 spin_unlock(&ailp->xa_lock);
153 last_pushed_lsn = 0; 265 last_pushed_lsn = 0;
154 goto out; 266 return tout;
155 } 267 }
156 268
157 XFS_STATS_INC(xs_push_ail); 269 XFS_STATS_INC(xs_push_ail);
@@ -169,7 +281,7 @@ xfsaild_push(
169 */ 281 */
170 tout = 10; 282 tout = 10;
171 lsn = lip->li_lsn; 283 lsn = lip->li_lsn;
172 flush_log = stuck = count = restarts = 0; 284 flush_log = stuck = count = 0;
173 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { 285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
174 int lock_result; 286 int lock_result;
175 /* 287 /*
@@ -184,7 +296,7 @@ xfsaild_push(
184 * skip to the next item in the list. 296 * skip to the next item in the list.
185 */ 297 */
186 lock_result = IOP_TRYLOCK(lip); 298 lock_result = IOP_TRYLOCK(lip);
187 spin_unlock(&mp->m_ail_lock); 299 spin_unlock(&ailp->xa_lock);
188 switch (lock_result) { 300 switch (lock_result) {
189 case XFS_ITEM_SUCCESS: 301 case XFS_ITEM_SUCCESS:
190 XFS_STATS_INC(xs_push_ail_success); 302 XFS_STATS_INC(xs_push_ail_success);
@@ -221,7 +333,7 @@ xfsaild_push(
221 break; 333 break;
222 } 334 }
223 335
224 spin_lock(&mp->m_ail_lock); 336 spin_lock(&ailp->xa_lock);
225 /* should we bother continuing? */ 337 /* should we bother continuing? */
226 if (XFS_FORCED_SHUTDOWN(mp)) 338 if (XFS_FORCED_SHUTDOWN(mp))
227 break; 339 break;
@@ -244,14 +356,13 @@ xfsaild_push(
244 if (stuck > 100) 356 if (stuck > 100)
245 break; 357 break;
246 358
247 lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); 359 lip = xfs_trans_ail_cursor_next(ailp, cur);
248 if (lip == NULL) 360 if (lip == NULL)
249 break; 361 break;
250 if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS)
251 break;
252 lsn = lip->li_lsn; 362 lsn = lip->li_lsn;
253 } 363 }
254 spin_unlock(&mp->m_ail_lock); 364 xfs_trans_ail_cursor_done(ailp, cur);
365 spin_unlock(&ailp->xa_lock);
255 366
256 if (flush_log) { 367 if (flush_log) {
257 /* 368 /*
@@ -274,8 +385,7 @@ xfsaild_push(
274 */ 385 */
275 tout += 20; 386 tout += 20;
276 last_pushed_lsn = 0; 387 last_pushed_lsn = 0;
277 } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) || 388 } else if ((stuck * 100) / count > 90) {
278 ((stuck * 100) / count > 90)) {
279 /* 389 /*
280 * Either there is a lot of contention on the AIL or we 390 * Either there is a lot of contention on the AIL or we
281 * are stuck due to operations in progress. "Stuck" in this 391 * are stuck due to operations in progress. "Stuck" in this
@@ -287,7 +397,6 @@ xfsaild_push(
287 */ 397 */
288 tout += 10; 398 tout += 10;
289 } 399 }
290out:
291 *last_lsn = last_pushed_lsn; 400 *last_lsn = last_pushed_lsn;
292 return tout; 401 return tout;
293} /* xfsaild_push */ 402} /* xfsaild_push */
@@ -303,7 +412,7 @@ out:
303 */ 412 */
304void 413void
305xfs_trans_unlocked_item( 414xfs_trans_unlocked_item(
306 xfs_mount_t *mp, 415 struct xfs_ail *ailp,
307 xfs_log_item_t *lip) 416 xfs_log_item_t *lip)
308{ 417{
309 xfs_log_item_t *min_lip; 418 xfs_log_item_t *min_lip;
@@ -315,7 +424,7 @@ xfs_trans_unlocked_item(
315 * over some potentially valid data. 424 * over some potentially valid data.
316 */ 425 */
317 if (!(lip->li_flags & XFS_LI_IN_AIL) || 426 if (!(lip->li_flags & XFS_LI_IN_AIL) ||
318 XFS_FORCED_SHUTDOWN(mp)) { 427 XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
319 return; 428 return;
320 } 429 }
321 430
@@ -331,10 +440,10 @@ xfs_trans_unlocked_item(
331 * the call to xfs_log_move_tail() doesn't do anything if there's 440 * the call to xfs_log_move_tail() doesn't do anything if there's
332 * not enough free space to wake people up so we're safe calling it. 441 * not enough free space to wake people up so we're safe calling it.
333 */ 442 */
334 min_lip = xfs_ail_min(&mp->m_ail); 443 min_lip = xfs_ail_min(ailp);
335 444
336 if (min_lip == lip) 445 if (min_lip == lip)
337 xfs_log_move_tail(mp, 1); 446 xfs_log_move_tail(ailp->xa_mount, 1);
338} /* xfs_trans_unlocked_item */ 447} /* xfs_trans_unlocked_item */
339 448
340 449
@@ -347,41 +456,37 @@ xfs_trans_unlocked_item(
347 * we move in the AIL is the minimum one, update the tail lsn in the 456 * we move in the AIL is the minimum one, update the tail lsn in the
348 * log manager. 457 * log manager.
349 * 458 *
350 * Increment the AIL's generation count to indicate that the tree
351 * has changed.
352 *
353 * This function must be called with the AIL lock held. The lock 459 * This function must be called with the AIL lock held. The lock
354 * is dropped before returning. 460 * is dropped before returning.
355 */ 461 */
356void 462void
357xfs_trans_update_ail( 463xfs_trans_ail_update(
358 xfs_mount_t *mp, 464 struct xfs_ail *ailp,
359 xfs_log_item_t *lip, 465 xfs_log_item_t *lip,
360 xfs_lsn_t lsn) __releases(mp->m_ail_lock) 466 xfs_lsn_t lsn) __releases(ailp->xa_lock)
361{ 467{
362 xfs_log_item_t *dlip=NULL; 468 xfs_log_item_t *dlip = NULL;
363 xfs_log_item_t *mlip; /* ptr to minimum lip */ 469 xfs_log_item_t *mlip; /* ptr to minimum lip */
364 470
365 mlip = xfs_ail_min(&mp->m_ail); 471 mlip = xfs_ail_min(ailp);
366 472
367 if (lip->li_flags & XFS_LI_IN_AIL) { 473 if (lip->li_flags & XFS_LI_IN_AIL) {
368 dlip = xfs_ail_delete(&mp->m_ail, lip); 474 dlip = xfs_ail_delete(ailp, lip);
369 ASSERT(dlip == lip); 475 ASSERT(dlip == lip);
476 xfs_trans_ail_cursor_clear(ailp, dlip);
370 } else { 477 } else {
371 lip->li_flags |= XFS_LI_IN_AIL; 478 lip->li_flags |= XFS_LI_IN_AIL;
372 } 479 }
373 480
374 lip->li_lsn = lsn; 481 lip->li_lsn = lsn;
375 482 xfs_ail_insert(ailp, lip);
376 xfs_ail_insert(&mp->m_ail, lip);
377 mp->m_ail.xa_gen++;
378 483
379 if (mlip == dlip) { 484 if (mlip == dlip) {
380 mlip = xfs_ail_min(&mp->m_ail); 485 mlip = xfs_ail_min(ailp);
381 spin_unlock(&mp->m_ail_lock); 486 spin_unlock(&ailp->xa_lock);
382 xfs_log_move_tail(mp, mlip->li_lsn); 487 xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
383 } else { 488 } else {
384 spin_unlock(&mp->m_ail_lock); 489 spin_unlock(&ailp->xa_lock);
385 } 490 }
386 491
387 492
@@ -403,29 +508,30 @@ xfs_trans_update_ail(
403 * is dropped before returning. 508 * is dropped before returning.
404 */ 509 */
405void 510void
406xfs_trans_delete_ail( 511xfs_trans_ail_delete(
407 xfs_mount_t *mp, 512 struct xfs_ail *ailp,
408 xfs_log_item_t *lip) __releases(mp->m_ail_lock) 513 xfs_log_item_t *lip) __releases(ailp->xa_lock)
409{ 514{
410 xfs_log_item_t *dlip; 515 xfs_log_item_t *dlip;
411 xfs_log_item_t *mlip; 516 xfs_log_item_t *mlip;
412 517
413 if (lip->li_flags & XFS_LI_IN_AIL) { 518 if (lip->li_flags & XFS_LI_IN_AIL) {
414 mlip = xfs_ail_min(&mp->m_ail); 519 mlip = xfs_ail_min(ailp);
415 dlip = xfs_ail_delete(&mp->m_ail, lip); 520 dlip = xfs_ail_delete(ailp, lip);
416 ASSERT(dlip == lip); 521 ASSERT(dlip == lip);
522 xfs_trans_ail_cursor_clear(ailp, dlip);
417 523
418 524
419 lip->li_flags &= ~XFS_LI_IN_AIL; 525 lip->li_flags &= ~XFS_LI_IN_AIL;
420 lip->li_lsn = 0; 526 lip->li_lsn = 0;
421 mp->m_ail.xa_gen++;
422 527
423 if (mlip == dlip) { 528 if (mlip == dlip) {
424 mlip = xfs_ail_min(&mp->m_ail); 529 mlip = xfs_ail_min(ailp);
425 spin_unlock(&mp->m_ail_lock); 530 spin_unlock(&ailp->xa_lock);
426 xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); 531 xfs_log_move_tail(ailp->xa_mount,
532 (mlip ? mlip->li_lsn : 0));
427 } else { 533 } else {
428 spin_unlock(&mp->m_ail_lock); 534 spin_unlock(&ailp->xa_lock);
429 } 535 }
430 } 536 }
431 else { 537 else {
@@ -433,13 +539,13 @@ xfs_trans_delete_ail(
433 * If the file system is not being shutdown, we are in 539 * If the file system is not being shutdown, we are in
434 * serious trouble if we get to this stage. 540 * serious trouble if we get to this stage.
435 */ 541 */
436 if (XFS_FORCED_SHUTDOWN(mp)) 542 struct xfs_mount *mp = ailp->xa_mount;
437 spin_unlock(&mp->m_ail_lock); 543
438 else { 544 spin_unlock(&ailp->xa_lock);
545 if (!XFS_FORCED_SHUTDOWN(mp)) {
439 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 546 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
440 "%s: attempting to delete a log item that is not in the AIL", 547 "%s: attempting to delete a log item that is not in the AIL",
441 __func__); 548 __func__);
442 spin_unlock(&mp->m_ail_lock);
443 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 549 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
444 } 550 }
445 } 551 }
@@ -448,56 +554,6 @@ xfs_trans_delete_ail(
448 554
449 555
450/* 556/*
451 * Return the item in the AIL with the smallest lsn.
452 * Return the current tree generation number for use
453 * in calls to xfs_trans_next_ail().
454 */
455xfs_log_item_t *
456xfs_trans_first_ail(
457 xfs_mount_t *mp,
458 int *gen)
459{
460 xfs_log_item_t *lip;
461
462 lip = xfs_ail_min(&mp->m_ail);
463 *gen = (int)mp->m_ail.xa_gen;
464
465 return lip;
466}
467
468/*
469 * If the generation count of the tree has not changed since the
470 * caller last took something from the AIL, then return the elmt
471 * in the tree which follows the one given. If the count has changed,
472 * then return the minimum elmt of the AIL and bump the restarts counter
473 * if one is given.
474 */
475xfs_log_item_t *
476xfs_trans_next_ail(
477 xfs_mount_t *mp,
478 xfs_log_item_t *lip,
479 int *gen,
480 int *restarts)
481{
482 xfs_log_item_t *nlip;
483
484 ASSERT(mp && lip && gen);
485 if (mp->m_ail.xa_gen == *gen) {
486 nlip = xfs_ail_next(&mp->m_ail, lip);
487 } else {
488 nlip = xfs_ail_min(&mp->m_ail);
489 *gen = (int)mp->m_ail.xa_gen;
490 if (restarts != NULL) {
491 XFS_STATS_INC(xs_push_ail_restarts);
492 (*restarts)++;
493 }
494 }
495
496 return (nlip);
497}
498
499
500/*
501 * The active item list (AIL) is a doubly linked list of log 557 * The active item list (AIL) is a doubly linked list of log
502 * items sorted by ascending lsn. The base of the list is 558 * items sorted by ascending lsn. The base of the list is
503 * a forw/back pointer pair embedded in the xfs mount structure. 559 * a forw/back pointer pair embedded in the xfs mount structure.
@@ -515,15 +571,35 @@ int
515xfs_trans_ail_init( 571xfs_trans_ail_init(
516 xfs_mount_t *mp) 572 xfs_mount_t *mp)
517{ 573{
518 INIT_LIST_HEAD(&mp->m_ail.xa_ail); 574 struct xfs_ail *ailp;
519 return xfsaild_start(mp); 575 int error;
576
577 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
578 if (!ailp)
579 return ENOMEM;
580
581 ailp->xa_mount = mp;
582 INIT_LIST_HEAD(&ailp->xa_ail);
583 spin_lock_init(&ailp->xa_lock);
584 error = xfsaild_start(ailp);
585 if (error)
586 goto out_free_ailp;
587 mp->m_ail = ailp;
588 return 0;
589
590out_free_ailp:
591 kmem_free(ailp);
592 return error;
520} 593}
521 594
522void 595void
523xfs_trans_ail_destroy( 596xfs_trans_ail_destroy(
524 xfs_mount_t *mp) 597 xfs_mount_t *mp)
525{ 598{
526 xfsaild_stop(mp); 599 struct xfs_ail *ailp = mp->m_ail;
600
601 xfsaild_stop(ailp);
602 kmem_free(ailp);
527} 603}
528 604
529/* 605/*
@@ -534,7 +610,7 @@ xfs_trans_ail_destroy(
534 */ 610 */
535STATIC void 611STATIC void
536xfs_ail_insert( 612xfs_ail_insert(
537 xfs_ail_t *ailp, 613 struct xfs_ail *ailp,
538 xfs_log_item_t *lip) 614 xfs_log_item_t *lip)
539/* ARGSUSED */ 615/* ARGSUSED */
540{ 616{
@@ -568,7 +644,7 @@ xfs_ail_insert(
568/*ARGSUSED*/ 644/*ARGSUSED*/
569STATIC xfs_log_item_t * 645STATIC xfs_log_item_t *
570xfs_ail_delete( 646xfs_ail_delete(
571 xfs_ail_t *ailp, 647 struct xfs_ail *ailp,
572 xfs_log_item_t *lip) 648 xfs_log_item_t *lip)
573/* ARGSUSED */ 649/* ARGSUSED */
574{ 650{
@@ -585,7 +661,7 @@ xfs_ail_delete(
585 */ 661 */
586STATIC xfs_log_item_t * 662STATIC xfs_log_item_t *
587xfs_ail_min( 663xfs_ail_min(
588 xfs_ail_t *ailp) 664 struct xfs_ail *ailp)
589/* ARGSUSED */ 665/* ARGSUSED */
590{ 666{
591 if (list_empty(&ailp->xa_ail)) 667 if (list_empty(&ailp->xa_ail))
@@ -601,7 +677,7 @@ xfs_ail_min(
601 */ 677 */
602STATIC xfs_log_item_t * 678STATIC xfs_log_item_t *
603xfs_ail_next( 679xfs_ail_next(
604 xfs_ail_t *ailp, 680 struct xfs_ail *ailp,
605 xfs_log_item_t *lip) 681 xfs_log_item_t *lip)
606/* ARGSUSED */ 682/* ARGSUSED */
607{ 683{
@@ -617,7 +693,7 @@ xfs_ail_next(
617 */ 693 */
618STATIC void 694STATIC void
619xfs_ail_check( 695xfs_ail_check(
620 xfs_ail_t *ailp, 696 struct xfs_ail *ailp,
621 xfs_log_item_t *lip) 697 xfs_log_item_t *lip)
622{ 698{
623 xfs_log_item_t *prev_lip; 699 xfs_log_item_t *prev_lip;
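The cursor machinery above replaces the old generation-count restarts: when an item is removed, xfs_trans_ail_cursor_clear() sets the low bit of any cursor pointing at it, and xfs_trans_ail_cursor_next() treats a tagged pointer as "restart from the AIL minimum". A minimal user-space sketch of that pointer-tagging trick, with hypothetical names and uintptr_t in place of the kernel's __psint_t:

#include <stdint.h>
#include <stdio.h>

struct item {
	int	id;
};

/* Invalidate a cursor by setting the low bit of its item pointer --
 * the same trick xfs_trans_ail_cursor_clear() plays, relying on log
 * items never being allocated at odd addresses. */
static struct item *tag_invalid(struct item *p)
{
	return (struct item *)((uintptr_t)p | 1);
}

static int is_invalid(const struct item *p)
{
	return ((uintptr_t)p & 1) != 0;
}

int main(void)
{
	struct item a = { 42 };
	struct item *cursor = &a;

	/* The item under the cursor gets removed from the list... */
	cursor = tag_invalid(cursor);

	/* ...so the next traversal step sees the tag and restarts. */
	if (is_invalid(cursor))
		printf("cursor invalidated; restart from the AIL minimum\n");
	else
		printf("cursor still valid: %d\n", cursor->id);
	return 0;
}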
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 4e855b5ced66..8ee2f8c8b0a6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -527,9 +527,8 @@ xfs_trans_brelse(xfs_trans_t *tp,
527 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 527 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
528 if (lip->li_type == XFS_LI_BUF) { 528 if (lip->li_type == XFS_LI_BUF) {
529 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); 529 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
530 xfs_trans_unlocked_item( 530 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
531 bip->bli_item.li_mountp, 531 lip);
532 lip);
533 } 532 }
534 } 533 }
535 xfs_buf_relse(bp); 534 xfs_buf_relse(bp);
@@ -626,7 +625,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
626 * tell the AIL that the buffer is being unlocked. 625 * tell the AIL that the buffer is being unlocked.
627 */ 626 */
628 if (bip != NULL) { 627 if (bip != NULL) {
629 xfs_trans_unlocked_item(bip->bli_item.li_mountp, 628 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
630 (xfs_log_item_t*)bip); 629 (xfs_log_item_t*)bip);
631 } 630 }
632 631
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 2a1c0f071f91..23d276af2e0c 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -85,7 +85,6 @@ xfs_trans_iget(
85{ 85{
86 int error; 86 int error;
87 xfs_inode_t *ip; 87 xfs_inode_t *ip;
88 xfs_inode_log_item_t *iip;
89 88
90 /* 89 /*
91 * If the transaction pointer is NULL, just call the normal 90 * If the transaction pointer is NULL, just call the normal
@@ -138,34 +137,7 @@ xfs_trans_iget(
138 } 137 }
139 ASSERT(ip != NULL); 138 ASSERT(ip != NULL);
140 139
141 /* 140 xfs_trans_ijoin(tp, ip, lock_flags);
142 * Get a log_item_desc to point at the new item.
143 */
144 if (ip->i_itemp == NULL)
145 xfs_inode_item_init(ip, mp);
146 iip = ip->i_itemp;
147 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
148
149 xfs_trans_inode_broot_debug(ip);
150
151 /*
152 * If the IO lock has been acquired, mark that in
153 * the inode log item so we'll know to unlock it
154 * when the transaction commits.
155 */
156 ASSERT(iip->ili_flags == 0);
157 if (lock_flags & XFS_IOLOCK_EXCL) {
158 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
159 } else if (lock_flags & XFS_IOLOCK_SHARED) {
160 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
161 }
162
163 /*
164 * Initialize i_transp so we can find it with xfs_inode_incore()
165 * above.
166 */
167 ip->i_transp = tp;
168
169 *ipp = ip; 141 *ipp = ip;
170 return 0; 142 return 0;
171} 143}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 3c666e8317f8..e110bf57d7f4 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,6 +22,14 @@
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 24#include "xfs_trans_priv.h"
 25/* XXX: from here down needed until struct xfs_trans has its own ailp */
26#include "xfs_bit.h"
27#include "xfs_buf_item.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h"
25 33
26STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, 34STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
27 int, int, xfs_lsn_t); 35 int, int, xfs_lsn_t);
@@ -79,6 +87,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
79 lidp->lid_size = 0; 87 lidp->lid_size = 0;
80 lip->li_desc = lidp; 88 lip->li_desc = lidp;
81 lip->li_mountp = tp->t_mountp; 89 lip->li_mountp = tp->t_mountp;
90 lip->li_ailp = tp->t_mountp->m_ail;
82 return lidp; 91 return lidp;
83 } 92 }
84 93
@@ -120,6 +129,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
120 lidp->lid_size = 0; 129 lidp->lid_size = 0;
121 lip->li_desc = lidp; 130 lip->li_desc = lidp;
122 lip->li_mountp = tp->t_mountp; 131 lip->li_mountp = tp->t_mountp;
132 lip->li_ailp = tp->t_mountp->m_ail;
123 return lidp; 133 return lidp;
124} 134}
125 135
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3c748c456ed4..73e2ad397432 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -44,25 +44,93 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
44 xfs_extlen_t idx); 44 xfs_extlen_t idx);
45 45
46/* 46/*
47 * From xfs_trans_ail.c 47 * AIL traversal cursor.
48 *
49 * Rather than using a generation number for detecting changes in the ail, use
50 * a cursor that is protected by the ail lock. The aild cursor exists in the
51 * struct xfs_ail, but other traversals can declare it on the stack and link it
52 * to the ail list.
53 *
 54 * When an object is deleted from or moved in the AIL, the cursor list is
55 * searched to see if the object is a designated cursor item. If it is, it is
56 * deleted from the cursor so that the next time the cursor is used traversal
57 * will return to the start.
58 *
59 * This means a traversal colliding with a removal will cause a restart of the
60 * list scan, rather than any insertion or deletion anywhere in the list. The
61 * low bit of the item pointer is set if the cursor has been invalidated so
62 * that we can tell the difference between invalidation and reaching the end
63 * of the list to trigger traversal restarts.
48 */ 64 */
49void xfs_trans_update_ail(struct xfs_mount *mp, 65struct xfs_ail_cursor {
50 struct xfs_log_item *lip, xfs_lsn_t lsn) 66 struct xfs_ail_cursor *next;
51 __releases(mp->m_ail_lock); 67 struct xfs_log_item *item;
52void xfs_trans_delete_ail(struct xfs_mount *mp, 68};
53 struct xfs_log_item *lip)
54 __releases(mp->m_ail_lock);
55struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *);
56struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *,
57 struct xfs_log_item *, int *, int *);
58 69
70/*
71 * Private AIL structures.
72 *
73 * Eventually we need to drive the locking in here as well.
74 */
75struct xfs_ail {
76 struct xfs_mount *xa_mount;
77 struct list_head xa_ail;
78 uint xa_gen;
79 struct task_struct *xa_task;
80 xfs_lsn_t xa_target;
81 struct xfs_ail_cursor xa_cursors;
82 spinlock_t xa_lock;
83};
59 84
60/* 85/*
61 * AIL push thread support 86 * From xfs_trans_ail.c
62 */ 87 */
63long xfsaild_push(struct xfs_mount *, xfs_lsn_t *); 88void xfs_trans_ail_update(struct xfs_ail *ailp,
64void xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t); 89 struct xfs_log_item *lip, xfs_lsn_t lsn)
65int xfsaild_start(struct xfs_mount *); 90 __releases(ailp->xa_lock);
66void xfsaild_stop(struct xfs_mount *); 91void xfs_trans_ail_delete(struct xfs_ail *ailp,
92 struct xfs_log_item *lip)
93 __releases(ailp->xa_lock);
94void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
95void xfs_trans_unlocked_item(struct xfs_ail *,
96 xfs_log_item_t *);
97
98xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
99
100struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
101 struct xfs_ail_cursor *cur,
102 xfs_lsn_t lsn);
103struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
104 struct xfs_ail_cursor *cur);
105void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
106 struct xfs_ail_cursor *cur);
107
108long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
109void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
110int xfsaild_start(struct xfs_ail *);
111void xfsaild_stop(struct xfs_ail *);
67 112
113#if BITS_PER_LONG != 64
114static inline void
115xfs_trans_ail_copy_lsn(
116 struct xfs_ail *ailp,
117 xfs_lsn_t *dst,
118 xfs_lsn_t *src)
119{
120 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
121 spin_lock(&ailp->xa_lock);
122 *dst = *src;
123 spin_unlock(&ailp->xa_lock);
124}
125#else
126static inline void
127xfs_trans_ail_copy_lsn(
128 struct xfs_ail *ailp,
129 xfs_lsn_t *dst,
130 xfs_lsn_t *src)
131{
132 ASSERT(sizeof(xfs_lsn_t) == 8);
133 *dst = *src;
134}
135#endif
68#endif /* __XFS_TRANS_PRIV_H__ */ 136#endif /* __XFS_TRANS_PRIV_H__ */
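A minimal sketch of how a traversal might use the cursor API declared above, assuming the locking discipline described in the comment block (the cursor list and the AIL itself are protected by xa_lock); the walker function here is hypothetical, not part of the patch:

	STATIC void
	example_ail_walk(
		struct xfs_ail		*ailp,
		xfs_lsn_t		lsn)
	{
		struct xfs_ail_cursor	cur;
		struct xfs_log_item	*lip;

		spin_lock(&ailp->xa_lock);
		/* position the cursor at the first item at or beyond lsn */
		lip = xfs_trans_ail_cursor_first(ailp, &cur, lsn);
		while (lip != NULL) {
			/*
			 * Examine lip here; if it is removed concurrently
			 * the cursor is invalidated and _next restarts the
			 * scan, per the comment above. Do not sleep with
			 * xa_lock held.
			 */
			lip = xfs_trans_ail_cursor_next(ailp, &cur);
		}
		xfs_trans_ail_cursor_done(ailp, &cur);
		spin_unlock(&ailp->xa_lock);
	}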
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 35d4d414bcc2..fcc2285d03ed 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -172,6 +172,12 @@ xfs_dir_ialloc(
172 *ipp = NULL; 172 *ipp = NULL;
173 return code; 173 return code;
174 } 174 }
175
176 /*
177 * transaction commit worked ok so we can drop the extra ticket
178 * reference that we gained in xfs_trans_dup()
179 */
180 xfs_log_ticket_put(tp->t_ticket);
175 code = xfs_trans_reserve(tp, 0, log_res, 0, 181 code = xfs_trans_reserve(tp, 0, log_res, 0,
176 XFS_TRANS_PERM_LOG_RES, log_count); 182 XFS_TRANS_PERM_LOG_RES, log_count);
177 /* 183 /*
@@ -268,9 +274,9 @@ xfs_bump_ino_vers2(
268 xfs_mount_t *mp; 274 xfs_mount_t *mp;
269 275
270 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 276 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
271 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); 277 ASSERT(ip->i_d.di_version == 1);
272 278
273 ip->i_d.di_version = XFS_DINODE_VERSION_2; 279 ip->i_d.di_version = 2;
274 ip->i_d.di_onlink = 0; 280 ip->i_d.di_onlink = 0;
275 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 281 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
276 mp = tp->t_mountp; 282 mp = tp->t_mountp;
@@ -302,7 +308,7 @@ xfs_bumplink(
302 ASSERT(ip->i_d.di_nlink > 0); 308 ASSERT(ip->i_d.di_nlink > 0);
303 ip->i_d.di_nlink++; 309 ip->i_d.di_nlink++;
304 inc_nlink(VFS_I(ip)); 310 inc_nlink(VFS_I(ip));
305 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && 311 if ((ip->i_d.di_version == 1) &&
306 (ip->i_d.di_nlink > XFS_MAXLINK_1)) { 312 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
307 /* 313 /*
308 * The inode has increased its number of links beyond 314 * The inode has increased its number of links beyond
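The xfs_log_ticket_put() hunk above is one instance of the transaction "roll" pattern: xfs_trans_dup() takes an extra reference on the permanent log ticket, so once the commit of the old transaction succeeds that reference must be dropped before re-reserving. A hedged sketch of the pattern, with a hypothetical helper name and error unwinding elided:

	STATIC int
	example_trans_roll(
		struct xfs_trans	**tpp,
		uint			log_res,
		uint			log_count)
	{
		struct xfs_trans	*ntp;
		int			error;

		/* the duplicate carries the permanent reservation forward */
		ntp = xfs_trans_dup(*tpp);
		error = xfs_trans_commit(*tpp, 0);
		*tpp = ntp;
		if (error)
			return error;

		/*
		 * commit worked, so drop the extra ticket reference that
		 * we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(ntp->t_ticket);

		return xfs_trans_reserve(ntp, 0, log_res, 0,
					 XFS_TRANS_PERM_LOG_RES, log_count);
	}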
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
deleted file mode 100644
index 439dd3939dda..000000000000
--- a/fs/xfs/xfs_vfsops.c
+++ /dev/null
@@ -1,757 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_btree.h"
40#include "xfs_alloc.h"
41#include "xfs_ialloc.h"
42#include "xfs_quota.h"
43#include "xfs_error.h"
44#include "xfs_bmap.h"
45#include "xfs_rw.h"
46#include "xfs_buf_item.h"
47#include "xfs_log_priv.h"
48#include "xfs_dir2_trace.h"
49#include "xfs_extfree_item.h"
50#include "xfs_acl.h"
51#include "xfs_attr.h"
52#include "xfs_clnt.h"
53#include "xfs_mru_cache.h"
54#include "xfs_filestream.h"
55#include "xfs_fsops.h"
56#include "xfs_vnodeops.h"
57#include "xfs_vfsops.h"
58#include "xfs_utils.h"
59
60
61STATIC void
62xfs_quiesce_fs(
63 xfs_mount_t *mp)
64{
65 int count = 0, pincount;
66
67 xfs_flush_buftarg(mp->m_ddev_targp, 0);
68 xfs_finish_reclaim_all(mp, 0);
69
70 /* This loop must run at least twice.
71 * The first instance of the loop will flush
72 * most metadata, but that will generate more
73 * metadata (typically directory updates),
74 * which then must be flushed and logged before
75 * we can write the unmount record.
76 */
77 do {
78 xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL);
79 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
80 if (!pincount) {
81 delay(50);
82 count++;
83 }
84 } while (count < 2);
85}
86
87/*
88 * Second stage of a quiesce. The data is already synced; now we have to take
89 * care of the metadata. New transactions are already blocked, so we need to
90 * wait for any remaining transactions to drain out before proceeding.
91 */
92void
93xfs_attr_quiesce(
94 xfs_mount_t *mp)
95{
96 int error = 0;
97
98 /* wait for all modifications to complete */
99 while (atomic_read(&mp->m_active_trans) > 0)
100 delay(100);
101
102 /* flush inodes and push all remaining buffers out to disk */
103 xfs_quiesce_fs(mp);
104
105 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
106
107 /* Push the superblock and write an unmount record */
108 error = xfs_log_sbcount(mp, 1);
109 if (error)
110 xfs_fs_cmn_err(CE_WARN, mp,
111 "xfs_attr_quiesce: failed to log sb changes. "
112 "Frozen image may not be consistent.");
113 xfs_log_unmount_write(mp);
114 xfs_unmountfs_writesb(mp);
115}
116
117/*
118 * xfs_unmount_flush implements a set of flush operations on special
119 * inodes, which are kept as a separate set of operations so that
120 * they can be called as part of the relocation process.
121 */
122int
123xfs_unmount_flush(
124 xfs_mount_t *mp, /* Mount structure we are getting
125 rid of. */
126 int relocation) /* Called from vfs relocation. */
127{
128 xfs_inode_t *rip = mp->m_rootip;
129 xfs_inode_t *rbmip;
130 xfs_inode_t *rsumip = NULL;
131 int error;
132
133 xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
134 xfs_iflock(rip);
135
136 /*
137 * Flush out the real time inodes.
138 */
139 if ((rbmip = mp->m_rbmip) != NULL) {
140 xfs_ilock(rbmip, XFS_ILOCK_EXCL);
141 xfs_iflock(rbmip);
142 error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
143 xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
144
145 if (error == EFSCORRUPTED)
146 goto fscorrupt_out;
147
148 ASSERT(vn_count(VFS_I(rbmip)) == 1);
149
150 rsumip = mp->m_rsumip;
151 xfs_ilock(rsumip, XFS_ILOCK_EXCL);
152 xfs_iflock(rsumip);
153 error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
154 xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
155
156 if (error == EFSCORRUPTED)
157 goto fscorrupt_out;
158
159 ASSERT(vn_count(VFS_I(rsumip)) == 1);
160 }
161
162 /*
163 * Synchronously flush root inode to disk
164 */
165 error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
166 if (error == EFSCORRUPTED)
167 goto fscorrupt_out2;
168
169 if (vn_count(VFS_I(rip)) != 1 && !relocation) {
170 xfs_iunlock(rip, XFS_ILOCK_EXCL);
171 return XFS_ERROR(EBUSY);
172 }
173
174 /*
175 * Release the dquots that the root inode, rbmino and rsumino might be
176 * holding, then flush and purge the quota inodes.
177 */
178 error = XFS_QM_UNMOUNT(mp);
179 if (error == EFSCORRUPTED)
180 goto fscorrupt_out2;
181
182 if (rbmip) {
183 IRELE(rbmip);
184 IRELE(rsumip);
185 }
186
187 xfs_iunlock(rip, XFS_ILOCK_EXCL);
188 return 0;
189
190fscorrupt_out:
191 xfs_ifunlock(rip);
192
193fscorrupt_out2:
194 xfs_iunlock(rip, XFS_ILOCK_EXCL);
195
196 return XFS_ERROR(EFSCORRUPTED);
197}
198
199/*
200 * xfs_sync flushes any pending I/O to the filesystem.
201 *
202 * This routine is called by vfs_sync() to make sure that things make it
203 * out to disk eventually, on sync() system calls to flush out everything,
204 * and when the file system is unmounted. For the vfs_sync() case, all
205 * we really need to do is sync out the log to make all of our meta-data
206 * updates permanent (except for timestamps). For calls from pflushd(),
207 * dirty pages are kept moving by calling pdflush() on the inodes
208 * containing them. We also flush the inodes that we can lock without
209 * sleeping and the superblock if we can lock it without sleeping from
210 * vfs_sync() so that items at the tail of the log are always moving out.
211 *
212 * Flags:
213 * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
214 * to sleep if we can help it. All we really need
215 * to do is ensure that the log is synced at least
216 * periodically. We also push the inodes and
217 * superblock if we can lock them without sleeping
218 * and they are not pinned.
219 * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
220 * set, then we really want to lock each inode and flush
221 * it.
222 * SYNC_WAIT - All the flushes that take place in this call should
223 * be synchronous.
224 * SYNC_DELWRI - This tells us to push dirty pages associated with
225 * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
226 * determine if they should be flushed sync, async, or
227 * delwri.
228 * SYNC_CLOSE - This flag is passed when the system is being
229 * unmounted. We should sync and invalidate everything.
230 * SYNC_FSDATA - This indicates that the caller would like to make
231 * sure the superblock is safe on disk. We can ensure
232 * this by simply making sure the log gets flushed
233 * if SYNC_BDFLUSH is set, and by actually writing it
234 * out otherwise.
235 * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
236 * before we return (including direct I/O). Forms the drain
237 * side of the write barrier needed to safely quiesce the
238 * filesystem.
239 *
240 */
241int
242xfs_sync(
243 xfs_mount_t *mp,
244 int flags)
245{
246 int error;
247
248 /*
249 * Get the Quota Manager to flush the dquots.
250 *
251 * If XFS quota support is not enabled or this filesystem
252 * instance does not use quotas XFS_QM_DQSYNC will always
253 * return zero.
254 */
255 error = XFS_QM_DQSYNC(mp, flags);
256 if (error) {
257 /*
258 * If we got an IO error, we will be shutting down.
259 * So, there's nothing more for us to do here.
260 */
261 ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
262 if (XFS_FORCED_SHUTDOWN(mp))
263 return XFS_ERROR(error);
264 }
265
266 if (flags & SYNC_IOWAIT)
267 xfs_filestream_flush(mp);
268
269 return xfs_syncsub(mp, flags, NULL);
270}
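Hypothetical illustrations of the flag combinations documented above; these callers are sketches, not code from this tree. A periodic sync avoids sleeping, while a freeze-style sync waits for everything, including direct I/O:

	/* periodic sync: keep the log and superblock moving, don't sleep */
	error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);

	/* freeze/quiesce: flush dirty data and wait for all I/O to drain */
	error = xfs_sync(mp, SYNC_DELWRI | SYNC_WAIT | SYNC_IOWAIT);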
271
272/*
273 * xfs sync routine for internal use
274 *
275 * This routine supports all of the flags defined for the generic vfs_sync
276 * interface as explained above under xfs_sync.
277 *
278 */
279int
280xfs_sync_inodes(
281 xfs_mount_t *mp,
282 int flags,
283 int *bypassed)
284{
285 xfs_inode_t *ip = NULL;
286 struct inode *vp = NULL;
287 int error;
288 int last_error;
289 uint64_t fflag;
290 uint lock_flags;
291 uint base_lock_flags;
292 boolean_t mount_locked;
293 boolean_t vnode_refed;
294 int preempt;
295 xfs_iptr_t *ipointer;
296#ifdef DEBUG
297 boolean_t ipointer_in = B_FALSE;
298
299#define IPOINTER_SET ipointer_in = B_TRUE
300#define IPOINTER_CLR ipointer_in = B_FALSE
301#else
302#define IPOINTER_SET
303#define IPOINTER_CLR
304#endif
305
306
307/* Insert a marker record into the inode list after inode ip. The list
308 * must be locked when this is called. After the call the list will no
309 * longer be locked.
310 */
311#define IPOINTER_INSERT(ip, mp) { \
312 ASSERT(ipointer_in == B_FALSE); \
313 ipointer->ip_mnext = ip->i_mnext; \
314 ipointer->ip_mprev = ip; \
315 ip->i_mnext = (xfs_inode_t *)ipointer; \
316 ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
317 preempt = 0; \
318 XFS_MOUNT_IUNLOCK(mp); \
319 mount_locked = B_FALSE; \
320 IPOINTER_SET; \
321 }
322
323/* Remove the marker from the inode list. If the marker was the only item
324 * in the list then there are no remaining inodes and we should zero out
325 * the whole list. If we are the current head of the list then move the head
326 * past us.
327 */
328#define IPOINTER_REMOVE(ip, mp) { \
329 ASSERT(ipointer_in == B_TRUE); \
330 if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
331 ip = ipointer->ip_mnext; \
332 ip->i_mprev = ipointer->ip_mprev; \
333 ipointer->ip_mprev->i_mnext = ip; \
334 if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
335 mp->m_inodes = ip; \
336 } \
337 } else { \
338 ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
339 mp->m_inodes = NULL; \
340 ip = NULL; \
341 } \
342 IPOINTER_CLR; \
343 }
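The IPOINTER macros above implement a classic marker technique: park a dummy record in the list before dropping the lock, then resume the walk from the marker. A generic sketch of the same idea using a standard list_head, for illustration only (the XFS code uses its own i_mnext/i_mprev links, and a reference on the entry being worked on is assumed):

	#include <linux/list.h>
	#include <linux/spinlock.h>

	static void
	example_marker_walk(struct list_head *head, spinlock_t *lock)
	{
		struct list_head	marker;
		struct list_head	*pos;

		spin_lock(lock);
		pos = head->next;
		while (pos != head) {
			/* park a marker after the current entry */
			list_add(&marker, pos);
			spin_unlock(lock);

			/* ... sleeping work on the entry at 'pos' ... */

			spin_lock(lock);
			pos = marker.next;	/* resume after the marker */
			list_del(&marker);
		}
		spin_unlock(lock);
	}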
344
345#define XFS_PREEMPT_MASK 0x7f
346
347 ASSERT(!(flags & SYNC_BDFLUSH));
348
349 if (bypassed)
350 *bypassed = 0;
351 if (mp->m_flags & XFS_MOUNT_RDONLY)
352 return 0;
353 error = 0;
354 last_error = 0;
355 preempt = 0;
356
357 /* Allocate a reference marker */
358 ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
359
360 fflag = XFS_B_ASYNC; /* default is don't wait */
361 if (flags & SYNC_DELWRI)
362 fflag = XFS_B_DELWRI;
363 if (flags & SYNC_WAIT)
364 fflag = 0; /* synchronous overrides all */
365
366 base_lock_flags = XFS_ILOCK_SHARED;
367 if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
368 /*
369 * We need the I/O lock if we're going to call any of
370 * the flush/inval routines.
371 */
372 base_lock_flags |= XFS_IOLOCK_SHARED;
373 }
374
375 XFS_MOUNT_ILOCK(mp);
376
377 ip = mp->m_inodes;
378
379 mount_locked = B_TRUE;
380 vnode_refed = B_FALSE;
381
382 IPOINTER_CLR;
383
384 do {
385 ASSERT(ipointer_in == B_FALSE);
386 ASSERT(vnode_refed == B_FALSE);
387
388 lock_flags = base_lock_flags;
389
390 /*
391 * There were no inodes in the list, just break out
392 * of the loop.
393 */
394 if (ip == NULL) {
395 break;
396 }
397
398 /*
399 * We found another sync thread marker - skip it
400 */
401 if (ip->i_mount == NULL) {
402 ip = ip->i_mnext;
403 continue;
404 }
405
406 vp = VFS_I(ip);
407
408 /*
409 * If the vnode is gone then this inode is being torn down;
410 * call reclaim if it is flushed, else let the regular flush
411 * code deal with it later in the loop.
412 */
413
414 if (vp == NULL) {
415 /* Skip ones already in reclaim */
416 if (ip->i_flags & XFS_IRECLAIM) {
417 ip = ip->i_mnext;
418 continue;
419 }
420 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
421 ip = ip->i_mnext;
422 } else if ((xfs_ipincount(ip) == 0) &&
423 xfs_iflock_nowait(ip)) {
424 IPOINTER_INSERT(ip, mp);
425
426 xfs_finish_reclaim(ip, 1,
427 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
428
429 XFS_MOUNT_ILOCK(mp);
430 mount_locked = B_TRUE;
431 IPOINTER_REMOVE(ip, mp);
432 } else {
433 xfs_iunlock(ip, XFS_ILOCK_EXCL);
434 ip = ip->i_mnext;
435 }
436 continue;
437 }
438
439 if (VN_BAD(vp)) {
440 ip = ip->i_mnext;
441 continue;
442 }
443
444 if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
445 XFS_MOUNT_IUNLOCK(mp);
446 kmem_free(ipointer);
447 return 0;
448 }
449
450 /*
451 * Try to lock without sleeping. We're out of order with
452 * the inode list lock here, so if we fail we need to drop
453 * the mount lock and try again. If we're called from
454 * bdflush() here, then don't bother.
455 *
456 * The inode lock here actually coordinates with the
457 * almost spurious inode lock in xfs_ireclaim() to prevent
458 * the vnode we handle here without a reference from
459 * being freed while we reference it. If we lock the inode
460 * while it's on the mount list here, then the spurious inode
461 * lock in xfs_ireclaim() after the inode is pulled from
462 * the mount list will sleep until we release it here.
463 * This keeps the vnode from being freed while we reference
464 * it.
465 */
466 if (xfs_ilock_nowait(ip, lock_flags) == 0) {
467 if (vp == NULL) {
468 ip = ip->i_mnext;
469 continue;
470 }
471
472 vp = vn_grab(vp);
473 if (vp == NULL) {
474 ip = ip->i_mnext;
475 continue;
476 }
477
478 IPOINTER_INSERT(ip, mp);
479 xfs_ilock(ip, lock_flags);
480
481 ASSERT(vp == VFS_I(ip));
482 ASSERT(ip->i_mount == mp);
483
484 vnode_refed = B_TRUE;
485 }
486
487 /* From here on in the loop we may have a marker record
488 * in the inode list.
489 */
490
491 /*
492 * If we have to flush data or wait for I/O completion
493 * we need to drop the ilock that we currently hold.
494 * If we need to drop the lock, insert a marker if we
495 * have not already done so.
496 */
497 if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
498 ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
499 if (mount_locked) {
500 IPOINTER_INSERT(ip, mp);
501 }
502 xfs_iunlock(ip, XFS_ILOCK_SHARED);
503
504 if (flags & SYNC_CLOSE) {
505 /* Shutdown case. Flush and invalidate. */
506 if (XFS_FORCED_SHUTDOWN(mp))
507 xfs_tosspages(ip, 0, -1,
508 FI_REMAPF);
509 else
510 error = xfs_flushinval_pages(ip,
511 0, -1, FI_REMAPF);
512 } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
513 error = xfs_flush_pages(ip, 0,
514 -1, fflag, FI_NONE);
515 }
516
517 /*
518 * When freezing, we need to wait for all I/O (including direct
519 * I/O) to complete to ensure no further data modification can take
520 * place after this point.
521 */
522 if (flags & SYNC_IOWAIT)
523 vn_iowait(ip);
524
525 xfs_ilock(ip, XFS_ILOCK_SHARED);
526 }
527
528 if ((flags & SYNC_ATTR) &&
529 (ip->i_update_core ||
530 (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
531 if (mount_locked)
532 IPOINTER_INSERT(ip, mp);
533
534 if (flags & SYNC_WAIT) {
535 xfs_iflock(ip);
536 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
537
538 /*
539 * If we can't acquire the flush lock, then the inode
540 * is already being flushed so don't bother waiting.
541 *
542 * If we can lock it then do a delwri flush so we can
543 * combine multiple inode flushes in each disk write.
544 */
545 } else if (xfs_iflock_nowait(ip)) {
546 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
547 } else if (bypassed) {
548 (*bypassed)++;
549 }
550 }
551
552 if (lock_flags != 0) {
553 xfs_iunlock(ip, lock_flags);
554 }
555
556 if (vnode_refed) {
557 /*
558 * If we had to take a reference on the vnode
559 * above, then wait until after we've unlocked
560 * the inode to release the reference. This is
561 * because we can be already holding the inode
562 * lock when IRELE() calls xfs_inactive().
563 *
564 * Make sure to drop the mount lock before calling
565 * IRELE() so that we don't trip over ourselves if
566 * we have to go for the mount lock again in the
567 * inactive code.
568 */
569 if (mount_locked) {
570 IPOINTER_INSERT(ip, mp);
571 }
572
573 IRELE(ip);
574
575 vnode_refed = B_FALSE;
576 }
577
578 if (error) {
579 last_error = error;
580 }
581
582 /*
583 * bail out if the filesystem is corrupted.
584 */
585 if (error == EFSCORRUPTED) {
586 if (!mount_locked) {
587 XFS_MOUNT_ILOCK(mp);
588 IPOINTER_REMOVE(ip, mp);
589 }
590 XFS_MOUNT_IUNLOCK(mp);
591 ASSERT(ipointer_in == B_FALSE);
592 kmem_free(ipointer);
593 return XFS_ERROR(error);
594 }
595
596 /* Let other threads have a chance at the mount lock
597 * if we have looped many times without dropping the
598 * lock.
599 */
600 if ((++preempt & XFS_PREEMPT_MASK) == 0) {
601 if (mount_locked) {
602 IPOINTER_INSERT(ip, mp);
603 }
604 }
605
606 if (mount_locked == B_FALSE) {
607 XFS_MOUNT_ILOCK(mp);
608 mount_locked = B_TRUE;
609 IPOINTER_REMOVE(ip, mp);
610 continue;
611 }
612
613 ASSERT(ipointer_in == B_FALSE);
614 ip = ip->i_mnext;
615
616 } while (ip != mp->m_inodes);
617
618 XFS_MOUNT_IUNLOCK(mp);
619
620 ASSERT(ipointer_in == B_FALSE);
621
622 kmem_free(ipointer);
623 return XFS_ERROR(last_error);
624}
625
626/*
627 * xfs sync routine for internal use
628 *
629 * This routine supports all of the flags defined for the generic vfs_sync
630 * interface as explained above under xfs_sync.
631 *
632 */
633int
634xfs_syncsub(
635 xfs_mount_t *mp,
636 int flags,
637 int *bypassed)
638{
639 int error = 0;
640 int last_error = 0;
641 uint log_flags = XFS_LOG_FORCE;
642 xfs_buf_t *bp;
643 xfs_buf_log_item_t *bip;
644
645 /*
646 * Sync out the log. This ensures that the log is periodically
647 * flushed even if there is not enough activity to fill it up.
648 */
649 if (flags & SYNC_WAIT)
650 log_flags |= XFS_LOG_SYNC;
651
652 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
653
654 if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
655 if (flags & SYNC_BDFLUSH)
656 xfs_finish_reclaim_all(mp, 1);
657 else
658 error = xfs_sync_inodes(mp, flags, bypassed);
659 }
660
661 /*
662 * Flushing out dirty data above probably generated more
663 * log activity, so if this isn't vfs_sync() then flush
664 * the log again.
665 */
666 if (flags & SYNC_DELWRI) {
667 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
668 }
669
670 if (flags & SYNC_FSDATA) {
671 /*
672 * If this is vfs_sync() then only sync the superblock
673 * if we can lock it without sleeping and it is not pinned.
674 */
675 if (flags & SYNC_BDFLUSH) {
676 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
677 if (bp != NULL) {
678 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
679 if ((bip != NULL) &&
680 xfs_buf_item_dirty(bip)) {
681 if (!(XFS_BUF_ISPINNED(bp))) {
682 XFS_BUF_ASYNC(bp);
683 error = xfs_bwrite(mp, bp);
684 } else {
685 xfs_buf_relse(bp);
686 }
687 } else {
688 xfs_buf_relse(bp);
689 }
690 }
691 } else {
692 bp = xfs_getsb(mp, 0);
693 /*
694 * If the buffer is pinned then push on the log so
695 * we won't get stuck waiting in the write for
696 * someone, maybe ourselves, to flush the log.
697 * Even though we just pushed the log above, we
698 * did not have the superblock buffer locked at
699 * that point so it can become pinned in between
700 * there and here.
701 */
702 if (XFS_BUF_ISPINNED(bp))
703 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
704 if (flags & SYNC_WAIT)
705 XFS_BUF_UNASYNC(bp);
706 else
707 XFS_BUF_ASYNC(bp);
708 error = xfs_bwrite(mp, bp);
709 }
710 if (error) {
711 last_error = error;
712 }
713 }
714
715 /*
716 * Now check to see if the log needs a "dummy" transaction.
717 */
718 if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
719 xfs_trans_t *tp;
720 xfs_inode_t *ip;
721
722 /*
723 * Put a dummy transaction in the log to tell
724 * recovery that all others are OK.
725 */
726 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
727 if ((error = xfs_trans_reserve(tp, 0,
728 XFS_ICHANGE_LOG_RES(mp),
729 0, 0, 0))) {
730 xfs_trans_cancel(tp, 0);
731 return error;
732 }
733
734 ip = mp->m_rootip;
735 xfs_ilock(ip, XFS_ILOCK_EXCL);
736
737 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
738 xfs_trans_ihold(tp, ip);
739 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
740 error = xfs_trans_commit(tp, 0);
741 xfs_iunlock(ip, XFS_ILOCK_EXCL);
742 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
743 }
744
745 /*
746 * When shutting down, we need to ensure that the AIL is pushed
747 * to disk or the filesystem can appear corrupt from the PROM.
748 */
749 if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
750 XFS_bflush(mp->m_ddev_targp);
751 if (mp->m_rtdev_targp) {
752 XFS_bflush(mp->m_rtdev_targp);
753 }
754 }
755
756 return XFS_ERROR(last_error);
757}
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h
deleted file mode 100644
index a74b05087da4..000000000000
--- a/fs/xfs/xfs_vfsops.h
+++ /dev/null
@@ -1,16 +0,0 @@
1#ifndef _XFS_VFSOPS_H
2#define _XFS_VFSOPS_H 1
3
4struct cred;
5struct xfs_fid;
6struct inode;
7struct kstatfs;
8struct xfs_mount;
9struct xfs_mount_args;
10
11int xfs_sync(struct xfs_mount *mp, int flags);
12void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
13 int lnnum);
14void xfs_attr_quiesce(struct xfs_mount *mp);
15
16#endif /* _XFS_VFSOPS_H */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8b6812f66a15..f07bf8768c3a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -54,33 +54,10 @@
54#include "xfs_vnodeops.h" 54#include "xfs_vnodeops.h"
55 55
56int 56int
57xfs_open(
58 xfs_inode_t *ip)
59{
60 int mode;
61
62 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
63 return XFS_ERROR(EIO);
64
65 /*
66 * If it's a directory with any blocks, read-ahead block 0
67 * as we're almost certain to have the next operation be a read there.
68 */
69 if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
70 mode = xfs_ilock_map_shared(ip);
71 if (ip->i_d.di_nextents > 0)
72 (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
73 xfs_iunlock(ip, mode);
74 }
75 return 0;
76}
77
78int
79xfs_setattr( 57xfs_setattr(
80 struct xfs_inode *ip, 58 struct xfs_inode *ip,
81 struct iattr *iattr, 59 struct iattr *iattr,
82 int flags, 60 int flags)
83 cred_t *credp)
84{ 61{
85 xfs_mount_t *mp = ip->i_mount; 62 xfs_mount_t *mp = ip->i_mount;
86 struct inode *inode = VFS_I(ip); 63 struct inode *inode = VFS_I(ip);
@@ -93,7 +70,6 @@ xfs_setattr(
93 gid_t gid=0, igid=0; 70 gid_t gid=0, igid=0;
94 int timeflags = 0; 71 int timeflags = 0;
95 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 72 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
96 int file_owner;
97 int need_iolock = 1; 73 int need_iolock = 1;
98 74
99 xfs_itrace_entry(ip); 75 xfs_itrace_entry(ip);
@@ -104,6 +80,10 @@ xfs_setattr(
104 if (XFS_FORCED_SHUTDOWN(mp)) 80 if (XFS_FORCED_SHUTDOWN(mp))
105 return XFS_ERROR(EIO); 81 return XFS_ERROR(EIO);
106 82
83 code = -inode_change_ok(inode, iattr);
84 if (code)
85 return code;
86
107 olddquot1 = olddquot2 = NULL; 87 olddquot1 = olddquot2 = NULL;
108 udqp = gdqp = NULL; 88 udqp = gdqp = NULL;
109 89
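The new inode_change_ok() call above moves the generic ownership and permission checks, previously open-coded further down in xfs_setattr(), into a single VFS helper that runs before any XFS-specific work. A condensed, hypothetical view of the resulting division of labor; inode_change_ok() returns a negative errno, which is negated into XFS's positive error convention:

	STATIC int
	example_setattr(
		struct xfs_inode	*ip,
		struct iattr		*iattr)
	{
		int	code;

		/* generic VFS validation: owner, mode and uid/gid rules */
		code = -inode_change_ok(VFS_I(ip), iattr);
		if (code)
			return code;	/* positive XFS-style errno */

		/* ... XFS-specific work: quotas, transactions, size ... */
		return 0;
	}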
@@ -181,62 +161,8 @@ xfs_setattr(
181 161
182 xfs_ilock(ip, lock_flags); 162 xfs_ilock(ip, lock_flags);
183 163
184 /* boolean: are we the file owner? */
185 file_owner = (current_fsuid() == ip->i_d.di_uid);
186
187 /*
188 * Change various properties of a file.
189 * Only the owner or users with CAP_FOWNER
190 * capability may do these things.
191 */
192 if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
193 /*
194 * CAP_FOWNER overrides the following restrictions:
195 *
196 * The user ID of the calling process must be equal
197 * to the file owner ID, except in cases where the
198 * CAP_FSETID capability is applicable.
199 */
200 if (!file_owner && !capable(CAP_FOWNER)) {
201 code = XFS_ERROR(EPERM);
202 goto error_return;
203 }
204
205 /*
206 * CAP_FSETID overrides the following restrictions:
207 *
208 * The effective user ID of the calling process shall match
209 * the file owner when setting the set-user-ID and
210 * set-group-ID bits on that file.
211 *
212 * The effective group ID or one of the supplementary group
213 * IDs of the calling process shall match the group owner of
214 * the file when setting the set-group-ID bit on that file
215 */
216 if (mask & ATTR_MODE) {
217 mode_t m = 0;
218
219 if ((iattr->ia_mode & S_ISUID) && !file_owner)
220 m |= S_ISUID;
221 if ((iattr->ia_mode & S_ISGID) &&
222 !in_group_p((gid_t)ip->i_d.di_gid))
223 m |= S_ISGID;
224#if 0
225 /* Linux allows this, Irix doesn't. */
226 if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
227 m |= S_ISVTX;
228#endif
229 if (m && !capable(CAP_FSETID))
230 iattr->ia_mode &= ~m;
231 }
232 }
233
234 /* 164 /*
235 * Change file ownership. Must be the owner or privileged. 165 * Change file ownership. Must be the owner or privileged.
236 * If the system was configured with the "restricted_chown"
237 * option, the owner is not permitted to give away the file,
238 * and can change the group id only to a group of which he
239 * or she is a member.
240 */ 166 */
241 if (mask & (ATTR_UID|ATTR_GID)) { 167 if (mask & (ATTR_UID|ATTR_GID)) {
242 /* 168 /*
@@ -251,23 +177,6 @@ xfs_setattr(
251 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; 177 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
252 178
253 /* 179 /*
254 * CAP_CHOWN overrides the following restrictions:
255 *
256 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
257 * shall override the restriction that a process cannot
258 * change the user ID of a file it owns and the restriction
259 * that the group ID supplied to the chown() function
260 * shall be equal to either the group ID or one of the
261 * supplementary group IDs of the calling process.
262 */
263 if (restricted_chown &&
264 (iuid != uid || (igid != gid &&
265 !in_group_p((gid_t)gid))) &&
266 !capable(CAP_CHOWN)) {
267 code = XFS_ERROR(EPERM);
268 goto error_return;
269 }
270 /*
271 * Do a quota reservation only if uid/gid is actually 180 * Do a quota reservation only if uid/gid is actually
272 * going to change. 181 * going to change.
273 */ 182 */
@@ -304,36 +213,22 @@ xfs_setattr(
304 code = XFS_ERROR(EINVAL); 213 code = XFS_ERROR(EINVAL);
305 goto error_return; 214 goto error_return;
306 } 215 }
216
307 /* 217 /*
308 * Make sure that the dquots are attached to the inode. 218 * Make sure that the dquots are attached to the inode.
309 */ 219 */
310 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED))) 220 code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
221 if (code)
311 goto error_return; 222 goto error_return;
312 }
313
314 /*
315 * Change file access or modified times.
316 */
317 if (mask & (ATTR_ATIME|ATTR_MTIME)) {
318 if (!file_owner) {
319 if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
320 !capable(CAP_FOWNER)) {
321 code = XFS_ERROR(EPERM);
322 goto error_return;
323 }
324 }
325 }
326 223
327 /* 224 /*
328 * Now we can make the changes. Before we join the inode 225 * Now we can make the changes. Before we join the inode
329 * to the transaction, if ATTR_SIZE is set then take care of 226 * to the transaction, if ATTR_SIZE is set then take care of
330 * the part of the truncation that must be done without the 227 * the part of the truncation that must be done without the
331 * inode lock. This needs to be done before joining the inode 228 * inode lock. This needs to be done before joining the inode
332 * to the transaction, because the inode cannot be unlocked 229 * to the transaction, because the inode cannot be unlocked
333 * once it is a part of the transaction. 230 * once it is a part of the transaction.
334 */ 231 */
335 if (mask & ATTR_SIZE) {
336 code = 0;
337 if (iattr->ia_size > ip->i_size) { 232 if (iattr->ia_size > ip->i_size) {
338 /* 233 /*
339 * Do the first part of growing a file: zero any data 234 * Do the first part of growing a file: zero any data
@@ -366,7 +261,7 @@ xfs_setattr(
366 } 261 }
367 262
368 /* wait for all I/O to complete */ 263 /* wait for all I/O to complete */
369 vn_iowait(ip); 264 xfs_ioend_wait(ip);
370 265
371 if (!code) 266 if (!code)
372 code = xfs_itruncate_data(ip, iattr->ia_size); 267 code = xfs_itruncate_data(ip, iattr->ia_size);
@@ -388,17 +283,10 @@ xfs_setattr(
388 } 283 }
389 commit_flags = XFS_TRANS_RELEASE_LOG_RES; 284 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
390 xfs_ilock(ip, XFS_ILOCK_EXCL); 285 xfs_ilock(ip, XFS_ILOCK_EXCL);
391 }
392 286
393 if (tp) {
394 xfs_trans_ijoin(tp, ip, lock_flags); 287 xfs_trans_ijoin(tp, ip, lock_flags);
395 xfs_trans_ihold(tp, ip); 288 xfs_trans_ihold(tp, ip);
396 }
397 289
398 /*
399 * Truncate file. Must have write permission and not be a directory.
400 */
401 if (mask & ATTR_SIZE) {
402 /* 290 /*
403 * Only change the c/mtime if we are changing the size 291 * Only change the c/mtime if we are changing the size
404 * or we are explicitly asked to change it. This handles 292 * or we are explicitly asked to change it. This handles
@@ -438,28 +326,13 @@ xfs_setattr(
438 */ 326 */
439 xfs_iflags_set(ip, XFS_ITRUNCATED); 327 xfs_iflags_set(ip, XFS_ITRUNCATED);
440 } 328 }
441 } 329 } else if (tp) {
442 330 xfs_trans_ijoin(tp, ip, lock_flags);
443 /* 331 xfs_trans_ihold(tp, ip);
444 * Change file access modes.
445 */
446 if (mask & ATTR_MODE) {
447 ip->i_d.di_mode &= S_IFMT;
448 ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
449
450 inode->i_mode &= S_IFMT;
451 inode->i_mode |= iattr->ia_mode & ~S_IFMT;
452
453 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
454 timeflags |= XFS_ICHGTIME_CHG;
455 } 332 }
456 333
457 /* 334 /*
458 * Change file ownership. Must be the owner or privileged. 335 * Change file ownership. Must be the owner or privileged.
459 * If the system was configured with the "restricted_chown"
460 * option, the owner is not permitted to give away the file,
461 * and can change the group id only to a group of which he
462 * or she is a member.
463 */ 336 */
464 if (mask & (ATTR_UID|ATTR_GID)) { 337 if (mask & (ATTR_UID|ATTR_GID)) {
465 /* 338 /*
@@ -503,6 +376,24 @@ xfs_setattr(
503 timeflags |= XFS_ICHGTIME_CHG; 376 timeflags |= XFS_ICHGTIME_CHG;
504 } 377 }
505 378
379 /*
380 * Change file access modes.
381 */
382 if (mask & ATTR_MODE) {
383 umode_t mode = iattr->ia_mode;
384
385 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
386 mode &= ~S_ISGID;
387
388 ip->i_d.di_mode &= S_IFMT;
389 ip->i_d.di_mode |= mode & ~S_IFMT;
390
391 inode->i_mode &= S_IFMT;
392 inode->i_mode |= mode & ~S_IFMT;
393
394 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
395 timeflags |= XFS_ICHGTIME_CHG;
396 }
506 397
507 /* 398 /*
508 * Change file access or modified times. 399 * Change file access or modified times.
@@ -713,7 +604,7 @@ xfs_fsync(
713 return XFS_ERROR(EIO); 604 return XFS_ERROR(EIO);
714 605
715 /* capture size updates in I/O completion before writing the inode. */ 606 /* capture size updates in I/O completion before writing the inode. */
716 error = filemap_fdatawait(VFS_I(ip)->i_mapping); 607 error = xfs_wait_on_pages(ip, 0, -1);
717 if (error) 608 if (error)
718 return XFS_ERROR(error); 609 return XFS_ERROR(error);
719 610
@@ -1029,6 +920,12 @@ xfs_inactive_symlink_rmt(
1029 goto error0; 920 goto error0;
1030 } 921 }
1031 /* 922 /*
923 * transaction commit worked ok so we can drop the extra ticket
924 * reference that we gained in xfs_trans_dup()
925 */
926 xfs_log_ticket_put(tp->t_ticket);
927
928 /*
1032 * Remove the memory for extent descriptions (just bookkeeping). 929 * Remove the memory for extent descriptions (just bookkeeping).
1033 */ 930 */
1034 if (ip->i_df.if_bytes) 931 if (ip->i_df.if_bytes)
@@ -1625,8 +1522,6 @@ xfs_create(
1625 xfs_trans_set_sync(tp); 1522 xfs_trans_set_sync(tp);
1626 } 1523 }
1627 1524
1628 dp->i_gen++;
1629
1630 /* 1525 /*
1631 * Attach the dquot(s) to the inodes and modify them incore. 1526 * Attach the dquot(s) to the inodes and modify them incore.
1632 * These ids of the inode couldn't have changed since the new 1527 * These ids of the inode couldn't have changed since the new
@@ -1993,13 +1888,6 @@ xfs_remove(
1993 } 1888 }
1994 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1889 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1995 1890
1996 /*
1997 * Bump the in memory generation count on the parent
1998 * directory so that other can know that it has changed.
1999 */
2000 dp->i_gen++;
2001 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2002
2003 if (is_dir) { 1891 if (is_dir) {
2004 /* 1892 /*
2005 * Drop the link from ip's "..". 1893 * Drop the link from ip's "..".
@@ -2009,7 +1897,7 @@ xfs_remove(
2009 goto out_bmap_cancel; 1897 goto out_bmap_cancel;
2010 1898
2011 /* 1899 /*
2012 * Drop the link from dp to ip. 1900 * Drop the "." link from ip to self.
2013 */ 1901 */
2014 error = xfs_droplink(tp, ip); 1902 error = xfs_droplink(tp, ip);
2015 if (error) 1903 if (error)
@@ -2017,14 +1905,14 @@ xfs_remove(
2017 } else { 1905 } else {
2018 /* 1906 /*
2019 * When removing a non-directory we need to log the parent 1907 * When removing a non-directory we need to log the parent
2020 * inode here for the i_gen update. For a directory this is 1908 * inode here. For a directory this is done implicitly
2021 * done implicitly by the xfs_droplink call for the ".." entry. 1909 * by the xfs_droplink call for the ".." entry.
2022 */ 1910 */
2023 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1911 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2024 } 1912 }
2025 1913
2026 /* 1914 /*
2027 * Drop the "." link from ip to self. 1915 * Drop the link from dp to ip.
2028 */ 1916 */
2029 error = xfs_droplink(tp, ip); 1917 error = xfs_droplink(tp, ip);
2030 if (error) 1918 if (error)
@@ -2178,7 +2066,6 @@ xfs_link(
2178 if (error) 2066 if (error)
2179 goto abort_return; 2067 goto abort_return;
2180 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2068 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2181 tdp->i_gen++;
2182 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 2069 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2183 2070
2184 error = xfs_bumplink(tp, sip); 2071 error = xfs_bumplink(tp, sip);
@@ -2355,18 +2242,10 @@ xfs_mkdir(
2355 } 2242 }
2356 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2243 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2357 2244
2358 /*
2359 * Bump the in memory version number of the parent directory
2360 * so that other processes accessing it will recognize that
2361 * the directory has changed.
2362 */
2363 dp->i_gen++;
2364
2365 error = xfs_dir_init(tp, cdp, dp); 2245 error = xfs_dir_init(tp, cdp, dp);
2366 if (error) 2246 if (error)
2367 goto error2; 2247 goto error2;
2368 2248
2369 cdp->i_gen = 1;
2370 error = xfs_bumplink(tp, dp); 2249 error = xfs_bumplink(tp, dp);
2371 if (error) 2250 if (error)
2372 goto error2; 2251 goto error2;
@@ -2653,13 +2532,6 @@ xfs_symlink(
2653 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2532 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2654 2533
2655 /* 2534 /*
2656 * Bump the in memory version number of the parent directory
2657 * so that other processes accessing it will recognize that
2658 * the directory has changed.
2659 */
2660 dp->i_gen++;
2661
2662 /*
2663 * If this is a synchronous mount, make sure that the 2535 * If this is a synchronous mount, make sure that the
2664 * symlink transaction goes to disk before returning to 2536 * symlink transaction goes to disk before returning to
2665 * the user. 2537 * the user.
@@ -2809,7 +2681,7 @@ xfs_reclaim(
2809 return 0; 2681 return 0;
2810 } 2682 }
2811 2683
2812 vn_iowait(ip); 2684 xfs_ioend_wait(ip);
2813 2685
2814 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); 2686 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2815 2687
@@ -2833,122 +2705,10 @@ xfs_reclaim(
2833 if (!ip->i_update_core && (ip->i_itemp == NULL)) { 2705 if (!ip->i_update_core && (ip->i_itemp == NULL)) {
2834 xfs_ilock(ip, XFS_ILOCK_EXCL); 2706 xfs_ilock(ip, XFS_ILOCK_EXCL);
2835 xfs_iflock(ip); 2707 xfs_iflock(ip);
2836 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); 2708 xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2837 } else { 2709 return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
2838 xfs_mount_t *mp = ip->i_mount;
2839
2840 /* Protect sync and unpin from us */
2841 XFS_MOUNT_ILOCK(mp);
2842 spin_lock(&ip->i_flags_lock);
2843 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2844 VFS_I(ip)->i_private = NULL;
2845 ip->i_vnode = NULL;
2846 spin_unlock(&ip->i_flags_lock);
2847 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
2848 XFS_MOUNT_IUNLOCK(mp);
2849 }
2850 return 0;
2851}
2852
2853int
2854xfs_finish_reclaim(
2855 xfs_inode_t *ip,
2856 int locked,
2857 int sync_mode)
2858{
2859 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
2860 struct inode *vp = VFS_I(ip);
2861
2862 if (vp && VN_BAD(vp))
2863 goto reclaim;
2864
2865 /* The hash lock here protects a thread in xfs_iget_core from
2866 * racing with us on linking the inode back with a vnode.
2867 * Once we have the XFS_IRECLAIM flag set it will not touch
2868 * us.
2869 */
2870 write_lock(&pag->pag_ici_lock);
2871 spin_lock(&ip->i_flags_lock);
2872 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
2873 (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
2874 spin_unlock(&ip->i_flags_lock);
2875 write_unlock(&pag->pag_ici_lock);
2876 if (locked) {
2877 xfs_ifunlock(ip);
2878 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2879 }
2880 return 1;
2881 }
2882 __xfs_iflags_set(ip, XFS_IRECLAIM);
2883 spin_unlock(&ip->i_flags_lock);
2884 write_unlock(&pag->pag_ici_lock);
2885 xfs_put_perag(ip->i_mount, pag);
2886
2887 /*
2888 * If the inode is still dirty, then flush it out. If the inode
2889 * is not in the AIL, then it will be OK to flush it delwri as
2890 * long as xfs_iflush() does not keep any references to the inode.
2891 * We leave that decision up to xfs_iflush() since it has the
2892 * knowledge of whether it's OK to simply do a delwri flush of
2893 * the inode or whether we need to wait until the inode is
2894 * pulled from the AIL.
2895 * We get the flush lock regardless, though, just to make sure
2896 * we don't free it while it is being flushed.
2897 */
2898 if (!locked) {
2899 xfs_ilock(ip, XFS_ILOCK_EXCL);
2900 xfs_iflock(ip);
2901 } 2710 }
2902 2711 xfs_inode_set_reclaim_tag(ip);
2903 /*
2904 * In the case of a forced shutdown we rely on xfs_iflush() to
2905 * wait for the inode to be unpinned before returning an error.
2906 */
2907 if (xfs_iflush(ip, sync_mode) == 0) {
2908 /* synchronize with xfs_iflush_done */
2909 xfs_iflock(ip);
2910 xfs_ifunlock(ip);
2911 }
2912
2913 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2914
2915 reclaim:
2916 xfs_ireclaim(ip);
2917 return 0;
2918}
2919
2920int
2921xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
2922{
2923 int purged;
2924 xfs_inode_t *ip, *n;
2925 int done = 0;
2926
2927 while (!done) {
2928 purged = 0;
2929 XFS_MOUNT_ILOCK(mp);
2930 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
2931 if (noblock) {
2932 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
2933 continue;
2934 if (xfs_ipincount(ip) ||
2935 !xfs_iflock_nowait(ip)) {
2936 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2937 continue;
2938 }
2939 }
2940 XFS_MOUNT_IUNLOCK(mp);
2941 if (xfs_finish_reclaim(ip, noblock,
2942 XFS_IFLUSH_DELWRI_ELSE_ASYNC))
2943 delay(1);
2944 purged = 1;
2945 break;
2946 }
2947
2948 done = !purged;
2949 }
2950
2951 XFS_MOUNT_IUNLOCK(mp);
2952 return 0; 2712 return 0;
2953} 2713}
2954 2714
@@ -3197,6 +2957,8 @@ xfs_zero_remaining_bytes(
3197 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 2957 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3198 XFS_IS_REALTIME_INODE(ip) ? 2958 XFS_IS_REALTIME_INODE(ip) ?
3199 mp->m_rtdev_targp : mp->m_ddev_targp); 2959 mp->m_rtdev_targp : mp->m_ddev_targp);
2960 if (!bp)
2961 return XFS_ERROR(ENOMEM);
3200 2962
3201 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 2963 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3202 offset_fsb = XFS_B_TO_FSBT(mp, offset); 2964 offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -3312,7 +3074,8 @@ xfs_free_file_space(
3312 need_iolock = 0; 3074 need_iolock = 0;
3313 if (need_iolock) { 3075 if (need_iolock) {
3314 xfs_ilock(ip, XFS_IOLOCK_EXCL); 3076 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3315 vn_iowait(ip); /* wait for the completion of any pending DIOs */ 3077 /* wait for the completion of any pending DIOs */
3078 xfs_ioend_wait(ip);
3316 } 3079 }
3317 3080
3318 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 3081 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -3474,7 +3237,6 @@ xfs_change_file_space(
3474 int cmd, 3237 int cmd,
3475 xfs_flock64_t *bf, 3238 xfs_flock64_t *bf,
3476 xfs_off_t offset, 3239 xfs_off_t offset,
3477 cred_t *credp,
3478 int attr_flags) 3240 int attr_flags)
3479{ 3241{
3480 xfs_mount_t *mp = ip->i_mount; 3242 xfs_mount_t *mp = ip->i_mount;
@@ -3562,7 +3324,7 @@ xfs_change_file_space(
3562 iattr.ia_valid = ATTR_SIZE; 3324 iattr.ia_valid = ATTR_SIZE;
3563 iattr.ia_size = startoffset; 3325 iattr.ia_size = startoffset;
3564 3326
3565 error = xfs_setattr(ip, &iattr, attr_flags, credp); 3327 error = xfs_setattr(ip, &iattr, attr_flags);
3566 3328
3567 if (error) 3329 if (error)
3568 return error; 3330 return error;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 7b0c2ab88333..76df328c61b4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -14,9 +14,7 @@ struct xfs_inode;
14struct xfs_iomap; 14struct xfs_iomap;
15 15
16 16
17int xfs_open(struct xfs_inode *ip); 17int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
18int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
19 cred_t *credp);
20#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ 18#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
21#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 19#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
22#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 20#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
@@ -44,8 +42,7 @@ int xfs_inode_flush(struct xfs_inode *ip, int flags);
44int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 42int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
45int xfs_reclaim(struct xfs_inode *ip); 43int xfs_reclaim(struct xfs_inode *ip);
46int xfs_change_file_space(struct xfs_inode *ip, int cmd, 44int xfs_change_file_space(struct xfs_inode *ip, int cmd,
47 xfs_flock64_t *bf, xfs_off_t offset, 45 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
48 cred_t *credp, int attr_flags);
49int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 46int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
50 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 47 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
51 struct xfs_name *target_name, struct xfs_inode *target_ip); 48 struct xfs_name *target_name, struct xfs_inode *target_ip);
@@ -56,8 +53,6 @@ int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
56int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); 53int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
57int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 54int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
58 int flags, struct attrlist_cursor_kern *cursor); 55 int flags, struct attrlist_cursor_kern *cursor);
59int xfs_ioctl(struct xfs_inode *ip, struct file *filp,
60 int ioflags, unsigned int cmd, void __user *arg);
61ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, 56ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
62 const struct iovec *iovp, unsigned int segs, 57 const struct iovec *iovp, unsigned int segs,
63 loff_t *offset, int ioflags); 58 loff_t *offset, int ioflags);
@@ -78,5 +73,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
78 xfs_off_t last, int fiopt); 73 xfs_off_t last, int fiopt);
79int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, 74int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
80 xfs_off_t last, uint64_t flags, int fiopt); 75 xfs_off_t last, uint64_t flags, int fiopt);
76int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
81 77
82#endif /* _XFS_VNODEOPS_H */ 78#endif /* _XFS_VNODEOPS_H */