author     Haavard Skinnemoen <haavard.skinnemoen@atmel.com>   2009-10-06 11:36:55 -0400
committer  Haavard Skinnemoen <haavard.skinnemoen@atmel.com>   2009-10-06 11:36:55 -0400
commit     d94e5fcbf1420366dcb4102bafe04dbcfc0d0d4b (patch)
tree       a9b7de7df6da5c3132cc68169b9c47ba288ccd42 /fs
parent     d55651168a20078a94597a297d5cdfd807bf07b6 (diff)
parent     374576a8b6f865022c0fd1ca62396889b23d66dd (diff)
Merge commit 'v2.6.32-rc3'
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 9
-rw-r--r--  fs/9p/Makefile | 3
-rw-r--r--  fs/9p/cache.c | 474
-rw-r--r--  fs/9p/cache.h | 176
-rw-r--r--  fs/9p/v9fs.c | 217
-rw-r--r--  fs/9p/v9fs.h | 14
-rw-r--r--  fs/9p/v9fs_vfs.h | 6
-rw-r--r--  fs/9p/vfs_addr.c | 88
-rw-r--r--  fs/9p/vfs_file.c | 25
-rw-r--r--  fs/9p/vfs_inode.c | 187
-rw-r--r--  fs/9p/vfs_super.c | 55
-rw-r--r--  fs/Kconfig | 3
-rw-r--r--  fs/adfs/inode.c | 7
-rw-r--r--  fs/afs/cache.h | 12
-rw-r--r--  fs/afs/file.c | 18
-rw-r--r--  fs/afs/flock.c | 2
-rw-r--r--  fs/afs/internal.h | 2
-rw-r--r--  fs/afs/proc.c | 8
-rw-r--r--  fs/afs/write.c | 1
-rw-r--r--  fs/aio.c | 57
-rw-r--r--  fs/anon_inodes.c | 70
-rw-r--r--  fs/attr.c | 46
-rw-r--r--  fs/autofs/dirhash.c | 2
-rw-r--r--  fs/autofs4/expire.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 9
-rw-r--r--  fs/binfmt_elf.c | 124
-rw-r--r--  fs/binfmt_elf_fdpic.c | 73
-rw-r--r--  fs/binfmt_flat.c | 22
-rw-r--r--  fs/bio.c | 49
-rw-r--r--  fs/block_dev.c | 172
-rw-r--r--  fs/btrfs/acl.c | 6
-rw-r--r--  fs/btrfs/async-thread.c | 254
-rw-r--r--  fs/btrfs/async-thread.h | 12
-rw-r--r--  fs/btrfs/btrfs_inode.h | 9
-rw-r--r--  fs/btrfs/compression.c | 8
-rw-r--r--  fs/btrfs/ctree.c | 6
-rw-r--r--  fs/btrfs/ctree.h | 105
-rw-r--r--  fs/btrfs/dir-item.c | 47
-rw-r--r--  fs/btrfs/disk-io.c | 249
-rw-r--r--  fs/btrfs/export.c | 133
-rw-r--r--  fs/btrfs/extent-tree.c | 2038
-rw-r--r--  fs/btrfs/extent_io.c | 416
-rw-r--r--  fs/btrfs/extent_io.h | 29
-rw-r--r--  fs/btrfs/extent_map.c | 103
-rw-r--r--  fs/btrfs/extent_map.h | 5
-rw-r--r--  fs/btrfs/file.c | 70
-rw-r--r--  fs/btrfs/free-space-cache.c | 36
-rw-r--r--  fs/btrfs/inode-item.c | 4
-rw-r--r--  fs/btrfs/inode-map.c | 93
-rw-r--r--  fs/btrfs/inode.c | 929
-rw-r--r--  fs/btrfs/ioctl.c | 399
-rw-r--r--  fs/btrfs/ioctl.h | 3
-rw-r--r--  fs/btrfs/ordered-data.c | 127
-rw-r--r--  fs/btrfs/ordered-data.h | 7
-rw-r--r--  fs/btrfs/orphan.c | 20
-rw-r--r--  fs/btrfs/relocation.c | 280
-rw-r--r--  fs/btrfs/root-tree.c | 138
-rw-r--r--  fs/btrfs/super.c | 7
-rw-r--r--  fs/btrfs/transaction.c | 48
-rw-r--r--  fs/btrfs/tree-log.c | 27
-rw-r--r--  fs/btrfs/volumes.c | 125
-rw-r--r--  fs/btrfs/volumes.h | 3
-rw-r--r--  fs/btrfs/xattr.c | 2
-rw-r--r--  fs/buffer.c | 79
-rw-r--r--  fs/char_dev.c | 43
-rw-r--r--  fs/cifs/CHANGES | 5
-rw-r--r--  fs/cifs/Kconfig | 1
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 4
-rw-r--r--  fs/cifs/cifs_spnego.c | 2
-rw-r--r--  fs/cifs/cifsacl.c | 4
-rw-r--r--  fs/cifs/cifsencrypt.c | 1
-rw-r--r--  fs/cifs/cifsfs.c | 108
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 30
-rw-r--r--  fs/cifs/cifsproto.h | 11
-rw-r--r--  fs/cifs/cifssmb.c | 317
-rw-r--r--  fs/cifs/connect.c | 50
-rw-r--r--  fs/cifs/dir.c | 66
-rw-r--r--  fs/cifs/file.c | 174
-rw-r--r--  fs/cifs/inode.c | 59
-rw-r--r--  fs/cifs/misc.c | 34
-rw-r--r--  fs/cifs/readdir.c | 4
-rw-r--r--  fs/cifs/transport.c | 51
-rw-r--r--  fs/coda/coda_int.h | 1
-rw-r--r--  fs/coda/psdev.c | 1
-rw-r--r--  fs/compat.c | 48
-rw-r--r--  fs/configfs/inode.c | 1
-rw-r--r--  fs/dcache.c | 1
-rw-r--r--  fs/devpts/inode.c | 3
-rw-r--r--  fs/dlm/debug_fs.c | 12
-rw-r--r--  fs/dlm/lowcomms.c | 26
-rw-r--r--  fs/dlm/netlink.c | 2
-rw-r--r--  fs/drop_caches.c | 4
-rw-r--r--  fs/ecryptfs/Kconfig | 4
-rw-r--r--  fs/ecryptfs/crypto.c | 39
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/inode.c | 2
-rw-r--r--  fs/ecryptfs/keystore.c | 39
-rw-r--r--  fs/ecryptfs/kthread.c | 24
-rw-r--r--  fs/ecryptfs/main.c | 3
-rw-r--r--  fs/ecryptfs/mmap.c | 6
-rw-r--r--  fs/ecryptfs/read_write.c | 32
-rw-r--r--  fs/ecryptfs/super.c | 2
-rw-r--r--  fs/eventfd.c | 67
-rw-r--r--  fs/exec.c | 188
-rw-r--r--  fs/exofs/super.c | 6
-rw-r--r--  fs/ext2/acl.c | 8
-rw-r--r--  fs/ext2/acl.h | 4
-rw-r--r--  fs/ext2/file.c | 2
-rw-r--r--  fs/ext2/inode.c | 4
-rw-r--r--  fs/ext2/namei.c | 10
-rw-r--r--  fs/ext2/xip.c | 2
-rw-r--r--  fs/ext3/acl.c | 8
-rw-r--r--  fs/ext3/acl.h | 4
-rw-r--r--  fs/ext3/file.c | 63
-rw-r--r--  fs/ext3/fsync.c | 12
-rw-r--r--  fs/ext3/inode.c | 31
-rw-r--r--  fs/ext3/namei.c | 4
-rw-r--r--  fs/ext3/super.c | 4
-rw-r--r--  fs/ext4/Kconfig | 23
-rw-r--r--  fs/ext4/acl.c | 8
-rw-r--r--  fs/ext4/acl.h | 4
-rw-r--r--  fs/ext4/balloc.c | 2
-rw-r--r--  fs/ext4/ext4.h | 145
-rw-r--r--  fs/ext4/ext4_extents.h | 11
-rw-r--r--  fs/ext4/ext4_jbd2.c | 9
-rw-r--r--  fs/ext4/ext4_jbd2.h | 6
-rw-r--r--  fs/ext4/extents.c | 556
-rw-r--r--  fs/ext4/file.c | 57
-rw-r--r--  fs/ext4/fsync.c | 18
-rw-r--r--  fs/ext4/ialloc.c | 2
-rw-r--r--  fs/ext4/inode.c | 680
-rw-r--r--  fs/ext4/ioctl.c | 7
-rw-r--r--  fs/ext4/mballoc.c | 726
-rw-r--r--  fs/ext4/mballoc.h | 57
-rw-r--r--  fs/ext4/migrate.c | 24
-rw-r--r--  fs/ext4/move_extent.c | 354
-rw-r--r--  fs/ext4/namei.c | 29
-rw-r--r--  fs/ext4/resize.c | 7
-rw-r--r--  fs/ext4/super.c | 289
-rw-r--r--  fs/ext4/xattr.c | 15
-rw-r--r--  fs/fat/fat.h | 2
-rw-r--r--  fs/fat/file.c | 22
-rw-r--r--  fs/fat/inode.c | 34
-rw-r--r--  fs/fat/misc.c | 12
-rw-r--r--  fs/fat/namei_vfat.c | 15
-rw-r--r--  fs/fcntl.c | 108
-rw-r--r--  fs/file_table.c | 6
-rw-r--r--  fs/fs-writeback.c | 1197
-rw-r--r--  fs/fuse/control.c | 138
-rw-r--r--  fs/fuse/dev.c | 10
-rw-r--r--  fs/fuse/dir.c | 14
-rw-r--r--  fs/fuse/file.c | 2
-rw-r--r--  fs/fuse/fuse_i.h | 20
-rw-r--r--  fs/fuse/inode.c | 94
-rw-r--r--  fs/gfs2/Makefile | 2
-rw-r--r--  fs/gfs2/acl.c | 106
-rw-r--r--  fs/gfs2/aops.c | 3
-rw-r--r--  fs/gfs2/dentry.c | 18
-rw-r--r--  fs/gfs2/eaops.c | 157
-rw-r--r--  fs/gfs2/eaops.h | 30
-rw-r--r--  fs/gfs2/export.c | 36
-rw-r--r--  fs/gfs2/file.c | 3
-rw-r--r--  fs/gfs2/incore.h | 15
-rw-r--r--  fs/gfs2/inode.c | 159
-rw-r--r--  fs/gfs2/ops_fstype.c | 66
-rw-r--r--  fs/gfs2/ops_inode.c | 83
-rw-r--r--  fs/gfs2/rgrp.c | 90
-rw-r--r--  fs/gfs2/rgrp.h | 6
-rw-r--r--  fs/gfs2/super.c | 46
-rw-r--r--  fs/gfs2/super.h | 5
-rw-r--r--  fs/gfs2/sys.c | 31
-rw-r--r--  fs/gfs2/util.c | 41
-rw-r--r--  fs/gfs2/xattr.c (renamed from fs/gfs2/eattr.c) | 425
-rw-r--r--  fs/gfs2/xattr.h (renamed from fs/gfs2/eattr.h) | 54
-rw-r--r--  fs/hfs/mdb.c | 6
-rw-r--r--  fs/hfsplus/super.c | 6
-rw-r--r--  fs/hugetlbfs/inode.c | 49
-rw-r--r--  fs/inode.c | 128
-rw-r--r--  fs/internal.h | 1
-rw-r--r--  fs/ioctl.c | 9
-rw-r--r--  fs/isofs/inode.c | 8
-rw-r--r--  fs/jbd/checkpoint.c | 6
-rw-r--r--  fs/jbd/commit.c | 2
-rw-r--r--  fs/jbd/journal.c | 30
-rw-r--r--  fs/jbd/recovery.c | 18
-rw-r--r--  fs/jbd/revoke.c | 16
-rw-r--r--  fs/jbd/transaction.c | 9
-rw-r--r--  fs/jbd2/checkpoint.c | 7
-rw-r--r--  fs/jbd2/commit.c | 71
-rw-r--r--  fs/jbd2/journal.c | 206
-rw-r--r--  fs/jbd2/transaction.c | 7
-rw-r--r--  fs/jffs2/acl.c | 7
-rw-r--r--  fs/jffs2/acl.h | 4
-rw-r--r--  fs/jffs2/background.c | 20
-rw-r--r--  fs/jffs2/dir.c | 2
-rw-r--r--  fs/jffs2/file.c | 2
-rw-r--r--  fs/jffs2/malloc.c | 4
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jffs2/symlink.c | 2
-rw-r--r--  fs/jffs2/wbuf.c | 10
-rw-r--r--  fs/jfs/acl.c | 7
-rw-r--r--  fs/jfs/file.c | 2
-rw-r--r--  fs/jfs/jfs_acl.h | 2
-rw-r--r--  fs/jfs/namei.c | 2
-rw-r--r--  fs/jfs/super.c | 9
-rw-r--r--  fs/libfs.c | 13
-rw-r--r--  fs/lockd/clntlock.c | 2
-rw-r--r--  fs/lockd/clntproc.c | 2
-rw-r--r--  fs/lockd/host.c | 18
-rw-r--r--  fs/lockd/mon.c | 46
-rw-r--r--  fs/lockd/svclock.c | 2
-rw-r--r--  fs/lockd/svcsubs.c | 2
-rw-r--r--  fs/lockd/xdr.c | 1
-rw-r--r--  fs/lockd/xdr4.c | 1
-rw-r--r--  fs/locks.c | 6
-rw-r--r--  fs/minix/dir.c | 22
-rw-r--r--  fs/namei.c | 110
-rw-r--r--  fs/namespace.c | 77
-rw-r--r--  fs/ncpfs/dir.c | 2
-rw-r--r--  fs/ncpfs/inode.c | 12
-rw-r--r--  fs/ncpfs/ioctl.c | 8
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/nfs/Makefile | 3
-rw-r--r--  fs/nfs/cache_lib.c | 140
-rw-r--r--  fs/nfs/cache_lib.h | 27
-rw-r--r--  fs/nfs/callback.c | 26
-rw-r--r--  fs/nfs/callback_xdr.c | 2
-rw-r--r--  fs/nfs/client.c | 43
-rw-r--r--  fs/nfs/direct.c | 3
-rw-r--r--  fs/nfs/dns_resolve.c | 335
-rw-r--r--  fs/nfs/dns_resolve.h | 14
-rw-r--r--  fs/nfs/file.c | 54
-rw-r--r--  fs/nfs/fscache.c | 25
-rw-r--r--  fs/nfs/fscache.h | 6
-rw-r--r--  fs/nfs/idmap.c | 6
-rw-r--r--  fs/nfs/inode.c | 154
-rw-r--r--  fs/nfs/internal.h | 39
-rw-r--r--  fs/nfs/mount_clnt.c | 83
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3proc.c | 2
-rw-r--r--  fs/nfs/nfs3xdr.c | 1
-rw-r--r--  fs/nfs/nfs4namespace.c | 24
-rw-r--r--  fs/nfs/nfs4proc.c | 41
-rw-r--r--  fs/nfs/nfs4state.c | 2
-rw-r--r--  fs/nfs/nfs4xdr.c | 1461
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/super.c | 521
-rw-r--r--  fs/nfs/write.c | 92
-rw-r--r--  fs/nfsd/auth.c | 4
-rw-r--r--  fs/nfsd/export.c | 18
-rw-r--r--  fs/nfsd/nfs3xdr.c | 75
-rw-r--r--  fs/nfsd/nfs4acl.c | 4
-rw-r--r--  fs/nfsd/nfs4callback.c | 263
-rw-r--r--  fs/nfsd/nfs4idmap.c | 21
-rw-r--r--  fs/nfsd/nfs4proc.c | 89
-rw-r--r--  fs/nfsd/nfs4state.c | 685
-rw-r--r--  fs/nfsd/nfs4xdr.c | 42
-rw-r--r--  fs/nfsd/nfsctl.c | 31
-rw-r--r--  fs/nfsd/nfsfh.c | 158
-rw-r--r--  fs/nfsd/nfssvc.c | 56
-rw-r--r--  fs/nfsd/vfs.c | 12
-rw-r--r--  fs/nilfs2/Kconfig | 2
-rw-r--r--  fs/nilfs2/bmap.c | 151
-rw-r--r--  fs/nilfs2/bmap.h | 76
-rw-r--r--  fs/nilfs2/btnode.c | 5
-rw-r--r--  fs/nilfs2/btree.c | 625
-rw-r--r--  fs/nilfs2/cpfile.c | 11
-rw-r--r--  fs/nilfs2/cpfile.h | 2
-rw-r--r--  fs/nilfs2/dat.c | 42
-rw-r--r--  fs/nilfs2/dat.h | 8
-rw-r--r--  fs/nilfs2/dir.c | 2
-rw-r--r--  fs/nilfs2/direct.c | 161
-rw-r--r--  fs/nilfs2/file.c | 6
-rw-r--r--  fs/nilfs2/gcinode.c | 2
-rw-r--r--  fs/nilfs2/ifile.h | 1
-rw-r--r--  fs/nilfs2/inode.c | 6
-rw-r--r--  fs/nilfs2/ioctl.c | 26
-rw-r--r--  fs/nilfs2/mdt.c | 46
-rw-r--r--  fs/nilfs2/mdt.h | 3
-rw-r--r--  fs/nilfs2/namei.c | 6
-rw-r--r--  fs/nilfs2/nilfs.h | 14
-rw-r--r--  fs/nilfs2/recovery.c | 3
-rw-r--r--  fs/nilfs2/segbuf.c | 4
-rw-r--r--  fs/nilfs2/segment.c | 7
-rw-r--r--  fs/nilfs2/sufile.h | 1
-rw-r--r--  fs/nilfs2/super.c | 104
-rw-r--r--  fs/nilfs2/the_nilfs.c | 19
-rw-r--r--  fs/nilfs2/the_nilfs.h | 43
-rw-r--r--  fs/nls/nls_base.c | 11
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 33
-rw-r--r--  fs/notify/inotify/inotify_user.c | 245
-rw-r--r--  fs/ntfs/aops.c | 2
-rw-r--r--  fs/ntfs/file.c | 54
-rw-r--r--  fs/ntfs/layout.h | 2
-rw-r--r--  fs/ntfs/malloc.h | 2
-rw-r--r--  fs/ntfs/mft.c | 13
-rw-r--r--  fs/ntfs/super.c | 10
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/alloc.c | 1342
-rw-r--r--  fs/ocfs2/alloc.h | 101
-rw-r--r--  fs/ocfs2/aops.c | 42
-rw-r--r--  fs/ocfs2/aops.h | 2
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 47
-rw-r--r--  fs/ocfs2/buffer_head_io.h | 8
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 8
-rw-r--r--  fs/ocfs2/dcache.c | 11
-rw-r--r--  fs/ocfs2/dir.c | 107
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 11
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmfs.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 7
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 1
-rw-r--r--  fs/ocfs2/dlmglue.c | 105
-rw-r--r--  fs/ocfs2/dlmglue.h | 6
-rw-r--r--  fs/ocfs2/extent_map.c | 33
-rw-r--r--  fs/ocfs2/extent_map.h | 8
-rw-r--r--  fs/ocfs2/file.c | 200
-rw-r--r--  fs/ocfs2/file.h | 2
-rw-r--r--  fs/ocfs2/inode.c | 86
-rw-r--r--  fs/ocfs2/inode.h | 20
-rw-r--r--  fs/ocfs2/ioctl.c | 14
-rw-r--r--  fs/ocfs2/journal.c | 82
-rw-r--r--  fs/ocfs2/journal.h | 94
-rw-r--r--  fs/ocfs2/localalloc.c | 12
-rw-r--r--  fs/ocfs2/mmap.c | 2
-rw-r--r--  fs/ocfs2/namei.c | 341
-rw-r--r--  fs/ocfs2/namei.h | 6
-rw-r--r--  fs/ocfs2/ocfs2.h | 52
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 107
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 5
-rw-r--r--  fs/ocfs2/quota.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 9
-rw-r--r--  fs/ocfs2/quota_local.c | 26
-rw-r--r--  fs/ocfs2/refcounttree.c | 4313
-rw-r--r--  fs/ocfs2/refcounttree.h | 106
-rw-r--r--  fs/ocfs2/resize.c | 16
-rw-r--r--  fs/ocfs2/slot_map.c | 10
-rw-r--r--  fs/ocfs2/suballoc.c | 35
-rw-r--r--  fs/ocfs2/super.c | 18
-rw-r--r--  fs/ocfs2/symlink.c | 1
-rw-r--r--  fs/ocfs2/uptodate.c | 265
-rw-r--r--  fs/ocfs2/uptodate.h | 51
-rw-r--r--  fs/ocfs2/xattr.c | 2056
-rw-r--r--  fs/ocfs2/xattr.h | 15
-rw-r--r--  fs/omfs/dir.c | 4
-rw-r--r--  fs/omfs/file.c | 6
-rw-r--r--  fs/omfs/inode.c | 2
-rw-r--r--  fs/omfs/omfs.h | 10
-rw-r--r--  fs/open.c | 17
-rw-r--r--  fs/partitions/check.c | 4
-rw-r--r--  fs/proc/array.c | 92
-rw-r--r--  fs/proc/base.c | 67
-rw-r--r--  fs/proc/kcore.c | 335
-rw-r--r--  fs/proc/meminfo.c | 13
-rw-r--r--  fs/proc/nommu.c | 2
-rw-r--r--  fs/proc/page.c | 5
-rw-r--r--  fs/proc/proc_sysctl.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 57
-rw-r--r--  fs/proc/uptime.c | 7
-rw-r--r--  fs/qnx4/Kconfig | 11
-rw-r--r--  fs/qnx4/Makefile | 2
-rw-r--r--  fs/qnx4/bitmap.c | 81
-rw-r--r--  fs/qnx4/dir.c | 5
-rw-r--r--  fs/qnx4/file.c | 40
-rw-r--r--  fs/qnx4/inode.c | 84
-rw-r--r--  fs/qnx4/namei.c | 105
-rw-r--r--  fs/qnx4/qnx4.h | 8
-rw-r--r--  fs/qnx4/truncate.c | 34
-rw-r--r--  fs/quota/dquot.c | 4
-rw-r--r--  fs/ramfs/file-nommu.c | 18
-rw-r--r--  fs/ramfs/inode.c | 5
-rw-r--r--  fs/read_write.c | 3
-rw-r--r--  fs/reiserfs/super.c | 4
-rw-r--r--  fs/romfs/super.c | 4
-rw-r--r--  fs/select.c | 15
-rw-r--r--  fs/seq_file.c | 74
-rw-r--r--  fs/smbfs/inode.c | 10
-rw-r--r--  fs/smbfs/proc.c | 2
-rw-r--r--  fs/splice.c | 30
-rw-r--r--  fs/squashfs/super.c | 4
-rw-r--r--  fs/super.c | 80
-rw-r--r--  fs/sync.c | 86
-rw-r--r--  fs/sysfs/bin.c | 4
-rw-r--r--  fs/sysfs/dir.c | 1
-rw-r--r--  fs/sysfs/inode.c | 135
-rw-r--r--  fs/sysfs/symlink.c | 2
-rw-r--r--  fs/sysfs/sysfs.h | 12
-rw-r--r--  fs/ubifs/budget.c | 34
-rw-r--r--  fs/ubifs/commit.c | 2
-rw-r--r--  fs/ubifs/debug.c | 112
-rw-r--r--  fs/ubifs/debug.h | 5
-rw-r--r--  fs/ubifs/file.c | 64
-rw-r--r--  fs/ubifs/gc.c | 2
-rw-r--r--  fs/ubifs/io.c | 29
-rw-r--r--  fs/ubifs/journal.c | 13
-rw-r--r--  fs/ubifs/key.h | 35
-rw-r--r--  fs/ubifs/log.c | 17
-rw-r--r--  fs/ubifs/lprops.c | 43
-rw-r--r--  fs/ubifs/master.c | 20
-rw-r--r--  fs/ubifs/orphan.c | 7
-rw-r--r--  fs/ubifs/recovery.c | 4
-rw-r--r--  fs/ubifs/replay.c | 6
-rw-r--r--  fs/ubifs/scan.c | 32
-rw-r--r--  fs/ubifs/super.c | 41
-rw-r--r--  fs/ubifs/tnc.c | 76
-rw-r--r--  fs/ubifs/tnc_commit.c | 2
-rw-r--r--  fs/ubifs/ubifs-media.h | 7
-rw-r--r--  fs/ubifs/ubifs.h | 13
-rw-r--r--  fs/ubifs/xattr.c | 6
-rw-r--r--  fs/udf/directory.c | 86
-rw-r--r--  fs/udf/file.c | 2
-rw-r--r--  fs/udf/inode.c | 19
-rw-r--r--  fs/udf/lowlevel.c | 4
-rw-r--r--  fs/udf/namei.c | 1
-rw-r--r--  fs/xattr.c | 55
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 17
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.c | 51
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 28
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 15
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c | 3
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 78
-rw-r--r--  fs/xfs/xfs_ag.h | 9
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.h | 11
-rw-r--r--  fs/xfs/xfs_bmap_btree.c | 20
-rw-r--r--  fs/xfs/xfs_bmap_btree.h | 1
-rw-r--r--  fs/xfs/xfs_btree.c | 42
-rw-r--r--  fs/xfs/xfs_btree.h | 15
-rw-r--r--  fs/xfs/xfs_fs.h | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 805
-rw-r--r--  fs/xfs/xfs_ialloc.h | 18
-rw-r--r--  fs/xfs/xfs_iget.c | 27
-rw-r--r--  fs/xfs/xfs_inode.c | 8
-rw-r--r--  fs/xfs/xfs_inode.h | 8
-rw-r--r--  fs/xfs/xfs_inode_item.c | 10
-rw-r--r--  fs/xfs/xfs_inode_item.h | 2
-rw-r--r--  fs/xfs/xfs_inum.h | 1
-rw-r--r--  fs/xfs/xfs_itable.c | 98
-rw-r--r--  fs/xfs/xfs_itable.h | 5
-rw-r--r--  fs/xfs/xfs_log_priv.h | 2
-rw-r--r--  fs/xfs/xfs_log_recover.c | 2
-rw-r--r--  fs/xfs/xfs_mount.c | 2
-rw-r--r--  fs/xfs/xfs_mount.h | 3
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 29
-rw-r--r--  fs/xfs/xfs_mru_cache.h | 1
-rw-r--r--  fs/xfs/xfs_rw.c | 84
-rw-r--r--  fs/xfs/xfs_rw.h | 7
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 4
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 86
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 17
467 files changed, 23989 insertions, 12343 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 74e0723e90bc..795233702a4e 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -8,3 +8,12 @@ config 9P_FS
	  See <http://v9fs.sf.net> for more information.
 
	  If unsure, say N.
+
+config 9P_FSCACHE
+	bool "Enable 9P client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
+	help
+	  Choose Y here to enable persistent, read-only local
+	  caching support for 9p clients using FS-Cache
+
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index bc7f0d1551e6..1a940ec7af61 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,5 +8,6 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	vfs_dir.o \
 	vfs_dentry.o \
 	v9fs.o \
-	fid.o \
+	fid.o
 
+9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
new file mode 100644
index 000000000000..51c94e26a346
--- /dev/null
+++ b/fs/9p/cache.c
@@ -0,0 +1,474 @@
+/*
+ * V9FS cache definitions.
+ *
+ *  Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/jiffies.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+
+#include "v9fs.h"
+#include "cache.h"
+
+#define CACHETAG_LEN  11
+
+struct kmem_cache *vcookie_cache;
+
+struct fscache_netfs v9fs_cache_netfs = {
+	.name		= "9p",
+	.version	= 0,
+};
+
+static void init_once(void *foo)
+{
+	struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
+	vcookie->fscache = NULL;
+	vcookie->qid = NULL;
+	inode_init_once(&vcookie->inode);
+}
+
+/**
+ * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
+ *			    vcookie to inode mapping
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_init_vcookiecache(void)
+{
+	vcookie_cache = kmem_cache_create("vcookie_cache",
+					  sizeof(struct v9fs_cookie),
+					  0, (SLAB_RECLAIM_ACCOUNT|
+					      SLAB_MEM_SPREAD),
+					  init_once);
+	if (!vcookie_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * v9fs_destroy_vcookiecache - destroy the cache of vcookies
+ *
+ */
+
+static void v9fs_destroy_vcookiecache(void)
+{
+	kmem_cache_destroy(vcookie_cache);
+}
+
+int __v9fs_cache_register(void)
+{
+	int ret;
+	ret = v9fs_init_vcookiecache();
+	if (ret < 0)
+		return ret;
+
+	return fscache_register_netfs(&v9fs_cache_netfs);
+}
+
+void __v9fs_cache_unregister(void)
+{
+	v9fs_destroy_vcookiecache();
+	fscache_unregister_netfs(&v9fs_cache_netfs);
+}
+
+/**
+ * v9fs_random_cachetag - Generate a random tag to be associated
+ *			  with a new cache session.
+ *
+ * The value of jiffies is used for a fairly randomly cache tag.
+ */
+
+static
+int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
+{
+	v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
+	if (!v9ses->cachetag)
+		return -ENOMEM;
+
+	return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
+}
+
+static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
+					   void *buffer, uint16_t bufmax)
+{
+	struct v9fs_session_info *v9ses;
+	uint16_t klen = 0;
+
+	v9ses = (struct v9fs_session_info *)cookie_netfs_data;
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
+		   buffer, bufmax);
+
+	if (v9ses->cachetag)
+		klen = strlen(v9ses->cachetag);
+
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, v9ses->cachetag, klen);
+	P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+	return klen;
+}
+
+const struct fscache_cookie_def v9fs_cache_session_index_def = {
+	.name		= "9P.session",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= v9fs_cache_session_get_key,
+};
+
+void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
+{
+	/* If no cache session tag was specified, we generate a random one. */
+	if (!v9ses->cachetag)
+		v9fs_random_cachetag(v9ses);
+
+	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
+						&v9fs_cache_session_index_def,
+						v9ses);
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
+		   v9ses->fscache);
+}
+
+void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
+		   v9ses->fscache);
+	fscache_relinquish_cookie(v9ses->fscache, 0);
+	v9ses->fscache = NULL;
+}
+
+
+static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path));
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode,
+		   vcookie->qid->path);
+	return sizeof(vcookie->qid->path);
+}
+
+static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
+				      uint64_t *size)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	*size = i_size_read(&vcookie->inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode,
+		   *size);
+}
+
+static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version));
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode,
+		   vcookie->qid->version);
+	return sizeof(vcookie->qid->version);
+}
+
+static enum
+fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
+					    const void *buffer,
+					    uint16_t buflen)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+
+	if (buflen != sizeof(vcookie->qid->version))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	if (memcmp(buffer, &vcookie->qid->version,
+		   sizeof(vcookie->qid->version)))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
+{
+	struct v9fs_cookie *vcookie = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+const struct fscache_cookie_def v9fs_cache_inode_index_def = {
+	.name		= "9p.inode",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= v9fs_cache_inode_get_key,
+	.get_attr	= v9fs_cache_inode_get_attr,
+	.get_aux	= v9fs_cache_inode_get_aux,
+	.check_aux	= v9fs_cache_inode_check_aux,
+	.now_uncached	= v9fs_cache_inode_now_uncached,
+};
+
+void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie;
+	struct v9fs_session_info *v9ses;
+
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	vcookie = v9fs_inode2cookie(inode);
+	if (vcookie->fscache)
+		return;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+						  &v9fs_cache_inode_index_def,
+						  vcookie);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
+		   vcookie->fscache);
+}
+
+void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	if (!vcookie->fscache)
+		return;
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
+		   vcookie->fscache);
+
+	fscache_relinquish_cookie(vcookie->fscache, 0);
+	vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_flush_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	if (!vcookie->fscache)
+		return;
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
+		   vcookie->fscache);
+
+	fscache_relinquish_cookie(vcookie->fscache, 1);
+	vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct p9_fid *fid;
+
+	if (!vcookie->fscache)
+		return;
+
+	spin_lock(&vcookie->lock);
+	fid = filp->private_data;
+	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+		v9fs_cache_inode_flush_cookie(inode);
+	else
+		v9fs_cache_inode_get_cookie(inode);
+
+	spin_unlock(&vcookie->lock);
+}
+
+void v9fs_cache_inode_reset_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_session_info *v9ses;
+	struct fscache_cookie *old;
+
+	if (!vcookie->fscache)
+		return;
+
+	old = vcookie->fscache;
+
+	spin_lock(&vcookie->lock);
+	fscache_relinquish_cookie(vcookie->fscache, 1);
+
+	v9ses = v9fs_inode2v9ses(inode);
+	vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+						  &v9fs_cache_inode_index_def,
+						  vcookie);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
+		   inode, old, vcookie->fscache);
+
+	spin_unlock(&vcookie->lock);
+}
+
+int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	BUG_ON(!vcookie->fscache);
+
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vcookie->fscache, page)) {
+			if (!(gfp & __GFP_WAIT))
+				return 0;
+			fscache_wait_on_page_write(vcookie->fscache, page);
+		}
+
+		fscache_uncache_page(vcookie->fscache, page);
+		ClearPageFsCache(page);
+	}
+
+	return 1;
+}
+
+void __v9fs_fscache_invalidate_page(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	BUG_ON(!vcookie->fscache);
+
+	if (PageFsCache(page)) {
+		fscache_wait_on_page_write(vcookie->fscache, page);
+		BUG_ON(!PageLocked(page));
+		fscache_uncache_page(vcookie->fscache, page);
+		ClearPageFsCache(page);
+	}
+}
+
+static void v9fs_vfs_readpage_complete(struct page *page, void *data,
+				       int error)
+{
+	if (!error)
+		SetPageUptodate(page);
+
+	unlock_page(page);
+}
+
+/**
+ * __v9fs_readpage_from_fscache - read a page from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	if (!vcookie->fscache)
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_page(vcookie->fscache,
+					 page,
+					 v9fs_vfs_readpage_complete,
+					 NULL,
+					 GFP_KERNEL);
+	switch (ret) {
+	case -ENOBUFS:
+	case -ENODATA:
+		P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+		return 1;
+	case 0:
+		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		return ret;
+	default:
+		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		return ret;
+	}
+}
+
+/**
+ * __v9fs_readpages_from_fscache - read multiple pages from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpages_from_fscache(struct inode *inode,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned *nr_pages)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+	if (!vcookie->fscache)
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_pages(vcookie->fscache,
+					  mapping, pages, nr_pages,
+					  v9fs_vfs_readpage_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+	switch (ret) {
+	case -ENOBUFS:
+	case -ENODATA:
+		P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+		return 1;
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(*nr_pages != 0);
+		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		return ret;
+	default:
+		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		return ret;
+	}
+}
+
+/**
+ * __v9fs_readpage_to_fscache - write a page to the cache
+ *
+ */
+
+void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL);
+	P9_DPRINTK(P9_DEBUG_FSC, "ret =  %d", ret);
+	if (ret != 0)
+		v9fs_uncache_page(inode, page);
+}
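
Taken together, the new file follows the standard FS-Cache netfs pattern: register a netfs once, hang one index cookie per session off the netfs primary index, and hang one data cookie per inode off the session cookie. Below is a minimal standalone sketch of that hierarchy using the same fscache calls as cache.c above; the "examplefs" names are hypothetical, only the fscache API usage is taken from the code.

/* Sketch of the netfs/session cookie hierarchy; examplefs names
 * are illustrative, the fscache calls mirror cache.c above. */
#include <linux/fscache.h>

struct examplefs_session {
	struct fscache_cookie *fscache;
};

static struct fscache_netfs examplefs_cache_netfs = {
	.name		= "examplefs",
	.version	= 0,
};

/* A real cookie definition also supplies get_key, as
 * v9fs_cache_session_get_key() does above. */
static const struct fscache_cookie_def examplefs_session_index_def = {
	.name	= "examplefs.session",
	.type	= FSCACHE_COOKIE_TYPE_INDEX,
};

static int examplefs_cache_init(struct examplefs_session *ses)
{
	int ret;

	ret = fscache_register_netfs(&examplefs_cache_netfs);
	if (ret < 0)
		return ret;

	/* Session index hangs off the netfs primary index. */
	ses->fscache = fscache_acquire_cookie(
			examplefs_cache_netfs.primary_index,
			&examplefs_session_index_def, ses);
	return 0;
}

static void examplefs_cache_exit(struct examplefs_session *ses)
{
	/* retire = 0 keeps cached data for the next mount;
	 * v9fs_cache_inode_flush_cookie() passes 1 to discard. */
	fscache_relinquish_cookie(ses->fscache, 0);
	fscache_unregister_netfs(&examplefs_cache_netfs);
}

Per-inode data cookies are then acquired against ses->fscache with a FSCACHE_COOKIE_TYPE_DATAFILE definition, exactly as v9fs_cache_inode_get_cookie() does above.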
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
new file mode 100644
index 000000000000..a94192bfaee8
--- /dev/null
+++ b/fs/9p/cache.h
@@ -0,0 +1,176 @@
+/*
+ * V9FS cache definitions.
+ *
+ *  Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#ifndef _9P_CACHE_H
+#ifdef CONFIG_9P_FSCACHE
+#include <linux/fscache.h>
+#include <linux/spinlock.h>
+
+extern struct kmem_cache *vcookie_cache;
+
+struct v9fs_cookie {
+	spinlock_t lock;
+	struct inode inode;
+	struct fscache_cookie *fscache;
+	struct p9_qid *qid;
+};
+
+static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
+{
+	return container_of(inode, struct v9fs_cookie, inode);
+}
+
+extern struct fscache_netfs v9fs_cache_netfs;
+extern const struct fscache_cookie_def v9fs_cache_session_index_def;
+extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
+
+extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
+extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
+
+extern void v9fs_cache_inode_get_cookie(struct inode *inode);
+extern void v9fs_cache_inode_put_cookie(struct inode *inode);
+extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
+extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
+extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
+
+extern int __v9fs_cache_register(void);
+extern void __v9fs_cache_unregister(void);
+
+extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
+extern void __v9fs_fscache_invalidate_page(struct page *page);
+extern int __v9fs_readpage_from_fscache(struct inode *inode,
+					struct page *page);
+extern int __v9fs_readpages_from_fscache(struct inode *inode,
+					 struct address_space *mapping,
+					 struct list_head *pages,
+					 unsigned *nr_pages);
+extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
+
+
+/**
+ * v9fs_cache_register - Register v9fs file system with the cache
+ */
+static inline int v9fs_cache_register(void)
+{
+	return __v9fs_cache_register();
+}
+
+/**
+ * v9fs_cache_unregister - Unregister v9fs from the cache
+ */
+static inline void v9fs_cache_unregister(void)
+{
+	__v9fs_cache_unregister();
+}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+					    gfp_t gfp)
+{
+	return __v9fs_fscache_release_page(page, gfp);
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page)
+{
+	__v9fs_fscache_invalidate_page(page);
+}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	return __v9fs_readpage_from_fscache(inode, page);
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return __v9fs_readpages_from_fscache(inode, mapping, pages,
+					     nr_pages);
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{
+	if (PageFsCache(page))
+		__v9fs_readpage_to_fscache(inode, page);
+}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	fscache_uncache_page(vcookie->fscache, page);
+	BUG_ON(PageFsCache(page));
+}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+					struct p9_qid *qid)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	spin_lock(&vcookie->lock);
+	vcookie->qid = qid;
+	spin_unlock(&vcookie->lock);
+}
+
+#else /* CONFIG_9P_FSCACHE */
+
+static inline int v9fs_cache_register(void)
+{
+	return 1;
+}
+
+static inline void v9fs_cache_unregister(void) {}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+					    gfp_t gfp) {
+	return 1;
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page) {}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	return -ENOBUFS;
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+					struct p9_qid *qid)
+{}
+
+#endif /* CONFIG_9P_FSCACHE */
+#endif /* _9P_CACHE_H */
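
Worth noting is the idiom the header uses: every cache operation gets a thin inline wrapper, and the #else branch supplies no-op stubs with identical signatures, so call sites need no #ifdef of their own. A compressed sketch of the idiom, reduced to one operation with hypothetical examplefs names:

/* Wrapper/stub idiom from cache.h, one operation only; the
 * examplefs names are illustrative. */
#ifdef CONFIG_EXAMPLEFS_FSCACHE
extern int __examplefs_readpage_from_cache(struct inode *inode,
					   struct page *page);

static inline int examplefs_readpage_from_cache(struct inode *inode,
						struct page *page)
{
	return __examplefs_readpage_from_cache(inode, page);
}
#else
/* Stub keeps callers ifdef-free; -ENOBUFS means "not cached,
 * read from the server instead". */
static inline int examplefs_readpage_from_cache(struct inode *inode,
						struct page *page)
{
	return -ENOBUFS;
}
#endif

One caveat visible above: the header opens with #ifndef _9P_CACHE_H but never #defines that symbol, so the include guard as merged is ineffective.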
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fec..cf62b05e296a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -34,21 +34,25 @@
 #include <net/9p/transport.h>
 #include "v9fs.h"
 #include "v9fs_vfs.h"
+#include "cache.h"
+
+static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
+static LIST_HEAD(v9fs_sessionlist);
 
 /*
  * Option Parsing (code inspired by NFS code)
  *  NOTE: each transport will parse its own options
  */
 
 enum {
 	/* Options that take integer arguments */
 	Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
 	/* String options */
-	Opt_uname, Opt_remotename, Opt_trans,
+	Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
 	/* Options that take no arguments */
 	Opt_nodevmap,
 	/* Cache options */
-	Opt_cache_loose,
+	Opt_cache_loose, Opt_fscache,
 	/* Access options */
 	Opt_access,
 	/* Error token */
@@ -63,8 +67,10 @@ static const match_table_t tokens = {
 	{Opt_uname, "uname=%s"},
 	{Opt_remotename, "aname=%s"},
 	{Opt_nodevmap, "nodevmap"},
-	{Opt_cache_loose, "cache=loose"},
+	{Opt_cache, "cache=%s"},
 	{Opt_cache_loose, "loose"},
+	{Opt_fscache, "fscache"},
+	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
 	{Opt_err, NULL}
 };
@@ -76,7 +82,7 @@ static const match_table_t tokens = {
  * Return 0 upon success, -ERRNO upon failure.
  */
 
-static int v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 {
 	char *options;
 	substring_t args[MAX_OPT_ARGS];
@@ -89,16 +95,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 	v9ses->afid = ~0;
 	v9ses->debug = 0;
 	v9ses->cache = 0;
+#ifdef CONFIG_9P_FSCACHE
+	v9ses->cachetag = NULL;
+#endif
 
-	if (!v9ses->options)
+	if (!opts)
 		return 0;
 
-	options = kstrdup(v9ses->options, GFP_KERNEL);
-	if (!options) {
-		P9_DPRINTK(P9_DEBUG_ERROR,
-			   "failed to allocate copy of option string\n");
-		return -ENOMEM;
-	}
+	options = kstrdup(opts, GFP_KERNEL);
+	if (!options)
+		goto fail_option_alloc;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -143,16 +149,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 		case Opt_cache_loose:
 			v9ses->cache = CACHE_LOOSE;
 			break;
+		case Opt_fscache:
+			v9ses->cache = CACHE_FSCACHE;
+			break;
+		case Opt_cachetag:
+#ifdef CONFIG_9P_FSCACHE
+			v9ses->cachetag = match_strdup(&args[0]);
+#endif
+			break;
+		case Opt_cache:
+			s = match_strdup(&args[0]);
+			if (!s)
+				goto fail_option_alloc;
+
+			if (strcmp(s, "loose") == 0)
+				v9ses->cache = CACHE_LOOSE;
+			else if (strcmp(s, "fscache") == 0)
+				v9ses->cache = CACHE_FSCACHE;
+			else
+				v9ses->cache = CACHE_NONE;
+			kfree(s);
+			break;
 
 		case Opt_access:
 			s = match_strdup(&args[0]);
-			if (!s) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "failed to allocate copy"
-					   " of option argument\n");
-				ret = -ENOMEM;
-				break;
-			}
+			if (!s)
+				goto fail_option_alloc;
+
 			v9ses->flags &= ~V9FS_ACCESS_MASK;
 			if (strcmp(s, "user") == 0)
 				v9ses->flags |= V9FS_ACCESS_USER;
@@ -173,6 +196,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 	}
 	kfree(options);
 	return ret;
+
+fail_option_alloc:
+	P9_DPRINTK(P9_DEBUG_ERROR,
+		   "failed to allocate copy of option argument\n");
+	return -ENOMEM;
 }
 
 /**
@@ -200,30 +228,24 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	spin_lock(&v9fs_sessionlist_lock);
+	list_add(&v9ses->slist, &v9fs_sessionlist);
+	spin_unlock(&v9fs_sessionlist_lock);
+
 	v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
-	if (data) {
-		v9ses->options = kstrdup(data, GFP_KERNEL);
-		if (!v9ses->options) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-			       "failed to allocate copy of option string\n");
-			retval = -ENOMEM;
-			goto error;
-		}
-	}
 
-	rc = v9fs_parse_options(v9ses);
+	rc = v9fs_parse_options(v9ses, data);
 	if (rc < 0) {
 		retval = rc;
 		goto error;
 	}
 
-	v9ses->clnt = p9_client_create(dev_name, v9ses->options);
-
+	v9ses->clnt = p9_client_create(dev_name, data);
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
 		v9ses->clnt = NULL;
@@ -259,6 +281,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	else
 		fid->uid = ~0;
 
+#ifdef CONFIG_9P_FSCACHE
+	/* register the session for caching */
+	v9fs_cache_session_get_cookie(v9ses);
+#endif
+
 	return fid;
 
 error:
@@ -278,9 +305,18 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 		v9ses->clnt = NULL;
 	}
 
+#ifdef CONFIG_9P_FSCACHE
+	if (v9ses->fscache) {
+		v9fs_cache_session_put_cookie(v9ses);
+		kfree(v9ses->cachetag);
+	}
+#endif
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
-	kfree(v9ses->options);
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_del(&v9ses->slist);
+	spin_unlock(&v9fs_sessionlist_lock);
 }
 
 /**
@@ -297,25 +333,132 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 
 extern int v9fs_error_init(void);
 
+static struct kobject *v9fs_kobj;
+
+#ifdef CONFIG_9P_FSCACHE
+/**
+ * caches_show - list caches associated with a session
+ *
+ * Returns the size of buffer written.
+ */
+
+static ssize_t caches_show(struct kobject *kobj,
+			   struct kobj_attribute *attr,
+			   char *buf)
+{
+	ssize_t n = 0, count = 0, limit = PAGE_SIZE;
+	struct v9fs_session_info *v9ses;
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
+		if (v9ses->cachetag) {
+			n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
+			if (n < 0) {
+				count = n;
+				break;
+			}
+
+			count += n;
+			limit -= n;
+		}
+	}
+
+	spin_unlock(&v9fs_sessionlist_lock);
+	return count;
+}
+
+static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches);
+#endif /* CONFIG_9P_FSCACHE */
+
+static struct attribute *v9fs_attrs[] = {
+#ifdef CONFIG_9P_FSCACHE
+	&v9fs_attr_cache.attr,
+#endif
+	NULL,
+};
+
+static struct attribute_group v9fs_attr_group = {
+	.attrs = v9fs_attrs,
+};
+
 /**
- * v9fs_init - Initialize module
+ * v9fs_sysfs_init - Initialize the v9fs sysfs interface
+ *
+ */
+
+static int v9fs_sysfs_init(void)
+{
+	v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
+	if (!v9fs_kobj)
+		return -ENOMEM;
+
+	if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
+		kobject_put(v9fs_kobj);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface
+ *
+ */
+
+static void v9fs_sysfs_cleanup(void)
+{
+	sysfs_remove_group(v9fs_kobj, &v9fs_attr_group);
+	kobject_put(v9fs_kobj);
+}
+
+/**
+ * init_v9fs - Initialize module
  *
  */
 
 static int __init init_v9fs(void)
 {
+	int err;
 	printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
 	/* TODO: Setup list of registered trasnport modules */
-	return register_filesystem(&v9fs_fs_type);
+	err = register_filesystem(&v9fs_fs_type);
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register filesystem\n");
+		return err;
+	}
+
+	err = v9fs_cache_register();
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register v9fs for caching\n");
+		goto out_fs_unreg;
+	}
+
+	err = v9fs_sysfs_init();
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register with sysfs\n");
+		goto out_sysfs_cleanup;
+	}
+
+	return 0;
+
+out_sysfs_cleanup:
+	v9fs_sysfs_cleanup();
+
+out_fs_unreg:
+	unregister_filesystem(&v9fs_fs_type);
+
+	return err;
 }
 
 /**
- * v9fs_init - shutdown module
+ * exit_v9fs - shutdown module
  *
  */
 
 static void __exit exit_v9fs(void)
 {
+	v9fs_sysfs_cleanup();
+	v9fs_cache_unregister();
 	unregister_filesystem(&v9fs_fs_type);
 }
 
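The sysfs additions follow the stock kobject/attribute-group recipe: create a kobject under fs_kobj (yielding /sys/fs/9p), attach a group of attributes, and tear both down in reverse order on exit. The same recipe in isolation, with hypothetical example names:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct kobject *example_kobj;

/* Read-only attribute; appears as /sys/fs/example/status. */
static ssize_t status_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "ok\n");
}

static struct kobj_attribute example_attr_status = __ATTR_RO(status);

static struct attribute *example_attrs[] = {
	&example_attr_status.attr,
	NULL,
};

static struct attribute_group example_attr_group = {
	.attrs = example_attrs,
};

static int example_sysfs_init(void)
{
	example_kobj = kobject_create_and_add("example", fs_kobj);
	if (!example_kobj)
		return -ENOMEM;

	if (sysfs_create_group(example_kobj, &example_attr_group)) {
		kobject_put(example_kobj);
		return -ENOMEM;
	}
	return 0;
}

static void example_sysfs_cleanup(void)
{
	sysfs_remove_group(example_kobj, &example_attr_group);
	kobject_put(example_kobj);
}

With CONFIG_9P_FSCACHE enabled, the caches attribute added above lists the cachetag of every mounted session, one per line, under /sys/fs/9p/caches.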
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d567192998..019f4ccb70c1 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -51,6 +51,7 @@ enum p9_session_flags {
 enum p9_cache_modes {
 	CACHE_NONE,
 	CACHE_LOOSE,
+	CACHE_FSCACHE,
 };
 
 /**
@@ -60,6 +61,8 @@ enum p9_cache_modes {
  * @debug: debug level
  * @afid: authentication handle
  * @cache: cache mode of type &p9_cache_modes
+ * @cachetag: the tag of the cache associated with this session
+ * @fscache: session cookie associated with FS-Cache
  * @options: copy of options string given by user
  * @uname: string user name to mount hierarchy as
  * @aname: mount specifier for remote hierarchy
@@ -68,7 +71,7 @@ enum p9_cache_modes {
  * @dfltgid: default numeric groupid to mount hierarchy as
  * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
  * @clnt: reference to 9P network client instantiated for this session
- * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug
+ * @slist: reference to list of registered 9p sessions
  *
  * This structure holds state for each session instance established during
  * a sys_mount() .
@@ -84,8 +87,11 @@ struct v9fs_session_info {
 	unsigned short debug;
 	unsigned int afid;
 	unsigned int cache;
+#ifdef CONFIG_9P_FSCACHE
+	char *cachetag;
+	struct fscache_cookie *fscache;
+#endif
 
-	char *options;		/* copy of mount options */
 	char *uname;		/* user name to mount as */
 	char *aname;		/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
@@ -93,11 +99,9 @@ struct v9fs_session_info {
 	unsigned int dfltgid;	/* default gid for legacy support */
 	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
-	struct dentry *debugfs_dir;
+	struct list_head slist; /* list of sessions registered with v9fs */
 };
 
-extern struct dentry *v9fs_debugfs_root;
-
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 								char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f0c7de78e205..3a7560e35865 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,7 +44,13 @@ extern const struct file_operations v9fs_dir_operations;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
+#ifdef CONFIG_9P_FSCACHE
+struct inode *v9fs_alloc_inode(struct super_block *sb);
+void v9fs_destroy_inode(struct inode *inode);
+#endif
+
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+void v9fs_clear_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
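
v9fs_alloc_inode() and v9fs_destroy_inode() are exported here so they can be plugged into super_operations; that wiring lands in fs/9p/vfs_super.c, which this merge also changes (see the diffstat) but which is not shown in this section. A hedged sketch of what the hookup presumably looks like there:

/* Assumed wiring in vfs_super.c; that hunk is not shown in this
 * diff section, so treat the exact operation list as a guess. */
static const struct super_operations v9fs_super_ops = {
#ifdef CONFIG_9P_FSCACHE
	.alloc_inode	= v9fs_alloc_inode,
	.destroy_inode	= v9fs_destroy_inode,
#endif
	.statfs		= simple_statfs,
	.clear_inode	= v9fs_clear_inode,
	.umount_begin	= v9fs_umount_begin,
};

Embedding the struct inode inside struct v9fs_cookie and recovering it with container_of(), as v9fs_inode2cookie() in cache.h does, is the usual way a filesystem attaches private per-inode state.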
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 92828281a30b..90e38449f4b3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,6 +38,7 @@
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
+#include "cache.h"
 
 /**
  * v9fs_vfs_readpage - read an entire page in from 9P
@@ -52,18 +53,31 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	int retval;
 	loff_t offset;
 	char *buffer;
+	struct inode *inode;
 
+	inode = page->mapping->host;
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+
+	BUG_ON(!PageLocked(page));
+
+	retval = v9fs_readpage_from_fscache(inode, page);
+	if (retval == 0)
+		return retval;
+
 	buffer = kmap(page);
 	offset = page_offset(page);
 
 	retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
-	if (retval < 0)
+	if (retval < 0) {
+		v9fs_uncache_page(inode, page);
 		goto done;
+	}
 
 	memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval);
 	flush_dcache_page(page);
 	SetPageUptodate(page);
+
+	v9fs_readpage_to_fscache(inode, page);
 	retval = 0;
 
 done:
@@ -72,6 +86,78 @@ done:
 	return retval;
 }
 
+/**
+ * v9fs_vfs_readpages - read a set of pages from 9P
+ *
+ * @filp: file being read
+ * @mapping: the address space
+ * @pages: list of pages to read
+ * @nr_pages: count of pages to read
+ *
+ */
+
+static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
+			      struct list_head *pages, unsigned nr_pages)
+{
+	int ret = 0;
+	struct inode *inode;
+
+	inode = mapping->host;
+	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+
+	ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
+	if (ret == 0)
+		return ret;
+
+	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+	P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+	return ret;
+}
+
+/**
+ * v9fs_release_page - release the private state associated with a page
+ *
+ * Returns 1 if the page can be released, false otherwise.
+ */
+
+static int v9fs_release_page(struct page *page, gfp_t gfp)
+{
+	if (PagePrivate(page))
+		return 0;
+
+	return v9fs_fscache_release_page(page, gfp);
+}
+
+/**
+ * v9fs_invalidate_page - Invalidate a page completely or partially
+ *
+ * @page: structure to page
+ * @offset: offset in the page
+ */
+
+static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+{
+	if (offset == 0)
+		v9fs_fscache_invalidate_page(page);
+}
+
+/**
+ * v9fs_launder_page - Writeback a dirty page
+ * Since the writes go directly to the server, we simply return a 0
+ * here to indicate success.
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_launder_page(struct page *page)
+{
+	return 0;
+}
+
 const struct address_space_operations v9fs_addr_operations = {
 	.readpage = v9fs_vfs_readpage,
+	.readpages = v9fs_vfs_readpages,
+	.releasepage = v9fs_release_page,
+	.invalidatepage = v9fs_invalidate_page,
+	.launder_page = v9fs_launder_page,
 };
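
The rewritten readpage path is cache-first: ask FS-Cache for the page, and only on a miss (return value 1, or -ENOBUFS when no cookie exists) fall back to the network read, afterwards pushing the freshly read page into the cache. Distilled control flow, with read_from_server() as a hypothetical stand-in for the v9fs_file_readn()-based body above:

/* Control-flow sketch of v9fs_vfs_readpage() above. */
static int example_readpage(struct file *filp, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int ret;

	ret = v9fs_readpage_from_fscache(inode, page);
	if (ret == 0)
		return 0;	/* cache hit: I/O submitted, page is
				 * unlocked by the completion handler */

	ret = read_from_server(filp, page);	/* hypothetical helper */
	if (ret < 0) {
		v9fs_uncache_page(inode, page);	/* drop the reservation */
		return ret;
	}

	SetPageUptodate(page);
	v9fs_readpage_to_fscache(inode, page);	/* populate the cache */
	return 0;
}

The releasepage, invalidatepage, and launder_page additions exist to keep that cache state consistent when the VM reclaims or truncates pages.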
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 68bf2af6c389..3902bf43a088 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/inet.h>
 #include <linux/list.h>
+#include <linux/pagemap.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -40,6 +41,7 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "cache.h"
 
 static const struct file_operations v9fs_cached_file_operations;
 
@@ -72,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		return err;
 	}
 	if (omode & P9_OTRUNC) {
-		inode->i_size = 0;
+		i_size_write(inode, 0);
 		inode->i_blocks = 0;
 	}
 	if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
@@ -85,6 +87,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		/* enable cached file options */
 		if(file->f_op == &v9fs_file_operations)
 			file->f_op = &v9fs_cached_file_operations;
+
+#ifdef CONFIG_9P_FSCACHE
+		v9fs_cache_inode_set_cookie(inode, file);
+#endif
 	}
 
 	return 0;
@@ -210,6 +216,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int origin = *offset;
+	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
 		(int)count, (int)*offset);
@@ -225,7 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		if (count < rsize)
 			rsize = count;
 
-		n = p9_client_write(fid, NULL, data+total, *offset+total,
+		n = p9_client_write(fid, NULL, data+total, origin+total,
 									rsize);
 		if (n <= 0)
 			break;
@@ -234,14 +241,14 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	} while (count > 0);
 
 	if (total > 0) {
-		invalidate_inode_pages2_range(inode->i_mapping, origin,
-								origin+total);
+		pg_start = origin >> PAGE_CACHE_SHIFT;
+		pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
+		if (inode->i_mapping && inode->i_mapping->nrpages)
+			invalidate_inode_pages2_range(inode->i_mapping,
+						      pg_start, pg_end);
 		*offset += total;
-	}
-
-	if (*offset > inode->i_size) {
-		inode->i_size = *offset;
-		inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+		i_size_write(inode, i_size_read(inode) + total);
+		inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
 	}
 
 	if (n < 0)
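
Two distinct fixes hide in this hunk. First, the write position is now computed from the origin snapshot taken at entry instead of re-reading *offset each loop iteration. Second, invalidate_inode_pages2_range() takes page indices, not the byte offsets the old code passed. A worked example of the corrected arithmetic (PAGE_CACHE_SHIFT is 12 on systems with 4 KiB pages):

#include <linux/pagemap.h>

/* Worked example: a write of total = 5000 bytes at origin = 6000.
 *   pg_start = 6000 >> 12              = 1
 *   pg_end   = (6000 + 5000 - 1) >> 12 = 10999 >> 12 = 2
 * Pages 1 and 2 are exactly the pages the write touched; the old
 * code passed 6000 and 11000 as page *indices*, invalidating
 * thousands of unrelated cached pages. */
static void example_invalidate(struct address_space *mapping)
{
	loff_t origin = 6000, total = 5000;
	pgoff_t pg_start = origin >> PAGE_CACHE_SHIFT;		   /* 1 */
	pgoff_t pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; /* 2 */

	if (mapping && mapping->nrpages)
		invalidate_inode_pages2_range(mapping, pg_start, pg_end);
}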
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9f..5947628aefef 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -40,6 +40,7 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_ext;
@@ -171,7 +172,6 @@ int v9fs_uflags2omode(int uflags, int extended)
 
 /**
  * v9fs_blank_wstat - helper function to setup a 9P stat structure
- * @v9ses: 9P session info (for determining extended mode)
  * @wstat: structure to initialize
  *
  */
@@ -198,6 +198,39 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 	wstat->extension = NULL;
 }
 
+#ifdef CONFIG_9P_FSCACHE
+/**
+ * v9fs_alloc_inode - helper function to allocate an inode
+ * This callback is executed before setting up the inode so that we
+ * can associate a vcookie with each inode.
+ *
+ */
+
+struct inode *v9fs_alloc_inode(struct super_block *sb)
+{
+	struct v9fs_cookie *vcookie;
+	vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache,
+							 GFP_KERNEL);
+	if (!vcookie)
+		return NULL;
+
+	vcookie->fscache = NULL;
+	vcookie->qid = NULL;
+	spin_lock_init(&vcookie->lock);
+	return &vcookie->inode;
+}
+
+/**
+ * v9fs_destroy_inode - destroy an inode
+ *
+ */
+
+void v9fs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
+}
+#endif
+
 /**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
@@ -207,65 +240,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
207 240
208struct inode *v9fs_get_inode(struct super_block *sb, int mode) 241struct inode *v9fs_get_inode(struct super_block *sb, int mode)
209{ 242{
243 int err;
210 struct inode *inode; 244 struct inode *inode;
211 struct v9fs_session_info *v9ses = sb->s_fs_info; 245 struct v9fs_session_info *v9ses = sb->s_fs_info;
212 246
213 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); 247 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
214 248
215 inode = new_inode(sb); 249 inode = new_inode(sb);
216 if (inode) { 250 if (!inode) {
217 inode->i_mode = mode;
218 inode->i_uid = current_fsuid();
219 inode->i_gid = current_fsgid();
220 inode->i_blocks = 0;
221 inode->i_rdev = 0;
222 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
223 inode->i_mapping->a_ops = &v9fs_addr_operations;
224
225 switch (mode & S_IFMT) {
226 case S_IFIFO:
227 case S_IFBLK:
228 case S_IFCHR:
229 case S_IFSOCK:
230 if (!v9fs_extended(v9ses)) {
231 P9_DPRINTK(P9_DEBUG_ERROR,
232 "special files without extended mode\n");
233 return ERR_PTR(-EINVAL);
234 }
235 init_special_inode(inode, inode->i_mode,
236 inode->i_rdev);
237 break;
238 case S_IFREG:
239 inode->i_op = &v9fs_file_inode_operations;
240 inode->i_fop = &v9fs_file_operations;
241 break;
242 case S_IFLNK:
243 if (!v9fs_extended(v9ses)) {
244 P9_DPRINTK(P9_DEBUG_ERROR,
245 "extended modes used w/o 9P2000.u\n");
246 return ERR_PTR(-EINVAL);
247 }
248 inode->i_op = &v9fs_symlink_inode_operations;
249 break;
250 case S_IFDIR:
251 inc_nlink(inode);
252 if (v9fs_extended(v9ses))
253 inode->i_op = &v9fs_dir_inode_operations_ext;
254 else
255 inode->i_op = &v9fs_dir_inode_operations;
256 inode->i_fop = &v9fs_dir_operations;
257 break;
258 default:
259 P9_DPRINTK(P9_DEBUG_ERROR,
260 "BAD mode 0x%x S_IFMT 0x%x\n",
261 mode, mode & S_IFMT);
262 return ERR_PTR(-EINVAL);
263 }
264 } else {
265 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); 251 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
266 return ERR_PTR(-ENOMEM); 252 return ERR_PTR(-ENOMEM);
267 } 253 }
254
255 inode->i_mode = mode;
256 inode->i_uid = current_fsuid();
257 inode->i_gid = current_fsgid();
258 inode->i_blocks = 0;
259 inode->i_rdev = 0;
260 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
261 inode->i_mapping->a_ops = &v9fs_addr_operations;
262
263 switch (mode & S_IFMT) {
264 case S_IFIFO:
265 case S_IFBLK:
266 case S_IFCHR:
267 case S_IFSOCK:
268 if (!v9fs_extended(v9ses)) {
269 P9_DPRINTK(P9_DEBUG_ERROR,
270 "special files without extended mode\n");
271 err = -EINVAL;
272 goto error;
273 }
274 init_special_inode(inode, inode->i_mode, inode->i_rdev);
275 break;
276 case S_IFREG:
277 inode->i_op = &v9fs_file_inode_operations;
278 inode->i_fop = &v9fs_file_operations;
279 break;
280 case S_IFLNK:
281 if (!v9fs_extended(v9ses)) {
282 P9_DPRINTK(P9_DEBUG_ERROR,
283 "extended modes used w/o 9P2000.u\n");
284 err = -EINVAL;
285 goto error;
286 }
287 inode->i_op = &v9fs_symlink_inode_operations;
288 break;
289 case S_IFDIR:
290 inc_nlink(inode);
291 if (v9fs_extended(v9ses))
292 inode->i_op = &v9fs_dir_inode_operations_ext;
293 else
294 inode->i_op = &v9fs_dir_inode_operations;
295 inode->i_fop = &v9fs_dir_operations;
296 break;
297 default:
298 P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
299 mode, mode & S_IFMT);
300 err = -EINVAL;
301 goto error;
302 }
303
268 return inode; 304 return inode;
305
306error:
307 iput(inode);
308 return ERR_PTR(err);
269} 309}
270 310
271/* 311/*
@@ -320,6 +360,21 @@ error:
320} 360}
321*/ 361*/
322 362
363
364/**
365 * v9fs_clear_inode - release an inode
366 * @inode: inode to release
367 *
368 */
369void v9fs_clear_inode(struct inode *inode)
370{
371 filemap_fdatawrite(inode->i_mapping);
372
373#ifdef CONFIG_9P_FSCACHE
374 v9fs_cache_inode_put_cookie(inode);
375#endif
376}
377
323/** 378/**
324 * v9fs_inode_from_fid - populate an inode by issuing a attribute request 379 * v9fs_inode_from_fid - populate an inode by issuing a attribute request
325 * @v9ses: session information 380 * @v9ses: session information
@@ -338,30 +393,31 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
338 393
339 ret = NULL; 394 ret = NULL;
340 st = p9_client_stat(fid); 395 st = p9_client_stat(fid);
341 if (IS_ERR(st)) { 396 if (IS_ERR(st))
342 err = PTR_ERR(st); 397 return ERR_CAST(st);
343 st = NULL;
344 goto error;
345 }
346 398
347 umode = p9mode2unixmode(v9ses, st->mode); 399 umode = p9mode2unixmode(v9ses, st->mode);
348 ret = v9fs_get_inode(sb, umode); 400 ret = v9fs_get_inode(sb, umode);
349 if (IS_ERR(ret)) { 401 if (IS_ERR(ret)) {
350 err = PTR_ERR(ret); 402 err = PTR_ERR(ret);
351 ret = NULL;
352 goto error; 403 goto error;
353 } 404 }
354 405
355 v9fs_stat2inode(st, ret, sb); 406 v9fs_stat2inode(st, ret, sb);
356 ret->i_ino = v9fs_qid2ino(&st->qid); 407 ret->i_ino = v9fs_qid2ino(&st->qid);
408
409#ifdef CONFIG_9P_FSCACHE
410 v9fs_vcookie_set_qid(ret, &st->qid);
411 v9fs_cache_inode_get_cookie(ret);
412#endif
413 p9stat_free(st);
357 kfree(st); 414 kfree(st);
415
358 return ret; 416 return ret;
359 417
360error: 418error:
419 p9stat_free(st);
361 kfree(st); 420 kfree(st);
362 if (ret)
363 iput(ret);
364
365 return ERR_PTR(err); 421 return ERR_PTR(err);
366} 422}
367 423
@@ -403,9 +459,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
403 * @v9ses: session information 459 * @v9ses: session information
404 * @dir: directory that dentry is being created in 460 * @dir: directory that dentry is being created in
405 * @dentry: dentry that is being created 461 * @dentry: dentry that is being created
462 * @extension: 9p2000.u extension string to support devices, etc.
406 * @perm: create permissions 463 * @perm: create permissions
407 * @mode: open mode 464 * @mode: open mode
408 * @extension: 9p2000.u extension string to support devices, etc.
409 * 465 *
410 */ 466 */
411static struct p9_fid * 467static struct p9_fid *
@@ -470,7 +526,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
470 dentry->d_op = &v9fs_dentry_operations; 526 dentry->d_op = &v9fs_dentry_operations;
471 527
472 d_instantiate(dentry, inode); 528 d_instantiate(dentry, inode);
473 v9fs_fid_add(dentry, fid); 529 err = v9fs_fid_add(dentry, fid);
530 if (err < 0)
531 goto error;
532
474 return ofid; 533 return ofid;
475 534
476error: 535error:
@@ -747,7 +806,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
747 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 806 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
748 err = -EPERM; 807 err = -EPERM;
749 v9ses = v9fs_inode2v9ses(dentry->d_inode); 808 v9ses = v9fs_inode2v9ses(dentry->d_inode);
750 if (v9ses->cache == CACHE_LOOSE) 809 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
751 return simple_getattr(mnt, dentry, stat); 810 return simple_getattr(mnt, dentry, stat);
752 811
753 fid = v9fs_fid_lookup(dentry); 812 fid = v9fs_fid_lookup(dentry);
@@ -868,10 +927,10 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
868 } else 927 } else
869 inode->i_rdev = 0; 928 inode->i_rdev = 0;
870 929
871 inode->i_size = stat->length; 930 i_size_write(inode, stat->length);
872 931
873 /* not real number of blocks, but 512 byte ones ... */ 932 /* not real number of blocks, but 512 byte ones ... */
874 inode->i_blocks = (inode->i_size + 512 - 1) >> 9; 933 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
875} 934}
876 935
877/** 936/**
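The v9fs conversions from direct i_size assignment to i_size_write()/i_size_read() matter on 32-bit SMP kernels, where a 64-bit loff_t store is not atomic and is guarded by a seqcount. A hedged sketch of the pairing (the writer side must already be serialized, e.g. under i_mutex):

	i_size_write(inode, new_size);                 /* writer: bumps the seqcount */
	blocks = (i_size_read(inode) + 512 - 1) >> 9;  /* reader: lockless, retries on race */

i_size_read() loops until it sees a consistent value, so readers never observe a torn size.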
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 38d695d66a0b..14a86448572c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -44,21 +44,9 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static void v9fs_clear_inode(struct inode *);
 static const struct super_operations v9fs_super_ops;
 
 /**
- * v9fs_clear_inode - release an inode
- * @inode: inode to release
- *
- */
-
-static void v9fs_clear_inode(struct inode *inode)
-{
-	filemap_fdatawrite(inode->i_mapping);
-}
-
-/**
 * v9fs_set_super - set the superblock
 * @s: super block
 * @data: file system specific data
@@ -81,7 +69,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
 
 static void
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags)
+		int flags, void *data)
 {
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -91,6 +79,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 	    MS_NOATIME;
+
+	save_mount_options(sb, data);
 }
 
 /**
@@ -113,14 +103,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	struct v9fs_session_info *v9ses = NULL;
 	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
-	uid_t uid = current_fsuid();
-	gid_t gid = current_fsgid();
 	struct p9_fid *fid;
 	int retval = 0;
 
 	P9_DPRINTK(P9_DEBUG_VFS, " \n");
 
-	st = NULL;
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
 		return -ENOMEM;
@@ -142,7 +129,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(sb);
 		goto free_stat;
 	}
-	v9fs_fill_super(sb, v9ses, flags);
+	v9fs_fill_super(sb, v9ses, flags, data);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
@@ -150,9 +137,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 	}
 
-	inode->i_uid = uid;
-	inode->i_gid = gid;
-
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
@@ -173,10 +157,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	simple_set_mnt(mnt, sb);
 	return 0;
 
-release_sb:
-	deactivate_locked_super(sb);
-
free_stat:
+	p9stat_free(st);
 	kfree(st);
 
clunk_fid:
@@ -185,7 +167,12 @@ clunk_fid:
close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	return retval;
 
+release_sb:
+	p9stat_free(st);
+	kfree(st);
+	deactivate_locked_super(sb);
 	return retval;
 }
 
@@ -207,24 +194,10 @@ static void v9fs_kill_super(struct super_block *s)
 
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	s->s_fs_info = NULL;
 	P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
 }
 
-/**
- * v9fs_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- */
-
-static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-
-	seq_printf(m, "%s", v9ses->options);
-	return 0;
-}
-
static void
v9fs_umount_begin(struct super_block *sb)
{
@@ -235,9 +208,13 @@ v9fs_umount_begin(struct super_block *sb)
 }
 
 static const struct super_operations v9fs_super_ops = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
-	.show_options = v9fs_show_options,
+	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
 
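v9fs now delegates /proc/mounts output to the VFS: save_mount_options() stashes the raw option string at mount time and generic_show_options() replays it, which is why the hand-rolled v9fs_show_options() could go. A minimal sketch of the pattern for any filesystem (the examplefs names are hypothetical):

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		save_mount_options(sb, data);	/* keep a copy of the options string */
		return 0;
	}

	static const struct super_operations examplefs_super_ops = {
		.show_options = generic_show_options,	/* emits the saved string */
	};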
diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..d4bf8caad8d0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
@@ -108,6 +109,7 @@ source "fs/sysfs/Kconfig"
 
 config TMPFS
 	bool "Virtual memory file system support (former shm fs)"
+	depends on SHMEM
 	help
 	  Tmpfs is a file system which keeps all files in virtual memory.
 
@@ -186,7 +188,6 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 798cb071d132..3f57ce4bee5d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -19,9 +19,6 @@ static int
 adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
 	       int create)
 {
-	if (block < 0)
-		goto abort_negative;
-
 	if (!create) {
 		if (block >= inode->i_blocks)
 			goto abort_toobig;
@@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
 	/* don't support allocation of blocks yet */
 	return -EIO;
 
-abort_negative:
-	adfs_error(inode->i_sb, "block %d < 0", block);
-	return -EIO;
-
abort_toobig:
 	return 0;
 }
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
deleted file mode 100644
index 5c4f6b499e90..000000000000
--- a/fs/afs/cache.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* AFS local cache management interface
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/fscache.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e7..681c2a7b013f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
 
 	inode = page->mapping->host;
 
-	ASSERT(file != NULL);
-	key = file->private_data;
-	ASSERT(key != NULL);
+	if (file) {
+		key = file->private_data;
+		ASSERT(key != NULL);
+	} else {
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+			goto error_nokey;
+		}
+	}
 
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
 
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
 		unlock_page(page);
 	}
 
+	if (!file)
+		key_put(key);
 	_leave(" = 0");
 	return 0;
 
error:
 	SetPageError(page);
 	unlock_page(page);
+	if (!file)
+		key_put(key);
+error_nokey:
 	_leave(" = %d", ret);
 	return ret;
 }
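afs_readpage() can now run without a struct file, so it takes its own key reference in that case and must drop it on every exit path. A sketch of the ownership rule (cell lookup abbreviated; variable names illustrative):

	struct key *key;

	if (file) {
		key = file->private_data;	/* borrowed; do not put */
	} else {
		key = afs_request_key(cell);	/* owned; ERR_PTR on failure */
		if (IS_ERR(key))
			return PTR_ERR(key);
	}
	/* ... perform the read using key ... */
	if (!file)
		key_put(key);

Hence the new error_nokey label: it is reached only before any key reference exists.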
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3ff8bdd18fb3..0931bc1325eb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl);
static struct workqueue_struct *afs_lock_manager;
static DEFINE_MUTEX(afs_lock_manager_mutex);
 
-static struct file_lock_operations afs_lock_ops = {
+static const struct file_lock_operations afs_lock_ops = {
 	.fl_copy_lock		= afs_fl_copy_lock,
 	.fl_release_private	= afs_fl_release_private,
 };
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 106be66dafd2..6ece2a13bf71 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -18,10 +18,10 @@
 #include <linux/key.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <linux/fscache.h>
 
 #include "afs.h"
 #include "afs_vl.h"
-#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 8630615e57fe..852739d262a9 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -28,7 +28,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v);
static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
				    size_t size, loff_t *_pos);
 
-static struct seq_operations afs_proc_cells_ops = {
+static const struct seq_operations afs_proc_cells_ops = {
 	.start	= afs_proc_cells_start,
 	.next	= afs_proc_cells_next,
 	.stop	= afs_proc_cells_stop,
@@ -70,7 +70,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v);
static int afs_proc_cell_volumes_show(struct seq_file *m, void *v);
 
-static struct seq_operations afs_proc_cell_volumes_ops = {
+static const struct seq_operations afs_proc_cell_volumes_ops = {
 	.start	= afs_proc_cell_volumes_start,
 	.next	= afs_proc_cell_volumes_next,
 	.stop	= afs_proc_cell_volumes_stop,
@@ -95,7 +95,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v);
static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v);
 
-static struct seq_operations afs_proc_cell_vlservers_ops = {
+static const struct seq_operations afs_proc_cell_vlservers_ops = {
 	.start	= afs_proc_cell_vlservers_start,
 	.next	= afs_proc_cell_vlservers_next,
 	.stop	= afs_proc_cell_vlservers_stop,
@@ -119,7 +119,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
static void afs_proc_cell_servers_stop(struct seq_file *p, void *v);
static int afs_proc_cell_servers_show(struct seq_file *m, void *v);
 
-static struct seq_operations afs_proc_cell_servers_ops = {
+static const struct seq_operations afs_proc_cell_servers_ops = {
 	.start	= afs_proc_cell_servers_start,
 	.next	= afs_proc_cell_servers_next,
 	.stop	= afs_proc_cell_servers_stop,
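The four seq_operations tables above (like afs_lock_ops earlier) only gain a const qualifier. Constifying an operations table lets the compiler place it in read-only data, so the function pointers cannot be retargeted at runtime. The shape of the change, with hypothetical names:

	static const struct seq_operations example_seq_ops = {
		.start	= example_start,
		.next	= example_next,
		.stop	= example_stop,
		.show	= example_show,
	};

This works because seq_open() already accepts a const struct seq_operations pointer.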
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 		.bdi		= mapping->backing_dev_info,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_to_write	= LONG_MAX,
-		.for_writepages	= 1,
 		.range_cyclic	= 1,
 	};
 	int ret;
diff --git a/fs/aio.c b/fs/aio.c
index d065b2c3273e..02a2c9340573 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -24,6 +24,7 @@
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/mmu_context.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
@@ -34,7 +35,6 @@
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
-#include <asm/mmu_context.h>
 
 #if DEBUG > 1
 #define dprintk		printk
@@ -78,6 +78,7 @@ static int __init aio_setup(void)
 
 	return 0;
 }
+__initcall(aio_setup);
 
static void aio_free_ring(struct kioctx *ctx)
{
@@ -380,6 +381,7 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
 	__set_current_state(TASK_RUNNING);
 	return iocb->ki_user_data;
 }
+EXPORT_SYMBOL(wait_on_sync_kiocb);
 
 /* exit_aio: called when the last user of mm goes away.  At this point,
 * there is no way for any new requests to be submited or any of the
@@ -573,6 +575,7 @@ int aio_put_req(struct kiocb *req)
 	spin_unlock_irq(&ctx->ctx_lock);
 	return ret;
 }
+EXPORT_SYMBOL(aio_put_req);
 
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
@@ -595,51 +598,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 }
 
 /*
- * use_mm
- *	Makes the calling kernel thread take on the specified
- *	mm context.
- *	Called by the retry thread execute retries within the
- *	iocb issuer's mm context, so that copy_from/to_user
- *	operations work seamlessly for aio.
- *	(Note: this routine is intended to be called only
- *	from a kernel thread context)
- */
-static void use_mm(struct mm_struct *mm)
-{
-	struct mm_struct *active_mm;
-	struct task_struct *tsk = current;
-
-	task_lock(tsk);
-	active_mm = tsk->active_mm;
-	atomic_inc(&mm->mm_count);
-	tsk->mm = mm;
-	tsk->active_mm = mm;
-	switch_mm(active_mm, mm, tsk);
-	task_unlock(tsk);
-
-	mmdrop(active_mm);
-}
-
-/*
- * unuse_mm
- *	Reverses the effect of use_mm, i.e. releases the
- *	specified mm context which was earlier taken on
- *	by the calling kernel thread
- *	(Note: this routine is intended to be called only
- *	from a kernel thread context)
- */
-static void unuse_mm(struct mm_struct *mm)
-{
-	struct task_struct *tsk = current;
-
-	task_lock(tsk);
-	tsk->mm = NULL;
-	/* active_mm is still 'mm' */
-	enter_lazy_tlb(mm, tsk);
-	task_unlock(tsk);
-}
-
-/*
 * Queue up a kiocb to be retried. Assumes that the kiocb
 * has already been marked as kicked, and places it on
 * the retry run list for the corresponding ioctx, if it
@@ -1037,6 +995,7 @@ put_rq:
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 	return ret;
 }
+EXPORT_SYMBOL(aio_complete);
 
 /* aio_read_evt
 *	Pull an event off of the ioctx's event ring.  Returns the number of
@@ -1825,9 +1784,3 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
 	return ret;
 }
-
-__initcall(aio_setup);
-
-EXPORT_SYMBOL(aio_complete);
-EXPORT_SYMBOL(aio_put_req);
-EXPORT_SYMBOL(wait_on_sync_kiocb);
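use_mm()/unuse_mm() are not deleted here; they move to mm/mmu_context.c and are picked up via <linux/mmu_context.h>, so kernel threads other than the aio retry path can borrow a user address space. A hedged usage sketch (the worker function is hypothetical):

	#include <linux/mmu_context.h>
	#include <linux/uaccess.h>

	/* From a kernel thread: adopt 'mm' so copy_to_user() resolves
	 * against that process's address space, then release it. */
	static void example_complete(struct mm_struct *mm,
				     void __user *ubuf, const void *kbuf, size_t len)
	{
		use_mm(mm);
		if (copy_to_user(ubuf, kbuf, len))
			printk(KERN_WARNING "example: copy_to_user failed\n");
		unuse_mm(mm);
	}

The EXPORT_SYMBOL() moves in this file are the other half of the cleanup: each export now sits directly after the function it exports, the preferred kernel style.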
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47d4a01c5393..2ca7a7cafdbf 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -8,8 +8,10 @@
 *
 */
 
+#include <linux/cred.h>
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
@@ -77,28 +79,24 @@ static const struct address_space_operations anon_aops = {
 *
 * Creates a new file by hooking it on a single inode. This is useful for files
 * that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfd() will share a single inode,
+ * All the files created with anon_inode_getfile() will share a single inode,
 * hence saving memory and avoiding code duplication for the file/inode/dentry
- * setup.  Returns new descriptor or -error.
+ * setup.  Returns the newly created file* or an error pointer.
 */
-int anon_inode_getfd(const char *name, const struct file_operations *fops,
-		     void *priv, int flags)
+struct file *anon_inode_getfile(const char *name,
+				const struct file_operations *fops,
+				void *priv, int flags)
 {
 	struct qstr this;
 	struct dentry *dentry;
 	struct file *file;
-	int error, fd;
+	int error;
 
 	if (IS_ERR(anon_inode_inode))
-		return -ENODEV;
+		return ERR_PTR(-ENODEV);
 
 	if (fops->owner && !try_module_get(fops->owner))
-		return -ENOENT;
-
-	error = get_unused_fd_flags(flags);
-	if (error < 0)
-		goto err_module;
-	fd = error;
+		return ERR_PTR(-ENOENT);
 
 	/*
 	 * Link the inode to a directory entry by creating a unique name
@@ -110,7 +108,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	this.hash = 0;
 	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
 	if (!dentry)
-		goto err_put_unused_fd;
+		goto err_module;
 
 	/*
 	 * We know the anon_inode inode count is always greater than zero,
@@ -136,16 +134,54 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	file->f_version = 0;
 	file->private_data = priv;
 
+	return file;
+
+err_dput:
+	dput(dentry);
+err_module:
+	module_put(fops->owner);
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile);
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
+ *                    anonymous inode, and a dentry that describe the "class"
+ *                    of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfd() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup.  Returns new descriptor or an error code.
+ */
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags)
+{
+	int error, fd;
+	struct file *file;
+
+	error = get_unused_fd_flags(flags);
+	if (error < 0)
+		return error;
+	fd = error;
+
+	file = anon_inode_getfile(name, fops, priv, flags);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
 	fd_install(fd, file);
 
 	return fd;
 
-err_dput:
-	dput(dentry);
err_put_unused_fd:
 	put_unused_fd(fd);
-err_module:
-	module_put(fops->owner);
 	return error;
 }
EXPORT_SYMBOL_GPL(anon_inode_getfd);
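anon_inode_getfd() is split so callers can obtain the struct file before committing a descriptor: anon_inode_getfile() does the inode/dentry/file setup, and the fd wiring stays in the wrapper. A sketch of a caller that wants the file first (example_fops and priv are placeholders):

	int fd = get_unused_fd_flags(flags);
	struct file *file;

	if (fd < 0)
		return fd;
	file = anon_inode_getfile("example", &example_fops, priv, flags);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		return PTR_ERR(file);
	}
	/* the file can be stashed or configured here, before it is visible */
	fd_install(fd, file);
	return fd;

Nothing is reachable through the descriptor until fd_install(), which is the point of the split.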
diff --git a/fs/attr.c b/fs/attr.c
index 9fe1b1bd30a8..96d394bdaddf 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,7 +18,7 @@
18/* Taken over from the old code... */ 18/* Taken over from the old code... */
19 19
20/* POSIX UID/GID verification for setting inode attributes. */ 20/* POSIX UID/GID verification for setting inode attributes. */
21int inode_change_ok(struct inode *inode, struct iattr *attr) 21int inode_change_ok(const struct inode *inode, struct iattr *attr)
22{ 22{
23 int retval = -EPERM; 23 int retval = -EPERM;
24 unsigned int ia_valid = attr->ia_valid; 24 unsigned int ia_valid = attr->ia_valid;
@@ -60,9 +60,51 @@ fine:
60error: 60error:
61 return retval; 61 return retval;
62} 62}
63
64EXPORT_SYMBOL(inode_change_ok); 63EXPORT_SYMBOL(inode_change_ok);
65 64
65/**
66 * inode_newsize_ok - may this inode be truncated to a given size
67 * @inode: the inode to be truncated
68 * @offset: the new size to assign to the inode
69 * @Returns: 0 on success, -ve errno on failure
70 *
71 * inode_newsize_ok will check filesystem limits and ulimits to check that the
72 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
73 * when necessary. Caller must not proceed with inode size change if failure is
74 * returned. @inode must be a file (not directory), with appropriate
75 * permissions to allow truncate (inode_newsize_ok does NOT check these
76 * conditions).
77 *
78 * inode_newsize_ok must be called with i_mutex held.
79 */
80int inode_newsize_ok(const struct inode *inode, loff_t offset)
81{
82 if (inode->i_size < offset) {
83 unsigned long limit;
84
85 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
86 if (limit != RLIM_INFINITY && offset > limit)
87 goto out_sig;
88 if (offset > inode->i_sb->s_maxbytes)
89 goto out_big;
90 } else {
91 /*
92 * truncation of in-use swapfiles is disallowed - it would
93 * cause subsequent swapout to scribble on the now-freed
94 * blocks.
95 */
96 if (IS_SWAPFILE(inode))
97 return -ETXTBSY;
98 }
99
100 return 0;
101out_sig:
102 send_sig(SIGXFSZ, current, 0);
103out_big:
104 return -EFBIG;
105}
106EXPORT_SYMBOL(inode_newsize_ok);
107
66int inode_setattr(struct inode * inode, struct iattr * attr) 108int inode_setattr(struct inode * inode, struct iattr * attr)
67{ 109{
68 unsigned int ia_valid = attr->ia_valid; 110 unsigned int ia_valid = attr->ia_valid;
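inode_newsize_ok() centralizes the rlimit/s_maxbytes/swapfile checks that filesystems previously open-coded before truncating. A sketch of the intended call site in a filesystem's setattr path (error handling abbreviated):

	if (attr->ia_valid & ATTR_SIZE) {
		error = inode_newsize_ok(inode, attr->ia_size);
		if (error)
			return error;
		/* safe to proceed with the size change */
	}

Per the kernel-doc above, it must be called under i_mutex and may itself send SIGXFSZ.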
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 2316e944a109..e947915109e5 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -90,7 +90,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		while (d_mountpoint(path.dentry) && follow_down(&path));
+		while (d_mountpoint(path.dentry) && follow_down(&path))
 			;
 		umount_ok = may_umount(path.mnt);
 		path_put(&path);
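The dirhash change is purely about readability: a semicolon directly after a while condition silently makes the loop body empty. Compare:

	/* easy to misread - the trailing ';' is the entire loop body */
	while (d_mountpoint(path.dentry) && follow_down(&path));

	/* same semantics, but the null statement is explicit */
	while (d_mountpoint(path.dentry) && follow_down(&path))
		;

follow_down() advances path as a side effect, so an empty body is actually correct here.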
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index aa39ae83f019..3da18d453488 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	}
 
 	/* Update the expiry counter if fs is busy */
-	if (!may_umount_tree(mnt)) {
+	if (!may_umount_tree(path.mnt)) {
 		struct autofs_info *ino = autofs4_dentry_ino(top);
 		ino->last_used = jiffies;
 		goto done;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 615d5496fe0f..33baf27fac78 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,12 +737,7 @@ befs_put_super(struct super_block *sb)
 {
 	kfree(BEFS_SB(sb)->mount_opts.iocharset);
 	BEFS_SB(sb)->mount_opts.iocharset = NULL;
-
-	if (BEFS_SB(sb)->nls) {
-		unload_nls(BEFS_SB(sb)->nls);
-		BEFS_SB(sb)->nls = NULL;
-	}
-
+	unload_nls(BEFS_SB(sb)->nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 }
@@ -842,7 +837,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = BEFS_SUPER_MAGIC;
 	/* Set real blocksize of fs */
 	sb_set_blocksize(sb, (ulong) befs_sb->block_size);
-	sb->s_op = (struct super_operations *) &befs_sops;
+	sb->s_op = &befs_sops;
 	root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
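The befs simplification leans on unload_nls() having been made NULL-safe in this same merge window, so the caller-side guard is redundant:

	/* old: the caller had to check */
	if (BEFS_SB(sb)->nls)
		unload_nls(BEFS_SB(sb)->nls);

	/* new: unload_nls(NULL) is a no-op */
	unload_nls(BEFS_SB(sb)->nls);

Clearing ->nls afterwards is unnecessary too, since the superblock info is freed immediately below.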
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..b9b3bb51b1e4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		}
 	}
 
-	/*
-	 * Now fill out the bss section.  First pad the last page up
-	 * to the page boundary, and then perform a mmap to make sure
-	 * that there are zero-mapped pages up to and including the
-	 * last bss page.
-	 */
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out_close;
-	}
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
 
-	/* What we have mapped so far */
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
-	/* Map the last of the bss segment */
-	if (last_bss > elf_bss) {
+		/* Map the last of the bss segment */
 		down_write(&current->mm->mmap_sem);
 		error = do_brk(elf_bss, last_bss - elf_bss);
 		up_write(&current->mm->mmap_sem);
@@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file,
#define DUMP_WRITE(addr, nr)	\
 	if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
 		goto end_coredump;
-#define DUMP_SEEK(off)	\
-	if (!dump_seek(file, (off))) \
-		goto end_coredump;
 
static void fill_elf_header(struct elfhdr *elf, int segs,
			    u16 machine, u32 flags, u8 osabi)
@@ -1714,42 +1711,52 @@ struct elf_note_info {
 	int numnote;
 };
 
-static int fill_note_info(struct elfhdr *elf, int phdrs,
-		struct elf_note_info *info,
-		long signr, struct pt_regs *regs)
+static int elf_note_info_init(struct elf_note_info *info)
 {
-#define	NUM_NOTES	6
-	struct list_head *t;
-
-	info->notes = NULL;
-	info->prstatus = NULL;
-	info->psinfo = NULL;
-	info->fpu = NULL;
-#ifdef ELF_CORE_COPY_XFPREGS
-	info->xfpu = NULL;
-#endif
+	memset(info, 0, sizeof(*info));
 	INIT_LIST_HEAD(&info->thread_list);
 
-	info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
-			      GFP_KERNEL);
+	/* Allocate space for six ELF notes */
+	info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
 	if (!info->notes)
 		return 0;
 	info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
 	if (!info->psinfo)
-		return 0;
+		goto notes_free;
 	info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
 	if (!info->prstatus)
-		return 0;
+		goto psinfo_free;
 	info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
 	if (!info->fpu)
-		return 0;
+		goto prstatus_free;
 #ifdef ELF_CORE_COPY_XFPREGS
 	info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
 	if (!info->xfpu)
-		return 0;
+		goto fpu_free;
+#endif
+	return 1;
+#ifdef ELF_CORE_COPY_XFPREGS
+ fpu_free:
+	kfree(info->fpu);
 #endif
+ prstatus_free:
+	kfree(info->prstatus);
+ psinfo_free:
+	kfree(info->psinfo);
+ notes_free:
+	kfree(info->notes);
+	return 0;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+			  struct elf_note_info *info,
+			  long signr, struct pt_regs *regs)
+{
+	struct list_head *t;
+
+	if (!elf_note_info_init(info))
+		return 0;
 
-	info->thread_status_size = 0;
 	if (signr) {
 		struct core_thread *ct;
 		struct elf_thread_status *ets;
@@ -1809,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
#endif
 
 	return 1;
-
-#undef NUM_NOTES
 }
 
static size_t get_note_info_size(struct elf_note_info *info)
@@ -2016,7 +2021,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 		goto end_coredump;
 
 	/* Align to page */
-	DUMP_SEEK(dataoff - foffset);
+	if (!dump_seek(file, dataoff - foffset))
+		goto end_coredump;
 
 	for (vma = first_vma(current, gate_vma); vma != NULL;
 			vma = next_vma(vma, gate_vma)) {
@@ -2027,33 +2033,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 
 		for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
 			struct page *page;
-			struct vm_area_struct *tmp_vma;
+			int stop;
 
-			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
-						&page, &tmp_vma) <= 0) {
-				DUMP_SEEK(PAGE_SIZE);
-			} else {
-				if (page == ZERO_PAGE(0)) {
-					if (!dump_seek(file, PAGE_SIZE)) {
-						page_cache_release(page);
-						goto end_coredump;
-					}
-				} else {
-					void *kaddr;
-					flush_cache_page(tmp_vma, addr,
-							 page_to_pfn(page));
-					kaddr = kmap(page);
-					if ((size += PAGE_SIZE) > limit ||
-					    !dump_write(file, kaddr,
-					    PAGE_SIZE)) {
-						kunmap(page);
-						page_cache_release(page);
-						goto end_coredump;
-					}
-					kunmap(page);
-				}
+			page = get_dump_page(addr);
+			if (page) {
+				void *kaddr = kmap(page);
+				stop = ((size += PAGE_SIZE) > limit) ||
+					!dump_write(file, kaddr, PAGE_SIZE);
+				kunmap(page);
 				page_cache_release(page);
-			}
+			} else
+				stop = !dump_seek(file, PAGE_SIZE);
+			if (stop)
+				goto end_coredump;
 		}
 	}
 
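Both core dumpers are rewritten around get_dump_page(), which returns the page backing an address, or NULL for holes and for zero pages, letting the caller emit a sparse seek instead of writing zeroes. The contract, as the new loop uses it (variable names taken from the diff):

	struct page *page = get_dump_page(addr);	/* reference held on success */
	if (page) {
		void *kaddr = kmap(page);
		stop = ((size += PAGE_SIZE) > limit) ||
			!dump_write(file, kaddr, PAGE_SIZE);
		kunmap(page);
		page_cache_release(page);		/* drop the reference */
	} else
		stop = !dump_seek(file, PAGE_SIZE);	/* hole: just seek */

This replaces the open-coded get_user_pages() call and the explicit ZERO_PAGE() special case, which get_dump_page() reports as a hole.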
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 20fbeced472b..38502c67987c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	}
 
 	stack_size = exec_params.stack_size;
-	if (stack_size < interp_params.stack_size)
-		stack_size = interp_params.stack_size;
-
 	if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
 		executable_stack = EXSTACK_ENABLE_X;
 	else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
 		executable_stack = EXSTACK_DISABLE_X;
-	else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
-		executable_stack = EXSTACK_ENABLE_X;
-	else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
-		executable_stack = EXSTACK_DISABLE_X;
 	else
 		executable_stack = EXSTACK_DEFAULT;
 
+	if (stack_size == 0) {
+		stack_size = interp_params.stack_size;
+		if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
+			executable_stack = EXSTACK_ENABLE_X;
+		else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
+			executable_stack = EXSTACK_DISABLE_X;
+		else
+			executable_stack = EXSTACK_DEFAULT;
+	}
+
 	retval = -ENOEXEC;
 	if (stack_size == 0)
 		goto error;
@@ -1325,9 +1328,6 @@ static int writenote(struct memelfnote *men, struct file *file)
#define DUMP_WRITE(addr, nr)	\
 	if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
 		goto end_coredump;
-#define DUMP_SEEK(off)	\
-	if (!dump_seek(file, (off))) \
-		goto end_coredump;
 
static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
{
@@ -1518,6 +1518,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 			   unsigned long *limit, unsigned long mm_flags)
 {
 	struct vm_area_struct *vma;
+	int err = 0;
 
 	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		unsigned long addr;
@@ -1525,43 +1526,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 		if (!maydump(vma, mm_flags))
 			continue;
 
-		for (addr = vma->vm_start;
-		     addr < vma->vm_end;
-		     addr += PAGE_SIZE
-		     ) {
-			struct vm_area_struct *vma;
-			struct page *page;
-
-			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
-					   &page, &vma) <= 0) {
-				DUMP_SEEK(file->f_pos + PAGE_SIZE);
-			}
-			else if (page == ZERO_PAGE(0)) {
-				page_cache_release(page);
-				DUMP_SEEK(file->f_pos + PAGE_SIZE);
-			}
-			else {
-				void *kaddr;
-
-				flush_cache_page(vma, addr, page_to_pfn(page));
-				kaddr = kmap(page);
-				if ((*size += PAGE_SIZE) > *limit ||
-				    !dump_write(file, kaddr, PAGE_SIZE)
-				    ) {
-					kunmap(page);
-					page_cache_release(page);
-					return -EIO;
-				}
+		for (addr = vma->vm_start; addr < vma->vm_end;
+		     addr += PAGE_SIZE) {
+			struct page *page = get_dump_page(addr);
+			if (page) {
+				void *kaddr = kmap(page);
+				*size += PAGE_SIZE;
+				if (*size > *limit)
+					err = -EFBIG;
+				else if (!dump_write(file, kaddr, PAGE_SIZE))
+					err = -EIO;
 				kunmap(page);
 				page_cache_release(page);
-			}
+			} else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
+				err = -EFBIG;
+			if (err)
+				goto out;
 		}
 	}
-
-	return 0;
-
-end_coredump:
-	return -EFBIG;
+out:
+	return err;
 }
#endif
 
@@ -1802,7 +1786,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 		goto end_coredump;
 	}
 
-	DUMP_SEEK(dataoff);
+	if (!dump_seek(file, dataoff))
+		goto end_coredump;
 
 	if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0)
 		goto end_coredump;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e92f229e3c6e..a2796651e756 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -278,8 +278,6 @@ static int decompress_exec(
 		ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
 		if (ret <= 0)
 			break;
-		if (ret >= (unsigned long) -4096)
-			break;
 		len -= ret;
 
 		strm.next_in = buf;
@@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 					"(%d != %d)", (unsigned) r, curid, id);
 			goto failed;
 		} else if ( ! p->lib_list[id].loaded &&
-				load_flat_shared_library(id, p) > (unsigned long) -4096) {
+				IS_ERR_VALUE(load_flat_shared_library(id, p))) {
 			printk("BINFMT_FLAT: failed to load library %d", id);
 			goto failed;
 		}
@@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
 				  MAP_PRIVATE|MAP_EXECUTABLE, 0);
 		up_write(&current->mm->mmap_sem);
-		if (!textpos || textpos >= (unsigned long) -4096) {
+		if (!textpos || IS_ERR_VALUE(textpos)) {
 			if (!textpos)
 				textpos = (unsigned long) -ENOMEM;
 			printk("Unable to mmap process text, errno %d\n", (int)-textpos);
@@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 				PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
 		up_write(&current->mm->mmap_sem);
 
-		if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
+		if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
 			if (!realdatastart)
 				realdatastart = (unsigned long) -ENOMEM;
 			printk("Unable to allocate RAM for process data, errno %d\n",
@@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			result = bprm->file->f_op->read(bprm->file, (char *) datapos,
 					data_len + (relocs * sizeof(unsigned long)), &fpos);
 		}
-		if (result >= (unsigned long)-4096) {
+		if (IS_ERR_VALUE(result)) {
 			printk("Unable to read data+bss, errno %d\n", (int)-result);
 			do_munmap(current->mm, textpos, text_len);
 			do_munmap(current->mm, realdatastart, data_len + extra);
@@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 				PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
 		up_write(&current->mm->mmap_sem);
 
-		if (!textpos || textpos >= (unsigned long) -4096) {
+		if (!textpos || IS_ERR_VALUE(textpos)) {
 			if (!textpos)
 				textpos = (unsigned long) -ENOMEM;
 			printk("Unable to allocate RAM for process text/data, errno %d\n",
@@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			fpos = 0;
 			result = bprm->file->f_op->read(bprm->file,
 					(char *) textpos, text_len, &fpos);
-			if (result < (unsigned long) -4096)
+			if (!IS_ERR_VALUE(result))
 				result = decompress_exec(bprm, text_len, (char *) datapos,
 						 data_len + (relocs * sizeof(unsigned long)), 0);
 		}
@@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm,
 			fpos = 0;
 			result = bprm->file->f_op->read(bprm->file,
 					(char *) textpos, text_len, &fpos);
-			if (result < (unsigned long) -4096) {
+			if (!IS_ERR_VALUE(result)) {
 				fpos = ntohl(hdr->data_start);
 				result = bprm->file->f_op->read(bprm->file, (char *) datapos,
 					data_len + (relocs * sizeof(unsigned long)), &fpos);
 			}
 		}
-		if (result >= (unsigned long)-4096) {
+		if (IS_ERR_VALUE(result)) {
 			printk("Unable to read code+data+bss, errno %d\n",(int)-result);
 			do_munmap(current->mm, textpos, text_len + data_len + extra +
 				MAX_SHARED_LIBS * sizeof(unsigned long));
@@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 
 	res = prepare_binprm(&bprm);
 
-	if (res <= (unsigned long)-4096)
+	if (!IS_ERR_VALUE(res))
 		res = load_flat_file(&bprm, libs, id, NULL);
 
 	abort_creds(bprm.cred);
@@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
 
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
-	if (res > (unsigned long)-4096)
+	if (IS_ERR_VALUE(res))
 		return res;
 
 	/* Update data segment pointers for all libraries */
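All of the binfmt_flat "-4096" comparisons above are conversions to IS_ERR_VALUE(). For reference, from <linux/err.h>:

	#define MAX_ERRNO	4095
	#define IS_ERR_VALUE(x)	unlikely((x) >= (unsigned long)-MAX_ERRNO)

That is, an unsigned long encodes a -errno exactly when it lands in the top MAX_ERRNO bytes of the address space; the open-coded tests used -4096 (one past the real boundary) with an inconsistent mix of >, >=, < and <=, which the macro makes uniform. A typical caller then just propagates the error:

	if (IS_ERR_VALUE(res))
		return res;	/* res already holds -errno */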
diff --git a/fs/bio.c b/fs/bio.c
index 76738005c8e8..402cb84a92a1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 
 	mempool_free(p, bs->bio_pool);
 }
+EXPORT_SYMBOL(bio_free);
 
void bio_init(struct bio *bio)
{
@@ -257,6 +258,7 @@ void bio_init(struct bio *bio)
 	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
+EXPORT_SYMBOL(bio_init);
 
/**
 * bio_alloc_bioset - allocate a bio for I/O
@@ -311,6 +313,7 @@ err_free:
 	mempool_free(p, bs->bio_pool);
 	return NULL;
 }
+EXPORT_SYMBOL(bio_alloc_bioset);
 
static void bio_fs_destructor(struct bio *bio)
{
@@ -337,6 +340,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 
 	return bio;
 }
+EXPORT_SYMBOL(bio_alloc);
 
static void bio_kmalloc_destructor(struct bio *bio)
{
@@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 
 	return bio;
 }
+EXPORT_SYMBOL(bio_kmalloc);
 
void zero_fill_bio(struct bio *bio)
{
@@ -416,6 +421,7 @@ void bio_put(struct bio *bio)
 		bio->bi_destructor(bio);
 	}
 }
+EXPORT_SYMBOL(bio_put);
 
inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
@@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 
 	return bio->bi_phys_segments;
 }
+EXPORT_SYMBOL(bio_phys_segments);
 
/**
 * __bio_clone - clone a bio
@@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
 	bio->bi_size = bio_src->bi_size;
 	bio->bi_idx = bio_src->bi_idx;
 }
+EXPORT_SYMBOL(__bio_clone);
 
/**
 * bio_clone - clone a bio
@@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 
 	return b;
 }
+EXPORT_SYMBOL(bio_clone);
 
/**
 * bio_get_nr_vecs - return approx number of vecs
@@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev)
 
 	return nr_pages;
 }
+EXPORT_SYMBOL(bio_get_nr_vecs);
 
static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
			  *page, unsigned int len, unsigned int offset,
@@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
 	return __bio_add_page(q, bio, page, len, offset,
 			      queue_max_hw_sectors(q));
 }
+EXPORT_SYMBOL(bio_add_pc_page);
 
/**
 * bio_add_page - attempt to add page to bio
@@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
 }
+EXPORT_SYMBOL(bio_add_page);
 
struct bio_map_data {
 	struct bio_vec *iovecs;
@@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio)
 	bio_put(bio);
 	return ret;
 }
+EXPORT_SYMBOL(bio_uncopy_user);
 
/**
 * bio_copy_user_iov - copy user data to bio
@@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
 
 	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
+EXPORT_SYMBOL(bio_copy_user);
 
static struct bio *__bio_map_user_iov(struct request_queue *q,
				      struct block_device *bdev,
@@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
 
 	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
+EXPORT_SYMBOL(bio_map_user);
 
/**
 * bio_map_user_iov - map user sg_iovec table into bio
@@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio)
 	__bio_unmap_user(bio);
 	bio_put(bio);
 }
+EXPORT_SYMBOL(bio_unmap_user);
 
static void bio_map_kern_endio(struct bio *bio, int err)
{
 	bio_put(bio);
 }
 
-
static struct bio *__bio_map_kern(struct request_queue *q, void *data,
				  unsigned int len, gfp_t gfp_mask)
{
@@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 	bio_put(bio);
 	return ERR_PTR(-EINVAL);
 }
+EXPORT_SYMBOL(bio_map_kern);
 
static void bio_copy_kern_endio(struct bio *bio, int err)
1194{ 1210{
@@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1250 1266
1251 return bio; 1267 return bio;
1252} 1268}
1269EXPORT_SYMBOL(bio_copy_kern);
1253 1270
1254/* 1271/*
1255 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions 1272 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error)
1400 if (bio->bi_end_io) 1417 if (bio->bi_end_io)
1401 bio->bi_end_io(bio, error); 1418 bio->bi_end_io(bio, error);
1402} 1419}
1420EXPORT_SYMBOL(bio_endio);
1403 1421
1404void bio_pair_release(struct bio_pair *bp) 1422void bio_pair_release(struct bio_pair *bp)
1405{ 1423{
@@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp)
1410 mempool_free(bp, bp->bio2.bi_private); 1428 mempool_free(bp, bp->bio2.bi_private);
1411 } 1429 }
1412} 1430}
1431EXPORT_SYMBOL(bio_pair_release);
1413 1432
1414static void bio_pair_end_1(struct bio *bi, int err) 1433static void bio_pair_end_1(struct bio *bi, int err)
1415{ 1434{
@@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1477 1496
1478 return bp; 1497 return bp;
1479} 1498}
1499EXPORT_SYMBOL(bio_split);
1480 1500
1481/** 1501/**
1482 * bio_sector_offset - Find hardware sector offset in bio 1502 * bio_sector_offset - Find hardware sector offset in bio
@@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs)
1547 1567
1548 kfree(bs); 1568 kfree(bs);
1549} 1569}
1570EXPORT_SYMBOL(bioset_free);
1550 1571
1551/** 1572/**
1552 * bioset_create - Create a bio_set 1573 * bioset_create - Create a bio_set
@@ -1592,6 +1613,7 @@ bad:
1592 bioset_free(bs); 1613 bioset_free(bs);
1593 return NULL; 1614 return NULL;
1594} 1615}
1616EXPORT_SYMBOL(bioset_create);
1595 1617
1596static void __init biovec_init_slabs(void) 1618static void __init biovec_init_slabs(void)
1597{ 1619{
@@ -1636,29 +1658,4 @@ static int __init init_bio(void)
1636 1658
1637 return 0; 1659 return 0;
1638} 1660}
1639
1640subsys_initcall(init_bio); 1661subsys_initcall(init_bio);
1641
1642EXPORT_SYMBOL(bio_alloc);
1643EXPORT_SYMBOL(bio_kmalloc);
1644EXPORT_SYMBOL(bio_put);
1645EXPORT_SYMBOL(bio_free);
1646EXPORT_SYMBOL(bio_endio);
1647EXPORT_SYMBOL(bio_init);
1648EXPORT_SYMBOL(__bio_clone);
1649EXPORT_SYMBOL(bio_clone);
1650EXPORT_SYMBOL(bio_phys_segments);
1651EXPORT_SYMBOL(bio_add_page);
1652EXPORT_SYMBOL(bio_add_pc_page);
1653EXPORT_SYMBOL(bio_get_nr_vecs);
1654EXPORT_SYMBOL(bio_map_user);
1655EXPORT_SYMBOL(bio_unmap_user);
1656EXPORT_SYMBOL(bio_map_kern);
1657EXPORT_SYMBOL(bio_copy_kern);
1658EXPORT_SYMBOL(bio_pair_release);
1659EXPORT_SYMBOL(bio_split);
1660EXPORT_SYMBOL(bio_copy_user);
1661EXPORT_SYMBOL(bio_uncopy_user);
1662EXPORT_SYMBOL(bioset_create);
1663EXPORT_SYMBOL(bioset_free);
1664EXPORT_SYMBOL(bio_alloc_bioset);
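
The whole fs/bio.c diff is mechanical: the EXPORT_SYMBOL() block that used to sit at the bottom of the file is dissolved, and each export moves directly under the function it publishes, which is the preferred kernel style. A sketch of the convention, with a stub macro so it compiles outside the kernel (the function name is illustrative, not from the patch):

    #include <stdio.h>

    /* stand-in so the example builds in userspace; in-tree,
     * EXPORT_SYMBOL() emits the symbol into the module export table */
    #define EXPORT_SYMBOL(sym) extern char __export_##sym

    /* preferred style: the export sits immediately after the definition,
     * so a reader sees at a glance that the function is public */
    int demo_add(int a, int b)
    {
        return a + b;
    }
    EXPORT_SYMBOL(demo_add);

    int main(void)
    {
        printf("%d\n", demo_add(2, 3));
        return 0;
    }
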
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 94dfda24c06e..9cf4b926f8e4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev);
216 * freeze_bdev -- lock a filesystem and force it into a consistent state 216 * freeze_bdev -- lock a filesystem and force it into a consistent state
217 * @bdev: blockdevice to lock 217 * @bdev: blockdevice to lock
218 * 218 *
219 * This takes the block device bd_mount_sem to make sure no new mounts
220 * happen on bdev until thaw_bdev() is called.
221 * If a superblock is found on this device, we take the s_umount semaphore 219 * If a superblock is found on this device, we take the s_umount semaphore
222 * on it to make sure nobody unmounts until the snapshot creation is done. 220 * on it to make sure nobody unmounts until the snapshot creation is done.
223 * The reference counter (bd_fsfreeze_count) guarantees that only the last 221 * The reference counter (bd_fsfreeze_count) guarantees that only the last
@@ -232,46 +230,55 @@ struct super_block *freeze_bdev(struct block_device *bdev)
232 int error = 0; 230 int error = 0;
233 231
234 mutex_lock(&bdev->bd_fsfreeze_mutex); 232 mutex_lock(&bdev->bd_fsfreeze_mutex);
235 if (bdev->bd_fsfreeze_count > 0) { 233 if (++bdev->bd_fsfreeze_count > 1) {
236 bdev->bd_fsfreeze_count++; 234 /*
235 * We don't even need to grab a reference - the first call
 236 * to freeze_bdev grabs an active reference and only the last
237 * thaw_bdev drops it.
238 */
237 sb = get_super(bdev); 239 sb = get_super(bdev);
240 drop_super(sb);
238 mutex_unlock(&bdev->bd_fsfreeze_mutex); 241 mutex_unlock(&bdev->bd_fsfreeze_mutex);
239 return sb; 242 return sb;
240 } 243 }
241 bdev->bd_fsfreeze_count++; 244
242 245 sb = get_active_super(bdev);
243 down(&bdev->bd_mount_sem); 246 if (!sb)
244 sb = get_super(bdev); 247 goto out;
245 if (sb && !(sb->s_flags & MS_RDONLY)) { 248 if (sb->s_flags & MS_RDONLY) {
246 sb->s_frozen = SB_FREEZE_WRITE; 249 deactivate_locked_super(sb);
247 smp_wmb(); 250 mutex_unlock(&bdev->bd_fsfreeze_mutex);
248 251 return sb;
249 sync_filesystem(sb); 252 }
250 253
251 sb->s_frozen = SB_FREEZE_TRANS; 254 sb->s_frozen = SB_FREEZE_WRITE;
252 smp_wmb(); 255 smp_wmb();
253 256
254 sync_blockdev(sb->s_bdev); 257 sync_filesystem(sb);
255 258
256 if (sb->s_op->freeze_fs) { 259 sb->s_frozen = SB_FREEZE_TRANS;
257 error = sb->s_op->freeze_fs(sb); 260 smp_wmb();
258 if (error) { 261
259 printk(KERN_ERR 262 sync_blockdev(sb->s_bdev);
260 "VFS:Filesystem freeze failed\n"); 263
261 sb->s_frozen = SB_UNFROZEN; 264 if (sb->s_op->freeze_fs) {
262 drop_super(sb); 265 error = sb->s_op->freeze_fs(sb);
263 up(&bdev->bd_mount_sem); 266 if (error) {
264 bdev->bd_fsfreeze_count--; 267 printk(KERN_ERR
265 mutex_unlock(&bdev->bd_fsfreeze_mutex); 268 "VFS:Filesystem freeze failed\n");
266 return ERR_PTR(error); 269 sb->s_frozen = SB_UNFROZEN;
267 } 270 deactivate_locked_super(sb);
271 bdev->bd_fsfreeze_count--;
272 mutex_unlock(&bdev->bd_fsfreeze_mutex);
273 return ERR_PTR(error);
268 } 274 }
269 } 275 }
276 up_write(&sb->s_umount);
270 277
278 out:
271 sync_blockdev(bdev); 279 sync_blockdev(bdev);
272 mutex_unlock(&bdev->bd_fsfreeze_mutex); 280 mutex_unlock(&bdev->bd_fsfreeze_mutex);
273 281 return sb; /* thaw_bdev releases s->s_umount */
274 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
275} 282}
276EXPORT_SYMBOL(freeze_bdev); 283EXPORT_SYMBOL(freeze_bdev);
277 284
@@ -284,44 +291,44 @@ EXPORT_SYMBOL(freeze_bdev);
284 */ 291 */
285int thaw_bdev(struct block_device *bdev, struct super_block *sb) 292int thaw_bdev(struct block_device *bdev, struct super_block *sb)
286{ 293{
287 int error = 0; 294 int error = -EINVAL;
288 295
289 mutex_lock(&bdev->bd_fsfreeze_mutex); 296 mutex_lock(&bdev->bd_fsfreeze_mutex);
290 if (!bdev->bd_fsfreeze_count) { 297 if (!bdev->bd_fsfreeze_count)
291 mutex_unlock(&bdev->bd_fsfreeze_mutex); 298 goto out_unlock;
292 return -EINVAL; 299
293 } 300 error = 0;
294 301 if (--bdev->bd_fsfreeze_count > 0)
295 bdev->bd_fsfreeze_count--; 302 goto out_unlock;
296 if (bdev->bd_fsfreeze_count > 0) { 303
297 if (sb) 304 if (!sb)
298 drop_super(sb); 305 goto out_unlock;
299 mutex_unlock(&bdev->bd_fsfreeze_mutex); 306
300 return 0; 307 BUG_ON(sb->s_bdev != bdev);
301 } 308 down_write(&sb->s_umount);
302 309 if (sb->s_flags & MS_RDONLY)
303 if (sb) { 310 goto out_deactivate;
304 BUG_ON(sb->s_bdev != bdev); 311
305 if (!(sb->s_flags & MS_RDONLY)) { 312 if (sb->s_op->unfreeze_fs) {
306 if (sb->s_op->unfreeze_fs) { 313 error = sb->s_op->unfreeze_fs(sb);
307 error = sb->s_op->unfreeze_fs(sb); 314 if (error) {
308 if (error) { 315 printk(KERN_ERR
309 printk(KERN_ERR 316 "VFS:Filesystem thaw failed\n");
310 "VFS:Filesystem thaw failed\n"); 317 sb->s_frozen = SB_FREEZE_TRANS;
311 sb->s_frozen = SB_FREEZE_TRANS; 318 bdev->bd_fsfreeze_count++;
312 bdev->bd_fsfreeze_count++; 319 mutex_unlock(&bdev->bd_fsfreeze_mutex);
313 mutex_unlock(&bdev->bd_fsfreeze_mutex); 320 return error;
314 return error;
315 }
316 }
317 sb->s_frozen = SB_UNFROZEN;
318 smp_wmb();
319 wake_up(&sb->s_wait_unfrozen);
320 } 321 }
321 drop_super(sb);
322 } 322 }
323 323
324 up(&bdev->bd_mount_sem); 324 sb->s_frozen = SB_UNFROZEN;
325 smp_wmb();
326 wake_up(&sb->s_wait_unfrozen);
327
328out_deactivate:
329 if (sb)
330 deactivate_locked_super(sb);
331out_unlock:
325 mutex_unlock(&bdev->bd_fsfreeze_mutex); 332 mutex_unlock(&bdev->bd_fsfreeze_mutex);
326 return 0; 333 return 0;
327} 334}
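
Both block_dev.c hunks above replace the old bd_mount_sem scheme with a pure counter protocol under bd_fsfreeze_mutex: the first freeze_bdev() takes an active superblock reference and actually freezes, nested calls only bump bd_fsfreeze_count, and only the thaw that drops the count to zero unfreezes. A userspace sketch of that nesting discipline (names and the pthread mutex are illustrative stand-ins):

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t fsfreeze_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int fsfreeze_count;

    static void freeze(void)
    {
        pthread_mutex_lock(&fsfreeze_mutex);
        if (++fsfreeze_count == 1)
            printf("first freezer: quiesce writes\n");
        /* nested freezers just ride the existing frozen state */
        pthread_mutex_unlock(&fsfreeze_mutex);
    }

    static int thaw(void)
    {
        int ret = 0;

        pthread_mutex_lock(&fsfreeze_mutex);
        if (!fsfreeze_count)
            ret = -1;                    /* -EINVAL in the kernel code */
        else if (--fsfreeze_count == 0)
            printf("last thaw: resume writes\n");
        pthread_mutex_unlock(&fsfreeze_mutex);
        return ret;
    }

    int main(void)
    {
        freeze(); freeze();   /* nested freeze */
        thaw();               /* still frozen: count drops 2 -> 1 */
        thaw();               /* actually thaws */
        if (thaw())
            puts("unbalanced thaw rejected");
        return 0;
    }
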
@@ -420,7 +427,6 @@ static void bdev_destroy_inode(struct inode *inode)
420{ 427{
421 struct bdev_inode *bdi = BDEV_I(inode); 428 struct bdev_inode *bdi = BDEV_I(inode);
422 429
423 bdi->bdev.bd_inode_backing_dev_info = NULL;
424 kmem_cache_free(bdev_cachep, bdi); 430 kmem_cache_free(bdev_cachep, bdi);
425} 431}
426 432
@@ -431,7 +437,6 @@ static void init_once(void *foo)
431 437
432 memset(bdev, 0, sizeof(*bdev)); 438 memset(bdev, 0, sizeof(*bdev));
433 mutex_init(&bdev->bd_mutex); 439 mutex_init(&bdev->bd_mutex);
434 sema_init(&bdev->bd_mount_sem, 1);
435 INIT_LIST_HEAD(&bdev->bd_inodes); 440 INIT_LIST_HEAD(&bdev->bd_inodes);
436 INIT_LIST_HEAD(&bdev->bd_list); 441 INIT_LIST_HEAD(&bdev->bd_list);
437#ifdef CONFIG_SYSFS 442#ifdef CONFIG_SYSFS
@@ -1115,7 +1120,7 @@ EXPORT_SYMBOL(revalidate_disk);
1115int check_disk_change(struct block_device *bdev) 1120int check_disk_change(struct block_device *bdev)
1116{ 1121{
1117 struct gendisk *disk = bdev->bd_disk; 1122 struct gendisk *disk = bdev->bd_disk;
1118 struct block_device_operations * bdops = disk->fops; 1123 const struct block_device_operations *bdops = disk->fops;
1119 1124
1120 if (!bdops->media_changed) 1125 if (!bdops->media_changed)
1121 return 0; 1126 return 0;
@@ -1405,6 +1410,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1405} 1410}
1406 1411
1407/* 1412/*
1413 * Write data to the block device. Only intended for the block device itself
1414 * and the raw driver which basically is a fake block device.
1415 *
1416 * Does not take i_mutex for the write and thus is not for general purpose
1417 * use.
1418 */
1419ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1420 unsigned long nr_segs, loff_t pos)
1421{
1422 struct file *file = iocb->ki_filp;
1423 ssize_t ret;
1424
1425 BUG_ON(iocb->ki_pos != pos);
1426
1427 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1428 if (ret > 0 || ret == -EIOCBQUEUED) {
1429 ssize_t err;
1430
1431 err = generic_write_sync(file, pos, ret);
1432 if (err < 0 && ret > 0)
1433 ret = err;
1434 }
1435 return ret;
1436}
1437EXPORT_SYMBOL_GPL(blkdev_aio_write);
1438
1439/*
1408 * Try to release a page associated with block device when the system 1440 * Try to release a page associated with block device when the system
1409 * is under memory pressure. 1441 * is under memory pressure.
1410 */ 1442 */
@@ -1436,7 +1468,7 @@ const struct file_operations def_blk_fops = {
1436 .read = do_sync_read, 1468 .read = do_sync_read,
1437 .write = do_sync_write, 1469 .write = do_sync_write,
1438 .aio_read = generic_file_aio_read, 1470 .aio_read = generic_file_aio_read,
1439 .aio_write = generic_file_aio_write_nolock, 1471 .aio_write = blkdev_aio_write,
1440 .mmap = generic_file_mmap, 1472 .mmap = generic_file_mmap,
1441 .fsync = block_fsync, 1473 .fsync = block_fsync,
1442 .unlocked_ioctl = block_ioctl, 1474 .unlocked_ioctl = block_ioctl,
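
The new blkdev_aio_write() exists because generic_file_aio_write() takes i_mutex, which block devices do not need; the open-coded variant calls __generic_file_aio_write() and then generic_write_sync() so O_SYNC writers still get their flush. The error merging at the end is the subtle part: a sync failure only replaces the return value when the write itself made progress. That merge rule, as a compact sketch:

    #include <stdio.h>

    /* merge a write result with a later sync result the way
     * blkdev_aio_write() does: keep the byte count unless the
     * sync failed after a successful write */
    static long merge_write_sync(long written, long sync_err)
    {
        if (sync_err < 0 && written > 0)
            return sync_err;
        return written;
    }

    int main(void)
    {
        printf("%ld\n", merge_write_sync(4096, 0));   /* 4096: clean write */
        printf("%ld\n", merge_write_sync(4096, -5));  /* -5: sync error wins */
        printf("%ld\n", merge_write_sync(-28, -5));   /* -28: write error kept */
        return 0;
    }
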
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..69b355ae7f49 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
27#include "btrfs_inode.h" 27#include "btrfs_inode.h"
28#include "xattr.h" 28#include "xattr.h"
29 29
30#ifdef CONFIG_FS_POSIX_ACL 30#ifdef CONFIG_BTRFS_POSIX_ACL
31 31
32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
33{ 33{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
313 .set = btrfs_xattr_acl_access_set, 313 .set = btrfs_xattr_acl_access_set,
314}; 314};
315 315
316#else /* CONFIG_FS_POSIX_ACL */ 316#else /* CONFIG_BTRFS_POSIX_ACL */
317 317
318int btrfs_acl_chmod(struct inode *inode) 318int btrfs_acl_chmod(struct inode *inode)
319{ 319{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
325 return 0; 325 return 0;
326} 326}
327 327
328#endif /* CONFIG_FS_POSIX_ACL */ 328#endif /* CONFIG_BTRFS_POSIX_ACL */
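
The rename from CONFIG_FS_POSIX_ACL to CONFIG_BTRFS_POSIX_ACL gives btrfs its own Kconfig switch; note that the #else branch keeps btrfs_acl_chmod() and btrfs_init_acl() as no-op stubs so callers never need their own ifdefs. The shape of that pattern, reduced to a standalone sketch (the CONFIG_DEMO_ACL name is invented for illustration):

    #include <stdio.h>

    /* flip this to exercise both branches, the way a Kconfig
     * option such as CONFIG_BTRFS_POSIX_ACL would */
    #define CONFIG_DEMO_ACL 1

    #if CONFIG_DEMO_ACL
    static int demo_acl_chmod(int mode)
    {
        printf("applying ACL for mode %o\n", mode);
        return 0;
    }
    #else
    /* feature compiled out: callers still link against a no-op */
    static int demo_acl_chmod(int mode)
    {
        (void)mode;
        return 0;
    }
    #endif

    int main(void)
    {
        return demo_acl_chmod(0644);
    }
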
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 019e8af449ab..282ca085c2fb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
48 /* number of things on the pending list */ 48 /* number of things on the pending list */
49 atomic_t num_pending; 49 atomic_t num_pending;
50 50
51 /* reference counter for this struct */
52 atomic_t refs;
53
51 unsigned long sequence; 54 unsigned long sequence;
52 55
53 /* protects the pending list. */ 56 /* protects the pending list. */
@@ -71,7 +74,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
71 unsigned long flags; 74 unsigned long flags;
72 spin_lock_irqsave(&worker->workers->lock, flags); 75 spin_lock_irqsave(&worker->workers->lock, flags);
73 worker->idle = 1; 76 worker->idle = 1;
74 list_move(&worker->worker_list, &worker->workers->idle_list); 77
78 /* the list may be empty if the worker is just starting */
79 if (!list_empty(&worker->worker_list)) {
80 list_move(&worker->worker_list,
81 &worker->workers->idle_list);
82 }
75 spin_unlock_irqrestore(&worker->workers->lock, flags); 83 spin_unlock_irqrestore(&worker->workers->lock, flags);
76 } 84 }
77} 85}
@@ -87,23 +95,49 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
87 unsigned long flags; 95 unsigned long flags;
88 spin_lock_irqsave(&worker->workers->lock, flags); 96 spin_lock_irqsave(&worker->workers->lock, flags);
89 worker->idle = 0; 97 worker->idle = 0;
90 list_move_tail(&worker->worker_list, 98
91 &worker->workers->worker_list); 99 if (!list_empty(&worker->worker_list)) {
100 list_move_tail(&worker->worker_list,
101 &worker->workers->worker_list);
102 }
92 spin_unlock_irqrestore(&worker->workers->lock, flags); 103 spin_unlock_irqrestore(&worker->workers->lock, flags);
93 } 104 }
94} 105}
95 106
96static noinline int run_ordered_completions(struct btrfs_workers *workers, 107static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
97 struct btrfs_work *work)
98{ 108{
109 struct btrfs_workers *workers = worker->workers;
99 unsigned long flags; 110 unsigned long flags;
100 111
112 rmb();
113 if (!workers->atomic_start_pending)
114 return;
115
116 spin_lock_irqsave(&workers->lock, flags);
117 if (!workers->atomic_start_pending)
118 goto out;
119
120 workers->atomic_start_pending = 0;
121 if (workers->num_workers >= workers->max_workers)
122 goto out;
123
124 spin_unlock_irqrestore(&workers->lock, flags);
125 btrfs_start_workers(workers, 1);
126 return;
127
128out:
129 spin_unlock_irqrestore(&workers->lock, flags);
130}
131
132static noinline int run_ordered_completions(struct btrfs_workers *workers,
133 struct btrfs_work *work)
134{
101 if (!workers->ordered) 135 if (!workers->ordered)
102 return 0; 136 return 0;
103 137
104 set_bit(WORK_DONE_BIT, &work->flags); 138 set_bit(WORK_DONE_BIT, &work->flags);
105 139
106 spin_lock_irqsave(&workers->lock, flags); 140 spin_lock(&workers->order_lock);
107 141
108 while (1) { 142 while (1) {
109 if (!list_empty(&workers->prio_order_list)) { 143 if (!list_empty(&workers->prio_order_list)) {
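
check_pending_worker_creates(), added in the hunk above, is a classic double-checked flag: an unlocked read behind a barrier filters the common case, and only when the flag looks set is the spinlock taken, the flag re-tested and cleared, and the lock dropped before the (potentially sleeping) worker start. A userspace analogue using C11 atomics in place of rmb() (all names are illustrative):

    #include <stdio.h>
    #include <pthread.h>
    #include <stdatomic.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_int start_pending;

    static void start_one_worker(void)
    {
        printf("spawning deferred worker\n");
    }

    static void check_pending_creates(void)
    {
        /* cheap unlocked test first; acquire pairs with the
         * release store in request_worker_create() */
        if (!atomic_load_explicit(&start_pending, memory_order_acquire))
            return;

        pthread_mutex_lock(&lock);
        if (atomic_load_explicit(&start_pending, memory_order_relaxed)) {
            /* re-check under the lock so only one caller acts */
            atomic_store_explicit(&start_pending, 0, memory_order_relaxed);
            pthread_mutex_unlock(&lock);
            start_one_worker();   /* may sleep; lock already dropped */
            return;
        }
        pthread_mutex_unlock(&lock);
    }

    static void request_worker_create(void)
    {
        /* called from a context that must not sleep */
        atomic_store_explicit(&start_pending, 1, memory_order_release);
    }

    int main(void)
    {
        check_pending_creates();  /* no-op: nothing pending */
        request_worker_create();
        check_pending_creates();  /* spawns exactly once */
        check_pending_creates();  /* no-op again */
        return 0;
    }
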
@@ -126,45 +160,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
126 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 160 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
127 break; 161 break;
128 162
129 spin_unlock_irqrestore(&workers->lock, flags); 163 spin_unlock(&workers->order_lock);
130 164
131 work->ordered_func(work); 165 work->ordered_func(work);
132 166
133 /* now take the lock again and call the freeing code */ 167 /* now take the lock again and call the freeing code */
134 spin_lock_irqsave(&workers->lock, flags); 168 spin_lock(&workers->order_lock);
135 list_del(&work->order_list); 169 list_del(&work->order_list);
136 work->ordered_free(work); 170 work->ordered_free(work);
137 } 171 }
138 172
139 spin_unlock_irqrestore(&workers->lock, flags); 173 spin_unlock(&workers->order_lock);
140 return 0; 174 return 0;
141} 175}
142 176
177static void put_worker(struct btrfs_worker_thread *worker)
178{
179 if (atomic_dec_and_test(&worker->refs))
180 kfree(worker);
181}
182
183static int try_worker_shutdown(struct btrfs_worker_thread *worker)
184{
185 int freeit = 0;
186
187 spin_lock_irq(&worker->lock);
188 spin_lock(&worker->workers->lock);
189 if (worker->workers->num_workers > 1 &&
190 worker->idle &&
191 !worker->working &&
192 !list_empty(&worker->worker_list) &&
193 list_empty(&worker->prio_pending) &&
194 list_empty(&worker->pending) &&
195 atomic_read(&worker->num_pending) == 0) {
196 freeit = 1;
197 list_del_init(&worker->worker_list);
198 worker->workers->num_workers--;
199 }
200 spin_unlock(&worker->workers->lock);
201 spin_unlock_irq(&worker->lock);
202
203 if (freeit)
204 put_worker(worker);
205 return freeit;
206}
207
208static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
209 struct list_head *prio_head,
210 struct list_head *head)
211{
212 struct btrfs_work *work = NULL;
213 struct list_head *cur = NULL;
214
215 if (!list_empty(prio_head))
216 cur = prio_head->next;
217
218 smp_mb();
219 if (!list_empty(&worker->prio_pending))
220 goto refill;
221
222 if (!list_empty(head))
223 cur = head->next;
224
225 if (cur)
226 goto out;
227
228refill:
229 spin_lock_irq(&worker->lock);
230 list_splice_tail_init(&worker->prio_pending, prio_head);
231 list_splice_tail_init(&worker->pending, head);
232
233 if (!list_empty(prio_head))
234 cur = prio_head->next;
235 else if (!list_empty(head))
236 cur = head->next;
237 spin_unlock_irq(&worker->lock);
238
239 if (!cur)
240 goto out_fail;
241
242out:
243 work = list_entry(cur, struct btrfs_work, list);
244
245out_fail:
246 return work;
247}
248
143/* 249/*
144 * main loop for servicing work items 250 * main loop for servicing work items
145 */ 251 */
146static int worker_loop(void *arg) 252static int worker_loop(void *arg)
147{ 253{
148 struct btrfs_worker_thread *worker = arg; 254 struct btrfs_worker_thread *worker = arg;
149 struct list_head *cur; 255 struct list_head head;
256 struct list_head prio_head;
150 struct btrfs_work *work; 257 struct btrfs_work *work;
258
259 INIT_LIST_HEAD(&head);
260 INIT_LIST_HEAD(&prio_head);
261
151 do { 262 do {
152 spin_lock_irq(&worker->lock); 263again:
153again_locked:
154 while (1) { 264 while (1) {
155 if (!list_empty(&worker->prio_pending)) 265
156 cur = worker->prio_pending.next; 266
157 else if (!list_empty(&worker->pending)) 267 work = get_next_work(worker, &prio_head, &head);
158 cur = worker->pending.next; 268 if (!work)
159 else
160 break; 269 break;
161 270
162 work = list_entry(cur, struct btrfs_work, list);
163 list_del(&work->list); 271 list_del(&work->list);
164 clear_bit(WORK_QUEUED_BIT, &work->flags); 272 clear_bit(WORK_QUEUED_BIT, &work->flags);
165 273
166 work->worker = worker; 274 work->worker = worker;
167 spin_unlock_irq(&worker->lock);
168 275
169 work->func(work); 276 work->func(work);
170 277
@@ -175,9 +282,13 @@ again_locked:
175 */ 282 */
176 run_ordered_completions(worker->workers, work); 283 run_ordered_completions(worker->workers, work);
177 284
178 spin_lock_irq(&worker->lock); 285 check_pending_worker_creates(worker);
179 check_idle_worker(worker); 286
180 } 287 }
288
289 spin_lock_irq(&worker->lock);
290 check_idle_worker(worker);
291
181 if (freezing(current)) { 292 if (freezing(current)) {
182 worker->working = 0; 293 worker->working = 0;
183 spin_unlock_irq(&worker->lock); 294 spin_unlock_irq(&worker->lock);
@@ -216,8 +327,10 @@ again_locked:
216 spin_lock_irq(&worker->lock); 327 spin_lock_irq(&worker->lock);
217 set_current_state(TASK_INTERRUPTIBLE); 328 set_current_state(TASK_INTERRUPTIBLE);
218 if (!list_empty(&worker->pending) || 329 if (!list_empty(&worker->pending) ||
219 !list_empty(&worker->prio_pending)) 330 !list_empty(&worker->prio_pending)) {
220 goto again_locked; 331 spin_unlock_irq(&worker->lock);
332 goto again;
333 }
221 334
222 /* 335 /*
223 * this makes sure we get a wakeup when someone 336 * this makes sure we get a wakeup when someone
@@ -226,8 +339,13 @@ again_locked:
226 worker->working = 0; 339 worker->working = 0;
227 spin_unlock_irq(&worker->lock); 340 spin_unlock_irq(&worker->lock);
228 341
229 if (!kthread_should_stop()) 342 if (!kthread_should_stop()) {
230 schedule(); 343 schedule_timeout(HZ * 120);
344 if (!worker->working &&
345 try_worker_shutdown(worker)) {
346 return 0;
347 }
348 }
231 } 349 }
232 __set_current_state(TASK_RUNNING); 350 __set_current_state(TASK_RUNNING);
233 } 351 }
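
The schedule() to schedule_timeout(HZ * 120) change above is what lets idle btrfs workers exit: instead of sleeping indefinitely for a wakeup, a worker sleeps at most two minutes, and if it wakes with nothing queued, try_worker_shutdown() lets it take itself off the list. A pthread sketch of that bounded idle wait (timeout shortened to two seconds for demonstration; names invented):

    #include <stdio.h>
    #include <time.h>
    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int pending;   /* stands in for the worker's pending lists */

    static void *worker_loop(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        for (;;) {
            while (pending) {
                pending--;                     /* service one item */
                printf("worked one item\n");
            }
            struct timespec deadline;
            clock_gettime(CLOCK_REALTIME, &deadline);
            deadline.tv_sec += 2;              /* HZ * 120 in the kernel */
            if (pthread_cond_timedwait(&cond, &lock, &deadline) != 0 &&
                !pending) {
                /* idle for the whole timeout: shut ourselves down,
                 * as try_worker_shutdown() does */
                printf("idle worker exiting\n");
                break;
            }
        }
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pending = 3;
        pthread_create(&t, NULL, worker_loop, NULL);
        pthread_join(t, NULL);
        return 0;
    }
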
@@ -242,16 +360,30 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
242{ 360{
243 struct list_head *cur; 361 struct list_head *cur;
244 struct btrfs_worker_thread *worker; 362 struct btrfs_worker_thread *worker;
363 int can_stop;
245 364
365 spin_lock_irq(&workers->lock);
246 list_splice_init(&workers->idle_list, &workers->worker_list); 366 list_splice_init(&workers->idle_list, &workers->worker_list);
247 while (!list_empty(&workers->worker_list)) { 367 while (!list_empty(&workers->worker_list)) {
248 cur = workers->worker_list.next; 368 cur = workers->worker_list.next;
249 worker = list_entry(cur, struct btrfs_worker_thread, 369 worker = list_entry(cur, struct btrfs_worker_thread,
250 worker_list); 370 worker_list);
251 kthread_stop(worker->task); 371
252 list_del(&worker->worker_list); 372 atomic_inc(&worker->refs);
253 kfree(worker); 373 workers->num_workers -= 1;
374 if (!list_empty(&worker->worker_list)) {
375 list_del_init(&worker->worker_list);
376 put_worker(worker);
377 can_stop = 1;
378 } else
379 can_stop = 0;
380 spin_unlock_irq(&workers->lock);
381 if (can_stop)
382 kthread_stop(worker->task);
383 spin_lock_irq(&workers->lock);
384 put_worker(worker);
254 } 385 }
386 spin_unlock_irq(&workers->lock);
255 return 0; 387 return 0;
256} 388}
257 389
@@ -266,10 +398,13 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
266 INIT_LIST_HEAD(&workers->order_list); 398 INIT_LIST_HEAD(&workers->order_list);
267 INIT_LIST_HEAD(&workers->prio_order_list); 399 INIT_LIST_HEAD(&workers->prio_order_list);
268 spin_lock_init(&workers->lock); 400 spin_lock_init(&workers->lock);
401 spin_lock_init(&workers->order_lock);
269 workers->max_workers = max; 402 workers->max_workers = max;
270 workers->idle_thresh = 32; 403 workers->idle_thresh = 32;
271 workers->name = name; 404 workers->name = name;
272 workers->ordered = 0; 405 workers->ordered = 0;
406 workers->atomic_start_pending = 0;
407 workers->atomic_worker_start = 0;
273} 408}
274 409
275/* 410/*
@@ -293,7 +428,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
293 INIT_LIST_HEAD(&worker->prio_pending); 428 INIT_LIST_HEAD(&worker->prio_pending);
294 INIT_LIST_HEAD(&worker->worker_list); 429 INIT_LIST_HEAD(&worker->worker_list);
295 spin_lock_init(&worker->lock); 430 spin_lock_init(&worker->lock);
431
296 atomic_set(&worker->num_pending, 0); 432 atomic_set(&worker->num_pending, 0);
433 atomic_set(&worker->refs, 1);
297 worker->workers = workers; 434 worker->workers = workers;
298 worker->task = kthread_run(worker_loop, worker, 435 worker->task = kthread_run(worker_loop, worker,
299 "btrfs-%s-%d", workers->name, 436 "btrfs-%s-%d", workers->name,
@@ -303,7 +440,6 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
303 kfree(worker); 440 kfree(worker);
304 goto fail; 441 goto fail;
305 } 442 }
306
307 spin_lock_irq(&workers->lock); 443 spin_lock_irq(&workers->lock);
308 list_add_tail(&worker->worker_list, &workers->idle_list); 444 list_add_tail(&worker->worker_list, &workers->idle_list);
309 worker->idle = 1; 445 worker->idle = 1;
@@ -350,7 +486,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
350 */ 486 */
351 next = workers->worker_list.next; 487 next = workers->worker_list.next;
352 worker = list_entry(next, struct btrfs_worker_thread, worker_list); 488 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
353 atomic_inc(&worker->num_pending);
354 worker->sequence++; 489 worker->sequence++;
355 490
356 if (worker->sequence % workers->idle_thresh == 0) 491 if (worker->sequence % workers->idle_thresh == 0)
@@ -367,28 +502,18 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
367{ 502{
368 struct btrfs_worker_thread *worker; 503 struct btrfs_worker_thread *worker;
369 unsigned long flags; 504 unsigned long flags;
505 struct list_head *fallback;
370 506
371again: 507again:
372 spin_lock_irqsave(&workers->lock, flags); 508 spin_lock_irqsave(&workers->lock, flags);
373 worker = next_worker(workers); 509 worker = next_worker(workers);
374 spin_unlock_irqrestore(&workers->lock, flags);
375 510
376 if (!worker) { 511 if (!worker) {
377 spin_lock_irqsave(&workers->lock, flags);
378 if (workers->num_workers >= workers->max_workers) { 512 if (workers->num_workers >= workers->max_workers) {
379 struct list_head *fallback = NULL; 513 goto fallback;
380 /* 514 } else if (workers->atomic_worker_start) {
381 * we have failed to find any workers, just 515 workers->atomic_start_pending = 1;
382 * return the force one 516 goto fallback;
383 */
384 if (!list_empty(&workers->worker_list))
385 fallback = workers->worker_list.next;
386 if (!list_empty(&workers->idle_list))
387 fallback = workers->idle_list.next;
388 BUG_ON(!fallback);
389 worker = list_entry(fallback,
390 struct btrfs_worker_thread, worker_list);
391 spin_unlock_irqrestore(&workers->lock, flags);
392 } else { 517 } else {
393 spin_unlock_irqrestore(&workers->lock, flags); 518 spin_unlock_irqrestore(&workers->lock, flags);
394 /* we're below the limit, start another worker */ 519 /* we're below the limit, start another worker */
@@ -396,6 +521,28 @@ again:
396 goto again; 521 goto again;
397 } 522 }
398 } 523 }
524 goto found;
525
526fallback:
527 fallback = NULL;
528 /*
529 * we have failed to find any workers, just
530 * return the first one we can find.
531 */
532 if (!list_empty(&workers->worker_list))
533 fallback = workers->worker_list.next;
534 if (!list_empty(&workers->idle_list))
535 fallback = workers->idle_list.next;
536 BUG_ON(!fallback);
537 worker = list_entry(fallback,
538 struct btrfs_worker_thread, worker_list);
539found:
540 /*
541 * this makes sure the worker doesn't exit before it is placed
542 * onto a busy/idle list
543 */
544 atomic_inc(&worker->num_pending);
545 spin_unlock_irqrestore(&workers->lock, flags);
399 return worker; 546 return worker;
400} 547}
401 548
@@ -427,7 +574,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
427 spin_lock(&worker->workers->lock); 574 spin_lock(&worker->workers->lock);
428 worker->idle = 0; 575 worker->idle = 0;
429 list_move_tail(&worker->worker_list, 576 list_move_tail(&worker->worker_list,
430 &worker->workers->worker_list); 577 &worker->workers->worker_list);
431 spin_unlock(&worker->workers->lock); 578 spin_unlock(&worker->workers->lock);
432 } 579 }
433 if (!worker->working) { 580 if (!worker->working) {
@@ -435,9 +582,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
435 worker->working = 1; 582 worker->working = 1;
436 } 583 }
437 584
438 spin_unlock_irqrestore(&worker->lock, flags);
439 if (wake) 585 if (wake)
440 wake_up_process(worker->task); 586 wake_up_process(worker->task);
587 spin_unlock_irqrestore(&worker->lock, flags);
441out: 588out:
442 589
443 return 0; 590 return 0;
@@ -463,14 +610,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
463 610
464 worker = find_worker(workers); 611 worker = find_worker(workers);
465 if (workers->ordered) { 612 if (workers->ordered) {
466 spin_lock_irqsave(&workers->lock, flags); 613 /*
614 * you're not allowed to do ordered queues from an
615 * interrupt handler
616 */
617 spin_lock(&workers->order_lock);
467 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { 618 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
468 list_add_tail(&work->order_list, 619 list_add_tail(&work->order_list,
469 &workers->prio_order_list); 620 &workers->prio_order_list);
470 } else { 621 } else {
471 list_add_tail(&work->order_list, &workers->order_list); 622 list_add_tail(&work->order_list, &workers->order_list);
472 } 623 }
473 spin_unlock_irqrestore(&workers->lock, flags); 624 spin_unlock(&workers->order_lock);
474 } else { 625 } else {
475 INIT_LIST_HEAD(&work->order_list); 626 INIT_LIST_HEAD(&work->order_list);
476 } 627 }
@@ -481,7 +632,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
481 list_add_tail(&work->list, &worker->prio_pending); 632 list_add_tail(&work->list, &worker->prio_pending);
482 else 633 else
483 list_add_tail(&work->list, &worker->pending); 634 list_add_tail(&work->list, &worker->pending);
484 atomic_inc(&worker->num_pending);
485 check_busy_worker(worker); 635 check_busy_worker(worker);
486 636
487 /* 637 /*
@@ -492,10 +642,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
492 wake = 1; 642 wake = 1;
493 worker->working = 1; 643 worker->working = 1;
494 644
495 spin_unlock_irqrestore(&worker->lock, flags);
496
497 if (wake) 645 if (wake)
498 wake_up_process(worker->task); 646 wake_up_process(worker->task);
647 spin_unlock_irqrestore(&worker->lock, flags);
648
499out: 649out:
500 return 0; 650 return 0;
501} 651}
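
Most of the teardown races in this file are handled by the new reference count: btrfs_stop_workers() must drop workers->lock to call kthread_stop(), which can sleep, so it first pins each worker with atomic_inc(&worker->refs), and both the stop path and the exiting thread funnel through put_worker(), where atomic_dec_and_test() guarantees exactly one kfree(). A standalone sketch of that last-reference-frees rule:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdatomic.h>

    struct worker {
        atomic_int refs;
        int id;
    };

    static struct worker *worker_alloc(int id)
    {
        struct worker *w = malloc(sizeof(*w));
        atomic_init(&w->refs, 1);   /* the thread's own reference */
        w->id = id;
        return w;
    }

    /* whoever drops the count to zero frees the struct */
    static void put_worker(struct worker *w)
    {
        if (atomic_fetch_sub(&w->refs, 1) == 1) {
            printf("worker %d freed\n", w->id);
            free(w);
        }
    }

    int main(void)
    {
        struct worker *w = worker_alloc(7);

        atomic_fetch_add(&w->refs, 1);  /* stop path pins the worker */
        put_worker(w);                  /* thread exit drops its ref */
        put_worker(w);                  /* stop path drops the pin: frees */
        return 0;
    }
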
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c109db6..fc089b95ec14 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -73,6 +73,15 @@ struct btrfs_workers {
73 /* force completions in the order they were queued */ 73 /* force completions in the order they were queued */
74 int ordered; 74 int ordered;
75 75
76 /* more workers required, but in an interrupt handler */
77 int atomic_start_pending;
78
79 /*
80 * are we allowed to sleep while starting workers or are we required
81 * to start them at a later time?
82 */
83 int atomic_worker_start;
84
76 /* list with all the work threads. The workers on the idle thread 85 /* list with all the work threads. The workers on the idle thread
77 * may be actively servicing jobs, but they haven't yet hit the 86 * may be actively servicing jobs, but they haven't yet hit the
78 * idle thresh limit above. 87 * idle thresh limit above.
@@ -90,6 +99,9 @@ struct btrfs_workers {
90 /* lock for finding the next worker thread to queue on */ 99 /* lock for finding the next worker thread to queue on */
91 spinlock_t lock; 100 spinlock_t lock;
92 101
102 /* lock for the ordered lists */
103 spinlock_t order_lock;
104
93 /* extra name for this worker, used for current->name */ 105 /* extra name for this worker, used for current->name */
94 char *name; 106 char *name;
95}; 107};
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ea1ea0af8c0e..a54d354cefcb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,6 +128,14 @@ struct btrfs_inode {
128 u64 last_unlink_trans; 128 u64 last_unlink_trans;
129 129
130 /* 130 /*
131 * These two counters are for delalloc metadata reservations. We keep
132 * track of how many extents we've accounted for vs how many extents we
133 * have.
134 */
135 int delalloc_reserved_extents;
136 int delalloc_extents;
137
138 /*
131 * ordered_data_close is set by truncate when a file that used 139 * ordered_data_close is set by truncate when a file that used
132 * to have good data has been truncated to zero. When it is set 140 * to have good data has been truncated to zero. When it is set
133 * the btrfs file release call will add this inode to the 141 * the btrfs file release call will add this inode to the
@@ -138,6 +146,7 @@ struct btrfs_inode {
138 * of these. 146 * of these.
139 */ 147 */
140 unsigned ordered_data_close:1; 148 unsigned ordered_data_close:1;
149 unsigned dummy_inode:1;
141 150
142 struct inode vfs_inode; 151 struct inode vfs_inode;
143}; 152};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d8ba4d54a37..a11a32058b50 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
506 */ 506 */
507 set_page_extent_mapped(page); 507 set_page_extent_mapped(page);
508 lock_extent(tree, last_offset, end, GFP_NOFS); 508 lock_extent(tree, last_offset, end, GFP_NOFS);
509 spin_lock(&em_tree->lock); 509 read_lock(&em_tree->lock);
510 em = lookup_extent_mapping(em_tree, last_offset, 510 em = lookup_extent_mapping(em_tree, last_offset,
511 PAGE_CACHE_SIZE); 511 PAGE_CACHE_SIZE);
512 spin_unlock(&em_tree->lock); 512 read_unlock(&em_tree->lock);
513 513
514 if (!em || last_offset < em->start || 514 if (!em || last_offset < em->start ||
515 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || 515 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
593 em_tree = &BTRFS_I(inode)->extent_tree; 593 em_tree = &BTRFS_I(inode)->extent_tree;
594 594
595 /* we need the actual starting offset of this extent in the file */ 595 /* we need the actual starting offset of this extent in the file */
596 spin_lock(&em_tree->lock); 596 read_lock(&em_tree->lock);
597 em = lookup_extent_mapping(em_tree, 597 em = lookup_extent_mapping(em_tree,
598 page_offset(bio->bi_io_vec->bv_page), 598 page_offset(bio->bi_io_vec->bv_page),
599 PAGE_CACHE_SIZE); 599 PAGE_CACHE_SIZE);
600 spin_unlock(&em_tree->lock); 600 read_unlock(&em_tree->lock);
601 601
602 compressed_len = em->block_len; 602 compressed_len = em->block_len;
603 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 603 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
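
The compression.c hunks are part of a tree-wide switch of the extent_map tree from a spinlock to an rwlock: lookup_extent_mapping() only reads the tree, so concurrent lookups can now proceed in parallel, and only insertions take the lock exclusively. The same read-side shape with pthreads (the data here is a toy stand-in for the extent map):

    #include <stdio.h>
    #include <pthread.h>

    static pthread_rwlock_t em_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int extent_map[4] = { 10, 20, 30, 40 };

    /* read side: many lookups may hold the lock at once */
    static int lookup_extent(int idx)
    {
        int v;

        pthread_rwlock_rdlock(&em_lock);
        v = extent_map[idx];
        pthread_rwlock_unlock(&em_lock);
        return v;
    }

    /* write side: insertion still excludes everyone */
    static void update_extent(int idx, int v)
    {
        pthread_rwlock_wrlock(&em_lock);
        extent_map[idx] = v;
        pthread_rwlock_unlock(&em_lock);
    }

    int main(void)
    {
        update_extent(2, 99);
        printf("%d\n", lookup_extent(2));   /* 99 */
        return 0;
    }
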
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fdcc0512d3a..ec96f3a6d536 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2853 int split; 2853 int split;
2854 int num_doubles = 0; 2854 int num_doubles = 0;
2855 2855
2856 l = path->nodes[0];
2857 slot = path->slots[0];
2858 if (extend && data_size + btrfs_item_size_nr(l, slot) +
2859 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
2860 return -EOVERFLOW;
2861
2856 /* first try to make some room by pushing left and right */ 2862 /* first try to make some room by pushing left and right */
2857 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2863 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2858 wret = push_leaf_right(trans, root, path, data_size, 0); 2864 wret = push_leaf_right(trans, root, path, data_size, 0);
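
The new check in split_leaf() fails fast when an item being extended could never fit even in an empty leaf: current item size plus requested growth plus the per-item header must stay within BTRFS_LEAF_DATA_SIZE(), otherwise no amount of splitting helps and -EOVERFLOW is the honest answer. The arithmetic as a tiny sketch (3995 bytes approximates a 4 KiB leaf minus its header, and 25 matches sizeof(struct btrfs_item) of this era; treat both constants as illustrative):

    #include <stdio.h>

    #define LEAF_DATA_SIZE  3995   /* usable bytes in one leaf */
    #define ITEM_HEADER     25     /* per-item bookkeeping */

    static int can_extend(int item_size, int grow_by)
    {
        return item_size + grow_by + ITEM_HEADER <= LEAF_DATA_SIZE;
    }

    int main(void)
    {
        printf("%d\n", can_extend(1000, 500));   /* 1: fits after a split */
        printf("%d\n", can_extend(3000, 1500));  /* 0: the -EOVERFLOW case */
        return 0;
    }
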
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 837435ce84ca..dd8ced9814c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
114 */ 114 */
115#define BTRFS_DEV_ITEMS_OBJECTID 1ULL 115#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
116 116
117#define BTRFS_BTREE_INODE_OBJECTID 1
118
119#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
120
117/* 121/*
118 * we can actually store much bigger names, but lets not confuse the rest 122 * we can actually store much bigger names, but lets not confuse the rest
119 * of linux 123 * of linux
@@ -670,18 +674,20 @@ struct btrfs_space_info {
670 u64 bytes_reserved; /* total bytes the allocator has reserved for 674 u64 bytes_reserved; /* total bytes the allocator has reserved for
671 current allocations */ 675 current allocations */
672 u64 bytes_readonly; /* total bytes that are read only */ 676 u64 bytes_readonly; /* total bytes that are read only */
673 677 u64 bytes_super; /* total bytes reserved for the super blocks */
674 /* delalloc accounting */ 678 u64 bytes_root; /* the number of bytes needed to commit a
675 u64 bytes_delalloc; /* number of bytes reserved for allocation, 679 transaction */
676 this space is not necessarily reserved yet
677 by the allocator */
678 u64 bytes_may_use; /* number of bytes that may be used for 680 u64 bytes_may_use; /* number of bytes that may be used for
679 delalloc */ 681 delalloc/allocations */
682 u64 bytes_delalloc; /* number of bytes currently reserved for
683 delayed allocation */
680 684
681 int full; /* indicates that we cannot allocate any more 685 int full; /* indicates that we cannot allocate any more
682 chunks for this space */ 686 chunks for this space */
683 int force_alloc; /* set if we need to force a chunk alloc for 687 int force_alloc; /* set if we need to force a chunk alloc for
684 this space */ 688 this space */
689 int force_delalloc; /* make people start doing filemap_flush until
690 we're under a threshold */
685 691
686 struct list_head list; 692 struct list_head list;
687 693
@@ -690,6 +696,9 @@ struct btrfs_space_info {
690 spinlock_t lock; 696 spinlock_t lock;
691 struct rw_semaphore groups_sem; 697 struct rw_semaphore groups_sem;
692 atomic_t caching_threads; 698 atomic_t caching_threads;
699
700 int allocating_chunk;
701 wait_queue_head_t wait;
693}; 702};
694 703
695/* 704/*
@@ -726,6 +735,15 @@ enum btrfs_caching_type {
726 BTRFS_CACHE_FINISHED = 2, 735 BTRFS_CACHE_FINISHED = 2,
727}; 736};
728 737
738struct btrfs_caching_control {
739 struct list_head list;
740 struct mutex mutex;
741 wait_queue_head_t wait;
742 struct btrfs_block_group_cache *block_group;
743 u64 progress;
744 atomic_t count;
745};
746
729struct btrfs_block_group_cache { 747struct btrfs_block_group_cache {
730 struct btrfs_key key; 748 struct btrfs_key key;
731 struct btrfs_block_group_item item; 749 struct btrfs_block_group_item item;
@@ -733,6 +751,7 @@ struct btrfs_block_group_cache {
733 spinlock_t lock; 751 spinlock_t lock;
734 u64 pinned; 752 u64 pinned;
735 u64 reserved; 753 u64 reserved;
754 u64 bytes_super;
736 u64 flags; 755 u64 flags;
737 u64 sectorsize; 756 u64 sectorsize;
738 int extents_thresh; 757 int extents_thresh;
@@ -742,8 +761,9 @@ struct btrfs_block_group_cache {
742 int dirty; 761 int dirty;
743 762
744 /* cache tracking stuff */ 763 /* cache tracking stuff */
745 wait_queue_head_t caching_q;
746 int cached; 764 int cached;
765 struct btrfs_caching_control *caching_ctl;
766 u64 last_byte_to_unpin;
747 767
748 struct btrfs_space_info *space_info; 768 struct btrfs_space_info *space_info;
749 769
@@ -782,13 +802,16 @@ struct btrfs_fs_info {
782 802
783 /* the log root tree is a directory of all the other log roots */ 803 /* the log root tree is a directory of all the other log roots */
784 struct btrfs_root *log_root_tree; 804 struct btrfs_root *log_root_tree;
805
806 spinlock_t fs_roots_radix_lock;
785 struct radix_tree_root fs_roots_radix; 807 struct radix_tree_root fs_roots_radix;
786 808
787 /* block group cache stuff */ 809 /* block group cache stuff */
788 spinlock_t block_group_cache_lock; 810 spinlock_t block_group_cache_lock;
789 struct rb_root block_group_cache_tree; 811 struct rb_root block_group_cache_tree;
790 812
791 struct extent_io_tree pinned_extents; 813 struct extent_io_tree freed_extents[2];
814 struct extent_io_tree *pinned_extents;
792 815
793 /* logical->physical extent mapping */ 816 /* logical->physical extent mapping */
794 struct btrfs_mapping_tree mapping_tree; 817 struct btrfs_mapping_tree mapping_tree;
@@ -822,11 +845,7 @@ struct btrfs_fs_info {
822 struct mutex transaction_kthread_mutex; 845 struct mutex transaction_kthread_mutex;
823 struct mutex cleaner_mutex; 846 struct mutex cleaner_mutex;
824 struct mutex chunk_mutex; 847 struct mutex chunk_mutex;
825 struct mutex drop_mutex;
826 struct mutex volume_mutex; 848 struct mutex volume_mutex;
827 struct mutex tree_reloc_mutex;
828 struct rw_semaphore extent_commit_sem;
829
830 /* 849 /*
831 * this protects the ordered operations list only while we are 850 * this protects the ordered operations list only while we are
832 * processing all of the entries on it. This way we make 851 * processing all of the entries on it. This way we make
@@ -835,10 +854,16 @@ struct btrfs_fs_info {
835 * before jumping into the main commit. 854 * before jumping into the main commit.
836 */ 855 */
837 struct mutex ordered_operations_mutex; 856 struct mutex ordered_operations_mutex;
857 struct rw_semaphore extent_commit_sem;
858
859 struct rw_semaphore subvol_sem;
860
861 struct srcu_struct subvol_srcu;
838 862
839 struct list_head trans_list; 863 struct list_head trans_list;
840 struct list_head hashers; 864 struct list_head hashers;
841 struct list_head dead_roots; 865 struct list_head dead_roots;
866 struct list_head caching_block_groups;
842 867
843 atomic_t nr_async_submits; 868 atomic_t nr_async_submits;
844 atomic_t async_submit_draining; 869 atomic_t async_submit_draining;
@@ -996,10 +1021,12 @@ struct btrfs_root {
996 u32 stripesize; 1021 u32 stripesize;
997 1022
998 u32 type; 1023 u32 type;
999 u64 highest_inode; 1024
1000 u64 last_inode_alloc; 1025 u64 highest_objectid;
1001 int ref_cows; 1026 int ref_cows;
1002 int track_dirty; 1027 int track_dirty;
1028 int in_radix;
1029
1003 u64 defrag_trans_start; 1030 u64 defrag_trans_start;
1004 struct btrfs_key defrag_progress; 1031 struct btrfs_key defrag_progress;
1005 struct btrfs_key defrag_max; 1032 struct btrfs_key defrag_max;
@@ -1920,8 +1947,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1920int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1947int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root, unsigned long count); 1948 struct btrfs_root *root, unsigned long count);
1922int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1949int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1923int btrfs_update_pinned_extents(struct btrfs_root *root, 1950int btrfs_pin_extent(struct btrfs_root *root,
1924 u64 bytenr, u64 num, int pin); 1951 u64 bytenr, u64 num, int reserved);
1925int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1952int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1926 struct btrfs_root *root, struct extent_buffer *leaf); 1953 struct btrfs_root *root, struct extent_buffer *leaf);
1927int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1954int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1971,9 +1998,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
1971 u64 root_objectid, u64 owner, u64 offset); 1998 u64 root_objectid, u64 owner, u64 offset);
1972 1999
1973int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2000int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2001int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2002 struct btrfs_root *root);
1974int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2003int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1975 struct btrfs_root *root, 2004 struct btrfs_root *root);
1976 struct extent_io_tree *unpin);
1977int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2005int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1978 struct btrfs_root *root, 2006 struct btrfs_root *root,
1979 u64 bytenr, u64 num_bytes, u64 parent, 2007 u64 bytenr, u64 num_bytes, u64 parent,
@@ -1984,6 +2012,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1984int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); 2012int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
1985int btrfs_free_block_groups(struct btrfs_fs_info *info); 2013int btrfs_free_block_groups(struct btrfs_fs_info *info);
1986int btrfs_read_block_groups(struct btrfs_root *root); 2014int btrfs_read_block_groups(struct btrfs_root *root);
2015int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
1987int btrfs_make_block_group(struct btrfs_trans_handle *trans, 2016int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, u64 bytes_used, 2017 struct btrfs_root *root, u64 bytes_used,
1989 u64 type, u64 chunk_objectid, u64 chunk_offset, 2018 u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -1997,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1997void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); 2026void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
1998void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2027void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
1999 2028
2000int btrfs_check_metadata_free_space(struct btrfs_root *root); 2029int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
2030int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
2031int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2032 struct inode *inode, int num_items);
2033int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2034 struct inode *inode, int num_items);
2001int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2035int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2002 u64 bytes); 2036 u64 bytes);
2003void btrfs_free_reserved_data_space(struct btrfs_root *root, 2037void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2006,7 +2040,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2006 u64 bytes); 2040 u64 bytes);
2007void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2041void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2008 u64 bytes); 2042 u64 bytes);
2009void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
2010/* ctree.c */ 2043/* ctree.c */
2011int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2044int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2012 int level, int *slot); 2045 int level, int *slot);
@@ -2100,12 +2133,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2100 struct extent_buffer *parent); 2133 struct extent_buffer *parent);
2101/* root-item.c */ 2134/* root-item.c */
2102int btrfs_find_root_ref(struct btrfs_root *tree_root, 2135int btrfs_find_root_ref(struct btrfs_root *tree_root,
2103 struct btrfs_path *path, 2136 struct btrfs_path *path,
2104 u64 root_id, u64 ref_id); 2137 u64 root_id, u64 ref_id);
2105int btrfs_add_root_ref(struct btrfs_trans_handle *trans, 2138int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
2106 struct btrfs_root *tree_root, 2139 struct btrfs_root *tree_root,
2107 u64 root_id, u8 type, u64 ref_id, 2140 u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
2108 u64 dirid, u64 sequence, 2141 const char *name, int name_len);
2142int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
2143 struct btrfs_root *tree_root,
2144 u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
2109 const char *name, int name_len); 2145 const char *name, int name_len);
2110int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2146int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2111 struct btrfs_key *key); 2147 struct btrfs_key *key);
@@ -2120,6 +2156,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2120int btrfs_search_root(struct btrfs_root *root, u64 search_start, 2156int btrfs_search_root(struct btrfs_root *root, u64 search_start,
2121 u64 *found_objectid); 2157 u64 *found_objectid);
2122int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 2158int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2159int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2123int btrfs_set_root_node(struct btrfs_root_item *item, 2160int btrfs_set_root_node(struct btrfs_root_item *item,
2124 struct extent_buffer *node); 2161 struct extent_buffer *node);
2125/* dir-item.c */ 2162/* dir-item.c */
@@ -2138,6 +2175,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
2138 struct btrfs_path *path, u64 dir, 2175 struct btrfs_path *path, u64 dir,
2139 u64 objectid, const char *name, int name_len, 2176 u64 objectid, const char *name, int name_len,
2140 int mod); 2177 int mod);
2178struct btrfs_dir_item *
2179btrfs_search_dir_index_item(struct btrfs_root *root,
2180 struct btrfs_path *path, u64 dirid,
2181 const char *name, int name_len);
2141struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, 2182struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
2142 struct btrfs_path *path, 2183 struct btrfs_path *path,
2143 const char *name, int name_len); 2184 const char *name, int name_len);
@@ -2160,6 +2201,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
2160 struct btrfs_root *root, u64 offset); 2201 struct btrfs_root *root, u64 offset);
2161int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, 2202int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
2162 struct btrfs_root *root, u64 offset); 2203 struct btrfs_root *root, u64 offset);
2204int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
2163 2205
2164/* inode-map.c */ 2206/* inode-map.c */
2165int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, 2207int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
@@ -2232,6 +2274,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2232int btrfs_add_link(struct btrfs_trans_handle *trans, 2274int btrfs_add_link(struct btrfs_trans_handle *trans,
2233 struct inode *parent_inode, struct inode *inode, 2275 struct inode *parent_inode, struct inode *inode,
2234 const char *name, int name_len, int add_backref, u64 index); 2276 const char *name, int name_len, int add_backref, u64 index);
2277int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2278 struct btrfs_root *root,
+					  struct inode *dir, u64 objectid,
+					  const char *name, int name_len);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct inode *inode, u64 new_size,
@@ -2242,7 +2288,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *new_root, struct dentry *dentry,
+			     struct btrfs_root *new_root,
 			     u64 new_dirid, u64 alloc_hint);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2258,6 +2304,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
@@ -2275,6 +2322,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_invalidate_inodes(struct btrfs_root *root);
+extern struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2286,11 +2335,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
-extern struct file_operations btrfs_file_operations;
+extern const struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 locked_end,
-		       u64 inline_limit, u64 *hint_block);
+		       u64 inline_limit, u64 *hint_block, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode, u64 start, u64 end);
@@ -2317,7 +2366,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
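A note on the const-ification in the hunk above: marking an ops table const lets the compiler place it in read-only memory, so a stray write through it faults instead of silently redirecting a function pointer. A minimal userspace C sketch of the idea (plain C, nothing here is a kernel API; file_ops and my_open are invented names):

#include <stdio.h>

/* a tiny stand-in for a kernel ops table */
struct file_ops {
	int (*open)(const char *name);
};

static int my_open(const char *name)
{
	printf("open %s\n", name);
	return 0;
}

/* const mirrors "extern const struct file_operations ..." above;
 * the table can live in .rodata and cannot be patched at runtime */
static const struct file_ops my_file_ops = { .open = my_open };

int main(void)
{
	return my_file_ops.open("example");
}

The same reasoning applies to btree_aops and the export/address-space operation tables in the diffs that follow.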
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1d70236ba00c..f3a6075519cc 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dirid,
+			    const char *name, int name_len)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u32 nritems;
+	int ret;
+
+	key.objectid = dirid;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	while (1) {
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ERR_PTR(ret);
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+			break;
+
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		if (di)
+			return di;
+
+		path->slots[0]++;
+	}
+	return NULL;
+}
+
 struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, u64 dir,
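The new btrfs_search_dir_index_item() above walks every DIR_INDEX item belonging to one directory, stepping to the next leaf whenever the slot runs past nritems. The same iteration shape in a self-contained userspace sketch (the leaf arrays, names, and the simplified dirid-only check are invented for illustration; this is not btrfs code):

#include <stdio.h>
#include <string.h>

#define NLEAVES 2

struct item { unsigned long dirid; unsigned long index; const char *name; };

/* two hand-rolled "leaves" holding (dirid, index) keys in sorted order */
static struct item leaf0[] = { { 5, 2, "a" }, { 5, 3, "b" } };
static struct item leaf1[] = { { 5, 4, "target" }, { 6, 2, "c" } };
static struct { struct item *items; int nritems; } leaves[NLEAVES] = {
	{ leaf0, 2 }, { leaf1, 2 },
};

static const struct item *search_dir_index(unsigned long dirid,
					   const char *name)
{
	int l = 0;	/* stands in for path->nodes[0] */
	int slot = 0;	/* stands in for path->slots[0] */

	while (1) {
		if (slot >= leaves[l].nritems) {
			/* ran off the leaf: advance to the next one, as
			 * btrfs_next_leaf() does; stop if there is none */
			if (++l >= NLEAVES)
				break;
			slot = 0;
			continue;
		}
		if (leaves[l].items[slot].dirid != dirid)
			break;	/* left this directory's key range */
		if (strcmp(leaves[l].items[slot].name, name) == 0)
			return &leaves[l].items[slot];
		slot++;
	}
	return NULL;
}

int main(void)
{
	const struct item *it = search_dir_index(5, "target");
	if (it)
		printf("found dirid=%lu at index=%lu\n", it->dirid, it->index);
	else
		printf("not found\n");
	return 0;
}

The point the sketch makes: the key-range check ("did we leave this dirid?") is what terminates the scan, not the leaf boundary, which is only a cue to fetch the next leaf.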
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..af0435f79fa6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
 
 static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
 
@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	struct extent_map *em;
 	int ret;
 
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	if (em) {
 		em->bdev =
 			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-		spin_unlock(&em_tree->lock);
+		read_unlock(&em_tree->lock);
 		goto out;
 	}
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
-	spin_lock(&em_tree->lock);
+	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
 		u64 failed_start = em->start;
@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 		free_extent_map(em);
 		em = NULL;
 	}
-	spin_unlock(&em_tree->lock);
+	write_unlock(&em_tree->lock);
 
 	if (ret)
 		em = ERR_PTR(ret);
@@ -772,7 +773,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	}
 }
 
-static struct address_space_operations btree_aops = {
+static const struct address_space_operations btree_aops = {
 	.readpage	= btree_readpage,
 	.writepage	= btree_writepage,
 	.writepages	= btree_writepages,
@@ -821,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-	return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
-				      buf->start + buf->len - 1, WB_SYNC_ALL);
+	return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+					buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
+	return filemap_fdatawait_range(buf->first_page->mapping,
 				       buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->fs_info = fs_info;
 	root->objectid = objectid;
 	root->last_trans = 0;
-	root->highest_inode = 0;
-	root->last_inode_alloc = 0;
+	root->highest_objectid = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
 	root->inode_tree.rb_node = NULL;
@@ -952,14 +952,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 		     root, fs_info, objectid);
 	ret = btrfs_find_last_root(tree_root, objectid,
 				   &root->root_item, &root->root_key);
+	if (ret > 0)
+		return -ENOENT;
 	BUG_ON(ret);
 
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
-	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
+	root->commit_root = btrfs_root_node(root);
 	return 0;
 }
 
@@ -1095,7 +1097,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
 	struct extent_buffer *l;
-	u64 highest_inode;
 	u64 generation;
 	u32 blocksize;
 	int ret = 0;
@@ -1110,7 +1111,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 			kfree(root);
 			return ERR_PTR(ret);
 		}
-		goto insert;
+		goto out;
 	}
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
@@ -1120,39 +1121,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
-	if (ret != 0) {
-		if (ret > 0)
-			ret = -ENOENT;
-		goto out;
+	if (ret == 0) {
+		l = path->nodes[0];
+		read_extent_buffer(l, &root->root_item,
+				btrfs_item_ptr_offset(l, path->slots[0]),
+				sizeof(root->root_item));
+		memcpy(&root->root_key, location, sizeof(*location));
 	}
-	l = path->nodes[0];
-	read_extent_buffer(l, &root->root_item,
-	       btrfs_item_ptr_offset(l, path->slots[0]),
-	       sizeof(root->root_item));
-	memcpy(&root->root_key, location, sizeof(*location));
-	ret = 0;
-out:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	if (ret) {
-		kfree(root);
+		if (ret > 0)
+			ret = -ENOENT;
 		return ERR_PTR(ret);
 	}
+
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
 	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
-insert:
-	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+out:
+	if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
 		root->ref_cows = 1;
-		ret = btrfs_find_highest_inode(root, &highest_inode);
-		if (ret == 0) {
-			root->highest_inode = highest_inode;
-			root->last_inode_alloc = highest_inode;
-		}
-	}
+
 	return root;
 }
 
@@ -1187,39 +1179,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 		return fs_info->dev_root;
 	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
 		return fs_info->csum_root;
-
+again:
+	spin_lock(&fs_info->fs_roots_radix_lock);
 	root = radix_tree_lookup(&fs_info->fs_roots_radix,
 				 (unsigned long)location->objectid);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
 	if (root)
 		return root;
 
+	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+	if (ret == 0)
+		ret = -ENOENT;
+	if (ret < 0)
+		return ERR_PTR(ret);
+
 	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
+	WARN_ON(btrfs_root_refs(&root->root_item) == 0);
 	set_anon_super(&root->anon_super, NULL);
 
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		goto fail;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
+	if (ret == 0)
+		root->in_radix = 1;
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	radix_tree_preload_end();
 	if (ret) {
-		free_extent_buffer(root->node);
-		kfree(root);
-		return ERR_PTR(ret);
+		if (ret == -EEXIST) {
+			free_fs_root(root);
+			goto again;
+		}
+		goto fail;
 	}
-	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+
 	ret = btrfs_find_dead_roots(fs_info->tree_root,
 				    root->root_key.objectid);
-	BUG_ON(ret);
+	WARN_ON(ret);
+
+	if (!(fs_info->sb->s_flags & MS_RDONLY))
 		btrfs_orphan_cleanup(root);
-	}
+
 	return root;
+fail:
+	free_fs_root(root);
+	return ERR_PTR(ret);
 }
 
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen)
 {
+	return btrfs_read_fs_root_no_name(fs_info, location);
+#if 0
 	struct btrfs_root *root;
 	int ret;
 
@@ -1236,7 +1255,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-#if 0
+
 	ret = btrfs_sysfs_add_root(root);
 	if (ret) {
 		free_extent_buffer(root->node);
@@ -1244,9 +1263,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-#endif
 	root->in_sysfs = 1;
 	return root;
+#endif
 }
 
 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
@@ -1325,9 +1344,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	offset = page_offset(page);
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
-	spin_lock(&em_tree->lock);
+	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-	spin_unlock(&em_tree->lock);
+	read_unlock(&em_tree->lock);
 	if (!em) {
 		__unplug_io_fn(bdi, page);
 		return;
@@ -1352,6 +1371,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
+	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
 	err = bdi_init(bdi);
 	if (err)
@@ -1359,8 +1379,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 
 	err = bdi_register(bdi, NULL, "btrfs-%d",
 				atomic_inc_return(&btrfs_bdi_num));
-	if (err)
+	if (err) {
+		bdi_destroy(bdi);
 		return err;
+	}
 
 	bdi->ra_pages = default_backing_dev_info.ra_pages;
 	bdi->unplug_io_fn = btrfs_unplug_io_fn;
@@ -1450,9 +1472,12 @@ static int cleaner_kthread(void *arg)
 			break;
 
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
-		mutex_lock(&root->fs_info->cleaner_mutex);
-		btrfs_clean_old_snapshots(root);
-		mutex_unlock(&root->fs_info->cleaner_mutex);
+
+		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
+		    mutex_trylock(&root->fs_info->cleaner_mutex)) {
+			btrfs_clean_old_snapshots(root);
+			mutex_unlock(&root->fs_info->cleaner_mutex);
+		}
 
 		if (freezing(current)) {
 			refrigerator();
@@ -1557,15 +1582,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail;
 	}
-	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+
+	ret = init_srcu_struct(&fs_info->subvol_srcu);
+	if (ret) {
+		err = ret;
+		goto fail;
+	}
+
+	ret = setup_bdi(fs_info, &fs_info->bdi);
+	if (ret) {
+		err = ret;
+		goto fail_srcu;
+	}
+
+	fs_info->btree_inode = new_inode(sb);
+	if (!fs_info->btree_inode) {
+		err = -ENOMEM;
+		goto fail_bdi;
+	}
+
+	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	INIT_LIST_HEAD(&fs_info->ordered_operations);
+	INIT_LIST_HEAD(&fs_info->caching_block_groups);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
+	spin_lock_init(&fs_info->fs_roots_radix_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1584,12 +1630,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-	if (setup_bdi(fs_info, &fs_info->bdi))
-		goto fail_bdi;
-	fs_info->btree_inode = new_inode(sb);
-	fs_info->btree_inode->i_ino = 1;
-	fs_info->btree_inode->i_nlink = 1;
-	fs_info->metadata_ratio = 8;
+	fs_info->metadata_ratio = 0;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
@@ -1599,7 +1640,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_bdi = &fs_info->bdi;
 
+	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+	fs_info->btree_inode->i_nlink = 1;
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
 	 * the real end of the address space is determined by all of
@@ -1618,28 +1662,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
 
+	BTRFS_I(fs_info->btree_inode)->root = tree_root;
+	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+	       sizeof(struct btrfs_key));
+	BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+	insert_inode_hash(fs_info->btree_inode);
+
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree.rb_node = NULL;
 
-	extent_io_tree_init(&fs_info->pinned_extents,
+	extent_io_tree_init(&fs_info->freed_extents[0],
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	extent_io_tree_init(&fs_info->freed_extents[1],
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	fs_info->pinned_extents = &fs_info->freed_extents[0];
 	fs_info->do_barriers = 1;
 
-	BTRFS_I(fs_info->btree_inode)->root = tree_root;
-	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
-	       sizeof(struct btrfs_key));
-	insert_inode_hash(fs_info->btree_inode);
 
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
-	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
-	mutex_init(&fs_info->tree_reloc_mutex);
 	init_rwsem(&fs_info->extent_commit_sem);
+	init_rwsem(&fs_info->subvol_sem);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1698,7 +1746,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -EINVAL;
 		goto fail_iput;
 	}
-
+printk("thread pool is %d\n", fs_info->thread_pool_size);
 	/*
 	 * we need to start all the end_io workers up front because the
 	 * queue work function gets called at interrupt time, and so it
@@ -1743,20 +1791,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_meta_workers.idle_thresh = 4;
 
-	fs_info->endio_write_workers.idle_thresh = 64;
-	fs_info->endio_meta_write_workers.idle_thresh = 64;
+	fs_info->endio_write_workers.idle_thresh = 2;
+	fs_info->endio_meta_write_workers.idle_thresh = 2;
+
+	fs_info->endio_workers.atomic_worker_start = 1;
+	fs_info->endio_meta_workers.atomic_worker_start = 1;
+	fs_info->endio_write_workers.atomic_worker_start = 1;
+	fs_info->endio_meta_write_workers.atomic_worker_start = 1;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
-	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_meta_workers,
-			    fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_meta_write_workers,
-			    fs_info->thread_pool_size);
-	btrfs_start_workers(&fs_info->endio_write_workers,
-			    fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_workers, 1);
+	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
+	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
+	btrfs_start_workers(&fs_info->endio_write_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1916,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		}
 	}
 
+	ret = btrfs_find_orphan_roots(tree_root);
+	BUG_ON(ret);
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_recover_relocation(tree_root);
 		BUG_ON(ret);
@@ -1975,6 +2028,8 @@ fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
+fail_srcu:
+	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
 	kfree(extent_root);
 	kfree(tree_root);
@@ -2234,20 +2289,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
-	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+	spin_lock(&fs_info->fs_roots_radix_lock);
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+
+	if (btrfs_root_refs(&root->root_item) == 0)
+		synchronize_srcu(&fs_info->subvol_srcu);
+
+	free_fs_root(root);
+	return 0;
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	if (root->anon_super.s_dev) {
 		down_write(&root->anon_super.s_umount);
 		kill_anon_super(&root->anon_super);
 	}
-	if (root->node)
-		free_extent_buffer(root->node);
-	if (root->commit_root)
-		free_extent_buffer(root->commit_root);
+	free_extent_buffer(root->node);
+	free_extent_buffer(root->commit_root);
 	kfree(root->name);
 	kfree(root);
-	return 0;
 }
 
 static int del_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2256,6 +2320,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
 	struct btrfs_root *gang[8];
 	int i;
 
+	while (!list_empty(&fs_info->dead_roots)) {
+		gang[0] = list_entry(fs_info->dead_roots.next,
+				     struct btrfs_root, root_list);
+		list_del(&gang[0]->root_list);
+
+		if (gang[0]->in_radix) {
+			btrfs_free_fs_root(fs_info, gang[0]);
+		} else {
+			free_extent_buffer(gang[0]->node);
+			free_extent_buffer(gang[0]->commit_root);
+			kfree(gang[0]);
+		}
+	}
+
 	while (1) {
 		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
 					     (void **)gang, 0,
@@ -2285,9 +2363,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 		root_objectid = gang[ret - 1]->root_key.objectid + 1;
 		for (i = 0; i < ret; i++) {
 			root_objectid = gang[i]->root_key.objectid;
-			ret = btrfs_find_dead_roots(fs_info->tree_root,
-						    root_objectid);
-			BUG_ON(ret);
 			btrfs_orphan_cleanup(gang[i]);
 		}
 		root_objectid++;
@@ -2357,7 +2432,6 @@ int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
-	btrfs_free_pinned_extents(root->fs_info);
 
 	del_fs_roots(fs_info);
 
@@ -2376,6 +2450,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 	bdi_destroy(&fs_info->bdi);
+	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
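The fs_roots_radix changes above follow a common lookup-or-create pattern: look up under the lock, construct the object outside it, then insert and retry from the top if another thread won the race (the -EEXIST/goto again path). A toy userspace sketch of that shape, using a pthread mutex and a fixed-size table in place of the radix tree (all names here are invented, not kernel APIs):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define SLOTS 16

struct root { unsigned long objectid; };

static struct root *cache[SLOTS];
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* plays the role of radix_tree_insert(): fails if the slot is taken */
static int insert_locked(struct root *r)
{
	unsigned long slot = r->objectid % SLOTS;
	if (cache[slot])
		return -EEXIST;	/* somebody inserted first */
	cache[slot] = r;
	return 0;
}

static struct root *get_root(unsigned long objectid)
{
	struct root *r;
	int ret;
again:
	pthread_mutex_lock(&cache_lock);
	r = cache[objectid % SLOTS];	/* fast path: already cached */
	pthread_mutex_unlock(&cache_lock);
	if (r)
		return r;

	r = malloc(sizeof(*r));		/* slow path: build outside the lock */
	if (!r)
		return NULL;
	r->objectid = objectid;

	pthread_mutex_lock(&cache_lock);
	ret = insert_locked(r);
	pthread_mutex_unlock(&cache_lock);
	if (ret == -EEXIST) {		/* lost the race: drop ours, retry */
		free(r);
		goto again;
	}
	return r;
}

int main(void)
{
	printf("objectid %lu cached at %p\n", 7UL, (void *)get_root(7));
	return 0;
}

The retry matters because between the failed lookup and the insert, another caller may have populated the same slot; freeing the freshly built copy and looping back to the lookup keeps exactly one instance alive, which is what free_fs_root() plus goto again accomplish in the kernel code.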
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9596b40caa4e..ba5c3fd5ab8c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 		len = BTRFS_FID_SIZE_NON_CONNECTABLE;
 		type = FILEID_BTRFS_WITHOUT_PARENT;
 
-	fid->objectid = BTRFS_I(inode)->location.objectid;
+	fid->objectid = inode->i_ino;
 	fid->root_objectid = BTRFS_I(inode)->root->objectid;
 	fid->gen = inode->i_generation;
 
@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 }
 
 static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
-				       u64 root_objectid, u32 generation)
+				       u64 root_objectid, u32 generation,
+				       int check_generation)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
 	struct btrfs_root *root;
+	struct dentry *dentry;
 	struct inode *inode;
 	struct btrfs_key key;
+	int index;
+	int err = 0;
+
+	if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+		return ERR_PTR(-ESTALE);
 
 	key.objectid = root_objectid;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	key.offset = (u64)-1;
 
-	root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
-	if (IS_ERR(root))
-		return ERR_CAST(root);
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto fail;
+	}
+
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		err = -ENOENT;
+		goto fail;
+	}
 
 	key.objectid = objectid;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
 	inode = btrfs_iget(sb, &key, root);
-	if (IS_ERR(inode))
-		return (void *)inode;
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto fail;
+	}
+
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
 
-	if (generation != inode->i_generation) {
+	if (check_generation && generation != inode->i_generation) {
 		iput(inode);
 		return ERR_PTR(-ESTALE);
 	}
 
-	return d_obtain_alias(inode);
+	dentry = d_obtain_alias(inode);
+	if (!IS_ERR(dentry))
+		dentry->d_op = &btrfs_dentry_operations;
+	return dentry;
+fail:
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+	return ERR_PTR(err);
 }
 
 static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
 	objectid = fid->parent_objectid;
 	generation = fid->parent_gen;
 
-	return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+	return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
 }
 
 static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 	root_objectid = fid->root_objectid;
 	generation = fid->gen;
 
-	return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+	return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
 
 static struct dentry *btrfs_get_parent(struct dentry *child)
 {
 	struct inode *dir = child->d_inode;
+	static struct dentry *dentry;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_key key;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	int slot;
-	u64 objectid;
+	struct btrfs_root_ref *ref;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
 	int ret;
 
 	path = btrfs_alloc_path();
 
-	key.objectid = dir->i_ino;
-	btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
-	key.offset = (u64)-1;
+	if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		key.objectid = root->root_key.objectid;
+		key.type = BTRFS_ROOT_BACKREF_KEY;
+		key.offset = (u64)-1;
+		root = root->fs_info->tree_root;
+	} else {
+		key.objectid = dir->i_ino;
+		key.type = BTRFS_INODE_REF_KEY;
+		key.offset = (u64)-1;
+	}
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0) {
-		/* Error */
-		btrfs_free_path(path);
-		return ERR_PTR(ret);
+	if (ret < 0)
		goto fail;
+
+	BUG_ON(ret == 0);
+	if (path->slots[0] == 0) {
+		ret = -ENOENT;
+		goto fail;
 	}
+
+	path->slots[0]--;
 	leaf = path->nodes[0];
-	slot = path->slots[0];
-	if (ret) {
-		/* btrfs_search_slot() returns the slot where we'd want to
-		   insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
-		   The _real_ backref, telling us what the parent inode
-		   _actually_ is, will be in the slot _before_ the one
-		   that btrfs_search_slot() returns. */
-		if (!slot) {
-			/* Unless there is _no_ key in the tree before... */
-			btrfs_free_path(path);
-			return ERR_PTR(-EIO);
-		}
-		slot--;
+
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	if (found_key.objectid != key.objectid || found_key.type != key.type) {
+		ret = -ENOENT;
+		goto fail;
 	}
 
-	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_root_ref);
+		key.objectid = btrfs_root_ref_dirid(leaf, ref);
+	} else {
+		key.objectid = found_key.offset;
+	}
 	btrfs_free_path(path);
 
-	if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
-		return ERR_PTR(-EINVAL);
-
-	objectid = key.offset;
-
-	/* If we are already at the root of a subvol, return the real root */
-	if (objectid == dir->i_ino)
-		return dget(dir->i_sb->s_root);
+	if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
		return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+					found_key.offset, 0, 0);
+	}
 
-	/* Build a new key for the inode item */
-	key.objectid = objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-
-	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	if (!IS_ERR(dentry))
+		dentry->d_op = &btrfs_dentry_operations;
+	return dentry;
fail:
+	btrfs_free_path(path);
+	return ERR_PTR(ret);
 }
 
 const struct export_operations btrfs_export_ops = {
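btrfs_get_dentry() above brackets the root and inode lookups with srcu_read_lock()/srcu_read_unlock() and routes every error through a single fail label, so the read side is dropped exactly once on every exit path. A rough userspace analogy of that shape, using a pthread rwlock in place of SRCU (illustrative only; SRCU readers do not actually block writers like a rwlock does, and lookup_root here is a stub, not a btrfs function):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t subvol_lock = PTHREAD_RWLOCK_INITIALIZER;

static int dummy_root;	/* stands in for a subvolume root object */

static void *lookup_root(unsigned long objectid)
{
	return objectid == 5 ? &dummy_root : NULL;	/* pretend hit/miss */
}

static int get_dentry_like(unsigned long objectid, void **out)
{
	void *root;
	int err = 0;

	pthread_rwlock_rdlock(&subvol_lock);	/* srcu_read_lock() */

	root = lookup_root(objectid);
	if (!root) {
		err = -ENOENT;
		goto fail;	/* the error path still unlocks below */
	}

	pthread_rwlock_unlock(&subvol_lock);	/* srcu_read_unlock() */
	*out = root;
	return 0;
fail:
	pthread_rwlock_unlock(&subvol_lock);
	return err;
}

int main(void)
{
	void *r;
	printf("lookup 5 -> %d\n", get_dentry_like(5, &r));
	printf("lookup 9 -> %d\n", get_dentry_like(9, &r));
	return 0;
}

The discipline this illustrates is the reason btrfs_free_fs_root() in the disk-io.c diff can call synchronize_srcu() before freeing a root: once all readers structured like this have left their read-side sections, the root cannot be referenced anymore.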
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..359a754c782c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,12 +32,12 @@
32#include "locking.h" 32#include "locking.h"
33#include "free-space-cache.h" 33#include "free-space-cache.h"
34 34
35static int update_reserved_extents(struct btrfs_root *root,
36 u64 bytenr, u64 num, int reserve);
37static int update_block_group(struct btrfs_trans_handle *trans, 35static int update_block_group(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 36 struct btrfs_root *root,
39 u64 bytenr, u64 num_bytes, int alloc, 37 u64 bytenr, u64 num_bytes, int alloc,
40 int mark_free); 38 int mark_free);
39static int update_reserved_extents(struct btrfs_block_group_cache *cache,
40 u64 num_bytes, int reserve);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 42 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
57 u64 parent, u64 root_objectid, 57 u64 parent, u64 root_objectid,
58 u64 flags, struct btrfs_disk_key *key, 58 u64 flags, struct btrfs_disk_key *key,
59 int level, struct btrfs_key *ins); 59 int level, struct btrfs_key *ins);
60
61static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
62 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
63 u64 flags, int force); 62 u64 flags, int force);
63static int pin_down_bytes(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root,
65 struct btrfs_path *path,
66 u64 bytenr, u64 num_bytes,
67 int is_data, int reserved,
68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72 int dump_block_groups);
64 73
65static noinline int 74static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache) 75block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
153 return ret; 162 return ret;
154} 163}
155 164
156/* 165static int add_excluded_extent(struct btrfs_root *root,
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't 166 u64 start, u64 num_bytes)
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because we had a block group caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{ 167{
164 u64 start, end, last = 0; 168 u64 end = start + num_bytes - 1;
165 int ret; 169 set_extent_bits(&root->fs_info->freed_extents[0],
170 start, end, EXTENT_UPTODATE, GFP_NOFS);
171 set_extent_bits(&root->fs_info->freed_extents[1],
172 start, end, EXTENT_UPTODATE, GFP_NOFS);
173 return 0;
174}
166 175
167 while (1) { 176static void free_excluded_extents(struct btrfs_root *root,
168 ret = find_first_extent_bit(&info->pinned_extents, last, 177 struct btrfs_block_group_cache *cache)
169 &start, &end, 178{
170 EXTENT_LOCKED|EXTENT_DIRTY); 179 u64 start, end;
171 if (ret)
172 break;
173 180
174 clear_extent_bits(&info->pinned_extents, start, end, 181 start = cache->key.objectid;
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); 182 end = start + cache->key.offset - 1;
176 last = end+1; 183
177 } 184 clear_extent_bits(&root->fs_info->freed_extents[0],
185 start, end, EXTENT_UPTODATE, GFP_NOFS);
186 clear_extent_bits(&root->fs_info->freed_extents[1],
187 start, end, EXTENT_UPTODATE, GFP_NOFS);
178} 188}
179 189
180static int remove_sb_from_cache(struct btrfs_root *root, 190static int exclude_super_stripes(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache) 191 struct btrfs_block_group_cache *cache)
182{ 192{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr; 193 u64 bytenr;
185 u64 *logical; 194 u64 *logical;
186 int stripe_len; 195 int stripe_len;
@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
192 cache->key.objectid, bytenr, 201 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len); 202 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret); 203 BUG_ON(ret);
204
195 while (nr--) { 205 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents, 206 cache->bytes_super += stripe_len;
197 logical[nr], 207 ret = add_excluded_extent(root, logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS); 208 stripe_len);
209 BUG_ON(ret);
199 } 210 }
211
200 kfree(logical); 212 kfree(logical);
201 } 213 }
202
203 return 0; 214 return 0;
204} 215}
205 216
217static struct btrfs_caching_control *
218get_caching_control(struct btrfs_block_group_cache *cache)
219{
220 struct btrfs_caching_control *ctl;
221
222 spin_lock(&cache->lock);
223 if (cache->cached != BTRFS_CACHE_STARTED) {
224 spin_unlock(&cache->lock);
225 return NULL;
226 }
227
228 ctl = cache->caching_ctl;
229 atomic_inc(&ctl->count);
230 spin_unlock(&cache->lock);
231 return ctl;
232}
233
234static void put_caching_control(struct btrfs_caching_control *ctl)
235{
236 if (atomic_dec_and_test(&ctl->count))
237 kfree(ctl);
238}
239
206/* 240/*
207 * this is only called by cache_block_group, since we could have freed extents 241 * this is only called by cache_block_group, since we could have freed extents
208 * we need to check the pinned_extents for any extents that can't be used yet 242 * we need to check the pinned_extents for any extents that can't be used yet
@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
215 int ret; 249 int ret;
216 250
217 while (start < end) { 251 while (start < end) {
218 ret = find_first_extent_bit(&info->pinned_extents, start, 252 ret = find_first_extent_bit(info->pinned_extents, start,
219 &extent_start, &extent_end, 253 &extent_start, &extent_end,
220 EXTENT_DIRTY|EXTENT_LOCKED); 254 EXTENT_DIRTY | EXTENT_UPTODATE);
221 if (ret) 255 if (ret)
222 break; 256 break;
223 257
@@ -249,22 +283,27 @@ static int caching_kthread(void *data)
249{ 283{
250 struct btrfs_block_group_cache *block_group = data; 284 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info; 285 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0; 286 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
287 struct btrfs_root *extent_root = fs_info->extent_root;
253 struct btrfs_path *path; 288 struct btrfs_path *path;
254 int ret = 0;
255 struct btrfs_key key;
256 struct extent_buffer *leaf; 289 struct extent_buffer *leaf;
257 int slot; 290 struct btrfs_key key;
258 u64 total_found = 0; 291 u64 total_found = 0;
259 292 u64 last = 0;
260 BUG_ON(!fs_info); 293 u32 nritems;
294 int ret = 0;
261 295
262 path = btrfs_alloc_path(); 296 path = btrfs_alloc_path();
263 if (!path) 297 if (!path)
264 return -ENOMEM; 298 return -ENOMEM;
265 299
266 atomic_inc(&block_group->space_info->caching_threads); 300 exclude_super_stripes(extent_root, block_group);
301 spin_lock(&block_group->space_info->lock);
302 block_group->space_info->bytes_super += block_group->bytes_super;
303 spin_unlock(&block_group->space_info->lock);
304
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 305 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
306
268 /* 307 /*
269 * We don't want to deadlock with somebody trying to allocate a new 308 * We don't want to deadlock with somebody trying to allocate a new
270 * extent for the extent root while also trying to search the extent 309 * extent for the extent root while also trying to search the extent
@@ -277,74 +316,64 @@ static int caching_kthread(void *data)
277 316
278 key.objectid = last; 317 key.objectid = last;
279 key.offset = 0; 318 key.offset = 0;
280 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 319 key.type = BTRFS_EXTENT_ITEM_KEY;
281again: 320again:
321 mutex_lock(&caching_ctl->mutex);
282 /* need to make sure the commit_root doesn't disappear */ 322 /* need to make sure the commit_root doesn't disappear */
283 down_read(&fs_info->extent_commit_sem); 323 down_read(&fs_info->extent_commit_sem);
284 324
285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 325 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
286 if (ret < 0) 326 if (ret < 0)
287 goto err; 327 goto err;
288 328
329 leaf = path->nodes[0];
330 nritems = btrfs_header_nritems(leaf);
331
289 while (1) { 332 while (1) {
290 smp_mb(); 333 smp_mb();
291 if (block_group->fs_info->closing > 1) { 334 if (fs_info->closing > 1) {
292 last = (u64)-1; 335 last = (u64)-1;
293 break; 336 break;
294 } 337 }
295 338
296 leaf = path->nodes[0]; 339 if (path->slots[0] < nritems) {
297 slot = path->slots[0]; 340 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
298 if (slot >= btrfs_header_nritems(leaf)) { 341 } else {
299 ret = btrfs_next_leaf(fs_info->extent_root, path); 342 ret = find_next_key(path, 0, &key);
300 if (ret < 0) 343 if (ret)
301 goto err;
302 else if (ret)
303 break; 344 break;
304 345
305 if (need_resched() || 346 caching_ctl->progress = last;
306 btrfs_transaction_in_commit(fs_info)) { 347 btrfs_release_path(extent_root, path);
307 leaf = path->nodes[0]; 348 up_read(&fs_info->extent_commit_sem);
308 349 mutex_unlock(&caching_ctl->mutex);
309 /* this shouldn't happen, but if the 350 if (btrfs_transaction_in_commit(fs_info))
310 * leaf is empty just move on.
311 */
312 if (btrfs_header_nritems(leaf) == 0)
313 break;
314 /*
315 * we need to copy the key out so that
316 * we are sure the next search advances
317 * us forward in the btree.
318 */
319 btrfs_item_key_to_cpu(leaf, &key, 0);
320 btrfs_release_path(fs_info->extent_root, path);
321 up_read(&fs_info->extent_commit_sem);
322 schedule_timeout(1); 351 schedule_timeout(1);
323 goto again; 352 else
324 } 353 cond_resched();
354 goto again;
355 }
325 356
357 if (key.objectid < block_group->key.objectid) {
358 path->slots[0]++;
326 continue; 359 continue;
327 } 360 }
328 btrfs_item_key_to_cpu(leaf, &key, slot);
329 if (key.objectid < block_group->key.objectid)
330 goto next;
331 361
332 if (key.objectid >= block_group->key.objectid + 362 if (key.objectid >= block_group->key.objectid +
333 block_group->key.offset) 363 block_group->key.offset)
334 break; 364 break;
335 365
336 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 366 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
337 total_found += add_new_free_space(block_group, 367 total_found += add_new_free_space(block_group,
338 fs_info, last, 368 fs_info, last,
339 key.objectid); 369 key.objectid);
340 last = key.objectid + key.offset; 370 last = key.objectid + key.offset;
341 }
342 371
343 if (total_found > (1024 * 1024 * 2)) { 372 if (total_found > (1024 * 1024 * 2)) {
344 total_found = 0; 373 total_found = 0;
345 wake_up(&block_group->caching_q); 374 wake_up(&caching_ctl->wait);
375 }
346 } 376 }
347next:
348 path->slots[0]++; 377 path->slots[0]++;
349 } 378 }
350 ret = 0; 379 ret = 0;
@@ -352,33 +381,65 @@ next:
352 total_found += add_new_free_space(block_group, fs_info, last, 381 total_found += add_new_free_space(block_group, fs_info, last,
353 block_group->key.objectid + 382 block_group->key.objectid +
354 block_group->key.offset); 383 block_group->key.offset);
384 caching_ctl->progress = (u64)-1;
355 385
356 spin_lock(&block_group->lock); 386 spin_lock(&block_group->lock);
387 block_group->caching_ctl = NULL;
357 block_group->cached = BTRFS_CACHE_FINISHED; 388 block_group->cached = BTRFS_CACHE_FINISHED;
358 spin_unlock(&block_group->lock); 389 spin_unlock(&block_group->lock);
359 390
360err: 391err:
361 btrfs_free_path(path); 392 btrfs_free_path(path);
362 up_read(&fs_info->extent_commit_sem); 393 up_read(&fs_info->extent_commit_sem);
363 atomic_dec(&block_group->space_info->caching_threads);
364 wake_up(&block_group->caching_q);
365 394
395 free_excluded_extents(extent_root, block_group);
396
397 mutex_unlock(&caching_ctl->mutex);
398 wake_up(&caching_ctl->wait);
399
400 put_caching_control(caching_ctl);
401 atomic_dec(&block_group->space_info->caching_threads);
366 return 0; 402 return 0;
367} 403}
368 404
369static int cache_block_group(struct btrfs_block_group_cache *cache) 405static int cache_block_group(struct btrfs_block_group_cache *cache)
370{ 406{
407 struct btrfs_fs_info *fs_info = cache->fs_info;
408 struct btrfs_caching_control *caching_ctl;
371 struct task_struct *tsk; 409 struct task_struct *tsk;
372 int ret = 0; 410 int ret = 0;
373 411
412 smp_mb();
413 if (cache->cached != BTRFS_CACHE_NO)
414 return 0;
415
416 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
417 BUG_ON(!caching_ctl);
418
419 INIT_LIST_HEAD(&caching_ctl->list);
420 mutex_init(&caching_ctl->mutex);
421 init_waitqueue_head(&caching_ctl->wait);
422 caching_ctl->block_group = cache;
423 caching_ctl->progress = cache->key.objectid;
424 /* one for caching kthread, one for caching block group list */
425 atomic_set(&caching_ctl->count, 2);
426
374 spin_lock(&cache->lock); 427 spin_lock(&cache->lock);
375 if (cache->cached != BTRFS_CACHE_NO) { 428 if (cache->cached != BTRFS_CACHE_NO) {
376 spin_unlock(&cache->lock); 429 spin_unlock(&cache->lock);
377 return ret; 430 kfree(caching_ctl);
431 return 0;
378 } 432 }
433 cache->caching_ctl = caching_ctl;
379 cache->cached = BTRFS_CACHE_STARTED; 434 cache->cached = BTRFS_CACHE_STARTED;
380 spin_unlock(&cache->lock); 435 spin_unlock(&cache->lock);
381 436
437 down_write(&fs_info->extent_commit_sem);
438 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
439 up_write(&fs_info->extent_commit_sem);
440
441 atomic_inc(&cache->space_info->caching_threads);
442
382 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 443 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
383 cache->key.objectid); 444 cache->key.objectid);
384 if (IS_ERR(tsk)) { 445 if (IS_ERR(tsk)) {
@@ -1511,7 +1572,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1511static void btrfs_issue_discard(struct block_device *bdev, 1572static void btrfs_issue_discard(struct block_device *bdev,
1512 u64 start, u64 len) 1573 u64 start, u64 len)
1513{ 1574{
1514 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); 1575 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1576 DISCARD_FL_BARRIER);
1515} 1577}
1516#endif 1578#endif
1517 1579
@@ -1656,7 +1718,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1656 parent, ref_root, flags, 1718 parent, ref_root, flags,
1657 ref->objectid, ref->offset, 1719 ref->objectid, ref->offset,
1658 &ins, node->ref_mod); 1720 &ins, node->ref_mod);
1659 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1660 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1721 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1661 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1722 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1662 node->num_bytes, parent, 1723 node->num_bytes, parent,
@@ -1782,7 +1843,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1782 extent_op->flags_to_set, 1843 extent_op->flags_to_set,
1783 &extent_op->key, 1844 &extent_op->key,
1784 ref->level, &ins); 1845 ref->level, &ins);
1785 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1786 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1846 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1787 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1847 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1788 node->num_bytes, parent, ref_root, 1848 node->num_bytes, parent, ref_root,
@@ -1817,16 +1877,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1817 BUG_ON(extent_op); 1877 BUG_ON(extent_op);
1818 head = btrfs_delayed_node_to_head(node); 1878 head = btrfs_delayed_node_to_head(node);
1819 if (insert_reserved) { 1879 if (insert_reserved) {
1880 int mark_free = 0;
1881 struct extent_buffer *must_clean = NULL;
1882
1883 ret = pin_down_bytes(trans, root, NULL,
1884 node->bytenr, node->num_bytes,
1885 head->is_data, 1, &must_clean);
1886 if (ret > 0)
1887 mark_free = 1;
1888
1889 if (must_clean) {
1890 clean_tree_block(NULL, root, must_clean);
1891 btrfs_tree_unlock(must_clean);
1892 free_extent_buffer(must_clean);
1893 }
1820 if (head->is_data) { 1894 if (head->is_data) {
1821 ret = btrfs_del_csums(trans, root, 1895 ret = btrfs_del_csums(trans, root,
1822 node->bytenr, 1896 node->bytenr,
1823 node->num_bytes); 1897 node->num_bytes);
1824 BUG_ON(ret); 1898 BUG_ON(ret);
1825 } 1899 }
1826 btrfs_update_pinned_extents(root, node->bytenr, 1900 if (mark_free) {
1827 node->num_bytes, 1); 1901 ret = btrfs_free_reserved_extent(root,
1828 update_reserved_extents(root, node->bytenr, 1902 node->bytenr,
1829 node->num_bytes, 0); 1903 node->num_bytes);
1904 BUG_ON(ret);
1905 }
1830 } 1906 }
1831 mutex_unlock(&head->mutex); 1907 mutex_unlock(&head->mutex);
1832 return 0; 1908 return 0;
@@ -2691,60 +2767,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2691 alloc_target); 2767 alloc_target);
2692} 2768}
2693 2769
2770static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2771{
2772 u64 num_bytes;
2773 int level;
2774
2775 level = BTRFS_MAX_LEVEL - 2;
2776 /*
2777 * NOTE: these calculations are absolutely the worst possible case.
2778 * This assumes that _every_ item we insert will require a new leaf, and
2779 * that the tree has grown to its maximum level size.
2780 */
2781
2782 /*
2783 * for every item we insert we could insert both an extent item and a
2784 * extent ref item. Then for ever item we insert, we will need to cow
2785 * both the original leaf, plus the leaf to the left and right of it.
2786 *
2787 * Unless we are talking about the extent root, then we just want the
2788 * number of items * 2, since we just need the extent item plus its ref.
2789 */
2790 if (root == root->fs_info->extent_root)
2791 num_bytes = num_items * 2;
2792 else
2793 num_bytes = (num_items + (2 * num_items)) * 3;
2794
2795 /*
2796 * num_bytes is total number of leaves we could need times the leaf
2797 * size, and then for every leaf we could end up cow'ing 2 nodes per
2798 * level, down to the leaf level.
2799 */
2800 num_bytes = (num_bytes * root->leafsize) +
2801 (num_bytes * (level * 2)) * root->nodesize;
2802
2803 return num_bytes;
2804}
2805
2694/* 2806/*
2695 * for now this just makes sure we have at least 5% of our metadata space free 2807 * Unreserve metadata space for delalloc. If we have less reserved credits than
2696 * for use. 2808 * we have extents, this function does nothing.
2697 */ 2809 */
2698int btrfs_check_metadata_free_space(struct btrfs_root *root) 2810int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2811 struct inode *inode, int num_items)
2699{ 2812{
2700 struct btrfs_fs_info *info = root->fs_info; 2813 struct btrfs_fs_info *info = root->fs_info;
2701 struct btrfs_space_info *meta_sinfo; 2814 struct btrfs_space_info *meta_sinfo;
2702 u64 alloc_target, thresh; 2815 u64 num_bytes;
2703 int committed = 0, ret; 2816 u64 alloc_target;
2817 bool bug = false;
2704 2818
2705 /* get the space info for where the metadata will live */ 2819 /* get the space info for where the metadata will live */
2706 alloc_target = btrfs_get_alloc_profile(root, 0); 2820 alloc_target = btrfs_get_alloc_profile(root, 0);
2707 meta_sinfo = __find_space_info(info, alloc_target); 2821 meta_sinfo = __find_space_info(info, alloc_target);
2708 2822
2709again: 2823 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2824 num_items);
2825
2710 spin_lock(&meta_sinfo->lock); 2826 spin_lock(&meta_sinfo->lock);
2711 if (!meta_sinfo->full) 2827 if (BTRFS_I(inode)->delalloc_reserved_extents <=
2712 thresh = meta_sinfo->total_bytes * 80; 2828 BTRFS_I(inode)->delalloc_extents) {
2713 else 2829 spin_unlock(&meta_sinfo->lock);
2714 thresh = meta_sinfo->total_bytes * 95; 2830 return 0;
2831 }
2832
2833 BTRFS_I(inode)->delalloc_reserved_extents--;
2834 BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
2835
2836 if (meta_sinfo->bytes_delalloc < num_bytes) {
2837 bug = true;
2838 meta_sinfo->bytes_delalloc = 0;
2839 } else {
2840 meta_sinfo->bytes_delalloc -= num_bytes;
2841 }
2842 spin_unlock(&meta_sinfo->lock);
2843
2844 BUG_ON(bug);
2845
2846 return 0;
2847}
2715 2848
2849static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2850{
2851 u64 thresh;
2852
2853 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2854 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2855 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2856 meta_sinfo->bytes_may_use;
2857
2858 thresh = meta_sinfo->total_bytes - thresh;
2859 thresh *= 80;
2716 do_div(thresh, 100); 2860 do_div(thresh, 100);
2861 if (thresh <= meta_sinfo->bytes_delalloc)
2862 meta_sinfo->force_delalloc = 1;
2863 else
2864 meta_sinfo->force_delalloc = 0;
2865}
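
Put differently, force_delalloc trips once delalloc reservations reach 80% of whatever metadata space is left after every other use is accounted for. A standalone sketch of just that predicate, with invented numbers:

#include <stdio.h>
#include <stdint.h>

/* only the fields the predicate reads; all values below are made up */
struct meta_info {
        uint64_t total, used, reserved, pinned, readonly,
                 super, root, may_use, delalloc;
};

static int should_force_delalloc(const struct meta_info *m)
{
        uint64_t other = m->used + m->reserved + m->pinned +
                         m->readonly + m->super + m->root + m->may_use;
        uint64_t thresh = (m->total - other) * 80 / 100;

        return thresh <= m->delalloc;
}

int main(void)
{
        struct meta_info m = {
                .total = 1024ULL << 20,
                .used = 600ULL << 20,
                .delalloc = 350ULL << 20,
        };
        /* 424M left over, 80% of that is ~339M, so 350M of delalloc trips it */
        printf("force_delalloc = %d\n", should_force_delalloc(&m));
        return 0;
}
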
2717 2866
2718 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2867static int maybe_allocate_chunk(struct btrfs_root *root,
2719 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { 2868 struct btrfs_space_info *info)
2720 struct btrfs_trans_handle *trans; 2869{
2721 if (!meta_sinfo->full) { 2870 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2722 meta_sinfo->force_alloc = 1; 2871 struct btrfs_trans_handle *trans;
2723 spin_unlock(&meta_sinfo->lock); 2872 bool wait = false;
2873 int ret = 0;
2874 u64 min_metadata;
2875 u64 free_space;
2724 2876
2725 trans = btrfs_start_transaction(root, 1); 2877 free_space = btrfs_super_total_bytes(disk_super);
2726 if (!trans) 2878 /*
 2727 return -ENOMEM; 2879 * we allow the metadata to grow to a max of either 5GB or 5% of the
2880 * space in the volume.
2881 */
2882 min_metadata = min((u64)5 * 1024 * 1024 * 1024,
2883 div64_u64(free_space * 5, 100));
2884 if (info->total_bytes >= min_metadata) {
2885 spin_unlock(&info->lock);
2886 return 0;
2887 }
2728 2888
2729 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2889 if (info->full) {
2730 2 * 1024 * 1024, alloc_target, 0); 2890 spin_unlock(&info->lock);
2731 btrfs_end_transaction(trans, root); 2891 return 0;
2892 }
2893
2894 if (!info->allocating_chunk) {
2895 info->force_alloc = 1;
2896 info->allocating_chunk = 1;
2897 init_waitqueue_head(&info->wait);
2898 } else {
2899 wait = true;
2900 }
2901
2902 spin_unlock(&info->lock);
2903
2904 if (wait) {
2905 wait_event(info->wait,
2906 !info->allocating_chunk);
2907 return 1;
2908 }
2909
2910 trans = btrfs_start_transaction(root, 1);
2911 if (!trans) {
2912 ret = -ENOMEM;
2913 goto out;
2914 }
2915
2916 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2917 4096 + 2 * 1024 * 1024,
2918 info->flags, 0);
2919 btrfs_end_transaction(trans, root);
2920 if (ret)
2921 goto out;
2922out:
2923 spin_lock(&info->lock);
2924 info->allocating_chunk = 0;
2925 spin_unlock(&info->lock);
2926 wake_up(&info->wait);
2927
2928 if (ret)
2929 return 0;
2930 return 1;
2931}
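
The concurrency pattern above is the interesting part: exactly one caller marks itself as the allocator and does the expensive work with the lock dropped, while latecomers sleep on info->wait and return 1 so their callers re-check the counters. A rough pthread model of the same shape, with invented names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int allocating;

/* returns 1 when the caller should redo its space check */
static int maybe_allocate(void)
{
        pthread_mutex_lock(&lock);
        if (allocating) {
                /* someone else is already allocating; just wait */
                while (allocating)
                        pthread_cond_wait(&done, &lock);
                pthread_mutex_unlock(&lock);
                return 1;
        }
        allocating = 1;
        pthread_mutex_unlock(&lock);

        /* ... expensive chunk allocation runs here, lock dropped ... */

        pthread_mutex_lock(&lock);
        allocating = 0;
        pthread_cond_broadcast(&done);
        pthread_mutex_unlock(&lock);
        return 1;
}

int main(void)
{
        printf("retry = %d\n", maybe_allocate());
        return 0;
}
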
2932
2933/*
2934 * Reserve metadata space for delalloc.
2935 */
2936int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2937 struct inode *inode, int num_items)
2938{
2939 struct btrfs_fs_info *info = root->fs_info;
2940 struct btrfs_space_info *meta_sinfo;
2941 u64 num_bytes;
2942 u64 used;
2943 u64 alloc_target;
2944 int flushed = 0;
2945 int force_delalloc;
2946
2947 /* get the space info for where the metadata will live */
2948 alloc_target = btrfs_get_alloc_profile(root, 0);
2949 meta_sinfo = __find_space_info(info, alloc_target);
2950
2951 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2952 num_items);
2953again:
2954 spin_lock(&meta_sinfo->lock);
2955
2956 force_delalloc = meta_sinfo->force_delalloc;
2957
2958 if (unlikely(!meta_sinfo->bytes_root))
2959 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2960
2961 if (!flushed)
2962 meta_sinfo->bytes_delalloc += num_bytes;
2963
2964 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2965 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2966 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2967 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2968
2969 if (used > meta_sinfo->total_bytes) {
2970 flushed++;
2971
2972 if (flushed == 1) {
2973 if (maybe_allocate_chunk(root, meta_sinfo))
2974 goto again;
2975 flushed++;
2976 } else {
2977 spin_unlock(&meta_sinfo->lock);
2978 }
2979
2980 if (flushed == 2) {
2981 filemap_flush(inode->i_mapping);
2982 goto again;
2983 } else if (flushed == 3) {
2984 btrfs_start_delalloc_inodes(root);
2985 btrfs_wait_ordered_extents(root, 0);
2732 goto again; 2986 goto again;
2733 } 2987 }
2988 spin_lock(&meta_sinfo->lock);
2989 meta_sinfo->bytes_delalloc -= num_bytes;
2734 spin_unlock(&meta_sinfo->lock); 2990 spin_unlock(&meta_sinfo->lock);
2991 printk(KERN_ERR "enospc, has %d, reserved %d\n",
2992 BTRFS_I(inode)->delalloc_extents,
2993 BTRFS_I(inode)->delalloc_reserved_extents);
2994 dump_space_info(meta_sinfo, 0, 0);
2995 return -ENOSPC;
2996 }
2735 2997
2736 if (!committed) { 2998 BTRFS_I(inode)->delalloc_reserved_extents++;
2737 committed = 1; 2999 check_force_delalloc(meta_sinfo);
2738 trans = btrfs_join_transaction(root, 1); 3000 spin_unlock(&meta_sinfo->lock);
2739 if (!trans) 3001
2740 return -ENOMEM; 3002 if (!flushed && force_delalloc)
2741 ret = btrfs_commit_transaction(trans, root); 3003 filemap_flush(inode->i_mapping);
2742 if (ret) 3004
2743 return ret; 3005 return 0;
3006}
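
The flushed counter above implements an escalation ladder: first try to grow the metadata space with a new chunk, then flush only this inode's dirty pages, then flush delalloc across the whole filesystem and wait for ordered extents, and only after all of that fail with -ENOSPC. The shape of that loop, reduced to a sketch with stand-in actions:

#include <stdio.h>

enum step { TRY_CHUNK, FLUSH_INODE, FLUSH_ALL };

/* stand-in for re-checking the space counters; always fails here */
static int space_available(void) { return 0; }

static int reserve(void)
{
        int step;

        for (step = TRY_CHUNK; step <= FLUSH_ALL; step++) {
                if (space_available())
                        return 0;
                switch (step) {
                case TRY_CHUNK:   puts("allocate a new chunk");        break;
                case FLUSH_INODE: puts("flush this inode's pages");    break;
                case FLUSH_ALL:   puts("flush all delalloc and wait"); break;
                }
        }
        return space_available() ? 0 : -1; /* -ENOSPC in the kernel */
}

int main(void)
{
        printf("reserve() = %d\n", reserve());
        return 0;
}
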
3007
3008/*
3009 * unreserve num_items number of items worth of metadata space. This needs to
3010 * be paired with btrfs_reserve_metadata_space.
3011 *
3012 * NOTE: if you have the option, run this _AFTER_ you do a
3013 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
 3014 * operations which will result in more used metadata, so we want to make sure we
3015 * can do that without issue.
3016 */
3017int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3018{
3019 struct btrfs_fs_info *info = root->fs_info;
3020 struct btrfs_space_info *meta_sinfo;
3021 u64 num_bytes;
3022 u64 alloc_target;
3023 bool bug = false;
3024
3025 /* get the space info for where the metadata will live */
3026 alloc_target = btrfs_get_alloc_profile(root, 0);
3027 meta_sinfo = __find_space_info(info, alloc_target);
3028
3029 num_bytes = calculate_bytes_needed(root, num_items);
3030
3031 spin_lock(&meta_sinfo->lock);
3032 if (meta_sinfo->bytes_may_use < num_bytes) {
3033 bug = true;
3034 meta_sinfo->bytes_may_use = 0;
3035 } else {
3036 meta_sinfo->bytes_may_use -= num_bytes;
3037 }
3038 spin_unlock(&meta_sinfo->lock);
3039
3040 BUG_ON(bug);
3041
3042 return 0;
3043}
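
Taken together with the NOTE above, a caller is expected to bracket its btree work like this. This is a hypothetical caller written for illustration, not code from this patch; it only combines functions the patch itself introduces or uses:

/* reserve before the transaction, unreserve only after
 * btrfs_end_transaction() so the delayed ref operations it runs are
 * still covered by the reservation */
static int modify_two_items(struct btrfs_root *root)
{
        struct btrfs_trans_handle *trans;
        int ret;

        ret = btrfs_reserve_metadata_space(root, 2);
        if (ret)
                return ret;     /* -ENOSPC, nothing to undo */

        trans = btrfs_start_transaction(root, 1);
        if (!trans) {
                btrfs_unreserve_metadata_space(root, 2);
                return -ENOMEM;
        }

        /* ... insert or update the two items here ... */

        btrfs_end_transaction(trans, root);     /* may run delayed refs */
        btrfs_unreserve_metadata_space(root, 2);
        return 0;
}
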
3044
3045/*
 3046 * Reserve some metadata space for use. We'll calculate the worst case number
3047 * of bytes that would be needed to modify num_items number of items. If we
3048 * have space, fantastic, if not, you get -ENOSPC. Please call
3049 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3050 * items you reserved, since whatever metadata you needed should have already
3051 * been allocated.
3052 *
3053 * This will commit the transaction to make more space if we don't have enough
 3054 * metadata space. The only time we don't do this is if we're reserving space
 3055 * inside of a transaction; then we will just return -ENOSPC and it is the
 3056 * caller's responsibility to handle it properly.
3057 */
3058int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3059{
3060 struct btrfs_fs_info *info = root->fs_info;
3061 struct btrfs_space_info *meta_sinfo;
3062 u64 num_bytes;
3063 u64 used;
3064 u64 alloc_target;
3065 int retries = 0;
3066
3067 /* get the space info for where the metadata will live */
3068 alloc_target = btrfs_get_alloc_profile(root, 0);
3069 meta_sinfo = __find_space_info(info, alloc_target);
3070
3071 num_bytes = calculate_bytes_needed(root, num_items);
3072again:
3073 spin_lock(&meta_sinfo->lock);
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!retries)
3079 meta_sinfo->bytes_may_use += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 retries++;
3088 if (retries == 1) {
3089 if (maybe_allocate_chunk(root, meta_sinfo))
3090 goto again;
3091 retries++;
3092 } else {
3093 spin_unlock(&meta_sinfo->lock);
3094 }
3095
3096 if (retries == 2) {
3097 btrfs_start_delalloc_inodes(root);
3098 btrfs_wait_ordered_extents(root, 0);
2744 goto again; 3099 goto again;
2745 } 3100 }
3101 spin_lock(&meta_sinfo->lock);
3102 meta_sinfo->bytes_may_use -= num_bytes;
3103 spin_unlock(&meta_sinfo->lock);
3104
3105 dump_space_info(meta_sinfo, 0, 0);
2746 return -ENOSPC; 3106 return -ENOSPC;
2747 } 3107 }
3108
3109 check_force_delalloc(meta_sinfo);
2748 spin_unlock(&meta_sinfo->lock); 3110 spin_unlock(&meta_sinfo->lock);
2749 3111
2750 return 0; 3112 return 0;
@@ -2764,13 +3126,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2764 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3126 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2765 3127
2766 data_sinfo = BTRFS_I(inode)->space_info; 3128 data_sinfo = BTRFS_I(inode)->space_info;
3129 if (!data_sinfo)
3130 goto alloc;
3131
2767again: 3132again:
2768 /* make sure we have enough space to handle the data first */ 3133 /* make sure we have enough space to handle the data first */
2769 spin_lock(&data_sinfo->lock); 3134 spin_lock(&data_sinfo->lock);
2770 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 3135 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2771 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 3136 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2772 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 3137 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2773 data_sinfo->bytes_may_use < bytes) { 3138 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
2774 struct btrfs_trans_handle *trans; 3139 struct btrfs_trans_handle *trans;
2775 3140
2776 /* 3141 /*
@@ -2782,7 +3147,7 @@ again:
2782 3147
2783 data_sinfo->force_alloc = 1; 3148 data_sinfo->force_alloc = 1;
2784 spin_unlock(&data_sinfo->lock); 3149 spin_unlock(&data_sinfo->lock);
2785 3150alloc:
2786 alloc_target = btrfs_get_alloc_profile(root, 1); 3151 alloc_target = btrfs_get_alloc_profile(root, 1);
2787 trans = btrfs_start_transaction(root, 1); 3152 trans = btrfs_start_transaction(root, 1);
2788 if (!trans) 3153 if (!trans)
@@ -2794,12 +3159,17 @@ again:
2794 btrfs_end_transaction(trans, root); 3159 btrfs_end_transaction(trans, root);
2795 if (ret) 3160 if (ret)
2796 return ret; 3161 return ret;
3162
3163 if (!data_sinfo) {
3164 btrfs_set_inode_space_info(root, inode);
3165 data_sinfo = BTRFS_I(inode)->space_info;
3166 }
2797 goto again; 3167 goto again;
2798 } 3168 }
2799 spin_unlock(&data_sinfo->lock); 3169 spin_unlock(&data_sinfo->lock);
2800 3170
2801 /* commit the current transaction and try again */ 3171 /* commit the current transaction and try again */
2802 if (!committed) { 3172 if (!committed && !root->fs_info->open_ioctl_trans) {
2803 committed = 1; 3173 committed = 1;
2804 trans = btrfs_join_transaction(root, 1); 3174 trans = btrfs_join_transaction(root, 1);
2805 if (!trans) 3175 if (!trans)
@@ -2827,7 +3197,7 @@ again:
2827 BTRFS_I(inode)->reserved_bytes += bytes; 3197 BTRFS_I(inode)->reserved_bytes += bytes;
2828 spin_unlock(&data_sinfo->lock); 3198 spin_unlock(&data_sinfo->lock);
2829 3199
2830 return btrfs_check_metadata_free_space(root); 3200 return 0;
2831} 3201}
2832 3202
2833/* 3203/*
@@ -2926,17 +3296,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2926 BUG_ON(!space_info); 3296 BUG_ON(!space_info);
2927 3297
2928 spin_lock(&space_info->lock); 3298 spin_lock(&space_info->lock);
2929 if (space_info->force_alloc) { 3299 if (space_info->force_alloc)
2930 force = 1; 3300 force = 1;
2931 space_info->force_alloc = 0;
2932 }
2933 if (space_info->full) { 3301 if (space_info->full) {
2934 spin_unlock(&space_info->lock); 3302 spin_unlock(&space_info->lock);
2935 goto out; 3303 goto out;
2936 } 3304 }
2937 3305
2938 thresh = space_info->total_bytes - space_info->bytes_readonly; 3306 thresh = space_info->total_bytes - space_info->bytes_readonly;
2939 thresh = div_factor(thresh, 6); 3307 thresh = div_factor(thresh, 8);
2940 if (!force && 3308 if (!force &&
2941 (space_info->bytes_used + space_info->bytes_pinned + 3309 (space_info->bytes_used + space_info->bytes_pinned +
2942 space_info->bytes_reserved + alloc_bytes) < thresh) { 3310 space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -2950,7 +3318,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2950 * we keep a reasonable number of metadata chunks allocated in the 3318 * we keep a reasonable number of metadata chunks allocated in the
2951 * FS as well. 3319 * FS as well.
2952 */ 3320 */
2953 if (flags & BTRFS_BLOCK_GROUP_DATA) { 3321 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
2954 fs_info->data_chunk_allocations++; 3322 fs_info->data_chunk_allocations++;
2955 if (!(fs_info->data_chunk_allocations % 3323 if (!(fs_info->data_chunk_allocations %
2956 fs_info->metadata_ratio)) 3324 fs_info->metadata_ratio))
@@ -2958,8 +3326,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2958 } 3326 }
2959 3327
2960 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3328 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3329 spin_lock(&space_info->lock);
2961 if (ret) 3330 if (ret)
2962 space_info->full = 1; 3331 space_info->full = 1;
3332 space_info->force_alloc = 0;
3333 spin_unlock(&space_info->lock);
2963out: 3334out:
2964 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3335 mutex_unlock(&extent_root->fs_info->chunk_mutex);
2965 return ret; 3336 return ret;
@@ -3008,10 +3379,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3008 num_bytes = min(total, cache->key.offset - byte_in_group); 3379 num_bytes = min(total, cache->key.offset - byte_in_group);
3009 if (alloc) { 3380 if (alloc) {
3010 old_val += num_bytes; 3381 old_val += num_bytes;
3382 btrfs_set_block_group_used(&cache->item, old_val);
3383 cache->reserved -= num_bytes;
3011 cache->space_info->bytes_used += num_bytes; 3384 cache->space_info->bytes_used += num_bytes;
3385 cache->space_info->bytes_reserved -= num_bytes;
3012 if (cache->ro) 3386 if (cache->ro)
3013 cache->space_info->bytes_readonly -= num_bytes; 3387 cache->space_info->bytes_readonly -= num_bytes;
3014 btrfs_set_block_group_used(&cache->item, old_val);
3015 spin_unlock(&cache->lock); 3388 spin_unlock(&cache->lock);
3016 spin_unlock(&cache->space_info->lock); 3389 spin_unlock(&cache->space_info->lock);
3017 } else { 3390 } else {
@@ -3056,127 +3429,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3056 return bytenr; 3429 return bytenr;
3057} 3430}
3058 3431
3059int btrfs_update_pinned_extents(struct btrfs_root *root, 3432/*
 3060 u64 bytenr, u64 num, int pin) 3433 * this function must be called within a transaction
3434 */
3435int btrfs_pin_extent(struct btrfs_root *root,
3436 u64 bytenr, u64 num_bytes, int reserved)
3061{ 3437{
3062 u64 len;
3063 struct btrfs_block_group_cache *cache;
3064 struct btrfs_fs_info *fs_info = root->fs_info; 3438 struct btrfs_fs_info *fs_info = root->fs_info;
3439 struct btrfs_block_group_cache *cache;
3065 3440
3066 if (pin) 3441 cache = btrfs_lookup_block_group(fs_info, bytenr);
3067 set_extent_dirty(&fs_info->pinned_extents, 3442 BUG_ON(!cache);
3068 bytenr, bytenr + num - 1, GFP_NOFS);
3069
3070 while (num > 0) {
3071 cache = btrfs_lookup_block_group(fs_info, bytenr);
3072 BUG_ON(!cache);
3073 len = min(num, cache->key.offset -
3074 (bytenr - cache->key.objectid));
3075 if (pin) {
3076 spin_lock(&cache->space_info->lock);
3077 spin_lock(&cache->lock);
3078 cache->pinned += len;
3079 cache->space_info->bytes_pinned += len;
3080 spin_unlock(&cache->lock);
3081 spin_unlock(&cache->space_info->lock);
3082 fs_info->total_pinned += len;
3083 } else {
3084 int unpin = 0;
3085 3443
3086 /* 3444 spin_lock(&cache->space_info->lock);
3087 * in order to not race with the block group caching, we 3445 spin_lock(&cache->lock);
3088 * only want to unpin the extent if we are cached. If 3446 cache->pinned += num_bytes;
3089 * we aren't cached, we want to start async caching this 3447 cache->space_info->bytes_pinned += num_bytes;
3090 * block group so we can free the extent the next time 3448 if (reserved) {
3091 * around. 3449 cache->reserved -= num_bytes;
3092 */ 3450 cache->space_info->bytes_reserved -= num_bytes;
3093 spin_lock(&cache->space_info->lock); 3451 }
3094 spin_lock(&cache->lock); 3452 spin_unlock(&cache->lock);
3095 unpin = (cache->cached == BTRFS_CACHE_FINISHED); 3453 spin_unlock(&cache->space_info->lock);
3096 if (likely(unpin)) {
3097 cache->pinned -= len;
3098 cache->space_info->bytes_pinned -= len;
3099 fs_info->total_pinned -= len;
3100 }
3101 spin_unlock(&cache->lock);
3102 spin_unlock(&cache->space_info->lock);
3103 3454
3104 if (likely(unpin)) 3455 btrfs_put_block_group(cache);
3105 clear_extent_dirty(&fs_info->pinned_extents,
3106 bytenr, bytenr + len -1,
3107 GFP_NOFS);
3108 else
3109 cache_block_group(cache);
3110 3456
3111 if (unpin) 3457 set_extent_dirty(fs_info->pinned_extents,
3112 btrfs_add_free_space(cache, bytenr, len); 3458 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3113 } 3459 return 0;
3114 btrfs_put_block_group(cache); 3460}
3115 bytenr += len; 3461
3116 num -= len; 3462static int update_reserved_extents(struct btrfs_block_group_cache *cache,
3463 u64 num_bytes, int reserve)
3464{
3465 spin_lock(&cache->space_info->lock);
3466 spin_lock(&cache->lock);
3467 if (reserve) {
3468 cache->reserved += num_bytes;
3469 cache->space_info->bytes_reserved += num_bytes;
3470 } else {
3471 cache->reserved -= num_bytes;
3472 cache->space_info->bytes_reserved -= num_bytes;
3117 } 3473 }
3474 spin_unlock(&cache->lock);
3475 spin_unlock(&cache->space_info->lock);
3118 return 0; 3476 return 0;
3119} 3477}
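
With btrfs_pin_extent() and update_reserved_extents() side by side, the accounting lifecycle of an extent that is allocated and then freed inside a transaction becomes visible: free to reserved at allocation, reserved to pinned when it is freed in-transaction, pinned back to free after the commit. A toy model of those counter transitions, with invented sizes:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

/* per-block-group byte counters, loosely after btrfs_block_group_cache */
struct group { uint64_t free, reserved, pinned; };

static void reserve(struct group *g, uint64_t n)
{
        assert(g->free >= n);
        g->free -= n;
        g->reserved += n;
}

static void pin_reserved(struct group *g, uint64_t n) /* freed in-transaction */
{
        assert(g->reserved >= n);
        g->reserved -= n;
        g->pinned += n;
}

static void unpin(struct group *g, uint64_t n) /* after the commit */
{
        assert(g->pinned >= n);
        g->pinned -= n;
        g->free += n;
}

int main(void)
{
        struct group g = { .free = 1 << 20 };

        reserve(&g, 4096);
        pin_reserved(&g, 4096);
        unpin(&g, 4096);
        printf("free=%llu reserved=%llu pinned=%llu\n",
               (unsigned long long)g.free,
               (unsigned long long)g.reserved,
               (unsigned long long)g.pinned);
        return 0;
}
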
3120 3478
3121static int update_reserved_extents(struct btrfs_root *root, 3479int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3122 u64 bytenr, u64 num, int reserve) 3480 struct btrfs_root *root)
3123{ 3481{
3124 u64 len;
3125 struct btrfs_block_group_cache *cache;
3126 struct btrfs_fs_info *fs_info = root->fs_info; 3482 struct btrfs_fs_info *fs_info = root->fs_info;
3483 struct btrfs_caching_control *next;
3484 struct btrfs_caching_control *caching_ctl;
3485 struct btrfs_block_group_cache *cache;
3127 3486
3128 while (num > 0) { 3487 down_write(&fs_info->extent_commit_sem);
3129 cache = btrfs_lookup_block_group(fs_info, bytenr);
3130 BUG_ON(!cache);
3131 len = min(num, cache->key.offset -
3132 (bytenr - cache->key.objectid));
3133 3488
3134 spin_lock(&cache->space_info->lock); 3489 list_for_each_entry_safe(caching_ctl, next,
3135 spin_lock(&cache->lock); 3490 &fs_info->caching_block_groups, list) {
3136 if (reserve) { 3491 cache = caching_ctl->block_group;
3137 cache->reserved += len; 3492 if (block_group_cache_done(cache)) {
3138 cache->space_info->bytes_reserved += len; 3493 cache->last_byte_to_unpin = (u64)-1;
3494 list_del_init(&caching_ctl->list);
3495 put_caching_control(caching_ctl);
3139 } else { 3496 } else {
3140 cache->reserved -= len; 3497 cache->last_byte_to_unpin = caching_ctl->progress;
3141 cache->space_info->bytes_reserved -= len;
3142 } 3498 }
3143 spin_unlock(&cache->lock);
3144 spin_unlock(&cache->space_info->lock);
3145 btrfs_put_block_group(cache);
3146 bytenr += len;
3147 num -= len;
3148 } 3499 }
3500
3501 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3502 fs_info->pinned_extents = &fs_info->freed_extents[1];
3503 else
3504 fs_info->pinned_extents = &fs_info->freed_extents[0];
3505
3506 up_write(&fs_info->extent_commit_sem);
3149 return 0; 3507 return 0;
3150} 3508}
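
The two freed_extents trees are double-buffered: while a commit unpins from one tree, new frees keep pinning into the other, and the flip at the end of btrfs_prepare_extent_commit() decides which is which. A compact model of why the flip keeps the two transactions' pins apart:

#include <stdio.h>

/* two pin sets, as in fs_info->freed_extents[2]; cur is pinned_extents */
static int sets[2];
static int cur;

static void pin(int bytes)       { sets[cur] += bytes; }
static void prepare_commit(void) { cur = !cur; }  /* the flip above */

static void finish_commit(void)
{
        /* unpin only what the committing transaction pinned */
        printf("unpinning %d bytes\n", sets[!cur]);
        sets[!cur] = 0;
}

int main(void)
{
        pin(4096);          /* freed during transaction N */
        prepare_commit();
        pin(8192);          /* freed during transaction N+1, untouched */
        finish_commit();    /* releases only the 4096 */
        return 0;
}
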
3151 3509
3152int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) 3510static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3153{ 3511{
3154 u64 last = 0; 3512 struct btrfs_fs_info *fs_info = root->fs_info;
3155 u64 start; 3513 struct btrfs_block_group_cache *cache = NULL;
3156 u64 end; 3514 u64 len;
3157 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
3158 int ret;
3159 3515
3160 while (1) { 3516 while (start <= end) {
3161 ret = find_first_extent_bit(pinned_extents, last, 3517 if (!cache ||
3162 &start, &end, EXTENT_DIRTY); 3518 start >= cache->key.objectid + cache->key.offset) {
3163 if (ret) 3519 if (cache)
3164 break; 3520 btrfs_put_block_group(cache);
3521 cache = btrfs_lookup_block_group(fs_info, start);
3522 BUG_ON(!cache);
3523 }
3524
3525 len = cache->key.objectid + cache->key.offset - start;
3526 len = min(len, end + 1 - start);
3527
3528 if (start < cache->last_byte_to_unpin) {
3529 len = min(len, cache->last_byte_to_unpin - start);
3530 btrfs_add_free_space(cache, start, len);
3531 }
3165 3532
3166 set_extent_dirty(copy, start, end, GFP_NOFS); 3533 spin_lock(&cache->space_info->lock);
3167 last = end + 1; 3534 spin_lock(&cache->lock);
3535 cache->pinned -= len;
3536 cache->space_info->bytes_pinned -= len;
3537 spin_unlock(&cache->lock);
3538 spin_unlock(&cache->space_info->lock);
3539
3540 start += len;
3168 } 3541 }
3542
3543 if (cache)
3544 btrfs_put_block_group(cache);
3169 return 0; 3545 return 0;
3170} 3546}
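
unpin_extent_range() has to respect two boundaries at once: [start, end] may straddle several block groups, and within each group only bytes below last_byte_to_unpin may go back into the free-space cache (the rest belongs to a caching scan still in flight). The clamping in isolation, with invented geometry:

#include <stdio.h>
#include <stdint.h>

struct group { uint64_t objectid, offset, last_byte_to_unpin; };

/* one step of the walk: bytes handled now, and how many of those
 * may be re-added to the free-space cache */
static void clamp(const struct group *g, uint64_t start, uint64_t end,
                  uint64_t *len, uint64_t *freeable)
{
        *len = g->objectid + g->offset - start;   /* to end of group */
        if (*len > end + 1 - start)
                *len = end + 1 - start;           /* to end of range */

        *freeable = 0;
        if (start < g->last_byte_to_unpin) {
                *freeable = *len;
                if (*freeable > g->last_byte_to_unpin - start)
                        *freeable = g->last_byte_to_unpin - start;
        }
}

int main(void)
{
        struct group g = { 0, 1 << 20, 4096 };
        uint64_t len, freeable;

        clamp(&g, 0, 8191, &len, &freeable);
        printf("len=%llu freeable=%llu\n",
               (unsigned long long)len, (unsigned long long)freeable);
        return 0;
}
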
3171 3547
3172int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 3548int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3173 struct btrfs_root *root, 3549 struct btrfs_root *root)
3174 struct extent_io_tree *unpin)
3175{ 3550{
3551 struct btrfs_fs_info *fs_info = root->fs_info;
3552 struct extent_io_tree *unpin;
3176 u64 start; 3553 u64 start;
3177 u64 end; 3554 u64 end;
3178 int ret; 3555 int ret;
3179 3556
3557 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3558 unpin = &fs_info->freed_extents[1];
3559 else
3560 unpin = &fs_info->freed_extents[0];
3561
3180 while (1) { 3562 while (1) {
3181 ret = find_first_extent_bit(unpin, 0, &start, &end, 3563 ret = find_first_extent_bit(unpin, 0, &start, &end,
3182 EXTENT_DIRTY); 3564 EXTENT_DIRTY);
@@ -3185,10 +3567,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3185 3567
3186 ret = btrfs_discard_extent(root, start, end + 1 - start); 3568 ret = btrfs_discard_extent(root, start, end + 1 - start);
3187 3569
3188 /* unlocks the pinned mutex */
3189 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
3190 clear_extent_dirty(unpin, start, end, GFP_NOFS); 3570 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3191 3571 unpin_extent_range(root, start, end);
3192 cond_resched(); 3572 cond_resched();
3193 } 3573 }
3194 3574
@@ -3198,7 +3578,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3198static int pin_down_bytes(struct btrfs_trans_handle *trans, 3578static int pin_down_bytes(struct btrfs_trans_handle *trans,
3199 struct btrfs_root *root, 3579 struct btrfs_root *root,
3200 struct btrfs_path *path, 3580 struct btrfs_path *path,
3201 u64 bytenr, u64 num_bytes, int is_data, 3581 u64 bytenr, u64 num_bytes,
3582 int is_data, int reserved,
3202 struct extent_buffer **must_clean) 3583 struct extent_buffer **must_clean)
3203{ 3584{
3204 int err = 0; 3585 int err = 0;
@@ -3230,15 +3611,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3230 } 3611 }
3231 free_extent_buffer(buf); 3612 free_extent_buffer(buf);
3232pinit: 3613pinit:
3233 btrfs_set_path_blocking(path); 3614 if (path)
3615 btrfs_set_path_blocking(path);
3234 /* unlocks the pinned mutex */ 3616 /* unlocks the pinned mutex */
3235 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 3617 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3236 3618
3237 BUG_ON(err < 0); 3619 BUG_ON(err < 0);
3238 return 0; 3620 return 0;
3239} 3621}
3240 3622
3241
3242static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 3623static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3243 struct btrfs_root *root, 3624 struct btrfs_root *root,
3244 u64 bytenr, u64 num_bytes, u64 parent, 3625 u64 bytenr, u64 num_bytes, u64 parent,
@@ -3412,7 +3793,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3412 } 3793 }
3413 3794
3414 ret = pin_down_bytes(trans, root, path, bytenr, 3795 ret = pin_down_bytes(trans, root, path, bytenr,
3415 num_bytes, is_data, &must_clean); 3796 num_bytes, is_data, 0, &must_clean);
3416 if (ret > 0) 3797 if (ret > 0)
3417 mark_free = 1; 3798 mark_free = 1;
3418 BUG_ON(ret < 0); 3799 BUG_ON(ret < 0);
@@ -3543,8 +3924,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3543 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 3924 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
3544 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 3925 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3545 /* unlocks the pinned mutex */ 3926 /* unlocks the pinned mutex */
3546 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 3927 btrfs_pin_extent(root, bytenr, num_bytes, 1);
3547 update_reserved_extents(root, bytenr, num_bytes, 0);
3548 ret = 0; 3928 ret = 0;
3549 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 3929 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
3550 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 3930 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
@@ -3584,19 +3964,33 @@ static noinline int
3584wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 3964wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3585 u64 num_bytes) 3965 u64 num_bytes)
3586{ 3966{
3967 struct btrfs_caching_control *caching_ctl;
3587 DEFINE_WAIT(wait); 3968 DEFINE_WAIT(wait);
3588 3969
3589 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); 3970 caching_ctl = get_caching_control(cache);
3590 3971 if (!caching_ctl)
3591 if (block_group_cache_done(cache)) {
3592 finish_wait(&cache->caching_q, &wait);
3593 return 0; 3972 return 0;
3594 }
3595 schedule();
3596 finish_wait(&cache->caching_q, &wait);
3597 3973
3598 wait_event(cache->caching_q, block_group_cache_done(cache) || 3974 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
3599 (cache->free_space >= num_bytes)); 3975 (cache->free_space >= num_bytes));
3976
3977 put_caching_control(caching_ctl);
3978 return 0;
3979}
3980
3981static noinline int
3982wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
3983{
3984 struct btrfs_caching_control *caching_ctl;
3985 DEFINE_WAIT(wait);
3986
3987 caching_ctl = get_caching_control(cache);
3988 if (!caching_ctl)
3989 return 0;
3990
3991 wait_event(caching_ctl->wait, block_group_cache_done(cache));
3992
3993 put_caching_control(caching_ctl);
3600 return 0; 3994 return 0;
3601} 3995}
3602 3996
@@ -3634,6 +4028,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3634 int last_ptr_loop = 0; 4028 int last_ptr_loop = 0;
3635 int loop = 0; 4029 int loop = 0;
3636 bool found_uncached_bg = false; 4030 bool found_uncached_bg = false;
4031 bool failed_cluster_refill = false;
3637 4032
3638 WARN_ON(num_bytes < root->sectorsize); 4033 WARN_ON(num_bytes < root->sectorsize);
3639 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4034 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3731,7 +4126,16 @@ have_block_group:
3731 if (unlikely(block_group->ro)) 4126 if (unlikely(block_group->ro))
3732 goto loop; 4127 goto loop;
3733 4128
3734 if (last_ptr) { 4129 /*
 4130 * Ok, we want to try to use the cluster allocator, so let's look
 4131 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
 4132 * have tried the cluster allocator plenty of times at this
 4133 * point and not have found anything, so we are likely way too
 4134 * fragmented for the clustering stuff to find anything, so let's
 4135 * just skip it and let the allocator find whatever block it can
 4136 * find.
4137 */
4138 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
3735 /* 4139 /*
3736 * the refill lock keeps out other 4140 * the refill lock keeps out other
3737 * people trying to start a new cluster 4141 * people trying to start a new cluster
@@ -3806,9 +4210,11 @@ refill_cluster:
3806 spin_unlock(&last_ptr->refill_lock); 4210 spin_unlock(&last_ptr->refill_lock);
3807 goto checks; 4211 goto checks;
3808 } 4212 }
3809 } else if (!cached && loop > LOOP_CACHING_NOWAIT) { 4213 } else if (!cached && loop > LOOP_CACHING_NOWAIT
4214 && !failed_cluster_refill) {
3810 spin_unlock(&last_ptr->refill_lock); 4215 spin_unlock(&last_ptr->refill_lock);
3811 4216
4217 failed_cluster_refill = true;
3812 wait_block_group_cache_progress(block_group, 4218 wait_block_group_cache_progress(block_group,
3813 num_bytes + empty_cluster + empty_size); 4219 num_bytes + empty_cluster + empty_size);
3814 goto have_block_group; 4220 goto have_block_group;
@@ -3820,13 +4226,9 @@ refill_cluster:
3820 * cluster. Free the cluster we've been trying 4226 * cluster. Free the cluster we've been trying
3821 * to use, and go to the next block group 4227 * to use, and go to the next block group
3822 */ 4228 */
3823 if (loop < LOOP_NO_EMPTY_SIZE) { 4229 btrfs_return_cluster_to_free_space(NULL, last_ptr);
3824 btrfs_return_cluster_to_free_space(NULL,
3825 last_ptr);
3826 spin_unlock(&last_ptr->refill_lock);
3827 goto loop;
3828 }
3829 spin_unlock(&last_ptr->refill_lock); 4230 spin_unlock(&last_ptr->refill_lock);
4231 goto loop;
3830 } 4232 }
3831 4233
3832 offset = btrfs_find_space_for_alloc(block_group, search_start, 4234 offset = btrfs_find_space_for_alloc(block_group, search_start,
@@ -3880,9 +4282,12 @@ checks:
3880 search_start - offset); 4282 search_start - offset);
3881 BUG_ON(offset > search_start); 4283 BUG_ON(offset > search_start);
3882 4284
4285 update_reserved_extents(block_group, num_bytes, 1);
4286
3883 /* we are all good, lets return */ 4287 /* we are all good, lets return */
3884 break; 4288 break;
3885loop: 4289loop:
4290 failed_cluster_refill = false;
3886 btrfs_put_block_group(block_group); 4291 btrfs_put_block_group(block_group);
3887 } 4292 }
3888 up_read(&space_info->groups_sem); 4293 up_read(&space_info->groups_sem);
@@ -3940,21 +4345,32 @@ loop:
3940 return ret; 4345 return ret;
3941} 4346}
3942 4347
3943static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4348static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4349 int dump_block_groups)
3944{ 4350{
3945 struct btrfs_block_group_cache *cache; 4351 struct btrfs_block_group_cache *cache;
3946 4352
4353 spin_lock(&info->lock);
3947 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4354 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3948 (unsigned long long)(info->total_bytes - info->bytes_used - 4355 (unsigned long long)(info->total_bytes - info->bytes_used -
3949 info->bytes_pinned - info->bytes_reserved), 4356 info->bytes_pinned - info->bytes_reserved -
4357 info->bytes_super),
3950 (info->full) ? "" : "not "); 4358 (info->full) ? "" : "not ");
3951 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4359 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3952 " may_use=%llu, used=%llu\n", 4360 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4361 "\n",
3953 (unsigned long long)info->total_bytes, 4362 (unsigned long long)info->total_bytes,
3954 (unsigned long long)info->bytes_pinned, 4363 (unsigned long long)info->bytes_pinned,
3955 (unsigned long long)info->bytes_delalloc, 4364 (unsigned long long)info->bytes_delalloc,
3956 (unsigned long long)info->bytes_may_use, 4365 (unsigned long long)info->bytes_may_use,
3957 (unsigned long long)info->bytes_used); 4366 (unsigned long long)info->bytes_used,
4367 (unsigned long long)info->bytes_root,
4368 (unsigned long long)info->bytes_super,
4369 (unsigned long long)info->bytes_reserved);
4370 spin_unlock(&info->lock);
4371
4372 if (!dump_block_groups)
4373 return;
3958 4374
3959 down_read(&info->groups_sem); 4375 down_read(&info->groups_sem);
3960 list_for_each_entry(cache, &info->block_groups, list) { 4376 list_for_each_entry(cache, &info->block_groups, list) {
@@ -3972,12 +4388,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3972 up_read(&info->groups_sem); 4388 up_read(&info->groups_sem);
3973} 4389}
3974 4390
3975static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, 4391int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3976 struct btrfs_root *root, 4392 struct btrfs_root *root,
3977 u64 num_bytes, u64 min_alloc_size, 4393 u64 num_bytes, u64 min_alloc_size,
3978 u64 empty_size, u64 hint_byte, 4394 u64 empty_size, u64 hint_byte,
3979 u64 search_end, struct btrfs_key *ins, 4395 u64 search_end, struct btrfs_key *ins,
3980 u64 data) 4396 u64 data)
3981{ 4397{
3982 int ret; 4398 int ret;
3983 u64 search_start = 0; 4399 u64 search_start = 0;
@@ -4022,7 +4438,7 @@ again:
4022 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4438 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4023 "wanted %llu\n", (unsigned long long)data, 4439 "wanted %llu\n", (unsigned long long)data,
4024 (unsigned long long)num_bytes); 4440 (unsigned long long)num_bytes);
4025 dump_space_info(sinfo, num_bytes); 4441 dump_space_info(sinfo, num_bytes, 1);
4026 } 4442 }
4027 4443
4028 return ret; 4444 return ret;
@@ -4043,25 +4459,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4043 ret = btrfs_discard_extent(root, start, len); 4459 ret = btrfs_discard_extent(root, start, len);
4044 4460
4045 btrfs_add_free_space(cache, start, len); 4461 btrfs_add_free_space(cache, start, len);
4462 update_reserved_extents(cache, len, 0);
4046 btrfs_put_block_group(cache); 4463 btrfs_put_block_group(cache);
4047 update_reserved_extents(root, start, len, 0);
4048
4049 return ret;
4050}
4051
4052int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4053 struct btrfs_root *root,
4054 u64 num_bytes, u64 min_alloc_size,
4055 u64 empty_size, u64 hint_byte,
4056 u64 search_end, struct btrfs_key *ins,
4057 u64 data)
4058{
4059 int ret;
4060 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
4061 empty_size, hint_byte, search_end, ins,
4062 data);
4063 if (!ret)
4064 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4065 4464
4066 return ret; 4465 return ret;
4067} 4466}
@@ -4222,15 +4621,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4222{ 4621{
4223 int ret; 4622 int ret;
4224 struct btrfs_block_group_cache *block_group; 4623 struct btrfs_block_group_cache *block_group;
4624 struct btrfs_caching_control *caching_ctl;
4625 u64 start = ins->objectid;
4626 u64 num_bytes = ins->offset;
4225 4627
4226 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4628 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4227 cache_block_group(block_group); 4629 cache_block_group(block_group);
4228 wait_event(block_group->caching_q, 4630 caching_ctl = get_caching_control(block_group);
4229 block_group_cache_done(block_group));
4230 4631
4231 ret = btrfs_remove_free_space(block_group, ins->objectid, 4632 if (!caching_ctl) {
4232 ins->offset); 4633 BUG_ON(!block_group_cache_done(block_group));
4233 BUG_ON(ret); 4634 ret = btrfs_remove_free_space(block_group, start, num_bytes);
4635 BUG_ON(ret);
4636 } else {
4637 mutex_lock(&caching_ctl->mutex);
4638
4639 if (start >= caching_ctl->progress) {
4640 ret = add_excluded_extent(root, start, num_bytes);
4641 BUG_ON(ret);
4642 } else if (start + num_bytes <= caching_ctl->progress) {
4643 ret = btrfs_remove_free_space(block_group,
4644 start, num_bytes);
4645 BUG_ON(ret);
4646 } else {
4647 num_bytes = caching_ctl->progress - start;
4648 ret = btrfs_remove_free_space(block_group,
4649 start, num_bytes);
4650 BUG_ON(ret);
4651
4652 start = caching_ctl->progress;
4653 num_bytes = ins->objectid + ins->offset -
4654 caching_ctl->progress;
4655 ret = add_excluded_extent(root, start, num_bytes);
4656 BUG_ON(ret);
4657 }
4658
4659 mutex_unlock(&caching_ctl->mutex);
4660 put_caching_control(caching_ctl);
4661 }
4662
4663 update_reserved_extents(block_group, ins->offset, 1);
4234 btrfs_put_block_group(block_group); 4664 btrfs_put_block_group(block_group);
4235 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 4665 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4236 0, owner, offset, ins, 1); 4666 0, owner, offset, ins, 1);
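
The three branches above split the logged extent around the caching scan's position: a fully unscanned range is recorded as excluded, a fully scanned range is removed from free space, and a straddling range is cut in two at caching_ctl->progress. The split on its own:

#include <stdio.h>
#include <stdint.h>

/* classify [start, start + num) against the scan position */
static void split_at_progress(uint64_t start, uint64_t num, uint64_t progress)
{
        if (start >= progress)
                printf("exclude [%llu, +%llu)\n",
                       (unsigned long long)start, (unsigned long long)num);
        else if (start + num <= progress)
                printf("remove  [%llu, +%llu)\n",
                       (unsigned long long)start, (unsigned long long)num);
        else {
                printf("remove  [%llu, +%llu)\n",
                       (unsigned long long)start,
                       (unsigned long long)(progress - start));
                printf("exclude [%llu, +%llu)\n",
                       (unsigned long long)progress,
                       (unsigned long long)(start + num - progress));
        }
}

int main(void)
{
        split_at_progress(0, 8192, 4096); /* straddles: half and half */
        return 0;
}
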
@@ -4254,9 +4684,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4254 int ret; 4684 int ret;
4255 u64 flags = 0; 4685 u64 flags = 0;
4256 4686
4257 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4687 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4258 empty_size, hint_byte, search_end, 4688 empty_size, hint_byte, search_end,
4259 ins, 0); 4689 ins, 0);
4260 if (ret) 4690 if (ret)
4261 return ret; 4691 return ret;
4262 4692
@@ -4267,7 +4697,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4267 } else 4697 } else
4268 BUG_ON(parent > 0); 4698 BUG_ON(parent > 0);
4269 4699
4270 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4271 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4700 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4272 struct btrfs_delayed_extent_op *extent_op; 4701 struct btrfs_delayed_extent_op *extent_op;
4273 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 4702 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
@@ -4346,452 +4775,99 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4346 return buf; 4775 return buf;
4347} 4776}
4348 4777
4349#if 0 4778struct walk_control {
4350int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 4779 u64 refs[BTRFS_MAX_LEVEL];
4351 struct btrfs_root *root, struct extent_buffer *leaf) 4780 u64 flags[BTRFS_MAX_LEVEL];
4352{ 4781 struct btrfs_key update_progress;
4353 u64 disk_bytenr; 4782 int stage;
4354 u64 num_bytes; 4783 int level;
4355 struct btrfs_key key; 4784 int shared_level;
4356 struct btrfs_file_extent_item *fi; 4785 int update_ref;
4357 u32 nritems; 4786 int keep_locks;
4358 int i; 4787 int reada_slot;
4359 int ret; 4788 int reada_count;
4360 4789};
4361 BUG_ON(!btrfs_is_leaf(leaf));
4362 nritems = btrfs_header_nritems(leaf);
4363
4364 for (i = 0; i < nritems; i++) {
4365 cond_resched();
4366 btrfs_item_key_to_cpu(leaf, &key, i);
4367
4368 /* only extents have references, skip everything else */
4369 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4370 continue;
4371
4372 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4373
4374 /* inline extents live in the btree, they don't have refs */
4375 if (btrfs_file_extent_type(leaf, fi) ==
4376 BTRFS_FILE_EXTENT_INLINE)
4377 continue;
4378
4379 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4380
4381 /* holes don't have refs */
4382 if (disk_bytenr == 0)
4383 continue;
4384
4385 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4386 ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
4387 leaf->start, 0, key.objectid, 0);
4388 BUG_ON(ret);
4389 }
4390 return 0;
4391}
4392
4393static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
4394 struct btrfs_root *root,
4395 struct btrfs_leaf_ref *ref)
4396{
4397 int i;
4398 int ret;
4399 struct btrfs_extent_info *info;
4400 struct refsort *sorted;
4401
4402 if (ref->nritems == 0)
4403 return 0;
4404
4405 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
4406 for (i = 0; i < ref->nritems; i++) {
4407 sorted[i].bytenr = ref->extents[i].bytenr;
4408 sorted[i].slot = i;
4409 }
4410 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
4411
4412 /*
4413 * the items in the ref were sorted when the ref was inserted
4414 * into the ref cache, so this is already in order
4415 */
4416 for (i = 0; i < ref->nritems; i++) {
4417 info = ref->extents + sorted[i].slot;
4418 ret = btrfs_free_extent(trans, root, info->bytenr,
4419 info->num_bytes, ref->bytenr,
4420 ref->owner, ref->generation,
4421 info->objectid, 0);
4422
4423 atomic_inc(&root->fs_info->throttle_gen);
4424 wake_up(&root->fs_info->transaction_throttle);
4425 cond_resched();
4426
4427 BUG_ON(ret);
4428 info++;
4429 }
4430
4431 kfree(sorted);
4432 return 0;
4433}
4434
4435
4436static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
4437 struct btrfs_root *root, u64 start,
4438 u64 len, u32 *refs)
4439{
4440 int ret;
4441
4442 ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
4443 BUG_ON(ret);
4444
4445#if 0 /* some debugging code in case we see problems here */
4446 /* if the refs count is one, it won't get increased again. But
4447 * if the ref count is > 1, someone may be decreasing it at
4448 * the same time we are.
4449 */
4450 if (*refs != 1) {
4451 struct extent_buffer *eb = NULL;
4452 eb = btrfs_find_create_tree_block(root, start, len);
4453 if (eb)
4454 btrfs_tree_lock(eb);
4455
4456 mutex_lock(&root->fs_info->alloc_mutex);
4457 ret = lookup_extent_ref(NULL, root, start, len, refs);
4458 BUG_ON(ret);
4459 mutex_unlock(&root->fs_info->alloc_mutex);
4460
4461 if (eb) {
4462 btrfs_tree_unlock(eb);
4463 free_extent_buffer(eb);
4464 }
4465 if (*refs == 1) {
4466 printk(KERN_ERR "btrfs block %llu went down to one "
4467 "during drop_snap\n", (unsigned long long)start);
4468 }
4469
4470 }
4471#endif
4472
4473 cond_resched();
4474 return ret;
4475}
4476 4790
4791#define DROP_REFERENCE 1
4792#define UPDATE_BACKREF 2
4477 4793
4478/* 4794static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4479 * this is used while deleting old snapshots, and it drops the refs 4795 struct btrfs_root *root,
4480 * on a whole subtree starting from a level 1 node. 4796 struct walk_control *wc,
4481 * 4797 struct btrfs_path *path)
4482 * The idea is to sort all the leaf pointers, and then drop the
4483 * ref on all the leaves in order. Most of the time the leaves
4484 * will have ref cache entries, so no leaf IOs will be required to
4485 * find the extents they have references on.
4486 *
4487 * For each leaf, any references it has are also dropped in order
4488 *
4489 * This ends up dropping the references in something close to optimal
4490 * order for reading and modifying the extent allocation tree.
4491 */
4492static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4493 struct btrfs_root *root,
4494 struct btrfs_path *path)
4495{ 4798{
4496 u64 bytenr; 4799 u64 bytenr;
4497 u64 root_owner; 4800 u64 generation;
4498 u64 root_gen; 4801 u64 refs;
4499 struct extent_buffer *eb = path->nodes[1]; 4802 u64 last = 0;
4500 struct extent_buffer *leaf; 4803 u32 nritems;
4501 struct btrfs_leaf_ref *ref; 4804 u32 blocksize;
4502 struct refsort *sorted = NULL; 4805 struct btrfs_key key;
4503 int nritems = btrfs_header_nritems(eb); 4806 struct extent_buffer *eb;
4504 int ret; 4807 int ret;
4505 int i; 4808 int slot;
4506 int refi = 0; 4809 int nread = 0;
4507 int slot = path->slots[1];
4508 u32 blocksize = btrfs_level_size(root, 0);
4509 u32 refs;
4510
4511 if (nritems == 0)
4512 goto out;
4513
4514 root_owner = btrfs_header_owner(eb);
4515 root_gen = btrfs_header_generation(eb);
4516 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
4517 4810
4518 /* 4811 if (path->slots[wc->level] < wc->reada_slot) {
4519 * step one, sort all the leaf pointers so we don't scribble 4812 wc->reada_count = wc->reada_count * 2 / 3;
4520 * randomly into the extent allocation tree 4813 wc->reada_count = max(wc->reada_count, 2);
4521 */ 4814 } else {
4522 for (i = slot; i < nritems; i++) { 4815 wc->reada_count = wc->reada_count * 3 / 2;
4523 sorted[refi].bytenr = btrfs_node_blockptr(eb, i); 4816 wc->reada_count = min_t(int, wc->reada_count,
4524 sorted[refi].slot = i; 4817 BTRFS_NODEPTRS_PER_BLOCK(root));
4525 refi++;
4526 } 4818 }
4527 4819
4528 /* 4820 eb = path->nodes[wc->level];
4529 * nritems won't be zero, but if we're picking up drop_snapshot 4821 nritems = btrfs_header_nritems(eb);
4530 * after a crash, slot might be > 0, so double check things 4822 blocksize = btrfs_level_size(root, wc->level - 1);
4531 * just in case.
4532 */
4533 if (refi == 0)
4534 goto out;
4535 4823
4536 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); 4824 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
4825 if (nread >= wc->reada_count)
4826 break;
4537 4827
4538 /* 4828 cond_resched();
4539 * the first loop frees everything the leaves point to 4829 bytenr = btrfs_node_blockptr(eb, slot);
4540 */ 4830 generation = btrfs_node_ptr_generation(eb, slot);
4541 for (i = 0; i < refi; i++) {
4542 u64 ptr_gen;
4543 4831
4544 bytenr = sorted[i].bytenr; 4832 if (slot == path->slots[wc->level])
4833 goto reada;
4545 4834
4546 /* 4835 if (wc->stage == UPDATE_BACKREF &&
4547 * check the reference count on this leaf. If it is > 1 4836 generation <= root->root_key.offset)
4548 * we just decrement it below and don't update any
4549 * of the refs the leaf points to.
4550 */
4551 ret = drop_snap_lookup_refcount(trans, root, bytenr,
4552 blocksize, &refs);
4553 BUG_ON(ret);
4554 if (refs != 1)
4555 continue; 4837 continue;
4556 4838
4557 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); 4839 if (wc->stage == DROP_REFERENCE) {
4558 4840 ret = btrfs_lookup_extent_info(trans, root,
4559 /* 4841 bytenr, blocksize,
4560 * the leaf only had one reference, which means the 4842 &refs, NULL);
4561 * only thing pointing to this leaf is the snapshot
4562 * we're deleting. It isn't possible for the reference
4563 * count to increase again later
4564 *
4565 * The reference cache is checked for the leaf,
4566 * and if found we'll be able to drop any refs held by
4567 * the leaf without needing to read it in.
4568 */
4569 ref = btrfs_lookup_leaf_ref(root, bytenr);
4570 if (ref && ref->generation != ptr_gen) {
4571 btrfs_free_leaf_ref(root, ref);
4572 ref = NULL;
4573 }
4574 if (ref) {
4575 ret = cache_drop_leaf_ref(trans, root, ref);
4576 BUG_ON(ret);
4577 btrfs_remove_leaf_ref(root, ref);
4578 btrfs_free_leaf_ref(root, ref);
4579 } else {
4580 /*
4581 * the leaf wasn't in the reference cache, so
4582 * we have to read it.
4583 */
4584 leaf = read_tree_block(root, bytenr, blocksize,
4585 ptr_gen);
4586 ret = btrfs_drop_leaf_ref(trans, root, leaf);
4587 BUG_ON(ret); 4843 BUG_ON(ret);
4588 free_extent_buffer(leaf); 4844 BUG_ON(refs == 0);
4589 } 4845 if (refs == 1)
4590 atomic_inc(&root->fs_info->throttle_gen); 4846 goto reada;
4591 wake_up(&root->fs_info->transaction_throttle);
4592 cond_resched();
4593 }
4594
4595 /*
4596 * run through the loop again to free the refs on the leaves.
4597 * This is faster than doing it in the loop above because
4598 * the leaves are likely to be clustered together. We end up
4599 * working in nice chunks on the extent allocation tree.
4600 */
4601 for (i = 0; i < refi; i++) {
4602 bytenr = sorted[i].bytenr;
4603 ret = btrfs_free_extent(trans, root, bytenr,
4604 blocksize, eb->start,
4605 root_owner, root_gen, 0, 1);
4606 BUG_ON(ret);
4607
4608 atomic_inc(&root->fs_info->throttle_gen);
4609 wake_up(&root->fs_info->transaction_throttle);
4610 cond_resched();
4611 }
4612out:
4613 kfree(sorted);
4614
4615 /*
4616 * update the path to show we've processed the entire level 1
4617 * node. This will get saved into the root's drop_snapshot_progress
4618 * field so these drops are not repeated again if this transaction
4619 * commits.
4620 */
4621 path->slots[1] = nritems;
4622 return 0;
4623}
4624
4625/*
4626 * helper function for drop_snapshot, this walks down the tree dropping ref
4627 * counts as it goes.
4628 */
4629static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4630 struct btrfs_root *root,
4631 struct btrfs_path *path, int *level)
4632{
4633 u64 root_owner;
4634 u64 root_gen;
4635 u64 bytenr;
4636 u64 ptr_gen;
4637 struct extent_buffer *next;
4638 struct extent_buffer *cur;
4639 struct extent_buffer *parent;
4640 u32 blocksize;
4641 int ret;
4642 u32 refs;
4643
4644 WARN_ON(*level < 0);
4645 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4646 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4647 path->nodes[*level]->len, &refs);
4648 BUG_ON(ret);
4649 if (refs > 1)
4650 goto out;
4651
4652 /*
4653 * walk down to the last node level and free all the leaves
4654 */
4655 while (*level >= 0) {
4656 WARN_ON(*level < 0);
4657 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4658 cur = path->nodes[*level];
4659 4847
4660 if (btrfs_header_level(cur) != *level) 4848 if (!wc->update_ref ||
4661 WARN_ON(1); 4849 generation <= root->root_key.offset)
4662 4850 continue;
4663 if (path->slots[*level] >= 4851 btrfs_node_key_to_cpu(eb, &key, slot);
4664 btrfs_header_nritems(cur)) 4852 ret = btrfs_comp_cpu_keys(&key,
4665 break; 4853 &wc->update_progress);
4666 4854 if (ret < 0)
4667 /* the new code goes down to level 1 and does all the 4855 continue;
4668 * leaves pointed to that node in bulk. So, this check
4669 * for level 0 will always be false.
4670 *
4671 * But, the disk format allows the drop_snapshot_progress
4672 * field in the root to leave things in a state where
4673 * a leaf will need cleaning up here. If someone crashes
4674 * with the old code and then boots with the new code,
4675 * we might find a leaf here.
4676 */
4677 if (*level == 0) {
4678 ret = btrfs_drop_leaf_ref(trans, root, cur);
4679 BUG_ON(ret);
4680 break;
4681 } 4856 }
4682 4857reada:
4683 /* 4858 ret = readahead_tree_block(root, bytenr, blocksize,
4684 * once we get to level one, process the whole node 4859 generation);
4685 * at once, including everything below it. 4860 if (ret)
4686 */
4687 if (*level == 1) {
4688 ret = drop_level_one_refs(trans, root, path);
4689 BUG_ON(ret);
4690 break; 4861 break;
4691 } 4862 last = bytenr + blocksize;
4692 4863 nread++;
4693 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4694 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4695 blocksize = btrfs_level_size(root, *level - 1);
4696
4697 ret = drop_snap_lookup_refcount(trans, root, bytenr,
4698 blocksize, &refs);
4699 BUG_ON(ret);
4700
4701 /*
4702 * if there is more than one reference, we don't need
4703 * to read that node to drop any references it has. We
4704 * just drop the ref we hold on that node and move on to the
4705 * next slot in this level.
4706 */
4707 if (refs != 1) {
4708 parent = path->nodes[*level];
4709 root_owner = btrfs_header_owner(parent);
4710 root_gen = btrfs_header_generation(parent);
4711 path->slots[*level]++;
4712
4713 ret = btrfs_free_extent(trans, root, bytenr,
4714 blocksize, parent->start,
4715 root_owner, root_gen,
4716 *level - 1, 1);
4717 BUG_ON(ret);
4718
4719 atomic_inc(&root->fs_info->throttle_gen);
4720 wake_up(&root->fs_info->transaction_throttle);
4721 cond_resched();
4722
4723 continue;
4724 }
4725
4726 /*
4727 * we need to keep freeing things in the next level down.
4728 * read the block and loop around to process it
4729 */
4730 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4731 WARN_ON(*level <= 0);
4732 if (path->nodes[*level-1])
4733 free_extent_buffer(path->nodes[*level-1]);
4734 path->nodes[*level-1] = next;
4735 *level = btrfs_header_level(next);
4736 path->slots[*level] = 0;
4737 cond_resched();
4738 } 4864 }
4739out: 4865 wc->reada_slot = slot;
4740 WARN_ON(*level < 0);
4741 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4742
4743 if (path->nodes[*level] == root->node) {
4744 parent = path->nodes[*level];
4745 bytenr = path->nodes[*level]->start;
4746 } else {
4747 parent = path->nodes[*level + 1];
4748 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4749 }
4750
4751 blocksize = btrfs_level_size(root, *level);
4752 root_owner = btrfs_header_owner(parent);
4753 root_gen = btrfs_header_generation(parent);
4754
4755 /*
4756 * cleanup and free the reference on the last node
4757 * we processed
4758 */
4759 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4760 parent->start, root_owner, root_gen,
4761 *level, 1);
4762 free_extent_buffer(path->nodes[*level]);
4763 path->nodes[*level] = NULL;
4764
4765 *level += 1;
4766 BUG_ON(ret);
4767
4768 cond_resched();
4769 return 0;
4770} 4866}
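
The reada_count adjustment at the top of reada_walk_down() is a small multiplicative feedback loop: re-entering the window shrinks it to 2/3 (never below 2), running past it grows it by 3/2, capped at the pointers-per-block limit. Its behaviour is easy to see in isolation; the cap below is an assumed stand-in for BTRFS_NODEPTRS_PER_BLOCK:

#include <stdio.h>

#define MIN_COUNT 2
#define MAX_COUNT 128   /* illustrative cap only */

static int adjust(int count, int still_inside_window)
{
        if (still_inside_window)
                count = count * 2 / 3;  /* readahead was too eager */
        else
                count = count * 3 / 2;  /* walker caught up, read more */
        if (count < MIN_COUNT)
                count = MIN_COUNT;
        if (count > MAX_COUNT)
                count = MAX_COUNT;
        return count;
}

int main(void)
{
        int c = MIN_COUNT, i;

        /* keep outrunning the window: 3 4 6 9 13 19 28 42 63 94 */
        for (i = 0; i < 10; i++) {
                c = adjust(c, 0);
                printf("%d ", c);
        }
        printf("\n");
        return 0;
}
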
4771#endif
4772
4773struct walk_control {
4774 u64 refs[BTRFS_MAX_LEVEL];
4775 u64 flags[BTRFS_MAX_LEVEL];
4776 struct btrfs_key update_progress;
4777 int stage;
4778 int level;
4779 int shared_level;
4780 int update_ref;
4781 int keep_locks;
4782};
4783
4784#define DROP_REFERENCE 1
4785#define UPDATE_BACKREF 2
4786 4867
4787/* 4868/*
 4788 * helper to process tree block while walking down the tree. 4869 * helper to process tree block while walking down the tree.
4789 * 4870 *
4790 * when wc->stage == DROP_REFERENCE, this function checks
4791 * reference count of the block. if the block is shared and
4792 * we need update back refs for the subtree rooted at the
4793 * block, this function changes wc->stage to UPDATE_BACKREF
4794 *
4795 * when wc->stage == UPDATE_BACKREF, this function updates 4871 * when wc->stage == UPDATE_BACKREF, this function updates
4796 * back refs for pointers in the block. 4872 * back refs for pointers in the block.
4797 * 4873 *
@@ -4804,7 +4880,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4804{ 4880{
4805 int level = wc->level; 4881 int level = wc->level;
4806 struct extent_buffer *eb = path->nodes[level]; 4882 struct extent_buffer *eb = path->nodes[level];
4807 struct btrfs_key key;
4808 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 4883 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4809 int ret; 4884 int ret;
4810 4885
@@ -4827,21 +4902,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4827 BUG_ON(wc->refs[level] == 0); 4902 BUG_ON(wc->refs[level] == 0);
4828 } 4903 }
4829 4904
4830 if (wc->stage == DROP_REFERENCE &&
4831 wc->update_ref && wc->refs[level] > 1) {
4832 BUG_ON(eb == root->node);
4833 BUG_ON(path->slots[level] > 0);
4834 if (level == 0)
4835 btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4836 else
4837 btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4838 if (btrfs_header_owner(eb) == root->root_key.objectid &&
4839 btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4840 wc->stage = UPDATE_BACKREF;
4841 wc->shared_level = level;
4842 }
4843 }
4844
4845 if (wc->stage == DROP_REFERENCE) { 4905 if (wc->stage == DROP_REFERENCE) {
4846 if (wc->refs[level] > 1) 4906 if (wc->refs[level] > 1)
4847 return 1; 4907 return 1;
@@ -4878,6 +4938,123 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4878} 4938}
4879 4939
4880/* 4940/*
 4941 * helper to process tree block pointer.
 4942 *
 4943 * when wc->stage == DROP_REFERENCE, this function checks
 4944 * reference count of the block pointed to. if the block
 4945 * is shared and we need to update back refs for the subtree
 4946 * rooted at the block, this function changes wc->stage to
 4947 * UPDATE_BACKREF. if the block is shared and there is no
 4948 * need to update back refs, this function drops the reference
 4949 * to the block.
4950 *
4951 * NOTE: return value 1 means we should stop walking down.
4952 */
4953static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4954 struct btrfs_root *root,
4955 struct btrfs_path *path,
4956 struct walk_control *wc)
4957{
4958 u64 bytenr;
4959 u64 generation;
4960 u64 parent;
4961 u32 blocksize;
4962 struct btrfs_key key;
4963 struct extent_buffer *next;
4964 int level = wc->level;
4965 int reada = 0;
4966 int ret = 0;
4967
4968 generation = btrfs_node_ptr_generation(path->nodes[level],
4969 path->slots[level]);
4970 /*
4971 * if the lower level block was created before the snapshot
4972 * was created, we know there is no need to update back refs
4973 * for the subtree
4974 */
4975 if (wc->stage == UPDATE_BACKREF &&
4976 generation <= root->root_key.offset)
4977 return 1;
4978
4979 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4980 blocksize = btrfs_level_size(root, level - 1);
4981
4982 next = btrfs_find_tree_block(root, bytenr, blocksize);
4983 if (!next) {
4984 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
4985 reada = 1;
4986 }
4987 btrfs_tree_lock(next);
4988 btrfs_set_lock_blocking(next);
4989
4990 if (wc->stage == DROP_REFERENCE) {
4991 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4992 &wc->refs[level - 1],
4993 &wc->flags[level - 1]);
4994 BUG_ON(ret);
4995 BUG_ON(wc->refs[level - 1] == 0);
4996
4997 if (wc->refs[level - 1] > 1) {
4998 if (!wc->update_ref ||
4999 generation <= root->root_key.offset)
5000 goto skip;
5001
5002 btrfs_node_key_to_cpu(path->nodes[level], &key,
5003 path->slots[level]);
5004 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5005 if (ret < 0)
5006 goto skip;
5007
5008 wc->stage = UPDATE_BACKREF;
5009 wc->shared_level = level - 1;
5010 }
5011 }
5012
5013 if (!btrfs_buffer_uptodate(next, generation)) {
5014 btrfs_tree_unlock(next);
5015 free_extent_buffer(next);
5016 next = NULL;
5017 }
5018
5019 if (!next) {
5020 if (reada && level == 1)
5021 reada_walk_down(trans, root, wc, path);
5022 next = read_tree_block(root, bytenr, blocksize, generation);
5023 btrfs_tree_lock(next);
5024 btrfs_set_lock_blocking(next);
5025 }
5026
5027 level--;
5028 BUG_ON(level != btrfs_header_level(next));
5029 path->nodes[level] = next;
5030 path->slots[level] = 0;
5031 path->locks[level] = 1;
5032 wc->level = level;
5033 if (wc->level == 1)
5034 wc->reada_slot = 0;
5035 return 0;
5036skip:
5037 wc->refs[level - 1] = 0;
5038 wc->flags[level - 1] = 0;
5039
5040 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5041 parent = path->nodes[level]->start;
5042 } else {
5043 BUG_ON(root->root_key.objectid !=
5044 btrfs_header_owner(path->nodes[level]));
5045 parent = 0;
5046 }
5047
5048 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5049 root->root_key.objectid, level - 1, 0);
5050 BUG_ON(ret);
5051
5052 btrfs_tree_unlock(next);
5053 free_extent_buffer(next);
5054 return 1;
5055}
5056
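
The generation test near the top of do_walk_down() is the crucial short-circuit: a child block written before the snapshot existed (its generation is at most the transid recorded in the snapshot root's key offset) cannot need back ref updates, so the walk skips the whole subtree. A standalone sketch of just that decision, with hypothetical plain-C stand-ins for the kernel structs:

#include <stdint.h>
#include <stdio.h>

#define DROP_REFERENCE 1
#define UPDATE_BACKREF 2

struct walk_ctl { int stage; };  /* hypothetical stand-in for walk_control */

/*
 * returns 1 when the subtree can be skipped: in the UPDATE_BACKREF
 * stage, a block whose generation is <= the snapshot's creation
 * transid predates the snapshot and holds no stale back refs.
 */
static int skip_subtree(const struct walk_ctl *wc,
			uint64_t block_generation,
			uint64_t snapshot_transid)
{
	return wc->stage == UPDATE_BACKREF &&
	       block_generation <= snapshot_transid;
}

int main(void)
{
	struct walk_ctl wc = { .stage = UPDATE_BACKREF };
	printf("%d\n", skip_subtree(&wc, 100, 200)); /* 1: predates snapshot */
	printf("%d\n", skip_subtree(&wc, 300, 200)); /* 0: must walk down */
	return 0;
}
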
5057/*
 4881 * helper to process tree block while walking up the tree. 5058 * helper to process tree block while walking up the tree.
4882 * 5059 *
4883 * when wc->stage == DROP_REFERENCE, this function drops 5060 * when wc->stage == DROP_REFERENCE, this function drops
@@ -4904,7 +5081,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4904 if (level < wc->shared_level) 5081 if (level < wc->shared_level)
4905 goto out; 5082 goto out;
4906 5083
4907 BUG_ON(wc->refs[level] <= 1);
4908 ret = find_next_key(path, level + 1, &wc->update_progress); 5084 ret = find_next_key(path, level + 1, &wc->update_progress);
4909 if (ret > 0) 5085 if (ret > 0)
4910 wc->update_ref = 0; 5086 wc->update_ref = 0;
@@ -4935,8 +5111,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4935 path->locks[level] = 0; 5111 path->locks[level] = 0;
4936 return 1; 5112 return 1;
4937 } 5113 }
4938 } else {
4939 BUG_ON(level != 0);
4940 } 5114 }
4941 } 5115 }
4942 5116
@@ -4989,17 +5163,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4989 struct btrfs_path *path, 5163 struct btrfs_path *path,
4990 struct walk_control *wc) 5164 struct walk_control *wc)
4991{ 5165{
4992 struct extent_buffer *next;
4993 struct extent_buffer *cur;
4994 u64 bytenr;
4995 u64 ptr_gen;
4996 u32 blocksize;
4997 int level = wc->level; 5166 int level = wc->level;
4998 int ret; 5167 int ret;
4999 5168
5000 while (level >= 0) { 5169 while (level >= 0) {
5001 cur = path->nodes[level]; 5170 if (path->slots[level] >=
5002 BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); 5171 btrfs_header_nritems(path->nodes[level]))
5172 break;
5003 5173
5004 ret = walk_down_proc(trans, root, path, wc); 5174 ret = walk_down_proc(trans, root, path, wc);
5005 if (ret > 0) 5175 if (ret > 0)
@@ -5008,20 +5178,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5008 if (level == 0) 5178 if (level == 0)
5009 break; 5179 break;
5010 5180
5011 bytenr = btrfs_node_blockptr(cur, path->slots[level]); 5181 ret = do_walk_down(trans, root, path, wc);
5012 blocksize = btrfs_level_size(root, level - 1); 5182 if (ret > 0) {
5013 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); 5183 path->slots[level]++;
5014 5184 continue;
5015 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 5185 }
5016 btrfs_tree_lock(next); 5186 level = wc->level;
5017 btrfs_set_lock_blocking(next);
5018
5019 level--;
5020 BUG_ON(level != btrfs_header_level(next));
5021 path->nodes[level] = next;
5022 path->slots[level] = 0;
5023 path->locks[level] = 1;
5024 wc->level = level;
5025 } 5187 }
5026 return 0; 5188 return 0;
5027} 5189}
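
With do_walk_down() factored out, walk_down_tree() is reduced to a driver loop: walk_down_proc() handles the current block, do_walk_down() follows one child pointer, and a positive return from the latter means "subtree skipped, advance to the next slot". A compressed, runnable model of that control flow (the stub functions are hypothetical):

#include <stdio.h>

/* hypothetical stubs for walk_down_proc() and do_walk_down() */
static int process_block(int level) { return level == 3; } /* stop at 3 */
static int descend(int level)       { return level == 1; } /* skip at 1 */

int main(void)
{
	int level = 4;
	int slots[8] = { 0 };

	while (level >= 0) {
		if (process_block(level) > 0)
			break;          /* block finished, stop descending */
		if (level == 0)
			break;          /* reached a leaf */
		if (descend(level) > 0) {
			slots[level]++; /* child skipped: try next slot */
			continue;
		}
		level--;                /* stepped into the child */
	}
	printf("stopped at level %d, slot %d\n", level, slots[level]);
	return 0;
}
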
@@ -5111,9 +5273,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5111 err = ret; 5273 err = ret;
5112 goto out; 5274 goto out;
5113 } 5275 }
5114 btrfs_node_key_to_cpu(path->nodes[level], &key, 5276 WARN_ON(ret > 0);
5115 path->slots[level]);
5116 WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
5117 5277
5118 /* 5278 /*
5119 * unlock our path, this is safe because only this 5279 * unlock our path, this is safe because only this
@@ -5148,6 +5308,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5148 wc->stage = DROP_REFERENCE; 5308 wc->stage = DROP_REFERENCE;
5149 wc->update_ref = update_ref; 5309 wc->update_ref = update_ref;
5150 wc->keep_locks = 0; 5310 wc->keep_locks = 0;
5311 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5151 5312
5152 while (1) { 5313 while (1) {
5153 ret = walk_down_tree(trans, root, path, wc); 5314 ret = walk_down_tree(trans, root, path, wc);
@@ -5200,9 +5361,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5200 ret = btrfs_del_root(trans, tree_root, &root->root_key); 5361 ret = btrfs_del_root(trans, tree_root, &root->root_key);
5201 BUG_ON(ret); 5362 BUG_ON(ret);
5202 5363
5203 free_extent_buffer(root->node); 5364 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5204 free_extent_buffer(root->commit_root); 5365 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
5205 kfree(root); 5366 NULL, NULL);
5367 BUG_ON(ret < 0);
5368 if (ret > 0) {
5369 ret = btrfs_del_orphan_item(trans, tree_root,
5370 root->root_key.objectid);
5371 BUG_ON(ret);
5372 }
5373 }
5374
5375 if (root->in_radix) {
5376 btrfs_free_fs_root(tree_root->fs_info, root);
5377 } else {
5378 free_extent_buffer(root->node);
5379 free_extent_buffer(root->commit_root);
5380 kfree(root);
5381 }
5206out: 5382out:
5207 btrfs_end_transaction(trans, tree_root); 5383 btrfs_end_transaction(trans, tree_root);
5208 kfree(wc); 5384 kfree(wc);
@@ -5254,6 +5430,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5254 wc->stage = DROP_REFERENCE; 5430 wc->stage = DROP_REFERENCE;
5255 wc->update_ref = 0; 5431 wc->update_ref = 0;
5256 wc->keep_locks = 1; 5432 wc->keep_locks = 1;
5433 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5257 5434
5258 while (1) { 5435 while (1) {
5259 wret = walk_down_tree(trans, root, path, wc); 5436 wret = walk_down_tree(trans, root, path, wc);
@@ -5396,9 +5573,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
5396 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 5573 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5397 while (1) { 5574 while (1) {
5398 int ret; 5575 int ret;
5399 spin_lock(&em_tree->lock); 5576 write_lock(&em_tree->lock);
5400 ret = add_extent_mapping(em_tree, em); 5577 ret = add_extent_mapping(em_tree, em);
5401 spin_unlock(&em_tree->lock); 5578 write_unlock(&em_tree->lock);
5402 if (ret != -EEXIST) { 5579 if (ret != -EEXIST) {
5403 free_extent_map(em); 5580 free_extent_map(em);
5404 break; 5581 break;
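
The locking change above (spin_lock to write_lock) tracks the extent map tree's move to a rwlock elsewhere in this merge; the loop around it is the familiar insert-or-retry idiom: attempt the insert under the writer lock and, on -EEXIST, evict the conflicting entry and try again. The same shape in a self-contained userspace sketch (a one-slot "tree"; all names hypothetical):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t em_lock = PTHREAD_RWLOCK_INITIALIZER;
static int slot_used;                /* one-slot stand-in for the tree */

static int add_mapping(void)         /* -EEXIST when the slot is taken */
{
	if (slot_used)
		return -EEXIST;
	slot_used = 1;
	return 0;
}

static void drop_conflicting(void) { slot_used = 0; }

int main(void)
{
	int ret;

	slot_used = 1;               /* simulate a stale conflicting entry */
	while (1) {
		pthread_rwlock_wrlock(&em_lock);
		ret = add_mapping();
		pthread_rwlock_unlock(&em_lock);
		if (ret != -EEXIST)
			break;       /* inserted: done */
		drop_conflicting();  /* evict the clash, then retry */
	}
	printf("insert ret = %d\n", ret);
	return 0;
}
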
@@ -6841,287 +7018,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
6841 return 0; 7018 return 0;
6842} 7019}
6843 7020
6844#if 0 7021/*
6845static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 7022 * checks to see if it's even possible to relocate this block group.
6846 struct btrfs_root *root, 7023 *
6847 u64 objectid, u64 size) 7024 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
6848{ 7025 * ok to go ahead and try.
6849 struct btrfs_path *path; 7026 */
6850 struct btrfs_inode_item *item; 7027int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6851 struct extent_buffer *leaf;
6852 int ret;
6853
6854 path = btrfs_alloc_path();
6855 if (!path)
6856 return -ENOMEM;
6857
6858 path->leave_spinning = 1;
6859 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6860 if (ret)
6861 goto out;
6862
6863 leaf = path->nodes[0];
6864 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
6865 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
6866 btrfs_set_inode_generation(leaf, item, 1);
6867 btrfs_set_inode_size(leaf, item, size);
6868 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
6869 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
6870 btrfs_mark_buffer_dirty(leaf);
6871 btrfs_release_path(root, path);
6872out:
6873 btrfs_free_path(path);
6874 return ret;
6875}
6876
6877static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
6878 struct btrfs_block_group_cache *group)
6879{ 7028{
6880 struct inode *inode = NULL; 7029 struct btrfs_block_group_cache *block_group;
6881 struct btrfs_trans_handle *trans; 7030 struct btrfs_space_info *space_info;
6882 struct btrfs_root *root; 7031 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6883 struct btrfs_key root_key; 7032 struct btrfs_device *device;
6884 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 7033 int full = 0;
6885 int err = 0; 7034 int ret = 0;
6886 7035
6887 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 7036 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
6888 root_key.type = BTRFS_ROOT_ITEM_KEY;
6889 root_key.offset = (u64)-1;
6890 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
6891 if (IS_ERR(root))
6892 return ERR_CAST(root);
6893 7037
6894 trans = btrfs_start_transaction(root, 1); 7038 /* odd, couldn't find the block group, leave it alone */
6895 BUG_ON(!trans); 7039 if (!block_group)
7040 return -1;
6896 7041
6897 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 7042 /* no bytes used, we're good */
6898 if (err) 7043 if (!btrfs_block_group_used(&block_group->item))
6899 goto out; 7044 goto out;
6900 7045
6901 err = __insert_orphan_inode(trans, root, objectid, group->key.offset); 7046 space_info = block_group->space_info;
6902 BUG_ON(err); 7047 spin_lock(&space_info->lock);
6903
6904 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
6905 group->key.offset, 0, group->key.offset,
6906 0, 0, 0);
6907 BUG_ON(err);
6908
6909 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
6910 if (inode->i_state & I_NEW) {
6911 BTRFS_I(inode)->root = root;
6912 BTRFS_I(inode)->location.objectid = objectid;
6913 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
6914 BTRFS_I(inode)->location.offset = 0;
6915 btrfs_read_locked_inode(inode);
6916 unlock_new_inode(inode);
6917 BUG_ON(is_bad_inode(inode));
6918 } else {
6919 BUG_ON(1);
6920 }
6921 BTRFS_I(inode)->index_cnt = group->key.objectid;
6922
6923 err = btrfs_orphan_add(trans, inode);
6924out:
6925 btrfs_end_transaction(trans, root);
6926 if (err) {
6927 if (inode)
6928 iput(inode);
6929 inode = ERR_PTR(err);
6930 }
6931 return inode;
6932}
6933
6934int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
6935{
6936
6937 struct btrfs_ordered_sum *sums;
6938 struct btrfs_sector_sum *sector_sum;
6939 struct btrfs_ordered_extent *ordered;
6940 struct btrfs_root *root = BTRFS_I(inode)->root;
6941 struct list_head list;
6942 size_t offset;
6943 int ret;
6944 u64 disk_bytenr;
6945
6946 INIT_LIST_HEAD(&list);
6947
6948 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
6949 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
6950
6951 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
6952 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
6953 disk_bytenr + len - 1, &list);
6954
6955 while (!list_empty(&list)) {
6956 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
6957 list_del_init(&sums->list);
6958
6959 sector_sum = sums->sums;
6960 sums->bytenr = ordered->start;
6961 7048
6962 offset = 0; 7049 full = space_info->full;
6963 while (offset < sums->len) {
6964 sector_sum->bytenr += ordered->start - disk_bytenr;
6965 sector_sum++;
6966 offset += root->sectorsize;
6967 }
6968 7050
6969 btrfs_add_ordered_sum(inode, ordered, sums); 7051 /*
7052 * if this is the last block group we have in this space, we can't
7053 * relocate it unless we're able to allocate a new chunk below.
7054 *
7055 * Otherwise, we need to make sure we have room in the space to handle
7056 * all of the extents from this block group. If we can, we're good
7057 */
7058 if ((space_info->total_bytes != block_group->key.offset) &&
7059 (space_info->bytes_used + space_info->bytes_reserved +
7060 space_info->bytes_pinned + space_info->bytes_readonly +
7061 btrfs_block_group_used(&block_group->item) <
7062 space_info->total_bytes)) {
7063 spin_unlock(&space_info->lock);
7064 goto out;
6970 } 7065 }
6971 btrfs_put_ordered_extent(ordered); 7066 spin_unlock(&space_info->lock);
6972 return 0;
6973}
6974
6975int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
6976{
6977 struct btrfs_trans_handle *trans;
6978 struct btrfs_path *path;
6979 struct btrfs_fs_info *info = root->fs_info;
6980 struct extent_buffer *leaf;
6981 struct inode *reloc_inode;
6982 struct btrfs_block_group_cache *block_group;
6983 struct btrfs_key key;
6984 u64 skipped;
6985 u64 cur_byte;
6986 u64 total_found;
6987 u32 nritems;
6988 int ret;
6989 int progress;
6990 int pass = 0;
6991
6992 root = root->fs_info->extent_root;
6993
6994 block_group = btrfs_lookup_block_group(info, group_start);
6995 BUG_ON(!block_group);
6996
6997 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
6998 (unsigned long long)block_group->key.objectid,
6999 (unsigned long long)block_group->flags);
7000
7001 path = btrfs_alloc_path();
7002 BUG_ON(!path);
7003
7004 reloc_inode = create_reloc_inode(info, block_group);
7005 BUG_ON(IS_ERR(reloc_inode));
7006
7007 __alloc_chunk_for_shrink(root, block_group, 1);
7008 set_block_group_readonly(block_group);
7009
7010 btrfs_start_delalloc_inodes(info->tree_root);
7011 btrfs_wait_ordered_extents(info->tree_root, 0);
7012again:
7013 skipped = 0;
7014 total_found = 0;
7015 progress = 0;
7016 key.objectid = block_group->key.objectid;
7017 key.offset = 0;
7018 key.type = 0;
7019 cur_byte = key.objectid;
7020
7021 trans = btrfs_start_transaction(info->tree_root, 1);
7022 btrfs_commit_transaction(trans, info->tree_root);
7023 7067
7024 mutex_lock(&root->fs_info->cleaner_mutex); 7068 /*
7025 btrfs_clean_old_snapshots(info->tree_root); 7069 * ok we don't have enough space, but maybe we have free space on our
7026 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 7070 * devices to allocate new chunks for relocation, so loop through our
7027 mutex_unlock(&root->fs_info->cleaner_mutex); 7071 * alloc devices and guess if we have enough space. However, if we
7072 * were marked as full, then we know there aren't enough chunks, and we
7073 * can just return.
7074 */
7075 ret = -1;
7076 if (full)
7077 goto out;
7028 7078
7029 trans = btrfs_start_transaction(info->tree_root, 1); 7079 mutex_lock(&root->fs_info->chunk_mutex);
7030 btrfs_commit_transaction(trans, info->tree_root); 7080 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7081 u64 min_free = btrfs_block_group_used(&block_group->item);
7082 u64 dev_offset, max_avail;
7031 7083
7032 while (1) { 7084 /*
7033 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7085 * check to make sure we can actually find a chunk with enough
7034 if (ret < 0) 7086 * space to fit our block group in.
7035 goto out; 7087 */
7036next: 7088 if (device->total_bytes > device->bytes_used + min_free) {
7037 leaf = path->nodes[0]; 7089 ret = find_free_dev_extent(NULL, device, min_free,
7038 nritems = btrfs_header_nritems(leaf); 7090 &dev_offset, &max_avail);
7039 if (path->slots[0] >= nritems) { 7091 if (!ret)
7040 ret = btrfs_next_leaf(root, path);
7041 if (ret < 0)
7042 goto out;
7043 if (ret == 1) {
7044 ret = 0;
7045 break; 7092 break;
7046 } 7093 ret = -1;
7047 leaf = path->nodes[0];
7048 nritems = btrfs_header_nritems(leaf);
7049 }
7050
7051 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7052
7053 if (key.objectid >= block_group->key.objectid +
7054 block_group->key.offset)
7055 break;
7056
7057 if (progress && need_resched()) {
7058 btrfs_release_path(root, path);
7059 cond_resched();
7060 progress = 0;
7061 continue;
7062 }
7063 progress = 1;
7064
7065 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
7066 key.objectid + key.offset <= cur_byte) {
7067 path->slots[0]++;
7068 goto next;
7069 }
7070
7071 total_found++;
7072 cur_byte = key.objectid + key.offset;
7073 btrfs_release_path(root, path);
7074
7075 __alloc_chunk_for_shrink(root, block_group, 0);
7076 ret = relocate_one_extent(root, path, &key, block_group,
7077 reloc_inode, pass);
7078 BUG_ON(ret < 0);
7079 if (ret > 0)
7080 skipped++;
7081
7082 key.objectid = cur_byte;
7083 key.type = 0;
7084 key.offset = 0;
7085 }
7086
7087 btrfs_release_path(root, path);
7088
7089 if (pass == 0) {
7090 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
7091 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
7092 }
7093
7094 if (total_found > 0) {
7095 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
7096 (unsigned long long)total_found, pass);
7097 pass++;
7098 if (total_found == skipped && pass > 2) {
7099 iput(reloc_inode);
7100 reloc_inode = create_reloc_inode(info, block_group);
7101 pass = 0;
7102 } 7094 }
7103 goto again;
7104 } 7095 }
7105 7096 mutex_unlock(&root->fs_info->chunk_mutex);
7106 /* delete reloc_inode */
7107 iput(reloc_inode);
7108
7109 /* unpin extents in this range */
7110 trans = btrfs_start_transaction(info->tree_root, 1);
7111 btrfs_commit_transaction(trans, info->tree_root);
7112
7113 spin_lock(&block_group->lock);
7114 WARN_ON(block_group->pinned > 0);
7115 WARN_ON(block_group->reserved > 0);
7116 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
7117 spin_unlock(&block_group->lock);
7118 btrfs_put_block_group(block_group);
7119 ret = 0;
7120out: 7097out:
7121 btrfs_free_path(path); 7098 btrfs_put_block_group(block_group);
7122 return ret; 7099 return ret;
7123} 7100}
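
The heart of btrfs_can_relocate() is one inequality: unless this is the only block group in its space, relocation is allowed when used + reserved + pinned + readonly plus this group's own used bytes still fits under total_bytes. A worked sketch of that check with made-up numbers (field names mirror the function above):

#include <stdint.h>
#include <stdio.h>

struct space_info { uint64_t total, used, reserved, pinned, readonly_b; };

static int can_relocate(const struct space_info *s,
			uint64_t group_size, uint64_t group_used)
{
	if (s->total == group_size)      /* last group in this space */
		return 0;
	return s->used + s->reserved + s->pinned + s->readonly_b +
	       group_used < s->total;
}

int main(void)
{
	struct space_info s = {
		.total = 10ULL << 30, .used = 4ULL << 30,
		.reserved = 1ULL << 30, .pinned = 0, .readonly_b = 0,
	};
	/* a 1 GiB group with 512 MiB used: 5.5 GiB < 10 GiB, so ok */
	printf("%d\n", can_relocate(&s, 1ULL << 30, 512ULL << 20));
	return 0;
}
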
7124#endif
7125 7101
7126static int find_first_block_group(struct btrfs_root *root, 7102static int find_first_block_group(struct btrfs_root *root,
7127 struct btrfs_path *path, struct btrfs_key *key) 7103 struct btrfs_path *path, struct btrfs_key *key)
@@ -7164,8 +7140,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7164{ 7140{
7165 struct btrfs_block_group_cache *block_group; 7141 struct btrfs_block_group_cache *block_group;
7166 struct btrfs_space_info *space_info; 7142 struct btrfs_space_info *space_info;
7143 struct btrfs_caching_control *caching_ctl;
7167 struct rb_node *n; 7144 struct rb_node *n;
7168 7145
7146 down_write(&info->extent_commit_sem);
7147 while (!list_empty(&info->caching_block_groups)) {
7148 caching_ctl = list_entry(info->caching_block_groups.next,
7149 struct btrfs_caching_control, list);
7150 list_del(&caching_ctl->list);
7151 put_caching_control(caching_ctl);
7152 }
7153 up_write(&info->extent_commit_sem);
7154
7169 spin_lock(&info->block_group_cache_lock); 7155 spin_lock(&info->block_group_cache_lock);
7170 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 7156 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7171 block_group = rb_entry(n, struct btrfs_block_group_cache, 7157 block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -7179,8 +7165,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7179 up_write(&block_group->space_info->groups_sem); 7165 up_write(&block_group->space_info->groups_sem);
7180 7166
7181 if (block_group->cached == BTRFS_CACHE_STARTED) 7167 if (block_group->cached == BTRFS_CACHE_STARTED)
7182 wait_event(block_group->caching_q, 7168 wait_block_group_cache_done(block_group);
7183 block_group_cache_done(block_group));
7184 7169
7185 btrfs_remove_free_space_cache(block_group); 7170 btrfs_remove_free_space_cache(block_group);
7186 7171
@@ -7250,7 +7235,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7250 spin_lock_init(&cache->lock); 7235 spin_lock_init(&cache->lock);
7251 spin_lock_init(&cache->tree_lock); 7236 spin_lock_init(&cache->tree_lock);
7252 cache->fs_info = info; 7237 cache->fs_info = info;
7253 init_waitqueue_head(&cache->caching_q);
7254 INIT_LIST_HEAD(&cache->list); 7238 INIT_LIST_HEAD(&cache->list);
7255 INIT_LIST_HEAD(&cache->cluster_list); 7239 INIT_LIST_HEAD(&cache->cluster_list);
7256 7240
@@ -7272,8 +7256,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7272 cache->flags = btrfs_block_group_flags(&cache->item); 7256 cache->flags = btrfs_block_group_flags(&cache->item);
7273 cache->sectorsize = root->sectorsize; 7257 cache->sectorsize = root->sectorsize;
7274 7258
7275 remove_sb_from_cache(root, cache);
7276
7277 /* 7259 /*
7278 * check for two cases, either we are full, and therefore 7260 * check for two cases, either we are full, and therefore
7279 * don't need to bother with the caching work since we won't 7261 * don't need to bother with the caching work since we won't
@@ -7282,13 +7264,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7282 * time, particularly in the full case. 7264 * time, particularly in the full case.
7283 */ 7265 */
7284 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 7266 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7267 exclude_super_stripes(root, cache);
7268 cache->last_byte_to_unpin = (u64)-1;
7285 cache->cached = BTRFS_CACHE_FINISHED; 7269 cache->cached = BTRFS_CACHE_FINISHED;
7270 free_excluded_extents(root, cache);
7286 } else if (btrfs_block_group_used(&cache->item) == 0) { 7271 } else if (btrfs_block_group_used(&cache->item) == 0) {
7272 exclude_super_stripes(root, cache);
7273 cache->last_byte_to_unpin = (u64)-1;
7287 cache->cached = BTRFS_CACHE_FINISHED; 7274 cache->cached = BTRFS_CACHE_FINISHED;
7288 add_new_free_space(cache, root->fs_info, 7275 add_new_free_space(cache, root->fs_info,
7289 found_key.objectid, 7276 found_key.objectid,
7290 found_key.objectid + 7277 found_key.objectid +
7291 found_key.offset); 7278 found_key.offset);
7279 free_excluded_extents(root, cache);
7292 } 7280 }
7293 7281
7294 ret = update_space_info(info, cache->flags, found_key.offset, 7282 ret = update_space_info(info, cache->flags, found_key.offset,
@@ -7296,6 +7284,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7296 &space_info); 7284 &space_info);
7297 BUG_ON(ret); 7285 BUG_ON(ret);
7298 cache->space_info = space_info; 7286 cache->space_info = space_info;
7287 spin_lock(&cache->space_info->lock);
7288 cache->space_info->bytes_super += cache->bytes_super;
7289 spin_unlock(&cache->space_info->lock);
7290
7299 down_write(&space_info->groups_sem); 7291 down_write(&space_info->groups_sem);
7300 list_add_tail(&cache->list, &space_info->block_groups); 7292 list_add_tail(&cache->list, &space_info->block_groups);
7301 up_write(&space_info->groups_sem); 7293 up_write(&space_info->groups_sem);
@@ -7345,7 +7337,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7345 atomic_set(&cache->count, 1); 7337 atomic_set(&cache->count, 1);
7346 spin_lock_init(&cache->lock); 7338 spin_lock_init(&cache->lock);
7347 spin_lock_init(&cache->tree_lock); 7339 spin_lock_init(&cache->tree_lock);
7348 init_waitqueue_head(&cache->caching_q);
7349 INIT_LIST_HEAD(&cache->list); 7340 INIT_LIST_HEAD(&cache->list);
7350 INIT_LIST_HEAD(&cache->cluster_list); 7341 INIT_LIST_HEAD(&cache->cluster_list);
7351 7342
@@ -7354,15 +7345,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7354 cache->flags = type; 7345 cache->flags = type;
7355 btrfs_set_block_group_flags(&cache->item, type); 7346 btrfs_set_block_group_flags(&cache->item, type);
7356 7347
7348 cache->last_byte_to_unpin = (u64)-1;
7357 cache->cached = BTRFS_CACHE_FINISHED; 7349 cache->cached = BTRFS_CACHE_FINISHED;
7358 remove_sb_from_cache(root, cache); 7350 exclude_super_stripes(root, cache);
7359 7351
7360 add_new_free_space(cache, root->fs_info, chunk_offset, 7352 add_new_free_space(cache, root->fs_info, chunk_offset,
7361 chunk_offset + size); 7353 chunk_offset + size);
7362 7354
7355 free_excluded_extents(root, cache);
7356
7363 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7357 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7364 &cache->space_info); 7358 &cache->space_info);
7365 BUG_ON(ret); 7359 BUG_ON(ret);
7360
7361 spin_lock(&cache->space_info->lock);
7362 cache->space_info->bytes_super += cache->bytes_super;
7363 spin_unlock(&cache->space_info->lock);
7364
7366 down_write(&cache->space_info->groups_sem); 7365 down_write(&cache->space_info->groups_sem);
7367 list_add_tail(&cache->list, &cache->space_info->block_groups); 7366 list_add_tail(&cache->list, &cache->space_info->block_groups);
7368 up_write(&cache->space_info->groups_sem); 7367 up_write(&cache->space_info->groups_sem);
@@ -7428,8 +7427,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7428 up_write(&block_group->space_info->groups_sem); 7427 up_write(&block_group->space_info->groups_sem);
7429 7428
7430 if (block_group->cached == BTRFS_CACHE_STARTED) 7429 if (block_group->cached == BTRFS_CACHE_STARTED)
7431 wait_event(block_group->caching_q, 7430 wait_block_group_cache_done(block_group);
7432 block_group_cache_done(block_group));
7433 7431
7434 btrfs_remove_free_space_cache(block_group); 7432 btrfs_remove_free_space_cache(block_group);
7435 7433
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68260180f587..de1793ba004a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
280 return NULL; 280 return NULL;
281} 281}
282 282
283static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 struct extent_state *other)
285{
286 if (tree->ops && tree->ops->merge_extent_hook)
287 tree->ops->merge_extent_hook(tree->mapping->host, new,
288 other);
289}
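
merge_cb() introduces the call-through shape that split_cb() and the reworked set/clear hooks below all share: both the ops table and the individual hook are optional, so every dispatch NULL-checks both. The pattern in isolation, with hypothetical names:

#include <stdio.h>

struct tree_ops { void (*merge_hook)(int a, int b); };
struct io_tree  { const struct tree_ops *ops; };

static void merge_cb(struct io_tree *t, int a, int b)
{
	if (t->ops && t->ops->merge_hook)  /* table and hook both optional */
		t->ops->merge_hook(a, b);
}

static void log_merge(int a, int b) { printf("merged %d and %d\n", a, b); }

int main(void)
{
	static const struct tree_ops ops = { .merge_hook = log_merge };
	struct io_tree with_hooks = { .ops = &ops };
	struct io_tree bare       = { .ops = NULL };

	merge_cb(&with_hooks, 1, 2);  /* dispatches to log_merge */
	merge_cb(&bare, 3, 4);        /* silently a no-op */
	return 0;
}
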
290
283/* 291/*
284 * utility function to look for merge candidates inside a given range. 292 * utility function to look for merge candidates inside a given range.
285 * Any extents with matching state are merged together into a single 293 * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
303 other = rb_entry(other_node, struct extent_state, rb_node); 311 other = rb_entry(other_node, struct extent_state, rb_node);
304 if (other->end == state->start - 1 && 312 if (other->end == state->start - 1 &&
305 other->state == state->state) { 313 other->state == state->state) {
314 merge_cb(tree, state, other);
306 state->start = other->start; 315 state->start = other->start;
307 other->tree = NULL; 316 other->tree = NULL;
308 rb_erase(&other->rb_node, &tree->state); 317 rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
314 other = rb_entry(other_node, struct extent_state, rb_node); 323 other = rb_entry(other_node, struct extent_state, rb_node);
315 if (other->start == state->end + 1 && 324 if (other->start == state->end + 1 &&
316 other->state == state->state) { 325 other->state == state->state) {
326 merge_cb(tree, state, other);
317 other->start = state->start; 327 other->start = state->start;
318 state->tree = NULL; 328 state->tree = NULL;
319 rb_erase(&state->rb_node, &tree->state); 329 rb_erase(&state->rb_node, &tree->state);
320 free_extent_state(state); 330 free_extent_state(state);
331 state = NULL;
321 } 332 }
322 } 333 }
334
323 return 0; 335 return 0;
324} 336}
325 337
326static void set_state_cb(struct extent_io_tree *tree, 338static int set_state_cb(struct extent_io_tree *tree,
327 struct extent_state *state, 339 struct extent_state *state,
328 unsigned long bits) 340 unsigned long bits)
329{ 341{
330 if (tree->ops && tree->ops->set_bit_hook) { 342 if (tree->ops && tree->ops->set_bit_hook) {
331 tree->ops->set_bit_hook(tree->mapping->host, state->start, 343 return tree->ops->set_bit_hook(tree->mapping->host,
332 state->end, state->state, bits); 344 state->start, state->end,
345 state->state, bits);
333 } 346 }
347
348 return 0;
334} 349}
335 350
336static void clear_state_cb(struct extent_io_tree *tree, 351static void clear_state_cb(struct extent_io_tree *tree,
337 struct extent_state *state, 352 struct extent_state *state,
338 unsigned long bits) 353 unsigned long bits)
339{ 354{
340 if (tree->ops && tree->ops->clear_bit_hook) { 355 if (tree->ops && tree->ops->clear_bit_hook)
341 tree->ops->clear_bit_hook(tree->mapping->host, state->start, 356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
342 state->end, state->state, bits);
343 }
344} 357}
345 358
346/* 359/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
358 int bits) 371 int bits)
359{ 372{
360 struct rb_node *node; 373 struct rb_node *node;
374 int ret;
361 375
362 if (end < start) { 376 if (end < start) {
363 printk(KERN_ERR "btrfs end < start %llu %llu\n", 377 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree,
365 (unsigned long long)start); 379 (unsigned long long)start);
366 WARN_ON(1); 380 WARN_ON(1);
367 } 381 }
382 state->start = start;
383 state->end = end;
384 ret = set_state_cb(tree, state, bits);
385 if (ret)
386 return ret;
387
368 if (bits & EXTENT_DIRTY) 388 if (bits & EXTENT_DIRTY)
369 tree->dirty_bytes += end - start + 1; 389 tree->dirty_bytes += end - start + 1;
370 set_state_cb(tree, state, bits);
371 state->state |= bits; 390 state->state |= bits;
372 state->start = start;
373 state->end = end;
374 node = tree_insert(&tree->state, end, &state->rb_node); 391 node = tree_insert(&tree->state, end, &state->rb_node);
375 if (node) { 392 if (node) {
376 struct extent_state *found; 393 struct extent_state *found;
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
387 return 0; 404 return 0;
388} 405}
389 406
407static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
408 u64 split)
409{
410 if (tree->ops && tree->ops->split_extent_hook)
411 return tree->ops->split_extent_hook(tree->mapping->host,
412 orig, split);
413 return 0;
414}
415
390/* 416/*
391 * split a given extent state struct in two, inserting the preallocated 417 * split a given extent state struct in two, inserting the preallocated
392 * struct 'prealloc' as the newly created second half. 'split' indicates an 418 * struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 struct extent_state *prealloc, u64 split) 431 struct extent_state *prealloc, u64 split)
406{ 432{
407 struct rb_node *node; 433 struct rb_node *node;
434
435 split_cb(tree, orig, split);
436
408 prealloc->start = orig->start; 437 prealloc->start = orig->start;
409 prealloc->end = split - 1; 438 prealloc->end = split - 1;
410 prealloc->state = orig->state; 439 prealloc->state = orig->state;
@@ -471,10 +500,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 * bits were already set, or zero if none of the bits were already set. 500 * bits were already set, or zero if none of the bits were already set.
472 */ 501 */
473int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 502int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
474 int bits, int wake, int delete, gfp_t mask) 503 int bits, int wake, int delete,
504 struct extent_state **cached_state,
505 gfp_t mask)
475{ 506{
476 struct extent_state *state; 507 struct extent_state *state;
508 struct extent_state *cached;
477 struct extent_state *prealloc = NULL; 509 struct extent_state *prealloc = NULL;
510 struct rb_node *next_node;
478 struct rb_node *node; 511 struct rb_node *node;
479 u64 last_end; 512 u64 last_end;
480 int err; 513 int err;
@@ -488,6 +521,17 @@ again:
488 } 521 }
489 522
490 spin_lock(&tree->lock); 523 spin_lock(&tree->lock);
524 if (cached_state) {
525 cached = *cached_state;
526 *cached_state = NULL;
527 cached_state = NULL;
528 if (cached && cached->tree && cached->start == start) {
529 atomic_dec(&cached->refs);
530 state = cached;
531 goto hit_next;
532 }
533 free_extent_state(cached);
534 }
491 /* 535 /*
492 * this search will find the extents that end after 536 * this search will find the extents that end after
493 * our range starts 537 * our range starts
@@ -496,6 +540,7 @@ again:
496 if (!node) 540 if (!node)
497 goto out; 541 goto out;
498 state = rb_entry(node, struct extent_state, rb_node); 542 state = rb_entry(node, struct extent_state, rb_node);
543hit_next:
499 if (state->start > end) 544 if (state->start > end)
500 goto out; 545 goto out;
501 WARN_ON(state->end < start); 546 WARN_ON(state->end < start);
@@ -526,13 +571,11 @@ again:
526 if (err) 571 if (err)
527 goto out; 572 goto out;
528 if (state->end <= end) { 573 if (state->end <= end) {
529 set |= clear_state_bit(tree, state, bits, 574 set |= clear_state_bit(tree, state, bits, wake,
530 wake, delete); 575 delete);
531 if (last_end == (u64)-1) 576 if (last_end == (u64)-1)
532 goto out; 577 goto out;
533 start = last_end + 1; 578 start = last_end + 1;
534 } else {
535 start = state->start;
536 } 579 }
537 goto search_again; 580 goto search_again;
538 } 581 }
@@ -547,19 +590,30 @@ again:
547 prealloc = alloc_extent_state(GFP_ATOMIC); 590 prealloc = alloc_extent_state(GFP_ATOMIC);
548 err = split_state(tree, state, prealloc, end + 1); 591 err = split_state(tree, state, prealloc, end + 1);
549 BUG_ON(err == -EEXIST); 592 BUG_ON(err == -EEXIST);
550
551 if (wake) 593 if (wake)
552 wake_up(&state->wq); 594 wake_up(&state->wq);
553 set |= clear_state_bit(tree, prealloc, bits, 595
554 wake, delete); 596 set |= clear_state_bit(tree, prealloc, bits, wake, delete);
597
555 prealloc = NULL; 598 prealloc = NULL;
556 goto out; 599 goto out;
557 } 600 }
558 601
602 if (state->end < end && prealloc && !need_resched())
603 next_node = rb_next(&state->rb_node);
604 else
605 next_node = NULL;
606
559 set |= clear_state_bit(tree, state, bits, wake, delete); 607 set |= clear_state_bit(tree, state, bits, wake, delete);
560 if (last_end == (u64)-1) 608 if (last_end == (u64)-1)
561 goto out; 609 goto out;
562 start = last_end + 1; 610 start = last_end + 1;
611 if (start <= end && next_node) {
612 state = rb_entry(next_node, struct extent_state,
613 rb_node);
614 if (state->start == start)
615 goto hit_next;
616 }
563 goto search_again; 617 goto search_again;
564 618
565out: 619out:
@@ -641,40 +695,59 @@ out:
641 return 0; 695 return 0;
642} 696}
643 697
644static void set_state_bits(struct extent_io_tree *tree, 698static int set_state_bits(struct extent_io_tree *tree,
645 struct extent_state *state, 699 struct extent_state *state,
646 int bits) 700 int bits)
647{ 701{
702 int ret;
703
704 ret = set_state_cb(tree, state, bits);
705 if (ret)
706 return ret;
707
648 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 708 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
649 u64 range = state->end - state->start + 1; 709 u64 range = state->end - state->start + 1;
650 tree->dirty_bytes += range; 710 tree->dirty_bytes += range;
651 } 711 }
652 set_state_cb(tree, state, bits);
653 state->state |= bits; 712 state->state |= bits;
713
714 return 0;
715}
716
717static void cache_state(struct extent_state *state,
718 struct extent_state **cached_ptr)
719{
720 if (cached_ptr && !(*cached_ptr)) {
721 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
722 *cached_ptr = state;
723 atomic_inc(&state->refs);
724 }
725 }
654} 726}
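
cache_state() hands the caller a referenced pointer to the state just touched, and the hit_next fast paths in clear_extent_bit() and set_extent_bit() spend that reference to bypass the rbtree search on the next call. A reduced model of the handshake (the types are hypothetical):

#include <stdio.h>

struct state { int refs; int start; };  /* hypothetical mini extent_state */

/* stash one referenced pointer for the caller; first caller wins */
static void cache_state(struct state *s, struct state **cached)
{
	if (cached && !*cached) {
		s->refs++;
		*cached = s;
	}
}

/* next lookup: spend the cached reference if it still matches */
static struct state *lookup(struct state *slow_path_result, int start,
			    struct state **cached)
{
	if (*cached && (*cached)->start == start) {
		struct state *s = *cached;
		s->refs--;                  /* consume the cached ref */
		*cached = NULL;
		return s;                   /* rbtree search skipped */
	}
	return slow_path_result;
}

int main(void)
{
	struct state s = { .refs = 1, .start = 4096 };
	struct state *cached = NULL;

	cache_state(&s, &cached);
	printf("fast path hit: %d\n", lookup(NULL, 4096, &cached) == &s);
	return 0;
}
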
655 727
656/* 728/*
657 * set some bits on a range in the tree. This may require allocations 729 * set some bits on a range in the tree. This may require allocations or
658 * or sleeping, so the gfp mask is used to indicate what is allowed. 730 * sleeping, so the gfp mask is used to indicate what is allowed.
659 * 731 *
660 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the 732 * If any of the exclusive bits are set, this will fail with -EEXIST if some
661 * range already has the desired bits set. The start of the existing 733 * part of the range already has the desired bits set. The start of the
662 * range is returned in failed_start in this case. 734 * existing range is returned in failed_start in this case.
663 * 735 *
 664 * [start, end] is inclusive 736 * [start, end] is inclusive. This takes the tree lock.
665 * This takes the tree lock.
666 */ 737 */
738
667static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 739static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
668 int bits, int exclusive, u64 *failed_start, 740 int bits, int exclusive_bits, u64 *failed_start,
741 struct extent_state **cached_state,
669 gfp_t mask) 742 gfp_t mask)
670{ 743{
671 struct extent_state *state; 744 struct extent_state *state;
672 struct extent_state *prealloc = NULL; 745 struct extent_state *prealloc = NULL;
673 struct rb_node *node; 746 struct rb_node *node;
674 int err = 0; 747 int err = 0;
675 int set;
676 u64 last_start; 748 u64 last_start;
677 u64 last_end; 749 u64 last_end;
750
678again: 751again:
679 if (!prealloc && (mask & __GFP_WAIT)) { 752 if (!prealloc && (mask & __GFP_WAIT)) {
680 prealloc = alloc_extent_state(mask); 753 prealloc = alloc_extent_state(mask);
@@ -683,6 +756,13 @@ again:
683 } 756 }
684 757
685 spin_lock(&tree->lock); 758 spin_lock(&tree->lock);
759 if (cached_state && *cached_state) {
760 state = *cached_state;
761 if (state->start == start && state->tree) {
762 node = &state->rb_node;
763 goto hit_next;
764 }
765 }
686 /* 766 /*
687 * this search will find all the extents that end after 767 * this search will find all the extents that end after
688 * our range starts. 768 * our range starts.
@@ -694,8 +774,8 @@ again:
694 BUG_ON(err == -EEXIST); 774 BUG_ON(err == -EEXIST);
695 goto out; 775 goto out;
696 } 776 }
697
698 state = rb_entry(node, struct extent_state, rb_node); 777 state = rb_entry(node, struct extent_state, rb_node);
778hit_next:
699 last_start = state->start; 779 last_start = state->start;
700 last_end = state->end; 780 last_end = state->end;
701 781
@@ -706,17 +786,32 @@ again:
706 * Just lock what we found and keep going 786 * Just lock what we found and keep going
707 */ 787 */
708 if (state->start == start && state->end <= end) { 788 if (state->start == start && state->end <= end) {
709 set = state->state & bits; 789 struct rb_node *next_node;
710 if (set && exclusive) { 790 if (state->state & exclusive_bits) {
711 *failed_start = state->start; 791 *failed_start = state->start;
712 err = -EEXIST; 792 err = -EEXIST;
713 goto out; 793 goto out;
714 } 794 }
715 set_state_bits(tree, state, bits); 795
796 err = set_state_bits(tree, state, bits);
797 if (err)
798 goto out;
799
800 cache_state(state, cached_state);
716 merge_state(tree, state); 801 merge_state(tree, state);
717 if (last_end == (u64)-1) 802 if (last_end == (u64)-1)
718 goto out; 803 goto out;
804
719 start = last_end + 1; 805 start = last_end + 1;
806 if (start < end && prealloc && !need_resched()) {
807 next_node = rb_next(node);
808 if (next_node) {
809 state = rb_entry(next_node, struct extent_state,
810 rb_node);
811 if (state->start == start)
812 goto hit_next;
813 }
814 }
720 goto search_again; 815 goto search_again;
721 } 816 }
722 817
@@ -737,8 +832,7 @@ again:
737 * desired bit on it. 832 * desired bit on it.
738 */ 833 */
739 if (state->start < start) { 834 if (state->start < start) {
740 set = state->state & bits; 835 if (state->state & exclusive_bits) {
741 if (exclusive && set) {
742 *failed_start = start; 836 *failed_start = start;
743 err = -EEXIST; 837 err = -EEXIST;
744 goto out; 838 goto out;
@@ -749,13 +843,14 @@ again:
749 if (err) 843 if (err)
750 goto out; 844 goto out;
751 if (state->end <= end) { 845 if (state->end <= end) {
752 set_state_bits(tree, state, bits); 846 err = set_state_bits(tree, state, bits);
847 if (err)
848 goto out;
849 cache_state(state, cached_state);
753 merge_state(tree, state); 850 merge_state(tree, state);
754 if (last_end == (u64)-1) 851 if (last_end == (u64)-1)
755 goto out; 852 goto out;
756 start = last_end + 1; 853 start = last_end + 1;
757 } else {
758 start = state->start;
759 } 854 }
760 goto search_again; 855 goto search_again;
761 } 856 }
@@ -774,10 +869,13 @@ again:
774 this_end = last_start - 1; 869 this_end = last_start - 1;
775 err = insert_state(tree, prealloc, start, this_end, 870 err = insert_state(tree, prealloc, start, this_end,
776 bits); 871 bits);
777 prealloc = NULL;
778 BUG_ON(err == -EEXIST); 872 BUG_ON(err == -EEXIST);
779 if (err) 873 if (err) {
874 prealloc = NULL;
780 goto out; 875 goto out;
876 }
877 cache_state(prealloc, cached_state);
878 prealloc = NULL;
781 start = this_end + 1; 879 start = this_end + 1;
782 goto search_again; 880 goto search_again;
783 } 881 }
@@ -788,8 +886,7 @@ again:
788 * on the first half 886 * on the first half
789 */ 887 */
790 if (state->start <= end && state->end > end) { 888 if (state->start <= end && state->end > end) {
791 set = state->state & bits; 889 if (state->state & exclusive_bits) {
792 if (exclusive && set) {
793 *failed_start = start; 890 *failed_start = start;
794 err = -EEXIST; 891 err = -EEXIST;
795 goto out; 892 goto out;
@@ -797,7 +894,12 @@ again:
797 err = split_state(tree, state, prealloc, end + 1); 894 err = split_state(tree, state, prealloc, end + 1);
798 BUG_ON(err == -EEXIST); 895 BUG_ON(err == -EEXIST);
799 896
800 set_state_bits(tree, prealloc, bits); 897 err = set_state_bits(tree, prealloc, bits);
898 if (err) {
899 prealloc = NULL;
900 goto out;
901 }
902 cache_state(prealloc, cached_state);
801 merge_state(tree, prealloc); 903 merge_state(tree, prealloc);
802 prealloc = NULL; 904 prealloc = NULL;
803 goto out; 905 goto out;
@@ -826,86 +928,64 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
826 gfp_t mask) 928 gfp_t mask)
827{ 929{
828 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 930 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
829 mask); 931 NULL, mask);
830}
831
832int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
833 gfp_t mask)
834{
835 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
836} 932}
837 933
838int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 934int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
839 int bits, gfp_t mask) 935 int bits, gfp_t mask)
840{ 936{
841 return set_extent_bit(tree, start, end, bits, 0, NULL, 937 return set_extent_bit(tree, start, end, bits, 0, NULL,
842 mask); 938 NULL, mask);
843} 939}
844 940
845int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 941int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
846 int bits, gfp_t mask) 942 int bits, gfp_t mask)
847{ 943{
848 return clear_extent_bit(tree, start, end, bits, 0, 0, mask); 944 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
849} 945}
850 946
851int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 947int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
852 gfp_t mask) 948 gfp_t mask)
853{ 949{
854 return set_extent_bit(tree, start, end, 950 return set_extent_bit(tree, start, end,
855 EXTENT_DELALLOC | EXTENT_DIRTY, 951 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
856 0, NULL, mask); 952 0, NULL, NULL, mask);
857} 953}
858 954
859int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 955int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
860 gfp_t mask) 956 gfp_t mask)
861{ 957{
862 return clear_extent_bit(tree, start, end, 958 return clear_extent_bit(tree, start, end,
863 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); 959 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
864} 960 NULL, mask);
865
866int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
867 gfp_t mask)
868{
869 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
870} 961}
871 962
872int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 963int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
873 gfp_t mask) 964 gfp_t mask)
874{ 965{
875 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 966 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
876 mask); 967 NULL, mask);
877} 968}
878 969
879static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 970static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
880 gfp_t mask) 971 gfp_t mask)
881{ 972{
882 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); 973 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
974 NULL, mask);
883} 975}
884 976
885int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 977int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
886 gfp_t mask) 978 gfp_t mask)
887{ 979{
888 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 980 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
889 mask); 981 NULL, mask);
890} 982}
891 983
892static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 984static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
893 u64 end, gfp_t mask) 985 u64 end, gfp_t mask)
894{ 986{
895 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); 987 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
896} 988 NULL, mask);
897
898static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask)
900{
901 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
902 0, NULL, mask);
903}
904
905static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
906 u64 end, gfp_t mask)
907{
908 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
909} 989}
910 990
911int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) 991int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -917,13 +997,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 917 * either insert or lock state struct between start and end; use mask to tell 997 * either insert or lock state struct between start and end; use mask to tell
918 * us if waiting is desired. 998 * us if waiting is desired.
919 */ 999 */
920int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 1000int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1001 int bits, struct extent_state **cached_state, gfp_t mask)
921{ 1002{
922 int err; 1003 int err;
923 u64 failed_start; 1004 u64 failed_start;
924 while (1) { 1005 while (1) {
925 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 1006 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
926 &failed_start, mask); 1007 EXTENT_LOCKED, &failed_start,
1008 cached_state, mask);
927 if (err == -EEXIST && (mask & __GFP_WAIT)) { 1009 if (err == -EEXIST && (mask & __GFP_WAIT)) {
928 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1010 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
929 start = failed_start; 1011 start = failed_start;
@@ -935,27 +1017,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
935 return err; 1017 return err;
936} 1018}
937 1019
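lock_extent_bits() loops on -EEXIST: when part of the range is already locked, set_extent_bit() reports the first conflicting byte in failed_start, the caller waits for that range to clear, then resumes from exactly that byte. A self-contained sketch of the retry loop (the conflict test is a toy stand-in, not the kernel's):

#include <errno.h>
#include <stdio.h>

static int locked_from = 200;   /* toy: bytes >= 200 are locked by others */

/* toy stand-in: -EEXIST when the range reaches the locked region */
static int try_set_locked(int start, int end, int *failed_start)
{
	if (end >= locked_from && start < locked_from) {
		*failed_start = locked_from;
		return -EEXIST;
	}
	return 0;
}

static void wait_for_unlock(int at) { (void)at; locked_from = 1 << 30; }

int main(void)
{
	int start = 100, end = 400;
	int failed_start, err;

	while (1) {
		err = try_set_locked(start, end, &failed_start);
		if (err == -EEXIST) {
			wait_for_unlock(failed_start); /* sleep till clear */
			start = failed_start;  /* resume at the conflict */
			continue;
		}
		break;
	}
	printf("locked [%d, %d], err = %d\n", start, end, err);
	return 0;
}
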
1020int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1021{
1022 return lock_extent_bits(tree, start, end, 0, NULL, mask);
1023}
1024
938int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1025int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
939 gfp_t mask) 1026 gfp_t mask)
940{ 1027{
941 int err; 1028 int err;
942 u64 failed_start; 1029 u64 failed_start;
943 1030
944 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 1031 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
945 &failed_start, mask); 1032 &failed_start, NULL, mask);
946 if (err == -EEXIST) { 1033 if (err == -EEXIST) {
947 if (failed_start > start) 1034 if (failed_start > start)
948 clear_extent_bit(tree, start, failed_start - 1, 1035 clear_extent_bit(tree, start, failed_start - 1,
949 EXTENT_LOCKED, 1, 0, mask); 1036 EXTENT_LOCKED, 1, 0, NULL, mask);
950 return 0; 1037 return 0;
951 } 1038 }
952 return 1; 1039 return 1;
953} 1040}
954 1041
1042int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1043 struct extent_state **cached, gfp_t mask)
1044{
1045 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1046 mask);
1047}
1048
955int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1049int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
956 gfp_t mask) 1050 gfp_t mask)
957{ 1051{
958 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); 1052 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1053 mask);
959} 1054}
960 1055
961/* 1056/*
@@ -974,7 +1069,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
974 page_cache_release(page); 1069 page_cache_release(page);
975 index++; 1070 index++;
976 } 1071 }
977 set_extent_dirty(tree, start, end, GFP_NOFS);
978 return 0; 1072 return 0;
979} 1073}
980 1074
@@ -994,7 +1088,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
994 page_cache_release(page); 1088 page_cache_release(page);
995 index++; 1089 index++;
996 } 1090 }
997 set_extent_writeback(tree, start, end, GFP_NOFS);
998 return 0; 1091 return 0;
999} 1092}
1000 1093
@@ -1232,6 +1325,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
1232 u64 delalloc_start; 1325 u64 delalloc_start;
1233 u64 delalloc_end; 1326 u64 delalloc_end;
1234 u64 found; 1327 u64 found;
1328 struct extent_state *cached_state = NULL;
1235 int ret; 1329 int ret;
1236 int loops = 0; 1330 int loops = 0;
1237 1331
@@ -1269,6 +1363,7 @@ again:
 1269 /* some of the pages are gone, let's avoid looping by 1363 /* some of the pages are gone, let's avoid looping by
1270 * shortening the size of the delalloc range we're searching 1364 * shortening the size of the delalloc range we're searching
1271 */ 1365 */
1366 free_extent_state(cached_state);
1272 if (!loops) { 1367 if (!loops) {
1273 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1368 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1274 max_bytes = PAGE_CACHE_SIZE - offset; 1369 max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1282,18 +1377,21 @@ again:
1282 BUG_ON(ret); 1377 BUG_ON(ret);
1283 1378
1284 /* step three, lock the state bits for the whole range */ 1379 /* step three, lock the state bits for the whole range */
1285 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); 1380 lock_extent_bits(tree, delalloc_start, delalloc_end,
1381 0, &cached_state, GFP_NOFS);
1286 1382
1287 /* then test to make sure it is all still delalloc */ 1383 /* then test to make sure it is all still delalloc */
1288 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1384 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1289 EXTENT_DELALLOC, 1); 1385 EXTENT_DELALLOC, 1, cached_state);
1290 if (!ret) { 1386 if (!ret) {
1291 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); 1387 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1388 &cached_state, GFP_NOFS);
1292 __unlock_for_delalloc(inode, locked_page, 1389 __unlock_for_delalloc(inode, locked_page,
1293 delalloc_start, delalloc_end); 1390 delalloc_start, delalloc_end);
1294 cond_resched(); 1391 cond_resched();
1295 goto again; 1392 goto again;
1296 } 1393 }
1394 free_extent_state(cached_state);
1297 *start = delalloc_start; 1395 *start = delalloc_start;
1298 *end = delalloc_end; 1396 *end = delalloc_end;
1299out_failed: 1397out_failed:
@@ -1307,7 +1405,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1307 int clear_unlock, 1405 int clear_unlock,
1308 int clear_delalloc, int clear_dirty, 1406 int clear_delalloc, int clear_dirty,
1309 int set_writeback, 1407 int set_writeback,
1310 int end_writeback) 1408 int end_writeback,
1409 int set_private2)
1311{ 1410{
1312 int ret; 1411 int ret;
1313 struct page *pages[16]; 1412 struct page *pages[16];
@@ -1325,8 +1424,9 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1325 if (clear_delalloc) 1424 if (clear_delalloc)
1326 clear_bits |= EXTENT_DELALLOC; 1425 clear_bits |= EXTENT_DELALLOC;
1327 1426
1328 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); 1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1329 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) 1428 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
1429 set_private2))
1330 return 0; 1430 return 0;
1331 1431
1332 while (nr_pages > 0) { 1432 while (nr_pages > 0) {
@@ -1334,6 +1434,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1334 min_t(unsigned long, 1434 min_t(unsigned long,
1335 nr_pages, ARRAY_SIZE(pages)), pages); 1435 nr_pages, ARRAY_SIZE(pages)), pages);
1336 for (i = 0; i < ret; i++) { 1436 for (i = 0; i < ret; i++) {
1437
1438 if (set_private2)
1439 SetPagePrivate2(pages[i]);
1440
1337 if (pages[i] == locked_page) { 1441 if (pages[i] == locked_page) {
1338 page_cache_release(pages[i]); 1442 page_cache_release(pages[i]);
1339 continue; 1443 continue;
@@ -1476,14 +1580,17 @@ out:
1476 * range is found set. 1580 * range is found set.
1477 */ 1581 */
1478int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1582int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1479 int bits, int filled) 1583 int bits, int filled, struct extent_state *cached)
1480{ 1584{
1481 struct extent_state *state = NULL; 1585 struct extent_state *state = NULL;
1482 struct rb_node *node; 1586 struct rb_node *node;
1483 int bitset = 0; 1587 int bitset = 0;
1484 1588
1485 spin_lock(&tree->lock); 1589 spin_lock(&tree->lock);
1486 node = tree_search(tree, start); 1590 if (cached && cached->tree && cached->start == start)
1591 node = &cached->rb_node;
1592 else
1593 node = tree_search(tree, start);
1487 while (node && start <= end) { 1594 while (node && start <= end) {
1488 state = rb_entry(node, struct extent_state, rb_node); 1595 state = rb_entry(node, struct extent_state, rb_node);
1489 1596
@@ -1503,6 +1610,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1503 bitset = 0; 1610 bitset = 0;
1504 break; 1611 break;
1505 } 1612 }
1613
1614 if (state->end == (u64)-1)
1615 break;
1616
1506 start = state->end + 1; 1617 start = state->end + 1;
1507 if (start > end) 1618 if (start > end)
1508 break; 1619 break;
@@ -1526,7 +1637,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
1526{ 1637{
1527 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1638 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1528 u64 end = start + PAGE_CACHE_SIZE - 1; 1639 u64 end = start + PAGE_CACHE_SIZE - 1;
1529 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) 1640 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1530 SetPageUptodate(page); 1641 SetPageUptodate(page);
1531 return 0; 1642 return 0;
1532} 1643}
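
check_page_uptodate() turns a page index into an inclusive byte range before consulting the tree, the same shift arithmetic used throughout this file. Concretely, assuming 4 KiB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                  /* assume 4 KiB pages */
#define PAGE_CACHE_SIZE  (1ULL << PAGE_CACHE_SHIFT)

int main(void)
{
	uint64_t index = 3;                  /* fourth page of the file */
	uint64_t start = index << PAGE_CACHE_SHIFT;
	uint64_t end   = start + PAGE_CACHE_SIZE - 1;

	/* [12288, 16383]: the range is inclusive at both ends */
	printf("[%llu, %llu]\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}
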
@@ -1540,7 +1651,7 @@ static int check_page_locked(struct extent_io_tree *tree,
1540{ 1651{
1541 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1652 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1542 u64 end = start + PAGE_CACHE_SIZE - 1; 1653 u64 end = start + PAGE_CACHE_SIZE - 1;
1543 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) 1654 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1544 unlock_page(page); 1655 unlock_page(page);
1545 return 0; 1656 return 0;
1546} 1657}
@@ -1552,10 +1663,7 @@ static int check_page_locked(struct extent_io_tree *tree,
1552static int check_page_writeback(struct extent_io_tree *tree, 1663static int check_page_writeback(struct extent_io_tree *tree,
1553 struct page *page) 1664 struct page *page)
1554{ 1665{
1555 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1666 end_page_writeback(page);
1556 u64 end = start + PAGE_CACHE_SIZE - 1;
1557 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1558 end_page_writeback(page);
1559 return 0; 1667 return 0;
1560} 1668}
1561 1669
@@ -1613,13 +1721,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
1613 } 1721 }
1614 1722
1615 if (!uptodate) { 1723 if (!uptodate) {
1616 clear_extent_uptodate(tree, start, end, GFP_ATOMIC); 1724 clear_extent_uptodate(tree, start, end, GFP_NOFS);
1617 ClearPageUptodate(page); 1725 ClearPageUptodate(page);
1618 SetPageError(page); 1726 SetPageError(page);
1619 } 1727 }
1620 1728
1621 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1622
1623 if (whole_page) 1729 if (whole_page)
1624 end_page_writeback(page); 1730 end_page_writeback(page);
1625 else 1731 else
@@ -1983,7 +2089,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
1983 continue; 2089 continue;
1984 } 2090 }
1985 /* the get_extent function already copied into the page */ 2091 /* the get_extent function already copied into the page */
1986 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { 2092 if (test_range_bit(tree, cur, cur_end,
2093 EXTENT_UPTODATE, 1, NULL)) {
1987 check_page_uptodate(tree, page); 2094 check_page_uptodate(tree, page);
1988 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2095 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1989 cur = cur + iosize; 2096 cur = cur + iosize;
@@ -2078,6 +2185,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2078 u64 iosize; 2185 u64 iosize;
2079 u64 unlock_start; 2186 u64 unlock_start;
2080 sector_t sector; 2187 sector_t sector;
2188 struct extent_state *cached_state = NULL;
2081 struct extent_map *em; 2189 struct extent_map *em;
2082 struct block_device *bdev; 2190 struct block_device *bdev;
2083 int ret; 2191 int ret;
@@ -2124,6 +2232,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2124 delalloc_end = 0; 2232 delalloc_end = 0;
2125 page_started = 0; 2233 page_started = 0;
2126 if (!epd->extent_locked) { 2234 if (!epd->extent_locked) {
2235 u64 delalloc_to_write = 0;
2127 /* 2236 /*
2128 * make sure the wbc mapping index is at least updated 2237 * make sure the wbc mapping index is at least updated
2129 * to this page. 2238 * to this page.
@@ -2143,8 +2252,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2143 tree->ops->fill_delalloc(inode, page, delalloc_start, 2252 tree->ops->fill_delalloc(inode, page, delalloc_start,
2144 delalloc_end, &page_started, 2253 delalloc_end, &page_started,
2145 &nr_written); 2254 &nr_written);
2255 /*
2256 * delalloc_end is already one less than the total
2257 * length, so we don't subtract one from
2258 * PAGE_CACHE_SIZE
2259 */
2260 delalloc_to_write += (delalloc_end - delalloc_start +
2261 PAGE_CACHE_SIZE) >>
2262 PAGE_CACHE_SHIFT;
2146 delalloc_start = delalloc_end + 1; 2263 delalloc_start = delalloc_end + 1;
2147 } 2264 }
2265 if (wbc->nr_to_write < delalloc_to_write) {
2266 int thresh = 8192;
2267
2268 if (delalloc_to_write < thresh * 2)
2269 thresh = delalloc_to_write;
2270 wbc->nr_to_write = min_t(u64, delalloc_to_write,
2271 thresh);
2272 }
2148 2273
2149 /* did the fill delalloc function already unlock and start 2274 /* did the fill delalloc function already unlock and start
2150 * the IO? 2275 * the IO?
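The comment in this hunk is about inclusive ranges: delalloc_end already points at the last byte, so the page count is (end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT with no extra "- 1". The nr_to_write adjustment below it then raises the writeback budget toward the delalloc actually found, capped at a threshold. Both computations in a small self-contained C check (the PAGE_SHIFT value is illustrative):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)

    int main(void)
    {
        /* inclusive range covering exactly two pages */
        unsigned long long start = 0, end = 2 * PAGE_SIZE - 1;
        long long delalloc_to_write =
            (long long)((end - start + PAGE_SIZE) >> PAGE_SHIFT);
        printf("pages = %lld\n", delalloc_to_write);   /* 2, not 3 */

        /* the throttle: lift nr_to_write toward the delalloc found,
         * but never beyond the threshold */
        long long nr_to_write = 1;
        if (nr_to_write < delalloc_to_write) {
            long long thresh = 8192;
            if (delalloc_to_write < thresh * 2)
                thresh = delalloc_to_write;
            nr_to_write = delalloc_to_write < thresh
                        ? delalloc_to_write : thresh;
        }
        printf("nr_to_write = %lld\n", nr_to_write);   /* 2 */
        return 0;
    }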
@@ -2160,15 +2285,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2160 goto done_unlocked; 2285 goto done_unlocked;
2161 } 2286 }
2162 } 2287 }
2163 lock_extent(tree, start, page_end, GFP_NOFS);
2164
2165 unlock_start = start;
2166
2167 if (tree->ops && tree->ops->writepage_start_hook) { 2288 if (tree->ops && tree->ops->writepage_start_hook) {
2168 ret = tree->ops->writepage_start_hook(page, start, 2289 ret = tree->ops->writepage_start_hook(page, start,
2169 page_end); 2290 page_end);
2170 if (ret == -EAGAIN) { 2291 if (ret == -EAGAIN) {
2171 unlock_extent(tree, start, page_end, GFP_NOFS);
2172 redirty_page_for_writepage(wbc, page); 2292 redirty_page_for_writepage(wbc, page);
2173 update_nr_written(page, wbc, nr_written); 2293 update_nr_written(page, wbc, nr_written);
2174 unlock_page(page); 2294 unlock_page(page);
@@ -2184,12 +2304,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2184 update_nr_written(page, wbc, nr_written + 1); 2304 update_nr_written(page, wbc, nr_written + 1);
2185 2305
2186 end = page_end; 2306 end = page_end;
2187 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2188 printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2189
2190 if (last_byte <= start) { 2307 if (last_byte <= start) {
2191 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2192 unlock_extent(tree, start, page_end, GFP_NOFS);
2193 if (tree->ops && tree->ops->writepage_end_io_hook) 2308 if (tree->ops && tree->ops->writepage_end_io_hook)
2194 tree->ops->writepage_end_io_hook(page, start, 2309 tree->ops->writepage_end_io_hook(page, start,
2195 page_end, NULL, 1); 2310 page_end, NULL, 1);
@@ -2197,13 +2312,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2197 goto done; 2312 goto done;
2198 } 2313 }
2199 2314
2200 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2201 blocksize = inode->i_sb->s_blocksize; 2315 blocksize = inode->i_sb->s_blocksize;
2202 2316
2203 while (cur <= end) { 2317 while (cur <= end) {
2204 if (cur >= last_byte) { 2318 if (cur >= last_byte) {
2205 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2206 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2207 if (tree->ops && tree->ops->writepage_end_io_hook) 2319 if (tree->ops && tree->ops->writepage_end_io_hook)
2208 tree->ops->writepage_end_io_hook(page, cur, 2320 tree->ops->writepage_end_io_hook(page, cur,
2209 page_end, NULL, 1); 2321 page_end, NULL, 1);
@@ -2235,12 +2347,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2235 */ 2347 */
2236 if (compressed || block_start == EXTENT_MAP_HOLE || 2348 if (compressed || block_start == EXTENT_MAP_HOLE ||
2237 block_start == EXTENT_MAP_INLINE) { 2349 block_start == EXTENT_MAP_INLINE) {
2238 clear_extent_dirty(tree, cur,
2239 cur + iosize - 1, GFP_NOFS);
2240
2241 unlock_extent(tree, unlock_start, cur + iosize - 1,
2242 GFP_NOFS);
2243
2244 /* 2350 /*
2245 * end_io notification does not happen here for 2351 * end_io notification does not happen here for
2246 * compressed extents 2352 * compressed extents
@@ -2265,13 +2371,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2265 } 2371 }
2266 /* leave this out until we have a page_mkwrite call */ 2372 /* leave this out until we have a page_mkwrite call */
2267 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2373 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2268 EXTENT_DIRTY, 0)) { 2374 EXTENT_DIRTY, 0, NULL)) {
2269 cur = cur + iosize; 2375 cur = cur + iosize;
2270 pg_offset += iosize; 2376 pg_offset += iosize;
2271 continue; 2377 continue;
2272 } 2378 }
2273 2379
2274 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2275 if (tree->ops && tree->ops->writepage_io_hook) { 2380 if (tree->ops && tree->ops->writepage_io_hook) {
2276 ret = tree->ops->writepage_io_hook(page, cur, 2381 ret = tree->ops->writepage_io_hook(page, cur,
2277 cur + iosize - 1); 2382 cur + iosize - 1);
@@ -2309,12 +2414,12 @@ done:
2309 set_page_writeback(page); 2414 set_page_writeback(page);
2310 end_page_writeback(page); 2415 end_page_writeback(page);
2311 } 2416 }
2312 if (unlock_start <= page_end)
2313 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2314 unlock_page(page); 2417 unlock_page(page);
2315 2418
2316done_unlocked: 2419done_unlocked:
2317 2420
2421 /* drop our reference on any cached states */
2422 free_extent_state(cached_state);
2318 return 0; 2423 return 0;
2319} 2424}
2320 2425
@@ -2339,9 +2444,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2339 writepage_t writepage, void *data, 2444 writepage_t writepage, void *data,
2340 void (*flush_fn)(void *)) 2445 void (*flush_fn)(void *))
2341{ 2446{
2342 struct backing_dev_info *bdi = mapping->backing_dev_info;
2343 int ret = 0; 2447 int ret = 0;
2344 int done = 0; 2448 int done = 0;
2449 int nr_to_write_done = 0;
2345 struct pagevec pvec; 2450 struct pagevec pvec;
2346 int nr_pages; 2451 int nr_pages;
2347 pgoff_t index; 2452 pgoff_t index;
@@ -2361,7 +2466,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2361 scanned = 1; 2466 scanned = 1;
2362 } 2467 }
2363retry: 2468retry:
2364 while (!done && (index <= end) && 2469 while (!done && !nr_to_write_done && (index <= end) &&
2365 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2470 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2366 PAGECACHE_TAG_DIRTY, min(end - index, 2471 PAGECACHE_TAG_DIRTY, min(end - index,
2367 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2472 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -2412,12 +2517,15 @@ retry:
2412 unlock_page(page); 2517 unlock_page(page);
2413 ret = 0; 2518 ret = 0;
2414 } 2519 }
2415 if (ret || wbc->nr_to_write <= 0) 2520 if (ret)
2416 done = 1;
2417 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2418 wbc->encountered_congestion = 1;
2419 done = 1; 2521 done = 1;
2420 } 2522
2523 /*
2524 * the filesystem may choose to bump up nr_to_write.
2525 * We have to make sure to honor the new nr_to_write
2526 * at any time
2527 */
2528 nr_to_write_done = wbc->nr_to_write <= 0;
2421 } 2529 }
2422 pagevec_release(&pvec); 2530 pagevec_release(&pvec);
2423 cond_resched(); 2531 cond_resched();
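Instead of latching done when the budget runs out, the rewritten loop recomputes nr_to_write_done after every page, because the filesystem (fill_delalloc in the earlier hunk) may legitimately raise wbc->nr_to_write while the walk is in progress. A toy model of why the per-iteration recheck matters; the names below are stand-ins, not the kernel structures:

    #include <stdio.h>

    /* stand-in for struct writeback_control */
    struct wbc { long nr_to_write; };

    /* per-page work; a filesystem hook may bump
     * wbc->nr_to_write back up mid-walk */
    static void write_one(struct wbc *wbc, int page)
    {
        wbc->nr_to_write--;
        if (page == 0)
            wbc->nr_to_write += 4;   /* simulate the fs raising the budget */
    }

    int main(void)
    {
        struct wbc wbc = { .nr_to_write = 1 };
        int nr_to_write_done = 0;
        int written = 0;

        for (int page = 0; !nr_to_write_done && page < 16; page++) {
            write_one(&wbc, page);
            written++;
            /* re-evaluate after every page instead of latching done */
            nr_to_write_done = wbc.nr_to_write <= 0;
        }
        printf("written=%d (a latched check would have stopped at 1)\n",
               written);
        return 0;
    }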
@@ -2604,10 +2712,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2604 return 0; 2712 return 0;
2605 2713
2606 lock_extent(tree, start, end, GFP_NOFS); 2714 lock_extent(tree, start, end, GFP_NOFS);
2607 wait_on_extent_writeback(tree, start, end); 2715 wait_on_page_writeback(page);
2608 clear_extent_bit(tree, start, end, 2716 clear_extent_bit(tree, start, end,
2609 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 2717 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2610 1, 1, GFP_NOFS); 2718 1, 1, NULL, GFP_NOFS);
2611 return 0; 2719 return 0;
2612} 2720}
2613 2721
@@ -2687,7 +2795,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
2687 !isnew && !PageUptodate(page) && 2795 !isnew && !PageUptodate(page) &&
2688 (block_off_end > to || block_off_start < from) && 2796 (block_off_end > to || block_off_start < from) &&
2689 !test_range_bit(tree, block_start, cur_end, 2797 !test_range_bit(tree, block_start, cur_end,
2690 EXTENT_UPTODATE, 1)) { 2798 EXTENT_UPTODATE, 1, NULL)) {
2691 u64 sector; 2799 u64 sector;
2692 u64 extent_offset = block_start - em->start; 2800 u64 extent_offset = block_start - em->start;
2693 size_t iosize; 2801 size_t iosize;
@@ -2701,7 +2809,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
2701 */ 2809 */
2702 set_extent_bit(tree, block_start, 2810 set_extent_bit(tree, block_start,
2703 block_start + iosize - 1, 2811 block_start + iosize - 1,
2704 EXTENT_LOCKED, 0, NULL, GFP_NOFS); 2812 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
2705 ret = submit_extent_page(READ, tree, page, 2813 ret = submit_extent_page(READ, tree, page,
2706 sector, iosize, page_offset, em->bdev, 2814 sector, iosize, page_offset, em->bdev,
2707 NULL, 1, 2815 NULL, 1,
@@ -2742,13 +2850,18 @@ int try_release_extent_state(struct extent_map_tree *map,
2742 int ret = 1; 2850 int ret = 1;
2743 2851
2744 if (test_range_bit(tree, start, end, 2852 if (test_range_bit(tree, start, end,
2745 EXTENT_IOBITS | EXTENT_ORDERED, 0)) 2853 EXTENT_IOBITS, 0, NULL))
2746 ret = 0; 2854 ret = 0;
2747 else { 2855 else {
2748 if ((mask & GFP_NOFS) == GFP_NOFS) 2856 if ((mask & GFP_NOFS) == GFP_NOFS)
2749 mask = GFP_NOFS; 2857 mask = GFP_NOFS;
2750 clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 2858 /*
2751 1, 1, mask); 2859 * at this point we can safely clear everything except the
2860 * locked bit and the nodatasum bit
2861 */
2862 clear_extent_bit(tree, start, end,
2863 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2864 0, 0, NULL, mask);
2752 } 2865 }
2753 return ret; 2866 return ret;
2754} 2867}
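try_release_extent_state() now expresses "clear everything except the locked bit and the nodatasum bit" by passing the complement of the keep-set as the bits argument. The bit trick in isolation; the bit positions below are illustrative, the real values live in extent_io.h:

    #include <stdio.h>

    #define EXTENT_LOCKED     (1 << 0)
    #define EXTENT_DELALLOC   (1 << 1)
    #define EXTENT_UPTODATE   (1 << 2)
    #define EXTENT_NODATASUM  (1 << 3)

    int main(void)
    {
        int state = EXTENT_LOCKED | EXTENT_DELALLOC |
                    EXTENT_UPTODATE | EXTENT_NODATASUM;

        /* pass the complement of the keep-set as the bits to clear */
        int clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM);
        state &= ~clear_bits;

        printf("state=%#x\n", state);   /* only LOCKED|NODATASUM remain */
        return 0;
    }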
@@ -2771,29 +2884,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2771 u64 len; 2884 u64 len;
2772 while (start <= end) { 2885 while (start <= end) {
2773 len = end - start + 1; 2886 len = end - start + 1;
2774 spin_lock(&map->lock); 2887 write_lock(&map->lock);
2775 em = lookup_extent_mapping(map, start, len); 2888 em = lookup_extent_mapping(map, start, len);
2776 if (!em || IS_ERR(em)) { 2889 if (!em || IS_ERR(em)) {
2777 spin_unlock(&map->lock); 2890 write_unlock(&map->lock);
2778 break; 2891 break;
2779 } 2892 }
2780 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2893 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2781 em->start != start) { 2894 em->start != start) {
2782 spin_unlock(&map->lock); 2895 write_unlock(&map->lock);
2783 free_extent_map(em); 2896 free_extent_map(em);
2784 break; 2897 break;
2785 } 2898 }
2786 if (!test_range_bit(tree, em->start, 2899 if (!test_range_bit(tree, em->start,
2787 extent_map_end(em) - 1, 2900 extent_map_end(em) - 1,
2788 EXTENT_LOCKED | EXTENT_WRITEBACK | 2901 EXTENT_LOCKED | EXTENT_WRITEBACK,
2789 EXTENT_ORDERED, 2902 0, NULL)) {
2790 0)) {
2791 remove_extent_mapping(map, em); 2903 remove_extent_mapping(map, em);
2792 /* once for the rb tree */ 2904 /* once for the rb tree */
2793 free_extent_map(em); 2905 free_extent_map(em);
2794 } 2906 }
2795 start = extent_map_end(em); 2907 start = extent_map_end(em);
2796 spin_unlock(&map->lock); 2908 write_unlock(&map->lock);
2797 2909
2798 /* once for us */ 2910 /* once for us */
2799 free_extent_map(em); 2911 free_extent_map(em);
@@ -3203,7 +3315,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3203 int uptodate; 3315 int uptodate;
3204 unsigned long index; 3316 unsigned long index;
3205 3317
3206 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); 3318 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3207 if (ret) 3319 if (ret)
3208 return 1; 3320 return 1;
3209 while (start <= end) { 3321 while (start <= end) {
@@ -3233,7 +3345,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3233 return 1; 3345 return 1;
3234 3346
3235 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3347 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3236 EXTENT_UPTODATE, 1); 3348 EXTENT_UPTODATE, 1, NULL);
3237 if (ret) 3349 if (ret)
3238 return ret; 3350 return ret;
3239 3351
@@ -3269,7 +3381,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3269 return 0; 3381 return 0;
3270 3382
3271 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3383 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3272 EXTENT_UPTODATE, 1)) { 3384 EXTENT_UPTODATE, 1, NULL)) {
3273 return 0; 3385 return 0;
3274 } 3386 }
3275 3387
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5bc20abf3f3d..4794ec891fed 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,10 +13,8 @@
13#define EXTENT_DEFRAG (1 << 6) 13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7) 14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8) 15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_NODATASUM (1 << 12)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 18#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21 19
22/* flags for bio submission */ 20/* flags for bio submission */
@@ -62,8 +60,13 @@ struct extent_io_ops {
62 struct extent_state *state, int uptodate); 60 struct extent_state *state, int uptodate);
63 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
64 unsigned long old, unsigned long bits); 62 unsigned long old, unsigned long bits);
65 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, 63 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
66 unsigned long old, unsigned long bits); 64 unsigned long bits);
65 int (*merge_extent_hook)(struct inode *inode,
66 struct extent_state *new,
67 struct extent_state *other);
68 int (*split_extent_hook)(struct inode *inode,
69 struct extent_state *orig, u64 split);
67 int (*write_cache_pages_lock_hook)(struct page *page); 70 int (*write_cache_pages_lock_hook)(struct page *page);
68}; 71};
69 72
@@ -81,10 +84,14 @@ struct extent_state {
81 u64 start; 84 u64 start;
82 u64 end; /* inclusive */ 85 u64 end; /* inclusive */
83 struct rb_node rb_node; 86 struct rb_node rb_node;
87
88 /* ADD NEW ELEMENTS AFTER THIS */
84 struct extent_io_tree *tree; 89 struct extent_io_tree *tree;
85 wait_queue_head_t wq; 90 wait_queue_head_t wq;
86 atomic_t refs; 91 atomic_t refs;
87 unsigned long state; 92 unsigned long state;
93 u64 split_start;
94 u64 split_end;
88 95
89 /* for use by the FS */ 96 /* for use by the FS */
90 u64 private; 97 u64 private;
@@ -142,6 +149,8 @@ int try_release_extent_state(struct extent_map_tree *map,
142 struct extent_io_tree *tree, struct page *page, 149 struct extent_io_tree *tree, struct page *page,
143 gfp_t mask); 150 gfp_t mask);
144int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); 151int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
152int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
153 int bits, struct extent_state **cached, gfp_t mask);
145int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); 154int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
146int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 155int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
147 gfp_t mask); 156 gfp_t mask);
@@ -155,11 +164,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
155 u64 max_bytes, unsigned long bits); 164 u64 max_bytes, unsigned long bits);
156 165
157int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 166int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
158 int bits, int filled); 167 int bits, int filled, struct extent_state *cached_state);
159int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 168int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
160 int bits, gfp_t mask); 169 int bits, gfp_t mask);
161int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 170int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
162 int bits, int wake, int delete, gfp_t mask); 171 int bits, int wake, int delete, struct extent_state **cached,
172 gfp_t mask);
163int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 173int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
164 int bits, gfp_t mask); 174 int bits, gfp_t mask);
165int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 175int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -282,5 +292,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
282 int clear_unlock, 292 int clear_unlock,
283 int clear_delalloc, int clear_dirty, 293 int clear_delalloc, int clear_dirty,
284 int set_writeback, 294 int set_writeback,
285 int end_writeback); 295 int end_writeback,
296 int set_private2);
286#endif 297#endif
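The header changes thread an optional struct extent_state **cached through the lock and clear paths; the cached state is reference-counted, and callers such as __extent_writepage drop their reference with free_extent_state() when done. A toy, single-threaded model of that drop-reference discipline, assuming (as the call with a possibly-unset cache suggests) that freeing a NULL cache is a no-op; the kernel version uses atomic_t:

    #include <stdio.h>
    #include <stdlib.h>

    /* toy refcounted state mirroring extent_state's refs field */
    struct state { int refs; };

    static struct state *state_get(struct state *s)
    {
        s->refs++;
        return s;
    }

    static void free_state(struct state *s)
    {
        if (!s)
            return;              /* a never-filled cache is a no-op */
        if (--s->refs == 0)
            free(s);
    }

    int main(void)
    {
        struct state *s = malloc(sizeof(*s));
        struct state *cached;

        s->refs = 1;             /* reference held by the "tree" */
        cached = state_get(s);   /* lock path fills the cache: refs == 2 */

        free_state(cached);      /* caller drops its cached reference */
        free_state(s);           /* tree drops the last one; freed here */
        free_state(NULL);        /* harmless, like an unset cache */
        printf("references balanced\n");
        return 0;
    }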
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 30c9365861e6..2c726b7b9faa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -36,7 +36,7 @@ void extent_map_exit(void)
36void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) 36void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
37{ 37{
38 tree->map.rb_node = NULL; 38 tree->map.rb_node = NULL;
39 spin_lock_init(&tree->lock); 39 rwlock_init(&tree->lock);
40} 40}
41 41
42/** 42/**
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
198 return 0; 198 return 0;
199} 199}
200 200
201int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
202{
203 int ret = 0;
204 struct extent_map *merge = NULL;
205 struct rb_node *rb;
206 struct extent_map *em;
207
208 write_lock(&tree->lock);
209 em = lookup_extent_mapping(tree, start, len);
210
211 WARN_ON(!em || em->start != start);
212
213 if (!em)
214 goto out;
215
216 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
217
218 if (em->start != 0) {
219 rb = rb_prev(&em->rb_node);
220 if (rb)
221 merge = rb_entry(rb, struct extent_map, rb_node);
222 if (rb && mergable_maps(merge, em)) {
223 em->start = merge->start;
224 em->len += merge->len;
225 em->block_len += merge->block_len;
226 em->block_start = merge->block_start;
227 merge->in_tree = 0;
228 rb_erase(&merge->rb_node, &tree->map);
229 free_extent_map(merge);
230 }
231 }
232
233 rb = rb_next(&em->rb_node);
234 if (rb)
235 merge = rb_entry(rb, struct extent_map, rb_node);
236 if (rb && mergable_maps(em, merge)) {
237 em->len += merge->len;
238 em->block_len += merge->len;
239 rb_erase(&merge->rb_node, &tree->map);
240 merge->in_tree = 0;
241 free_extent_map(merge);
242 }
243
244 free_extent_map(em);
245out:
246 write_unlock(&tree->lock);
247 return ret;
248
249}
250
201/** 251/**
202 * add_extent_mapping - add new extent map to the extent tree 252 * add_extent_mapping - add new extent map to the extent tree
203 * @tree: tree to insert new map in 253 * @tree: tree to insert new map in
@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
222 ret = -EEXIST; 272 ret = -EEXIST;
223 goto out; 273 goto out;
224 } 274 }
225 assert_spin_locked(&tree->lock);
226 rb = tree_insert(&tree->map, em->start, &em->rb_node); 275 rb = tree_insert(&tree->map, em->start, &em->rb_node);
227 if (rb) { 276 if (rb) {
228 ret = -EEXIST; 277 ret = -EEXIST;
@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
285 struct rb_node *next = NULL; 334 struct rb_node *next = NULL;
286 u64 end = range_end(start, len); 335 u64 end = range_end(start, len);
287 336
288 assert_spin_locked(&tree->lock);
289 rb_node = __tree_search(&tree->map, start, &prev, &next); 337 rb_node = __tree_search(&tree->map, start, &prev, &next);
290 if (!rb_node && prev) { 338 if (!rb_node && prev) {
291 em = rb_entry(prev, struct extent_map, rb_node); 339 em = rb_entry(prev, struct extent_map, rb_node);
@@ -319,6 +367,54 @@ out:
319} 367}
320 368
321/** 369/**
370 * search_extent_mapping - find a nearby extent map
371 * @tree: tree to lookup in
372 * @start: byte offset to start the search
373 * @len: length of the lookup range
374 *
375 * Find and return the first extent_map struct in @tree that intersects the
376 * [start, len] range.
377 *
378 * If one can't be found, any nearby extent may be returned
379 */
380struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
381 u64 start, u64 len)
382{
383 struct extent_map *em;
384 struct rb_node *rb_node;
385 struct rb_node *prev = NULL;
386 struct rb_node *next = NULL;
387
388 rb_node = __tree_search(&tree->map, start, &prev, &next);
389 if (!rb_node && prev) {
390 em = rb_entry(prev, struct extent_map, rb_node);
391 goto found;
392 }
393 if (!rb_node && next) {
394 em = rb_entry(next, struct extent_map, rb_node);
395 goto found;
396 }
397 if (!rb_node) {
398 em = NULL;
399 goto out;
400 }
401 if (IS_ERR(rb_node)) {
402 em = ERR_PTR(PTR_ERR(rb_node));
403 goto out;
404 }
405 em = rb_entry(rb_node, struct extent_map, rb_node);
406 goto found;
407
408 em = NULL;
409 goto out;
410
411found:
412 atomic_inc(&em->refs);
413out:
414 return em;
415}
416
417/**
322 * remove_extent_mapping - removes an extent_map from the extent tree 418 * remove_extent_mapping - removes an extent_map from the extent tree
323 * @tree: extent tree to remove from 419 * @tree: extent tree to remove from
324 * @em: extent map being removed 420
@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
331 int ret = 0; 427 int ret = 0;
332 428
333 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 429 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
334 assert_spin_locked(&tree->lock);
335 rb_erase(&em->rb_node, &tree->map); 430 rb_erase(&em->rb_node, &tree->map);
336 em->in_tree = 0; 431 em->in_tree = 0;
337 return ret; 432 return ret;
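unpin_extent_cache() above clears EXTENT_FLAG_PINNED and then tries to coalesce the map with its rb_prev()/rb_next() neighbours. The merge arithmetic on its own, with a flat array of ranges standing in for the rb-tree and a contiguity check standing in for mergable_maps():

    #include <stdio.h>

    struct map { unsigned long long start, len; };

    /* contiguity check standing in for mergable_maps() */
    static int mergable(const struct map *prev, const struct map *next)
    {
        return prev->start + prev->len == next->start;
    }

    int main(void)
    {
        /* em and its neighbours, as rb_prev()/rb_next() would yield */
        struct map prev = { 0, 4096 };
        struct map em   = { 4096, 4096 };
        struct map next = { 8192, 4096 };

        if (mergable(&prev, &em)) {        /* absorb the left neighbour */
            em.start = prev.start;
            em.len  += prev.len;
        }
        if (mergable(&em, &next))          /* absorb the right neighbour */
            em.len += next.len;

        printf("merged: start=%llu len=%llu\n", em.start, em.len);
        return 0;
    }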
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index fb6eeef06bb0..ab6d74b6e647 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -31,7 +31,7 @@ struct extent_map {
31 31
32struct extent_map_tree { 32struct extent_map_tree {
33 struct rb_root map; 33 struct rb_root map;
34 spinlock_t lock; 34 rwlock_t lock;
35}; 35};
36 36
37static inline u64 extent_map_end(struct extent_map *em) 37static inline u64 extent_map_end(struct extent_map *em)
@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em); 59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void); 60int __init extent_map_init(void);
61void extent_map_exit(void); 61void extent_map_exit(void);
62int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
63struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
64 u64 start, u64 len);
62#endif 65#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4b833972273a..f19e1259a971 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
112 int err = 0; 112 int err = 0;
113 int i; 113 int i;
114 struct inode *inode = fdentry(file)->d_inode; 114 struct inode *inode = fdentry(file)->d_inode;
115 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
116 u64 hint_byte;
117 u64 num_bytes; 115 u64 num_bytes;
118 u64 start_pos; 116 u64 start_pos;
119 u64 end_of_last_block; 117 u64 end_of_last_block;
@@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
125 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
126 124
127 end_of_last_block = start_pos + num_bytes - 1; 125 end_of_last_block = start_pos + num_bytes - 1;
126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
127 if (err)
128 return err;
128 129
129 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
130 trans = btrfs_join_transaction(root, 1);
131 if (!trans) {
132 err = -ENOMEM;
133 goto out_unlock;
134 }
135 btrfs_set_trans_block_group(trans, inode);
136 hint_byte = 0;
137
138 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
139
140 /* check for reserved extents on each page, we don't want
141 * to reset the delalloc bit on things that already have
142 * extents reserved.
143 */
144 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
145 for (i = 0; i < num_pages; i++) { 130 for (i = 0; i < num_pages; i++) {
146 struct page *p = pages[i]; 131 struct page *p = pages[i];
147 SetPageUptodate(p); 132 SetPageUptodate(p);
@@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
155 * at this time. 140 * at this time.
156 */ 141 */
157 } 142 }
158 err = btrfs_end_transaction(trans, root);
159out_unlock:
160 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
161 return err; 143 return err;
162} 144}
163 145
@@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
189 if (!split2) 171 if (!split2)
190 split2 = alloc_extent_map(GFP_NOFS); 172 split2 = alloc_extent_map(GFP_NOFS);
191 173
192 spin_lock(&em_tree->lock); 174 write_lock(&em_tree->lock);
193 em = lookup_extent_mapping(em_tree, start, len); 175 em = lookup_extent_mapping(em_tree, start, len);
194 if (!em) { 176 if (!em) {
195 spin_unlock(&em_tree->lock); 177 write_unlock(&em_tree->lock);
196 break; 178 break;
197 } 179 }
198 flags = em->flags; 180 flags = em->flags;
199 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 181 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
200 spin_unlock(&em_tree->lock);
201 if (em->start <= start && 182 if (em->start <= start &&
202 (!testend || em->start + em->len >= start + len)) { 183 (!testend || em->start + em->len >= start + len)) {
203 free_extent_map(em); 184 free_extent_map(em);
185 write_unlock(&em_tree->lock);
204 break; 186 break;
205 } 187 }
206 if (start < em->start) { 188 if (start < em->start) {
@@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
210 start = em->start + em->len; 192 start = em->start + em->len;
211 } 193 }
212 free_extent_map(em); 194 free_extent_map(em);
195 write_unlock(&em_tree->lock);
213 continue; 196 continue;
214 } 197 }
215 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 198 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
260 free_extent_map(split); 243 free_extent_map(split);
261 split = NULL; 244 split = NULL;
262 } 245 }
263 spin_unlock(&em_tree->lock); 246 write_unlock(&em_tree->lock);
264 247
265 /* once for us */ 248 /* once for us */
266 free_extent_map(em); 249 free_extent_map(em);
@@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
289noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, 272noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, struct inode *inode, 273 struct btrfs_root *root, struct inode *inode,
291 u64 start, u64 end, u64 locked_end, 274 u64 start, u64 end, u64 locked_end,
292 u64 inline_limit, u64 *hint_byte) 275 u64 inline_limit, u64 *hint_byte, int drop_cache)
293{ 276{
294 u64 extent_end = 0; 277 u64 extent_end = 0;
295 u64 search_start = start; 278 u64 search_start = start;
@@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
314 int ret; 297 int ret;
315 298
316 inline_limit = 0; 299 inline_limit = 0;
317 btrfs_drop_extent_cache(inode, start, end - 1, 0); 300 if (drop_cache)
301 btrfs_drop_extent_cache(inode, start, end - 1, 0);
318 302
319 path = btrfs_alloc_path(); 303 path = btrfs_alloc_path();
320 if (!path) 304 if (!path)
@@ -936,21 +920,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
936 start_pos = pos; 920 start_pos = pos;
937 921
938 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 922 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
923
924 /* do the reserve before the mutex lock in case we have to do some
925 * flushing. We wouldn't deadlock, but this is more polite.
926 */
927 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
928 if (err)
929 goto out_nolock;
930
931 mutex_lock(&inode->i_mutex);
932
939 current->backing_dev_info = inode->i_mapping->backing_dev_info; 933 current->backing_dev_info = inode->i_mapping->backing_dev_info;
940 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 934 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
941 if (err) 935 if (err)
942 goto out_nolock; 936 goto out;
937
943 if (count == 0) 938 if (count == 0)
944 goto out_nolock; 939 goto out;
945 940
946 err = file_remove_suid(file); 941 err = file_remove_suid(file);
947 if (err) 942 if (err)
948 goto out_nolock; 943 goto out;
944
949 file_update_time(file); 945 file_update_time(file);
950 946
951 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 947 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
952 948
953 mutex_lock(&inode->i_mutex); 949 /* generic_write_checks can change our pos */
950 start_pos = pos;
951
954 BTRFS_I(inode)->sequence++; 952 BTRFS_I(inode)->sequence++;
955 first_index = pos >> PAGE_CACHE_SHIFT; 953 first_index = pos >> PAGE_CACHE_SHIFT;
956 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 954 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1024,9 +1022,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1024 } 1022 }
1025 1023
1026 if (will_write) { 1024 if (will_write) {
1027 btrfs_fdatawrite_range(inode->i_mapping, pos, 1025 filemap_fdatawrite_range(inode->i_mapping, pos,
1028 pos + write_bytes - 1, 1026 pos + write_bytes - 1);
1029 WB_SYNC_ALL);
1030 } else { 1027 } else {
1031 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1028 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1032 num_pages); 1029 num_pages);
@@ -1047,6 +1044,7 @@ out:
1047 mutex_unlock(&inode->i_mutex); 1044 mutex_unlock(&inode->i_mutex);
1048 if (ret) 1045 if (ret)
1049 err = ret; 1046 err = ret;
1047 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1050 1048
1051out_nolock: 1049out_nolock:
1052 kfree(pages); 1050 kfree(pages);
@@ -1203,7 +1201,7 @@ out:
1203 return ret > 0 ? EIO : ret; 1201 return ret > 0 ? EIO : ret;
1204} 1202}
1205 1203
1206static struct vm_operations_struct btrfs_file_vm_ops = { 1204static const struct vm_operations_struct btrfs_file_vm_ops = {
1207 .fault = filemap_fault, 1205 .fault = filemap_fault,
1208 .page_mkwrite = btrfs_page_mkwrite, 1206 .page_mkwrite = btrfs_page_mkwrite,
1209}; 1207};
@@ -1215,7 +1213,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1215 return 0; 1213 return 0;
1216} 1214}
1217 1215
1218struct file_operations btrfs_file_operations = { 1216const struct file_operations btrfs_file_operations = {
1219 .llseek = generic_file_llseek, 1217 .llseek = generic_file_llseek,
1220 .read = do_sync_read, 1218 .read = do_sync_read,
1221 .aio_read = generic_file_aio_read, 1219 .aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5edcee3a617f..5c2caad76212 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
259 259
260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) 260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261{ 261{
262 u64 max_bytes, possible_bytes; 262 u64 max_bytes;
263 u64 bitmap_bytes;
264 u64 extent_bytes;
263 265
264 /* 266 /*
265 * The goal is to keep the total amount of memory used per 1gb of space 267 * The goal is to keep the total amount of memory used per 1gb of space
@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
269 max_bytes = MAX_CACHE_BYTES_PER_GIG * 271 max_bytes = MAX_CACHE_BYTES_PER_GIG *
270 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 272 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271 273
272 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + 274 /*
273 (sizeof(struct btrfs_free_space) * 275 * we want to account for 1 more bitmap than what we have so we can make
274 block_group->extents_thresh); 276 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
277 * we add more bitmaps.
278 */
279 bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
275 280
276 if (possible_bytes > max_bytes) { 281 if (bitmap_bytes >= max_bytes) {
277 int extent_bytes = max_bytes - 282 block_group->extents_thresh = 0;
278 (block_group->total_bitmaps * PAGE_CACHE_SIZE); 283 return;
284 }
279 285
280 if (extent_bytes <= 0) { 286 /*
281 block_group->extents_thresh = 0; 287 * we want the extent entry threshold to always be at most 1/2 the maxw
282 return; 288 * bytes we can have, or whatever is less than that.
283 } 289 */
290 extent_bytes = max_bytes - bitmap_bytes;
291 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
284 292
285 block_group->extents_thresh = extent_bytes / 293 block_group->extents_thresh =
286 (sizeof(struct btrfs_free_space)); 294 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
287 }
288} 295}
289 296
290static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, 297static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
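The rewritten threshold logic budgets for one bitmap beyond what the group currently holds, then lets extent entries use whatever is left, capped at half the total budget. The arithmetic in standalone form; the constant and struct-size values below are illustrative, not the kernel's:

    #include <stdio.h>

    #define PAGE_CACHE_SIZE 4096ULL
    #define MAX_CACHE_BYTES_PER_GIG (32ULL * 1024)  /* illustrative */

    int main(void)
    {
        unsigned long long group_size = 1ULL << 30; /* 1GiB block group */
        unsigned long long total_bitmaps = 3;
        unsigned long long free_space_entry = 48;   /* sizeof stand-in */

        unsigned long long max_bytes = MAX_CACHE_BYTES_PER_GIG *
            (group_size / (1024ULL * 1024 * 1024));

        /* budget one bitmap beyond what we hold, as the comment says */
        unsigned long long bitmap_bytes =
            (total_bitmaps + 1) * PAGE_CACHE_SIZE;
        if (bitmap_bytes >= max_bytes) {
            printf("extents_thresh=0\n");
            return 0;
        }

        /* extents may use what is left, capped at half the budget */
        unsigned long long extent_bytes = max_bytes - bitmap_bytes;
        if (extent_bytes > max_bytes / 2)
            extent_bytes = max_bytes / 2;

        printf("extents_thresh=%llu\n", extent_bytes / free_space_entry);
        return 0;
    }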
@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
403 BUG_ON(block_group->total_bitmaps >= max_bitmaps); 410 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404 411
405 info->offset = offset_to_bitmap(block_group, offset); 412 info->offset = offset_to_bitmap(block_group, offset);
413 info->bytes = 0;
406 link_free_space(block_group, info); 414 link_free_space(block_group, info);
407 block_group->total_bitmaps++; 415 block_group->total_bitmaps++;
408 416
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 6b627c611808..72ce3c173d6a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
149 ptr = (unsigned long)(ref + 1); 149 ptr = (unsigned long)(ref + 1);
150 ret = 0; 150 ret = 0;
151 } else if (ret < 0) { 151 } else if (ret < 0) {
152 if (ret == -EOVERFLOW)
153 ret = -EMLINK;
152 goto out; 154 goto out;
153 } else { 155 } else {
154 ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 156 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
177 179
178 ret = btrfs_insert_empty_item(trans, root, path, &key, 180 ret = btrfs_insert_empty_item(trans, root, path, &key,
179 sizeof(struct btrfs_inode_item)); 181 sizeof(struct btrfs_inode_item));
180 if (ret == 0 && objectid > root->highest_inode)
181 root->highest_inode = objectid;
182 return ret; 182 return ret;
183} 183}
184 184
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 9abbced1123d..c56eb5909172 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
43 slot = path->slots[0] - 1; 43 slot = path->slots[0] - 1;
44 l = path->nodes[0]; 44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot); 45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid; 46 *objectid = max_t(u64, found_key.objectid,
47 BTRFS_FIRST_FREE_OBJECTID - 1);
47 } else { 48 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID; 49 *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
49 } 50 }
50 ret = 0; 51 ret = 0;
51error: 52error:
@@ -53,91 +54,27 @@ error:
53 return ret; 54 return ret;
54} 55}
55 56
56/*
57 * walks the btree of allocated inodes and find a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, 57int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root, 58 struct btrfs_root *root,
61 u64 dirid, u64 *objectid) 59 u64 dirid, u64 *objectid)
62{ 60{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret; 61 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex); 62 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 start_found = 0;
88 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
89 if (ret < 0)
90 goto error;
91 63
92 while (1) { 64 if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
93 l = path->nodes[0]; 65 ret = btrfs_find_highest_inode(root, &root->highest_objectid);
94 slot = path->slots[0]; 66 if (ret)
95 if (slot >= btrfs_header_nritems(l)) { 67 goto out;
96 ret = btrfs_next_leaf(root, path); 68 }
97 if (ret == 0)
98 continue;
99 if (ret < 0)
100 goto error;
101 if (!start_found) {
102 *objectid = search_start;
103 start_found = 1;
104 goto found;
105 }
106 *objectid = last_ino > search_start ?
107 last_ino : search_start;
108 goto found;
109 }
110 btrfs_item_key_to_cpu(l, &key, slot);
111 if (key.objectid >= search_start) {
112 if (start_found) {
113 if (last_ino < search_start)
114 last_ino = search_start;
115 if (key.objectid > last_ino) {
116 *objectid = last_ino;
117 goto found;
118 }
119 } else if (key.objectid > search_start) {
120 *objectid = search_start;
121 goto found;
122 }
123 }
124 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
125 break;
126 69
127 start_found = 1; 70 if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
128 last_ino = key.objectid + 1; 71 ret = -ENOSPC;
129 path->slots[0]++; 72 goto out;
130 } 73 }
131 BUG_ON(1); 74
132found: 75 *objectid = ++root->highest_objectid;
133 btrfs_release_path(root, path); 76 ret = 0;
134 btrfs_free_path(path); 77out:
135 BUG_ON(*objectid < search_start);
136 mutex_unlock(&root->objectid_mutex);
137 return 0;
138error:
139 btrfs_release_path(root, path);
140 btrfs_free_path(path);
141 mutex_unlock(&root->objectid_mutex); 78 mutex_unlock(&root->objectid_mutex);
142 return ret; 79 return ret;
143} 80}
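The rewritten btrfs_find_free_objectid() drops the btree hole-walk in favour of a cached counter: lazily initialised from the highest on-disk objectid, bounds-checked, then post-incremented under the objectid mutex. A toy single-threaded model of that allocator; the constants are simplified and the locking is omitted:

    #include <stdio.h>

    #define FIRST_FREE 256ULL      /* stands in for BTRFS_FIRST_FREE_OBJECTID */
    #define LAST_FREE  (~0ULL - 256)

    static unsigned long long highest_objectid;  /* 0 = not initialised */

    /* hypothetical stand-in for btrfs_find_highest_inode() */
    static int find_highest(unsigned long long *out)
    {
        *out = FIRST_FREE - 1;     /* empty tree: start just below FIRST_FREE */
        return 0;
    }

    static int find_free_objectid(unsigned long long *objectid)
    {
        /* lazy init from the on-disk tree, exactly once */
        if (highest_objectid < FIRST_FREE) {
            int ret = find_highest(&highest_objectid);
            if (ret)
                return ret;
        }
        if (highest_objectid >= LAST_FREE)
            return -28;            /* -ENOSPC */
        *objectid = ++highest_objectid;
        return 0;
    }

    int main(void)
    {
        unsigned long long ino;
        for (int i = 0; i < 3; i++)
            if (!find_free_objectid(&ino))
                printf("allocated objectid %llu\n", ino);
        return 0;
    }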
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59cba180fe83..112e5aa85892 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -55,14 +55,14 @@ struct btrfs_iget_args {
55 struct btrfs_root *root; 55 struct btrfs_root *root;
56}; 56};
57 57
58static struct inode_operations btrfs_dir_inode_operations; 58static const struct inode_operations btrfs_dir_inode_operations;
59static struct inode_operations btrfs_symlink_inode_operations; 59static const struct inode_operations btrfs_symlink_inode_operations;
60static struct inode_operations btrfs_dir_ro_inode_operations; 60static const struct inode_operations btrfs_dir_ro_inode_operations;
61static struct inode_operations btrfs_special_inode_operations; 61static const struct inode_operations btrfs_special_inode_operations;
62static struct inode_operations btrfs_file_inode_operations; 62static const struct inode_operations btrfs_file_inode_operations;
63static struct address_space_operations btrfs_aops; 63static const struct address_space_operations btrfs_aops;
64static struct address_space_operations btrfs_symlink_aops; 64static const struct address_space_operations btrfs_symlink_aops;
65static struct file_operations btrfs_dir_file_operations; 65static const struct file_operations btrfs_dir_file_operations;
66static struct extent_io_ops btrfs_extent_io_ops; 66static struct extent_io_ops btrfs_extent_io_ops;
67 67
68static struct kmem_cache *btrfs_inode_cachep; 68static struct kmem_cache *btrfs_inode_cachep;
@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
231 } 231 }
232 232
233 ret = btrfs_drop_extents(trans, root, inode, start, 233 ret = btrfs_drop_extents(trans, root, inode, start,
234 aligned_end, aligned_end, start, &hint_byte); 234 aligned_end, aligned_end, start,
235 &hint_byte, 1);
235 BUG_ON(ret); 236 BUG_ON(ret);
236 237
237 if (isize > actual_end) 238 if (isize > actual_end)
@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
240 inline_len, compressed_size, 241 inline_len, compressed_size,
241 compressed_pages); 242 compressed_pages);
242 BUG_ON(ret); 243 BUG_ON(ret);
243 btrfs_drop_extent_cache(inode, start, aligned_end, 0); 244 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
244 return 0; 245 return 0;
245} 246}
246 247
@@ -425,7 +426,7 @@ again:
425 extent_clear_unlock_delalloc(inode, 426 extent_clear_unlock_delalloc(inode,
426 &BTRFS_I(inode)->io_tree, 427 &BTRFS_I(inode)->io_tree,
427 start, end, NULL, 1, 0, 428 start, end, NULL, 1, 0,
428 0, 1, 1, 1); 429 0, 1, 1, 1, 0);
429 ret = 0; 430 ret = 0;
430 goto free_pages_out; 431 goto free_pages_out;
431 } 432 }
@@ -611,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
611 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 612 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
612 613
613 while (1) { 614 while (1) {
614 spin_lock(&em_tree->lock); 615 write_lock(&em_tree->lock);
615 ret = add_extent_mapping(em_tree, em); 616 ret = add_extent_mapping(em_tree, em);
616 spin_unlock(&em_tree->lock); 617 write_unlock(&em_tree->lock);
617 if (ret != -EEXIST) { 618 if (ret != -EEXIST) {
618 free_extent_map(em); 619 free_extent_map(em);
619 break; 620 break;
@@ -640,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
640 async_extent->start, 641 async_extent->start,
641 async_extent->start + 642 async_extent->start +
642 async_extent->ram_size - 1, 643 async_extent->ram_size - 1,
643 NULL, 1, 1, 0, 1, 1, 0); 644 NULL, 1, 1, 0, 1, 1, 0, 0);
644 645
645 ret = btrfs_submit_compressed_write(inode, 646 ret = btrfs_submit_compressed_write(inode,
646 async_extent->start, 647 async_extent->start,
@@ -713,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
713 extent_clear_unlock_delalloc(inode, 714 extent_clear_unlock_delalloc(inode,
714 &BTRFS_I(inode)->io_tree, 715 &BTRFS_I(inode)->io_tree,
715 start, end, NULL, 1, 1, 716 start, end, NULL, 1, 1,
716 1, 1, 1, 1); 717 1, 1, 1, 1, 0);
717 *nr_written = *nr_written + 718 *nr_written = *nr_written +
718 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 719 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
719 *page_started = 1; 720 *page_started = 1;
@@ -725,6 +726,15 @@ static noinline int cow_file_range(struct inode *inode,
725 BUG_ON(disk_num_bytes > 726 BUG_ON(disk_num_bytes >
726 btrfs_super_total_bytes(&root->fs_info->super_copy)); 727 btrfs_super_total_bytes(&root->fs_info->super_copy));
727 728
729
730 read_lock(&BTRFS_I(inode)->extent_tree.lock);
731 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
732 start, num_bytes);
733 if (em) {
734 alloc_hint = em->block_start;
735 free_extent_map(em);
736 }
737 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
728 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 738 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
729 739
730 while (disk_num_bytes > 0) { 740 while (disk_num_bytes > 0) {
@@ -737,7 +747,6 @@ static noinline int cow_file_range(struct inode *inode,
737 em = alloc_extent_map(GFP_NOFS); 747 em = alloc_extent_map(GFP_NOFS);
738 em->start = start; 748 em->start = start;
739 em->orig_start = em->start; 749 em->orig_start = em->start;
740
741 ram_size = ins.offset; 750 ram_size = ins.offset;
742 em->len = ins.offset; 751 em->len = ins.offset;
743 752
@@ -747,9 +756,9 @@ static noinline int cow_file_range(struct inode *inode,
747 set_bit(EXTENT_FLAG_PINNED, &em->flags); 756 set_bit(EXTENT_FLAG_PINNED, &em->flags);
748 757
749 while (1) { 758 while (1) {
750 spin_lock(&em_tree->lock); 759 write_lock(&em_tree->lock);
751 ret = add_extent_mapping(em_tree, em); 760 ret = add_extent_mapping(em_tree, em);
752 spin_unlock(&em_tree->lock); 761 write_unlock(&em_tree->lock);
753 if (ret != -EEXIST) { 762 if (ret != -EEXIST) {
754 free_extent_map(em); 763 free_extent_map(em);
755 break; 764 break;
@@ -776,11 +785,14 @@ static noinline int cow_file_range(struct inode *inode,
776 /* we're not doing compressed IO, don't unlock the first 785 /* we're not doing compressed IO, don't unlock the first
777 * page (which the caller expects to stay locked), don't 786 * page (which the caller expects to stay locked), don't
778 * clear any dirty bits and don't set any writeback bits 787 * clear any dirty bits and don't set any writeback bits
788 *
789 * Do set the Private2 bit so we know this page was properly
790 * setup for writepage
779 */ 791 */
780 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 792 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
781 start, start + ram_size - 1, 793 start, start + ram_size - 1,
782 locked_page, unlock, 1, 794 locked_page, unlock, 1,
783 1, 0, 0, 0); 795 1, 0, 0, 0, 1);
784 disk_num_bytes -= cur_alloc_size; 796 disk_num_bytes -= cur_alloc_size;
785 num_bytes -= cur_alloc_size; 797 num_bytes -= cur_alloc_size;
786 alloc_hint = ins.objectid + ins.offset; 798 alloc_hint = ins.objectid + ins.offset;
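Setting PagePrivate2 here replaces the old EXTENT_ORDERED tree bit as the "this page was properly set up for writepage" marker; later consumers test-and-clear it so it fires exactly once. A simplified, non-atomic model of the set / test-and-clear handshake (the real helpers operate on page->flags atomically):

    #include <stdio.h>

    /* non-atomic stand-ins for the PagePrivate2 helpers */
    static unsigned long page_flags;
    #define PG_private2 (1UL << 0)

    static void set_page_private2(void)
    {
        page_flags |= PG_private2;
    }

    static int test_clear_page_private2(void)
    {
        int was_set = !!(page_flags & PG_private2);
        page_flags &= ~PG_private2;
        return was_set;
    }

    int main(void)
    {
        set_page_private2();            /* delalloc setup marks the page */
        if (test_clear_page_private2())
            printf("page was set up for writepage\n");
        if (!test_clear_page_private2())
            printf("marker consumed exactly once\n");
        return 0;
    }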
@@ -853,7 +865,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
853 int limit = 10 * 1024 * 1024; 865
854 866
855 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 867 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
856 EXTENT_DELALLOC, 1, 0, GFP_NOFS); 868 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
857 while (start < end) { 869 while (start < end) {
858 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 870 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
859 async_cow->inode = inode; 871 async_cow->inode = inode;
@@ -1080,9 +1092,9 @@ out_check:
1080 em->bdev = root->fs_info->fs_devices->latest_bdev; 1092 em->bdev = root->fs_info->fs_devices->latest_bdev;
1081 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1093 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1082 while (1) { 1094 while (1) {
1083 spin_lock(&em_tree->lock); 1095 write_lock(&em_tree->lock);
1084 ret = add_extent_mapping(em_tree, em); 1096 ret = add_extent_mapping(em_tree, em);
1085 spin_unlock(&em_tree->lock); 1097 write_unlock(&em_tree->lock);
1086 if (ret != -EEXIST) { 1098 if (ret != -EEXIST) {
1087 free_extent_map(em); 1099 free_extent_map(em);
1088 break; 1100 break;
@@ -1101,7 +1113,7 @@ out_check:
1101 1113
1102 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1114 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1103 cur_offset, cur_offset + num_bytes - 1, 1115 cur_offset, cur_offset + num_bytes - 1,
1104 locked_page, 1, 1, 1, 0, 0, 0); 1116 locked_page, 1, 1, 1, 0, 0, 0, 1);
1105 cur_offset = extent_end; 1117 cur_offset = extent_end;
1106 if (cur_offset > end) 1118 if (cur_offset > end)
1107 break; 1119 break;
@@ -1147,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1147 return ret; 1159 return ret;
1148} 1160}
1149 1161
1162static int btrfs_split_extent_hook(struct inode *inode,
1163 struct extent_state *orig, u64 split)
1164{
1165 struct btrfs_root *root = BTRFS_I(inode)->root;
1166 u64 size;
1167
1168 if (!(orig->state & EXTENT_DELALLOC))
1169 return 0;
1170
1171 size = orig->end - orig->start + 1;
1172 if (size > root->fs_info->max_extent) {
1173 u64 num_extents;
1174 u64 new_size;
1175
1176 new_size = orig->end - split + 1;
1177 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1178 root->fs_info->max_extent);
1179
1180 /*
1181 * if we break a large extent up then leave delalloc_extents be,
1182 * since we've already accounted for the large extent.
1183 */
1184 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1185 root->fs_info->max_extent) < num_extents)
1186 return 0;
1187 }
1188
1189 BTRFS_I(inode)->delalloc_extents++;
1190
1191 return 0;
1192}
1193
1194/*
1195 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1196 * extents so we can keep track of new extents that are just merged onto old
1197 * extents, such as when we are doing sequential writes, so we can properly
1198 * account for the metadata space we'll need.
1199 */
1200static int btrfs_merge_extent_hook(struct inode *inode,
1201 struct extent_state *new,
1202 struct extent_state *other)
1203{
1204 struct btrfs_root *root = BTRFS_I(inode)->root;
1205 u64 new_size, old_size;
1206 u64 num_extents;
1207
1208 /* not delalloc, ignore it */
1209 if (!(other->state & EXTENT_DELALLOC))
1210 return 0;
1211
1212 old_size = other->end - other->start + 1;
1213 if (new->start < other->start)
1214 new_size = other->end - new->start + 1;
1215 else
1216 new_size = new->end - other->start + 1;
1217
1218 /* we're not bigger than the max, unreserve the space and go */
1219 if (new_size <= root->fs_info->max_extent) {
1220 BTRFS_I(inode)->delalloc_extents--;
1221 return 0;
1222 }
1223
1224 /*
1225 * If we grew by another max_extent, just return, we want to keep that
1226 * reserved amount.
1227 */
1228 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1229 root->fs_info->max_extent);
1230 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1231 root->fs_info->max_extent) > num_extents)
1232 return 0;
1233
1234 BTRFS_I(inode)->delalloc_extents--;
1235
1236 return 0;
1237}
1238
1150/* 1239/*
1151 * extent_io.c set_bit_hook, used to track delayed allocation 1240 * extent_io.c set_bit_hook, used to track delayed allocation
1152 * bytes in this file, and to maintain the list of inodes that 1241 * bytes in this file, and to maintain the list of inodes that
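Both new hooks rely on the same round-up division, div64_u64(size + max_extent - 1, max_extent), to count how many max_extent-sized pieces a delalloc range costs, and only move the counter when that piece count actually changes. The ceiling-division accounting in isolation; the 128M cap is an illustrative max_extent, not a btrfs constant:

    #include <stdio.h>

    /* round-up division, as div64_u64(size + max - 1, max) does above */
    static unsigned long long nr_extents(unsigned long long size,
                                         unsigned long long max_extent)
    {
        return (size + max_extent - 1) / max_extent;
    }

    int main(void)
    {
        unsigned long long max_extent = 128ULL << 20;  /* illustrative cap */
        unsigned long long size = 300ULL << 20;        /* one delalloc run */

        /* splitting at 200M: does the tail round to fewer pieces? */
        unsigned long long split = 200ULL << 20;
        unsigned long long tail = size - split;

        printf("whole=%llu pieces, tail=%llu pieces\n",
               nr_extents(size, max_extent), nr_extents(tail, max_extent));
        /* whole=3, tail=1: the accounting only changes when the
         * piece count actually grows or shrinks */
        return 0;
    }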
@@ -1155,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1155static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1244static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1156 unsigned long old, unsigned long bits) 1245 unsigned long old, unsigned long bits)
1157{ 1246{
1247
1158 /* 1248 /*
1159 * set_bit and clear bit hooks normally require _irqsave/restore 1249 * set_bit and clear bit hooks normally require _irqsave/restore
1160 * but in this case, we are only testing for the DELALLOC 1250
@@ -1162,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1162 */ 1252 */
1163 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1253 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1164 struct btrfs_root *root = BTRFS_I(inode)->root; 1254 struct btrfs_root *root = BTRFS_I(inode)->root;
1255
1256 BTRFS_I(inode)->delalloc_extents++;
1165 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1257 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1166 spin_lock(&root->fs_info->delalloc_lock); 1258 spin_lock(&root->fs_info->delalloc_lock);
1167 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1259 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1178,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1178/* 1270/*
1179 * extent_io.c clear_bit_hook, see set_bit_hook for why 1271 * extent_io.c clear_bit_hook, see set_bit_hook for why
1180 */ 1272 */
1181static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 1273static int btrfs_clear_bit_hook(struct inode *inode,
1182 unsigned long old, unsigned long bits) 1274 struct extent_state *state, unsigned long bits)
1183{ 1275{
1184 /* 1276 /*
1185 * set_bit and clear bit hooks normally require _irqsave/restore 1277 * set_bit and clear bit hooks normally require _irqsave/restore
1186 * but in this case, we are only testing for the DELALLOC 1278
1187 * bit, which is only set or cleared with irqs on 1279 * bit, which is only set or cleared with irqs on
1188 */ 1280 */
1189 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1281 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1190 struct btrfs_root *root = BTRFS_I(inode)->root; 1282 struct btrfs_root *root = BTRFS_I(inode)->root;
1191 1283
1284 BTRFS_I(inode)->delalloc_extents--;
1285 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1286
1192 spin_lock(&root->fs_info->delalloc_lock); 1287 spin_lock(&root->fs_info->delalloc_lock);
1193 if (end - start + 1 > root->fs_info->delalloc_bytes) { 1288 if (state->end - state->start + 1 >
1289 root->fs_info->delalloc_bytes) {
1194 printk(KERN_INFO "btrfs warning: delalloc account " 1290 printk(KERN_INFO "btrfs warning: delalloc account "
1195 "%llu %llu\n", 1291 "%llu %llu\n",
1196 (unsigned long long)end - start + 1, 1292 (unsigned long long)
1293 state->end - state->start + 1,
1197 (unsigned long long) 1294 (unsigned long long)
1198 root->fs_info->delalloc_bytes); 1295 root->fs_info->delalloc_bytes);
1199 btrfs_delalloc_free_space(root, inode, (u64)-1); 1296 btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1201,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1201 BTRFS_I(inode)->delalloc_bytes = 0; 1298 BTRFS_I(inode)->delalloc_bytes = 0;
1202 } else { 1299 } else {
1203 btrfs_delalloc_free_space(root, inode, 1300 btrfs_delalloc_free_space(root, inode,
1204 end - start + 1); 1301 state->end -
1205 root->fs_info->delalloc_bytes -= end - start + 1; 1302 state->start + 1);
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1303 root->fs_info->delalloc_bytes -= state->end -
1304 state->start + 1;
1305 BTRFS_I(inode)->delalloc_bytes -= state->end -
1306 state->start + 1;
1207 } 1307 }
1208 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1308 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1209 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1309 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -1374,10 +1474,8 @@ again:
1374 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1474 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1375 1475
1376 /* already ordered? We're done */ 1476 /* already ordered? We're done */
1377 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 1477 if (PagePrivate2(page))
1378 EXTENT_ORDERED, 0)) {
1379 goto out; 1478 goto out;
1380 }
1381 1479
1382 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1480 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1383 if (ordered) { 1481 if (ordered) {
@@ -1413,11 +1511,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1413 struct inode *inode = page->mapping->host; 1511 struct inode *inode = page->mapping->host;
1414 struct btrfs_writepage_fixup *fixup; 1512 struct btrfs_writepage_fixup *fixup;
1415 struct btrfs_root *root = BTRFS_I(inode)->root; 1513 struct btrfs_root *root = BTRFS_I(inode)->root;
1416 int ret;
1417 1514
1418 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, 1515 /* this page is properly in the ordered list */
1419 EXTENT_ORDERED, 0); 1516 if (TestClearPagePrivate2(page))
1420 if (ret)
1421 return 0; 1517 return 0;
1422 1518
1423 if (PageChecked(page)) 1519 if (PageChecked(page))
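These two hunks replace the per-range EXTENT_ORDERED bit with the PG_private_2
page flag. TestClearPagePrivate2() is an atomic test-and-clear, so exactly one
caller can claim the ordered work for a page; a sketch of the idiom (the flag
is set when the page enters the ordered list and cleared by whoever finishes
the IO):

        if (TestClearPagePrivate2(page)) {
                /* we won the race: this caller completes the ordered IO */
                btrfs_finish_ordered_io(page->mapping->host, start, end);
        }
        /* otherwise someone else already claimed it, or it was never set */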
@@ -1455,9 +1551,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1455 BUG_ON(!path); 1551 BUG_ON(!path);
1456 1552
1457 path->leave_spinning = 1; 1553 path->leave_spinning = 1;
1554
1555 /*
1556 * we may be replacing one extent in the tree with another.
1557 * The new extent is pinned in the extent map, and we don't want
1558 * to drop it from the cache until it is completely in the btree.
1559 *
1560 * So, tell btrfs_drop_extents to leave this extent in the cache.
	1561	 * The caller is expected to unpin it and allow it to be merged
1562 * with the others.
1563 */
1458 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1564 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1459 file_pos + num_bytes, locked_end, 1565 file_pos + num_bytes, locked_end,
1460 file_pos, &hint); 1566 file_pos, &hint, 0);
1461 BUG_ON(ret); 1567 BUG_ON(ret);
1462 1568
1463 ins.objectid = inode->i_ino; 1569 ins.objectid = inode->i_ino;
@@ -1485,7 +1591,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1485 btrfs_mark_buffer_dirty(leaf); 1591 btrfs_mark_buffer_dirty(leaf);
1486 1592
1487 inode_add_bytes(inode, num_bytes); 1593 inode_add_bytes(inode, num_bytes);
1488 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1489 1594
1490 ins.objectid = disk_bytenr; 1595 ins.objectid = disk_bytenr;
1491 ins.offset = disk_num_bytes; 1596 ins.offset = disk_num_bytes;
@@ -1596,6 +1701,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1596 ordered_extent->len, 1701 ordered_extent->len,
1597 compressed, 0, 0, 1702 compressed, 0, 0,
1598 BTRFS_FILE_EXTENT_REG); 1703 BTRFS_FILE_EXTENT_REG);
1704 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1705 ordered_extent->file_offset,
1706 ordered_extent->len);
1599 BUG_ON(ret); 1707 BUG_ON(ret);
1600 } 1708 }
1601 unlock_extent(io_tree, ordered_extent->file_offset, 1709 unlock_extent(io_tree, ordered_extent->file_offset,
@@ -1623,6 +1731,7 @@ nocow:
1623static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1731static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1624 struct extent_state *state, int uptodate) 1732 struct extent_state *state, int uptodate)
1625{ 1733{
1734 ClearPagePrivate2(page);
1626 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1735 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1627} 1736}
1628 1737
@@ -1669,13 +1778,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1669 failrec->last_mirror = 0; 1778 failrec->last_mirror = 0;
1670 failrec->bio_flags = 0; 1779 failrec->bio_flags = 0;
1671 1780
1672 spin_lock(&em_tree->lock); 1781 read_lock(&em_tree->lock);
1673 em = lookup_extent_mapping(em_tree, start, failrec->len); 1782 em = lookup_extent_mapping(em_tree, start, failrec->len);
1674 if (em->start > start || em->start + em->len < start) { 1783 if (em->start > start || em->start + em->len < start) {
1675 free_extent_map(em); 1784 free_extent_map(em);
1676 em = NULL; 1785 em = NULL;
1677 } 1786 }
1678 spin_unlock(&em_tree->lock); 1787 read_unlock(&em_tree->lock);
1679 1788
1680 if (!em || IS_ERR(em)) { 1789 if (!em || IS_ERR(em)) {
1681 kfree(failrec); 1790 kfree(failrec);
@@ -1794,7 +1903,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1794 return 0; 1903 return 0;
1795 1904
1796 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1905 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1797 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { 1906 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
1798 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 1907 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1799 GFP_NOFS); 1908 GFP_NOFS);
1800 return 0; 1909 return 0;
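test_range_bit() also grew a trailing argument in this series: a struct
extent_state pointer that lets callers which already hold a cached state skip
the tree search. Passing NULL, as above, keeps the old behavior. The assumed
shape, inferred from the call sites in this diff (illustrative only):

        int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                           int bits, int filled, struct extent_state *cached);

        /* a caller holding a state from an earlier lookup can pass it in */
        if (test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, cached))
                /* the range fully carries the bit */;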
@@ -2352,6 +2461,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2352 return ret; 2461 return ret;
2353} 2462}
2354 2463
2464int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2465 struct btrfs_root *root,
2466 struct inode *dir, u64 objectid,
2467 const char *name, int name_len)
2468{
2469 struct btrfs_path *path;
2470 struct extent_buffer *leaf;
2471 struct btrfs_dir_item *di;
2472 struct btrfs_key key;
2473 u64 index;
2474 int ret;
2475
2476 path = btrfs_alloc_path();
2477 if (!path)
2478 return -ENOMEM;
2479
2480 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2481 name, name_len, -1);
2482 BUG_ON(!di || IS_ERR(di));
2483
2484 leaf = path->nodes[0];
2485 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2486 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2487 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2488 BUG_ON(ret);
2489 btrfs_release_path(root, path);
2490
2491 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
2492 objectid, root->root_key.objectid,
2493 dir->i_ino, &index, name, name_len);
2494 if (ret < 0) {
2495 BUG_ON(ret != -ENOENT);
2496 di = btrfs_search_dir_index_item(root, path, dir->i_ino,
2497 name, name_len);
2498 BUG_ON(!di || IS_ERR(di));
2499
2500 leaf = path->nodes[0];
2501 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2502 btrfs_release_path(root, path);
2503 index = key.offset;
2504 }
2505
2506 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2507 index, name, name_len, -1);
2508 BUG_ON(!di || IS_ERR(di));
2509
2510 leaf = path->nodes[0];
2511 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2512 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2513 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2514 BUG_ON(ret);
2515 btrfs_release_path(root, path);
2516
2517 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2518 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2519 ret = btrfs_update_inode(trans, root, dir);
2520 BUG_ON(ret);
2521 dir->i_sb->s_dirt = 1;
2522
2523 btrfs_free_path(path);
2524 return 0;
2525}
2526
2355static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 2527static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2356{ 2528{
2357 struct inode *inode = dentry->d_inode; 2529 struct inode *inode = dentry->d_inode;
@@ -2361,29 +2533,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2361 struct btrfs_trans_handle *trans; 2533 struct btrfs_trans_handle *trans;
2362 unsigned long nr = 0; 2534 unsigned long nr = 0;
2363 2535
2364 /*
2365 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2366 * the root of a subvolume or snapshot
2367 */
2368 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 2536 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2369 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 2537 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2370 return -ENOTEMPTY; 2538 return -ENOTEMPTY;
2371 }
2372 2539
2373 trans = btrfs_start_transaction(root, 1); 2540 trans = btrfs_start_transaction(root, 1);
2374 btrfs_set_trans_block_group(trans, dir); 2541 btrfs_set_trans_block_group(trans, dir);
2375 2542
2543 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
2544 err = btrfs_unlink_subvol(trans, root, dir,
2545 BTRFS_I(inode)->location.objectid,
2546 dentry->d_name.name,
2547 dentry->d_name.len);
2548 goto out;
2549 }
2550
2376 err = btrfs_orphan_add(trans, inode); 2551 err = btrfs_orphan_add(trans, inode);
2377 if (err) 2552 if (err)
2378 goto fail_trans; 2553 goto out;
2379 2554
2380 /* now the directory is empty */ 2555 /* now the directory is empty */
2381 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2556 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2382 dentry->d_name.name, dentry->d_name.len); 2557 dentry->d_name.name, dentry->d_name.len);
2383 if (!err) 2558 if (!err)
2384 btrfs_i_size_write(inode, 0); 2559 btrfs_i_size_write(inode, 0);
2385 2560out:
2386fail_trans:
2387 nr = trans->blocks_used; 2561 nr = trans->blocks_used;
2388 ret = btrfs_end_transaction_throttle(trans, root); 2562 ret = btrfs_end_transaction_throttle(trans, root);
2389 btrfs_btree_balance_dirty(root, nr); 2563 btrfs_btree_balance_dirty(root, nr);
@@ -2864,7 +3038,12 @@ again:
2864 goto again; 3038 goto again;
2865 } 3039 }
2866 3040
2867 btrfs_set_extent_delalloc(inode, page_start, page_end); 3041 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3042 if (ret) {
3043 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3044 goto out_unlock;
3045 }
3046
2868 ret = 0; 3047 ret = 0;
2869 if (offset != PAGE_CACHE_SIZE) { 3048 if (offset != PAGE_CACHE_SIZE) {
2870 kaddr = kmap(page); 3049 kaddr = kmap(page);
@@ -2895,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2895 u64 last_byte; 3074 u64 last_byte;
2896 u64 cur_offset; 3075 u64 cur_offset;
2897 u64 hole_size; 3076 u64 hole_size;
2898 int err; 3077 int err = 0;
2899 3078
2900 if (size <= hole_start) 3079 if (size <= hole_start)
2901 return 0; 3080 return 0;
2902 3081
2903 err = btrfs_check_metadata_free_space(root);
2904 if (err)
2905 return err;
2906
2907 btrfs_truncate_page(inode->i_mapping, inode->i_size); 3082 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2908 3083
2909 while (1) { 3084 while (1) {
@@ -2935,15 +3110,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2935 cur_offset, 3110 cur_offset,
2936 cur_offset + hole_size, 3111 cur_offset + hole_size,
2937 block_end, 3112 block_end,
2938 cur_offset, &hint_byte); 3113 cur_offset, &hint_byte, 1);
2939 if (err) 3114 if (err)
2940 break; 3115 break;
3116
3117 err = btrfs_reserve_metadata_space(root, 1);
3118 if (err)
3119 break;
3120
2941 err = btrfs_insert_file_extent(trans, root, 3121 err = btrfs_insert_file_extent(trans, root,
2942 inode->i_ino, cur_offset, 0, 3122 inode->i_ino, cur_offset, 0,
2943 0, hole_size, 0, hole_size, 3123 0, hole_size, 0, hole_size,
2944 0, 0, 0); 3124 0, 0, 0);
2945 btrfs_drop_extent_cache(inode, hole_start, 3125 btrfs_drop_extent_cache(inode, hole_start,
2946 last_byte - 1, 0); 3126 last_byte - 1, 0);
3127 btrfs_unreserve_metadata_space(root, 1);
2947 } 3128 }
2948 free_extent_map(em); 3129 free_extent_map(em);
2949 cur_offset = last_byte; 3130 cur_offset = last_byte;
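The hole-filling loop now pairs a one-item metadata reservation with each
btrfs_insert_file_extent() call instead of the old coarse up-front free-space
check. The pattern, reduced to its skeleton (error handling trimmed):

        err = btrfs_reserve_metadata_space(root, 1);    /* claim 1 tree item */
        if (err)
                break;
        err = btrfs_insert_file_extent(/* ... */);      /* uses at most 1 item */
        btrfs_unreserve_metadata_space(root, 1);        /* release the claim */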
@@ -3003,6 +3184,11 @@ void btrfs_delete_inode(struct inode *inode)
3003 } 3184 }
3004 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3185 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3005 3186
3187 if (inode->i_nlink > 0) {
3188 BUG_ON(btrfs_root_refs(&root->root_item) != 0);
3189 goto no_delete;
3190 }
3191
3006 btrfs_i_size_write(inode, 0); 3192 btrfs_i_size_write(inode, 0);
3007 trans = btrfs_join_transaction(root, 1); 3193 trans = btrfs_join_transaction(root, 1);
3008 3194
@@ -3070,29 +3256,67 @@ out_err:
3070 * is kind of like crossing a mount point. 3256 * is kind of like crossing a mount point.
3071 */ 3257 */
3072static int fixup_tree_root_location(struct btrfs_root *root, 3258static int fixup_tree_root_location(struct btrfs_root *root,
3073 struct btrfs_key *location, 3259 struct inode *dir,
3074 struct btrfs_root **sub_root, 3260 struct dentry *dentry,
3075 struct dentry *dentry) 3261 struct btrfs_key *location,
3262 struct btrfs_root **sub_root)
3076{ 3263{
3077 struct btrfs_root_item *ri; 3264 struct btrfs_path *path;
3265 struct btrfs_root *new_root;
3266 struct btrfs_root_ref *ref;
3267 struct extent_buffer *leaf;
3268 int ret;
3269 int err = 0;
3078 3270
3079 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) 3271 path = btrfs_alloc_path();
3080 return 0; 3272 if (!path) {
3081 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) 3273 err = -ENOMEM;
3082 return 0; 3274 goto out;
3275 }
3083 3276
3084 *sub_root = btrfs_read_fs_root(root->fs_info, location, 3277 err = -ENOENT;
3085 dentry->d_name.name, 3278 ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
3086 dentry->d_name.len); 3279 BTRFS_I(dir)->root->root_key.objectid,
3087 if (IS_ERR(*sub_root)) 3280 location->objectid);
3088 return PTR_ERR(*sub_root); 3281 if (ret) {
3282 if (ret < 0)
3283 err = ret;
3284 goto out;
3285 }
3089 3286
3090 ri = &(*sub_root)->root_item; 3287 leaf = path->nodes[0];
3091 location->objectid = btrfs_root_dirid(ri); 3288 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
3092 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 3289 if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
3093 location->offset = 0; 3290 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
3291 goto out;
3094 3292
3095 return 0; 3293 ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
3294 (unsigned long)(ref + 1),
3295 dentry->d_name.len);
3296 if (ret)
3297 goto out;
3298
3299 btrfs_release_path(root->fs_info->tree_root, path);
3300
3301 new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
3302 if (IS_ERR(new_root)) {
3303 err = PTR_ERR(new_root);
3304 goto out;
3305 }
3306
3307 if (btrfs_root_refs(&new_root->root_item) == 0) {
3308 err = -ENOENT;
3309 goto out;
3310 }
3311
3312 *sub_root = new_root;
3313 location->objectid = btrfs_root_dirid(&new_root->root_item);
3314 location->type = BTRFS_INODE_ITEM_KEY;
3315 location->offset = 0;
3316 err = 0;
3317out:
3318 btrfs_free_path(path);
3319 return err;
3096} 3320}
3097 3321
3098static void inode_tree_add(struct inode *inode) 3322static void inode_tree_add(struct inode *inode)
@@ -3101,11 +3325,13 @@ static void inode_tree_add(struct inode *inode)
3101 struct btrfs_inode *entry; 3325 struct btrfs_inode *entry;
3102 struct rb_node **p; 3326 struct rb_node **p;
3103 struct rb_node *parent; 3327 struct rb_node *parent;
3104
3105again: 3328again:
3106 p = &root->inode_tree.rb_node; 3329 p = &root->inode_tree.rb_node;
3107 parent = NULL; 3330 parent = NULL;
3108 3331
3332 if (hlist_unhashed(&inode->i_hash))
3333 return;
3334
3109 spin_lock(&root->inode_lock); 3335 spin_lock(&root->inode_lock);
3110 while (*p) { 3336 while (*p) {
3111 parent = *p; 3337 parent = *p;
@@ -3132,13 +3358,87 @@ again:
3132static void inode_tree_del(struct inode *inode) 3358static void inode_tree_del(struct inode *inode)
3133{ 3359{
3134 struct btrfs_root *root = BTRFS_I(inode)->root; 3360 struct btrfs_root *root = BTRFS_I(inode)->root;
3361 int empty = 0;
3135 3362
3136 spin_lock(&root->inode_lock); 3363 spin_lock(&root->inode_lock);
3137 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3364 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3138 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3365 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3139 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3366 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3367 empty = RB_EMPTY_ROOT(&root->inode_tree);
3368 }
3369 spin_unlock(&root->inode_lock);
3370
3371 if (empty && btrfs_root_refs(&root->root_item) == 0) {
3372 synchronize_srcu(&root->fs_info->subvol_srcu);
3373 spin_lock(&root->inode_lock);
3374 empty = RB_EMPTY_ROOT(&root->inode_tree);
3375 spin_unlock(&root->inode_lock);
3376 if (empty)
3377 btrfs_add_dead_root(root);
3378 }
3379}
3380
3381int btrfs_invalidate_inodes(struct btrfs_root *root)
3382{
3383 struct rb_node *node;
3384 struct rb_node *prev;
3385 struct btrfs_inode *entry;
3386 struct inode *inode;
3387 u64 objectid = 0;
3388
3389 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
3390
3391 spin_lock(&root->inode_lock);
3392again:
3393 node = root->inode_tree.rb_node;
3394 prev = NULL;
3395 while (node) {
3396 prev = node;
3397 entry = rb_entry(node, struct btrfs_inode, rb_node);
3398
3399 if (objectid < entry->vfs_inode.i_ino)
3400 node = node->rb_left;
3401 else if (objectid > entry->vfs_inode.i_ino)
3402 node = node->rb_right;
3403 else
3404 break;
3405 }
3406 if (!node) {
3407 while (prev) {
3408 entry = rb_entry(prev, struct btrfs_inode, rb_node);
3409 if (objectid <= entry->vfs_inode.i_ino) {
3410 node = prev;
3411 break;
3412 }
3413 prev = rb_next(prev);
3414 }
3415 }
3416 while (node) {
3417 entry = rb_entry(node, struct btrfs_inode, rb_node);
3418 objectid = entry->vfs_inode.i_ino + 1;
3419 inode = igrab(&entry->vfs_inode);
3420 if (inode) {
3421 spin_unlock(&root->inode_lock);
3422 if (atomic_read(&inode->i_count) > 1)
3423 d_prune_aliases(inode);
3424 /*
3425 * btrfs_drop_inode will remove it from
3426 * the inode cache when its usage count
3427 * hits zero.
3428 */
3429 iput(inode);
3430 cond_resched();
3431 spin_lock(&root->inode_lock);
3432 goto again;
3433 }
3434
3435 if (cond_resched_lock(&root->inode_lock))
3436 goto again;
3437
3438 node = rb_next(node);
3140 } 3439 }
3141 spin_unlock(&root->inode_lock); 3440 spin_unlock(&root->inode_lock);
3441 return 0;
3142} 3442}
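btrfs_invalidate_inodes() cannot hold inode_lock across iput(), so it uses a
resume-by-key walk: remember the next objectid, drop the lock for the blocking
work, then re-search the rbtree from that key. The idiom in isolation (the
find-first-greater-or-equal helper is hypothetical; above it is open-coded as
the two while loops):

        u64 next = 0;
again:
        spin_lock(&root->inode_lock);
        node = rbtree_first_ge(&root->inode_tree, next); /* hypothetical */
        if (node) {
                entry = rb_entry(node, struct btrfs_inode, rb_node);
                next = entry->vfs_inode.i_ino + 1;      /* resume point */
                spin_unlock(&root->inode_lock);
                /* ... blocking work: d_prune_aliases(), iput() ... */
                goto again;                             /* re-find from next */
        }
        spin_unlock(&root->inode_lock);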
3143 3443
3144static noinline void init_btrfs_i(struct inode *inode) 3444static noinline void init_btrfs_i(struct inode *inode)
@@ -3225,15 +3525,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3225 return inode; 3525 return inode;
3226} 3526}
3227 3527
3528static struct inode *new_simple_dir(struct super_block *s,
3529 struct btrfs_key *key,
3530 struct btrfs_root *root)
3531{
3532 struct inode *inode = new_inode(s);
3533
3534 if (!inode)
3535 return ERR_PTR(-ENOMEM);
3536
3537 init_btrfs_i(inode);
3538
3539 BTRFS_I(inode)->root = root;
3540 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3541 BTRFS_I(inode)->dummy_inode = 1;
3542
3543 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
3544 inode->i_op = &simple_dir_inode_operations;
3545 inode->i_fop = &simple_dir_operations;
3546 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
3547 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3548
3549 return inode;
3550}
3551
3228struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 3552struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3229{ 3553{
3230 struct inode *inode; 3554 struct inode *inode;
3231 struct btrfs_inode *bi = BTRFS_I(dir); 3555 struct btrfs_root *root = BTRFS_I(dir)->root;
3232 struct btrfs_root *root = bi->root;
3233 struct btrfs_root *sub_root = root; 3556 struct btrfs_root *sub_root = root;
3234 struct btrfs_key location; 3557 struct btrfs_key location;
3558 int index;
3235 int ret; 3559 int ret;
3236 3560
3561 dentry->d_op = &btrfs_dentry_operations;
3562
3237 if (dentry->d_name.len > BTRFS_NAME_LEN) 3563 if (dentry->d_name.len > BTRFS_NAME_LEN)
3238 return ERR_PTR(-ENAMETOOLONG); 3564 return ERR_PTR(-ENAMETOOLONG);
3239 3565
@@ -3242,29 +3568,50 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3242 if (ret < 0) 3568 if (ret < 0)
3243 return ERR_PTR(ret); 3569 return ERR_PTR(ret);
3244 3570
3245 inode = NULL; 3571 if (location.objectid == 0)
3246 if (location.objectid) { 3572 return NULL;
3247 ret = fixup_tree_root_location(root, &location, &sub_root, 3573
3248 dentry); 3574 if (location.type == BTRFS_INODE_ITEM_KEY) {
3249 if (ret < 0) 3575 inode = btrfs_iget(dir->i_sb, &location, root);
3250 return ERR_PTR(ret); 3576 return inode;
3251 if (ret > 0) 3577 }
3252 return ERR_PTR(-ENOENT); 3578
3579 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
3580
3581 index = srcu_read_lock(&root->fs_info->subvol_srcu);
3582 ret = fixup_tree_root_location(root, dir, dentry,
3583 &location, &sub_root);
3584 if (ret < 0) {
3585 if (ret != -ENOENT)
3586 inode = ERR_PTR(ret);
3587 else
3588 inode = new_simple_dir(dir->i_sb, &location, sub_root);
3589 } else {
3253 inode = btrfs_iget(dir->i_sb, &location, sub_root); 3590 inode = btrfs_iget(dir->i_sb, &location, sub_root);
3254 if (IS_ERR(inode))
3255 return ERR_CAST(inode);
3256 } 3591 }
3592 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3593
3257 return inode; 3594 return inode;
3258} 3595}
3259 3596
3597static int btrfs_dentry_delete(struct dentry *dentry)
3598{
3599 struct btrfs_root *root;
3600
3601 if (!dentry->d_inode)
3602 return 0;
3603
3604 root = BTRFS_I(dentry->d_inode)->root;
3605 if (btrfs_root_refs(&root->root_item) == 0)
3606 return 1;
3607 return 0;
3608}
3609
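Two pieces cooperate here to make dead subvolumes vanish cleanly from the
namespace: lookups run inside an srcu read-side section so a root being
deleted cannot be freed mid-lookup (the matching synchronize_srcu() is in
inode_tree_del above), and btrfs_dentry_delete() tells the dcache not to keep
dentries that point into a root with zero refs. The read-side pattern:

        int idx;

        idx = srcu_read_lock(&fs_info->subvol_srcu);
        /* safe to resolve the root and test btrfs_root_refs() here */
        srcu_read_unlock(&fs_info->subvol_srcu, idx);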
3260static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 3610static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3261 struct nameidata *nd) 3611 struct nameidata *nd)
3262{ 3612{
3263 struct inode *inode; 3613 struct inode *inode;
3264 3614
3265 if (dentry->d_name.len > BTRFS_NAME_LEN)
3266 return ERR_PTR(-ENAMETOOLONG);
3267
3268 inode = btrfs_lookup_dentry(dir, dentry); 3615 inode = btrfs_lookup_dentry(dir, dentry);
3269 if (IS_ERR(inode)) 3616 if (IS_ERR(inode))
3270 return ERR_CAST(inode); 3617 return ERR_CAST(inode);
@@ -3603,9 +3950,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3603 if (ret != 0) 3950 if (ret != 0)
3604 goto fail; 3951 goto fail;
3605 3952
3606 if (objectid > root->highest_inode)
3607 root->highest_inode = objectid;
3608
3609 inode->i_uid = current_fsuid(); 3953 inode->i_uid = current_fsuid();
3610 3954
3611 if (dir && (dir->i_mode & S_ISGID)) { 3955 if (dir && (dir->i_mode & S_ISGID)) {
@@ -3673,26 +4017,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
3673 struct inode *parent_inode, struct inode *inode, 4017 struct inode *parent_inode, struct inode *inode,
3674 const char *name, int name_len, int add_backref, u64 index) 4018 const char *name, int name_len, int add_backref, u64 index)
3675{ 4019{
3676 int ret; 4020 int ret = 0;
3677 struct btrfs_key key; 4021 struct btrfs_key key;
3678 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4022 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3679 4023
3680 key.objectid = inode->i_ino; 4024 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
3681 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4025 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
3682 key.offset = 0; 4026 } else {
4027 key.objectid = inode->i_ino;
4028 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
4029 key.offset = 0;
4030 }
4031
4032 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
4033 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
4034 key.objectid, root->root_key.objectid,
4035 parent_inode->i_ino,
4036 index, name, name_len);
4037 } else if (add_backref) {
4038 ret = btrfs_insert_inode_ref(trans, root,
4039 name, name_len, inode->i_ino,
4040 parent_inode->i_ino, index);
4041 }
3683 4042
3684 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3685 parent_inode->i_ino,
3686 &key, btrfs_inode_type(inode),
3687 index);
3688 if (ret == 0) { 4043 if (ret == 0) {
3689 if (add_backref) { 4044 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3690 ret = btrfs_insert_inode_ref(trans, root, 4045 parent_inode->i_ino, &key,
3691 name, name_len, 4046 btrfs_inode_type(inode), index);
3692 inode->i_ino, 4047 BUG_ON(ret);
3693 parent_inode->i_ino, 4048
3694 index);
3695 }
3696 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4049 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3697 name_len * 2); 4050 name_len * 2);
3698 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4051 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -3732,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3732 if (!new_valid_dev(rdev)) 4085 if (!new_valid_dev(rdev))
3733 return -EINVAL; 4086 return -EINVAL;
3734 4087
3735 err = btrfs_check_metadata_free_space(root); 4088 /*
4089 * 2 for inode item and ref
4090 * 2 for dir items
4091 * 1 for xattr if selinux is on
4092 */
4093 err = btrfs_reserve_metadata_space(root, 5);
3736 if (err) 4094 if (err)
3737 goto fail; 4095 return err;
3738 4096
3739 trans = btrfs_start_transaction(root, 1); 4097 trans = btrfs_start_transaction(root, 1);
4098 if (!trans)
4099 goto fail;
3740 btrfs_set_trans_block_group(trans, dir); 4100 btrfs_set_trans_block_group(trans, dir);
3741 4101
3742 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4102 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3774,6 +4134,7 @@ out_unlock:
3774 nr = trans->blocks_used; 4134 nr = trans->blocks_used;
3775 btrfs_end_transaction_throttle(trans, root); 4135 btrfs_end_transaction_throttle(trans, root);
3776fail: 4136fail:
4137 btrfs_unreserve_metadata_space(root, 5);
3777 if (drop_inode) { 4138 if (drop_inode) {
3778 inode_dec_link_count(inode); 4139 inode_dec_link_count(inode);
3779 iput(inode); 4140 iput(inode);
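mknod, create, mkdir and symlink all move from the old "roughly enough
metadata space" check to an explicit worst-case reservation, and the comments
spell the count out: inode item plus ref (2), dir items (2), one optional
selinux xattr (1), hence 5. The shape of the pattern in each of these paths:

        err = btrfs_reserve_metadata_space(root, 5);    /* worst case */
        if (err)
                return err;     /* nothing reserved, nothing to undo */
        /* ... transaction work that inserts at most 5 items ... */
fail:
        btrfs_unreserve_metadata_space(root, 5);        /* always balanced */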
@@ -3794,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3794 u64 objectid; 4155 u64 objectid;
3795 u64 index = 0; 4156 u64 index = 0;
3796 4157
3797 err = btrfs_check_metadata_free_space(root); 4158 /*
4159 * 2 for inode item and ref
4160 * 2 for dir items
4161 * 1 for xattr if selinux is on
4162 */
4163 err = btrfs_reserve_metadata_space(root, 5);
3798 if (err) 4164 if (err)
3799 goto fail; 4165 return err;
4166
3800 trans = btrfs_start_transaction(root, 1); 4167 trans = btrfs_start_transaction(root, 1);
4168 if (!trans)
4169 goto fail;
3801 btrfs_set_trans_block_group(trans, dir); 4170 btrfs_set_trans_block_group(trans, dir);
3802 4171
3803 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4172 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3838,6 +4207,7 @@ out_unlock:
3838 nr = trans->blocks_used; 4207 nr = trans->blocks_used;
3839 btrfs_end_transaction_throttle(trans, root); 4208 btrfs_end_transaction_throttle(trans, root);
3840fail: 4209fail:
4210 btrfs_unreserve_metadata_space(root, 5);
3841 if (drop_inode) { 4211 if (drop_inode) {
3842 inode_dec_link_count(inode); 4212 inode_dec_link_count(inode);
3843 iput(inode); 4213 iput(inode);
@@ -3860,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3860 if (inode->i_nlink == 0) 4230 if (inode->i_nlink == 0)
3861 return -ENOENT; 4231 return -ENOENT;
3862 4232
3863 btrfs_inc_nlink(inode); 4233 /*
3864 err = btrfs_check_metadata_free_space(root); 4234 * 1 item for inode ref
4235 * 2 items for dir items
4236 */
4237 err = btrfs_reserve_metadata_space(root, 3);
3865 if (err) 4238 if (err)
3866 goto fail; 4239 return err;
4240
4241 btrfs_inc_nlink(inode);
4242
3867 err = btrfs_set_inode_index(dir, &index); 4243 err = btrfs_set_inode_index(dir, &index);
3868 if (err) 4244 if (err)
3869 goto fail; 4245 goto fail;
@@ -3875,20 +4251,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3875 4251
3876 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4252 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3877 4253
3878 if (err) 4254 if (err) {
3879 drop_inode = 1;
3880
3881 btrfs_update_inode_block_group(trans, dir);
3882 err = btrfs_update_inode(trans, root, inode);
3883
3884 if (err)
3885 drop_inode = 1; 4255 drop_inode = 1;
4256 } else {
4257 btrfs_update_inode_block_group(trans, dir);
4258 err = btrfs_update_inode(trans, root, inode);
4259 BUG_ON(err);
4260 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
4261 }
3886 4262
3887 nr = trans->blocks_used; 4263 nr = trans->blocks_used;
3888
3889 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3890 btrfs_end_transaction_throttle(trans, root); 4264 btrfs_end_transaction_throttle(trans, root);
3891fail: 4265fail:
4266 btrfs_unreserve_metadata_space(root, 3);
3892 if (drop_inode) { 4267 if (drop_inode) {
3893 inode_dec_link_count(inode); 4268 inode_dec_link_count(inode);
3894 iput(inode); 4269 iput(inode);
@@ -3908,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3908 u64 index = 0; 4283 u64 index = 0;
3909 unsigned long nr = 1; 4284 unsigned long nr = 1;
3910 4285
3911 err = btrfs_check_metadata_free_space(root); 4286 /*
4287 * 2 items for inode and ref
4288 * 2 items for dir items
4289 * 1 for xattr if selinux is on
4290 */
4291 err = btrfs_reserve_metadata_space(root, 5);
3912 if (err) 4292 if (err)
3913 goto out_unlock; 4293 return err;
3914 4294
3915 trans = btrfs_start_transaction(root, 1); 4295 trans = btrfs_start_transaction(root, 1);
3916 btrfs_set_trans_block_group(trans, dir); 4296 if (!trans) {
3917 4297 err = -ENOMEM;
3918 if (IS_ERR(trans)) {
3919 err = PTR_ERR(trans);
3920 goto out_unlock; 4298 goto out_unlock;
3921 } 4299 }
4300 btrfs_set_trans_block_group(trans, dir);
3922 4301
3923 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4302 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3924 if (err) { 4303 if (err) {
@@ -3967,6 +4346,7 @@ out_fail:
3967 btrfs_end_transaction_throttle(trans, root); 4346 btrfs_end_transaction_throttle(trans, root);
3968 4347
3969out_unlock: 4348out_unlock:
4349 btrfs_unreserve_metadata_space(root, 5);
3970 if (drop_on_err) 4350 if (drop_on_err)
3971 iput(inode); 4351 iput(inode);
3972 btrfs_btree_balance_dirty(root, nr); 4352 btrfs_btree_balance_dirty(root, nr);
@@ -4064,11 +4444,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4064 int compressed; 4444 int compressed;
4065 4445
4066again: 4446again:
4067 spin_lock(&em_tree->lock); 4447 read_lock(&em_tree->lock);
4068 em = lookup_extent_mapping(em_tree, start, len); 4448 em = lookup_extent_mapping(em_tree, start, len);
4069 if (em) 4449 if (em)
4070 em->bdev = root->fs_info->fs_devices->latest_bdev; 4450 em->bdev = root->fs_info->fs_devices->latest_bdev;
4071 spin_unlock(&em_tree->lock); 4451 read_unlock(&em_tree->lock);
4072 4452
4073 if (em) { 4453 if (em) {
4074 if (em->start > start || em->start + em->len <= start) 4454 if (em->start > start || em->start + em->len <= start)
@@ -4215,6 +4595,11 @@ again:
4215 map = kmap(page); 4595 map = kmap(page);
4216 read_extent_buffer(leaf, map + pg_offset, ptr, 4596 read_extent_buffer(leaf, map + pg_offset, ptr,
4217 copy_size); 4597 copy_size);
4598 if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
4599 memset(map + pg_offset + copy_size, 0,
4600 PAGE_CACHE_SIZE - pg_offset -
4601 copy_size);
4602 }
4218 kunmap(page); 4603 kunmap(page);
4219 } 4604 }
4220 flush_dcache_page(page); 4605 flush_dcache_page(page);
@@ -4259,7 +4644,7 @@ insert:
4259 } 4644 }
4260 4645
4261 err = 0; 4646 err = 0;
4262 spin_lock(&em_tree->lock); 4647 write_lock(&em_tree->lock);
4263 ret = add_extent_mapping(em_tree, em); 4648 ret = add_extent_mapping(em_tree, em);
4264 /* it is possible that someone inserted the extent into the tree 4649 /* it is possible that someone inserted the extent into the tree
4265 * while we had the lock dropped. It is also possible that 4650 * while we had the lock dropped. It is also possible that
@@ -4299,7 +4684,7 @@ insert:
4299 err = 0; 4684 err = 0;
4300 } 4685 }
4301 } 4686 }
4302 spin_unlock(&em_tree->lock); 4687 write_unlock(&em_tree->lock);
4303out: 4688out:
4304 if (path) 4689 if (path)
4305 btrfs_free_path(path); 4690 btrfs_free_path(path);
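The extent map tree lock changes from a spinlock to an rwlock throughout this
diff: lookup paths such as btrfs_get_extent() and the IO-failure handler take
read_lock() and may now run concurrently, while the insert path just above
takes write_lock() for exclusion. The two sides of the conversion:

        /* readers may run in parallel */
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        read_unlock(&em_tree->lock);

        /* writers remain exclusive */
        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em);
        write_unlock(&em_tree->lock);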
@@ -4398,13 +4783,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4398 u64 page_start = page_offset(page); 4783 u64 page_start = page_offset(page);
4399 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 4784 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4400 4785
4786
4787 /*
4788 * we have the page locked, so new writeback can't start,
4789 * and the dirty bit won't be cleared while we are here.
4790 *
4791 * Wait for IO on this page so that we can safely clear
4792 * the PagePrivate2 bit and do ordered accounting
4793 */
4401 wait_on_page_writeback(page); 4794 wait_on_page_writeback(page);
4795
4402 tree = &BTRFS_I(page->mapping->host)->io_tree; 4796 tree = &BTRFS_I(page->mapping->host)->io_tree;
4403 if (offset) { 4797 if (offset) {
4404 btrfs_releasepage(page, GFP_NOFS); 4798 btrfs_releasepage(page, GFP_NOFS);
4405 return; 4799 return;
4406 } 4800 }
4407
4408 lock_extent(tree, page_start, page_end, GFP_NOFS); 4801 lock_extent(tree, page_start, page_end, GFP_NOFS);
4409 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 4802 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4410 page_offset(page)); 4803 page_offset(page));
@@ -4415,16 +4808,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4415 */ 4808 */
4416 clear_extent_bit(tree, page_start, page_end, 4809 clear_extent_bit(tree, page_start, page_end,
4417 EXTENT_DIRTY | EXTENT_DELALLOC | 4810 EXTENT_DIRTY | EXTENT_DELALLOC |
4418 EXTENT_LOCKED, 1, 0, GFP_NOFS); 4811 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
4419 btrfs_finish_ordered_io(page->mapping->host, 4812 /*
4420 page_start, page_end); 4813 * whoever cleared the private bit is responsible
4814 * for the finish_ordered_io
4815 */
4816 if (TestClearPagePrivate2(page)) {
4817 btrfs_finish_ordered_io(page->mapping->host,
4818 page_start, page_end);
4819 }
4421 btrfs_put_ordered_extent(ordered); 4820 btrfs_put_ordered_extent(ordered);
4422 lock_extent(tree, page_start, page_end, GFP_NOFS); 4821 lock_extent(tree, page_start, page_end, GFP_NOFS);
4423 } 4822 }
4424 clear_extent_bit(tree, page_start, page_end, 4823 clear_extent_bit(tree, page_start, page_end,
4425 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4824 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
4426 EXTENT_ORDERED, 4825 1, 1, NULL, GFP_NOFS);
4427 1, 1, GFP_NOFS);
4428 __btrfs_releasepage(page, GFP_NOFS); 4826 __btrfs_releasepage(page, GFP_NOFS);
4429 4827
4430 ClearPageChecked(page); 4828 ClearPageChecked(page);
@@ -4473,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4473 goto out; 4871 goto out;
4474 } 4872 }
4475 4873
4874 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
4875 if (ret) {
4876 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4877 ret = VM_FAULT_SIGBUS;
4878 goto out;
4879 }
4880
4476 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4881 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4477again: 4882again:
4478 lock_page(page); 4883 lock_page(page);
@@ -4504,7 +4909,23 @@ again:
4504 goto again; 4909 goto again;
4505 } 4910 }
4506 4911
4507 btrfs_set_extent_delalloc(inode, page_start, page_end); 4912 /*
4913 * XXX - page_mkwrite gets called every time the page is dirtied, even
4914 * if it was already dirty, so for space accounting reasons we need to
	4915	 * clear any delalloc bits for the range we are about to save. There
4916 * is probably a better way to do this, but for now keep consistent with
4917 * prepare_pages in the normal write path.
4918 */
4919 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
4920 EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
4921
4922 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
4923 if (ret) {
4924 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4925 ret = VM_FAULT_SIGBUS;
4926 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4927 goto out_unlock;
4928 }
4508 ret = 0; 4929 ret = 0;
4509 4930
4510 /* page is wholly or partially inside EOF */ 4931 /* page is wholly or partially inside EOF */
@@ -4521,11 +4942,15 @@ again:
4521 } 4942 }
4522 ClearPageChecked(page); 4943 ClearPageChecked(page);
4523 set_page_dirty(page); 4944 set_page_dirty(page);
4945 SetPageUptodate(page);
4524 4946
4525 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 4947 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4526 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4948 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4527 4949
4528out_unlock: 4950out_unlock:
4951 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
4952 if (!ret)
4953 return VM_FAULT_LOCKED;
4529 unlock_page(page); 4954 unlock_page(page);
4530out: 4955out:
4531 return ret; 4956 return ret;
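The page_mkwrite changes combine three things: a delalloc metadata reservation
taken up front and released on every exit path, a clear of stale DIRTY and
DELALLOC bits so that repeated faults on an already-dirty page do not
double-account (the XXX comment above), and a switch to returning
VM_FAULT_LOCKED so the VM gets the page back still locked on success. The
return contract, reduced to a sketch:

        if (failed) {
                unlock_page(page);      /* error paths unlock */
                return VM_FAULT_SIGBUS; /* or VM_FAULT_NOPAGE to retry */
        }
        return VM_FAULT_LOCKED;         /* success: page stays locked */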
@@ -4594,11 +5019,11 @@ out:
4594 * create a new subvolume directory/inode (helper for the ioctl). 5019 * create a new subvolume directory/inode (helper for the ioctl).
4595 */ 5020 */
4596int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 5021int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4597 struct btrfs_root *new_root, struct dentry *dentry, 5022 struct btrfs_root *new_root,
4598 u64 new_dirid, u64 alloc_hint) 5023 u64 new_dirid, u64 alloc_hint)
4599{ 5024{
4600 struct inode *inode; 5025 struct inode *inode;
4601 int error; 5026 int err;
4602 u64 index = 0; 5027 u64 index = 0;
4603 5028
4604 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, 5029 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -4611,11 +5036,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4611 inode->i_nlink = 1; 5036 inode->i_nlink = 1;
4612 btrfs_i_size_write(inode, 0); 5037 btrfs_i_size_write(inode, 0);
4613 5038
4614 error = btrfs_update_inode(trans, new_root, inode); 5039 err = btrfs_update_inode(trans, new_root, inode);
4615 if (error) 5040 BUG_ON(err);
4616 return error;
4617 5041
4618 d_instantiate(dentry, inode); 5042 iput(inode);
4619 return 0; 5043 return 0;
4620} 5044}
4621 5045
@@ -4641,6 +5065,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4641 return NULL; 5065 return NULL;
4642 ei->last_trans = 0; 5066 ei->last_trans = 0;
4643 ei->logged_trans = 0; 5067 ei->logged_trans = 0;
5068 ei->delalloc_extents = 0;
5069 ei->delalloc_reserved_extents = 0;
4644 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5070 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4645 INIT_LIST_HEAD(&ei->i_orphan); 5071 INIT_LIST_HEAD(&ei->i_orphan);
4646 INIT_LIST_HEAD(&ei->ordered_operations); 5072 INIT_LIST_HEAD(&ei->ordered_operations);
@@ -4693,6 +5119,16 @@ void btrfs_destroy_inode(struct inode *inode)
4693 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5119 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4694} 5120}
4695 5121
5122void btrfs_drop_inode(struct inode *inode)
5123{
5124 struct btrfs_root *root = BTRFS_I(inode)->root;
5125
5126 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
5127 generic_delete_inode(inode);
5128 else
5129 generic_drop_inode(inode);
5130}
5131
4696static void init_once(void *foo) 5132static void init_once(void *foo)
4697{ 5133{
4698 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 5134 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
@@ -4761,31 +5197,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4761{ 5197{
4762 struct btrfs_trans_handle *trans; 5198 struct btrfs_trans_handle *trans;
4763 struct btrfs_root *root = BTRFS_I(old_dir)->root; 5199 struct btrfs_root *root = BTRFS_I(old_dir)->root;
5200 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
4764 struct inode *new_inode = new_dentry->d_inode; 5201 struct inode *new_inode = new_dentry->d_inode;
4765 struct inode *old_inode = old_dentry->d_inode; 5202 struct inode *old_inode = old_dentry->d_inode;
4766 struct timespec ctime = CURRENT_TIME; 5203 struct timespec ctime = CURRENT_TIME;
4767 u64 index = 0; 5204 u64 index = 0;
5205 u64 root_objectid;
4768 int ret; 5206 int ret;
4769 5207
4770 /* we're not allowed to rename between subvolumes */ 5208 if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
4771 if (BTRFS_I(old_inode)->root->root_key.objectid != 5209 return -EPERM;
4772 BTRFS_I(new_dir)->root->root_key.objectid) 5210
5211 /* we only allow rename subvolume link between subvolumes */
5212 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
4773 return -EXDEV; 5213 return -EXDEV;
4774 5214
5215 if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
5216 (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
5217 return -ENOTEMPTY;
5218
4775 if (S_ISDIR(old_inode->i_mode) && new_inode && 5219 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4776 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { 5220 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4777 return -ENOTEMPTY; 5221 return -ENOTEMPTY;
4778 }
4779 5222
4780 /* to rename a snapshot or subvolume, we need to juggle the 5223 /*
4781 * backrefs. This isn't coded yet 5224 * 2 items for dir items
5225 * 1 item for orphan entry
5226 * 1 item for ref
4782 */ 5227 */
4783 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5228 ret = btrfs_reserve_metadata_space(root, 4);
4784 return -EXDEV;
4785
4786 ret = btrfs_check_metadata_free_space(root);
4787 if (ret) 5229 if (ret)
4788 goto out_unlock; 5230 return ret;
4789 5231
4790 /* 5232 /*
4791 * we're using rename to replace one file with another. 5233 * we're using rename to replace one file with another.
@@ -4796,8 +5238,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4796 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 5238 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4797 filemap_flush(old_inode->i_mapping); 5239 filemap_flush(old_inode->i_mapping);
4798 5240
5241 /* close the racy window with snapshot create/destroy ioctl */
5242 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5243 down_read(&root->fs_info->subvol_sem);
5244
4799 trans = btrfs_start_transaction(root, 1); 5245 trans = btrfs_start_transaction(root, 1);
5246 btrfs_set_trans_block_group(trans, new_dir);
5247
5248 if (dest != root)
5249 btrfs_record_root_in_trans(trans, dest);
5250
5251 ret = btrfs_set_inode_index(new_dir, &index);
5252 if (ret)
5253 goto out_fail;
4800 5254
5255 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
5256 /* force full log commit if subvolume involved. */
5257 root->fs_info->last_trans_log_full_commit = trans->transid;
5258 } else {
5259 ret = btrfs_insert_inode_ref(trans, dest,
5260 new_dentry->d_name.name,
5261 new_dentry->d_name.len,
5262 old_inode->i_ino,
5263 new_dir->i_ino, index);
5264 if (ret)
5265 goto out_fail;
5266 /*
5267 * this is an ugly little race, but the rename is required
5268 * to make sure that if we crash, the inode is either at the
	5269	 * old name or the new one. Pinning the log transaction lets
5270 * us make sure we don't allow a log commit to come in after
5271 * we unlink the name but before we add the new name back in.
5272 */
5273 btrfs_pin_log_trans(root);
5274 }
4801 /* 5275 /*
4802 * make sure the inode gets flushed if it is replacing 5276 * make sure the inode gets flushed if it is replacing
4803 * something. 5277 * something.
@@ -4807,18 +5281,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4807 btrfs_add_ordered_operation(trans, root, old_inode); 5281 btrfs_add_ordered_operation(trans, root, old_inode);
4808 } 5282 }
4809 5283
4810 /*
4811 * this is an ugly little race, but the rename is required to make
4812 * sure that if we crash, the inode is either at the old name
4813 * or the new one. pinning the log transaction lets us make sure
4814 * we don't allow a log commit to come in after we unlink the
4815 * name but before we add the new name back in.
4816 */
4817 btrfs_pin_log_trans(root);
4818
4819 btrfs_set_trans_block_group(trans, new_dir);
4820
4821 btrfs_inc_nlink(old_dentry->d_inode);
4822 old_dir->i_ctime = old_dir->i_mtime = ctime; 5284 old_dir->i_ctime = old_dir->i_mtime = ctime;
4823 new_dir->i_ctime = new_dir->i_mtime = ctime; 5285 new_dir->i_ctime = new_dir->i_mtime = ctime;
4824 old_inode->i_ctime = ctime; 5286 old_inode->i_ctime = ctime;
@@ -4826,47 +5288,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4826 if (old_dentry->d_parent != new_dentry->d_parent) 5288 if (old_dentry->d_parent != new_dentry->d_parent)
4827 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 5289 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4828 5290
4829 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 5291 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
4830 old_dentry->d_name.name, 5292 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
4831 old_dentry->d_name.len); 5293 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
4832 if (ret) 5294 old_dentry->d_name.name,
4833 goto out_fail; 5295 old_dentry->d_name.len);
5296 } else {
5297 btrfs_inc_nlink(old_dentry->d_inode);
5298 ret = btrfs_unlink_inode(trans, root, old_dir,
5299 old_dentry->d_inode,
5300 old_dentry->d_name.name,
5301 old_dentry->d_name.len);
5302 }
5303 BUG_ON(ret);
4834 5304
4835 if (new_inode) { 5305 if (new_inode) {
4836 new_inode->i_ctime = CURRENT_TIME; 5306 new_inode->i_ctime = CURRENT_TIME;
4837 ret = btrfs_unlink_inode(trans, root, new_dir, 5307 if (unlikely(new_inode->i_ino ==
4838 new_dentry->d_inode, 5308 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4839 new_dentry->d_name.name, 5309 root_objectid = BTRFS_I(new_inode)->location.objectid;
4840 new_dentry->d_name.len); 5310 ret = btrfs_unlink_subvol(trans, dest, new_dir,
4841 if (ret) 5311 root_objectid,
4842 goto out_fail; 5312 new_dentry->d_name.name,
5313 new_dentry->d_name.len);
5314 BUG_ON(new_inode->i_nlink == 0);
5315 } else {
5316 ret = btrfs_unlink_inode(trans, dest, new_dir,
5317 new_dentry->d_inode,
5318 new_dentry->d_name.name,
5319 new_dentry->d_name.len);
5320 }
5321 BUG_ON(ret);
4843 if (new_inode->i_nlink == 0) { 5322 if (new_inode->i_nlink == 0) {
4844 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 5323 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4845 if (ret) 5324 BUG_ON(ret);
4846 goto out_fail;
4847 } 5325 }
4848
4849 } 5326 }
4850 ret = btrfs_set_inode_index(new_dir, &index);
4851 if (ret)
4852 goto out_fail;
4853 5327
4854 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, 5328 ret = btrfs_add_link(trans, new_dir, old_inode,
4855 old_inode, new_dentry->d_name.name, 5329 new_dentry->d_name.name,
4856 new_dentry->d_name.len, 1, index); 5330 new_dentry->d_name.len, 0, index);
4857 if (ret) 5331 BUG_ON(ret);
4858 goto out_fail;
4859 5332
4860 btrfs_log_new_name(trans, old_inode, old_dir, 5333 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
4861 new_dentry->d_parent); 5334 btrfs_log_new_name(trans, old_inode, old_dir,
5335 new_dentry->d_parent);
5336 btrfs_end_log_trans(root);
5337 }
4862out_fail: 5338out_fail:
4863
4864 /* this btrfs_end_log_trans just allows the current
4865 * log-sub transaction to complete
4866 */
4867 btrfs_end_log_trans(root);
4868 btrfs_end_transaction_throttle(trans, root); 5339 btrfs_end_transaction_throttle(trans, root);
4869out_unlock: 5340
5341 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5342 up_read(&root->fs_info->subvol_sem);
5343
5344 btrfs_unreserve_metadata_space(root, 4);
4870 return ret; 5345 return ret;
4871} 5346}
4872 5347
@@ -4938,11 +5413,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4938 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5413 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4939 return -ENAMETOOLONG; 5414 return -ENAMETOOLONG;
4940 5415
4941 err = btrfs_check_metadata_free_space(root); 5416 /*
5417 * 2 items for inode item and ref
5418 * 2 items for dir items
5419 * 1 item for xattr if selinux is on
5420 */
5421 err = btrfs_reserve_metadata_space(root, 5);
4942 if (err) 5422 if (err)
4943 goto out_fail; 5423 return err;
4944 5424
4945 trans = btrfs_start_transaction(root, 1); 5425 trans = btrfs_start_transaction(root, 1);
5426 if (!trans)
5427 goto out_fail;
4946 btrfs_set_trans_block_group(trans, dir); 5428 btrfs_set_trans_block_group(trans, dir);
4947 5429
4948 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5430 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5023,6 +5505,7 @@ out_unlock:
5023 nr = trans->blocks_used; 5505 nr = trans->blocks_used;
5024 btrfs_end_transaction_throttle(trans, root); 5506 btrfs_end_transaction_throttle(trans, root);
5025out_fail: 5507out_fail:
5508 btrfs_unreserve_metadata_space(root, 5);
5026 if (drop_inode) { 5509 if (drop_inode) {
5027 inode_dec_link_count(inode); 5510 inode_dec_link_count(inode);
5028 iput(inode); 5511 iput(inode);
@@ -5044,6 +5527,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5044 5527
5045 while (num_bytes > 0) { 5528 while (num_bytes > 0) {
5046 alloc_size = min(num_bytes, root->fs_info->max_extent); 5529 alloc_size = min(num_bytes, root->fs_info->max_extent);
5530
5531 ret = btrfs_reserve_metadata_space(root, 1);
5532 if (ret)
5533 goto out;
5534
5047 ret = btrfs_reserve_extent(trans, root, alloc_size, 5535 ret = btrfs_reserve_extent(trans, root, alloc_size,
5048 root->sectorsize, 0, alloc_hint, 5536 root->sectorsize, 0, alloc_hint,
5049 (u64)-1, &ins, 1); 5537 (u64)-1, &ins, 1);
@@ -5058,9 +5546,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5058 0, 0, 0, 5546 0, 0, 0,
5059 BTRFS_FILE_EXTENT_PREALLOC); 5547 BTRFS_FILE_EXTENT_PREALLOC);
5060 BUG_ON(ret); 5548 BUG_ON(ret);
5549 btrfs_drop_extent_cache(inode, cur_offset,
	5550	 				cur_offset + ins.offset - 1, 0);
5061 num_bytes -= ins.offset; 5551 num_bytes -= ins.offset;
5062 cur_offset += ins.offset; 5552 cur_offset += ins.offset;
5063 alloc_hint = ins.objectid + ins.offset; 5553 alloc_hint = ins.objectid + ins.offset;
5554 btrfs_unreserve_metadata_space(root, 1);
5064 } 5555 }
5065out: 5556out:
5066 if (cur_offset > start) { 5557 if (cur_offset > start) {
@@ -5201,7 +5692,7 @@ static int btrfs_permission(struct inode *inode, int mask)
5201 return generic_permission(inode, mask, btrfs_check_acl); 5692 return generic_permission(inode, mask, btrfs_check_acl);
5202} 5693}
5203 5694
5204static struct inode_operations btrfs_dir_inode_operations = { 5695static const struct inode_operations btrfs_dir_inode_operations = {
5205 .getattr = btrfs_getattr, 5696 .getattr = btrfs_getattr,
5206 .lookup = btrfs_lookup, 5697 .lookup = btrfs_lookup,
5207 .create = btrfs_create, 5698 .create = btrfs_create,
@@ -5219,11 +5710,12 @@ static struct inode_operations btrfs_dir_inode_operations = {
5219 .removexattr = btrfs_removexattr, 5710 .removexattr = btrfs_removexattr,
5220 .permission = btrfs_permission, 5711 .permission = btrfs_permission,
5221}; 5712};
5222static struct inode_operations btrfs_dir_ro_inode_operations = { 5713static const struct inode_operations btrfs_dir_ro_inode_operations = {
5223 .lookup = btrfs_lookup, 5714 .lookup = btrfs_lookup,
5224 .permission = btrfs_permission, 5715 .permission = btrfs_permission,
5225}; 5716};
5226static struct file_operations btrfs_dir_file_operations = { 5717
5718static const struct file_operations btrfs_dir_file_operations = {
5227 .llseek = generic_file_llseek, 5719 .llseek = generic_file_llseek,
5228 .read = generic_read_dir, 5720 .read = generic_read_dir,
5229 .readdir = btrfs_real_readdir, 5721 .readdir = btrfs_real_readdir,
@@ -5245,6 +5737,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5245 .readpage_io_failed_hook = btrfs_io_failed_hook, 5737 .readpage_io_failed_hook = btrfs_io_failed_hook,
5246 .set_bit_hook = btrfs_set_bit_hook, 5738 .set_bit_hook = btrfs_set_bit_hook,
5247 .clear_bit_hook = btrfs_clear_bit_hook, 5739 .clear_bit_hook = btrfs_clear_bit_hook,
5740 .merge_extent_hook = btrfs_merge_extent_hook,
5741 .split_extent_hook = btrfs_split_extent_hook,
5248}; 5742};
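The two new hooks exist so the delalloc_extents counter introduced earlier
stays exact when the extent io tree changes shape underneath it: merging two
adjacent delalloc ranges must drop the count by one, and splitting one range
must raise it by one. Plausible shapes (the hook table above gives only the
names; the signatures and bodies here are assumed):

        static int btrfs_merge_extent_hook(struct inode *inode,
                                           struct extent_state *new,
                                           struct extent_state *other)
        {
                if (new->state & EXTENT_DELALLOC)
                        BTRFS_I(inode)->delalloc_extents--; /* two become one */
                return 0;
        }

        static int btrfs_split_extent_hook(struct inode *inode,
                                           struct extent_state *orig,
                                           u64 split)
        {
                if (orig->state & EXTENT_DELALLOC)
                        BTRFS_I(inode)->delalloc_extents++; /* one becomes two */
                return 0;
        }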
5249 5743
5250/* 5744/*
@@ -5259,7 +5753,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5259 * 5753 *
5260 * For now we're avoiding this by dropping bmap. 5754 * For now we're avoiding this by dropping bmap.
5261 */ 5755 */
5262static struct address_space_operations btrfs_aops = { 5756static const struct address_space_operations btrfs_aops = {
5263 .readpage = btrfs_readpage, 5757 .readpage = btrfs_readpage,
5264 .writepage = btrfs_writepage, 5758 .writepage = btrfs_writepage,
5265 .writepages = btrfs_writepages, 5759 .writepages = btrfs_writepages,
@@ -5269,16 +5763,17 @@ static struct address_space_operations btrfs_aops = {
5269 .invalidatepage = btrfs_invalidatepage, 5763 .invalidatepage = btrfs_invalidatepage,
5270 .releasepage = btrfs_releasepage, 5764 .releasepage = btrfs_releasepage,
5271 .set_page_dirty = btrfs_set_page_dirty, 5765 .set_page_dirty = btrfs_set_page_dirty,
5766 .error_remove_page = generic_error_remove_page,
5272}; 5767};
5273 5768
5274static struct address_space_operations btrfs_symlink_aops = { 5769static const struct address_space_operations btrfs_symlink_aops = {
5275 .readpage = btrfs_readpage, 5770 .readpage = btrfs_readpage,
5276 .writepage = btrfs_writepage, 5771 .writepage = btrfs_writepage,
5277 .invalidatepage = btrfs_invalidatepage, 5772 .invalidatepage = btrfs_invalidatepage,
5278 .releasepage = btrfs_releasepage, 5773 .releasepage = btrfs_releasepage,
5279}; 5774};
5280 5775
5281static struct inode_operations btrfs_file_inode_operations = { 5776static const struct inode_operations btrfs_file_inode_operations = {
5282 .truncate = btrfs_truncate, 5777 .truncate = btrfs_truncate,
5283 .getattr = btrfs_getattr, 5778 .getattr = btrfs_getattr,
5284 .setattr = btrfs_setattr, 5779 .setattr = btrfs_setattr,
@@ -5290,7 +5785,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5290 .fallocate = btrfs_fallocate, 5785 .fallocate = btrfs_fallocate,
5291 .fiemap = btrfs_fiemap, 5786 .fiemap = btrfs_fiemap,
5292}; 5787};
5293static struct inode_operations btrfs_special_inode_operations = { 5788static const struct inode_operations btrfs_special_inode_operations = {
5294 .getattr = btrfs_getattr, 5789 .getattr = btrfs_getattr,
5295 .setattr = btrfs_setattr, 5790 .setattr = btrfs_setattr,
5296 .permission = btrfs_permission, 5791 .permission = btrfs_permission,
@@ -5299,7 +5794,7 @@ static struct inode_operations btrfs_special_inode_operations = {
5299 .listxattr = btrfs_listxattr, 5794 .listxattr = btrfs_listxattr,
5300 .removexattr = btrfs_removexattr, 5795 .removexattr = btrfs_removexattr,
5301}; 5796};
5302static struct inode_operations btrfs_symlink_inode_operations = { 5797static const struct inode_operations btrfs_symlink_inode_operations = {
5303 .readlink = generic_readlink, 5798 .readlink = generic_readlink,
5304 .follow_link = page_follow_link_light, 5799 .follow_link = page_follow_link_light,
5305 .put_link = page_put_link, 5800 .put_link = page_put_link,
@@ -5309,3 +5804,7 @@ static struct inode_operations btrfs_symlink_inode_operations = {
5309 .listxattr = btrfs_listxattr, 5804 .listxattr = btrfs_listxattr,
5310 .removexattr = btrfs_removexattr, 5805 .removexattr = btrfs_removexattr,
5311}; 5806};
5807
5808struct dentry_operations btrfs_dentry_operations = {
5809 .d_delete = btrfs_dentry_delete,
5810};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd88f25889f7..9a780c8d0ac8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
230 struct btrfs_root_item root_item; 230 struct btrfs_root_item root_item;
231 struct btrfs_inode_item *inode_item; 231 struct btrfs_inode_item *inode_item;
232 struct extent_buffer *leaf; 232 struct extent_buffer *leaf;
233 struct btrfs_root *new_root = root; 233 struct btrfs_root *new_root;
234 struct inode *dir; 234 struct inode *dir = dentry->d_parent->d_inode;
235 int ret; 235 int ret;
236 int err; 236 int err;
237 u64 objectid; 237 u64 objectid;
@@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 index = 0; 239 u64 index = 0;
240 unsigned long nr = 1; 240 unsigned long nr = 1;
241 241
242 ret = btrfs_check_metadata_free_space(root); 242 /*
243 * 1 - inode item
244 * 2 - refs
245 * 1 - root item
246 * 2 - dir items
247 */
248 ret = btrfs_reserve_metadata_space(root, 6);
243 if (ret) 249 if (ret)
244 goto fail_commit; 250 return ret;
245 251
246 trans = btrfs_start_transaction(root, 1); 252 trans = btrfs_start_transaction(root, 1);
247 BUG_ON(!trans); 253 BUG_ON(!trans);
@@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root,
304 if (ret) 310 if (ret)
305 goto fail; 311 goto fail;
306 312
313 key.offset = (u64)-1;
314 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
315 BUG_ON(IS_ERR(new_root));
316
317 btrfs_record_root_in_trans(trans, new_root);
318
319 ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
320 BTRFS_I(dir)->block_group);
307 /* 321 /*
308 * insert the directory item 322 * insert the directory item
309 */ 323 */
310 key.offset = (u64)-1;
311 dir = dentry->d_parent->d_inode;
312 ret = btrfs_set_inode_index(dir, &index); 324 ret = btrfs_set_inode_index(dir, &index);
313 BUG_ON(ret); 325 BUG_ON(ret);
314 326
@@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root,
322 ret = btrfs_update_inode(trans, root, dir); 334 ret = btrfs_update_inode(trans, root, dir);
323 BUG_ON(ret); 335 BUG_ON(ret);
324 336
325 /* add the backref first */
326 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 337 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
327 objectid, BTRFS_ROOT_BACKREF_KEY, 338 objectid, root->root_key.objectid,
328 root->root_key.objectid,
329 dir->i_ino, index, name, namelen); 339 dir->i_ino, index, name, namelen);
330 340
331 BUG_ON(ret); 341 BUG_ON(ret);
332 342
333 /* now add the forward ref */ 343 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
334 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
335 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
336 objectid,
337 dir->i_ino, index, name, namelen);
338
339 BUG_ON(ret);
340
341 ret = btrfs_commit_transaction(trans, root);
342 if (ret)
343 goto fail_commit;
344
345 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
346 BUG_ON(!new_root);
347
348 trans = btrfs_start_transaction(new_root, 1);
349 BUG_ON(!trans);
350
351 ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
352 BTRFS_I(dir)->block_group);
353 if (ret)
354 goto fail;
355
356fail: 344fail:
357 nr = trans->blocks_used; 345 nr = trans->blocks_used;
358 err = btrfs_commit_transaction(trans, new_root); 346 err = btrfs_commit_transaction(trans, root);
359 if (err && !ret) 347 if (err && !ret)
360 ret = err; 348 ret = err;
361fail_commit: 349
350 btrfs_unreserve_metadata_space(root, 6);
362 btrfs_btree_balance_dirty(root, nr); 351 btrfs_btree_balance_dirty(root, nr);
363 return ret; 352 return ret;
364} 353}
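
Note: both create_subvol() and create_snapshot() now follow the same reserve-up-front discipline. A minimal sketch of that pattern; the count of 6 and the helper names come from the hunk above, the control flow labels are illustrative:

	/*
	 * reserve 6 metadata items before touching any tree:
	 * 1 inode item + 2 refs + 1 root item + 2 dir items
	 */
	ret = btrfs_reserve_metadata_space(root, 6);
	if (ret)
		return ret;		/* nothing held yet, plain return */

	trans = btrfs_start_transaction(root, 1);
	/* ... tree modifications; errors fall through to the commit ... */
	err = btrfs_commit_transaction(trans, root);
	if (err && !ret)
		ret = err;

	btrfs_unreserve_metadata_space(root, 6);	/* every exit path */
	return ret;

The unreserve must pair with the reserve on every path, which is why the error exits in create_snapshot() above each call btrfs_unreserve_metadata_space() before jumping to fail_unlock.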
@@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
375 if (!root->ref_cows) 364 if (!root->ref_cows)
376 return -EINVAL; 365 return -EINVAL;
377 366
378 ret = btrfs_check_metadata_free_space(root); 367 /*
368 * 1 - inode item
369 * 2 - refs
370 * 1 - root item
371 * 2 - dir items
372 */
373 ret = btrfs_reserve_metadata_space(root, 6);
379 if (ret) 374 if (ret)
380 goto fail_unlock; 375 goto fail_unlock;
381 376
382 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 377 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
383 if (!pending_snapshot) { 378 if (!pending_snapshot) {
384 ret = -ENOMEM; 379 ret = -ENOMEM;
380 btrfs_unreserve_metadata_space(root, 6);
385 goto fail_unlock; 381 goto fail_unlock;
386 } 382 }
387 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 383 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
388 if (!pending_snapshot->name) { 384 if (!pending_snapshot->name) {
389 ret = -ENOMEM; 385 ret = -ENOMEM;
390 kfree(pending_snapshot); 386 kfree(pending_snapshot);
387 btrfs_unreserve_metadata_space(root, 6);
391 goto fail_unlock; 388 goto fail_unlock;
392 } 389 }
393 memcpy(pending_snapshot->name, name, namelen); 390 memcpy(pending_snapshot->name, name, namelen);
@@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
420 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup 417 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
421 * inside this filesystem so it's quite a bit simpler. 418 * inside this filesystem so it's quite a bit simpler.
422 */ 419 */
423static noinline int btrfs_mksubvol(struct path *parent, char *name, 420static noinline int btrfs_mksubvol(struct path *parent,
424 int mode, int namelen, 421 char *name, int namelen,
425 struct btrfs_root *snap_src) 422 struct btrfs_root *snap_src)
426{ 423{
424 struct inode *dir = parent->dentry->d_inode;
427 struct dentry *dentry; 425 struct dentry *dentry;
428 int error; 426 int error;
429 427
430 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); 428 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
431 429
432 dentry = lookup_one_len(name, parent->dentry, namelen); 430 dentry = lookup_one_len(name, parent->dentry, namelen);
433 error = PTR_ERR(dentry); 431 error = PTR_ERR(dentry);
@@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
438 if (dentry->d_inode) 436 if (dentry->d_inode)
439 goto out_dput; 437 goto out_dput;
440 438
441 if (!IS_POSIXACL(parent->dentry->d_inode))
442 mode &= ~current_umask();
443
444 error = mnt_want_write(parent->mnt); 439 error = mnt_want_write(parent->mnt);
445 if (error) 440 if (error)
446 goto out_dput; 441 goto out_dput;
447 442
448 error = btrfs_may_create(parent->dentry->d_inode, dentry); 443 error = btrfs_may_create(dir, dentry);
449 if (error) 444 if (error)
450 goto out_drop_write; 445 goto out_drop_write;
451 446
452 /* 447 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
453 * Actually perform the low-level subvolume creation after all 448
454 * this VFS fuzz. 449 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
455 * 450 goto out_up_read;
456 * Eventually we want to pass in an inode under which we create this 451
457 * subvolume, but for now all are under the filesystem root.
458 *
459 * Also we should pass on the mode eventually to allow creating new
460 * subvolume with specific mode bits.
461 */
462 if (snap_src) { 452 if (snap_src) {
463 struct dentry *dir = dentry->d_parent; 453 error = create_snapshot(snap_src, dentry,
464 struct dentry *test = dir->d_parent; 454 name, namelen);
465 struct btrfs_path *path = btrfs_alloc_path();
466 int ret;
467 u64 test_oid;
468 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
469
470 test_oid = snap_src->root_key.objectid;
471
472 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
473 path, parent_oid, test_oid);
474 if (ret == 0)
475 goto create;
476 btrfs_release_path(snap_src->fs_info->tree_root, path);
477
478 /* we need to make sure we aren't creating a directory loop
479 * by taking a snapshot of something that has our current
480 * subvol in its directory tree. So, this loops through
481 * the dentries and checks the forward refs for each subvolume
482 * to see if it references the subvolume where we are
483 * placing this new snapshot.
484 */
485 while (1) {
486 if (!test ||
487 dir == snap_src->fs_info->sb->s_root ||
488 test == snap_src->fs_info->sb->s_root ||
489 test->d_inode->i_sb != snap_src->fs_info->sb) {
490 break;
491 }
492 if (S_ISLNK(test->d_inode->i_mode)) {
493 printk(KERN_INFO "Btrfs symlink in snapshot "
494 "path, failed\n");
495 error = -EMLINK;
496 btrfs_free_path(path);
497 goto out_drop_write;
498 }
499 test_oid =
500 BTRFS_I(test->d_inode)->root->root_key.objectid;
501 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
502 path, test_oid, parent_oid);
503 if (ret == 0) {
504 printk(KERN_INFO "Btrfs snapshot creation "
505 "failed, looping\n");
506 error = -EMLINK;
507 btrfs_free_path(path);
508 goto out_drop_write;
509 }
510 btrfs_release_path(snap_src->fs_info->tree_root, path);
511 test = test->d_parent;
512 }
513create:
514 btrfs_free_path(path);
515 error = create_snapshot(snap_src, dentry, name, namelen);
516 } else { 455 } else {
517 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, 456 error = create_subvol(BTRFS_I(dir)->root, dentry,
518 dentry, name, namelen); 457 name, namelen);
519 } 458 }
520 if (error) 459 if (!error)
521 goto out_drop_write; 460 fsnotify_mkdir(dir, dentry);
522 461out_up_read:
523 fsnotify_mkdir(parent->dentry->d_inode, dentry); 462 up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
524out_drop_write: 463out_drop_write:
525 mnt_drop_write(parent->mnt); 464 mnt_drop_write(parent->mnt);
526out_dput: 465out_dput:
527 dput(dentry); 466 dput(dentry);
528out_unlock: 467out_unlock:
529 mutex_unlock(&parent->dentry->d_inode->i_mutex); 468 mutex_unlock(&dir->i_mutex);
530 return error; 469 return error;
531} 470}
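
The rewritten btrfs_mksubvol() pins the whole operation under a fixed lock order. A condensed sketch of that order, using only names from the hunk above (error handling elided):

	/* lock order, released in reverse via the out_* labels:
	 * parent i_mutex -> mnt_want_write() -> subvol_sem (read side) */
	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_one_len(name, parent->dentry, namelen);
	error = mnt_want_write(parent->mnt);
	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);

	/* a root whose refcount already hit zero is queued for
	 * deletion; refuse to create anything inside it */
	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
		goto out_up_read;

Taking subvol_sem for read here is what lets snap_destroy below take it for write and know no concurrent create is racing with the deletion.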
532 471
533
534static int btrfs_defrag_file(struct file *file) 472static int btrfs_defrag_file(struct file *file)
535{ 473{
536 struct inode *inode = fdentry(file)->d_inode; 474 struct inode *inode = fdentry(file)->d_inode;
@@ -596,9 +534,8 @@ again:
596 clear_page_dirty_for_io(page); 534 clear_page_dirty_for_io(page);
597 535
598 btrfs_set_extent_delalloc(inode, page_start, page_end); 536 btrfs_set_extent_delalloc(inode, page_start, page_end);
599
600 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
601 set_page_dirty(page); 537 set_page_dirty(page);
538 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
602 unlock_page(page); 539 unlock_page(page);
603 page_cache_release(page); 540 page_cache_release(page);
604 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 541 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
@@ -609,7 +546,8 @@ out_unlock:
609 return 0; 546 return 0;
610} 547}
611 548
612static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) 549static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
550 void __user *arg)
613{ 551{
614 u64 new_size; 552 u64 new_size;
615 u64 old_size; 553 u64 old_size;
@@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
718{ 656{
719 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 657 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
720 struct btrfs_ioctl_vol_args *vol_args; 658 struct btrfs_ioctl_vol_args *vol_args;
721 struct btrfs_dir_item *di;
722 struct btrfs_path *path;
723 struct file *src_file; 659 struct file *src_file;
724 u64 root_dirid;
725 int namelen; 660 int namelen;
726 int ret = 0; 661 int ret = 0;
727 662
@@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
739 goto out; 674 goto out;
740 } 675 }
741 676
742 path = btrfs_alloc_path();
743 if (!path) {
744 ret = -ENOMEM;
745 goto out;
746 }
747
748 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
749 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
750 path, root_dirid,
751 vol_args->name, namelen, 0);
752 btrfs_free_path(path);
753
754 if (di && !IS_ERR(di)) {
755 ret = -EEXIST;
756 goto out;
757 }
758
759 if (IS_ERR(di)) {
760 ret = PTR_ERR(di);
761 goto out;
762 }
763
764 if (subvol) { 677 if (subvol) {
765 ret = btrfs_mksubvol(&file->f_path, vol_args->name, 678 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
766 file->f_path.dentry->d_inode->i_mode, 679 NULL);
767 namelen, NULL);
768 } else { 680 } else {
769 struct inode *src_inode; 681 struct inode *src_inode;
770 src_file = fget(vol_args->fd); 682 src_file = fget(vol_args->fd);
@@ -781,17 +693,156 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
781 fput(src_file); 693 fput(src_file);
782 goto out; 694 goto out;
783 } 695 }
784 ret = btrfs_mksubvol(&file->f_path, vol_args->name, 696 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
785 file->f_path.dentry->d_inode->i_mode, 697 BTRFS_I(src_inode)->root);
786 namelen, BTRFS_I(src_inode)->root);
787 fput(src_file); 698 fput(src_file);
788 } 699 }
789
790out: 700out:
791 kfree(vol_args); 701 kfree(vol_args);
792 return ret; 702 return ret;
793} 703}
794 704
705/*
706 * helper to check if the subvolume references other subvolumes
707 */
708static noinline int may_destroy_subvol(struct btrfs_root *root)
709{
710 struct btrfs_path *path;
711 struct btrfs_key key;
712 int ret;
713
714 path = btrfs_alloc_path();
715 if (!path)
716 return -ENOMEM;
717
718 key.objectid = root->root_key.objectid;
719 key.type = BTRFS_ROOT_REF_KEY;
720 key.offset = (u64)-1;
721
722 ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
723 &key, path, 0, 0);
724 if (ret < 0)
725 goto out;
726 BUG_ON(ret == 0);
727
728 ret = 0;
729 if (path->slots[0] > 0) {
730 path->slots[0]--;
731 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
732 if (key.objectid == root->root_key.objectid &&
733 key.type == BTRFS_ROOT_REF_KEY)
734 ret = -ENOTEMPTY;
735 }
736out:
737 btrfs_free_path(path);
738 return ret;
739}
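
may_destroy_subvol() uses a common btrfs b-tree idiom: search for an impossible maximal key and step back one slot to land on the last real item of that type, if any. A hedged generic form of the idiom; this helper is hypothetical, only the search calls and key layout are from the code above:

	/* returns 0 and fills *found with the last (objectid, type, *)
	 * item, 1 if there is none, negative errno on search failure */
	static int find_last_of_type(struct btrfs_root *tree, u64 objectid,
				     u8 type, struct btrfs_path *path,
				     struct btrfs_key *found)
	{
		struct btrfs_key key = {
			.objectid = objectid,
			.type = type,
			.offset = (u64)-1,	/* cannot exist: search
						   lands just past the
						   last real item */
		};
		int ret = btrfs_search_slot(NULL, tree, &key, path, 0, 0);

		if (ret < 0)
			return ret;
		if (path->slots[0] == 0)
			return 1;	/* nothing at or before the key */
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
		return (found->objectid == objectid &&
			found->type == type) ? 0 : 1;
	}

In may_destroy_subvol() a hit on BTRFS_ROOT_REF_KEY means a child subvolume still references this root, hence -ENOTEMPTY.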
740
741static noinline int btrfs_ioctl_snap_destroy(struct file *file,
742 void __user *arg)
743{
744 struct dentry *parent = fdentry(file);
745 struct dentry *dentry;
746 struct inode *dir = parent->d_inode;
747 struct inode *inode;
748 struct btrfs_root *root = BTRFS_I(dir)->root;
749 struct btrfs_root *dest = NULL;
750 struct btrfs_ioctl_vol_args *vol_args;
751 struct btrfs_trans_handle *trans;
752 int namelen;
753 int ret;
754 int err = 0;
755
756 if (!capable(CAP_SYS_ADMIN))
757 return -EPERM;
758
759 vol_args = memdup_user(arg, sizeof(*vol_args));
760 if (IS_ERR(vol_args))
761 return PTR_ERR(vol_args);
762
763 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
764 namelen = strlen(vol_args->name);
765 if (strchr(vol_args->name, '/') ||
766 strncmp(vol_args->name, "..", namelen) == 0) {
767 err = -EINVAL;
768 goto out;
769 }
770
771 err = mnt_want_write(file->f_path.mnt);
772 if (err)
773 goto out;
774
775 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
776 dentry = lookup_one_len(vol_args->name, parent, namelen);
777 if (IS_ERR(dentry)) {
778 err = PTR_ERR(dentry);
779 goto out_unlock_dir;
780 }
781
782 if (!dentry->d_inode) {
783 err = -ENOENT;
784 goto out_dput;
785 }
786
787 inode = dentry->d_inode;
788 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
789 err = -EINVAL;
790 goto out_dput;
791 }
792
793 dest = BTRFS_I(inode)->root;
794
795 mutex_lock(&inode->i_mutex);
796 err = d_invalidate(dentry);
797 if (err)
798 goto out_unlock;
799
800 down_write(&root->fs_info->subvol_sem);
801
802 err = may_destroy_subvol(dest);
803 if (err)
804 goto out_up_write;
805
806 trans = btrfs_start_transaction(root, 1);
807 ret = btrfs_unlink_subvol(trans, root, dir,
808 dest->root_key.objectid,
809 dentry->d_name.name,
810 dentry->d_name.len);
811 BUG_ON(ret);
812
813 btrfs_record_root_in_trans(trans, dest);
814
815 memset(&dest->root_item.drop_progress, 0,
816 sizeof(dest->root_item.drop_progress));
817 dest->root_item.drop_level = 0;
818 btrfs_set_root_refs(&dest->root_item, 0);
819
820 ret = btrfs_insert_orphan_item(trans,
821 root->fs_info->tree_root,
822 dest->root_key.objectid);
823 BUG_ON(ret);
824
825 ret = btrfs_commit_transaction(trans, root);
826 BUG_ON(ret);
827 inode->i_flags |= S_DEAD;
828out_up_write:
829 up_write(&root->fs_info->subvol_sem);
830out_unlock:
831 mutex_unlock(&inode->i_mutex);
832 if (!err) {
833 btrfs_invalidate_inodes(dest);
834 d_delete(dentry);
835 }
836out_dput:
837 dput(dentry);
838out_unlock_dir:
839 mutex_unlock(&dir->i_mutex);
840 mnt_drop_write(file->f_path.mnt);
841out:
842 kfree(vol_args);
843 return err;
844}
845
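For reference, a minimal userspace caller for the new ioctl. This is a sketch, not part of the patch; it assumes only the BTRFS_IOC_SNAP_DESTROY and struct btrfs_ioctl_vol_args definitions that the ioctl.h hunk below adds:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* btrfs ioctl definitions from this patch */

	static int destroy_snapshot(const char *parent_dir, const char *name)
	{
		struct btrfs_ioctl_vol_args args;
		int fd, ret;

		fd = open(parent_dir, O_RDONLY);
		if (fd < 0)
			return -1;

		memset(&args, 0, sizeof(args));
		strncpy(args.name, name, BTRFS_PATH_NAME_MAX);

		/* the kernel resolves 'name' under fd's directory,
		 * verifies it is a subvolume root, unlinks it and
		 * queues the root as an orphan for the cleaner */
		ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args);
		close(fd);
		return ret;
	}
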
795static int btrfs_ioctl_defrag(struct file *file) 846static int btrfs_ioctl_defrag(struct file *file)
796{ 847{
797 struct inode *inode = fdentry(file)->d_inode; 848 struct inode *inode = fdentry(file)->d_inode;
@@ -865,8 +916,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
865 return ret; 916 return ret;
866} 917}
867 918
868static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, 919static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
869 u64 off, u64 olen, u64 destoff) 920 u64 off, u64 olen, u64 destoff)
870{ 921{
871 struct inode *inode = fdentry(file)->d_inode; 922 struct inode *inode = fdentry(file)->d_inode;
872 struct btrfs_root *root = BTRFS_I(inode)->root; 923 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -976,7 +1027,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
976 1027
977 /* punch hole in destination first */ 1028 /* punch hole in destination first */
978 btrfs_drop_extents(trans, root, inode, off, off + len, 1029 btrfs_drop_extents(trans, root, inode, off, off + len,
979 off + len, 0, &hint_byte); 1030 off + len, 0, &hint_byte, 1);
980 1031
981 /* clone data */ 1032 /* clone data */
982 key.objectid = src->i_ino; 1033 key.objectid = src->i_ino;
@@ -1071,8 +1122,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1071 datao += off - key.offset; 1122 datao += off - key.offset;
1072 datal -= off - key.offset; 1123 datal -= off - key.offset;
1073 } 1124 }
1074 if (key.offset + datao + datal + key.offset > 1125 if (key.offset + datao + datal > off + len)
1075 off + len)
1076 datal = off + len - key.offset - datao; 1126 datal = off + len - key.offset - datao;
1077 /* disko == 0 means it's a hole */ 1127 /* disko == 0 means it's a hole */
1078 if (!disko) 1128 if (!disko)
@@ -1182,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
1182 struct inode *inode = fdentry(file)->d_inode; 1232 struct inode *inode = fdentry(file)->d_inode;
1183 struct btrfs_root *root = BTRFS_I(inode)->root; 1233 struct btrfs_root *root = BTRFS_I(inode)->root;
1184 struct btrfs_trans_handle *trans; 1234 struct btrfs_trans_handle *trans;
1185 int ret = 0; 1235 int ret;
1186 1236
1237 ret = -EPERM;
1187 if (!capable(CAP_SYS_ADMIN)) 1238 if (!capable(CAP_SYS_ADMIN))
1188 return -EPERM; 1239 goto out;
1189 1240
1190 if (file->private_data) { 1241 ret = -EINPROGRESS;
1191 ret = -EINPROGRESS; 1242 if (file->private_data)
1192 goto out; 1243 goto out;
1193 }
1194 1244
1195 ret = mnt_want_write(file->f_path.mnt); 1245 ret = mnt_want_write(file->f_path.mnt);
1196 if (ret) 1246 if (ret)
@@ -1200,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
1200 root->fs_info->open_ioctl_trans++; 1250 root->fs_info->open_ioctl_trans++;
1201 mutex_unlock(&root->fs_info->trans_mutex); 1251 mutex_unlock(&root->fs_info->trans_mutex);
1202 1252
1253 ret = -ENOMEM;
1203 trans = btrfs_start_ioctl_transaction(root, 0); 1254 trans = btrfs_start_ioctl_transaction(root, 0);
1204 if (trans) 1255 if (!trans)
1205 file->private_data = trans; 1256 goto out_drop;
1206 else 1257
1207 ret = -ENOMEM; 1258 file->private_data = trans;
1208 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ 1259 return 0;
1260
1261out_drop:
1262 mutex_lock(&root->fs_info->trans_mutex);
1263 root->fs_info->open_ioctl_trans--;
1264 mutex_unlock(&root->fs_info->trans_mutex);
1265 mnt_drop_write(file->f_path.mnt);
1209out: 1266out:
1210 return ret; 1267 return ret;
1211} 1268}
@@ -1221,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file)
1221 struct inode *inode = fdentry(file)->d_inode; 1278 struct inode *inode = fdentry(file)->d_inode;
1222 struct btrfs_root *root = BTRFS_I(inode)->root; 1279 struct btrfs_root *root = BTRFS_I(inode)->root;
1223 struct btrfs_trans_handle *trans; 1280 struct btrfs_trans_handle *trans;
1224 int ret = 0;
1225 1281
1226 trans = file->private_data; 1282 trans = file->private_data;
1227 if (!trans) { 1283 if (!trans)
1228 ret = -EINVAL; 1284 return -EINVAL;
1229 goto out;
1230 }
1231 btrfs_end_transaction(trans, root);
1232 file->private_data = NULL; 1285 file->private_data = NULL;
1233 1286
1287 btrfs_end_transaction(trans, root);
1288
1234 mutex_lock(&root->fs_info->trans_mutex); 1289 mutex_lock(&root->fs_info->trans_mutex);
1235 root->fs_info->open_ioctl_trans--; 1290 root->fs_info->open_ioctl_trans--;
1236 mutex_unlock(&root->fs_info->trans_mutex); 1291 mutex_unlock(&root->fs_info->trans_mutex);
1237 1292
1238 mnt_drop_write(file->f_path.mnt); 1293 mnt_drop_write(file->f_path.mnt);
1239 1294 return 0;
1240out:
1241 return ret;
1242} 1295}
1243 1296
1244long btrfs_ioctl(struct file *file, unsigned int 1297long btrfs_ioctl(struct file *file, unsigned int
@@ -1258,6 +1311,8 @@ long btrfs_ioctl(struct file *file, unsigned int
1258 return btrfs_ioctl_snap_create(file, argp, 0); 1311 return btrfs_ioctl_snap_create(file, argp, 0);
1259 case BTRFS_IOC_SUBVOL_CREATE: 1312 case BTRFS_IOC_SUBVOL_CREATE:
1260 return btrfs_ioctl_snap_create(file, argp, 1); 1313 return btrfs_ioctl_snap_create(file, argp, 1);
1314 case BTRFS_IOC_SNAP_DESTROY:
1315 return btrfs_ioctl_snap_destroy(file, argp);
1261 case BTRFS_IOC_DEFRAG: 1316 case BTRFS_IOC_DEFRAG:
1262 return btrfs_ioctl_defrag(file); 1317 return btrfs_ioctl_defrag(file);
1263 case BTRFS_IOC_RESIZE: 1318 case BTRFS_IOC_RESIZE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index b320b103fa13..bc49914475eb 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
65 65
66#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ 66#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
67 struct btrfs_ioctl_vol_args) 67 struct btrfs_ioctl_vol_args)
68 68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args)
69#endif 70#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..897fba835f89 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
159 * 159 *
160 * len is the length of the extent 160 * len is the length of the extent
161 * 161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was 162 * The tree is given a single reference on the ordered extent that was
165 * inserted. 163 * inserted.
166 */ 164 */
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
181 entry->start = start; 179 entry->start = start;
182 entry->len = len; 180 entry->len = len;
183 entry->disk_len = disk_len; 181 entry->disk_len = disk_len;
182 entry->bytes_left = len;
184 entry->inode = inode; 183 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 184 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags); 185 set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
195 &entry->rb_node); 194 &entry->rb_node);
196 BUG_ON(node); 195 BUG_ON(node);
197 196
198 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199 entry_end(entry) - 1, GFP_NOFS);
200
201 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202 list_add_tail(&entry->root_extent_list, 198 list_add_tail(&entry->root_extent_list,
203 &BTRFS_I(inode)->root->fs_info->ordered_extents); 199 &BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
241 struct btrfs_ordered_inode_tree *tree; 237 struct btrfs_ordered_inode_tree *tree;
242 struct rb_node *node; 238 struct rb_node *node;
243 struct btrfs_ordered_extent *entry; 239 struct btrfs_ordered_extent *entry;
244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245 int ret; 240 int ret;
246 241
247 tree = &BTRFS_I(inode)->ordered_tree; 242 tree = &BTRFS_I(inode)->ordered_tree;
248 mutex_lock(&tree->mutex); 243 mutex_lock(&tree->mutex);
249 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250 GFP_NOFS);
251 node = tree_search(tree, file_offset); 244 node = tree_search(tree, file_offset);
252 if (!node) { 245 if (!node) {
253 ret = 1; 246 ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
260 goto out; 253 goto out;
261 } 254 }
262 255
263 ret = test_range_bit(io_tree, entry->file_offset, 256 if (io_size > entry->bytes_left) {
264 entry->file_offset + entry->len - 1, 257 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
265 EXTENT_ORDERED, 0); 258 (unsigned long long)entry->bytes_left,
266 if (ret == 0) 259 (unsigned long long)io_size);
260 }
261 entry->bytes_left -= io_size;
262 if (entry->bytes_left == 0)
267 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 263 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
264 else
265 ret = 1;
268out: 266out:
269 mutex_unlock(&tree->mutex); 267 mutex_unlock(&tree->mutex);
270 return ret == 0; 268 return ret == 0;
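
The per-range EXTENT_ORDERED bit is gone; completion is now tracked by counting bytes. The invariant, condensed from the hunk above:

	/* entry->bytes_left starts at len (set in
	 * btrfs_add_ordered_extent above) and is only ever decremented
	 * under tree->mutex; the extent is complete exactly when it
	 * reaches zero, and a decrement past zero is a bug */
	entry->bytes_left -= io_size;
	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;	/* more IO still outstanding */

A plain counter is cheaper than setting and clearing extent-state bits across the io_tree for every ordered range, which is what the removed set_extent_ordered()/clear_extent_ordered() calls were doing.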
@@ -460,7 +458,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
460 * start IO on any dirty ones so the wait doesn't stall waiting 458 * start IO on any dirty ones so the wait doesn't stall waiting
461 * for pdflush to find them 459 * for pdflush to find them
462 */ 460 */
463 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); 461 filemap_fdatawrite_range(inode->i_mapping, start, end);
464 if (wait) { 462 if (wait) {
465 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
466 &entry->flags)); 464 &entry->flags));
@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
476 u64 orig_end; 474 u64 orig_end;
477 u64 wait_end; 475 u64 wait_end;
478 struct btrfs_ordered_extent *ordered; 476 struct btrfs_ordered_extent *ordered;
477 int found;
479 478
480 if (start + len < start) { 479 if (start + len < start) {
481 orig_end = INT_LIMIT(loff_t); 480 orig_end = INT_LIMIT(loff_t);
@@ -489,19 +488,18 @@ again:
489 /* start IO across the range first to instantiate any delalloc 488 /* start IO across the range first to instantiate any delalloc
490 * extents 489 * extents
491 */ 490 */
492 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 491 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
493 492
494 /* The compression code will leave pages locked but return from 493 /* The compression code will leave pages locked but return from
495 * writepage without setting the page writeback. Starting again 494 * writepage without setting the page writeback. Starting again
496 * with WB_SYNC_ALL will end up waiting for the IO to actually start. 495 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
497 */ 496 */
498 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 497 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
499 498
500 btrfs_wait_on_page_writeback_range(inode->i_mapping, 499 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
501 start >> PAGE_CACHE_SHIFT,
502 orig_end >> PAGE_CACHE_SHIFT);
503 500
504 end = orig_end; 501 end = orig_end;
502 found = 0;
505 while (1) { 503 while (1) {
506 ordered = btrfs_lookup_first_ordered_extent(inode, end); 504 ordered = btrfs_lookup_first_ordered_extent(inode, end);
507 if (!ordered) 505 if (!ordered)
@@ -514,6 +512,7 @@ again:
514 btrfs_put_ordered_extent(ordered); 512 btrfs_put_ordered_extent(ordered);
515 break; 513 break;
516 } 514 }
515 found++;
517 btrfs_start_ordered_extent(inode, ordered, 1); 516 btrfs_start_ordered_extent(inode, ordered, 1);
518 end = ordered->file_offset; 517 end = ordered->file_offset;
519 btrfs_put_ordered_extent(ordered); 518 btrfs_put_ordered_extent(ordered);
@@ -521,8 +520,8 @@ again:
521 break; 520 break;
522 end--; 521 end--;
523 } 522 }
524 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, 523 if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
525 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { 524 EXTENT_DELALLOC, 0, NULL)) {
526 schedule_timeout(1); 525 schedule_timeout(1);
527 goto again; 526 goto again;
528 } 527 }
@@ -613,7 +612,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
613 */ 612 */
614 if (test_range_bit(io_tree, disk_i_size, 613 if (test_range_bit(io_tree, disk_i_size,
615 ordered->file_offset + ordered->len - 1, 614 ordered->file_offset + ordered->len - 1,
616 EXTENT_DELALLOC, 0)) { 615 EXTENT_DELALLOC, 0, NULL)) {
617 goto out; 616 goto out;
618 } 617 }
619 /* 618 /*
@@ -664,7 +663,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
664 */ 663 */
665 if (i_size_test > entry_end(ordered) && 664 if (i_size_test > entry_end(ordered) &&
666 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, 665 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
667 EXTENT_DELALLOC, 0)) { 666 EXTENT_DELALLOC, 0, NULL)) {
668 new_i_size = min_t(u64, i_size_test, i_size_read(inode)); 667 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
669 } 668 }
670 BTRFS_I(inode)->disk_i_size = new_i_size; 669 BTRFS_I(inode)->disk_i_size = new_i_size;
@@ -715,90 +714,6 @@ out:
715} 714}
716 715
717 716
718/**
719 * taken from mm/filemap.c because it isn't exported
720 *
721 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
722 * @mapping: address space structure to write
723 * @start: offset in bytes where the range starts
724 * @end: offset in bytes where the range ends (inclusive)
725 * @sync_mode: enable synchronous operation
726 *
727 * Start writeback against all of a mapping's dirty pages that lie
728 * within the byte offsets <start, end> inclusive.
729 *
730 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
731 * opposed to a regular memory cleansing writeback. The difference between
732 * these two operations is that if a dirty page/buffer is encountered, it must
733 * be waited upon, and not just skipped over.
734 */
735int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
736 loff_t end, int sync_mode)
737{
738 struct writeback_control wbc = {
739 .sync_mode = sync_mode,
740 .nr_to_write = mapping->nrpages * 2,
741 .range_start = start,
742 .range_end = end,
743 .for_writepages = 1,
744 };
745 return btrfs_writepages(mapping, &wbc);
746}
747
748/**
749 * taken from mm/filemap.c because it isn't exported
750 *
751 * wait_on_page_writeback_range - wait for writeback to complete
752 * @mapping: target address_space
753 * @start: beginning page index
754 * @end: ending page index
755 *
756 * Wait for writeback to complete against pages indexed by start->end
757 * inclusive
758 */
759int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
760 pgoff_t start, pgoff_t end)
761{
762 struct pagevec pvec;
763 int nr_pages;
764 int ret = 0;
765 pgoff_t index;
766
767 if (end < start)
768 return 0;
769
770 pagevec_init(&pvec, 0);
771 index = start;
772 while ((index <= end) &&
773 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
774 PAGECACHE_TAG_WRITEBACK,
775 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
776 unsigned i;
777
778 for (i = 0; i < nr_pages; i++) {
779 struct page *page = pvec.pages[i];
780
781 /* until radix tree lookup accepts end_index */
782 if (page->index > end)
783 continue;
784
785 wait_on_page_writeback(page);
786 if (PageError(page))
787 ret = -EIO;
788 }
789 pagevec_release(&pvec);
790 cond_resched();
791 }
792
793 /* Check for outstanding write errors */
794 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
795 ret = -ENOSPC;
796 if (test_and_clear_bit(AS_EIO, &mapping->flags))
797 ret = -EIO;
798
799 return ret;
800}
801
802/* 717/*
803 * add a given inode to the list of inodes that must be fully on 718 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes. 719 * disk before a transaction commit finishes.
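
With filemap_fdatawait_range() now exported, the two local copies removed above become dead weight. A minimal sketch of the generic replacements; both take byte offsets with an inclusive end and live in <linux/fs.h>:

	filemap_fdatawrite_range(inode->i_mapping, start, end);	/* start IO */
	filemap_fdatawait_range(inode->i_mapping, start, end);	/* wait for it */

	/* when write plus wait with error reporting is wanted in one
	 * call, the combined helper (as the relocation.c hunk below
	 * uses for the btree inode): */
	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
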
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3d31c8827b01..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
85 /* extent length on disk */ 85 /* extent length on disk */
86 u64 disk_len; 86 u64 disk_len;
87 87
88 /* number of bytes that still need writing */
89 u64 bytes_left;
90
88 /* flags (described above) */ 91 /* flags (described above) */
89 unsigned long flags; 92 unsigned long flags;
90 93
@@ -150,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
150int btrfs_ordered_update_i_size(struct inode *inode, 153int btrfs_ordered_update_i_size(struct inode *inode,
151 struct btrfs_ordered_extent *ordered); 154 struct btrfs_ordered_extent *ordered);
152int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
153int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
154 pgoff_t start, pgoff_t end);
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 156int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 157int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 158int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 3c0d52af4f80..79cba5fbc28e 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -65,3 +65,23 @@ out:
65 btrfs_free_path(path); 65 btrfs_free_path(path);
66 return ret; 66 return ret;
67} 67}
68
69int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
70{
71 struct btrfs_path *path;
72 struct btrfs_key key;
73 int ret;
74
75 key.objectid = BTRFS_ORPHAN_OBJECTID;
76 key.type = BTRFS_ORPHAN_ITEM_KEY;
77 key.offset = offset;
78
79 path = btrfs_alloc_path();
80 if (!path)
81 return -ENOMEM;
82
83 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
84
85 btrfs_free_path(path);
86 return ret;
87}
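
btrfs_find_orphan_item() passes the btrfs_search_slot() return convention straight through to its caller. A hedged usage sketch; the caller name is hypothetical:

	/* 0 = orphan item present, > 0 = absent, < 0 = -errno */
	ret = btrfs_find_orphan_item(fs_info->tree_root, root_objectid);
	if (ret < 0)
		return ret;		/* the search itself failed */
	if (ret == 0)
		resume_pending_root_drop(root);	/* hypothetical caller */
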
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c04f7f212602..361ad323faac 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -121,6 +121,15 @@ struct inodevec {
121 int nr; 121 int nr;
122}; 122};
123 123
124#define MAX_EXTENTS 128
125
126struct file_extent_cluster {
127 u64 start;
128 u64 end;
129 u64 boundary[MAX_EXTENTS];
130 unsigned int nr;
131};
132
124struct reloc_control { 133struct reloc_control {
125 /* block group to relocate */ 134 /* block group to relocate */
126 struct btrfs_block_group_cache *block_group; 135 struct btrfs_block_group_cache *block_group;
@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2180 struct reloc_control *rc) 2189 struct reloc_control *rc)
2181{ 2190{
2182 if (test_range_bit(&rc->processed_blocks, bytenr, 2191 if (test_range_bit(&rc->processed_blocks, bytenr,
2183 bytenr + blocksize - 1, EXTENT_DIRTY, 1)) 2192 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2184 return 1; 2193 return 1;
2185 return 0; 2194 return 0;
2186} 2195}
@@ -2529,56 +2538,94 @@ out:
2529} 2538}
2530 2539
2531static noinline_for_stack 2540static noinline_for_stack
2532int relocate_inode_pages(struct inode *inode, u64 start, u64 len) 2541int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2542 u64 block_start)
2543{
2544 struct btrfs_root *root = BTRFS_I(inode)->root;
2545 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2546 struct extent_map *em;
2547 int ret = 0;
2548
2549 em = alloc_extent_map(GFP_NOFS);
2550 if (!em)
2551 return -ENOMEM;
2552
2553 em->start = start;
2554 em->len = end + 1 - start;
2555 em->block_len = em->len;
2556 em->block_start = block_start;
2557 em->bdev = root->fs_info->fs_devices->latest_bdev;
2558 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2559
2560 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2561 while (1) {
2562 write_lock(&em_tree->lock);
2563 ret = add_extent_mapping(em_tree, em);
2564 write_unlock(&em_tree->lock);
2565 if (ret != -EEXIST) {
2566 free_extent_map(em);
2567 break;
2568 }
2569 btrfs_drop_extent_cache(inode, start, end, 0);
2570 }
2571 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2572 return ret;
2573}
2574
2575static int relocate_file_extent_cluster(struct inode *inode,
2576 struct file_extent_cluster *cluster)
2533{ 2577{
2534 u64 page_start; 2578 u64 page_start;
2535 u64 page_end; 2579 u64 page_end;
2536 unsigned long i; 2580 u64 offset = BTRFS_I(inode)->index_cnt;
2537 unsigned long first_index; 2581 unsigned long index;
2538 unsigned long last_index; 2582 unsigned long last_index;
2539 unsigned int total_read = 0; 2583 unsigned int dirty_page = 0;
2540 unsigned int total_dirty = 0;
2541 struct page *page; 2584 struct page *page;
2542 struct file_ra_state *ra; 2585 struct file_ra_state *ra;
2543 struct btrfs_ordered_extent *ordered; 2586 int nr = 0;
2544 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2545 int ret = 0; 2587 int ret = 0;
2546 2588
2589 if (!cluster->nr)
2590 return 0;
2591
2547 ra = kzalloc(sizeof(*ra), GFP_NOFS); 2592 ra = kzalloc(sizeof(*ra), GFP_NOFS);
2548 if (!ra) 2593 if (!ra)
2549 return -ENOMEM; 2594 return -ENOMEM;
2550 2595
2596 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2597 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2598
2551 mutex_lock(&inode->i_mutex); 2599 mutex_lock(&inode->i_mutex);
2552 first_index = start >> PAGE_CACHE_SHIFT;
2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2554 2600
2555 /* make sure the dirty trick played by the caller work */ 2601 i_size_write(inode, cluster->end + 1 - offset);
2556 while (1) { 2602 ret = setup_extent_mapping(inode, cluster->start - offset,
2557 ret = invalidate_inode_pages2_range(inode->i_mapping, 2603 cluster->end - offset, cluster->start);
2558 first_index, last_index);
2559 if (ret != -EBUSY)
2560 break;
2561 schedule_timeout(HZ/10);
2562 }
2563 if (ret) 2604 if (ret)
2564 goto out_unlock; 2605 goto out_unlock;
2565 2606
2566 file_ra_state_init(ra, inode->i_mapping); 2607 file_ra_state_init(ra, inode->i_mapping);
2567 2608
2568 for (i = first_index ; i <= last_index; i++) { 2609 WARN_ON(cluster->start != cluster->boundary[0]);
2569 if (total_read % ra->ra_pages == 0) { 2610 while (index <= last_index) {
2570 btrfs_force_ra(inode->i_mapping, ra, NULL, i, 2611 page = find_lock_page(inode->i_mapping, index);
2571 min(last_index, ra->ra_pages + i - 1));
2572 }
2573 total_read++;
2574again:
2575 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
2576 BUG_ON(1);
2577 page = grab_cache_page(inode->i_mapping, i);
2578 if (!page) { 2612 if (!page) {
2579 ret = -ENOMEM; 2613 page_cache_sync_readahead(inode->i_mapping,
2580 goto out_unlock; 2614 ra, NULL, index,
2615 last_index + 1 - index);
2616 page = grab_cache_page(inode->i_mapping, index);
2617 if (!page) {
2618 ret = -ENOMEM;
2619 goto out_unlock;
2620 }
2621 }
2622
2623 if (PageReadahead(page)) {
2624 page_cache_async_readahead(inode->i_mapping,
2625 ra, NULL, page, index,
2626 last_index + 1 - index);
2581 } 2627 }
2628
2582 if (!PageUptodate(page)) { 2629 if (!PageUptodate(page)) {
2583 btrfs_readpage(NULL, page); 2630 btrfs_readpage(NULL, page);
2584 lock_page(page); 2631 lock_page(page);
@@ -2589,75 +2636,79 @@ again:
2589 goto out_unlock; 2636 goto out_unlock;
2590 } 2637 }
2591 } 2638 }
2592 wait_on_page_writeback(page);
2593 2639
2594 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 2640 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2595 page_end = page_start + PAGE_CACHE_SIZE - 1; 2641 page_end = page_start + PAGE_CACHE_SIZE - 1;
2596 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 2642
2597 2643 lock_extent(&BTRFS_I(inode)->io_tree,
2598 ordered = btrfs_lookup_ordered_extent(inode, page_start); 2644 page_start, page_end, GFP_NOFS);
2599 if (ordered) { 2645
2600 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2601 unlock_page(page);
2602 page_cache_release(page);
2603 btrfs_start_ordered_extent(inode, ordered, 1);
2604 btrfs_put_ordered_extent(ordered);
2605 goto again;
2606 }
2607 set_page_extent_mapped(page); 2646 set_page_extent_mapped(page);
2608 2647
2609 if (i == first_index) 2648 if (nr < cluster->nr &&
2610 set_extent_bits(io_tree, page_start, page_end, 2649 page_start + offset == cluster->boundary[nr]) {
2650 set_extent_bits(&BTRFS_I(inode)->io_tree,
2651 page_start, page_end,
2611 EXTENT_BOUNDARY, GFP_NOFS); 2652 EXTENT_BOUNDARY, GFP_NOFS);
2653 nr++;
2654 }
2612 btrfs_set_extent_delalloc(inode, page_start, page_end); 2655 btrfs_set_extent_delalloc(inode, page_start, page_end);
2613 2656
2614 set_page_dirty(page); 2657 set_page_dirty(page);
2615 total_dirty++; 2658 dirty_page++;
2616 2659
2617 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 2660 unlock_extent(&BTRFS_I(inode)->io_tree,
2661 page_start, page_end, GFP_NOFS);
2618 unlock_page(page); 2662 unlock_page(page);
2619 page_cache_release(page); 2663 page_cache_release(page);
2664
2665 index++;
2666 if (nr < cluster->nr &&
2667 page_end + 1 + offset == cluster->boundary[nr]) {
2668 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2669 dirty_page);
2670 dirty_page = 0;
2671 }
2672 }
2673 if (dirty_page) {
2674 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2675 dirty_page);
2620 } 2676 }
2677 WARN_ON(nr != cluster->nr);
2621out_unlock: 2678out_unlock:
2622 mutex_unlock(&inode->i_mutex); 2679 mutex_unlock(&inode->i_mutex);
2623 kfree(ra); 2680 kfree(ra);
2624 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
2625 return ret; 2681 return ret;
2626} 2682}
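
The page loop above replaces btrfs_force_ra() with the generic on-demand readahead pair. A condensed sketch of that dance, assuming only <linux/pagemap.h>; passing NULL for the file pointer is fine for kernel-internal inodes like the relocation inode:

	static struct page *get_page_with_ra(struct address_space *mapping,
					     struct file_ra_state *ra,
					     pgoff_t index, pgoff_t last)
	{
		struct page *page = find_lock_page(mapping, index);

		if (!page) {
			/* cache miss: kick synchronous readahead for the
			 * rest of the range, then insert the page */
			page_cache_sync_readahead(mapping, ra, NULL, index,
						  last + 1 - index);
			page = grab_cache_page(mapping, index);
			if (!page)
				return NULL;
		}
		if (PageReadahead(page))
			/* marker page: pipeline the next window */
			page_cache_async_readahead(mapping, ra, NULL, page,
						   index, last + 1 - index);
		return page;	/* locked, may still need btrfs_readpage */
	}
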
2627 2683
2628static noinline_for_stack 2684static noinline_for_stack
2629int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) 2685int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2686 struct file_extent_cluster *cluster)
2630{ 2687{
2631 struct btrfs_root *root = BTRFS_I(inode)->root; 2688 int ret;
2632 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2633 struct extent_map *em;
2634 u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
2635 u64 end = start + extent_key->offset - 1;
2636
2637 em = alloc_extent_map(GFP_NOFS);
2638 em->start = start;
2639 em->len = extent_key->offset;
2640 em->block_len = extent_key->offset;
2641 em->block_start = extent_key->objectid;
2642 em->bdev = root->fs_info->fs_devices->latest_bdev;
2643 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2644 2689
2645 /* setup extent map to cheat btrfs_readpage */ 2690 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2646 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); 2691 ret = relocate_file_extent_cluster(inode, cluster);
2647 while (1) { 2692 if (ret)
2648 int ret; 2693 return ret;
2649 spin_lock(&em_tree->lock); 2694 cluster->nr = 0;
2650 ret = add_extent_mapping(em_tree, em);
2651 spin_unlock(&em_tree->lock);
2652 if (ret != -EEXIST) {
2653 free_extent_map(em);
2654 break;
2655 }
2656 btrfs_drop_extent_cache(inode, start, end, 0);
2657 } 2695 }
2658 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2659 2696
2660 return relocate_inode_pages(inode, start, extent_key->offset); 2697 if (!cluster->nr)
2698 cluster->start = extent_key->objectid;
2699 else
2700 BUG_ON(cluster->nr >= MAX_EXTENTS);
2701 cluster->end = extent_key->objectid + extent_key->offset - 1;
2702 cluster->boundary[cluster->nr] = extent_key->objectid;
2703 cluster->nr++;
2704
2705 if (cluster->nr >= MAX_EXTENTS) {
2706 ret = relocate_file_extent_cluster(inode, cluster);
2707 if (ret)
2708 return ret;
2709 cluster->nr = 0;
2710 }
2711 return 0;
2661} 2712}
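
relocate_data_extent() no longer maps and dirties one extent at a time; it accumulates contiguous extents into a cluster and flushes in batches. The policy in one place, with hypothetical flush/append helpers standing in for the inline code above:

	/* two flush triggers:
	 * (a) the new extent is not contiguous with the cluster,
	 * (b) the boundary array is full (MAX_EXTENTS == 128) */
	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1)
		flush_cluster(inode, cluster);	/* relocate, then nr = 0 */

	append_extent(cluster, extent_key);	/* record start/end/boundary */

	if (cluster->nr >= MAX_EXTENTS)
		flush_cluster(inode, cluster);

Batching lets relocate_file_extent_cluster() set up one extent mapping and one readahead pass per cluster instead of per extent, with the boundary[] array preserving where each original extent started so EXTENT_BOUNDARY bits land on the right pages.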
2662 2713
2663#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2714#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
3203 return 0; 3254 return 0;
3204} 3255}
3205 3256
3257
3206static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3258static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3207{ 3259{
3208 struct rb_root blocks = RB_ROOT; 3260 struct rb_root blocks = RB_ROOT;
3209 struct btrfs_key key; 3261 struct btrfs_key key;
3262 struct file_extent_cluster *cluster;
3210 struct btrfs_trans_handle *trans = NULL; 3263 struct btrfs_trans_handle *trans = NULL;
3211 struct btrfs_path *path; 3264 struct btrfs_path *path;
3212 struct btrfs_extent_item *ei; 3265 struct btrfs_extent_item *ei;
@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3216 int ret; 3269 int ret;
3217 int err = 0; 3270 int err = 0;
3218 3271
3272 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3273 if (!cluster)
3274 return -ENOMEM;
3275
3219 path = btrfs_alloc_path(); 3276 path = btrfs_alloc_path();
3220 if (!path) 3277 if (!path)
3221 return -ENOMEM; 3278 return -ENOMEM;
3222 3279
3280 rc->extents_found = 0;
3281 rc->extents_skipped = 0;
3282
3223 rc->search_start = rc->block_group->key.objectid; 3283 rc->search_start = rc->block_group->key.objectid;
3224 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3284 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3225 GFP_NOFS); 3285 GFP_NOFS);
@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3306 } 3366 }
3307 3367
3308 nr = trans->blocks_used; 3368 nr = trans->blocks_used;
3309 btrfs_end_transaction_throttle(trans, rc->extent_root); 3369 btrfs_end_transaction(trans, rc->extent_root);
3310 trans = NULL; 3370 trans = NULL;
3311 btrfs_btree_balance_dirty(rc->extent_root, nr); 3371 btrfs_btree_balance_dirty(rc->extent_root, nr);
3312 3372
3313 if (rc->stage == MOVE_DATA_EXTENTS && 3373 if (rc->stage == MOVE_DATA_EXTENTS &&
3314 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3374 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3315 rc->found_file_extent = 1; 3375 rc->found_file_extent = 1;
3316 ret = relocate_data_extent(rc->data_inode, &key); 3376 ret = relocate_data_extent(rc->data_inode,
3377 &key, cluster);
3317 if (ret < 0) { 3378 if (ret < 0) {
3318 err = ret; 3379 err = ret;
3319 break; 3380 break;
@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3328 btrfs_btree_balance_dirty(rc->extent_root, nr); 3389 btrfs_btree_balance_dirty(rc->extent_root, nr);
3329 } 3390 }
3330 3391
3392 if (!err) {
3393 ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3394 if (ret < 0)
3395 err = ret;
3396 }
3397
3398 kfree(cluster);
3399
3331 rc->create_reloc_root = 0; 3400 rc->create_reloc_root = 0;
3332 smp_mb(); 3401 smp_mb();
3333 3402
@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3348} 3417}
3349 3418
3350static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3419static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3351 struct btrfs_root *root, 3420 struct btrfs_root *root, u64 objectid)
3352 u64 objectid, u64 size)
3353{ 3421{
3354 struct btrfs_path *path; 3422 struct btrfs_path *path;
3355 struct btrfs_inode_item *item; 3423 struct btrfs_inode_item *item;
@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3368 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3436 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3369 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 3437 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3370 btrfs_set_inode_generation(leaf, item, 1); 3438 btrfs_set_inode_generation(leaf, item, 1);
3371 btrfs_set_inode_size(leaf, item, size); 3439 btrfs_set_inode_size(leaf, item, 0);
3372 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3440 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3373 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3441 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3374 btrfs_mark_buffer_dirty(leaf); 3442 btrfs_mark_buffer_dirty(leaf);
@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3404 if (err) 3472 if (err)
3405 goto out; 3473 goto out;
3406 3474
3407 err = __insert_orphan_inode(trans, root, objectid, group->key.offset); 3475 err = __insert_orphan_inode(trans, root, objectid);
3408 BUG_ON(err);
3409
3410 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
3411 group->key.offset, 0, group->key.offset,
3412 0, 0, 0);
3413 BUG_ON(err); 3476 BUG_ON(err);
3414 3477
3415 key.objectid = objectid; 3478 key.objectid = objectid;
@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3475 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 3538 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3476 3539
3477 while (1) { 3540 while (1) {
3478 mutex_lock(&fs_info->cleaner_mutex);
3479 btrfs_clean_old_snapshots(fs_info->tree_root);
3480 mutex_unlock(&fs_info->cleaner_mutex);
3481
3482 rc->extents_found = 0; 3541 rc->extents_found = 0;
3483 rc->extents_skipped = 0; 3542 rc->extents_skipped = 0;
3484 3543
3544 mutex_lock(&fs_info->cleaner_mutex);
3545
3546 btrfs_clean_old_snapshots(fs_info->tree_root);
3485 ret = relocate_block_group(rc); 3547 ret = relocate_block_group(rc);
3548
3549 mutex_unlock(&fs_info->cleaner_mutex);
3486 if (ret < 0) { 3550 if (ret < 0) {
3487 err = ret; 3551 err = ret;
3488 break; 3552 break;
@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 } 3578 }
3515 } 3579 }
3516 3580
3517 filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, 3581 filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3518 rc->block_group->key.objectid, 3582 rc->block_group->key.objectid,
3519 rc->block_group->key.objectid + 3583 rc->block_group->key.objectid +
3520 rc->block_group->key.offset - 1); 3584 rc->block_group->key.offset - 1);
3521 3585
3522 WARN_ON(rc->block_group->pinned > 0); 3586 WARN_ON(rc->block_group->pinned > 0);
3523 WARN_ON(rc->block_group->reserved > 0); 3587 WARN_ON(rc->block_group->reserved > 0);
@@ -3530,6 +3594,26 @@ out:
3530 return err; 3594 return err;
3531} 3595}
3532 3596
3597static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3598{
3599 struct btrfs_trans_handle *trans;
3600 int ret;
3601
3602 trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3603
3604 memset(&root->root_item.drop_progress, 0,
3605 sizeof(root->root_item.drop_progress));
3606 root->root_item.drop_level = 0;
3607 btrfs_set_root_refs(&root->root_item, 0);
3608 ret = btrfs_update_root(trans, root->fs_info->tree_root,
3609 &root->root_key, &root->root_item);
3610 BUG_ON(ret);
3611
3612 ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3613 BUG_ON(ret);
3614 return 0;
3615}
3616
3533/* 3617/*
3534 * recover relocation interrupted by system crash. 3618 * recover relocation interrupted by system crash.
3535 * 3619 *
@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3589 fs_root = read_fs_root(root->fs_info, 3673 fs_root = read_fs_root(root->fs_info,
3590 reloc_root->root_key.offset); 3674 reloc_root->root_key.offset);
3591 if (IS_ERR(fs_root)) { 3675 if (IS_ERR(fs_root)) {
3592 err = PTR_ERR(fs_root); 3676 ret = PTR_ERR(fs_root);
3593 goto out; 3677 if (ret != -ENOENT) {
3678 err = ret;
3679 goto out;
3680 }
3681 mark_garbage_root(reloc_root);
3594 } 3682 }
3595 } 3683 }
3596 3684
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0ddc6d61c55a..9351428f30e2 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
94 goto out; 94 goto out;
95 95
96 BUG_ON(ret == 0); 96 BUG_ON(ret == 0);
97 if (path->slots[0] == 0) {
98 ret = 1;
99 goto out;
100 }
97 l = path->nodes[0]; 101 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1; 102 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot); 103 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) { 104 if (found_key.objectid != objectid ||
105 found_key.type != BTRFS_ROOT_ITEM_KEY) {
102 ret = 1; 106 ret = 1;
103 goto out; 107 goto out;
104 } 108 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), 109 if (item)
106 sizeof(*item)); 110 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
107 memcpy(key, &found_key, sizeof(found_key)); 111 sizeof(*item));
112 if (key)
113 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0; 114 ret = 0;
109out: 115out:
110 btrfs_free_path(path); 116 btrfs_free_path(path);
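
btrfs_find_last_root() now also tolerates NULL out-parameters, so a pure existence check gets cheaper. A short usage sketch:

	/* probe only: no item copy, no key copy; 0 means a root item
	 * for 'objectid' exists, 1 means it does not, negative errno
	 * on search failure */
	ret = btrfs_find_last_root(tree_root, objectid, NULL, NULL);
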
@@ -249,6 +255,59 @@ err:
249 return ret; 255 return ret;
250} 256}
251 257
258int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259{
260 struct extent_buffer *leaf;
261 struct btrfs_path *path;
262 struct btrfs_key key;
263 int err = 0;
264 int ret;
265
266 path = btrfs_alloc_path();
267 if (!path)
268 return -ENOMEM;
269
270 key.objectid = BTRFS_ORPHAN_OBJECTID;
271 key.type = BTRFS_ORPHAN_ITEM_KEY;
272 key.offset = 0;
273
274 while (1) {
275 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
276 if (ret < 0) {
277 err = ret;
278 break;
279 }
280
281 leaf = path->nodes[0];
282 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
283 ret = btrfs_next_leaf(tree_root, path);
284 if (ret < 0)
285 err = ret;
286 if (ret != 0)
287 break;
288 leaf = path->nodes[0];
289 }
290
291 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
292 btrfs_release_path(tree_root, path);
293
294 if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
295 key.type != BTRFS_ORPHAN_ITEM_KEY)
296 break;
297
298 ret = btrfs_find_dead_roots(tree_root, key.offset);
299 if (ret) {
300 err = ret;
301 break;
302 }
303
304 key.offset++;
305 }
306
307 btrfs_free_path(path);
308 return err;
309}
310
252/* drop the root item for 'key' from 'root' */ 311/* drop the root item for 'key' from 'root' */
253int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 312int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
254 struct btrfs_key *key) 313 struct btrfs_key *key)
@@ -278,31 +337,57 @@ out:
278 return ret; 337 return ret;
279} 338}
280 339
281#if 0 /* this will get used when snapshot deletion is implemented */
282int btrfs_del_root_ref(struct btrfs_trans_handle *trans, 340int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
283 struct btrfs_root *tree_root, 341 struct btrfs_root *tree_root,
284 u64 root_id, u8 type, u64 ref_id) 342 u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
343 const char *name, int name_len)
344
285{ 345{
346 struct btrfs_path *path;
347 struct btrfs_root_ref *ref;
348 struct extent_buffer *leaf;
286 struct btrfs_key key; 349 struct btrfs_key key;
350 unsigned long ptr;
351 int err = 0;
287 int ret; 352 int ret;
288 struct btrfs_path *path;
289 353
290 path = btrfs_alloc_path(); 354 path = btrfs_alloc_path();
355 if (!path)
356 return -ENOMEM;
291 357
292 key.objectid = root_id; 358 key.objectid = root_id;
293 key.type = type; 359 key.type = BTRFS_ROOT_BACKREF_KEY;
294 key.offset = ref_id; 360 key.offset = ref_id;
295 361again:
296 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 362 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
297 BUG_ON(ret); 363 BUG_ON(ret < 0);
298 364 if (ret == 0) {
299 ret = btrfs_del_item(trans, tree_root, path); 365 leaf = path->nodes[0];
300 BUG_ON(ret); 366 ref = btrfs_item_ptr(leaf, path->slots[0],
367 struct btrfs_root_ref);
368
369 WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
370 WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
371 ptr = (unsigned long)(ref + 1);
372 WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
373 *sequence = btrfs_root_ref_sequence(leaf, ref);
374
375 ret = btrfs_del_item(trans, tree_root, path);
376 BUG_ON(ret);
377 } else
378 err = -ENOENT;
379
380 if (key.type == BTRFS_ROOT_BACKREF_KEY) {
381 btrfs_release_path(tree_root, path);
382 key.objectid = ref_id;
383 key.type = BTRFS_ROOT_REF_KEY;
384 key.offset = root_id;
385 goto again;
386 }
301 387
302 btrfs_free_path(path); 388 btrfs_free_path(path);
303 return ret; 389 return err;
304} 390}
305#endif
306 391
307int btrfs_find_root_ref(struct btrfs_root *tree_root, 392int btrfs_find_root_ref(struct btrfs_root *tree_root,
308 struct btrfs_path *path, 393 struct btrfs_path *path,
@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
319 return ret; 404 return ret;
320} 405}
321 406
322
323/* 407/*
324 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY 408 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
325 * or BTRFS_ROOT_BACKREF_KEY. 409 * or BTRFS_ROOT_BACKREF_KEY.
@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
335 */ 419 */
336int btrfs_add_root_ref(struct btrfs_trans_handle *trans, 420int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
337 struct btrfs_root *tree_root, 421 struct btrfs_root *tree_root,
338 u64 root_id, u8 type, u64 ref_id, 422 u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
339 u64 dirid, u64 sequence,
340 const char *name, int name_len) 423 const char *name, int name_len)
341{ 424{
342 struct btrfs_key key; 425 struct btrfs_key key;
@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
346 struct extent_buffer *leaf; 429 struct extent_buffer *leaf;
347 unsigned long ptr; 430 unsigned long ptr;
348 431
349
350 path = btrfs_alloc_path(); 432 path = btrfs_alloc_path();
433 if (!path)
434 return -ENOMEM;
351 435
352 key.objectid = root_id; 436 key.objectid = root_id;
353 key.type = type; 437 key.type = BTRFS_ROOT_BACKREF_KEY;
354 key.offset = ref_id; 438 key.offset = ref_id;
355 439again:
356 ret = btrfs_insert_empty_item(trans, tree_root, path, &key, 440 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
357 sizeof(*ref) + name_len); 441 sizeof(*ref) + name_len);
358 BUG_ON(ret); 442 BUG_ON(ret);
@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
366 write_extent_buffer(leaf, name, ptr, name_len); 450 write_extent_buffer(leaf, name, ptr, name_len);
367 btrfs_mark_buffer_dirty(leaf); 451 btrfs_mark_buffer_dirty(leaf);
368 452
453 if (key.type == BTRFS_ROOT_BACKREF_KEY) {
454 btrfs_release_path(tree_root, path);
455 key.objectid = ref_id;
456 key.type = BTRFS_ROOT_REF_KEY;
457 key.offset = root_id;
458 goto again;
459 }
460
369 btrfs_free_path(path); 461 btrfs_free_path(path);
370 return ret; 462 return 0;
371} 463}
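
A note on the root-tree hunks above: the old two-call scheme (one call per direction, selected by the u8 type argument) is folded into a single pass that handles both the backref and the forward ref. The search key is rewritten in place and the code loops exactly once via a goto. A minimal sketch of the idiom, using the same 2.6.32-era btrfs key constants:

    struct btrfs_key key;

    key.objectid = root_id;
    key.type = BTRFS_ROOT_BACKREF_KEY;  /* first pass: the backref */
    key.offset = ref_id;
again:
    /* ... insert or delete the item described by key ... */
    if (key.type == BTRFS_ROOT_BACKREF_KEY) {
            /* second pass: swap objectid/offset for the forward ref */
            key.objectid = ref_id;
            key.type = BTRFS_ROOT_REF_KEY;
            key.offset = root_id;
            goto again;
    }
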
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6d6d06cb6dfc..9de9b2236419 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,7 @@
51#include "export.h" 51#include "export.h"
52#include "compression.h" 52#include "compression.h"
53 53
54static struct super_operations btrfs_super_ops; 54static const struct super_operations btrfs_super_ops;
55 55
56static void btrfs_put_super(struct super_block *sb) 56static void btrfs_put_super(struct super_block *sb)
57{ 57{
@@ -344,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb,
344 sb->s_export_op = &btrfs_export_ops; 344 sb->s_export_op = &btrfs_export_ops;
345 sb->s_xattr = btrfs_xattr_handlers; 345 sb->s_xattr = btrfs_xattr_handlers;
346 sb->s_time_gran = 1; 346 sb->s_time_gran = 1;
347#ifdef CONFIG_BTRFS_POSIX_ACL
347 sb->s_flags |= MS_POSIXACL; 348 sb->s_flags |= MS_POSIXACL;
349#endif
348 350
349 tree_root = open_ctree(sb, fs_devices, (char *)data); 351 tree_root = open_ctree(sb, fs_devices, (char *)data);
350 352
@@ -675,7 +677,8 @@ static int btrfs_unfreeze(struct super_block *sb)
675 return 0; 677 return 0;
676} 678}
677 679
678static struct super_operations btrfs_super_ops = { 680static const struct super_operations btrfs_super_ops = {
681 .drop_inode = btrfs_drop_inode,
679 .delete_inode = btrfs_delete_inode, 682 .delete_inode = btrfs_delete_inode,
680 .put_super = btrfs_put_super, 683 .put_super = btrfs_put_super,
681 .sync_fs = btrfs_sync_fs, 684 .sync_fs = btrfs_sync_fs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cdbb5022da52..0b8f36d4400a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
104{ 104{
105 if (root->ref_cows && root->last_trans < trans->transid) { 105 if (root->ref_cows && root->last_trans < trans->transid) {
106 WARN_ON(root == root->fs_info->extent_root); 106 WARN_ON(root == root->fs_info->extent_root);
107 WARN_ON(root->root_item.refs == 0);
108 WARN_ON(root->commit_root != root->node); 107 WARN_ON(root->commit_root != root->node);
109 108
110 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 109 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
@@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
187 h->alloc_exclude_start = 0; 186 h->alloc_exclude_start = 0;
188 h->delayed_ref_updates = 0; 187 h->delayed_ref_updates = 0;
189 188
189 if (!current->journal_info)
190 current->journal_info = h;
191
190 root->fs_info->running_transaction->use_count++; 192 root->fs_info->running_transaction->use_count++;
191 record_root_in_trans(h, root); 193 record_root_in_trans(h, root);
192 mutex_unlock(&root->fs_info->trans_mutex); 194 mutex_unlock(&root->fs_info->trans_mutex);
@@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
318 wake_up(&cur_trans->writer_wait); 320 wake_up(&cur_trans->writer_wait);
319 put_transaction(cur_trans); 321 put_transaction(cur_trans);
320 mutex_unlock(&info->trans_mutex); 322 mutex_unlock(&info->trans_mutex);
323
324 if (current->journal_info == trans)
325 current->journal_info = NULL;
321 memset(trans, 0, sizeof(*trans)); 326 memset(trans, 0, sizeof(*trans));
322 kmem_cache_free(btrfs_trans_handle_cachep, trans); 327 kmem_cache_free(btrfs_trans_handle_cachep, trans);
323 328
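
On the journal_info hunks above: btrfs borrows the journal_info field of task_struct (otherwise unused by btrfs tasks, and used by jbd for the same purpose) to record the transaction handle owned by the current task, so code reached from inside a transaction can detect that fact without the handle being threaded through every call chain. The pattern, as a sketch:

    /* on start: adopt the slot only if no outer transaction owns it */
    if (!current->journal_info)
            current->journal_info = h;

    /* on end: release the slot only if it still points at this handle */
    if (current->journal_info == trans)
            current->journal_info = NULL;
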
@@ -720,7 +725,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
720 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 725 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
721 726
722 key.objectid = objectid; 727 key.objectid = objectid;
723 key.offset = 0; 728 /* record when the snapshot was created in key.offset */
729 key.offset = trans->transid;
724 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 730 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
725 731
726 old = btrfs_lock_root_node(root); 732 old = btrfs_lock_root_node(root);
@@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
743 memcpy(&pending->root_key, &key, sizeof(key)); 749 memcpy(&pending->root_key, &key, sizeof(key));
744fail: 750fail:
745 kfree(new_root_item); 751 kfree(new_root_item);
752 btrfs_unreserve_metadata_space(root, 6);
746 return ret; 753 return ret;
747} 754}
748 755
@@ -778,24 +785,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
778 ret = btrfs_update_inode(trans, parent_root, parent_inode); 785 ret = btrfs_update_inode(trans, parent_root, parent_inode);
779 BUG_ON(ret); 786 BUG_ON(ret);
780 787
781 /* add the backref first */
782 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 788 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
783 pending->root_key.objectid, 789 pending->root_key.objectid,
784 BTRFS_ROOT_BACKREF_KEY,
785 parent_root->root_key.objectid, 790 parent_root->root_key.objectid,
786 parent_inode->i_ino, index, pending->name, 791 parent_inode->i_ino, index, pending->name,
787 namelen); 792 namelen);
788 793
789 BUG_ON(ret); 794 BUG_ON(ret);
790 795
791 /* now add the forward ref */
792 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
793 parent_root->root_key.objectid,
794 BTRFS_ROOT_REF_KEY,
795 pending->root_key.objectid,
796 parent_inode->i_ino, index, pending->name,
797 namelen);
798
799 inode = btrfs_lookup_dentry(parent_inode, pending->dentry); 796 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
800 d_instantiate(pending->dentry, inode); 797 d_instantiate(pending->dentry, inode);
801fail: 798fail:
@@ -874,7 +871,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
874 unsigned long timeout = 1; 871 unsigned long timeout = 1;
875 struct btrfs_transaction *cur_trans; 872 struct btrfs_transaction *cur_trans;
876 struct btrfs_transaction *prev_trans = NULL; 873 struct btrfs_transaction *prev_trans = NULL;
877 struct extent_io_tree *pinned_copy;
878 DEFINE_WAIT(wait); 874 DEFINE_WAIT(wait);
879 int ret; 875 int ret;
880 int should_grow = 0; 876 int should_grow = 0;
@@ -915,13 +911,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
915 return 0; 911 return 0;
916 } 912 }
917 913
918 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
919 if (!pinned_copy)
920 return -ENOMEM;
921
922 extent_io_tree_init(pinned_copy,
923 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
924
925 trans->transaction->in_commit = 1; 914 trans->transaction->in_commit = 1;
926 trans->transaction->blocked = 1; 915 trans->transaction->blocked = 1;
927 if (cur_trans->list.prev != &root->fs_info->trans_list) { 916 if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -1019,6 +1008,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1019 ret = commit_cowonly_roots(trans, root); 1008 ret = commit_cowonly_roots(trans, root);
1020 BUG_ON(ret); 1009 BUG_ON(ret);
1021 1010
1011 btrfs_prepare_extent_commit(trans, root);
1012
1022 cur_trans = root->fs_info->running_transaction; 1013 cur_trans = root->fs_info->running_transaction;
1023 spin_lock(&root->fs_info->new_trans_lock); 1014 spin_lock(&root->fs_info->new_trans_lock);
1024 root->fs_info->running_transaction = NULL; 1015 root->fs_info->running_transaction = NULL;
@@ -1042,8 +1033,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1042 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1033 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1043 sizeof(root->fs_info->super_copy)); 1034 sizeof(root->fs_info->super_copy));
1044 1035
1045 btrfs_copy_pinned(root, pinned_copy);
1046
1047 trans->transaction->blocked = 0; 1036 trans->transaction->blocked = 0;
1048 1037
1049 wake_up(&root->fs_info->transaction_wait); 1038 wake_up(&root->fs_info->transaction_wait);
@@ -1059,8 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1059 */ 1048 */
1060 mutex_unlock(&root->fs_info->tree_log_mutex); 1049 mutex_unlock(&root->fs_info->tree_log_mutex);
1061 1050
1062 btrfs_finish_extent_commit(trans, root, pinned_copy); 1051 btrfs_finish_extent_commit(trans, root);
1063 kfree(pinned_copy);
1064 1052
1065 /* do the directory inserts of any pending snapshot creations */ 1053 /* do the directory inserts of any pending snapshot creations */
1066 finish_pending_snapshots(trans, root->fs_info); 1054 finish_pending_snapshots(trans, root->fs_info);
@@ -1078,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1078 1066
1079 mutex_unlock(&root->fs_info->trans_mutex); 1067 mutex_unlock(&root->fs_info->trans_mutex);
1080 1068
1069 if (current->journal_info == trans)
1070 current->journal_info = NULL;
1071
1081 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1072 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1082 return ret; 1073 return ret;
1083} 1074}
@@ -1096,8 +1087,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1096 1087
1097 while (!list_empty(&list)) { 1088 while (!list_empty(&list)) {
1098 root = list_entry(list.next, struct btrfs_root, root_list); 1089 root = list_entry(list.next, struct btrfs_root, root_list);
1099 list_del_init(&root->root_list); 1090 list_del(&root->root_list);
1100 btrfs_drop_snapshot(root, 0); 1091
1092 if (btrfs_header_backref_rev(root->node) <
1093 BTRFS_MIXED_BACKREF_REV)
1094 btrfs_drop_snapshot(root, 0);
1095 else
1096 btrfs_drop_snapshot(root, 1);
1101 } 1097 }
1102 return 0; 1098 return 0;
1103} 1099}
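
The btrfs_clean_old_snapshots() change above derives btrfs_drop_snapshot()'s second argument (whether to update shared backrefs while dropping) from the root's backref revision: roots written before the mixed-backref format must be dropped with it clear. The branch is equivalent to this one-liner, shown only to make the condition explicit:

    btrfs_drop_snapshot(root, btrfs_header_backref_rev(root->node) >=
                        BTRFS_MIXED_BACKREF_REV);
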
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d91b0de7c502..7827841b55cb 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -263,8 +263,8 @@ static int process_one_buffer(struct btrfs_root *log,
263 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
264{ 264{
265 if (wc->pin) 265 if (wc->pin)
266 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_pin_extent(log->fs_info->extent_root,
267 eb->start, eb->len, 1); 267 eb->start, eb->len, 0);
268 268
269 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
270 if (wc->write) 270 if (wc->write)
@@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
534 saved_nbytes = inode_get_bytes(inode); 534 saved_nbytes = inode_get_bytes(inode);
535 /* drop any overlapping extents */ 535 /* drop any overlapping extents */
536 ret = btrfs_drop_extents(trans, root, inode, 536 ret = btrfs_drop_extents(trans, root, inode,
537 start, extent_end, extent_end, start, &alloc_hint); 537 start, extent_end, extent_end, start, &alloc_hint, 1);
538 BUG_ON(ret); 538 BUG_ON(ret);
539 539
540 if (found_type == BTRFS_FILE_EXTENT_REG || 540 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -2605,7 +2605,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2605 extent); 2605 extent);
2606 cs = btrfs_file_extent_offset(src, extent); 2606 cs = btrfs_file_extent_offset(src, extent);
2607 cl = btrfs_file_extent_num_bytes(src, 2607 cl = btrfs_file_extent_num_bytes(src,
2608 extent);; 2608 extent);
2609 if (btrfs_file_extent_compression(src, 2609 if (btrfs_file_extent_compression(src,
2610 extent)) { 2610 extent)) {
2611 cs = 0; 2611 cs = 0;
@@ -2841,7 +2841,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2841 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2841 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2842 break; 2842 break;
2843 2843
2844 if (parent == sb->s_root) 2844 if (IS_ROOT(parent))
2845 break; 2845 break;
2846 2846
2847 parent = parent->d_parent; 2847 parent = parent->d_parent;
@@ -2880,6 +2880,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2880 goto end_no_trans; 2880 goto end_no_trans;
2881 } 2881 }
2882 2882
2883 if (root != BTRFS_I(inode)->root ||
2884 btrfs_root_refs(&root->root_item) == 0) {
2885 ret = 1;
2886 goto end_no_trans;
2887 }
2888
2883 ret = check_parent_dirs_for_sync(trans, inode, parent, 2889 ret = check_parent_dirs_for_sync(trans, inode, parent,
2884 sb, last_committed); 2890 sb, last_committed);
2885 if (ret) 2891 if (ret)
@@ -2907,12 +2913,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2907 break; 2913 break;
2908 2914
2909 inode = parent->d_inode; 2915 inode = parent->d_inode;
2916 if (root != BTRFS_I(inode)->root)
2917 break;
2918
2910 if (BTRFS_I(inode)->generation > 2919 if (BTRFS_I(inode)->generation >
2911 root->fs_info->last_trans_committed) { 2920 root->fs_info->last_trans_committed) {
2912 ret = btrfs_log_inode(trans, root, inode, inode_only); 2921 ret = btrfs_log_inode(trans, root, inode, inode_only);
2913 BUG_ON(ret); 2922 BUG_ON(ret);
2914 } 2923 }
2915 if (parent == sb->s_root) 2924 if (IS_ROOT(parent))
2916 break; 2925 break;
2917 2926
2918 parent = parent->d_parent; 2927 parent = parent->d_parent;
@@ -2951,7 +2960,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2951 struct btrfs_key tmp_key; 2960 struct btrfs_key tmp_key;
2952 struct btrfs_root *log; 2961 struct btrfs_root *log;
2953 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 2962 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2954 u64 highest_inode;
2955 struct walk_control wc = { 2963 struct walk_control wc = {
2956 .process_func = process_one_buffer, 2964 .process_func = process_one_buffer,
2957 .stage = 0, 2965 .stage = 0,
@@ -3010,11 +3018,6 @@ again:
3010 path); 3018 path);
3011 BUG_ON(ret); 3019 BUG_ON(ret);
3012 } 3020 }
3013 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
3014 if (ret == 0) {
3015 wc.replay_dest->highest_inode = highest_inode;
3016 wc.replay_dest->last_inode_alloc = highest_inode;
3017 }
3018 3021
3019 key.offset = found_key.offset - 1; 3022 key.offset = found_key.offset - 1;
3020 wc.replay_dest->log_root = NULL; 3023 wc.replay_dest->log_root = NULL;
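
On the IS_ROOT() conversions in tree-log.c above: the loops that walk up d_parent used to stop when they reached sb->s_root, but with subvolumes a dentry chain can be rooted at a dentry other than the superblock's root, so the structural test is the reliable termination condition. IS_ROOT is defined in include/linux/dcache.h as simply:

    #define IS_ROOT(x) ((x) == (x)->d_parent)
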
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
260 num_run++; 260 num_run++;
261 batch_run++; 261 batch_run++;
262 262
263 if (bio_sync(cur)) 263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 264 num_sync_run++;
265 265
266 if (need_resched()) { 266 if (need_resched()) {
@@ -276,7 +276,7 @@ loop_lock:
276 * is now congested. Back off and let other work structs 276 * is now congested. Back off and let other work structs
277 * run instead 277 * run instead
278 */ 278 */
279 if (pending && bdi_write_congested(bdi) && batch_run > 32 && 279 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
280 fs_info->fs_devices->open_devices > 1) { 280 fs_info->fs_devices->open_devices > 1) {
281 struct io_context *ioc; 281 struct io_context *ioc;
282 282
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
446 goto error; 446 goto error;
447 447
448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 448 device->name = kstrdup(orig_dev->name, GFP_NOFS);
449 if (!device->name) 449 if (!device->name) {
450 kfree(device);
450 goto error; 451 goto error;
452 }
451 453
452 device->devid = orig_dev->devid; 454 device->devid = orig_dev->devid;
453 device->work.func = pending_bios_fn; 455 device->work.func = pending_bios_fn;
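
The clone_fs_devices() fix above is the standard unwind-on-error shape: when a later allocation in the loop body fails, whatever the iteration already allocated has to be freed before jumping to the common error label, or it leaks. Minimal sketch of the fixed sequence:

    device = kzalloc(sizeof(*device), GFP_NOFS);
    if (!device)
            goto error;

    device->name = kstrdup(orig_dev->name, GFP_NOFS);
    if (!device->name) {
            kfree(device);  /* undo the kzalloc; nothing else owns it yet */
            goto error;
    }
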
@@ -719,10 +721,9 @@ error:
719 * called very infrequently and that a given device has a small number 721 * called very infrequently and that a given device has a small number
720 * of extents 722 * of extents
721 */ 723 */
722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 724int find_free_dev_extent(struct btrfs_trans_handle *trans,
723 struct btrfs_device *device, 725 struct btrfs_device *device, u64 num_bytes,
724 u64 num_bytes, u64 *start, 726 u64 *start, u64 *max_avail)
725 u64 *max_avail)
726{ 727{
727 struct btrfs_key key; 728 struct btrfs_key key;
728 struct btrfs_root *root = device->dev_root; 729 struct btrfs_root *root = device->dev_root;
@@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1736 extent_root = root->fs_info->extent_root; 1737 extent_root = root->fs_info->extent_root;
1737 em_tree = &root->fs_info->mapping_tree.map_tree; 1738 em_tree = &root->fs_info->mapping_tree.map_tree;
1738 1739
1740 ret = btrfs_can_relocate(extent_root, chunk_offset);
1741 if (ret)
1742 return -ENOSPC;
1743
1739 /* step one, relocate all the extents inside this chunk */ 1744 /* step one, relocate all the extents inside this chunk */
1740 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1745 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1741 BUG_ON(ret); 1746 BUG_ON(ret);
@@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1749 * step two, delete the device extents and the 1754 * step two, delete the device extents and the
1750 * chunk tree entries 1755 * chunk tree entries
1751 */ 1756 */
1752 spin_lock(&em_tree->lock); 1757 read_lock(&em_tree->lock);
1753 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1758 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1754 spin_unlock(&em_tree->lock); 1759 read_unlock(&em_tree->lock);
1755 1760
1756 BUG_ON(em->start > chunk_offset || 1761 BUG_ON(em->start > chunk_offset ||
1757 em->start + em->len < chunk_offset); 1762 em->start + em->len < chunk_offset);
@@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1780 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1785 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1781 BUG_ON(ret); 1786 BUG_ON(ret);
1782 1787
1783 spin_lock(&em_tree->lock); 1788 write_lock(&em_tree->lock);
1784 remove_extent_mapping(em_tree, em); 1789 remove_extent_mapping(em_tree, em);
1785 spin_unlock(&em_tree->lock); 1790 write_unlock(&em_tree->lock);
1786 1791
1787 kfree(map); 1792 kfree(map);
1788 em->bdev = NULL; 1793 em->bdev = NULL;
@@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1807 struct btrfs_key found_key; 1812 struct btrfs_key found_key;
1808 u64 chunk_tree = chunk_root->root_key.objectid; 1813 u64 chunk_tree = chunk_root->root_key.objectid;
1809 u64 chunk_type; 1814 u64 chunk_type;
1815 bool retried = false;
1816 int failed = 0;
1810 int ret; 1817 int ret;
1811 1818
1812 path = btrfs_alloc_path(); 1819 path = btrfs_alloc_path();
1813 if (!path) 1820 if (!path)
1814 return -ENOMEM; 1821 return -ENOMEM;
1815 1822
1823again:
1816 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1824 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1817 key.offset = (u64)-1; 1825 key.offset = (u64)-1;
1818 key.type = BTRFS_CHUNK_ITEM_KEY; 1826 key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1842 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1850 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1843 found_key.objectid, 1851 found_key.objectid,
1844 found_key.offset); 1852 found_key.offset);
1845 BUG_ON(ret); 1853 if (ret == -ENOSPC)
1854 failed++;
1855 else if (ret)
1856 BUG();
1846 } 1857 }
1847 1858
1848 if (found_key.offset == 0) 1859 if (found_key.offset == 0)
@@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1850 key.offset = found_key.offset - 1; 1861 key.offset = found_key.offset - 1;
1851 } 1862 }
1852 ret = 0; 1863 ret = 0;
1864 if (failed && !retried) {
1865 failed = 0;
1866 retried = true;
1867 goto again;
1868 } else if (failed && retried) {
1869 WARN_ON(1);
1870 ret = -ENOSPC;
1871 }
1853error: 1872error:
1854 btrfs_free_path(path); 1873 btrfs_free_path(path);
1855 return ret; 1874 return ret;
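
btrfs_relocate_sys_chunks() above (and btrfs_shrink_device() further down) now share a two-pass ENOSPC strategy: a chunk that cannot be relocated on the first sweep is only counted, since relocating its neighbours may free the space it needs; one full second sweep is attempted, and only failures that survive the retry are reported. Skeleton of the pattern:

    bool retried = false;
    int failed = 0;
    int ret;

again:
    /* ... for each candidate chunk ... */
    ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
                               found_key.objectid, found_key.offset);
    if (ret == -ENOSPC)
            failed++;       /* remember it, keep sweeping */
    else if (ret)
            BUG();          /* anything else is unexpected here */
    /* ... end of sweep ... */

    ret = 0;
    if (failed && !retried) {
            failed = 0;
            retried = true;
            goto again;     /* one full second pass */
    } else if (failed && retried) {
            ret = -ENOSPC;  /* still stuck: report it */
    }
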
@@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
1894 continue; 1913 continue;
1895 1914
1896 ret = btrfs_shrink_device(device, old_size - size_to_free); 1915 ret = btrfs_shrink_device(device, old_size - size_to_free);
1916 if (ret == -ENOSPC)
1917 break;
1897 BUG_ON(ret); 1918 BUG_ON(ret);
1898 1919
1899 trans = btrfs_start_transaction(dev_root, 1); 1920 trans = btrfs_start_transaction(dev_root, 1);
@@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
1938 chunk = btrfs_item_ptr(path->nodes[0], 1959 chunk = btrfs_item_ptr(path->nodes[0],
1939 path->slots[0], 1960 path->slots[0],
1940 struct btrfs_chunk); 1961 struct btrfs_chunk);
1941 key.offset = found_key.offset;
1942 /* chunk zero is special */ 1962 /* chunk zero is special */
1943 if (key.offset == 0) 1963 if (found_key.offset == 0)
1944 break; 1964 break;
1945 1965
1946 btrfs_release_path(chunk_root, path); 1966 btrfs_release_path(chunk_root, path);
@@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
1948 chunk_root->root_key.objectid, 1968 chunk_root->root_key.objectid,
1949 found_key.objectid, 1969 found_key.objectid,
1950 found_key.offset); 1970 found_key.offset);
1951 BUG_ON(ret); 1971 BUG_ON(ret && ret != -ENOSPC);
1972 key.offset = found_key.offset - 1;
1952 } 1973 }
1953 ret = 0; 1974 ret = 0;
1954error: 1975error:
@@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1974 u64 chunk_offset; 1995 u64 chunk_offset;
1975 int ret; 1996 int ret;
1976 int slot; 1997 int slot;
1998 int failed = 0;
1999 bool retried = false;
1977 struct extent_buffer *l; 2000 struct extent_buffer *l;
1978 struct btrfs_key key; 2001 struct btrfs_key key;
1979 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2002 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1980 u64 old_total = btrfs_super_total_bytes(super_copy); 2003 u64 old_total = btrfs_super_total_bytes(super_copy);
2004 u64 old_size = device->total_bytes;
1981 u64 diff = device->total_bytes - new_size; 2005 u64 diff = device->total_bytes - new_size;
1982 2006
1983 if (new_size >= device->total_bytes) 2007 if (new_size >= device->total_bytes)
@@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1987 if (!path) 2011 if (!path)
1988 return -ENOMEM; 2012 return -ENOMEM;
1989 2013
1990 trans = btrfs_start_transaction(root, 1);
1991 if (!trans) {
1992 ret = -ENOMEM;
1993 goto done;
1994 }
1995
1996 path->reada = 2; 2014 path->reada = 2;
1997 2015
1998 lock_chunks(root); 2016 lock_chunks(root);
@@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2001 if (device->writeable) 2019 if (device->writeable)
2002 device->fs_devices->total_rw_bytes -= diff; 2020 device->fs_devices->total_rw_bytes -= diff;
2003 unlock_chunks(root); 2021 unlock_chunks(root);
2004 btrfs_end_transaction(trans, root);
2005 2022
2023again:
2006 key.objectid = device->devid; 2024 key.objectid = device->devid;
2007 key.offset = (u64)-1; 2025 key.offset = (u64)-1;
2008 key.type = BTRFS_DEV_EXTENT_KEY; 2026 key.type = BTRFS_DEV_EXTENT_KEY;
@@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2017 goto done; 2035 goto done;
2018 if (ret) { 2036 if (ret) {
2019 ret = 0; 2037 ret = 0;
2038 btrfs_release_path(root, path);
2020 break; 2039 break;
2021 } 2040 }
2022 2041
@@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2024 slot = path->slots[0]; 2043 slot = path->slots[0];
2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2044 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2026 2045
2027 if (key.objectid != device->devid) 2046 if (key.objectid != device->devid) {
2047 btrfs_release_path(root, path);
2028 break; 2048 break;
2049 }
2029 2050
2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2051 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2031 length = btrfs_dev_extent_length(l, dev_extent); 2052 length = btrfs_dev_extent_length(l, dev_extent);
2032 2053
2033 if (key.offset + length <= new_size) 2054 if (key.offset + length <= new_size) {
2055 btrfs_release_path(root, path);
2034 break; 2056 break;
2057 }
2035 2058
2036 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2059 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2037 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2060 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2040 2063
2041 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2064 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2042 chunk_offset); 2065 chunk_offset);
2043 if (ret) 2066 if (ret && ret != -ENOSPC)
2044 goto done; 2067 goto done;
2068 if (ret == -ENOSPC)
2069 failed++;
2070 key.offset -= 1;
2071 }
2072
2073 if (failed && !retried) {
2074 failed = 0;
2075 retried = true;
2076 goto again;
2077 } else if (failed && retried) {
2078 ret = -ENOSPC;
2079 lock_chunks(root);
2080
2081 device->total_bytes = old_size;
2082 if (device->writeable)
2083 device->fs_devices->total_rw_bytes += diff;
2084 unlock_chunks(root);
2085 goto done;
2045 } 2086 }
2046 2087
2047 /* Shrinking succeeded, else we would be at "done". */ 2088 /* Shrinking succeeded, else we would be at "done". */
@@ -2294,9 +2335,9 @@ again:
2294 em->block_len = em->len; 2335 em->block_len = em->len;
2295 2336
2296 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2337 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2297 spin_lock(&em_tree->lock); 2338 write_lock(&em_tree->lock);
2298 ret = add_extent_mapping(em_tree, em); 2339 ret = add_extent_mapping(em_tree, em);
2299 spin_unlock(&em_tree->lock); 2340 write_unlock(&em_tree->lock);
2300 BUG_ON(ret); 2341 BUG_ON(ret);
2301 free_extent_map(em); 2342 free_extent_map(em);
2302 2343
@@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2491 int readonly = 0; 2532 int readonly = 0;
2492 int i; 2533 int i;
2493 2534
2494 spin_lock(&map_tree->map_tree.lock); 2535 read_lock(&map_tree->map_tree.lock);
2495 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2536 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2496 spin_unlock(&map_tree->map_tree.lock); 2537 read_unlock(&map_tree->map_tree.lock);
2497 if (!em) 2538 if (!em)
2498 return 1; 2539 return 1;
2499 2540
@@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2518 struct extent_map *em; 2559 struct extent_map *em;
2519 2560
2520 while (1) { 2561 while (1) {
2521 spin_lock(&tree->map_tree.lock); 2562 write_lock(&tree->map_tree.lock);
2522 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2563 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2523 if (em) 2564 if (em)
2524 remove_extent_mapping(&tree->map_tree, em); 2565 remove_extent_mapping(&tree->map_tree, em);
2525 spin_unlock(&tree->map_tree.lock); 2566 write_unlock(&tree->map_tree.lock);
2526 if (!em) 2567 if (!em)
2527 break; 2568 break;
2528 kfree(em->bdev); 2569 kfree(em->bdev);
@@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2540 struct extent_map_tree *em_tree = &map_tree->map_tree; 2581 struct extent_map_tree *em_tree = &map_tree->map_tree;
2541 int ret; 2582 int ret;
2542 2583
2543 spin_lock(&em_tree->lock); 2584 read_lock(&em_tree->lock);
2544 em = lookup_extent_mapping(em_tree, logical, len); 2585 em = lookup_extent_mapping(em_tree, logical, len);
2545 spin_unlock(&em_tree->lock); 2586 read_unlock(&em_tree->lock);
2546 BUG_ON(!em); 2587 BUG_ON(!em);
2547 2588
2548 BUG_ON(em->start > logical || em->start + em->len < logical); 2589 BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2604,9 +2645,9 @@ again:
2604 atomic_set(&multi->error, 0); 2645 atomic_set(&multi->error, 0);
2605 } 2646 }
2606 2647
2607 spin_lock(&em_tree->lock); 2648 read_lock(&em_tree->lock);
2608 em = lookup_extent_mapping(em_tree, logical, *length); 2649 em = lookup_extent_mapping(em_tree, logical, *length);
2609 spin_unlock(&em_tree->lock); 2650 read_unlock(&em_tree->lock);
2610 2651
2611 if (!em && unplug_page) 2652 if (!em && unplug_page)
2612 return 0; 2653 return 0;
@@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2763 u64 stripe_nr; 2804 u64 stripe_nr;
2764 int i, j, nr = 0; 2805 int i, j, nr = 0;
2765 2806
2766 spin_lock(&em_tree->lock); 2807 read_lock(&em_tree->lock);
2767 em = lookup_extent_mapping(em_tree, chunk_start, 1); 2808 em = lookup_extent_mapping(em_tree, chunk_start, 1);
2768 spin_unlock(&em_tree->lock); 2809 read_unlock(&em_tree->lock);
2769 2810
2770 BUG_ON(!em || em->start != chunk_start); 2811 BUG_ON(!em || em->start != chunk_start);
2771 map = (struct map_lookup *)em->bdev; 2812 map = (struct map_lookup *)em->bdev;
@@ -2903,7 +2944,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2903 bio->bi_rw |= rw; 2944 bio->bi_rw |= rw;
2904 2945
2905 spin_lock(&device->io_lock); 2946 spin_lock(&device->io_lock);
2906 if (bio_sync(bio)) 2947 if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
2907 pending_bios = &device->pending_sync_bios; 2948 pending_bios = &device->pending_sync_bios;
2908 else 2949 else
2909 pending_bios = &device->pending_bios; 2950 pending_bios = &device->pending_bios;
@@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3053 logical = key->offset; 3094 logical = key->offset;
3054 length = btrfs_chunk_length(leaf, chunk); 3095 length = btrfs_chunk_length(leaf, chunk);
3055 3096
3056 spin_lock(&map_tree->map_tree.lock); 3097 read_lock(&map_tree->map_tree.lock);
3057 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3098 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
3058 spin_unlock(&map_tree->map_tree.lock); 3099 read_unlock(&map_tree->map_tree.lock);
3059 3100
3060 /* already mapped? */ 3101 /* already mapped? */
3061 if (em && em->start <= logical && em->start + em->len > logical) { 3102 if (em && em->start <= logical && em->start + em->len > logical) {
@@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3114 map->stripes[i].dev->in_fs_metadata = 1; 3155 map->stripes[i].dev->in_fs_metadata = 1;
3115 } 3156 }
3116 3157
3117 spin_lock(&map_tree->map_tree.lock); 3158 write_lock(&map_tree->map_tree.lock);
3118 ret = add_extent_mapping(&map_tree->map_tree, em); 3159 ret = add_extent_mapping(&map_tree->map_tree, em);
3119 spin_unlock(&map_tree->map_tree.lock); 3160 write_unlock(&map_tree->map_tree.lock);
3120 BUG_ON(ret); 3161 BUG_ON(ret);
3121 free_extent_map(em); 3162 free_extent_map(em);
3122 3163
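
A recurring change throughout volumes.c above: the extent map tree's lock goes from a spinlock to an rwlock, so pure lookups take it shared while only insertions and removals take it exclusive, letting concurrent readers resolve logical-to-physical mappings in parallel. The two access shapes, assuming extent_map_tree.lock becomes an rwlock_t via the matching extent_map change elsewhere in this merge:

    /* reader side: lookup only */
    read_lock(&em_tree->lock);
    em = lookup_extent_mapping(em_tree, logical, len);
    read_unlock(&em_tree->lock);

    /* writer side: mutation */
    write_lock(&em_tree->lock);
    ret = add_extent_mapping(em_tree, em);
    write_unlock(&em_tree->lock);
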
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5139a833f721..31b0fabdd2ea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
181void btrfs_unlock_volumes(void); 181void btrfs_unlock_volumes(void);
182void btrfs_lock_volumes(void); 182void btrfs_lock_volumes(void);
183int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 183int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
184int find_free_dev_extent(struct btrfs_trans_handle *trans,
185 struct btrfs_device *device, u64 num_bytes,
186 u64 *start, u64 *max_avail);
184#endif 187#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b0fc93f95fd0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
260 * attributes are handled directly. 260 * attributes are handled directly.
261 */ 261 */
262struct xattr_handler *btrfs_xattr_handlers[] = { 262struct xattr_handler *btrfs_xattr_handlers[] = {
263#ifdef CONFIG_FS_POSIX_ACL 263#ifdef CONFIG_BTRFS_POSIX_ACL
264 &btrfs_xattr_acl_access_handler, 264 &btrfs_xattr_acl_access_handler,
265 &btrfs_xattr_acl_default_handler, 265 &btrfs_xattr_acl_default_handler,
266#endif 266#endif
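
This xattr.c hunk and the earlier super.c hunk are two halves of one change: btrfs ACL support is now gated on its own Kconfig symbol rather than the generic CONFIG_FS_POSIX_ACL, so btrfs can be built without ACLs on a kernel that enables them elsewhere. The combined effect, sketched:

    #ifdef CONFIG_BTRFS_POSIX_ACL
            sb->s_flags |= MS_POSIXACL;  /* VFS applies POSIX ACL semantics */
    #endif
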
diff --git a/fs/buffer.c b/fs/buffer.c
index 28f320fac4d4..6fa530256bfd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,6 +52,7 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
52 bh->b_end_io = handler; 52 bh->b_end_io = handler;
53 bh->b_private = private; 53 bh->b_private = private;
54} 54}
55EXPORT_SYMBOL(init_buffer);
55 56
56static int sync_buffer(void *word) 57static int sync_buffer(void *word)
57{ 58{
@@ -80,6 +81,7 @@ void unlock_buffer(struct buffer_head *bh)
80 smp_mb__after_clear_bit(); 81 smp_mb__after_clear_bit();
81 wake_up_bit(&bh->b_state, BH_Lock); 82 wake_up_bit(&bh->b_state, BH_Lock);
82} 83}
84EXPORT_SYMBOL(unlock_buffer);
83 85
84/* 86/*
85 * Block until a buffer comes unlocked. This doesn't stop it 87 * Block until a buffer comes unlocked. This doesn't stop it
@@ -90,6 +92,7 @@ void __wait_on_buffer(struct buffer_head * bh)
90{ 92{
91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
92} 94}
95EXPORT_SYMBOL(__wait_on_buffer);
93 96
94static void 97static void
95__clear_page_buffers(struct page *page) 98__clear_page_buffers(struct page *page)
@@ -144,6 +147,7 @@ void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
144 __end_buffer_read_notouch(bh, uptodate); 147 __end_buffer_read_notouch(bh, uptodate);
145 put_bh(bh); 148 put_bh(bh);
146} 149}
150EXPORT_SYMBOL(end_buffer_read_sync);
147 151
148void end_buffer_write_sync(struct buffer_head *bh, int uptodate) 152void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
149{ 153{
@@ -164,6 +168,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
164 unlock_buffer(bh); 168 unlock_buffer(bh);
165 put_bh(bh); 169 put_bh(bh);
166} 170}
171EXPORT_SYMBOL(end_buffer_write_sync);
167 172
168/* 173/*
169 * Various filesystems appear to want __find_get_block to be non-blocking. 174 * Various filesystems appear to want __find_get_block to be non-blocking.
@@ -272,16 +277,17 @@ void invalidate_bdev(struct block_device *bdev)
272 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
273 invalidate_mapping_pages(mapping, 0, -1); 278 invalidate_mapping_pages(mapping, 0, -1);
274} 279}
280EXPORT_SYMBOL(invalidate_bdev);
275 281
276/* 282/*
277 * Kick pdflush then try to free up some ZONE_NORMAL memory. 283 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
278 */ 284 */
279static void free_more_memory(void) 285static void free_more_memory(void)
280{ 286{
281 struct zone *zone; 287 struct zone *zone;
282 int nid; 288 int nid;
283 289
284 wakeup_pdflush(1024); 290 wakeup_flusher_threads(1024);
285 yield(); 291 yield();
286 292
287 for_each_online_node(nid) { 293 for_each_online_node(nid) {
@@ -410,6 +416,7 @@ still_busy:
410 local_irq_restore(flags); 416 local_irq_restore(flags);
411 return; 417 return;
412} 418}
419EXPORT_SYMBOL(end_buffer_async_write);
413 420
414/* 421/*
415 * If a page's buffers are under async readin (end_buffer_async_read 422 * If a page's buffers are under async readin (end_buffer_async_read
@@ -438,8 +445,8 @@ static void mark_buffer_async_read(struct buffer_head *bh)
438 set_buffer_async_read(bh); 445 set_buffer_async_read(bh);
439} 446}
440 447
441void mark_buffer_async_write_endio(struct buffer_head *bh, 448static void mark_buffer_async_write_endio(struct buffer_head *bh,
442 bh_end_io_t *handler) 449 bh_end_io_t *handler)
443{ 450{
444 bh->b_end_io = handler; 451 bh->b_end_io = handler;
445 set_buffer_async_write(bh); 452 set_buffer_async_write(bh);
@@ -553,7 +560,7 @@ repeat:
553 return err; 560 return err;
554} 561}
555 562
556void do_thaw_all(struct work_struct *work) 563static void do_thaw_all(struct work_struct *work)
557{ 564{
558 struct super_block *sb; 565 struct super_block *sb;
559 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
@@ -1172,6 +1179,7 @@ void mark_buffer_dirty(struct buffer_head *bh)
1172 } 1179 }
1173 } 1180 }
1174} 1181}
1182EXPORT_SYMBOL(mark_buffer_dirty);
1175 1183
1176/* 1184/*
1177 * Decrement a buffer_head's reference count. If all buffers against a page 1185 * Decrement a buffer_head's reference count. If all buffers against a page
@@ -1188,6 +1196,7 @@ void __brelse(struct buffer_head * buf)
1188 } 1196 }
1189 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1197 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1190} 1198}
1199EXPORT_SYMBOL(__brelse);
1191 1200
1192/* 1201/*
1193 * bforget() is like brelse(), except it discards any 1202 * bforget() is like brelse(), except it discards any
@@ -1206,6 +1215,7 @@ void __bforget(struct buffer_head *bh)
1206 } 1215 }
1207 __brelse(bh); 1216 __brelse(bh);
1208} 1217}
1218EXPORT_SYMBOL(__bforget);
1209 1219
1210static struct buffer_head *__bread_slow(struct buffer_head *bh) 1220static struct buffer_head *__bread_slow(struct buffer_head *bh)
1211{ 1221{
@@ -1699,9 +1709,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1699 /* 1709 /*
1700 * If it's a fully non-blocking write attempt and we cannot 1710 * If it's a fully non-blocking write attempt and we cannot
1701 * lock the buffer then redirty the page. Note that this can 1711 * lock the buffer then redirty the page. Note that this can
1702 * potentially cause a busy-wait loop from pdflush and kswapd 1712 * potentially cause a busy-wait loop from writeback threads
1703 * activity, but those code paths have their own higher-level 1713 * and kswapd activity, but those code paths have their own
1704 * throttling. 1714 * higher-level throttling.
1705 */ 1715 */
1706 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1716 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1707 lock_buffer(bh); 1717 lock_buffer(bh);
@@ -2218,6 +2228,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
2218 } 2228 }
2219 return 0; 2229 return 0;
2220} 2230}
2231EXPORT_SYMBOL(block_read_full_page);
2221 2232
2222/* utility function for filesystems that need to do work on expanding 2233/* utility function for filesystems that need to do work on expanding
2223 * truncates. Uses filesystem pagecache writes to allow the filesystem to 2234 * truncates. Uses filesystem pagecache writes to allow the filesystem to
@@ -2228,16 +2239,10 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
2228 struct address_space *mapping = inode->i_mapping; 2239 struct address_space *mapping = inode->i_mapping;
2229 struct page *page; 2240 struct page *page;
2230 void *fsdata; 2241 void *fsdata;
2231 unsigned long limit;
2232 int err; 2242 int err;
2233 2243
2234 err = -EFBIG; 2244 err = inode_newsize_ok(inode, size);
2235 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 2245 if (err)
2236 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2237 send_sig(SIGXFSZ, current, 0);
2238 goto out;
2239 }
2240 if (size > inode->i_sb->s_maxbytes)
2241 goto out; 2246 goto out;
2242 2247
2243 err = pagecache_write_begin(NULL, mapping, size, 0, 2248 err = pagecache_write_begin(NULL, mapping, size, 0,
@@ -2252,6 +2257,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
2252out: 2257out:
2253 return err; 2258 return err;
2254} 2259}
2260EXPORT_SYMBOL(generic_cont_expand_simple);
2255 2261
2256static int cont_expand_zero(struct file *file, struct address_space *mapping, 2262static int cont_expand_zero(struct file *file, struct address_space *mapping,
2257 loff_t pos, loff_t *bytes) 2263 loff_t pos, loff_t *bytes)
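
The generic_cont_expand_simple() hunk above replaces an open-coded pair of checks (RLIMIT_FSIZE with a SIGXFSZ signal, then sb->s_maxbytes) with the then-new inode_newsize_ok() helper, which performs the same validation in one place for every caller. The call site reduces to:

    err = inode_newsize_ok(inode, size);
    if (err)
            goto out;
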
@@ -2352,6 +2358,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2352out: 2358out:
2353 return err; 2359 return err;
2354} 2360}
2361EXPORT_SYMBOL(cont_write_begin);
2355 2362
2356int block_prepare_write(struct page *page, unsigned from, unsigned to, 2363int block_prepare_write(struct page *page, unsigned from, unsigned to,
2357 get_block_t *get_block) 2364 get_block_t *get_block)
@@ -2362,6 +2369,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
2362 ClearPageUptodate(page); 2369 ClearPageUptodate(page);
2363 return err; 2370 return err;
2364} 2371}
2372EXPORT_SYMBOL(block_prepare_write);
2365 2373
2366int block_commit_write(struct page *page, unsigned from, unsigned to) 2374int block_commit_write(struct page *page, unsigned from, unsigned to)
2367{ 2375{
@@ -2369,6 +2377,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
2369 __block_commit_write(inode,page,from,to); 2377 __block_commit_write(inode,page,from,to);
2370 return 0; 2378 return 0;
2371} 2379}
2380EXPORT_SYMBOL(block_commit_write);
2372 2381
2373/* 2382/*
2374 * block_page_mkwrite() is not allowed to change the file size as it gets 2383 * block_page_mkwrite() is not allowed to change the file size as it gets
@@ -2426,6 +2435,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2426out: 2435out:
2427 return ret; 2436 return ret;
2428} 2437}
2438EXPORT_SYMBOL(block_page_mkwrite);
2429 2439
2430/* 2440/*
2431 * nobh_write_begin()'s prereads are special: the buffer_heads are freed 2441 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
@@ -2849,6 +2859,7 @@ unlock:
2849out: 2859out:
2850 return err; 2860 return err;
2851} 2861}
2862EXPORT_SYMBOL(block_truncate_page);
2852 2863
2853/* 2864/*
2854 * The generic ->writepage function for buffer-backed address_spaces 2865 * The generic ->writepage function for buffer-backed address_spaces
@@ -2890,6 +2901,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2890 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2901 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2891 return __block_write_full_page(inode, page, get_block, wbc, handler); 2902 return __block_write_full_page(inode, page, get_block, wbc, handler);
2892} 2903}
2904EXPORT_SYMBOL(block_write_full_page_endio);
2893 2905
2894/* 2906/*
2895 * The generic ->writepage function for buffer-backed address_spaces 2907 * The generic ->writepage function for buffer-backed address_spaces
@@ -2900,7 +2912,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2900 return block_write_full_page_endio(page, get_block, wbc, 2912 return block_write_full_page_endio(page, get_block, wbc,
2901 end_buffer_async_write); 2913 end_buffer_async_write);
2902} 2914}
2903 2915EXPORT_SYMBOL(block_write_full_page);
2904 2916
2905sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2917sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2906 get_block_t *get_block) 2918 get_block_t *get_block)
@@ -2913,6 +2925,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2913 get_block(inode, block, &tmp, 0); 2925 get_block(inode, block, &tmp, 0);
2914 return tmp.b_blocknr; 2926 return tmp.b_blocknr;
2915} 2927}
2928EXPORT_SYMBOL(generic_block_bmap);
2916 2929
2917static void end_bio_bh_io_sync(struct bio *bio, int err) 2930static void end_bio_bh_io_sync(struct bio *bio, int err)
2918{ 2931{
@@ -2982,6 +2995,7 @@ int submit_bh(int rw, struct buffer_head * bh)
2982 bio_put(bio); 2995 bio_put(bio);
2983 return ret; 2996 return ret;
2984} 2997}
2998EXPORT_SYMBOL(submit_bh);
2985 2999
2986/** 3000/**
2987 * ll_rw_block: low-level access to block devices (DEPRECATED) 3001 * ll_rw_block: low-level access to block devices (DEPRECATED)
@@ -3043,6 +3057,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3043 unlock_buffer(bh); 3057 unlock_buffer(bh);
3044 } 3058 }
3045} 3059}
3060EXPORT_SYMBOL(ll_rw_block);
3046 3061
3047/* 3062/*
3048 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3063 * For a data-integrity writeout, we need to wait upon any in-progress I/O
@@ -3071,6 +3086,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
3071 } 3086 }
3072 return ret; 3087 return ret;
3073} 3088}
3089EXPORT_SYMBOL(sync_dirty_buffer);
3074 3090
3075/* 3091/*
3076 * try_to_free_buffers() checks if all the buffers on this particular page 3092 * try_to_free_buffers() checks if all the buffers on this particular page
@@ -3185,13 +3201,14 @@ void block_sync_page(struct page *page)
3185 if (mapping) 3201 if (mapping)
3186 blk_run_backing_dev(mapping->backing_dev_info, page); 3202 blk_run_backing_dev(mapping->backing_dev_info, page);
3187} 3203}
3204EXPORT_SYMBOL(block_sync_page);
3188 3205
3189/* 3206/*
3190 * There are no bdflush tunables left. But distributions are 3207 * There are no bdflush tunables left. But distributions are
3191 * still running obsolete flush daemons, so we terminate them here. 3208 * still running obsolete flush daemons, so we terminate them here.
3192 * 3209 *
3193 * Use of bdflush() is deprecated and will be removed in a future kernel. 3210 * Use of bdflush() is deprecated and will be removed in a future kernel.
3194 * The `pdflush' kernel threads fully replace bdflush daemons and this call. 3211 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3195 */ 3212 */
3196SYSCALL_DEFINE2(bdflush, int, func, long, data) 3213SYSCALL_DEFINE2(bdflush, int, func, long, data)
3197{ 3214{
@@ -3361,29 +3378,3 @@ void __init buffer_init(void)
3361 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); 3378 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3362 hotcpu_notifier(buffer_cpu_notify, 0); 3379 hotcpu_notifier(buffer_cpu_notify, 0);
3363} 3380}
3364
3365EXPORT_SYMBOL(__bforget);
3366EXPORT_SYMBOL(__brelse);
3367EXPORT_SYMBOL(__wait_on_buffer);
3368EXPORT_SYMBOL(block_commit_write);
3369EXPORT_SYMBOL(block_prepare_write);
3370EXPORT_SYMBOL(block_page_mkwrite);
3371EXPORT_SYMBOL(block_read_full_page);
3372EXPORT_SYMBOL(block_sync_page);
3373EXPORT_SYMBOL(block_truncate_page);
3374EXPORT_SYMBOL(block_write_full_page);
3375EXPORT_SYMBOL(block_write_full_page_endio);
3376EXPORT_SYMBOL(cont_write_begin);
3377EXPORT_SYMBOL(end_buffer_read_sync);
3378EXPORT_SYMBOL(end_buffer_write_sync);
3379EXPORT_SYMBOL(end_buffer_async_write);
3380EXPORT_SYMBOL(file_fsync);
3381EXPORT_SYMBOL(generic_block_bmap);
3382EXPORT_SYMBOL(generic_cont_expand_simple);
3383EXPORT_SYMBOL(init_buffer);
3384EXPORT_SYMBOL(invalidate_bdev);
3385EXPORT_SYMBOL(ll_rw_block);
3386EXPORT_SYMBOL(mark_buffer_dirty);
3387EXPORT_SYMBOL(submit_bh);
3388EXPORT_SYMBOL(sync_dirty_buffer);
3389EXPORT_SYMBOL(unlock_buffer);
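
The many one-line buffer.c additions above are mechanical: the EXPORT_SYMBOL() block that used to sit at the bottom of the file is dissolved, and each export moves directly beneath the function it exports, the usual kernel style since it keeps the export visible next to the definition. For instance (function body as in this era's tree):

    void unlock_buffer(struct buffer_head *bh)
    {
            clear_bit_unlock(BH_Lock, &bh->b_state);
            smp_mb__after_clear_bit();
            wake_up_bit(&bh->b_state, BH_Lock);
    }
    EXPORT_SYMBOL(unlock_buffer);
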
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..d6db933df2b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
31 * - no readahead or I/O queue unplugging required 31 * - no readahead or I/O queue unplugging required
32 */ 32 */
33struct backing_dev_info directly_mappable_cdev_bdi = { 33struct backing_dev_info directly_mappable_cdev_bdi = {
34 .name = "char",
34 .capabilities = ( 35 .capabilities = (
35#ifdef CONFIG_MMU 36#ifdef CONFIG_MMU
36 /* permit private copies of the data to be taken */ 37 /* permit private copies of the data to be taken */
@@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
237} 238}
238 239
239/** 240/**
240 * register_chrdev() - Register a major number for character devices. 241 * __register_chrdev() - create and register a cdev occupying a range of minors
241 * @major: major device number or 0 for dynamic allocation 242 * @major: major device number or 0 for dynamic allocation
243 * @baseminor: first of the requested range of minor numbers
244 * @count: the number of minor numbers required
242 * @name: name of this range of devices 245 * @name: name of this range of devices
243 * @fops: file operations associated with this devices 246 * @fops: file operations associated with this devices
244 * 247 *
@@ -254,19 +257,16 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
254 * /dev. It only helps to keep track of the different owners of devices. If 257 * /dev. It only helps to keep track of the different owners of devices. If
255 * your module name has only one type of devices it's ok to use e.g. the name 258 * your module name has only one type of devices it's ok to use e.g. the name
256 * of the module here. 259 * of the module here.
257 *
258 * This function registers a range of 256 minor numbers. The first minor number
259 * is 0.
260 */ 260 */
261int register_chrdev(unsigned int major, const char *name, 261int __register_chrdev(unsigned int major, unsigned int baseminor,
262 const struct file_operations *fops) 262 unsigned int count, const char *name,
263 const struct file_operations *fops)
263{ 264{
264 struct char_device_struct *cd; 265 struct char_device_struct *cd;
265 struct cdev *cdev; 266 struct cdev *cdev;
266 char *s;
267 int err = -ENOMEM; 267 int err = -ENOMEM;
268 268
269 cd = __register_chrdev_region(major, 0, 256, name); 269 cd = __register_chrdev_region(major, baseminor, count, name);
270 if (IS_ERR(cd)) 270 if (IS_ERR(cd))
271 return PTR_ERR(cd); 271 return PTR_ERR(cd);
272 272
@@ -277,10 +277,8 @@ int register_chrdev(unsigned int major, const char *name,
277 cdev->owner = fops->owner; 277 cdev->owner = fops->owner;
278 cdev->ops = fops; 278 cdev->ops = fops;
279 kobject_set_name(&cdev->kobj, "%s", name); 279 kobject_set_name(&cdev->kobj, "%s", name);
280 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
281 *s = '!';
282 280
283 err = cdev_add(cdev, MKDEV(cd->major, 0), 256); 281 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
284 if (err) 282 if (err)
285 goto out; 283 goto out;
286 284
@@ -290,7 +288,7 @@ int register_chrdev(unsigned int major, const char *name,
290out: 288out:
291 kobject_put(&cdev->kobj); 289 kobject_put(&cdev->kobj);
292out2: 290out2:
293 kfree(__unregister_chrdev_region(cd->major, 0, 256)); 291 kfree(__unregister_chrdev_region(cd->major, baseminor, count));
294 return err; 292 return err;
295} 293}
296 294
@@ -316,10 +314,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
316 } 314 }
317} 315}
318 316
319void unregister_chrdev(unsigned int major, const char *name) 317/**
318 * __unregister_chrdev - unregister and destroy a cdev
319 * @major: major device number
320 * @baseminor: first of the range of minor numbers
321 * @count: the number of minor numbers this cdev is occupying
322 * @name: name of this range of devices
323 *
324 * Unregister and destroy the cdev occupying the region described by
325 * @major, @baseminor and @count. This function undoes what
326 * __register_chrdev() did.
327 */
328void __unregister_chrdev(unsigned int major, unsigned int baseminor,
329 unsigned int count, const char *name)
320{ 330{
321 struct char_device_struct *cd; 331 struct char_device_struct *cd;
322 cd = __unregister_chrdev_region(major, 0, 256); 332
333 cd = __unregister_chrdev_region(major, baseminor, count);
323 if (cd && cd->cdev) 334 if (cd && cd->cdev)
324 cdev_del(cd->cdev); 335 cdev_del(cd->cdev);
325 kfree(cd); 336 kfree(cd);
@@ -568,6 +579,6 @@ EXPORT_SYMBOL(cdev_alloc);
568EXPORT_SYMBOL(cdev_del); 579EXPORT_SYMBOL(cdev_del);
569EXPORT_SYMBOL(cdev_add); 580EXPORT_SYMBOL(cdev_add);
570EXPORT_SYMBOL(cdev_index); 581EXPORT_SYMBOL(cdev_index);
571EXPORT_SYMBOL(register_chrdev); 582EXPORT_SYMBOL(__register_chrdev);
572EXPORT_SYMBOL(unregister_chrdev); 583EXPORT_SYMBOL(__unregister_chrdev);
573EXPORT_SYMBOL(directly_mappable_cdev_bdi); 584EXPORT_SYMBOL(directly_mappable_cdev_bdi);
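
With __register_chrdev()/__unregister_chrdev() taking an explicit minor range, the historical register_chrdev() behaviour (all 256 minors starting at 0) is preserved as thin wrappers; in this series they become static inlines in include/linux/fs.h along these lines:

    static inline int register_chrdev(unsigned int major, const char *name,
                                      const struct file_operations *fops)
    {
            return __register_chrdev(major, 0, 256, name, fops);
    }

    static inline void unregister_chrdev(unsigned int major, const char *name)
    {
            __unregister_chrdev(major, 0, 256, name);
    }
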
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e85b1e4389e0..145540a316ab 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.60
3Fix memory leak in reconnect. Fix oops in DFS mount error path. 3Fix memory leak in reconnect. Fix oops in DFS mount error path.
4Set s_maxbytes to smaller (the max that vfs can handle) so that 4Set s_maxbytes to smaller (the max that vfs can handle) so that
5sendfile will now work over cifs mounts again. Add noforcegid 5sendfile will now work over cifs mounts again. Add noforcegid
6and noforceuid mount parameters. 6and noforceuid mount parameters. Fix small mem leak when using
7ntlmv2. Fix 2nd mount to same server but with different port to
8be allowed (rather than reusing the 1st port) - only when the
9user explicitly overrides the port on the 2nd mount.
7 10
8Version 1.59 11Version 1.59
9------------ 12------------
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 6994a0f54f02..80f352596807 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,7 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select SLOW_WORK
5 help 6 help
6 This is the client VFS module for the Common Internet File System 7 This is the client VFS module for the Common Internet File System
7 (CIFS) protocol which is the successor to the Server Message Block 8 (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606912d8f2a8..fea9e898c4ba 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -142,7 +142,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc != 0) { 143 if (rc != 0) {
144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc));; 145 __func__, *devname, rc));
146 goto compose_mount_options_err; 146 goto compose_mount_options_err;
147 } 147 }
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 148 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -385,7 +385,7 @@ out_err:
385 goto out; 385 goto out;
386} 386}
387 387
388struct inode_operations cifs_dfs_referral_inode_operations = { 388const struct inode_operations cifs_dfs_referral_inode_operations = {
389 .follow_link = cifs_dfs_follow_mountpoint, 389 .follow_link = cifs_dfs_follow_mountpoint,
390}; 390};
391 391
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d67..8ec7736ce954 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
125 if (server->addr.sockAddr.sin_family == AF_INET) 125 if (server->addr.sockAddr.sin_family == AF_INET)
126 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); 126 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
127 else if (server->addr.sockAddr.sin_family == AF_INET6) 127 else if (server->addr.sockAddr.sin_family == AF_INET6)
128 sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr); 128 sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
129 else 129 else
130 goto out; 130 goto out;
131 131
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a6..7dfe0842a6f6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
607 return get_cifs_acl_by_path(cifs_sb, path, pacllen); 607 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
608 608
609 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); 609 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
610 atomic_dec(&open_file->wrtPending); 610 cifsFileInfo_put(open_file);
611 return pntsd; 611 return pntsd;
612} 612}
613 613
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
665 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); 665 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
666 666
667 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); 667 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
668 atomic_dec(&open_file->wrtPending); 668 cifsFileInfo_put(open_file);
669 return rc; 669 return rc;
670} 670}
671 671
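
The cifsacl.c hunks above swap a bare atomic_dec() of wrtPending for cifsFileInfo_put(): a raw decrement can never close the handle or free the structure when the last reference goes away, while the helper pairs the lookup's implicit get with a real put. Rough shape of a call site (the lookup helper name is taken from the surrounding cifs code and shown here as an assumption):

    open_file = find_readable_file(CIFS_I(inode));
    if (!open_file)
            return get_cifs_acl_by_path(cifs_sb, path, pacllen);

    pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
    cifsFileInfo_put(open_file);  /* release the lookup's reference */
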
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f42..7efe1745494d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
373 compare with the NTLM example */ 373 compare with the NTLM example */
374 hmac_md5_final(ses->server->ntlmv2_hash, pctxt); 374 hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
375 375
376 kfree(pctxt);
376 return rc; 377 return rc;
377} 378}
378 379
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 84b75253b05a..9a5e4f5f3122 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -50,7 +50,7 @@
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA 52#ifdef CONFIG_CIFS_QUOTA
53static struct quotactl_ops cifs_quotactl_ops; 53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */ 54#endif /* QUOTA */
55 55
56int cifsFYI = 0; 56int cifsFYI = 0;
@@ -64,9 +64,6 @@ unsigned int multiuser_mount = 0;
64unsigned int extended_security = CIFSSEC_DEF; 64unsigned int extended_security = CIFSSEC_DEF;
65/* unsigned int ntlmv2_support = 0; */ 65/* unsigned int ntlmv2_support = 0; */
66unsigned int sign_CIFS_PDUs = 1; 66unsigned int sign_CIFS_PDUs = 1;
67extern struct task_struct *oplockThread; /* remove sparse warning */
68struct task_struct *oplockThread = NULL;
69/* extern struct task_struct * dnotifyThread; remove sparse warning */
70static const struct super_operations cifs_super_ops; 67static const struct super_operations cifs_super_ops;
71unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 68unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
72module_param(CIFSMaxBufSize, int, 0); 69module_param(CIFSMaxBufSize, int, 0);
@@ -185,8 +182,7 @@ out_mount_failed:
185 cifs_sb->mountdata = NULL; 182 cifs_sb->mountdata = NULL;
186 } 183 }
187#endif 184#endif
188 if (cifs_sb->local_nls) 185 unload_nls(cifs_sb->local_nls);
189 unload_nls(cifs_sb->local_nls);
190 kfree(cifs_sb); 186 kfree(cifs_sb);
191 } 187 }
192 return rc; 188 return rc;
@@ -361,13 +357,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
361static int 357static int
362cifs_show_options(struct seq_file *s, struct vfsmount *m) 358cifs_show_options(struct seq_file *s, struct vfsmount *m)
363{ 359{
364 struct cifs_sb_info *cifs_sb; 360 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
365 struct cifsTconInfo *tcon; 361 struct cifsTconInfo *tcon = cifs_sb->tcon;
366
367 cifs_sb = CIFS_SB(m->mnt_sb);
368 tcon = cifs_sb->tcon;
369 362
370 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); 363 seq_printf(s, ",unc=%s", tcon->treeName);
371 if (tcon->ses->userName) 364 if (tcon->ses->userName)
372 seq_printf(s, ",username=%s", tcon->ses->userName); 365 seq_printf(s, ",username=%s", tcon->ses->userName);
373 if (tcon->ses->domainName) 366 if (tcon->ses->domainName)
@@ -520,7 +513,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
520 return rc; 513 return rc;
521} 514}
522 515
523static struct quotactl_ops cifs_quotactl_ops = { 516static const struct quotactl_ops cifs_quotactl_ops = {
524 .set_xquota = cifs_xquota_set, 517 .set_xquota = cifs_xquota_set,
525 .get_xquota = cifs_xquota_get, 518 .get_xquota = cifs_xquota_get,
526 .set_xstate = cifs_xstate_set, 519 .set_xstate = cifs_xstate_set,
@@ -976,89 +969,12 @@ cifs_destroy_mids(void)
976 kmem_cache_destroy(cifs_oplock_cachep); 969 kmem_cache_destroy(cifs_oplock_cachep);
977} 970}
978 971
979static int cifs_oplock_thread(void *dummyarg)
980{
981 struct oplock_q_entry *oplock_item;
982 struct cifsTconInfo *pTcon;
983 struct inode *inode;
984 __u16 netfid;
985 int rc, waitrc = 0;
986
987 set_freezable();
988 do {
989 if (try_to_freeze())
990 continue;
991
992 spin_lock(&GlobalMid_Lock);
993 if (list_empty(&GlobalOplock_Q)) {
994 spin_unlock(&GlobalMid_Lock);
995 set_current_state(TASK_INTERRUPTIBLE);
996 schedule_timeout(39*HZ);
997 } else {
998 oplock_item = list_entry(GlobalOplock_Q.next,
999 struct oplock_q_entry, qhead);
1000 cFYI(1, ("found oplock item to write out"));
1001 pTcon = oplock_item->tcon;
1002 inode = oplock_item->pinode;
1003 netfid = oplock_item->netfid;
1004 spin_unlock(&GlobalMid_Lock);
1005 DeleteOplockQEntry(oplock_item);
1006 /* can not grab inode sem here since it would
1007 deadlock when oplock received on delete
1008 since vfs_unlink holds the i_mutex across
1009 the call */
1010 /* mutex_lock(&inode->i_mutex);*/
1011 if (S_ISREG(inode->i_mode)) {
1012#ifdef CONFIG_CIFS_EXPERIMENTAL
1013 if (CIFS_I(inode)->clientCanCacheAll == 0)
1014 break_lease(inode, FMODE_READ);
1015 else if (CIFS_I(inode)->clientCanCacheRead == 0)
1016 break_lease(inode, FMODE_WRITE);
1017#endif
1018 rc = filemap_fdatawrite(inode->i_mapping);
1019 if (CIFS_I(inode)->clientCanCacheRead == 0) {
1020 waitrc = filemap_fdatawait(
1021 inode->i_mapping);
1022 invalidate_remote_inode(inode);
1023 }
1024 if (rc == 0)
1025 rc = waitrc;
1026 } else
1027 rc = 0;
1028 /* mutex_unlock(&inode->i_mutex);*/
1029 if (rc)
1030 CIFS_I(inode)->write_behind_rc = rc;
1031 cFYI(1, ("Oplock flush inode %p rc %d",
1032 inode, rc));
1033
1034 /* releasing stale oplock after recent reconnect
1035 of smb session using a now incorrect file
1036 handle is not a data integrity issue but do
1037 not bother sending an oplock release if session
1038 to server still is disconnected since oplock
1039 already released by the server in that case */
1040 if (!pTcon->need_reconnect) {
1041 rc = CIFSSMBLock(0, pTcon, netfid,
1042 0 /* len */ , 0 /* offset */, 0,
1043 0, LOCKING_ANDX_OPLOCK_RELEASE,
1044 false /* wait flag */);
1045 cFYI(1, ("Oplock release rc = %d", rc));
1046 }
1047 set_current_state(TASK_INTERRUPTIBLE);
1048 schedule_timeout(1); /* yield in case q were corrupt */
1049 }
1050 } while (!kthread_should_stop());
1051
1052 return 0;
1053}
1054
1055static int __init 972static int __init
1056init_cifs(void) 973init_cifs(void)
1057{ 974{
1058 int rc = 0; 975 int rc = 0;
1059 cifs_proc_init(); 976 cifs_proc_init();
1060 INIT_LIST_HEAD(&cifs_tcp_ses_list); 977 INIT_LIST_HEAD(&cifs_tcp_ses_list);
1061 INIT_LIST_HEAD(&GlobalOplock_Q);
1062#ifdef CONFIG_CIFS_EXPERIMENTAL 978#ifdef CONFIG_CIFS_EXPERIMENTAL
1063 INIT_LIST_HEAD(&GlobalDnotifyReqList); 979 INIT_LIST_HEAD(&GlobalDnotifyReqList);
1064 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); 980 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1121,16 +1037,13 @@ init_cifs(void)
1121 if (rc) 1037 if (rc)
1122 goto out_unregister_key_type; 1038 goto out_unregister_key_type;
1123#endif 1039#endif
1124 oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd"); 1040 rc = slow_work_register_user();
1125 if (IS_ERR(oplockThread)) { 1041 if (rc)
1126 rc = PTR_ERR(oplockThread); 1042 goto out_unregister_resolver_key;
1127 cERROR(1, ("error %d create oplock thread", rc));
1128 goto out_unregister_dfs_key_type;
1129 }
1130 1043
1131 return 0; 1044 return 0;
1132 1045
1133 out_unregister_dfs_key_type: 1046 out_unregister_resolver_key:
1134#ifdef CONFIG_CIFS_DFS_UPCALL 1047#ifdef CONFIG_CIFS_DFS_UPCALL
1135 unregister_key_type(&key_type_dns_resolver); 1048 unregister_key_type(&key_type_dns_resolver);
1136 out_unregister_key_type: 1049 out_unregister_key_type:
@@ -1167,7 +1080,6 @@ exit_cifs(void)
1167 cifs_destroy_inodecache(); 1080 cifs_destroy_inodecache();
1168 cifs_destroy_mids(); 1081 cifs_destroy_mids();
1169 cifs_destroy_request_bufs(); 1082 cifs_destroy_request_bufs();
1170 kthread_stop(oplockThread);
1171} 1083}
1172 1084
1173MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); 1085MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
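Taken together, the cifsfs.c hunks above retire the dedicated cifsoplockd kthread: its polling loop moves into a slow_work item (defined in the file.c hunks below), and init_cifs() now only registers the module as a slow_work user, unwinding the DFS resolver key on failure. A hedged sketch of that module-lifecycle shape; the matching slow_work_unregister_user() call in exit_cifs() belongs to the same series but falls outside the hunks shown:

	static int __init example_init(void)
	{
		int rc = slow_work_register_user();

		if (rc)
			return rc;
		/* ... remaining module setup ... */
		return 0;
	}

	static void __exit example_exit(void)
	{
		slow_work_unregister_user();
	}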
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300d..ac2b24c192f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
67 67
68extern const struct inode_operations cifs_file_inode_ops; 68extern const struct inode_operations cifs_file_inode_ops;
69extern const struct inode_operations cifs_symlink_inode_ops; 69extern const struct inode_operations cifs_symlink_inode_ops;
70extern struct inode_operations cifs_dfs_referral_inode_operations; 70extern const struct inode_operations cifs_dfs_referral_inode_operations;
71 71
72 72
73/* Functions related to files and directories */ 73/* Functions related to files and directories */
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 113extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 114#endif /* EXPERIMENTAL */
115 115
116#define CIFS_VERSION "1.60" 116#define CIFS_VERSION "1.61"
117#endif /* _CIFSFS_H */ 117#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c03..5d0fde18039c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slow-work.h>
21#include "cifs_fs_sb.h" 22#include "cifs_fs_sb.h"
22#include "cifsacl.h" 23#include "cifsacl.h"
23/* 24/*
@@ -346,16 +347,33 @@ struct cifsFileInfo {
346 /* lock scope id (0 if none) */ 347 /* lock scope id (0 if none) */
347 struct file *pfile; /* needed for writepage */ 348 struct file *pfile; /* needed for writepage */
348 struct inode *pInode; /* needed for oplock break */ 349 struct inode *pInode; /* needed for oplock break */
350 struct vfsmount *mnt;
349 struct mutex lock_mutex; 351 struct mutex lock_mutex;
350 struct list_head llist; /* list of byte range locks we have. */ 352 struct list_head llist; /* list of byte range locks we have. */
351 bool closePend:1; /* file is marked to close */ 353 bool closePend:1; /* file is marked to close */
352 bool invalidHandle:1; /* file closed via session abend */ 354 bool invalidHandle:1; /* file closed via session abend */
353 bool messageMode:1; /* for pipes: message vs byte mode */ 355 bool oplock_break_cancelled:1;
354 atomic_t wrtPending; /* handle in use - defer close */ 356 atomic_t count; /* reference count */
355 struct mutex fh_mutex; /* prevents reopen race after dead ses*/ 357 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
356 struct cifs_search_info srch_inf; 358 struct cifs_search_info srch_inf;
359 struct slow_work oplock_break; /* slow_work job for oplock breaks */
357}; 360};
358 361
362/* Take a reference on the file private data */
363static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
364{
365 atomic_inc(&cifs_file->count);
366}
367
368/* Release a reference on the file private data */
369static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
370{
371 if (atomic_dec_and_test(&cifs_file->count)) {
372 iput(cifs_file->pInode);
373 kfree(cifs_file);
374 }
375}
376
359/* 377/*
360 * One of these for each file inode 378 * One of these for each file inode
361 */ 379 */
@@ -369,7 +387,6 @@ struct cifsInodeInfo {
369 unsigned long time; /* jiffies of last update/check of inode */ 387 unsigned long time; /* jiffies of last update/check of inode */
370 bool clientCanCacheRead:1; /* read oplock */ 388 bool clientCanCacheRead:1; /* read oplock */
371 bool clientCanCacheAll:1; /* read and writebehind oplock */ 389 bool clientCanCacheAll:1; /* read and writebehind oplock */
372 bool oplockPending:1;
373 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 390 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
374 u64 server_eof; /* current file size on server */ 391 u64 server_eof; /* current file size on server */
375 u64 uniqueid; /* server inode number */ 392 u64 uniqueid; /* server inode number */
@@ -572,9 +589,9 @@ require use of the stronger protocol */
572#define CIFSSEC_MUST_LANMAN 0x10010 589#define CIFSSEC_MUST_LANMAN 0x10010
573#define CIFSSEC_MUST_PLNTXT 0x20020 590#define CIFSSEC_MUST_PLNTXT 0x20020
574#ifdef CONFIG_CIFS_UPCALL 591#ifdef CONFIG_CIFS_UPCALL
575#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */ 592#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */
576#else 593#else
577#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */ 594#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
578#endif /* UPCALL */ 595#endif /* UPCALL */
579#else /* do not allow weak pw hash */ 596#else /* do not allow weak pw hash */
580#ifdef CONFIG_CIFS_UPCALL 597#ifdef CONFIG_CIFS_UPCALL
@@ -656,8 +673,6 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
656 */ 673 */
657GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 674GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
658 675
659GLOBAL_EXTERN struct list_head GlobalOplock_Q;
660
661/* Outstanding dir notify requests */ 676/* Outstanding dir notify requests */
662GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 677GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
663/* DirNotify response queue */ 678/* DirNotify response queue */
@@ -708,3 +723,4 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
708GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 723GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
709GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 724GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
710 725
726extern const struct slow_work_ops cifs_oplock_break_ops;
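The cifsglob.h hunks above are the heart of the series: wrtPending, which only deferred closes while writes were pending, becomes a true reference count with get/put helpers, and each cifsFileInfo gains a vfsmount pointer plus an embedded slow_work for oplock breaks. The new lifetime rule, as a short sketch (do_write_through is hypothetical): whoever looks up a handle pins it, and the final put drops the inode reference and frees the structure.

	static int write_via_any_handle(struct cifsInodeInfo *cifs_inode)
	{
		int rc = -EBADF;
		struct cifsFileInfo *open_file = find_writable_file(cifs_inode);

		if (open_file) {
			rc = do_write_through(open_file);
			cifsFileInfo_put(open_file);	/* frees on last ref */
		}
		return rc;
	}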
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index da8fbf565991..6928c24d1d42 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -86,18 +86,17 @@ extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
86 const int stage, 86 const int stage,
87 const struct nls_table *nls_cp); 87 const struct nls_table *nls_cp);
88extern __u16 GetNextMid(struct TCP_Server_Info *server); 88extern __u16 GetNextMid(struct TCP_Server_Info *server);
89extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
90 struct cifsTconInfo *);
91extern void DeleteOplockQEntry(struct oplock_q_entry *);
92extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
93extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 89extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
94extern u64 cifs_UnixTimeToNT(struct timespec); 90extern u64 cifs_UnixTimeToNT(struct timespec);
95extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 91extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
96 int offset); 92 int offset);
97 93
94extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
95 __u16 fileHandle, struct file *file,
96 struct vfsmount *mnt, unsigned int oflags);
98extern int cifs_posix_open(char *full_path, struct inode **pinode, 97extern int cifs_posix_open(char *full_path, struct inode **pinode,
99 struct super_block *sb, int mode, int oflags, 98 struct vfsmount *mnt, int mode, int oflags,
100 int *poplock, __u16 *pnetfid, int xid); 99 __u32 *poplock, __u16 *pnetfid, int xid);
101extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 100extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
102 FILE_UNIX_BASIC_INFO *info, 101 FILE_UNIX_BASIC_INFO *info,
103 struct cifs_sb_info *cifs_sb); 102 struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d4..941441d3e386 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -94,116 +94,145 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
94 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { 94 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
95 open_file = list_entry(tmp, struct cifsFileInfo, tlist); 95 open_file = list_entry(tmp, struct cifsFileInfo, tlist);
96 open_file->invalidHandle = true; 96 open_file->invalidHandle = true;
97 open_file->oplock_break_cancelled = true;
97 } 98 }
98 write_unlock(&GlobalSMBSeslock); 99 write_unlock(&GlobalSMBSeslock);
99 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted 100 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
100 to this tcon */ 101 to this tcon */
101} 102}
102 103
103/* Allocate and return pointer to an SMB request buffer, and set basic 104/* reconnect the socket, tcon, and smb session if needed */
104 SMB information in the SMB header. If the return code is zero, this
105 function must have filled in request_buf pointer */
106static int 105static int
107small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 106cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
108 void **request_buf)
109{ 107{
110 int rc = 0; 108 int rc = 0;
109 struct cifsSesInfo *ses;
110 struct TCP_Server_Info *server;
111 struct nls_table *nls_codepage;
111 112
112 /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so 113 /*
113 check for tcp and smb session status done differently 114 * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
114 for those three - in the calling routine */ 115 * tcp and smb session status done differently for those three - in the
115 if (tcon) { 116 * calling routine
116 if (tcon->tidStatus == CifsExiting) { 117 */
117 /* only tree disconnect, open, and write, 118 if (!tcon)
118 (and ulogoff which does not have tcon) 119 return 0;
119 are allowed as we start force umount */ 120
120 if ((smb_command != SMB_COM_WRITE_ANDX) && 121 ses = tcon->ses;
121 (smb_command != SMB_COM_OPEN_ANDX) && 122 server = ses->server;
122 (smb_command != SMB_COM_TREE_DISCONNECT)) { 123
123 cFYI(1, ("can not send cmd %d while umounting", 124 /*
124 smb_command)); 125 * only tree disconnect, open, and write, (and ulogoff which does not
125 return -ENODEV; 126 * have tcon) are allowed as we start force umount
126 } 127 */
128 if (tcon->tidStatus == CifsExiting) {
129 if (smb_command != SMB_COM_WRITE_ANDX &&
130 smb_command != SMB_COM_OPEN_ANDX &&
131 smb_command != SMB_COM_TREE_DISCONNECT) {
132 cFYI(1, ("can not send cmd %d while umounting",
133 smb_command));
134 return -ENODEV;
127 } 135 }
128 if ((tcon->ses) && (tcon->ses->status != CifsExiting) && 136 }
129 (tcon->ses->server)) {
130 struct nls_table *nls_codepage;
131 /* Give Demultiplex thread up to 10 seconds to
132 reconnect, should be greater than cifs socket
133 timeout which is 7 seconds */
134 while (tcon->ses->server->tcpStatus ==
135 CifsNeedReconnect) {
136 wait_event_interruptible_timeout(tcon->ses->server->response_q,
137 (tcon->ses->server->tcpStatus ==
138 CifsGood), 10 * HZ);
139 if (tcon->ses->server->tcpStatus ==
140 CifsNeedReconnect) {
141 /* on "soft" mounts we wait once */
142 if (!tcon->retry ||
143 (tcon->ses->status == CifsExiting)) {
144 cFYI(1, ("gave up waiting on "
145 "reconnect in smb_init"));
146 return -EHOSTDOWN;
147 } /* else "hard" mount - keep retrying
148 until process is killed or server
149 comes back on-line */
150 } else /* TCP session is reestablished now */
151 break;
152 }
153 137
154 nls_codepage = load_nls_default(); 138 if (ses->status == CifsExiting)
155 /* need to prevent multiple threads trying to 139 return -EIO;
156 simultaneously reconnect the same SMB session */
157 down(&tcon->ses->sesSem);
158 if (tcon->ses->need_reconnect)
159 rc = cifs_setup_session(0, tcon->ses,
160 nls_codepage);
161 if (!rc && (tcon->need_reconnect)) {
162 mark_open_files_invalid(tcon);
163 rc = CIFSTCon(0, tcon->ses, tcon->treeName,
164 tcon, nls_codepage);
165 up(&tcon->ses->sesSem);
166 /* BB FIXME add code to check if wsize needs
167 update due to negotiated smb buffer size
168 shrinking */
169 if (rc == 0) {
170 atomic_inc(&tconInfoReconnectCount);
171 /* tell server Unix caps we support */
172 if (tcon->ses->capabilities & CAP_UNIX)
173 reset_cifs_unix_caps(
174 0 /* no xid */,
175 tcon,
176 NULL /* we do not know sb */,
177 NULL /* no vol info */);
178 }
179 140
180 cFYI(1, ("reconnect tcon rc = %d", rc)); 141 /*
181 /* Removed call to reopen open files here. 142 * Give demultiplex thread up to 10 seconds to reconnect, should be
182 It is safer (and faster) to reopen files 143 * greater than cifs socket timeout which is 7 seconds
183 one at a time as needed in read and write */ 144 */
184 145 while (server->tcpStatus == CifsNeedReconnect) {
185 /* Check if handle based operation so we 146 wait_event_interruptible_timeout(server->response_q,
186 know whether we can continue or not without 147 (server->tcpStatus == CifsGood), 10 * HZ);
187 returning to caller to reset file handle */
188 switch (smb_command) {
189 case SMB_COM_READ_ANDX:
190 case SMB_COM_WRITE_ANDX:
191 case SMB_COM_CLOSE:
192 case SMB_COM_FIND_CLOSE2:
193 case SMB_COM_LOCKING_ANDX: {
194 unload_nls(nls_codepage);
195 return -EAGAIN;
196 }
197 }
198 } else {
199 up(&tcon->ses->sesSem);
200 }
201 unload_nls(nls_codepage);
202 148
203 } else { 149 /* is the TCP session reestablished now? */
204 return -EIO; 150 if (server->tcpStatus != CifsNeedReconnect)
151 break;
152
153 /*
154 * on "soft" mounts we wait once. Hard mounts keep
155 * retrying until process is killed or server comes
156 * back on-line
157 */
158 if (!tcon->retry || ses->status == CifsExiting) {
159 cFYI(1, ("gave up waiting on reconnect in smb_init"));
160 return -EHOSTDOWN;
205 } 161 }
206 } 162 }
163
164 if (!ses->need_reconnect && !tcon->need_reconnect)
165 return 0;
166
167 nls_codepage = load_nls_default();
168
169 /*
170 * need to prevent multiple threads trying to simultaneously
171 * reconnect the same SMB session
172 */
173 down(&ses->sesSem);
174 if (ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage);
176
177 /* do we need to reconnect tcon? */
178 if (rc || !tcon->need_reconnect) {
179 up(&ses->sesSem);
180 goto out;
181 }
182
183 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 up(&ses->sesSem);
186 cFYI(1, ("reconnect tcon rc = %d", rc));
187
188 if (rc)
189 goto out;
190
191 /*
192 * FIXME: check if wsize needs updating due to negotiated smb buffer
193 * size shrinking
194 */
195 atomic_inc(&tconInfoReconnectCount);
196
197 /* tell server Unix caps we support */
198 if (ses->capabilities & CAP_UNIX)
199 reset_cifs_unix_caps(0, tcon, NULL, NULL);
200
201 /*
202 * Removed call to reopen open files here. It is safer (and faster) to
203 * reopen files one at a time as needed in read and write.
204 *
205 * FIXME: what about file locks? don't we need to reclaim them ASAP?
206 */
207
208out:
209 /*
210 * Check if handle based operation so we know whether we can continue
211 * or not without returning to caller to reset file handle
212 */
213 switch (smb_command) {
214 case SMB_COM_READ_ANDX:
215 case SMB_COM_WRITE_ANDX:
216 case SMB_COM_CLOSE:
217 case SMB_COM_FIND_CLOSE2:
218 case SMB_COM_LOCKING_ANDX:
219 rc = -EAGAIN;
220 }
221
222 unload_nls(nls_codepage);
223 return rc;
224}
225
226/* Allocate and return pointer to an SMB request buffer, and set basic
227 SMB information in the SMB header. If the return code is zero, this
228 function must have filled in request_buf pointer */
229static int
230small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
231 void **request_buf)
232{
233 int rc = 0;
234
235 rc = cifs_reconnect_tcon(tcon, smb_command);
207 if (rc) 236 if (rc)
208 return rc; 237 return rc;
209 238
@@ -256,101 +285,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
256{ 285{
257 int rc = 0; 286 int rc = 0;
258 287
259 /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so 288 rc = cifs_reconnect_tcon(tcon, smb_command);
260 check for tcp and smb session status done differently
261 for those three - in the calling routine */
262 if (tcon) {
263 if (tcon->tidStatus == CifsExiting) {
264 /* only tree disconnect, open, and write,
265 (and ulogoff which does not have tcon)
266 are allowed as we start force umount */
267 if ((smb_command != SMB_COM_WRITE_ANDX) &&
268 (smb_command != SMB_COM_OPEN_ANDX) &&
269 (smb_command != SMB_COM_TREE_DISCONNECT)) {
270 cFYI(1, ("can not send cmd %d while umounting",
271 smb_command));
272 return -ENODEV;
273 }
274 }
275
276 if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
277 (tcon->ses->server)) {
278 struct nls_table *nls_codepage;
279 /* Give Demultiplex thread up to 10 seconds to
280 reconnect, should be greater than cifs socket
281 timeout which is 7 seconds */
282 while (tcon->ses->server->tcpStatus ==
283 CifsNeedReconnect) {
284 wait_event_interruptible_timeout(tcon->ses->server->response_q,
285 (tcon->ses->server->tcpStatus ==
286 CifsGood), 10 * HZ);
287 if (tcon->ses->server->tcpStatus ==
288 CifsNeedReconnect) {
289 /* on "soft" mounts we wait once */
290 if (!tcon->retry ||
291 (tcon->ses->status == CifsExiting)) {
292 cFYI(1, ("gave up waiting on "
293 "reconnect in smb_init"));
294 return -EHOSTDOWN;
295 } /* else "hard" mount - keep retrying
296 until process is killed or server
297 comes on-line */
298 } else /* TCP session is reestablished now */
299 break;
300 }
301 nls_codepage = load_nls_default();
302 /* need to prevent multiple threads trying to
303 simultaneously reconnect the same SMB session */
304 down(&tcon->ses->sesSem);
305 if (tcon->ses->need_reconnect)
306 rc = cifs_setup_session(0, tcon->ses,
307 nls_codepage);
308 if (!rc && (tcon->need_reconnect)) {
309 mark_open_files_invalid(tcon);
310 rc = CIFSTCon(0, tcon->ses, tcon->treeName,
311 tcon, nls_codepage);
312 up(&tcon->ses->sesSem);
313 /* BB FIXME add code to check if wsize needs
314 update due to negotiated smb buffer size
315 shrinking */
316 if (rc == 0) {
317 atomic_inc(&tconInfoReconnectCount);
318 /* tell server Unix caps we support */
319 if (tcon->ses->capabilities & CAP_UNIX)
320 reset_cifs_unix_caps(
321 0 /* no xid */,
322 tcon,
323 NULL /* do not know sb */,
324 NULL /* no vol info */);
325 }
326
327 cFYI(1, ("reconnect tcon rc = %d", rc));
328 /* Removed call to reopen open files here.
329 It is safer (and faster) to reopen files
330 one at a time as needed in read and write */
331
332 /* Check if handle based operation so we
333 know whether we can continue or not without
334 returning to caller to reset file handle */
335 switch (smb_command) {
336 case SMB_COM_READ_ANDX:
337 case SMB_COM_WRITE_ANDX:
338 case SMB_COM_CLOSE:
339 case SMB_COM_FIND_CLOSE2:
340 case SMB_COM_LOCKING_ANDX: {
341 unload_nls(nls_codepage);
342 return -EAGAIN;
343 }
344 }
345 } else {
346 up(&tcon->ses->sesSem);
347 }
348 unload_nls(nls_codepage);
349
350 } else {
351 return -EIO;
352 }
353 }
354 if (rc) 289 if (rc)
355 return rc; 290 return rc;
356 291
@@ -3961,6 +3896,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3961 if (is_unicode) { 3896 if (is_unicode) {
3962 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, 3897 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
3963 GFP_KERNEL); 3898 GFP_KERNEL);
3899 if (tmp == NULL) {
3900 rc = -ENOMEM;
3901 goto parse_DFS_referrals_exit;
3902 }
3964 cifsConvertToUCS((__le16 *) tmp, searchName, 3903 cifsConvertToUCS((__le16 *) tmp, searchName,
3965 PATH_MAX, nls_codepage, remap); 3904 PATH_MAX, nls_codepage, remap);
3966 node->path_consumed = cifs_ucs2_bytes(tmp, 3905 node->path_consumed = cifs_ucs2_bytes(tmp,
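The cifssmb.c rework above is meant to be behavior-preserving: two nearly identical reconnect blocks in small_smb_init() and smb_init() collapse into one cifs_reconnect_tcon() helper. The -EAGAIN tail is the subtle part: handle-based commands cannot simply be resent after a reconnect, because the old netfid died with the session, so the caller must reopen first. Roughly, as a sketch (send_handle_based_smb is hypothetical, and cifs_reopen_file() is static to file.c in this tree):

	while ((rc = send_handle_based_smb(tcon, netfid)) == -EAGAIN) {
		if (cifs_reopen_file(file, true))
			break;	/* reopen failed; give up */
		netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
	}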
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f3345d7fa79..43003e0bef18 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1377} 1377}
1378 1378
1379static struct TCP_Server_Info * 1379static struct TCP_Server_Info *
1380cifs_find_tcp_session(struct sockaddr_storage *addr) 1380cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
1381{ 1381{
1382 struct list_head *tmp; 1382 struct list_head *tmp;
1383 struct TCP_Server_Info *server; 1383 struct TCP_Server_Info *server;
@@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
1397 if (server->tcpStatus == CifsNew) 1397 if (server->tcpStatus == CifsNew)
1398 continue; 1398 continue;
1399 1399
1400 if (addr->ss_family == AF_INET && 1400 switch (addr->ss_family) {
1401 (addr4->sin_addr.s_addr != 1401 case AF_INET:
1402 server->addr.sockAddr.sin_addr.s_addr)) 1402 if (addr4->sin_addr.s_addr ==
1403 continue; 1403 server->addr.sockAddr.sin_addr.s_addr) {
1404 else if (addr->ss_family == AF_INET6 && 1404 addr4->sin_port = htons(port);
1405 (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, 1405 /* user overrode default port? */
1406 &addr6->sin6_addr) || 1406 if (addr4->sin_port) {
1407 server->addr.sockAddr6.sin6_scope_id != 1407 if (addr4->sin_port !=
1408 addr6->sin6_scope_id)) 1408 server->addr.sockAddr.sin_port)
1409 continue; 1409 continue;
1410 }
1411 break;
1412 } else
1413 continue;
1414
1415 case AF_INET6:
1416 if (ipv6_addr_equal(&addr6->sin6_addr,
1417 &server->addr.sockAddr6.sin6_addr) &&
1418 (addr6->sin6_scope_id ==
1419 server->addr.sockAddr6.sin6_scope_id)) {
1420 addr6->sin6_port = htons(port);
1421 /* user overrode default port? */
1422 if (addr6->sin6_port) {
1423 if (addr6->sin6_port !=
1424 server->addr.sockAddr6.sin6_port)
1425 continue;
1426 }
1427 break;
1428 } else
1429 continue;
1430 }
1410 1431
1411 ++server->srv_count; 1432 ++server->srv_count;
1412 write_unlock(&cifs_tcp_ses_lock); 1433 write_unlock(&cifs_tcp_ses_lock);
@@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1475 } 1496 }
1476 1497
1477 /* see if we already have a matching tcp_ses */ 1498 /* see if we already have a matching tcp_ses */
1478 tcp_ses = cifs_find_tcp_session(&addr); 1499 tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
1479 if (tcp_ses) 1500 if (tcp_ses)
1480 return tcp_ses; 1501 return tcp_ses;
1481 1502
@@ -1649,7 +1670,6 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1649 CIFSSMBTDis(xid, tcon); 1670 CIFSSMBTDis(xid, tcon);
1650 _FreeXid(xid); 1671 _FreeXid(xid);
1651 1672
1652 DeleteTconOplockQEntries(tcon);
1653 tconInfoFree(tcon); 1673 tconInfoFree(tcon);
1654 cifs_put_smb_ses(ses); 1674 cifs_put_smb_ses(ses);
1655} 1675}
@@ -2636,9 +2656,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2636 return -EIO; 2656 return -EIO;
2637 2657
2638 smb_buffer = cifs_buf_get(); 2658 smb_buffer = cifs_buf_get();
2639 if (smb_buffer == NULL) { 2659 if (smb_buffer == NULL)
2640 return -ENOMEM; 2660 return -ENOMEM;
2641 } 2661
2642 smb_buffer_response = smb_buffer; 2662 smb_buffer_response = smb_buffer;
2643 2663
2644 header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, 2664 header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
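cifs_find_tcp_session() above now takes the port requested on the mount and declines to share a TCP session whose server port differs; a requested port of 0 preserves the old "any session to this address" behavior. The matching rule, reduced to a sketch:

	/* both values in network byte order; 0 means "default, match any" */
	static bool port_matches(__be16 wanted, __be16 existing)
	{
		return wanted == 0 || wanted == existing;
	}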
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa9..627a60a6c1b1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -24,6 +24,7 @@
24#include <linux/stat.h> 24#include <linux/stat.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h>
27#include "cifsfs.h" 28#include "cifsfs.h"
28#include "cifspdu.h" 29#include "cifspdu.h"
29#include "cifsglob.h" 30#include "cifsglob.h"
@@ -129,44 +130,45 @@ cifs_bp_rename_retry:
129 return full_path; 130 return full_path;
130} 131}
131 132
132static void 133struct cifsFileInfo *
133cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, 134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
134 struct cifsTconInfo *tcon, bool write_only) 135 struct file *file, struct vfsmount *mnt, unsigned int oflags)
135{ 136{
136 int oplock = 0; 137 int oplock = 0;
137 struct cifsFileInfo *pCifsFile; 138 struct cifsFileInfo *pCifsFile;
138 struct cifsInodeInfo *pCifsInode; 139 struct cifsInodeInfo *pCifsInode;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
139 141
140 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 142 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
141
142 if (pCifsFile == NULL) 143 if (pCifsFile == NULL)
143 return; 144 return pCifsFile;
144 145
145 if (oplockEnabled) 146 if (oplockEnabled)
146 oplock = REQ_OPLOCK; 147 oplock = REQ_OPLOCK;
147 148
148 pCifsFile->netfid = fileHandle; 149 pCifsFile->netfid = fileHandle;
149 pCifsFile->pid = current->tgid; 150 pCifsFile->pid = current->tgid;
150 pCifsFile->pInode = newinode; 151 pCifsFile->pInode = igrab(newinode);
152 pCifsFile->mnt = mnt;
153 pCifsFile->pfile = file;
151 pCifsFile->invalidHandle = false; 154 pCifsFile->invalidHandle = false;
152 pCifsFile->closePend = false; 155 pCifsFile->closePend = false;
153 mutex_init(&pCifsFile->fh_mutex); 156 mutex_init(&pCifsFile->fh_mutex);
154 mutex_init(&pCifsFile->lock_mutex); 157 mutex_init(&pCifsFile->lock_mutex);
155 INIT_LIST_HEAD(&pCifsFile->llist); 158 INIT_LIST_HEAD(&pCifsFile->llist);
156 atomic_set(&pCifsFile->wrtPending, 0); 159 atomic_set(&pCifsFile->count, 1);
160 slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
157 161
158 /* set the following in open now
159 pCifsFile->pfile = file; */
160 write_lock(&GlobalSMBSeslock); 162 write_lock(&GlobalSMBSeslock);
161 list_add(&pCifsFile->tlist, &tcon->openFileList); 163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
162 pCifsInode = CIFS_I(newinode); 164 pCifsInode = CIFS_I(newinode);
163 if (pCifsInode) { 165 if (pCifsInode) {
164 /* if readable file instance put first in list*/ 166 /* if readable file instance put first in list*/
165 if (write_only) 167 if (oflags & FMODE_READ)
168 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
169 else
166 list_add_tail(&pCifsFile->flist, 170 list_add_tail(&pCifsFile->flist,
167 &pCifsInode->openFileList); 171 &pCifsInode->openFileList);
168 else
169 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
170 172
171 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
172 pCifsInode->clientCanCacheAll = true; 174 pCifsInode->clientCanCacheAll = true;
@@ -176,18 +178,18 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
176 pCifsInode->clientCanCacheRead = true; 178 pCifsInode->clientCanCacheRead = true;
177 } 179 }
178 write_unlock(&GlobalSMBSeslock); 180 write_unlock(&GlobalSMBSeslock);
181
182 return pCifsFile;
179} 183}
180 184
181int cifs_posix_open(char *full_path, struct inode **pinode, 185int cifs_posix_open(char *full_path, struct inode **pinode,
182 struct super_block *sb, int mode, int oflags, 186 struct vfsmount *mnt, int mode, int oflags,
183 int *poplock, __u16 *pnetfid, int xid) 187 __u32 *poplock, __u16 *pnetfid, int xid)
184{ 188{
185 int rc; 189 int rc;
186 __u32 oplock;
187 bool write_only = false;
188 FILE_UNIX_BASIC_INFO *presp_data; 190 FILE_UNIX_BASIC_INFO *presp_data;
189 __u32 posix_flags = 0; 191 __u32 posix_flags = 0;
190 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 192 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
191 struct cifs_fattr fattr; 193 struct cifs_fattr fattr;
192 194
193 cFYI(1, ("posix open %s", full_path)); 195 cFYI(1, ("posix open %s", full_path));
@@ -223,12 +225,9 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
223 if (oflags & O_DIRECT) 225 if (oflags & O_DIRECT)
224 posix_flags |= SMB_O_DIRECT; 226 posix_flags |= SMB_O_DIRECT;
225 227
226 if (!(oflags & FMODE_READ))
227 write_only = true;
228
229 mode &= ~current_umask(); 228 mode &= ~current_umask();
230 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, 229 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
231 pnetfid, presp_data, &oplock, full_path, 230 pnetfid, presp_data, poplock, full_path,
232 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 231 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
233 CIFS_MOUNT_MAP_SPECIAL_CHR); 232 CIFS_MOUNT_MAP_SPECIAL_CHR);
234 if (rc) 233 if (rc)
@@ -244,7 +243,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
244 243
245 /* get new inode and set it up */ 244 /* get new inode and set it up */
246 if (*pinode == NULL) { 245 if (*pinode == NULL) {
247 *pinode = cifs_iget(sb, &fattr); 246 *pinode = cifs_iget(mnt->mnt_sb, &fattr);
248 if (!*pinode) { 247 if (!*pinode) {
249 rc = -ENOMEM; 248 rc = -ENOMEM;
250 goto posix_open_ret; 249 goto posix_open_ret;
@@ -253,7 +252,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
253 cifs_fattr_to_inode(*pinode, &fattr); 252 cifs_fattr_to_inode(*pinode, &fattr);
254 } 253 }
255 254
256 cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only); 255 cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
257 256
258posix_open_ret: 257posix_open_ret:
259 kfree(presp_data); 258 kfree(presp_data);
@@ -280,7 +279,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
280 int rc = -ENOENT; 279 int rc = -ENOENT;
281 int xid; 280 int xid;
282 int create_options = CREATE_NOT_DIR; 281 int create_options = CREATE_NOT_DIR;
283 int oplock = 0; 282 __u32 oplock = 0;
284 int oflags; 283 int oflags;
285 bool posix_create = false; 284 bool posix_create = false;
286 /* 285 /*
@@ -298,7 +297,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
298 FILE_ALL_INFO *buf = NULL; 297 FILE_ALL_INFO *buf = NULL;
299 struct inode *newinode = NULL; 298 struct inode *newinode = NULL;
300 int disposition = FILE_OVERWRITE_IF; 299 int disposition = FILE_OVERWRITE_IF;
301 bool write_only = false;
302 300
303 xid = GetXid(); 301 xid = GetXid();
304 302
@@ -323,7 +321,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
323 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 321 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
324 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 322 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
325 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 323 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
326 rc = cifs_posix_open(full_path, &newinode, inode->i_sb, 324 rc = cifs_posix_open(full_path, &newinode, nd->path.mnt,
327 mode, oflags, &oplock, &fileHandle, xid); 325 mode, oflags, &oplock, &fileHandle, xid);
328 /* EIO could indicate that (posix open) operation is not 326 /* EIO could indicate that (posix open) operation is not
329 supported, despite what server claimed in capability 327 supported, despite what server claimed in capability
@@ -351,11 +349,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
351 desiredAccess = 0; 349 desiredAccess = 0;
352 if (oflags & FMODE_READ) 350 if (oflags & FMODE_READ)
353 desiredAccess |= GENERIC_READ; /* is this too little? */ 351 desiredAccess |= GENERIC_READ; /* is this too little? */
354 if (oflags & FMODE_WRITE) { 352 if (oflags & FMODE_WRITE)
355 desiredAccess |= GENERIC_WRITE; 353 desiredAccess |= GENERIC_WRITE;
356 if (!(oflags & FMODE_READ))
357 write_only = true;
358 }
359 354
360 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 355 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
361 disposition = FILE_CREATE; 356 disposition = FILE_CREATE;
@@ -470,8 +465,8 @@ cifs_create_set_dentry:
470 /* mknod case - do not leave file open */ 465 /* mknod case - do not leave file open */
471 CIFSSMBClose(xid, tcon, fileHandle); 466 CIFSSMBClose(xid, tcon, fileHandle);
472 } else if (!(posix_create) && (newinode)) { 467 } else if (!(posix_create) && (newinode)) {
473 cifs_fill_fileinfo(newinode, fileHandle, 468 cifs_new_fileinfo(newinode, fileHandle, NULL,
474 cifs_sb->tcon, write_only); 469 nd->path.mnt, oflags);
475 } 470 }
476cifs_create_out: 471cifs_create_out:
477 kfree(buf); 472 kfree(buf);
@@ -611,7 +606,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
611{ 606{
612 int xid; 607 int xid;
613 int rc = 0; /* to get around spurious gcc warning, set to zero here */ 608 int rc = 0; /* to get around spurious gcc warning, set to zero here */
614 int oplock = 0; 609 __u32 oplock = 0;
615 __u16 fileHandle = 0; 610 __u16 fileHandle = 0;
616 bool posix_open = false; 611 bool posix_open = false;
617 struct cifs_sb_info *cifs_sb; 612 struct cifs_sb_info *cifs_sb;
@@ -683,8 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
683 if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 678 if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
684 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 679 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
685 (nd->intent.open.flags & O_CREAT)) { 680 (nd->intent.open.flags & O_CREAT)) {
686 rc = cifs_posix_open(full_path, &newInode, 681 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
687 parent_dir_inode->i_sb,
688 nd->intent.open.create_mode, 682 nd->intent.open.create_mode,
689 nd->intent.open.flags, &oplock, 683 nd->intent.open.flags, &oplock,
690 &fileHandle, xid); 684 &fileHandle, xid);
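dir.c above promotes the old private fill-in helper to an exported cifs_new_fileinfo() that returns the structure, records the vfsmount, and, crucially, takes its own inode reference via igrab(). Copied together for contrast, the two ends of the new lifetime rule (both appear in hunks in this series):

	pCifsFile->pInode = igrab(newinode);	/* taken at handle creation */

	if (atomic_dec_and_test(&cifs_file->count)) {
		iput(cifs_file->pInode);	/* dropped on the last put */
		kfree(cifs_file);
	}

Because of this pairing, a queued oplock break can never outlive the inode it needs to flush.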
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217b..429337eb7afe 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -30,6 +30,7 @@
30#include <linux/writeback.h> 30#include <linux/writeback.h>
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h>
33#include <asm/div64.h> 34#include <asm/div64.h>
34#include "cifsfs.h" 35#include "cifsfs.h"
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -39,29 +40,6 @@
39#include "cifs_debug.h" 40#include "cifs_debug.h"
40#include "cifs_fs_sb.h" 41#include "cifs_fs_sb.h"
41 42
42static inline struct cifsFileInfo *cifs_init_private(
43 struct cifsFileInfo *private_data, struct inode *inode,
44 struct file *file, __u16 netfid)
45{
46 memset(private_data, 0, sizeof(struct cifsFileInfo));
47 private_data->netfid = netfid;
48 private_data->pid = current->tgid;
49 mutex_init(&private_data->fh_mutex);
50 mutex_init(&private_data->lock_mutex);
51 INIT_LIST_HEAD(&private_data->llist);
52 private_data->pfile = file; /* needed for writepage */
53 private_data->pInode = inode;
54 private_data->invalidHandle = false;
55 private_data->closePend = false;
56 /* we have to track num writers to the inode, since writepages
57 does not tell us which handle the write is for so there can
58 be a close (overlapping with write) of the filehandle that
59 cifs_writepages chose to use */
60 atomic_set(&private_data->wrtPending, 0);
61
62 return private_data;
63}
64
65static inline int cifs_convert_flags(unsigned int flags) 43static inline int cifs_convert_flags(unsigned int flags)
66{ 44{
67 if ((flags & O_ACCMODE) == O_RDONLY) 45 if ((flags & O_ACCMODE) == O_RDONLY)
@@ -125,9 +103,11 @@ static inline int cifs_get_disposition(unsigned int flags)
125} 103}
126 104
127/* all arguments to this function must be checked for validity in caller */ 105/* all arguments to this function must be checked for validity in caller */
128static inline int cifs_posix_open_inode_helper(struct inode *inode, 106static inline int
129 struct file *file, struct cifsInodeInfo *pCifsInode, 107cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
130 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) 108 struct cifsInodeInfo *pCifsInode,
109 struct cifsFileInfo *pCifsFile, __u32 oplock,
110 u16 netfid)
131{ 111{
132 112
133 write_lock(&GlobalSMBSeslock); 113 write_lock(&GlobalSMBSeslock);
@@ -221,17 +201,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
221 struct timespec temp; 201 struct timespec temp;
222 int rc; 202 int rc;
223 203
224 /* want handles we can use to read with first
225 in the list so we do not have to walk the
226 list to search for one in write_begin */
227 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
228 list_add_tail(&pCifsFile->flist,
229 &pCifsInode->openFileList);
230 } else {
231 list_add(&pCifsFile->flist,
232 &pCifsInode->openFileList);
233 }
234 write_unlock(&GlobalSMBSeslock);
235 if (pCifsInode->clientCanCacheRead) { 204 if (pCifsInode->clientCanCacheRead) {
236 /* we have the inode open somewhere else 205 /* we have the inode open somewhere else
237 no need to discard cache data */ 206 no need to discard cache data */
@@ -281,7 +250,8 @@ client_can_cache:
281int cifs_open(struct inode *inode, struct file *file) 250int cifs_open(struct inode *inode, struct file *file)
282{ 251{
283 int rc = -EACCES; 252 int rc = -EACCES;
284 int xid, oplock; 253 int xid;
254 __u32 oplock;
285 struct cifs_sb_info *cifs_sb; 255 struct cifs_sb_info *cifs_sb;
286 struct cifsTconInfo *tcon; 256 struct cifsTconInfo *tcon;
287 struct cifsFileInfo *pCifsFile; 257 struct cifsFileInfo *pCifsFile;
@@ -326,7 +296,7 @@ int cifs_open(struct inode *inode, struct file *file)
326 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 296 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
327 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 297 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
328 /* can not refresh inode info since size could be stale */ 298 /* can not refresh inode info since size could be stale */
329 rc = cifs_posix_open(full_path, &inode, inode->i_sb, 299 rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
330 cifs_sb->mnt_file_mode /* ignored */, 300 cifs_sb->mnt_file_mode /* ignored */,
331 oflags, &oplock, &netfid, xid); 301 oflags, &oplock, &netfid, xid);
332 if (rc == 0) { 302 if (rc == 0) {
@@ -416,24 +386,17 @@ int cifs_open(struct inode *inode, struct file *file)
416 cFYI(1, ("cifs_open returned 0x%x", rc)); 386 cFYI(1, ("cifs_open returned 0x%x", rc));
417 goto out; 387 goto out;
418 } 388 }
419 file->private_data = 389
420 kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 390 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
391 file->f_flags);
392 file->private_data = pCifsFile;
421 if (file->private_data == NULL) { 393 if (file->private_data == NULL) {
422 rc = -ENOMEM; 394 rc = -ENOMEM;
423 goto out; 395 goto out;
424 } 396 }
425 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
426 write_lock(&GlobalSMBSeslock);
427 list_add(&pCifsFile->tlist, &tcon->openFileList);
428 397
429 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 398 rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon,
430 if (pCifsInode) { 399 &oplock, buf, full_path, xid);
431 rc = cifs_open_inode_helper(inode, file, pCifsInode,
432 pCifsFile, tcon,
433 &oplock, buf, full_path, xid);
434 } else {
435 write_unlock(&GlobalSMBSeslock);
436 }
437 400
438 if (oplock & CIFS_CREATE_ACTION) { 401 if (oplock & CIFS_CREATE_ACTION) {
439 /* time to set mode which we can not set earlier due to 402 /* time to set mode which we can not set earlier due to
@@ -476,7 +439,8 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
476static int cifs_reopen_file(struct file *file, bool can_flush) 439static int cifs_reopen_file(struct file *file, bool can_flush)
477{ 440{
478 int rc = -EACCES; 441 int rc = -EACCES;
479 int xid, oplock; 442 int xid;
443 __u32 oplock;
480 struct cifs_sb_info *cifs_sb; 444 struct cifs_sb_info *cifs_sb;
481 struct cifsTconInfo *tcon; 445 struct cifsTconInfo *tcon;
482 struct cifsFileInfo *pCifsFile; 446 struct cifsFileInfo *pCifsFile;
@@ -545,7 +509,7 @@ reopen_error_exit:
545 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 509 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
546 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 510 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
547 /* can not refresh inode info since size could be stale */ 511 /* can not refresh inode info since size could be stale */
548 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 512 rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
549 cifs_sb->mnt_file_mode /* ignored */, 513 cifs_sb->mnt_file_mode /* ignored */,
550 oflags, &oplock, &netfid, xid); 514 oflags, &oplock, &netfid, xid);
551 if (rc == 0) { 515 if (rc == 0) {
@@ -643,7 +607,7 @@ int cifs_close(struct inode *inode, struct file *file)
643 if (!pTcon->need_reconnect) { 607 if (!pTcon->need_reconnect) {
644 write_unlock(&GlobalSMBSeslock); 608 write_unlock(&GlobalSMBSeslock);
645 timeout = 2; 609 timeout = 2;
646 while ((atomic_read(&pSMBFile->wrtPending) != 0) 610 while ((atomic_read(&pSMBFile->count) != 1)
647 && (timeout <= 2048)) { 611 && (timeout <= 2048)) {
648 /* Give write a better chance to get to 612 /* Give write a better chance to get to
649 server ahead of the close. We do not 613 server ahead of the close. We do not
@@ -657,8 +621,6 @@ int cifs_close(struct inode *inode, struct file *file)
657 msleep(timeout); 621 msleep(timeout);
658 timeout *= 4; 622 timeout *= 4;
659 } 623 }
660 if (atomic_read(&pSMBFile->wrtPending))
661 cERROR(1, ("close with pending write"));
662 if (!pTcon->need_reconnect && 624 if (!pTcon->need_reconnect &&
663 !pSMBFile->invalidHandle) 625 !pSMBFile->invalidHandle)
664 rc = CIFSSMBClose(xid, pTcon, 626 rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +643,7 @@ int cifs_close(struct inode *inode, struct file *file)
681 list_del(&pSMBFile->flist); 643 list_del(&pSMBFile->flist);
682 list_del(&pSMBFile->tlist); 644 list_del(&pSMBFile->tlist);
683 write_unlock(&GlobalSMBSeslock); 645 write_unlock(&GlobalSMBSeslock);
684 timeout = 10; 646 cifsFileInfo_put(file->private_data);
685 /* We waited above to give the SMBWrite a chance to issue
686 on the wire (so we do not get SMBWrite returning EBADF
687 if writepages is racing with close. Note that writepages
688 does not specify a file handle, so it is possible for a file
689 to be opened twice, and the application close the "wrong"
690 file handle - in these cases we delay long enough to allow
691 the SMBWrite to get on the wire before the SMB Close.
692 We allow total wait here over 45 seconds, more than
693 oplock break time, and more than enough to allow any write
694 to complete on the server, or to time out on the client */
695 while ((atomic_read(&pSMBFile->wrtPending) != 0)
696 && (timeout <= 50000)) {
697 cERROR(1, ("writes pending, delay free of handle"));
698 msleep(timeout);
699 timeout *= 8;
700 }
701 kfree(file->private_data);
702 file->private_data = NULL; 647 file->private_data = NULL;
703 } else 648 } else
704 rc = -EBADF; 649 rc = -EBADF;
@@ -1236,7 +1181,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1236 if (!open_file->invalidHandle) { 1181 if (!open_file->invalidHandle) {
1237 /* found a good file */ 1182 /* found a good file */
1238 /* lock it so it will not be closed on us */ 1183 /* lock it so it will not be closed on us */
1239 atomic_inc(&open_file->wrtPending); 1184 cifsFileInfo_get(open_file);
1240 read_unlock(&GlobalSMBSeslock); 1185 read_unlock(&GlobalSMBSeslock);
1241 return open_file; 1186 return open_file;
1242 } /* else might as well continue, and look for 1187 } /* else might as well continue, and look for
@@ -1276,7 +1221,7 @@ refind_writable:
1276 if (open_file->pfile && 1221 if (open_file->pfile &&
1277 ((open_file->pfile->f_flags & O_RDWR) || 1222 ((open_file->pfile->f_flags & O_RDWR) ||
1278 (open_file->pfile->f_flags & O_WRONLY))) { 1223 (open_file->pfile->f_flags & O_WRONLY))) {
1279 atomic_inc(&open_file->wrtPending); 1224 cifsFileInfo_get(open_file);
1280 1225
1281 if (!open_file->invalidHandle) { 1226 if (!open_file->invalidHandle) {
1282 /* found a good writable file */ 1227 /* found a good writable file */
@@ -1293,7 +1238,7 @@ refind_writable:
1293 else { /* start over in case this was deleted */ 1238 else { /* start over in case this was deleted */
1294 /* since the list could be modified */ 1239 /* since the list could be modified */
1295 read_lock(&GlobalSMBSeslock); 1240 read_lock(&GlobalSMBSeslock);
1296 atomic_dec(&open_file->wrtPending); 1241 cifsFileInfo_put(open_file);
1297 goto refind_writable; 1242 goto refind_writable;
1298 } 1243 }
1299 } 1244 }
@@ -1309,7 +1254,7 @@ refind_writable:
1309 read_lock(&GlobalSMBSeslock); 1254 read_lock(&GlobalSMBSeslock);
1310 /* can not use this handle, no write 1255 /* can not use this handle, no write
1311 pending on this one after all */ 1256 pending on this one after all */
1312 atomic_dec(&open_file->wrtPending); 1257 cifsFileInfo_put(open_file);
1313 1258
1314 if (open_file->closePend) /* list could have changed */ 1259 if (open_file->closePend) /* list could have changed */
1315 goto refind_writable; 1260 goto refind_writable;
@@ -1373,7 +1318,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1373 if (open_file) { 1318 if (open_file) {
1374 bytes_written = cifs_write(open_file->pfile, write_data, 1319 bytes_written = cifs_write(open_file->pfile, write_data,
1375 to-from, &offset); 1320 to-from, &offset);
1376 atomic_dec(&open_file->wrtPending); 1321 cifsFileInfo_put(open_file);
1377 /* Does mm or vfs already set times? */ 1322 /* Does mm or vfs already set times? */
1378 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1323 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
1379 if ((bytes_written > 0) && (offset)) 1324 if ((bytes_written > 0) && (offset))
@@ -1562,7 +1507,7 @@ retry:
1562 bytes_to_write, offset, 1507 bytes_to_write, offset,
1563 &bytes_written, iov, n_iov, 1508 &bytes_written, iov, n_iov,
1564 long_op); 1509 long_op);
1565 atomic_dec(&open_file->wrtPending); 1510 cifsFileInfo_put(open_file);
1566 cifs_update_eof(cifsi, offset, bytes_written); 1511 cifs_update_eof(cifsi, offset, bytes_written);
1567 1512
1568 if (rc || bytes_written < bytes_to_write) { 1513 if (rc || bytes_written < bytes_to_write) {
@@ -2329,6 +2274,73 @@ out:
2329 return rc; 2274 return rc;
2330} 2275}
2331 2276
2277static void
2278cifs_oplock_break(struct slow_work *work)
2279{
2280 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2281 oplock_break);
2282 struct inode *inode = cfile->pInode;
2283 struct cifsInodeInfo *cinode = CIFS_I(inode);
2284 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
2285 int rc, waitrc = 0;
2286
2287 if (inode && S_ISREG(inode->i_mode)) {
2288#ifdef CONFIG_CIFS_EXPERIMENTAL
2289 if (cinode->clientCanCacheAll == 0)
2290 break_lease(inode, FMODE_READ);
2291 else if (cinode->clientCanCacheRead == 0)
2292 break_lease(inode, FMODE_WRITE);
2293#endif
2294 rc = filemap_fdatawrite(inode->i_mapping);
2295 if (cinode->clientCanCacheRead == 0) {
2296 waitrc = filemap_fdatawait(inode->i_mapping);
2297 invalidate_remote_inode(inode);
2298 }
2299 if (!rc)
2300 rc = waitrc;
2301 if (rc)
2302 cinode->write_behind_rc = rc;
2303 cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
2304 }
2305
2306 /*
2307 * releasing stale oplock after recent reconnect of smb session using
2308 * a now incorrect file handle is not a data integrity issue but do
2309 * not bother sending an oplock release if session to server still is
2310 * disconnected since oplock already released by the server
2311 */
2312 if (!cfile->closePend && !cfile->oplock_break_cancelled) {
2313 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
2314 LOCKING_ANDX_OPLOCK_RELEASE, false);
2315 cFYI(1, ("Oplock release rc = %d", rc));
2316 }
2317}
2318
2319static int
2320cifs_oplock_break_get(struct slow_work *work)
2321{
2322 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2323 oplock_break);
2324 mntget(cfile->mnt);
2325 cifsFileInfo_get(cfile);
2326 return 0;
2327}
2328
2329static void
2330cifs_oplock_break_put(struct slow_work *work)
2331{
2332 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2333 oplock_break);
2334 mntput(cfile->mnt);
2335 cifsFileInfo_put(cfile);
2336}
2337
2338const struct slow_work_ops cifs_oplock_break_ops = {
2339 .get_ref = cifs_oplock_break_get,
2340 .put_ref = cifs_oplock_break_put,
2341 .execute = cifs_oplock_break,
2342};
2343
2332const struct address_space_operations cifs_addr_ops = { 2344const struct address_space_operations cifs_addr_ops = {
2333 .readpage = cifs_readpage, 2345 .readpage = cifs_readpage,
2334 .readpages = cifs_readpages, 2346 .readpages = cifs_readpages,
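file.c above moves the body of the old oplockd loop into cifs_oplock_break(), executed from the shared slow_work pool; the get_ref/put_ref callbacks pin both the vfsmount and the fileinfo for as long as a job is queued, so the execute callback may safely touch either. All the demultiplex thread has to do on receiving a break is queue the job, roughly as below (the actual enqueue site is in the misc.c hunk that follows, which this excerpt cuts off):

	/* on receipt of an oplock break aimed at netfile: */
	netfile->oplock_break_cancelled = false;
	slow_work_enqueue(&netfile->oplock_break);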
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82d83839655e..5e2492535daa 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -800,7 +800,7 @@ set_via_filehandle:
800 if (open_file == NULL) 800 if (open_file == NULL)
801 CIFSSMBClose(xid, pTcon, netfid); 801 CIFSSMBClose(xid, pTcon, netfid);
802 else 802 else
803 atomic_dec(&open_file->wrtPending); 803 cifsFileInfo_put(open_file);
804out: 804out:
805 return rc; 805 return rc;
806} 806}
@@ -1557,57 +1557,24 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
1557 1557
1558static int cifs_vmtruncate(struct inode *inode, loff_t offset) 1558static int cifs_vmtruncate(struct inode *inode, loff_t offset)
1559{ 1559{
1560 struct address_space *mapping = inode->i_mapping; 1560 loff_t oldsize;
1561 unsigned long limit; 1561 int err;
1562 1562
1563 spin_lock(&inode->i_lock); 1563 spin_lock(&inode->i_lock);
1564 if (inode->i_size < offset) 1564 err = inode_newsize_ok(inode, offset);
1565 goto do_expand; 1565 if (err) {
1566 /*
1567 * truncation of in-use swapfiles is disallowed - it would cause
1568 * subsequent swapout to scribble on the now-freed blocks.
1569 */
1570 if (IS_SWAPFILE(inode)) {
1571 spin_unlock(&inode->i_lock);
1572 goto out_busy;
1573 }
1574 i_size_write(inode, offset);
1575 spin_unlock(&inode->i_lock);
1576 /*
1577 * unmap_mapping_range is called twice, first simply for efficiency
1578 * so that truncate_inode_pages does fewer single-page unmaps. However
1579 * after this first call, and before truncate_inode_pages finishes,
1580 * it is possible for private pages to be COWed, which remain after
1581 * truncate_inode_pages finishes, hence the second unmap_mapping_range
1582 * call must be made for correctness.
1583 */
1584 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1585 truncate_inode_pages(mapping, offset);
1586 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1587 goto out_truncate;
1588
1589do_expand:
1590 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1591 if (limit != RLIM_INFINITY && offset > limit) {
1592 spin_unlock(&inode->i_lock); 1566 spin_unlock(&inode->i_lock);
1593 goto out_sig; 1567 goto out;
1594 }
1595 if (offset > inode->i_sb->s_maxbytes) {
1596 spin_unlock(&inode->i_lock);
1597 goto out_big;
1598 } 1568 }
1569
1570 oldsize = inode->i_size;
1599 i_size_write(inode, offset); 1571 i_size_write(inode, offset);
1600 spin_unlock(&inode->i_lock); 1572 spin_unlock(&inode->i_lock);
1601out_truncate: 1573 truncate_pagecache(inode, oldsize, offset);
1602 if (inode->i_op->truncate) 1574 if (inode->i_op->truncate)
1603 inode->i_op->truncate(inode); 1575 inode->i_op->truncate(inode);
1604 return 0; 1576out:
1605out_sig: 1577 return err;
1606 send_sig(SIGXFSZ, current, 0);
1607out_big:
1608 return -EFBIG;
1609out_busy:
1610 return -ETXTBSY;
1611} 1578}
1612 1579
1613static int 1580static int
@@ -1635,7 +1602,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1635 __u32 npid = open_file->pid; 1602 __u32 npid = open_file->pid;
1636 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1603 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1637 npid, false); 1604 npid, false);
1638 atomic_dec(&open_file->wrtPending); 1605 cifsFileInfo_put(open_file);
1639 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1606 cFYI(1, ("SetFSize for attrs rc = %d", rc));
1640 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1607 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1641 unsigned int bytes_written; 1608 unsigned int bytes_written;
@@ -1790,7 +1757,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1790 u16 nfid = open_file->netfid; 1757 u16 nfid = open_file->netfid;
1791 u32 npid = open_file->pid; 1758 u32 npid = open_file->pid;
1792 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 1759 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1793 atomic_dec(&open_file->wrtPending); 1760 cifsFileInfo_put(open_file);
1794 } else { 1761 } else {
1795 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, 1762 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1796 cifs_sb->local_nls, 1763 cifs_sb->local_nls,
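
The rewritten cifs_vmtruncate() trades roughly forty open-coded lines (the RLIMIT_FSIZE check with its SIGXFSZ signal, the s_maxbytes test, the swapfile guard, and the unmap/truncate/unmap sequence) for two VFS helpers that 2.6.32 introduces for exactly this purpose: inode_newsize_ok(), which performs the expansion-side limit checks and the in-use-swapfile test, and truncate_pagecache(), which encapsulates the double unmap_mapping_range() dance for the COW corner case documented in the deleted comment. Reassembled from the right-hand column above, the whole helper is now:

static int cifs_vmtruncate(struct inode *inode, loff_t offset)
{
	loff_t oldsize;
	int err;

	spin_lock(&inode->i_lock);
	err = inode_newsize_ok(inode, offset);	/* rlimit, s_maxbytes,
						   in-use swapfile */
	if (err) {
		spin_unlock(&inode->i_lock);
		goto out;
	}

	oldsize = inode->i_size;
	i_size_write(inode, offset);
	spin_unlock(&inode->i_lock);
	truncate_pagecache(inode, oldsize, offset);
	if (inode->i_op->truncate)
		inode->i_op->truncate(inode);
out:
	return err;
}
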
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index e079a9190ec4..0241b25ac33f 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -32,7 +32,6 @@
32 32
33extern mempool_t *cifs_sm_req_poolp; 33extern mempool_t *cifs_sm_req_poolp;
34extern mempool_t *cifs_req_poolp; 34extern mempool_t *cifs_req_poolp;
35extern struct task_struct *oplockThread;
36 35
37/* The xid serves as a useful identifier for each incoming vfs request, 36/* The xid serves as a useful identifier for each incoming vfs request,
38 in a similar way to the mid which is useful to track each sent smb, 37 in a similar way to the mid which is useful to track each sent smb,
@@ -500,6 +499,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
500 struct cifsTconInfo *tcon; 499 struct cifsTconInfo *tcon;
501 struct cifsInodeInfo *pCifsInode; 500 struct cifsInodeInfo *pCifsInode;
502 struct cifsFileInfo *netfile; 501 struct cifsFileInfo *netfile;
502 int rc;
503 503
504 cFYI(1, ("Checking for oplock break or dnotify response")); 504 cFYI(1, ("Checking for oplock break or dnotify response"));
505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -562,30 +562,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
562 continue; 562 continue;
563 563
564 cifs_stats_inc(&tcon->num_oplock_brks); 564 cifs_stats_inc(&tcon->num_oplock_brks);
565 write_lock(&GlobalSMBSeslock); 565 read_lock(&GlobalSMBSeslock);
566 list_for_each(tmp2, &tcon->openFileList) { 566 list_for_each(tmp2, &tcon->openFileList) {
567 netfile = list_entry(tmp2, struct cifsFileInfo, 567 netfile = list_entry(tmp2, struct cifsFileInfo,
568 tlist); 568 tlist);
569 if (pSMB->Fid != netfile->netfid) 569 if (pSMB->Fid != netfile->netfid)
570 continue; 570 continue;
571 571
572 write_unlock(&GlobalSMBSeslock); 572 /*
573 read_unlock(&cifs_tcp_ses_lock); 573 * don't do anything if file is about to be
574 * closed anyway.
575 */
576 if (netfile->closePend) {
577 read_unlock(&GlobalSMBSeslock);
578 read_unlock(&cifs_tcp_ses_lock);
579 return true;
580 }
581
574 cFYI(1, ("file id match, oplock break")); 582 cFYI(1, ("file id match, oplock break"));
575 pCifsInode = CIFS_I(netfile->pInode); 583 pCifsInode = CIFS_I(netfile->pInode);
576 pCifsInode->clientCanCacheAll = false; 584 pCifsInode->clientCanCacheAll = false;
577 if (pSMB->OplockLevel == 0) 585 if (pSMB->OplockLevel == 0)
578 pCifsInode->clientCanCacheRead = false; 586 pCifsInode->clientCanCacheRead = false;
579 pCifsInode->oplockPending = true; 587 rc = slow_work_enqueue(&netfile->oplock_break);
580 AllocOplockQEntry(netfile->pInode, 588 if (rc) {
581 netfile->netfid, tcon); 589 cERROR(1, ("failed to enqueue oplock "
582 cFYI(1, ("about to wake up oplock thread")); 590 "break: %d\n", rc));
583 if (oplockThread) 591 } else {
584 wake_up_process(oplockThread); 592 netfile->oplock_break_cancelled = false;
585 593 }
594 read_unlock(&GlobalSMBSeslock);
595 read_unlock(&cifs_tcp_ses_lock);
586 return true; 596 return true;
587 } 597 }
588 write_unlock(&GlobalSMBSeslock); 598 read_unlock(&GlobalSMBSeslock);
589 read_unlock(&cifs_tcp_ses_lock); 599 read_unlock(&cifs_tcp_ses_lock);
590 cFYI(1, ("No matching file for oplock break")); 600 cFYI(1, ("No matching file for oplock break"));
591 return true; 601 return true;
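
Two coordinated changes land in is_valid_oplock_break(): the scan of tcon->openFileList now takes GlobalSMBSeslock for read rather than write, which is safe because the loop only reads the list and, unlike before, never drops the lock mid-walk, and the wakeup of the dedicated oplock thread (deleted elsewhere in this series) becomes an enqueue of the per-file slow_work item. A sketch of the resulting shape, extending the hypothetical demo_file from the fs/cifs/file.c note with the fields this path needs:

#include <linux/list.h>
#include <linux/slow-work.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_file {
	struct list_head tlist;		/* linked off the tcon */
	__u16 netfid;			/* server file handle */
	bool oplock_break_cancelled;
	struct slow_work oplock_break;
};

static bool demo_dispatch_break(rwlock_t *lock, struct list_head *files,
				__u16 fid)
{
	struct demo_file *df;

	read_lock(lock);		/* readers may run concurrently */
	list_for_each_entry(df, files, tlist) {
		if (df->netfid != fid)
			continue;
		if (slow_work_enqueue(&df->oplock_break) == 0)
			df->oplock_break_cancelled = false;
		break;
	}
	read_unlock(lock);
	return true;	/* it was an oplock break, handled or not */
}

The closePend early return above serves the same purpose as cancelling: a file already on its way out must not get fresh work queued against it.
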
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f823a4a208a7..1f098ca71636 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -146,7 +146,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
146 } 146 }
147} 147}
148 148
149void 149static void
150cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, 150cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
151 struct cifs_sb_info *cifs_sb) 151 struct cifs_sb_info *cifs_sb)
152{ 152{
@@ -161,7 +161,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
161 cifs_fill_common_info(fattr, cifs_sb); 161 cifs_fill_common_info(fattr, cifs_sb);
162} 162}
163 163
164void 164static void
165cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, 165cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
166 struct cifs_sb_info *cifs_sb) 166 struct cifs_sb_info *cifs_sb)
167{ 167{
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a6..07b8e71544ee 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -103,57 +103,6 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
103 mempool_free(midEntry, cifs_mid_poolp); 103 mempool_free(midEntry, cifs_mid_poolp);
104} 104}
105 105
106struct oplock_q_entry *
107AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
108{
109 struct oplock_q_entry *temp;
110 if ((pinode == NULL) || (tcon == NULL)) {
111 cERROR(1, ("Null parms passed to AllocOplockQEntry"));
112 return NULL;
113 }
114 temp = (struct oplock_q_entry *) kmem_cache_alloc(cifs_oplock_cachep,
115 GFP_KERNEL);
116 if (temp == NULL)
117 return temp;
118 else {
119 temp->pinode = pinode;
120 temp->tcon = tcon;
121 temp->netfid = fid;
122 spin_lock(&GlobalMid_Lock);
123 list_add_tail(&temp->qhead, &GlobalOplock_Q);
124 spin_unlock(&GlobalMid_Lock);
125 }
126 return temp;
127
128}
129
130void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
131{
132 spin_lock(&GlobalMid_Lock);
133 /* should we check if list empty first? */
134 list_del(&oplockEntry->qhead);
135 spin_unlock(&GlobalMid_Lock);
136 kmem_cache_free(cifs_oplock_cachep, oplockEntry);
137}
138
139
140void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
141{
142 struct oplock_q_entry *temp;
143
144 if (tcon == NULL)
145 return;
146
147 spin_lock(&GlobalMid_Lock);
148 list_for_each_entry(temp, &GlobalOplock_Q, qhead) {
149 if ((temp->tcon) && (temp->tcon == tcon)) {
150 list_del(&temp->qhead);
151 kmem_cache_free(cifs_oplock_cachep, temp);
152 }
153 }
154 spin_unlock(&GlobalMid_Lock);
155}
156
157static int 106static int
158smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 107smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
159{ 108{
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 8ccd5ed81d9c..d99860a33890 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -2,6 +2,7 @@
2#define _CODA_INT_ 2#define _CODA_INT_
3 3
4struct dentry; 4struct dentry;
5struct file;
5 6
6extern struct file_system_type coda_fs_type; 7extern struct file_system_type coda_fs_type;
7extern unsigned long coda_timeout; 8extern unsigned long coda_timeout;
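
The added struct file; line is the whole fix: coda_int.h declares prototypes that take a struct file * (for example coda_fsync()) without pulling in <linux/fs.h>, and a pointer to an incomplete type is all a prototype needs. A sketch of the technique, using the coda_fsync() signature of this release:

struct dentry;
struct file;	/* incomplete types suffice for pointer parameters */

int coda_fsync(struct file *coda_file, struct dentry *coda_dentry,
	       int datasync);

This keeps the private header cheap to include and avoids dragging the VFS headers into every coda translation unit.
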
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0376ac66c44a..be4392ca2098 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,6 +22,7 @@
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/major.h> 23#include <linux/major.h>
24#include <linux/time.h> 24#include <linux/time.h>
25#include <linux/sched.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/ioport.h> 27#include <linux/ioport.h>
27#include <linux/fcntl.h> 28#include <linux/fcntl.h>
diff --git a/fs/compat.c b/fs/compat.c
index 94502dab972a..d576b552e8e2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -100,13 +100,6 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
100 get_compat_timespec(&tv[1], &t[1])) 100 get_compat_timespec(&tv[1], &t[1]))
101 return -EFAULT; 101 return -EFAULT;
102 102
103 if ((tv[0].tv_nsec == UTIME_OMIT || tv[0].tv_nsec == UTIME_NOW)
104 && tv[0].tv_sec != 0)
105 return -EINVAL;
106 if ((tv[1].tv_nsec == UTIME_OMIT || tv[1].tv_nsec == UTIME_NOW)
107 && tv[1].tv_sec != 0)
108 return -EINVAL;
109
110 if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) 103 if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
111 return 0; 104 return 0;
112 } 105 }
@@ -775,13 +768,13 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
775 char __user * type, unsigned long flags, 768 char __user * type, unsigned long flags,
776 void __user * data) 769 void __user * data)
777{ 770{
778 unsigned long type_page; 771 char *kernel_type;
779 unsigned long data_page; 772 unsigned long data_page;
780 unsigned long dev_page; 773 char *kernel_dev;
781 char *dir_page; 774 char *dir_page;
782 int retval; 775 int retval;
783 776
784 retval = copy_mount_options (type, &type_page); 777 retval = copy_mount_string(type, &kernel_type);
785 if (retval < 0) 778 if (retval < 0)
786 goto out; 779 goto out;
787 780
@@ -790,38 +783,38 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
790 if (IS_ERR(dir_page)) 783 if (IS_ERR(dir_page))
791 goto out1; 784 goto out1;
792 785
793 retval = copy_mount_options (dev_name, &dev_page); 786 retval = copy_mount_string(dev_name, &kernel_dev);
794 if (retval < 0) 787 if (retval < 0)
795 goto out2; 788 goto out2;
796 789
797 retval = copy_mount_options (data, &data_page); 790 retval = copy_mount_options(data, &data_page);
798 if (retval < 0) 791 if (retval < 0)
799 goto out3; 792 goto out3;
800 793
801 retval = -EINVAL; 794 retval = -EINVAL;
802 795
803 if (type_page && data_page) { 796 if (kernel_type && data_page) {
804 if (!strcmp((char *)type_page, SMBFS_NAME)) { 797 if (!strcmp(kernel_type, SMBFS_NAME)) {
805 do_smb_super_data_conv((void *)data_page); 798 do_smb_super_data_conv((void *)data_page);
806 } else if (!strcmp((char *)type_page, NCPFS_NAME)) { 799 } else if (!strcmp(kernel_type, NCPFS_NAME)) {
807 do_ncp_super_data_conv((void *)data_page); 800 do_ncp_super_data_conv((void *)data_page);
808 } else if (!strcmp((char *)type_page, NFS4_NAME)) { 801 } else if (!strcmp(kernel_type, NFS4_NAME)) {
809 if (do_nfs4_super_data_conv((void *) data_page)) 802 if (do_nfs4_super_data_conv((void *) data_page))
810 goto out4; 803 goto out4;
811 } 804 }
812 } 805 }
813 806
814 retval = do_mount((char*)dev_page, dir_page, (char*)type_page, 807 retval = do_mount(kernel_dev, dir_page, kernel_type,
815 flags, (void*)data_page); 808 flags, (void*)data_page);
816 809
817 out4: 810 out4:
818 free_page(data_page); 811 free_page(data_page);
819 out3: 812 out3:
820 free_page(dev_page); 813 kfree(kernel_dev);
821 out2: 814 out2:
822 putname(dir_page); 815 putname(dir_page);
823 out1: 816 out1:
824 free_page(type_page); 817 kfree(kernel_type);
825 out: 818 out:
826 return retval; 819 return retval;
827} 820}
@@ -1485,20 +1478,15 @@ int compat_do_execve(char * filename,
1485 if (!bprm) 1478 if (!bprm)
1486 goto out_files; 1479 goto out_files;
1487 1480
1488 retval = -ERESTARTNOINTR; 1481 retval = prepare_bprm_creds(bprm);
1489 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1482 if (retval)
1490 goto out_free; 1483 goto out_free;
1491 current->in_execve = 1;
1492
1493 retval = -ENOMEM;
1494 bprm->cred = prepare_exec_creds();
1495 if (!bprm->cred)
1496 goto out_unlock;
1497 1484
1498 retval = check_unsafe_exec(bprm); 1485 retval = check_unsafe_exec(bprm);
1499 if (retval < 0) 1486 if (retval < 0)
1500 goto out_unlock; 1487 goto out_free;
1501 clear_in_exec = retval; 1488 clear_in_exec = retval;
1489 current->in_execve = 1;
1502 1490
1503 file = open_exec(filename); 1491 file = open_exec(filename);
1504 retval = PTR_ERR(file); 1492 retval = PTR_ERR(file);
@@ -1547,7 +1535,6 @@ int compat_do_execve(char * filename,
1547 /* execve succeeded */ 1535 /* execve succeeded */
1548 current->fs->in_exec = 0; 1536 current->fs->in_exec = 0;
1549 current->in_execve = 0; 1537 current->in_execve = 0;
1550 mutex_unlock(&current->cred_guard_mutex);
1551 acct_update_integrals(current); 1538 acct_update_integrals(current);
1552 free_bprm(bprm); 1539 free_bprm(bprm);
1553 if (displaced) 1540 if (displaced)
@@ -1567,10 +1554,7 @@ out_file:
1567out_unmark: 1554out_unmark:
1568 if (clear_in_exec) 1555 if (clear_in_exec)
1569 current->fs->in_exec = 0; 1556 current->fs->in_exec = 0;
1570
1571out_unlock:
1572 current->in_execve = 0; 1557 current->in_execve = 0;
1573 mutex_unlock(&current->cred_guard_mutex);
1574 1558
1575out_free: 1559out_free:
1576 free_bprm(bprm); 1560 free_bprm(bprm);
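
compat_sys_mount() now mirrors the native path: the filesystem type and device name are copied with copy_mount_string() (a kmalloc'd, length-checked string released with kfree(), declared in fs/internal.h in this tree), while only the opaque data blob still goes through the page-sized copy_mount_options(). A hedged sketch of the resulting ownership rules, with demo_mount_copy() standing in for the syscall body:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>

/* both declared in fs/internal.h */
int copy_mount_string(const void __user *data, char **where);
int copy_mount_options(const void __user *data, unsigned long *where);

static long demo_mount_copy(char __user *type, void __user *data)
{
	char *kernel_type;
	unsigned long data_page;
	long rc;

	rc = copy_mount_string(type, &kernel_type); /* NULL user ptr is OK */
	if (rc < 0)
		return rc;

	rc = copy_mount_options(data, &data_page);  /* one full page */
	if (rc < 0)
		goto out_type;

	/* hand kernel_type / (void *)data_page to do_mount() here */

	free_page(data_page);	/* a zero address is silently ignored */
out_type:
	kfree(kernel_type);	/* as is kfree(NULL) */
	return rc;
}

Besides being smaller, this stops burning a whole page apiece on what are short NUL-terminated strings; only the data blob, whose format is filesystem-defined, still needs one.
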
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
51}; 51};
52 52
53static struct backing_dev_info configfs_backing_dev_info = { 53static struct backing_dev_info configfs_backing_dev_info = {
54 .name = "configfs",
54 .ra_pages = 0, /* No readahead */ 55 .ra_pages = 0, /* No readahead */
55 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
56}; 57};
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h>
35#include "internal.h" 36#include "internal.h"
36 37
37int sysctl_vfs_cache_pressure __read_mostly = 100; 38int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 75efb028974b..d5f8c96964be 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -18,14 +18,13 @@
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/tty.h> 19#include <linux/tty.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/magic.h>
21#include <linux/idr.h> 22#include <linux/idr.h>
22#include <linux/devpts_fs.h> 23#include <linux/devpts_fs.h>
23#include <linux/parser.h> 24#include <linux/parser.h>
24#include <linux/fsnotify.h> 25#include <linux/fsnotify.h>
25#include <linux/seq_file.h> 26#include <linux/seq_file.h>
26 27
27#define DEVPTS_SUPER_MAGIC 0x1cd1
28
29#define DEVPTS_DEFAULT_MODE 0600 28#define DEVPTS_DEFAULT_MODE 0600
30/* 29/*
31 * ptmx is a new node in /dev/pts and will be unused in legacy (single- 30 * ptmx is a new node in /dev/pts and will be unused in legacy (single-
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1d1d27442235..1c8bb8c3a82e 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -386,9 +386,9 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
386 return rv; 386 return rv;
387} 387}
388 388
389static struct seq_operations format1_seq_ops; 389static const struct seq_operations format1_seq_ops;
390static struct seq_operations format2_seq_ops; 390static const struct seq_operations format2_seq_ops;
391static struct seq_operations format3_seq_ops; 391static const struct seq_operations format3_seq_ops;
392 392
393static void *table_seq_start(struct seq_file *seq, loff_t *pos) 393static void *table_seq_start(struct seq_file *seq, loff_t *pos)
394{ 394{
@@ -534,21 +534,21 @@ static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
534 } 534 }
535} 535}
536 536
537static struct seq_operations format1_seq_ops = { 537static const struct seq_operations format1_seq_ops = {
538 .start = table_seq_start, 538 .start = table_seq_start,
539 .next = table_seq_next, 539 .next = table_seq_next,
540 .stop = table_seq_stop, 540 .stop = table_seq_stop,
541 .show = table_seq_show, 541 .show = table_seq_show,
542}; 542};
543 543
544static struct seq_operations format2_seq_ops = { 544static const struct seq_operations format2_seq_ops = {
545 .start = table_seq_start, 545 .start = table_seq_start,
546 .next = table_seq_next, 546 .next = table_seq_next,
547 .stop = table_seq_stop, 547 .stop = table_seq_stop,
548 .show = table_seq_show, 548 .show = table_seq_show,
549}; 549};
550 550
551static struct seq_operations format3_seq_ops = { 551static const struct seq_operations format3_seq_ops = {
552 .start = table_seq_start, 552 .start = table_seq_start,
553 .next = table_seq_next, 553 .next = table_seq_next,
554 .stop = table_seq_stop, 554 .stop = table_seq_stop,
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..240cef14fe58 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
106#define CF_CONNECT_PENDING 3 106#define CF_CONNECT_PENDING 3
107#define CF_INIT_PENDING 4 107#define CF_INIT_PENDING 4
108#define CF_IS_OTHERCON 5 108#define CF_IS_OTHERCON 5
109#define CF_CLOSE 6
109 struct list_head writequeue; /* List of outgoing writequeue_entries */ 110 struct list_head writequeue; /* List of outgoing writequeue_entries */
110 spinlock_t writequeue_lock; 111 spinlock_t writequeue_lock;
111 int (*rx_action) (struct connection *); /* What to do when active */ 112 int (*rx_action) (struct connection *); /* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
299 300
300static inline void lowcomms_connect_sock(struct connection *con) 301static inline void lowcomms_connect_sock(struct connection *con)
301{ 302{
303 if (test_bit(CF_CLOSE, &con->flags))
304 return;
302 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) 305 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
303 queue_work(send_workqueue, &con->swork); 306 queue_work(send_workqueue, &con->swork);
304} 307}
@@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con)
926 goto out_err; 929 goto out_err;
927 930
928 memset(&saddr, 0, sizeof(saddr)); 931 memset(&saddr, 0, sizeof(saddr));
929 if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { 932 if (dlm_nodeid_to_addr(con->nodeid, &saddr))
930 sock_release(sock);
931 goto out_err; 933 goto out_err;
932 }
933 934
934 sock->sk->sk_user_data = con; 935 sock->sk->sk_user_data = con;
935 con->rx_action = receive_from_sock; 936 con->rx_action = receive_from_sock;
@@ -1284,7 +1285,6 @@ out:
1284static void send_to_sock(struct connection *con) 1285static void send_to_sock(struct connection *con)
1285{ 1286{
1286 int ret = 0; 1287 int ret = 0;
1287 ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
1288 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1288 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1289 struct writequeue_entry *e; 1289 struct writequeue_entry *e;
1290 int len, offset; 1290 int len, offset;
@@ -1293,8 +1293,6 @@ static void send_to_sock(struct connection *con)
1293 if (con->sock == NULL) 1293 if (con->sock == NULL)
1294 goto out_connect; 1294 goto out_connect;
1295 1295
1296 sendpage = con->sock->ops->sendpage;
1297
1298 spin_lock(&con->writequeue_lock); 1296 spin_lock(&con->writequeue_lock);
1299 for (;;) { 1297 for (;;) {
1300 e = list_entry(con->writequeue.next, struct writequeue_entry, 1298 e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1309,8 +1307,8 @@ static void send_to_sock(struct connection *con)
1309 1307
1310 ret = 0; 1308 ret = 0;
1311 if (len) { 1309 if (len) {
1312 ret = sendpage(con->sock, e->page, offset, len, 1310 ret = kernel_sendpage(con->sock, e->page, offset, len,
1313 msg_flags); 1311 msg_flags);
1314 if (ret == -EAGAIN || ret == 0) { 1312 if (ret == -EAGAIN || ret == 0) {
1315 cond_resched(); 1313 cond_resched();
1316 goto out; 1314 goto out;
@@ -1370,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid)
1370 log_print("closing connection to node %d", nodeid); 1368 log_print("closing connection to node %d", nodeid);
1371 con = nodeid2con(nodeid, 0); 1369 con = nodeid2con(nodeid, 0);
1372 if (con) { 1370 if (con) {
1371 clear_bit(CF_CONNECT_PENDING, &con->flags);
1372 clear_bit(CF_WRITE_PENDING, &con->flags);
1373 set_bit(CF_CLOSE, &con->flags);
1374 if (cancel_work_sync(&con->swork))
1375 log_print("canceled swork for node %d", nodeid);
1376 if (cancel_work_sync(&con->rwork))
1377 log_print("canceled rwork for node %d", nodeid);
1373 clean_one_writequeue(con); 1378 clean_one_writequeue(con);
1374 close_connection(con, true); 1379 close_connection(con, true);
1375 } 1380 }
@@ -1395,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work)
1395 1400
1396 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 1401 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
1397 con->connect_action(con); 1402 con->connect_action(con);
1403 set_bit(CF_WRITE_PENDING, &con->flags);
1398 } 1404 }
1399 clear_bit(CF_WRITE_PENDING, &con->flags); 1405 if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
1400 send_to_sock(con); 1406 send_to_sock(con);
1401} 1407}
1402 1408
1403 1409
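
dlm_lowcomms_close() previously freed the write queue and closed the socket while the connection's work items could still be running or getting requeued. The new CF_CLOSE flag plus cancel_work_sync() impose an ordering; a sketch of the pattern with a stand-in demo_connection (bit numbers illustrative, not the driver's):

#include <linux/workqueue.h>

struct demo_connection {
	unsigned long flags;
	struct work_struct swork;	/* send work */
	struct work_struct rwork;	/* receive work */
};

#define DCF_CONNECT_PENDING	3
#define DCF_WRITE_PENDING	1
#define DCF_CLOSE		6

static void demo_close(struct demo_connection *con)
{
	clear_bit(DCF_CONNECT_PENDING, &con->flags);
	clear_bit(DCF_WRITE_PENDING, &con->flags);
	set_bit(DCF_CLOSE, &con->flags); /* connect path now refuses */
	cancel_work_sync(&con->swork);	 /* wait out in-flight sends */
	cancel_work_sync(&con->rwork);	 /* and receives */
	/* only now is it safe to free the write queue and release the
	 * socket: nothing can requeue against this connection */
}

The same file also drops the cached con->sock->ops->sendpage pointer in favour of kernel_sendpage(), which dereferences the op at call time and falls back to sock_no_sendpage() for protocols without one, and process_send_sockets() now calls send_to_sock() only when CF_WRITE_PENDING was actually set instead of clearing the bit unconditionally.
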
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462d..55ea369f43a9 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
63 return rv; 63 return rv;
64 } 64 }
65 65
66 return genlmsg_unicast(skb, listener_nlpid); 66 return genlmsg_unicast(&init_net, skb, listener_nlpid);
67} 67}
68 68
69static int user_cmd(struct sk_buff *skb, struct genl_info *info) 69static int user_cmd(struct sk_buff *skb, struct genl_info *info)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index a2edb7913447..31f4b0e6d72c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -63,9 +63,9 @@ static void drop_slab(void)
63} 63}
64 64
65int drop_caches_sysctl_handler(ctl_table *table, int write, 65int drop_caches_sysctl_handler(ctl_table *table, int write,
66 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 66 void __user *buffer, size_t *length, loff_t *ppos)
67{ 67{
68 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 68 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 69 if (write) {
70 if (sysctl_drop_caches & 1) 70 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 71 drop_pagecache();
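
This hunk is one small piece of the tree-wide 2.6.32 cleanup that removes the never-used struct file * argument from sysctl proc handlers; proc_dointvec_minmax() and the other proc_do* helpers lose the parameter in lockstep. Any handler maintained out of tree follows the same mechanical conversion, sketched here with a hypothetical demo handler:

#include <linux/sysctl.h>

/* old: (ctl_table *, int, struct file *, void __user *, size_t *, loff_t *) */
static int demo_sysctl_handler(ctl_table *table, int write,
			       void __user *buffer, size_t *length,
			       loff_t *ppos)
{
	proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (write) {
		/* react to the newly written value here */
	}
	return 0;
}
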
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 0c754e64232b..8aadb99b7634 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,8 @@
1config ECRYPT_FS 1config ECRYPT_FS
2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)" 2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
3 depends on EXPERIMENTAL && KEYS && CRYPTO && NET 3 depends on EXPERIMENTAL && KEYS && NET
4 select CRYPTO_ECB
5 select CRYPTO_CBC
4 help 6 help
5 Encrypted filesystem that operates on the VFS layer. See 7 Encrypted filesystem that operates on the VFS layer. See
6 <file:Documentation/filesystems/ecryptfs.txt> to learn more about 8 <file:Documentation/filesystems/ecryptfs.txt> to learn more about
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b91851f1cda3..fbb6e5eed697 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -245,13 +245,11 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
245 crypto_free_blkcipher(crypt_stat->tfm); 245 crypto_free_blkcipher(crypt_stat->tfm);
246 if (crypt_stat->hash_tfm) 246 if (crypt_stat->hash_tfm)
247 crypto_free_hash(crypt_stat->hash_tfm); 247 crypto_free_hash(crypt_stat->hash_tfm);
248 mutex_lock(&crypt_stat->keysig_list_mutex);
249 list_for_each_entry_safe(key_sig, key_sig_tmp, 248 list_for_each_entry_safe(key_sig, key_sig_tmp,
250 &crypt_stat->keysig_list, crypt_stat_list) { 249 &crypt_stat->keysig_list, crypt_stat_list) {
251 list_del(&key_sig->crypt_stat_list); 250 list_del(&key_sig->crypt_stat_list);
252 kmem_cache_free(ecryptfs_key_sig_cache, key_sig); 251 kmem_cache_free(ecryptfs_key_sig_cache, key_sig);
253 } 252 }
254 mutex_unlock(&crypt_stat->keysig_list_mutex);
255 memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); 253 memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
256} 254}
257 255
@@ -511,13 +509,14 @@ int ecryptfs_encrypt_page(struct page *page)
511 + extent_offset), crypt_stat); 509 + extent_offset), crypt_stat);
512 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, 510 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
513 offset, crypt_stat->extent_size); 511 offset, crypt_stat->extent_size);
514 if (rc) { 512 if (rc < 0) {
515 ecryptfs_printk(KERN_ERR, "Error attempting " 513 ecryptfs_printk(KERN_ERR, "Error attempting "
516 "to write lower page; rc = [%d]" 514 "to write lower page; rc = [%d]"
517 "\n", rc); 515 "\n", rc);
518 goto out; 516 goto out;
519 } 517 }
520 } 518 }
519 rc = 0;
521out: 520out:
522 if (enc_extent_page) { 521 if (enc_extent_page) {
523 kunmap(enc_extent_page); 522 kunmap(enc_extent_page);
@@ -633,7 +632,7 @@ int ecryptfs_decrypt_page(struct page *page)
633 rc = ecryptfs_read_lower(enc_extent_virt, offset, 632 rc = ecryptfs_read_lower(enc_extent_virt, offset,
634 crypt_stat->extent_size, 633 crypt_stat->extent_size,
635 ecryptfs_inode); 634 ecryptfs_inode);
636 if (rc) { 635 if (rc < 0) {
637 ecryptfs_printk(KERN_ERR, "Error attempting " 636 ecryptfs_printk(KERN_ERR, "Error attempting "
638 "to read lower page; rc = [%d]" 637 "to read lower page; rc = [%d]"
639 "\n", rc); 638 "\n", rc);
@@ -797,6 +796,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
797 kfree(full_alg_name); 796 kfree(full_alg_name);
798 if (IS_ERR(crypt_stat->tfm)) { 797 if (IS_ERR(crypt_stat->tfm)) {
799 rc = PTR_ERR(crypt_stat->tfm); 798 rc = PTR_ERR(crypt_stat->tfm);
799 crypt_stat->tfm = NULL;
800 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " 800 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
801 "Error initializing cipher [%s]\n", 801 "Error initializing cipher [%s]\n",
802 crypt_stat->cipher); 802 crypt_stat->cipher);
@@ -925,7 +925,9 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
925 struct ecryptfs_global_auth_tok *global_auth_tok; 925 struct ecryptfs_global_auth_tok *global_auth_tok;
926 int rc = 0; 926 int rc = 0;
927 927
928 mutex_lock(&crypt_stat->keysig_list_mutex);
928 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 929 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
930
929 list_for_each_entry(global_auth_tok, 931 list_for_each_entry(global_auth_tok,
930 &mount_crypt_stat->global_auth_tok_list, 932 &mount_crypt_stat->global_auth_tok_list,
931 mount_crypt_stat_list) { 933 mount_crypt_stat_list) {
@@ -934,13 +936,13 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
934 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); 936 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
935 if (rc) { 937 if (rc) {
936 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); 938 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
937 mutex_unlock(
938 &mount_crypt_stat->global_auth_tok_list_mutex);
939 goto out; 939 goto out;
940 } 940 }
941 } 941 }
942 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 942
943out: 943out:
944 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
945 mutex_unlock(&crypt_stat->keysig_list_mutex);
944 return rc; 946 return rc;
945} 947}
946 948
@@ -1212,14 +1214,15 @@ int ecryptfs_read_and_validate_header_region(char *data,
1212 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; 1214 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
1213 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, 1215 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
1214 ecryptfs_inode); 1216 ecryptfs_inode);
1215 if (rc) { 1217 if (rc < 0) {
1216 printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n", 1218 printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
1217 __func__, rc); 1219 __func__, rc);
1218 goto out; 1220 goto out;
1219 } 1221 }
1220 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { 1222 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
1221 rc = -EINVAL; 1223 rc = -EINVAL;
1222 } 1224 } else
1225 rc = 0;
1223out: 1226out:
1224 return rc; 1227 return rc;
1225} 1228}
@@ -1314,10 +1317,11 @@ ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
1314 1317
1315 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, 1318 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
1316 0, virt_len); 1319 0, virt_len);
1317 if (rc) 1320 if (rc < 0)
1318 printk(KERN_ERR "%s: Error attempting to write header " 1321 printk(KERN_ERR "%s: Error attempting to write header "
1319 "information to lower file; rc = [%d]\n", __func__, 1322 "information to lower file; rc = [%d]\n", __func__, rc);
1320 rc); 1323 else
1324 rc = 0;
1321 return rc; 1325 return rc;
1322} 1326}
1323 1327
@@ -1597,7 +1601,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1597 } 1601 }
1598 rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, 1602 rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size,
1599 ecryptfs_inode); 1603 ecryptfs_inode);
1600 if (!rc) 1604 if (rc >= 0)
1601 rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, 1605 rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
1602 ecryptfs_dentry, 1606 ecryptfs_dentry,
1603 ECRYPTFS_VALIDATE_HEADER_SIZE); 1607 ECRYPTFS_VALIDATE_HEADER_SIZE);
@@ -1702,7 +1706,7 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
1702 } else { 1706 } else {
1703 printk(KERN_ERR "%s: No support for requested filename " 1707 printk(KERN_ERR "%s: No support for requested filename "
1704 "encryption method in this release\n", __func__); 1708 "encryption method in this release\n", __func__);
1705 rc = -ENOTSUPP; 1709 rc = -EOPNOTSUPP;
1706 goto out; 1710 goto out;
1707 } 1711 }
1708out: 1712out:
@@ -1763,7 +1767,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1763 if (IS_ERR(*key_tfm)) { 1767 if (IS_ERR(*key_tfm)) {
1764 rc = PTR_ERR(*key_tfm); 1768 rc = PTR_ERR(*key_tfm);
1765 printk(KERN_ERR "Unable to allocate crypto cipher with name " 1769 printk(KERN_ERR "Unable to allocate crypto cipher with name "
1766 "[%s]; rc = [%d]\n", cipher_name, rc); 1770 "[%s]; rc = [%d]\n", full_alg_name, rc);
1767 goto out; 1771 goto out;
1768 } 1772 }
1769 crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); 1773 crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
@@ -1776,7 +1780,8 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1776 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); 1780 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
1777 if (rc) { 1781 if (rc) {
1778 printk(KERN_ERR "Error attempting to set key of size [%zd] for " 1782 printk(KERN_ERR "Error attempting to set key of size [%zd] for "
1779 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); 1783 "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
1784 rc);
1780 rc = -EINVAL; 1785 rc = -EINVAL;
1781 goto out; 1786 goto out;
1782 } 1787 }
@@ -2166,7 +2171,7 @@ int ecryptfs_encrypt_and_encode_filename(
2166 (*encoded_name)[(*encoded_name_size)] = '\0'; 2171 (*encoded_name)[(*encoded_name_size)] = '\0';
2167 (*encoded_name_size)++; 2172 (*encoded_name_size)++;
2168 } else { 2173 } else {
2169 rc = -ENOTSUPP; 2174 rc = -EOPNOTSUPP;
2170 } 2175 }
2171 if (rc) { 2176 if (rc) {
2172 printk(KERN_ERR "%s: Error attempting to encode " 2177 printk(KERN_ERR "%s: Error attempting to encode "
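
Most hunks in this file are one conversion applied many times: ecryptfs_read_lower() and ecryptfs_write_lower() now return a byte count (see the fs/ecryptfs/read_write.c diff further down), so failure tests tighten from if (rc) to if (rc < 0), and success paths explicitly fold the count back to 0 before returning through interfaces that still promise zero on success. A hypothetical helper showing the convention:

#include <linux/fs.h>
#include <linux/kernel.h>

/* declared in fs/ecryptfs/ecryptfs_kernel.h */
int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
			 loff_t offset, size_t size);

static int demo_write_exact(struct inode *ecryptfs_inode, char *virt,
			    loff_t offset, size_t size)
{
	int rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);

	if (rc < 0)	/* only negative values are errors now */
		return rc;
	return 0;	/* fold the byte count back into "success" */
}

Two smaller fixes ride along: crypt_stat->tfm is reset to NULL after a failed allocation so the error path cannot free a stale pointer, and -ENOTSUPP (a kernel-internal NFSv3 value that userspace cannot decode) becomes the standard -EOPNOTSUPP.
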
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 00b30a2d5466..542f625312f3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -582,7 +582,7 @@ extern const struct inode_operations ecryptfs_dir_iops;
582extern const struct inode_operations ecryptfs_symlink_iops; 582extern const struct inode_operations ecryptfs_symlink_iops;
583extern const struct super_operations ecryptfs_sops; 583extern const struct super_operations ecryptfs_sops;
584extern const struct dentry_operations ecryptfs_dops; 584extern const struct dentry_operations ecryptfs_dops;
585extern struct address_space_operations ecryptfs_aops; 585extern const struct address_space_operations ecryptfs_aops;
586extern int ecryptfs_verbosity; 586extern int ecryptfs_verbosity;
587extern unsigned int ecryptfs_message_buf_len; 587extern unsigned int ecryptfs_message_buf_len;
588extern signed long ecryptfs_message_wait_timeout; 588extern signed long ecryptfs_message_wait_timeout;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 2f0945d63297..056fed62d0de 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -476,6 +476,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
476 struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); 476 struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
477 struct dentry *lower_dir_dentry; 477 struct dentry *lower_dir_dentry;
478 478
479 dget(lower_dentry);
479 lower_dir_dentry = lock_parent(lower_dentry); 480 lower_dir_dentry = lock_parent(lower_dentry);
480 rc = vfs_unlink(lower_dir_inode, lower_dentry); 481 rc = vfs_unlink(lower_dir_inode, lower_dentry);
481 if (rc) { 482 if (rc) {
@@ -489,6 +490,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
489 d_drop(dentry); 490 d_drop(dentry);
490out_unlock: 491out_unlock:
491 unlock_dir(lower_dir_dentry); 492 unlock_dir(lower_dir_dentry);
493 dput(lower_dentry);
492 return rc; 494 return rc;
493} 495}
494 496
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 259525c9abb8..a0a7847567e9 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -416,7 +416,9 @@ ecryptfs_find_global_auth_tok_for_sig(
416 &mount_crypt_stat->global_auth_tok_list, 416 &mount_crypt_stat->global_auth_tok_list,
417 mount_crypt_stat_list) { 417 mount_crypt_stat_list) {
418 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { 418 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
419 (*global_auth_tok) = walker; 419 rc = key_validate(walker->global_auth_tok_key);
420 if (!rc)
421 (*global_auth_tok) = walker;
420 goto out; 422 goto out;
421 } 423 }
422 } 424 }
@@ -612,7 +614,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
612 } 614 }
613 /* TODO: Support other key modules than passphrase for 615 /* TODO: Support other key modules than passphrase for
614 * filename encryption */ 616 * filename encryption */
615 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); 617 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
618 rc = -EOPNOTSUPP;
619 printk(KERN_INFO "%s: Filename encryption only supports "
620 "password tokens\n", __func__);
621 goto out_free_unlock;
622 }
616 sg_init_one( 623 sg_init_one(
617 &s->hash_sg, 624 &s->hash_sg,
618 (u8 *)s->auth_tok->token.password.session_key_encryption_key, 625 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
@@ -910,7 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
910 } 917 }
911 /* TODO: Support other key modules than passphrase for 918 /* TODO: Support other key modules than passphrase for
912 * filename encryption */ 919 * filename encryption */
913 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); 920 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
921 rc = -EOPNOTSUPP;
922 printk(KERN_INFO "%s: Filename encryption only supports "
923 "password tokens\n", __func__);
924 goto out_free_unlock;
925 }
914 rc = crypto_blkcipher_setkey( 926 rc = crypto_blkcipher_setkey(
915 s->desc.tfm, 927 s->desc.tfm,
916 s->auth_tok->token.password.session_key_encryption_key, 928 s->auth_tok->token.password.session_key_encryption_key,
@@ -1316,8 +1328,10 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
1316 rc = -EINVAL; 1328 rc = -EINVAL;
1317 goto out_free; 1329 goto out_free;
1318 } 1330 }
1319 ecryptfs_cipher_code_to_string(crypt_stat->cipher, 1331 rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher,
1320 (u16)data[(*packet_size)]); 1332 (u16)data[(*packet_size)]);
1333 if (rc)
1334 goto out_free;
1321 /* A little extra work to differentiate among the AES key 1335 /* A little extra work to differentiate among the AES key
1322 * sizes; see RFC2440 */ 1336 * sizes; see RFC2440 */
1323 switch(data[(*packet_size)++]) { 1337 switch(data[(*packet_size)++]) {
@@ -1328,7 +1342,9 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
1328 crypt_stat->key_size = 1342 crypt_stat->key_size =
1329 (*new_auth_tok)->session_key.encrypted_key_size; 1343 (*new_auth_tok)->session_key.encrypted_key_size;
1330 } 1344 }
1331 ecryptfs_init_crypt_ctx(crypt_stat); 1345 rc = ecryptfs_init_crypt_ctx(crypt_stat);
1346 if (rc)
1347 goto out_free;
1332 if (unlikely(data[(*packet_size)++] != 0x03)) { 1348 if (unlikely(data[(*packet_size)++] != 0x03)) {
1333 printk(KERN_WARNING "Only S2K ID 3 is currently supported\n"); 1349 printk(KERN_WARNING "Only S2K ID 3 is currently supported\n");
1334 rc = -ENOSYS; 1350 rc = -ENOSYS;
@@ -2366,21 +2382,18 @@ struct kmem_cache *ecryptfs_key_sig_cache;
2366int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) 2382int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
2367{ 2383{
2368 struct ecryptfs_key_sig *new_key_sig; 2384 struct ecryptfs_key_sig *new_key_sig;
2369 int rc = 0;
2370 2385
2371 new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); 2386 new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL);
2372 if (!new_key_sig) { 2387 if (!new_key_sig) {
2373 rc = -ENOMEM;
2374 printk(KERN_ERR 2388 printk(KERN_ERR
2375 "Error allocating from ecryptfs_key_sig_cache\n"); 2389 "Error allocating from ecryptfs_key_sig_cache\n");
2376 goto out; 2390 return -ENOMEM;
2377 } 2391 }
2378 memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); 2392 memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
2379 mutex_lock(&crypt_stat->keysig_list_mutex); 2393 /* Caller must hold keysig_list_mutex */
2380 list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); 2394 list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list);
2381 mutex_unlock(&crypt_stat->keysig_list_mutex); 2395
2382out: 2396 return 0;
2383 return rc;
2384} 2397}
2385 2398
2386struct kmem_cache *ecryptfs_global_auth_tok_cache; 2399struct kmem_cache *ecryptfs_global_auth_tok_cache;
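
Three separate hardenings here: cached global auth toks are re-checked with key_validate() so a revoked or expired key is no longer handed out, previously ignored return codes from ecryptfs_cipher_code_to_string() and ecryptfs_init_crypt_ctx() now abort tag 3 parsing, and an unsupported token type in the tag 70 paths fails the request instead of BUG_ON()-ing the box. The last one is the important idiom: BUG_ON() asserts kernel invariants, while a token type parsed from a packet is input. A hypothetical check in that style (assuming the ECRYPTFS_PASSWORD constant from ecryptfs_kernel.h):

#include <linux/errno.h>
#include <linux/kernel.h>

static int demo_check_token_type(int token_type)
{
	if (token_type != ECRYPTFS_PASSWORD) {
		printk(KERN_INFO "%s: only password tokens are supported\n",
		       __func__);
		return -EOPNOTSUPP;	/* recoverable input error */
	}
	return 0;
}

ecryptfs_add_keysig() also stops taking keysig_list_mutex itself; as the new comment says, callers must hold it, which is what lets crypto.c acquire keysig_list_mutex before global_auth_tok_list_mutex in one consistent order.
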
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index c6d7a4d748a0..e14cf7e588db 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -136,6 +136,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
136 const struct cred *cred) 136 const struct cred *cred)
137{ 137{
138 struct ecryptfs_open_req *req; 138 struct ecryptfs_open_req *req;
139 int flags = O_LARGEFILE;
139 int rc = 0; 140 int rc = 0;
140 141
141 /* Corresponding dput() and mntput() are done when the 142 /* Corresponding dput() and mntput() are done when the
@@ -143,10 +144,14 @@ int ecryptfs_privileged_open(struct file **lower_file,
143 * destroyed. */ 144 * destroyed. */
144 dget(lower_dentry); 145 dget(lower_dentry);
145 mntget(lower_mnt); 146 mntget(lower_mnt);
146 (*lower_file) = dentry_open(lower_dentry, lower_mnt, 147 flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR;
147 (O_RDWR | O_LARGEFILE), cred); 148 (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred);
148 if (!IS_ERR(*lower_file)) 149 if (!IS_ERR(*lower_file))
149 goto out; 150 goto out;
151 if (flags & O_RDONLY) {
152 rc = PTR_ERR((*lower_file));
153 goto out;
154 }
150 req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); 155 req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
151 if (!req) { 156 if (!req) {
152 rc = -ENOMEM; 157 rc = -ENOMEM;
@@ -180,21 +185,8 @@ int ecryptfs_privileged_open(struct file **lower_file,
180 __func__); 185 __func__);
181 goto out_unlock; 186 goto out_unlock;
182 } 187 }
183 if (IS_ERR(*req->lower_file)) { 188 if (IS_ERR(*req->lower_file))
184 rc = PTR_ERR(*req->lower_file); 189 rc = PTR_ERR(*req->lower_file);
185 dget(lower_dentry);
186 mntget(lower_mnt);
187 (*lower_file) = dentry_open(lower_dentry, lower_mnt,
188 (O_RDONLY | O_LARGEFILE), cred);
189 if (IS_ERR(*lower_file)) {
190 rc = PTR_ERR(*req->lower_file);
191 (*lower_file) = NULL;
192 printk(KERN_WARNING "%s: Error attempting privileged "
193 "open of lower file with either RW or RO "
194 "perms; rc = [%d]. Giving up.\n",
195 __func__, rc);
196 }
197 }
198out_unlock: 190out_unlock:
199 mutex_unlock(&req->mux); 191 mutex_unlock(&req->mux);
200out_free: 192out_free:
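
ecryptfs_privileged_open() used to attempt an O_RDWR open, punt any failure to the privileged kthread, and keep an inline O_RDONLY fallback besides. Now the access mode is decided once up front: if the lower inode sits on a read-only mount the open is O_RDONLY from the start, and when that read-only open fails there is nothing more privilege can do, hence the early return before the kthread handoff. A sketch of the selection, with demo_open_lower() as a hypothetical caller:

#include <linux/cred.h>
#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/mount.h>

static struct file *demo_open_lower(struct dentry *d, struct vfsmount *mnt,
				    const struct cred *cred)
{
	int flags = O_LARGEFILE;

	dget(d);	/* dentry_open() consumes one reference on each, */
	mntget(mnt);	/* even when it fails */
	flags |= IS_RDONLY(d->d_inode) ? O_RDONLY : O_RDWR;
	return dentry_open(d, mnt, flags, cred);
}
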
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f0aa9883c28..101fe4c7b1ee 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -129,11 +129,10 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
129 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 129 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
130 rc = ecryptfs_privileged_open(&inode_info->lower_file, 130 rc = ecryptfs_privileged_open(&inode_info->lower_file,
131 lower_dentry, lower_mnt, cred); 131 lower_dentry, lower_mnt, cred);
132 if (rc || IS_ERR(inode_info->lower_file)) { 132 if (rc) {
133 printk(KERN_ERR "Error opening lower persistent file " 133 printk(KERN_ERR "Error opening lower persistent file "
134 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 134 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
135 "rc = [%d]\n", lower_dentry, lower_mnt, rc); 135 "rc = [%d]\n", lower_dentry, lower_mnt, rc);
136 rc = PTR_ERR(inode_info->lower_file);
137 inode_info->lower_file = NULL; 136 inode_info->lower_file = NULL;
138 } 137 }
139 } 138 }
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 5c6bab9786e3..df4ce99d0597 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -396,9 +396,11 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
396 rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, 396 rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
397 sizeof(u64)); 397 sizeof(u64));
398 kfree(file_size_virt); 398 kfree(file_size_virt);
399 if (rc) 399 if (rc < 0)
400 printk(KERN_ERR "%s: Error writing file size to header; " 400 printk(KERN_ERR "%s: Error writing file size to header; "
401 "rc = [%d]\n", __func__, rc); 401 "rc = [%d]\n", __func__, rc);
402 else
403 rc = 0;
402out: 404out:
403 return rc; 405 return rc;
404} 406}
@@ -545,7 +547,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
545 return rc; 547 return rc;
546} 548}
547 549
548struct address_space_operations ecryptfs_aops = { 550const struct address_space_operations ecryptfs_aops = {
549 .writepage = ecryptfs_writepage, 551 .writepage = ecryptfs_writepage,
550 .readpage = ecryptfs_readpage, 552 .readpage = ecryptfs_readpage,
551 .write_begin = ecryptfs_write_begin, 553 .write_begin = ecryptfs_write_begin,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index a137c6ea2fee..0cc4fafd6552 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -34,15 +34,14 @@
34 * 34 *
35 * Write data to the lower file. 35 * Write data to the lower file.
36 * 36 *
37 * Returns zero on success; non-zero on error 37 * Returns bytes written on success; less than zero on error
38 */ 38 */
39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, 39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
40 loff_t offset, size_t size) 40 loff_t offset, size_t size)
41{ 41{
42 struct ecryptfs_inode_info *inode_info; 42 struct ecryptfs_inode_info *inode_info;
43 ssize_t octets_written;
44 mm_segment_t fs_save; 43 mm_segment_t fs_save;
45 int rc = 0; 44 ssize_t rc;
46 45
47 inode_info = ecryptfs_inode_to_private(ecryptfs_inode); 46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode);
48 mutex_lock(&inode_info->lower_file_mutex); 47 mutex_lock(&inode_info->lower_file_mutex);
@@ -50,14 +49,9 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
50 inode_info->lower_file->f_pos = offset; 49 inode_info->lower_file->f_pos = offset;
51 fs_save = get_fs(); 50 fs_save = get_fs();
52 set_fs(get_ds()); 51 set_fs(get_ds());
53 octets_written = vfs_write(inode_info->lower_file, data, size, 52 rc = vfs_write(inode_info->lower_file, data, size,
54 &inode_info->lower_file->f_pos); 53 &inode_info->lower_file->f_pos);
55 set_fs(fs_save); 54 set_fs(fs_save);
56 if (octets_written < 0) {
57 printk(KERN_ERR "%s: octets_written = [%td]; "
58 "expected [%td]\n", __func__, octets_written, size);
59 rc = -EINVAL;
60 }
61 mutex_unlock(&inode_info->lower_file_mutex); 55 mutex_unlock(&inode_info->lower_file_mutex);
62 mark_inode_dirty_sync(ecryptfs_inode); 56 mark_inode_dirty_sync(ecryptfs_inode);
63 return rc; 57 return rc;
@@ -91,6 +85,8 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
91 + offset_in_page); 85 + offset_in_page);
92 virt = kmap(page_for_lower); 86 virt = kmap(page_for_lower);
93 rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); 87 rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
88 if (rc > 0)
89 rc = 0;
94 kunmap(page_for_lower); 90 kunmap(page_for_lower);
95 return rc; 91 return rc;
96} 92}
@@ -229,30 +225,24 @@ out:
229 * Read @size bytes of data at byte offset @offset from the lower 225 * Read @size bytes of data at byte offset @offset from the lower
230 * inode into memory location @data. 226 * inode into memory location @data.
231 * 227 *
232 * Returns zero on success; non-zero on error 228 * Returns bytes read on success; 0 on EOF; less than zero on error
233 */ 229 */
234int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 230int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
235 struct inode *ecryptfs_inode) 231 struct inode *ecryptfs_inode)
236{ 232{
237 struct ecryptfs_inode_info *inode_info = 233 struct ecryptfs_inode_info *inode_info =
238 ecryptfs_inode_to_private(ecryptfs_inode); 234 ecryptfs_inode_to_private(ecryptfs_inode);
239 ssize_t octets_read;
240 mm_segment_t fs_save; 235 mm_segment_t fs_save;
241 int rc = 0; 236 ssize_t rc;
242 237
243 mutex_lock(&inode_info->lower_file_mutex); 238 mutex_lock(&inode_info->lower_file_mutex);
244 BUG_ON(!inode_info->lower_file); 239 BUG_ON(!inode_info->lower_file);
245 inode_info->lower_file->f_pos = offset; 240 inode_info->lower_file->f_pos = offset;
246 fs_save = get_fs(); 241 fs_save = get_fs();
247 set_fs(get_ds()); 242 set_fs(get_ds());
248 octets_read = vfs_read(inode_info->lower_file, data, size, 243 rc = vfs_read(inode_info->lower_file, data, size,
249 &inode_info->lower_file->f_pos); 244 &inode_info->lower_file->f_pos);
250 set_fs(fs_save); 245 set_fs(fs_save);
251 if (octets_read < 0) {
252 printk(KERN_ERR "%s: octets_read = [%td]; "
253 "expected [%td]\n", __func__, octets_read, size);
254 rc = -EINVAL;
255 }
256 mutex_unlock(&inode_info->lower_file_mutex); 246 mutex_unlock(&inode_info->lower_file_mutex);
257 return rc; 247 return rc;
258} 248}
@@ -284,6 +274,8 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
284 offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page); 274 offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
285 virt = kmap(page_for_ecryptfs); 275 virt = kmap(page_for_ecryptfs);
286 rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); 276 rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
277 if (rc > 0)
278 rc = 0;
287 kunmap(page_for_ecryptfs); 279 kunmap(page_for_ecryptfs);
288 flush_dcache_page(page_for_ecryptfs); 280 flush_dcache_page(page_for_ecryptfs);
289 return rc; 281 return rc;
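
The kerneldoc changes above are the contract shift the rest of the series reacts to: ecryptfs_write_lower() now returns whatever vfs_write() returned and ecryptfs_read_lower() whatever vfs_read() returned, so a positive byte count means success, 0 from a read means EOF, and only negative values are errors. The page-segment wrappers fold positive counts back to 0 for their callers. A caller that needs the old all-or-nothing behaviour can wrap the new contract, sketched with a hypothetical helper:

#include <linux/fs.h>

/* declared in fs/ecryptfs/ecryptfs_kernel.h */
int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
			struct inode *ecryptfs_inode);

static int demo_read_exact(char *data, loff_t offset, size_t size,
			   struct inode *ecryptfs_inode)
{
	int rc = ecryptfs_read_lower(data, offset, size, ecryptfs_inode);

	if (rc < 0)
		return rc;
	if ((size_t)rc != size)		/* short read: EOF inside extent */
		return -EINVAL;
	return 0;
}
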
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 12d649602d3a..b15a43a80ab7 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -77,7 +77,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
77 struct ecryptfs_inode_info *inode_info; 77 struct ecryptfs_inode_info *inode_info;
78 78
79 inode_info = ecryptfs_inode_to_private(inode); 79 inode_info = ecryptfs_inode_to_private(inode);
80 mutex_lock(&inode_info->lower_file_mutex);
81 if (inode_info->lower_file) { 80 if (inode_info->lower_file) {
82 struct dentry *lower_dentry = 81 struct dentry *lower_dentry =
83 inode_info->lower_file->f_dentry; 82 inode_info->lower_file->f_dentry;
@@ -89,7 +88,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
89 d_drop(lower_dentry); 88 d_drop(lower_dentry);
90 } 89 }
91 } 90 }
92 mutex_unlock(&inode_info->lower_file_mutex);
93 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
94 kmem_cache_free(ecryptfs_inode_info_cache, inode_info); 92 kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
95} 93}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 31d12de83a2a..8b47e4200e65 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -68,11 +68,16 @@ int eventfd_signal(struct eventfd_ctx *ctx, int n)
68} 68}
69EXPORT_SYMBOL_GPL(eventfd_signal); 69EXPORT_SYMBOL_GPL(eventfd_signal);
70 70
71static void eventfd_free_ctx(struct eventfd_ctx *ctx)
72{
73 kfree(ctx);
74}
75
71static void eventfd_free(struct kref *kref) 76static void eventfd_free(struct kref *kref)
72{ 77{
73 struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); 78 struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
74 79
75 kfree(ctx); 80 eventfd_free_ctx(ctx);
76} 81}
77 82
78/** 83/**
@@ -298,9 +303,23 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
298} 303}
299EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); 304EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
300 305
301SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) 306/**
307 * eventfd_file_create - Creates an eventfd file pointer.
308 * @count: Initial eventfd counter value.
309 * @flags: Flags for the eventfd file.
310 *
311 * This function creates an eventfd file pointer, w/out installing it into
312 * the fd table. This is useful when the eventfd file is used during the
313 * initialization of data structures that require extra setup after the eventfd
314 * creation. So the eventfd creation is split into the file pointer creation
315 * phase, and the file descriptor installation phase.
316 * In this way races with userspace closing the newly installed file descriptor
317 * can be avoided.
318 * Returns an eventfd file pointer, or a proper error pointer.
319 */
320struct file *eventfd_file_create(unsigned int count, int flags)
302{ 321{
303 int fd; 322 struct file *file;
304 struct eventfd_ctx *ctx; 323 struct eventfd_ctx *ctx;
305 324
306 /* Check the EFD_* constants for consistency. */ 325 /* Check the EFD_* constants for consistency. */
@@ -308,26 +327,48 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
308 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); 327 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
309 328
310 if (flags & ~EFD_FLAGS_SET) 329 if (flags & ~EFD_FLAGS_SET)
311 return -EINVAL; 330 return ERR_PTR(-EINVAL);
312 331
313 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 332 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
314 if (!ctx) 333 if (!ctx)
315 return -ENOMEM; 334 return ERR_PTR(-ENOMEM);
316 335
317 kref_init(&ctx->kref); 336 kref_init(&ctx->kref);
318 init_waitqueue_head(&ctx->wqh); 337 init_waitqueue_head(&ctx->wqh);
319 ctx->count = count; 338 ctx->count = count;
320 ctx->flags = flags; 339 ctx->flags = flags;
321 340
322 /* 341 file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
323 * When we call this, the initialization must be complete, since 342 flags & EFD_SHARED_FCNTL_FLAGS);
324 * anon_inode_getfd() will install the fd. 343 if (IS_ERR(file))
325 */ 344 eventfd_free_ctx(ctx);
326 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 345
327 flags & EFD_SHARED_FCNTL_FLAGS); 346 return file;
328 if (fd < 0) 347}
329 kfree(ctx); 348
349SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
350{
351 int fd, error;
352 struct file *file;
353
354 error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
355 if (error < 0)
356 return error;
357 fd = error;
358
359 file = eventfd_file_create(count, flags);
360 if (IS_ERR(file)) {
361 error = PTR_ERR(file);
362 goto err_put_unused_fd;
363 }
364 fd_install(fd, file);
365
330 return fd; 366 return fd;
367
368err_put_unused_fd:
369 put_unused_fd(fd);
370
371 return error;
331} 372}
332 373
333SYSCALL_DEFINE1(eventfd, unsigned int, count) 374SYSCALL_DEFINE1(eventfd, unsigned int, count)
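
Splitting eventfd creation from descriptor installation closes an initialization race for in-kernel users: a subsystem can call eventfd_file_create(), finish wiring the file into its own structures, and only then expose a descriptor, so userspace cannot close() the fd halfway through setup. The syscall itself now follows the reserve, create, install order visible above, releasing the reserved slot with put_unused_fd() on failure. From userspace nothing changes; a small runnable refresher on the counter semantics (plain C, builds with gcc):

#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	uint64_t v = 3;
	int fd = eventfd(0, EFD_NONBLOCK);

	if (fd < 0)
		return 1;
	write(fd, &v, sizeof(v));	/* counter += 3 */
	v = 1;
	write(fd, &v, sizeof(v));	/* counter += 1 -> 4 */
	read(fd, &v, sizeof(v));	/* returns 4, resets to 0 */
	printf("drained: %llu\n", (unsigned long long)v);
	close(fd);
	return 0;
}
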
diff --git a/fs/exec.c b/fs/exec.c
index fb4f3cdda78c..d49be6bc1793 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,7 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h> 36#include <linux/perf_event.h>
37#include <linux/highmem.h> 37#include <linux/highmem.h>
38#include <linux/spinlock.h> 38#include <linux/spinlock.h>
39#include <linux/key.h> 39#include <linux/key.h>
@@ -55,6 +55,7 @@
55#include <linux/kmod.h> 55#include <linux/kmod.h>
56#include <linux/fsnotify.h> 56#include <linux/fsnotify.h>
57#include <linux/fs_struct.h> 57#include <linux/fs_struct.h>
58#include <linux/pipe_fs_i.h>
58 59
59#include <asm/uaccess.h> 60#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 61#include <asm/mmu_context.h>
@@ -63,6 +64,7 @@
63 64
64int core_uses_pid; 65int core_uses_pid;
65char core_pattern[CORENAME_MAX_SIZE] = "core"; 66char core_pattern[CORENAME_MAX_SIZE] = "core";
67unsigned int core_pipe_limit;
66int suid_dumpable = 0; 68int suid_dumpable = 0;
67 69
68/* The maximal length of core_pattern is also specified in sysctl.c */ 70/* The maximal length of core_pattern is also specified in sysctl.c */
@@ -845,6 +847,9 @@ static int de_thread(struct task_struct *tsk)
845 sig->notify_count = 0; 847 sig->notify_count = 0;
846 848
847no_thread_group: 849no_thread_group:
850 if (current->mm)
851 setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
852
848 exit_itimers(sig); 853 exit_itimers(sig);
849 flush_itimer_signals(); 854 flush_itimer_signals();
850 855
@@ -923,7 +928,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
923 task_lock(tsk); 928 task_lock(tsk);
924 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 929 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
925 task_unlock(tsk); 930 task_unlock(tsk);
926 perf_counter_comm(tsk); 931 perf_event_comm(tsk);
927} 932}
928 933
929int flush_old_exec(struct linux_binprm * bprm) 934int flush_old_exec(struct linux_binprm * bprm)
@@ -997,7 +1002,7 @@ int flush_old_exec(struct linux_binprm * bprm)
997 * security domain: 1002 * security domain:
998 */ 1003 */
999 if (!get_dumpable(current->mm)) 1004 if (!get_dumpable(current->mm))
1000 perf_counter_exit_task(current); 1005 perf_event_exit_task(current);
1001 1006
1002 /* An exec changes our domain. We are no longer part of the thread 1007 /* An exec changes our domain. We are no longer part of the thread
1003 group */ 1008 group */
@@ -1016,6 +1021,35 @@ out:
1016EXPORT_SYMBOL(flush_old_exec); 1021EXPORT_SYMBOL(flush_old_exec);
1017 1022
1018/* 1023/*
1024 * Prepare credentials and lock ->cred_guard_mutex.
1025 * install_exec_creds() commits the new creds and drops the lock.
1026 * Or, if exec fails before, free_bprm() should release ->cred
1027 * and unlock.
1028 */
1029int prepare_bprm_creds(struct linux_binprm *bprm)
1030{
1031 if (mutex_lock_interruptible(&current->cred_guard_mutex))
1032 return -ERESTARTNOINTR;
1033
1034 bprm->cred = prepare_exec_creds();
1035 if (likely(bprm->cred))
1036 return 0;
1037
1038 mutex_unlock(&current->cred_guard_mutex);
1039 return -ENOMEM;
1040}
1041
1042void free_bprm(struct linux_binprm *bprm)
1043{
1044 free_arg_pages(bprm);
1045 if (bprm->cred) {
1046 mutex_unlock(&current->cred_guard_mutex);
1047 abort_creds(bprm->cred);
1048 }
1049 kfree(bprm);
1050}
1051
1052/*
1019 * install the new credentials for this executable 1053 * install the new credentials for this executable
1020 */ 1054 */
1021void install_exec_creds(struct linux_binprm *bprm) 1055void install_exec_creds(struct linux_binprm *bprm)
@@ -1024,12 +1058,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1024 1058
1025 commit_creds(bprm->cred); 1059 commit_creds(bprm->cred);
1026 bprm->cred = NULL; 1060 bprm->cred = NULL;
1027 1061 /*
1028 /* cred_guard_mutex must be held at least to this point to prevent 1062 * cred_guard_mutex must be held at least to this point to prevent
1029 * ptrace_attach() from altering our determination of the task's 1063 * ptrace_attach() from altering our determination of the task's
1030 * credentials; any time after this it may be unlocked */ 1064 * credentials; any time after this it may be unlocked.
1031 1065 */
1032 security_bprm_committed_creds(bprm); 1066 security_bprm_committed_creds(bprm);
1067 mutex_unlock(&current->cred_guard_mutex);
1033} 1068}
1034EXPORT_SYMBOL(install_exec_creds); 1069EXPORT_SYMBOL(install_exec_creds);
1035 1070
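
The new helpers above split the cred_guard_mutex protocol in two: prepare_bprm_creds() returns with the mutex held, and exactly one of install_exec_creds() or free_bprm() drops it. A minimal userspace sketch of that ownership hand-off, using a pthread mutex and hypothetical types (not the kernel API):

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t cred_guard = PTHREAD_MUTEX_INITIALIZER;

struct creds { int uid; };
struct binprm { struct creds *cred; };

static int prepare_creds_locked(struct binprm *b)
{
        if (pthread_mutex_lock(&cred_guard))
                return -EINTR;                  /* kernel returns -ERESTARTNOINTR */
        b->cred = malloc(sizeof(*b->cred));
        if (b->cred)
                return 0;                       /* success: mutex stays held */
        pthread_mutex_unlock(&cred_guard);      /* failure: drop the lock here */
        return -ENOMEM;
}

static void commit_creds_locked(struct binprm *b)
{
        free(b->cred);                          /* "install" (here: just discard) */
        b->cred = NULL;
        pthread_mutex_unlock(&cred_guard);      /* success-path unlock */
}

static void destroy_binprm(struct binprm *b)
{
        if (b->cred) {                          /* error path: mutex still held */
                pthread_mutex_unlock(&cred_guard);
                free(b->cred);
        }
}

int main(void)
{
        struct binprm b = { 0 };

        if (prepare_creds_locked(&b) == 0)
                commit_creds_locked(&b);        /* or destroy_binprm(&b) on error */
        destroy_binprm(&b);                     /* no-op after a commit */
        return 0;
}
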
@@ -1246,14 +1281,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1246 1281
1247EXPORT_SYMBOL(search_binary_handler); 1282EXPORT_SYMBOL(search_binary_handler);
1248 1283
1249void free_bprm(struct linux_binprm *bprm)
1250{
1251 free_arg_pages(bprm);
1252 if (bprm->cred)
1253 abort_creds(bprm->cred);
1254 kfree(bprm);
1255}
1256
1257/* 1284/*
1258 * sys_execve() executes a new program. 1285 * sys_execve() executes a new program.
1259 */ 1286 */
@@ -1277,20 +1304,15 @@ int do_execve(char * filename,
1277 if (!bprm) 1304 if (!bprm)
1278 goto out_files; 1305 goto out_files;
1279 1306
1280 retval = -ERESTARTNOINTR; 1307 retval = prepare_bprm_creds(bprm);
1281 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1308 if (retval)
1282 goto out_free; 1309 goto out_free;
1283 current->in_execve = 1;
1284
1285 retval = -ENOMEM;
1286 bprm->cred = prepare_exec_creds();
1287 if (!bprm->cred)
1288 goto out_unlock;
1289 1310
1290 retval = check_unsafe_exec(bprm); 1311 retval = check_unsafe_exec(bprm);
1291 if (retval < 0) 1312 if (retval < 0)
1292 goto out_unlock; 1313 goto out_free;
1293 clear_in_exec = retval; 1314 clear_in_exec = retval;
1315 current->in_execve = 1;
1294 1316
1295 file = open_exec(filename); 1317 file = open_exec(filename);
1296 retval = PTR_ERR(file); 1318 retval = PTR_ERR(file);
@@ -1337,10 +1359,11 @@ int do_execve(char * filename,
1337 if (retval < 0) 1359 if (retval < 0)
1338 goto out; 1360 goto out;
1339 1361
1362 current->stack_start = current->mm->start_stack;
1363
1340 /* execve succeeded */ 1364 /* execve succeeded */
1341 current->fs->in_exec = 0; 1365 current->fs->in_exec = 0;
1342 current->in_execve = 0; 1366 current->in_execve = 0;
1343 mutex_unlock(&current->cred_guard_mutex);
1344 acct_update_integrals(current); 1367 acct_update_integrals(current);
1345 free_bprm(bprm); 1368 free_bprm(bprm);
1346 if (displaced) 1369 if (displaced)
@@ -1360,10 +1383,7 @@ out_file:
1360out_unmark: 1383out_unmark:
1361 if (clear_in_exec) 1384 if (clear_in_exec)
1362 current->fs->in_exec = 0; 1385 current->fs->in_exec = 0;
1363
1364out_unlock:
1365 current->in_execve = 0; 1386 current->in_execve = 0;
1366 mutex_unlock(&current->cred_guard_mutex);
1367 1387
1368out_free: 1388out_free:
1369 free_bprm(bprm); 1389 free_bprm(bprm);
@@ -1375,18 +1395,16 @@ out_ret:
1375 return retval; 1395 return retval;
1376} 1396}
1377 1397
1378int set_binfmt(struct linux_binfmt *new) 1398void set_binfmt(struct linux_binfmt *new)
1379{ 1399{
1380 struct linux_binfmt *old = current->binfmt; 1400 struct mm_struct *mm = current->mm;
1381 1401
1382 if (new) { 1402 if (mm->binfmt)
1383 if (!try_module_get(new->module)) 1403 module_put(mm->binfmt->module);
1384 return -1; 1404
1385 } 1405 mm->binfmt = new;
1386 current->binfmt = new; 1406 if (new)
1387 if (old) 1407 __module_get(new->module);
1388 module_put(old->module);
1389 return 0;
1390} 1408}
1391 1409
1392EXPORT_SYMBOL(set_binfmt); 1410EXPORT_SYMBOL(set_binfmt);
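
set_binfmt() now keys the module reference off mm->binfmt rather than the task, and becomes void: the old format's reference is always dropped and the new one always taken. A small refcounting sketch of that swap (plain counters and hypothetical types, not kernel code):

#include <assert.h>
#include <stddef.h>

struct module { int refs; };

static void module_get(struct module *m) { if (m) m->refs++; }
static void module_put(struct module *m) { if (m) { assert(m->refs > 0); m->refs--; } }

struct binfmt { struct module *module; };
struct mm { struct binfmt *binfmt; };

/* Unconditional swap: put the old format's module ref, get the new one.
 * Unlike the old task-based set_binfmt(), this cannot fail. */
static void set_binfmt(struct mm *mm, struct binfmt *new)
{
        if (mm->binfmt)
                module_put(mm->binfmt->module);
        mm->binfmt = new;
        if (new)
                module_get(new->module);
}

int main(void)
{
        struct module ma = { 1 }, mb = { 1 };
        struct binfmt a = { &ma }, b = { &mb };
        struct mm mm = { NULL };

        set_binfmt(&mm, &a);    /* ma.refs == 2 */
        set_binfmt(&mm, &b);    /* ma.refs back to 1, mb.refs == 2 */
        set_binfmt(&mm, NULL);  /* mb.refs back to 1 */
        assert(ma.refs == 1 && mb.refs == 1);
        return 0;
}
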
@@ -1710,6 +1728,29 @@ int get_dumpable(struct mm_struct *mm)
1710 return (ret >= 2) ? 2 : ret; 1728 return (ret >= 2) ? 2 : ret;
1711} 1729}
1712 1730
1731static void wait_for_dump_helpers(struct file *file)
1732{
1733 struct pipe_inode_info *pipe;
1734
1735 pipe = file->f_path.dentry->d_inode->i_pipe;
1736
1737 pipe_lock(pipe);
1738 pipe->readers++;
1739 pipe->writers--;
1740
1741 while ((pipe->readers > 1) && (!signal_pending(current))) {
1742 wake_up_interruptible_sync(&pipe->wait);
1743 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1744 pipe_wait(pipe);
1745 }
1746
1747 pipe->readers--;
1748 pipe->writers++;
1749 pipe_unlock(pipe);
1750
1751}
1752
1753
1713void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1754void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1714{ 1755{
1715 struct core_state core_state; 1756 struct core_state core_state;
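
wait_for_dump_helpers() keeps the crashing process alive until the core_pattern helper has drained the pipe, by juggling the pipe's reader/writer counts. A rough userspace analogue of the same idea (write the dump, signal EOF, then wait for the helper), assuming nothing kernel-specific:

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int fds[2];
        if (pipe(fds))
                return 1;

        pid_t helper = fork();
        if (helper == 0) {                      /* the "core_pattern helper" */
                char buf[64];
                close(fds[1]);
                while (read(fds[0], buf, sizeof(buf)) > 0)
                        ;                       /* drain the dump */
                close(fds[0]);
                _exit(0);
        }

        close(fds[0]);
        const char dump[] = "fake core dump\n";
        if (write(fds[1], dump, sizeof(dump) - 1) < 0)  /* core_dump() stand-in */
                return 1;
        close(fds[1]);                          /* EOF for the helper */

        waitpid(helper, NULL, 0);               /* wait_for_dump_helpers() stand-in */
        return 0;
}
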
@@ -1726,11 +1767,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1726 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; 1767 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1727 char **helper_argv = NULL; 1768 char **helper_argv = NULL;
1728 int helper_argc = 0; 1769 int helper_argc = 0;
1729 char *delimit; 1770 int dump_count = 0;
1771 static atomic_t core_dump_count = ATOMIC_INIT(0);
1730 1772
1731 audit_core_dumps(signr); 1773 audit_core_dumps(signr);
1732 1774
1733 binfmt = current->binfmt; 1775 binfmt = mm->binfmt;
1734 if (!binfmt || !binfmt->core_dump) 1776 if (!binfmt || !binfmt->core_dump)
1735 goto fail; 1777 goto fail;
1736 1778
@@ -1781,54 +1823,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1781 lock_kernel(); 1823 lock_kernel();
1782 ispipe = format_corename(corename, signr); 1824 ispipe = format_corename(corename, signr);
1783 unlock_kernel(); 1825 unlock_kernel();
1784 /* 1826
1785 * Don't bother to check the RLIMIT_CORE value if core_pattern points
1786 * to a pipe. Since we're not writing directly to the filesystem
1787 * RLIMIT_CORE doesn't really apply, as no actual core file will be
1788 * created unless the pipe reader choses to write out the core file
1789 * at which point file size limits and permissions will be imposed
1790 * as it does with any other process
1791 */
1792 if ((!ispipe) && (core_limit < binfmt->min_coredump)) 1827 if ((!ispipe) && (core_limit < binfmt->min_coredump))
1793 goto fail_unlock; 1828 goto fail_unlock;
1794 1829
1795 if (ispipe) { 1830 if (ispipe) {
1831 if (core_limit == 0) {
1832 /*
1833 * Normally core limits are irrelevant to pipes, since
1834 * we're not writing to the file system, but we use
 1835 * core_limit of 0 here as a special value. Any
1836 * non-zero limit gets set to RLIM_INFINITY below, but
1837 * a limit of 0 skips the dump. This is a consistent
1838 * way to catch recursive crashes. We can still crash
 1839 * if the core_pattern binary sets RLIMIT_CORE to a non-zero
 1840 * value but runs as root, and can do lots of stupid things.
1841 * Note that we use task_tgid_vnr here to grab the pid
1842 * of the process group leader. That way we get the
1843 * right pid if a thread in a multi-threaded
1844 * core_pattern process dies.
1845 */
1846 printk(KERN_WARNING
1847 "Process %d(%s) has RLIMIT_CORE set to 0\n",
1848 task_tgid_vnr(current), current->comm);
1849 printk(KERN_WARNING "Aborting core\n");
1850 goto fail_unlock;
1851 }
1852
1853 dump_count = atomic_inc_return(&core_dump_count);
1854 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
1855 printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
1856 task_tgid_vnr(current), current->comm);
1857 printk(KERN_WARNING "Skipping core dump\n");
1858 goto fail_dropcount;
1859 }
1860
1796 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1861 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1797 if (!helper_argv) { 1862 if (!helper_argv) {
1798 printk(KERN_WARNING "%s failed to allocate memory\n", 1863 printk(KERN_WARNING "%s failed to allocate memory\n",
1799 __func__); 1864 __func__);
1800 goto fail_unlock; 1865 goto fail_dropcount;
1801 }
1802 /* Terminate the string before the first option */
1803 delimit = strchr(corename, ' ');
1804 if (delimit)
1805 *delimit = '\0';
1806 delimit = strrchr(helper_argv[0], '/');
1807 if (delimit)
1808 delimit++;
1809 else
1810 delimit = helper_argv[0];
1811 if (!strcmp(delimit, current->comm)) {
1812 printk(KERN_NOTICE "Recursive core dump detected, "
1813 "aborting\n");
1814 goto fail_unlock;
1815 } 1866 }
1816 1867
1817 core_limit = RLIM_INFINITY; 1868 core_limit = RLIM_INFINITY;
1818 1869
1819 /* SIGPIPE can happen, but it's just never processed */ 1870 /* SIGPIPE can happen, but it's just never processed */
1820 if (call_usermodehelper_pipe(corename+1, helper_argv, NULL, 1871 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
1821 &file)) { 1872 &file)) {
1822 printk(KERN_INFO "Core dump to %s pipe failed\n", 1873 printk(KERN_INFO "Core dump to %s pipe failed\n",
1823 corename); 1874 corename);
1824 goto fail_unlock; 1875 goto fail_dropcount;
1825 } 1876 }
1826 } else 1877 } else
1827 file = filp_open(corename, 1878 file = filp_open(corename,
1828 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1879 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1829 0600); 1880 0600);
1830 if (IS_ERR(file)) 1881 if (IS_ERR(file))
1831 goto fail_unlock; 1882 goto fail_dropcount;
1832 inode = file->f_path.dentry->d_inode; 1883 inode = file->f_path.dentry->d_inode;
1833 if (inode->i_nlink > 1) 1884 if (inode->i_nlink > 1)
1834 goto close_fail; /* multiple links - don't dump */ 1885 goto close_fail; /* multiple links - don't dump */
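
The dump_count/core_dump_count pair above bounds how many pipe-based dumps may run at once; every path that bumped the counter must unwind through fail_dropcount. A C11-atomics sketch of that admission scheme (illustrative, not the kernel code):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int core_dump_count;
static int core_pipe_limit = 2;         /* 0 means "unlimited" in the kernel */

static int start_pipe_dump(void)
{
        int dump_count = atomic_fetch_add(&core_dump_count, 1) + 1;

        if (core_pipe_limit && core_pipe_limit < dump_count) {
                atomic_fetch_sub(&core_dump_count, 1);  /* fail_dropcount */
                return -1;                              /* skip this dump */
        }
        return 0;
}

static void finish_pipe_dump(void)
{
        atomic_fetch_sub(&core_dump_count, 1);
}

int main(void)
{
        for (int i = 0; i < 4; i++)
                printf("dump %d: %s\n", i,
                       start_pipe_dump() ? "skipped" : "admitted");
        /* admitted dumps would call finish_pipe_dump() when done */
        (void)finish_pipe_dump;
        return 0;
}
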
@@ -1857,7 +1908,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1857 if (retval) 1908 if (retval)
1858 current->signal->group_exit_code |= 0x80; 1909 current->signal->group_exit_code |= 0x80;
1859close_fail: 1910close_fail:
1911 if (ispipe && core_pipe_limit)
1912 wait_for_dump_helpers(file);
1860 filp_close(file, NULL); 1913 filp_close(file, NULL);
1914fail_dropcount:
1915 if (dump_count)
1916 atomic_dec(&core_dump_count);
1861fail_unlock: 1917fail_unlock:
1862 if (helper_argv) 1918 if (helper_argv)
1863 argv_free(helper_argv); 1919 argv_free(helper_argv);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 5ab10c3bbebe..9f500dec3b59 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -214,7 +214,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
214 } 214 }
215 215
216 lock_super(sb); 216 lock_super(sb);
217 lock_kernel();
218 sbi = sb->s_fs_info; 217 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 218 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); 219 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
@@ -245,7 +244,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
245out: 244out:
246 if (or) 245 if (or)
247 osd_end_request(or); 246 osd_end_request(or);
248 unlock_kernel();
249 unlock_super(sb); 247 unlock_super(sb);
250 kfree(fscb); 248 kfree(fscb);
251 return ret; 249 return ret;
@@ -268,8 +266,6 @@ static void exofs_put_super(struct super_block *sb)
268 int num_pend; 266 int num_pend;
269 struct exofs_sb_info *sbi = sb->s_fs_info; 267 struct exofs_sb_info *sbi = sb->s_fs_info;
270 268
271 lock_kernel();
272
273 if (sb->s_dirt) 269 if (sb->s_dirt)
274 exofs_write_super(sb); 270 exofs_write_super(sb);
275 271
@@ -286,8 +282,6 @@ static void exofs_put_super(struct super_block *sb)
286 osduld_put_device(sbi->s_dev); 282 osduld_put_device(sbi->s_dev);
287 kfree(sb->s_fs_info); 283 kfree(sb->s_fs_info);
288 sb->s_fs_info = NULL; 284 sb->s_fs_info = NULL;
289
290 unlock_kernel();
291} 285}
292 286
293/* 287/*
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297cad..a63d44256a70 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
230 return error; 230 return error;
231} 231}
232 232
233static int 233int
234ext2_check_acl(struct inode *inode, int mask) 234ext2_check_acl(struct inode *inode, int mask)
235{ 235{
236 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); 236 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
246 return -EAGAIN; 246 return -EAGAIN;
247} 247}
248 248
249int
250ext2_permission(struct inode *inode, int mask)
251{
252 return generic_permission(inode, mask, ext2_check_acl);
253}
254
255/* 249/*
256 * Initialize the ACLs of a new inode. Called from ext2_new_inode. 250 * Initialize the ACLs of a new inode. Called from ext2_new_inode.
257 * 251 *
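
This hunk and the matching ext3/ext4 ones below invert the old arrangement: instead of each filesystem wrapping generic_permission() in its own ->permission hook, the filesystem now exports only a ->check_acl callback and the VFS supplies the generic logic, with -EAGAIN meaning "no ACL here, keep falling back". A sketch of that shape with hypothetical stand-in types, not the real VFS signatures:

#include <errno.h>
#include <stdio.h>

struct inode { int mode_grants; int acl_grants; int has_acl; };

typedef int (*check_acl_t)(struct inode *, int mask);

/* VFS-side helper: try mode bits, then ask the fs-specific ACL hook. */
static int generic_permission(struct inode *inode, int mask, check_acl_t check_acl)
{
        if (inode->mode_grants)
                return 0;
        if (check_acl) {
                int err = check_acl(inode, mask);
                if (err != -EAGAIN)     /* -EAGAIN: no ACL, keep falling back */
                        return err;
        }
        return -EACCES;
}

/* Filesystem-side hook, as ext2/ext3/ext4 now provide it. */
static int ext2_check_acl(struct inode *inode, int mask)
{
        if (!inode->has_acl)
                return -EAGAIN;
        return inode->acl_grants ? 0 : -EACCES;
}

int main(void)
{
        struct inode by_mode = { 1, 0, 0 };
        struct inode by_acl  = { 0, 1, 1 };
        struct inode denied  = { 0, 0, 1 };

        printf("%d %d %d\n",
               generic_permission(&by_mode, 4, ext2_check_acl),   /* 0 */
               generic_permission(&by_acl, 4, ext2_check_acl),    /* 0 */
               generic_permission(&denied, 4, ext2_check_acl));   /* -EACCES */
        return 0;
}
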
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898f..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
54#ifdef CONFIG_EXT2_FS_POSIX_ACL 54#ifdef CONFIG_EXT2_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext2_permission (struct inode *, int); 57extern int ext2_check_acl (struct inode *, int);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_acl_chmod (struct inode *);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
61#else 61#else
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext2_permission NULL 63#define ext2_check_acl NULL
64#define ext2_get_acl NULL 64#define ext2_get_acl NULL
65#define ext2_set_acl NULL 65#define ext2_set_acl NULL
66 66
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc9222..a2f3afd1a1c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
85 .removexattr = generic_removexattr, 85 .removexattr = generic_removexattr,
86#endif 86#endif
87 .setattr = ext2_setattr, 87 .setattr = ext2_setattr,
88 .permission = ext2_permission, 88 .check_acl = ext2_check_acl,
89 .fiemap = ext2_fiemap, 89 .fiemap = ext2_fiemap,
90}; 90};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4f..ade634076d0a 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
482 unlock_buffer(bh); 482 unlock_buffer(bh);
483 mark_buffer_dirty_inode(bh, inode); 483 mark_buffer_dirty_inode(bh, inode);
484 /* We used to sync bh here if IS_SYNC(inode). 484 /* We used to sync bh here if IS_SYNC(inode).
485 * But we now rely upon generic_osync_inode() 485 * But we now rely upon generic_write_sync()
486 * and b_inode_buffers. But not for directories. 486 * and b_inode_buffers. But not for directories.
487 */ 487 */
488 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 488 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
@@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = {
819 .writepages = ext2_writepages, 819 .writepages = ext2_writepages,
820 .migratepage = buffer_migrate_page, 820 .migratepage = buffer_migrate_page,
821 .is_partially_uptodate = block_is_partially_uptodate, 821 .is_partially_uptodate = block_is_partially_uptodate,
822 .error_remove_page = generic_error_remove_page,
822}; 823};
823 824
824const struct address_space_operations ext2_aops_xip = { 825const struct address_space_operations ext2_aops_xip = {
@@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = {
837 .direct_IO = ext2_direct_IO, 838 .direct_IO = ext2_direct_IO,
838 .writepages = ext2_writepages, 839 .writepages = ext2_writepages,
839 .migratepage = buffer_migrate_page, 840 .migratepage = buffer_migrate_page,
841 .error_remove_page = generic_error_remove_page,
840}; 842};
841 843
842/* 844/*
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e1dedb0f7873..dd7175ce5606 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -70,7 +70,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
70 if (PTR_ERR(inode) == -ESTALE) { 70 if (PTR_ERR(inode) == -ESTALE) {
71 ext2_error(dir->i_sb, __func__, 71 ext2_error(dir->i_sb, __func__,
72 "deleted inode referenced: %lu", 72 "deleted inode referenced: %lu",
73 ino); 73 (unsigned long) ino);
74 return ERR_PTR(-EIO); 74 return ERR_PTR(-EIO);
75 } else { 75 } else {
76 return ERR_CAST(inode); 76 return ERR_CAST(inode);
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
362 if (dir_de) { 362 if (dir_de) {
363 if (old_dir != new_dir) 363 if (old_dir != new_dir)
364 ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); 364 ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
365 else {
366 kunmap(dir_page);
367 page_cache_release(dir_page);
368 }
365 inode_dec_link_count(old_dir); 369 inode_dec_link_count(old_dir);
366 } 370 }
367 return 0; 371 return 0;
@@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
396 .removexattr = generic_removexattr, 400 .removexattr = generic_removexattr,
397#endif 401#endif
398 .setattr = ext2_setattr, 402 .setattr = ext2_setattr,
399 .permission = ext2_permission, 403 .check_acl = ext2_check_acl,
400}; 404};
401 405
402const struct inode_operations ext2_special_inode_operations = { 406const struct inode_operations ext2_special_inode_operations = {
@@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
407 .removexattr = generic_removexattr, 411 .removexattr = generic_removexattr,
408#endif 412#endif
409 .setattr = ext2_setattr, 413 .setattr = ext2_setattr,
410 .permission = ext2_permission, 414 .check_acl = ext2_check_acl,
411}; 415};
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index b72b85884223..c18fbf3e4068 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block,
20 void **kaddr, unsigned long *pfn) 20 void **kaddr, unsigned long *pfn)
21{ 21{
22 struct block_device *bdev = inode->i_sb->s_bdev; 22 struct block_device *bdev = inode->i_sb->s_bdev;
23 struct block_device_operations *ops = bdev->bd_disk->fops; 23 const struct block_device_operations *ops = bdev->bd_disk->fops;
24 sector_t sector; 24 sector_t sector;
25 25
26 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ 26 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef0..c9b0df376b5f 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
238 return error; 238 return error;
239} 239}
240 240
241static int 241int
242ext3_check_acl(struct inode *inode, int mask) 242ext3_check_acl(struct inode *inode, int mask)
243{ 243{
244 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); 244 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
254 return -EAGAIN; 254 return -EAGAIN;
255} 255}
256 256
257int
258ext3_permission(struct inode *inode, int mask)
259{
260 return generic_permission(inode, mask, ext3_check_acl);
261}
262
263/* 257/*
264 * Initialize the ACLs of a new inode. Called from ext3_new_inode. 258 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
265 * 259 *
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a5969..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
54#ifdef CONFIG_EXT3_FS_POSIX_ACL 54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext3_permission (struct inode *, int); 57extern int ext3_check_acl (struct inode *, int);
58extern int ext3_acl_chmod (struct inode *); 58extern int ext3_acl_chmod (struct inode *);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); 59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60 60
61#else /* CONFIG_EXT3_FS_POSIX_ACL */ 61#else /* CONFIG_EXT3_FS_POSIX_ACL */
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext3_permission NULL 63#define ext3_check_acl NULL
64 64
65static inline int 65static inline int
66ext3_acl_chmod(struct inode *inode) 66ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231b..388bbdfa0b4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
51 return 0; 51 return 0;
52} 52}
53 53
54static ssize_t
55ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
56 unsigned long nr_segs, loff_t pos)
57{
58 struct file *file = iocb->ki_filp;
59 struct inode *inode = file->f_path.dentry->d_inode;
60 ssize_t ret;
61 int err;
62
63 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
64
65 /*
66 * Skip flushing if there was an error, or if nothing was written.
67 */
68 if (ret <= 0)
69 return ret;
70
71 /*
72 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
73 * journalling then we need to make sure that we force the transaction
74 * to disk to keep all metadata uptodate synchronously.
75 */
76 if (file->f_flags & O_SYNC) {
77 /*
78 * If we are non-data-journaled, then the dirty data has
79 * already been flushed to backing store by generic_osync_inode,
80 * and the inode has been flushed too if there have been any
81 * modifications other than mere timestamp updates.
82 *
83 * Open question --- do we care about flushing timestamps too
84 * if the inode is IS_SYNC?
85 */
86 if (!ext3_should_journal_data(inode))
87 return ret;
88
89 goto force_commit;
90 }
91
92 /*
93 * So we know that there has been no forced data flush. If the inode
94 * is marked IS_SYNC, we need to force one ourselves.
95 */
96 if (!IS_SYNC(inode))
97 return ret;
98
99 /*
100 * Open question #2 --- should we force data to disk here too? If we
101 * don't, the only impact is that data=writeback filesystems won't
102 * flush data to disk automatically on IS_SYNC, only metadata (but
103 * historically, that is what ext2 has done.)
104 */
105
106force_commit:
107 err = ext3_force_commit(inode->i_sb);
108 if (err)
109 return err;
110 return ret;
111}
112
113const struct file_operations ext3_file_operations = { 54const struct file_operations ext3_file_operations = {
114 .llseek = generic_file_llseek, 55 .llseek = generic_file_llseek,
115 .read = do_sync_read, 56 .read = do_sync_read,
116 .write = do_sync_write, 57 .write = do_sync_write,
117 .aio_read = generic_file_aio_read, 58 .aio_read = generic_file_aio_read,
118 .aio_write = ext3_file_write, 59 .aio_write = generic_file_aio_write,
119 .unlocked_ioctl = ext3_ioctl, 60 .unlocked_ioctl = ext3_ioctl,
120#ifdef CONFIG_COMPAT 61#ifdef CONFIG_COMPAT
121 .compat_ioctl = ext3_compat_ioctl, 62 .compat_ioctl = ext3_compat_ioctl,
@@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = {
137 .listxattr = ext3_listxattr, 78 .listxattr = ext3_listxattr,
138 .removexattr = generic_removexattr, 79 .removexattr = generic_removexattr,
139#endif 80#endif
140 .permission = ext3_permission, 81 .check_acl = ext3_check_acl,
141 .fiemap = ext3_fiemap, 82 .fiemap = ext3_fiemap,
142}; 83};
143 84
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..451d166bbe93 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/blkdev.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
73 } 74 }
74 75
75 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 76 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
76 goto out; 77 goto flush;
77 78
78 /* 79 /*
79 * The VFS has written the file data. If the inode is unaltered 80 * The VFS has written the file data. If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
85 .nr_to_write = 0, /* sys_fsync did this */ 86 .nr_to_write = 0, /* sys_fsync did this */
86 }; 87 };
87 ret = sync_inode(inode, &wbc); 88 ret = sync_inode(inode, &wbc);
89 goto out;
88 } 90 }
91flush:
92 /*
93 * In case we didn't commit a transaction, we have to flush
94 * disk caches manually so that data really is on persistent
95 * storage
96 */
97 if (test_opt(inode->i_sb, BARRIER))
98 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
89out: 99out:
90 return ret; 100 return ret;
91} 101}
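
The flush label above covers the case where fsync committed no journal transaction (datasync with clean metadata, or a bare sync_inode()): nothing has forced the drive's volatile write cache, so ext3 now issues the flush itself when barriers are enabled. A stubbed-out sketch of that decision (stand-in predicates, not the kernel routine):

#include <stdbool.h>
#include <stdio.h>

static bool barriers_enabled = true;

static bool commit_journal(void)      { return false; } /* no transaction ran */
static bool metadata_is_clean(void)   { return true; }
static void sync_inode_metadata(void) { }
static void blkdev_flush(void)        { puts("flush disk write cache"); }

static int sync_file_sketch(bool datasync)
{
        if (commit_journal())
                return 0;               /* the commit block carried the barrier */
        if (datasync && metadata_is_clean())
                goto flush;             /* skip the metadata writeback entirely */
        sync_inode_metadata();          /* writes the inode, flushes no caches */
flush:
        if (barriers_enabled)
                blkdev_flush();         /* data must reach persistent storage */
        return 0;
}

int main(void)
{
        return sync_file_sketch(true);
}
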
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b49908a167ae..acf1b1423327 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
172 * so before we call here everything must be consistently dirtied against 172 * so before we call here everything must be consistently dirtied against
173 * this transaction. 173 * this transaction.
174 */ 174 */
175static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) 175static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
176{ 176{
177 int ret;
178
177 jbd_debug(2, "restarting handle %p\n", handle); 179 jbd_debug(2, "restarting handle %p\n", handle);
178 return ext3_journal_restart(handle, blocks_for_truncate(inode)); 180 /*
 181 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle.
 182 * At this moment, get_block can be called only for blocks inside
 183 * i_size since the page cache has already been dropped and writes
 184 * are blocked by i_mutex. So we can safely drop the truncate_mutex.
185 */
186 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
187 ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
188 mutex_lock(&EXT3_I(inode)->truncate_mutex);
189 return ret;
179} 190}
180 191
181/* 192/*
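
truncate_restart_transaction() is the classic unlock-call-relock shape: the journal restart may block on work that itself needs truncate_mutex, so the mutex is dropped for the duration and retaken before returning, and the caller sees it held throughout. A pthread sketch of the pattern:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t truncate_mutex = PTHREAD_MUTEX_INITIALIZER;

static int journal_restart(void)
{
        /* may block on the journal, which in turn may need to call back
         * into get_block paths that take truncate_mutex */
        return 0;
}

/* Called with truncate_mutex held; returns with it held again. */
static int restart_locked(void)
{
        int ret;

        pthread_mutex_unlock(&truncate_mutex);  /* avoid A->B, B->A deadlock */
        ret = journal_restart();
        pthread_mutex_lock(&truncate_mutex);    /* caller still expects it held */
        return ret;
}

int main(void)
{
        pthread_mutex_lock(&truncate_mutex);
        printf("restart: %d\n", restart_locked());
        pthread_mutex_unlock(&truncate_mutex);
        return 0;
}
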
@@ -1819,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = {
1819 .direct_IO = ext3_direct_IO, 1830 .direct_IO = ext3_direct_IO,
1820 .migratepage = buffer_migrate_page, 1831 .migratepage = buffer_migrate_page,
1821 .is_partially_uptodate = block_is_partially_uptodate, 1832 .is_partially_uptodate = block_is_partially_uptodate,
1833 .error_remove_page = generic_error_remove_page,
1822}; 1834};
1823 1835
1824static const struct address_space_operations ext3_writeback_aops = { 1836static const struct address_space_operations ext3_writeback_aops = {
@@ -1834,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = {
1834 .direct_IO = ext3_direct_IO, 1846 .direct_IO = ext3_direct_IO,
1835 .migratepage = buffer_migrate_page, 1847 .migratepage = buffer_migrate_page,
1836 .is_partially_uptodate = block_is_partially_uptodate, 1848 .is_partially_uptodate = block_is_partially_uptodate,
1849 .error_remove_page = generic_error_remove_page,
1837}; 1850};
1838 1851
1839static const struct address_space_operations ext3_journalled_aops = { 1852static const struct address_space_operations ext3_journalled_aops = {
@@ -1848,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = {
1848 .invalidatepage = ext3_invalidatepage, 1861 .invalidatepage = ext3_invalidatepage,
1849 .releasepage = ext3_releasepage, 1862 .releasepage = ext3_releasepage,
1850 .is_partially_uptodate = block_is_partially_uptodate, 1863 .is_partially_uptodate = block_is_partially_uptodate,
1864 .error_remove_page = generic_error_remove_page,
1851}; 1865};
1852 1866
1853void ext3_set_aops(struct inode *inode) 1867void ext3_set_aops(struct inode *inode)
@@ -2072,7 +2086,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2072 ext3_journal_dirty_metadata(handle, bh); 2086 ext3_journal_dirty_metadata(handle, bh);
2073 } 2087 }
2074 ext3_mark_inode_dirty(handle, inode); 2088 ext3_mark_inode_dirty(handle, inode);
2075 ext3_journal_test_restart(handle, inode); 2089 truncate_restart_transaction(handle, inode);
2076 if (bh) { 2090 if (bh) {
2077 BUFFER_TRACE(bh, "retaking write access"); 2091 BUFFER_TRACE(bh, "retaking write access");
2078 ext3_journal_get_write_access(handle, bh); 2092 ext3_journal_get_write_access(handle, bh);
@@ -2282,7 +2296,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2282 return; 2296 return;
2283 if (try_to_extend_transaction(handle, inode)) { 2297 if (try_to_extend_transaction(handle, inode)) {
2284 ext3_mark_inode_dirty(handle, inode); 2298 ext3_mark_inode_dirty(handle, inode);
2285 ext3_journal_test_restart(handle, inode); 2299 truncate_restart_transaction(handle, inode);
2286 } 2300 }
2287 2301
2288 ext3_free_blocks(handle, inode, nr, 1); 2302 ext3_free_blocks(handle, inode, nr, 1);
@@ -2892,6 +2906,10 @@ static int ext3_do_update_inode(handle_t *handle,
2892 struct buffer_head *bh = iloc->bh; 2906 struct buffer_head *bh = iloc->bh;
2893 int err = 0, rc, block; 2907 int err = 0, rc, block;
2894 2908
2909again:
 2910 /* we can't allow multiple procs in here at once; it's a bit racy */
2911 lock_buffer(bh);
2912
 2895 /* For fields not tracked in the in-memory inode, 2913 /* For fields not tracked in the in-memory inode,
2896 * initialise them to zero for new inodes. */ 2914 * initialise them to zero for new inodes. */
2897 if (ei->i_state & EXT3_STATE_NEW) 2915 if (ei->i_state & EXT3_STATE_NEW)
@@ -2951,16 +2969,20 @@ static int ext3_do_update_inode(handle_t *handle,
2951 /* If this is the first large file 2969 /* If this is the first large file
2952 * created, add a flag to the superblock. 2970 * created, add a flag to the superblock.
2953 */ 2971 */
2972 unlock_buffer(bh);
2954 err = ext3_journal_get_write_access(handle, 2973 err = ext3_journal_get_write_access(handle,
2955 EXT3_SB(sb)->s_sbh); 2974 EXT3_SB(sb)->s_sbh);
2956 if (err) 2975 if (err)
2957 goto out_brelse; 2976 goto out_brelse;
2977
2958 ext3_update_dynamic_rev(sb); 2978 ext3_update_dynamic_rev(sb);
2959 EXT3_SET_RO_COMPAT_FEATURE(sb, 2979 EXT3_SET_RO_COMPAT_FEATURE(sb,
2960 EXT3_FEATURE_RO_COMPAT_LARGE_FILE); 2980 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2961 handle->h_sync = 1; 2981 handle->h_sync = 1;
2962 err = ext3_journal_dirty_metadata(handle, 2982 err = ext3_journal_dirty_metadata(handle,
2963 EXT3_SB(sb)->s_sbh); 2983 EXT3_SB(sb)->s_sbh);
2984 /* get our lock and start over */
2985 goto again;
2964 } 2986 }
2965 } 2987 }
2966 } 2988 }
@@ -2983,6 +3005,7 @@ static int ext3_do_update_inode(handle_t *handle,
2983 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 3005 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2984 3006
2985 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 3007 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
3008 unlock_buffer(bh);
2986 rc = ext3_journal_dirty_metadata(handle, bh); 3009 rc = ext3_journal_dirty_metadata(handle, bh);
2987 if (!err) 3010 if (!err)
2988 err = rc; 3011 err = rc;
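
The again: loop added to ext3_do_update_inode() serializes raw-inode writers with the buffer lock, but the one-time large-file superblock update may sleep, so the lock is dropped, the feature flag set, and the whole update restarted from the top. A compact C11 sketch of that drop-and-retry shape (an atomic_flag stands in for lock_buffer()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag bh_lock = ATOMIC_FLAG_INIT;
static bool need_large_file_feature = true;

static void lock_buffer(void)   { while (atomic_flag_test_and_set(&bh_lock)) ; }
static void unlock_buffer(void) { atomic_flag_clear(&bh_lock); }

static void update_inode(void)
{
again:
        lock_buffer();                  /* one writer of the raw inode at a time */
        if (need_large_file_feature) {
                unlock_buffer();        /* sb update may sleep: drop the lock */
                need_large_file_feature = false;   /* set the RO_COMPAT flag */
                goto again;             /* retake the lock and start over */
        }
        /* ...copy in-memory inode fields into the raw inode... */
        unlock_buffer();
}

int main(void)
{
        update_inode();
        puts("inode updated");
        return 0;
}
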
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b9730234..aad6400c9b77 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
2445 .listxattr = ext3_listxattr, 2445 .listxattr = ext3_listxattr,
2446 .removexattr = generic_removexattr, 2446 .removexattr = generic_removexattr,
2447#endif 2447#endif
2448 .permission = ext3_permission, 2448 .check_acl = ext3_check_acl,
2449}; 2449};
2450 2450
2451const struct inode_operations ext3_special_inode_operations = { 2451const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
2456 .listxattr = ext3_listxattr, 2456 .listxattr = ext3_listxattr,
2457 .removexattr = generic_removexattr, 2457 .removexattr = generic_removexattr,
2458#endif 2458#endif
2459 .permission = ext3_permission, 2459 .check_acl = ext3_check_acl,
2460}; 2460};
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a8d80a7f1105..72743d360509 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -720,7 +720,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
720static ssize_t ext3_quota_write(struct super_block *sb, int type, 720static ssize_t ext3_quota_write(struct super_block *sb, int type,
721 const char *data, size_t len, loff_t off); 721 const char *data, size_t len, loff_t off);
722 722
723static struct dquot_operations ext3_quota_operations = { 723static const struct dquot_operations ext3_quota_operations = {
724 .initialize = dquot_initialize, 724 .initialize = dquot_initialize,
725 .drop = dquot_drop, 725 .drop = dquot_drop,
726 .alloc_space = dquot_alloc_space, 726 .alloc_space = dquot_alloc_space,
@@ -737,7 +737,7 @@ static struct dquot_operations ext3_quota_operations = {
737 .destroy_dquot = dquot_destroy, 737 .destroy_dquot = dquot_destroy,
738}; 738};
739 739
740static struct quotactl_ops ext3_qctl_operations = { 740static const struct quotactl_ops ext3_qctl_operations = {
741 .quota_on = ext3_quota_on, 741 .quota_on = ext3_quota_on,
742 .quota_off = vfs_quota_off, 742 .quota_off = vfs_quota_off,
743 .quota_sync = vfs_quota_sync, 743 .quota_sync = vfs_quota_sync,
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config EXT4DEV_COMPAT
30 bool "Enable ext4dev compatibility"
31 depends on EXT4_FS
32 help
33 Starting with 2.6.28, the name of the ext4 filesystem was
 34 renamed from ext4dev to ext4. Unfortunately, some
 35 legacy userspace programs (such as klibc's fstype) have
36 "ext4dev" hardcoded.
37
38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev,
 40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed!
42
43config EXT4_FS_XATTR 29config EXT4_FS_XATTR
44 bool "Ext4 extended attributes" 30 bool "Ext4 extended attributes"
45 depends on EXT4_FS 31 depends on EXT4_FS
@@ -77,3 +63,12 @@ config EXT4_FS_SECURITY
77 63
78 If you are not using a security module that requires using 64 If you are not using a security module that requires using
79 extended attributes for file security labels, say N. 65 extended attributes for file security labels, say N.
66
67config EXT4_DEBUG
68 bool "EXT4 debugging support"
69 depends on EXT4_FS
70 help
71 Enables run-time debugging support for the ext4 filesystem.
72
73 If you select Y here, then you will be able to turn on debugging
74 with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149ca..0df88b2a69b0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
236 return error; 236 return error;
237} 237}
238 238
239static int 239int
240ext4_check_acl(struct inode *inode, int mask) 240ext4_check_acl(struct inode *inode, int mask)
241{ 241{
242 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); 242 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
252 return -EAGAIN; 252 return -EAGAIN;
253} 253}
254 254
255int
256ext4_permission(struct inode *inode, int mask)
257{
258 return generic_permission(inode, mask, ext4_check_acl);
259}
260
261/* 255/*
262 * Initialize the ACLs of a new inode. Called from ext4_new_inode. 256 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
263 * 257 *
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba6..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
54#ifdef CONFIG_EXT4_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext4_permission(struct inode *, int); 57extern int ext4_check_acl(struct inode *, int);
58extern int ext4_acl_chmod(struct inode *); 58extern int ext4_acl_chmod(struct inode *);
59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); 59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
60 60
61#else /* CONFIG_EXT4_FS_POSIX_ACL */ 61#else /* CONFIG_EXT4_FS_POSIX_ACL */
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext4_permission NULL 63#define ext4_check_acl NULL
64 64
65static inline int 65static inline int
66ext4_acl_chmod(struct inode *inode) 66ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
478 * new bitmap information 478 * new bitmap information
479 */ 479 */
480 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 480 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
481 ext4_mb_update_group_info(grp, blocks_freed); 481 grp->bb_free += blocks_freed;
482 up_write(&grp->alloc_sem); 482 up_write(&grp->alloc_sem);
483 483
484 /* We dirtied the bitmap block */ 484 /* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..984ca0cb38c3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,29 +65,37 @@ typedef __u32 ext4_lblk_t;
65/* data type for block group number */ 65/* data type for block group number */
66typedef unsigned int ext4_group_t; 66typedef unsigned int ext4_group_t;
67 67
68/*
69 * Flags used in mballoc's allocation_context flags field.
70 *
71 * Also used to show what's going on for debugging purposes when the
 72 * flag field is exported via the tracepoint interface
73 */
68 74
69/* prefer goal again. length */ 75/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 1 76#define EXT4_MB_HINT_MERGE 0x0001
71/* blocks already reserved */ 77/* blocks already reserved */
72#define EXT4_MB_HINT_RESERVED 2 78#define EXT4_MB_HINT_RESERVED 0x0002
73/* metadata is being allocated */ 79/* metadata is being allocated */
74#define EXT4_MB_HINT_METADATA 4 80#define EXT4_MB_HINT_METADATA 0x0004
75/* first blocks in the file */ 81/* first blocks in the file */
76#define EXT4_MB_HINT_FIRST 8 82#define EXT4_MB_HINT_FIRST 0x0008
77/* search for the best chunk */ 83/* search for the best chunk */
78#define EXT4_MB_HINT_BEST 16 84#define EXT4_MB_HINT_BEST 0x0010
79/* data is being allocated */ 85/* data is being allocated */
80#define EXT4_MB_HINT_DATA 32 86#define EXT4_MB_HINT_DATA 0x0020
81/* don't preallocate (for tails) */ 87/* don't preallocate (for tails) */
82#define EXT4_MB_HINT_NOPREALLOC 64 88#define EXT4_MB_HINT_NOPREALLOC 0x0040
83/* allocate for locality group */ 89/* allocate for locality group */
84#define EXT4_MB_HINT_GROUP_ALLOC 128 90#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
85/* allocate goal blocks or none */ 91/* allocate goal blocks or none */
86#define EXT4_MB_HINT_GOAL_ONLY 256 92#define EXT4_MB_HINT_GOAL_ONLY 0x0100
87/* goal is meaningful */ 93/* goal is meaningful */
88#define EXT4_MB_HINT_TRY_GOAL 512 94#define EXT4_MB_HINT_TRY_GOAL 0x0200
89/* blocks already pre-reserved by delayed allocation */ 95/* blocks already pre-reserved by delayed allocation */
90#define EXT4_MB_DELALLOC_RESERVED 1024 96#define EXT4_MB_DELALLOC_RESERVED 0x0400
97/* We are doing stream allocation */
98#define EXT4_MB_STREAM_ALLOC 0x0800
91 99
92 100
93struct ext4_allocation_request { 101struct ext4_allocation_request {
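
Renumbering the mballoc hints in hex makes it obvious that each value is a single bit, so they compose with | and test with & like any flag word. A trivial sketch of the intended usage (local stand-in names, not the ext4 definitions):

#include <stdio.h>

#define MB_HINT_MERGE        0x0001
#define MB_HINT_DATA         0x0020
#define MB_HINT_NOPREALLOC   0x0040
#define MB_STREAM_ALLOC      0x0800

int main(void)
{
        unsigned int ac_flags = MB_HINT_DATA | MB_STREAM_ALLOC;

        if (ac_flags & MB_HINT_DATA)
                puts("data allocation");
        if (!(ac_flags & MB_HINT_NOPREALLOC))
                puts("preallocation allowed");

        ac_flags |= MB_HINT_MERGE;      /* set a flag */
        ac_flags &= ~MB_STREAM_ALLOC;   /* clear a flag */
        printf("flags now 0x%04x\n", ac_flags);
        return 0;
}
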
@@ -112,6 +120,31 @@ struct ext4_allocation_request {
112}; 120};
113 121
114/* 122/*
123 * For delayed allocation tracking
124 */
125struct mpage_da_data {
126 struct inode *inode;
127 sector_t b_blocknr; /* start block number of extent */
128 size_t b_size; /* size of extent */
129 unsigned long b_state; /* state of the extent */
130 unsigned long first_page, next_page; /* extent of pages */
131 struct writeback_control *wbc;
132 int io_done;
133 int pages_written;
134 int retval;
135};
136#define DIO_AIO_UNWRITTEN 0x1
137typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */
142 ext4_lblk_t offset; /* offset in the file */
143 size_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */
145} ext4_io_end_t;
146
147/*
115 * Special inodes numbers 148 * Special inodes numbers
116 */ 149 */
117#define EXT4_BAD_INO 1 /* Bad blocks inode */ 150#define EXT4_BAD_INO 1 /* Bad blocks inode */
@@ -251,7 +284,6 @@ struct flex_groups {
251#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 284#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
252#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ 285#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
253#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 286#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
254#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
255#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 287#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
256 288
257#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 289#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +321,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
289#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ 321#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
290#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 322#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
291#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ 323#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
324#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
292 325
293/* Used to pass group descriptor data when online resize is done */ 326/* Used to pass group descriptor data when online resize is done */
294struct ext4_new_group_input { 327struct ext4_new_group_input {
@@ -330,7 +363,16 @@ struct ext4_new_group_data {
330 /* Call ext4_da_update_reserve_space() after successfully 363 /* Call ext4_da_update_reserve_space() after successfully
331 allocating the blocks */ 364 allocating the blocks */
332#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 365#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
 333 366 /* caller is from the direct IO path; request creation of an
 367 uninitialized extent if not allocated, and split the uninitialized
 368 extent if blocks have been preallocated already */
369#define EXT4_GET_BLOCKS_DIO 0x0010
370#define EXT4_GET_BLOCKS_CONVERT 0x0020
371#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
372 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
373 /* Convert extent to initialized after direct IO complete */
374#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
375 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
334 376
335/* 377/*
336 * ioctl commands 378 * ioctl commands
@@ -386,6 +428,9 @@ struct ext4_mount_options {
386#endif 428#endif
387}; 429};
388 430
 431/* Max physical block we can address w/o extents */
432#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
433
389/* 434/*
390 * Structure of an inode on the disk 435 * Structure of an inode on the disk
391 */ 436 */
@@ -456,7 +501,6 @@ struct move_extent {
456 __u64 len; /* block length to be moved */ 501 __u64 len; /* block length to be moved */
457 __u64 moved_len; /* moved block length */ 502 __u64 moved_len; /* moved block length */
458}; 503};
459#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
460 504
461#define EXT4_EPOCH_BITS 2 505#define EXT4_EPOCH_BITS 2
462#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) 506#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -481,8 +525,8 @@ struct move_extent {
481static inline __le32 ext4_encode_extra_time(struct timespec *time) 525static inline __le32 ext4_encode_extra_time(struct timespec *time)
482{ 526{
483 return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 527 return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
484 time->tv_sec >> 32 : 0) | 528 (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
485 ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); 529 ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
486} 530}
487 531
488static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 532static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -490,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
490 if (sizeof(time->tv_sec) > 4) 534 if (sizeof(time->tv_sec) > 4)
491 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) 535 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
492 << 32; 536 << 32;
493 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; 537 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
494} 538}
495 539
496#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ 540#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
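
The encode fix matters on 64-bit tv_sec: without the EXT4_EPOCH_MASK masking, epoch bits above bit 1 would spill into the packed nanosecond field. A host-endian roundtrip sketch of the packing (the real code operates on little-endian __le32 values):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EPOCH_BITS  2
#define EPOCH_MASK  ((1u << EPOCH_BITS) - 1)
#define NSEC_MASK   (~0u << EPOCH_BITS)

struct ts { int64_t tv_sec; uint32_t tv_nsec; };

static uint32_t encode_extra(const struct ts *t)
{
        return (((uint64_t)t->tv_sec >> 32) & EPOCH_MASK) |
               ((t->tv_nsec << EPOCH_BITS) & NSEC_MASK);
}

static void decode_extra(struct ts *t, uint32_t extra)
{
        t->tv_sec |= (int64_t)(extra & EPOCH_MASK) << 32;
        t->tv_nsec = (extra & NSEC_MASK) >> EPOCH_BITS;
}

int main(void)
{
        /* a post-2038 timestamp: needs one epoch bit above bit 31 */
        struct ts in = { (1LL << 32) + 1234567, 999999999 };
        struct ts out = { (uint32_t)in.tv_sec, 0 };    /* low 32 bits on disk */

        decode_extra(&out, encode_extra(&in));
        assert(out.tv_sec == in.tv_sec && out.tv_nsec == in.tv_nsec);
        printf("roundtrip ok: %lld.%09u\n", (long long)out.tv_sec, out.tv_nsec);
        return 0;
}
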
@@ -653,6 +697,11 @@ struct ext4_inode_info {
653 __u16 i_extra_isize; 697 __u16 i_extra_isize;
654 698
655 spinlock_t i_block_reservation_lock; 699 spinlock_t i_block_reservation_lock;
700
701 /* completed async DIOs that might need unwritten extents handling */
702 struct list_head i_aio_dio_complete_list;
703 /* current io_end structure for async DIO write*/
704 ext4_io_end_t *cur_aio_dio;
656}; 705};
657 706
658/* 707/*
@@ -694,7 +743,6 @@ struct ext4_inode_info {
694#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 743#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
695#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 744#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
696#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 745#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
697#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
698#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 746#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
699#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 747#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
700#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 748#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -841,6 +889,7 @@ struct ext4_sb_info {
841 unsigned long s_gdb_count; /* Number of group descriptor blocks */ 889 unsigned long s_gdb_count; /* Number of group descriptor blocks */
842 unsigned long s_desc_per_block; /* Number of group descriptors per block */ 890 unsigned long s_desc_per_block; /* Number of group descriptors per block */
843 ext4_group_t s_groups_count; /* Number of groups in the fs */ 891 ext4_group_t s_groups_count; /* Number of groups in the fs */
892 ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
844 unsigned long s_overhead_last; /* Last calculated overhead */ 893 unsigned long s_overhead_last; /* Last calculated overhead */
845 unsigned long s_blocks_last; /* Last seen block count */ 894 unsigned long s_blocks_last; /* Last seen block count */
846 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 895 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -923,18 +972,11 @@ struct ext4_sb_info {
923 unsigned int s_mb_stats; 972 unsigned int s_mb_stats;
924 unsigned int s_mb_order2_reqs; 973 unsigned int s_mb_order2_reqs;
925 unsigned int s_mb_group_prealloc; 974 unsigned int s_mb_group_prealloc;
975 unsigned int s_max_writeback_mb_bump;
926 /* where last allocation was done - for stream allocation */ 976 /* where last allocation was done - for stream allocation */
927 unsigned long s_mb_last_group; 977 unsigned long s_mb_last_group;
928 unsigned long s_mb_last_start; 978 unsigned long s_mb_last_start;
929 979
930 /* history to debug policy */
931 struct ext4_mb_history *s_mb_history;
932 int s_mb_history_cur;
933 int s_mb_history_max;
934 int s_mb_history_num;
935 spinlock_t s_mb_history_lock;
936 int s_mb_history_filter;
937
938 /* stats for buddy allocator */ 980 /* stats for buddy allocator */
939 spinlock_t s_mb_pa_lock; 981 spinlock_t s_mb_pa_lock;
940 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 982 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -950,6 +992,7 @@ struct ext4_sb_info {
950 atomic_t s_mb_lost_chunks; 992 atomic_t s_mb_lost_chunks;
951 atomic_t s_mb_preallocated; 993 atomic_t s_mb_preallocated;
952 atomic_t s_mb_discarded; 994 atomic_t s_mb_discarded;
995 atomic_t s_lock_busy;
953 996
954 /* locality groups */ 997 /* locality groups */
955 struct ext4_locality_group *s_locality_groups; 998 struct ext4_locality_group *s_locality_groups;
@@ -960,6 +1003,9 @@ struct ext4_sb_info {
960 1003
961 unsigned int s_log_groups_per_flex; 1004 unsigned int s_log_groups_per_flex;
962 struct flex_groups *s_flex_groups; 1005 struct flex_groups *s_flex_groups;
1006
1007 /* workqueue for dio unwritten */
1008 struct workqueue_struct *dio_unwritten_wq;
963}; 1009};
964 1010
965static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1011static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1340,8 +1386,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1340 ext4_fsblk_t, unsigned long, int, unsigned long *); 1386 ext4_fsblk_t, unsigned long, int, unsigned long *);
1341extern int ext4_mb_add_groupinfo(struct super_block *sb, 1387extern int ext4_mb_add_groupinfo(struct super_block *sb,
1342 ext4_group_t i, struct ext4_group_desc *desc); 1388 ext4_group_t i, struct ext4_group_desc *desc);
1343extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1344 ext4_grpblk_t add);
1345extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1389extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1346extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1390extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1347 ext4_group_t, int); 1391 ext4_group_t, int);
@@ -1367,6 +1411,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
1367extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1411extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1368extern int ext4_can_truncate(struct inode *inode); 1412extern int ext4_can_truncate(struct inode *inode);
1369extern void ext4_truncate(struct inode *); 1413extern void ext4_truncate(struct inode *);
1414extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
1370extern void ext4_set_inode_flags(struct inode *); 1415extern void ext4_set_inode_flags(struct inode *);
1371extern void ext4_get_inode_flags(struct ext4_inode_info *); 1416extern void ext4_get_inode_flags(struct ext4_inode_info *);
1372extern int ext4_alloc_da_blocks(struct inode *inode); 1417extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1378,7 +1423,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1378 struct address_space *mapping, loff_t from); 1423 struct address_space *mapping, loff_t from);
1379extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1424extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1380extern qsize_t ext4_get_reserved_space(struct inode *inode); 1425extern qsize_t ext4_get_reserved_space(struct inode *inode);
1381 1426extern int flush_aio_dio_completed_IO(struct inode *inode);
1382/* ioctl.c */ 1427/* ioctl.c */
1383extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1428extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1384extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1429extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1575,15 +1620,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1575struct ext4_group_info { 1620struct ext4_group_info {
1576 unsigned long bb_state; 1621 unsigned long bb_state;
1577 struct rb_root bb_free_root; 1622 struct rb_root bb_free_root;
1578 unsigned short bb_first_free; 1623 ext4_grpblk_t bb_first_free; /* first free block */
1579 unsigned short bb_free; 1624 ext4_grpblk_t bb_free; /* total free blocks */
1580 unsigned short bb_fragments; 1625 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1581 struct list_head bb_prealloc_list; 1626 struct list_head bb_prealloc_list;
1582#ifdef DOUBLE_CHECK 1627#ifdef DOUBLE_CHECK
1583 void *bb_bitmap; 1628 void *bb_bitmap;
1584#endif 1629#endif
1585 struct rw_semaphore alloc_sem; 1630 struct rw_semaphore alloc_sem;
1586 unsigned short bb_counters[]; 1631 ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
1632 * regions, index is order.
1633 * bb_counters[3] = 5 means
1634 * 5 free 8-block regions. */
1587}; 1635};
1588 1636
1589#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 1637#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
@@ -1591,15 +1639,42 @@ struct ext4_group_info {
1591#define EXT4_MB_GRP_NEED_INIT(grp) \ 1639#define EXT4_MB_GRP_NEED_INIT(grp) \
1592 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 1640 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1593 1641
1642#define EXT4_MAX_CONTENTION 8
1643#define EXT4_CONTENTION_THRESHOLD 2
1644
1594static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, 1645static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
1595 ext4_group_t group) 1646 ext4_group_t group)
1596{ 1647{
1597 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); 1648 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
1598} 1649}
1599 1650
1651/*
1652 * Returns true if the filesystem is busy enough that attempts to
1653 * access the block group locks has run into contention.
1654 */
1655static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
1656{
1657 return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
1658}
1659
1600static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 1660static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1601{ 1661{
1602 spin_lock(ext4_group_lock_ptr(sb, group)); 1662 spinlock_t *lock = ext4_group_lock_ptr(sb, group);
1663 if (spin_trylock(lock))
1664 /*
1665 * We're able to grab the lock right away, so drop the
1666 * lock contention counter.
1667 */
1668 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
1669 else {
1670 /*
1671 * The lock is busy, so bump the contention counter,
1672 * and then wait on the spin lock.
1673 */
1674 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
1675 EXT4_MAX_CONTENTION);
1676 spin_lock(lock);
1677 }
1603} 1678}
1604 1679
1605static inline void ext4_unlock_group(struct super_block *sb, 1680static inline void ext4_unlock_group(struct super_block *sb,
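
The new ext4_lock_group() keeps a saturating per-filesystem contention counter: a successful trylock decays it, a miss bumps it toward EXT4_MAX_CONTENTION before spinning, and ext4_fs_is_busy() reads it to steer allocation policy. A C11 sketch of the same scheme, with a pthread mutex standing in for the group spinlock:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define MAX_CONTENTION        8
#define CONTENTION_THRESHOLD  2

static atomic_int lock_busy;
static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

/* Bounded add, mimicking the kernel's atomic_add_unless(). */
static void add_unless(atomic_int *v, int delta, int unless)
{
        int old = atomic_load(v);
        while (old != unless &&
               !atomic_compare_exchange_weak(v, &old, old + delta))
                ;
}

static void lock_group(void)
{
        if (pthread_mutex_trylock(&group_lock) == 0) {
                add_unless(&lock_busy, -1, 0);             /* uncontended: decay */
        } else {
                add_unless(&lock_busy, 1, MAX_CONTENTION); /* contended: bump */
                pthread_mutex_lock(&group_lock);
        }
}

static void unlock_group(void)
{
        pthread_mutex_unlock(&group_lock);
}

static int fs_is_busy(void)
{
        return atomic_load(&lock_busy) > CONTENTION_THRESHOLD;
}

int main(void)
{
        lock_group();
        printf("busy: %d\n", fs_is_busy());
        unlock_group();
        return 0;
}
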
@@ -1650,6 +1725,8 @@ extern void ext4_ext_init(struct super_block *);
1650extern void ext4_ext_release(struct super_block *); 1725extern void ext4_ext_release(struct super_block *);
1651extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1726extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1652 loff_t len); 1727 loff_t len);
1728extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1729 loff_t len);
1653extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1730extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1654 sector_t block, unsigned int max_blocks, 1731 sector_t block, unsigned int max_blocks,
1655 struct buffer_head *bh, int flags); 1732 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
43#define CHECK_BINSEARCH__ 43#define CHECK_BINSEARCH__
44 44
45/* 45/*
46 * If EXT_DEBUG is defined you can use the 'extdebug' mount option 46 * Turn on EXT_DEBUG to get lots of info about extents operations.
47 * to get lots of info about what's going on.
48 */ 47 */
49#define EXT_DEBUG__ 48#define EXT_DEBUG__
50#ifdef EXT_DEBUG 49#ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
138#define EXT_BREAK 1 137#define EXT_BREAK 1
139#define EXT_REPEAT 2 138#define EXT_REPEAT 2
140 139
140/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
141#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
142 142
143/* 143/*
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
221} 221}
222 222
223static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
224{
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226}
227
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
235 struct ext4_ext_path *path, 240 struct ext4_ext_path *path,
236 struct ext4_extent *); 241 struct ext4_extent *);
237extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 242extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
238extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 243extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
239extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, 244extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
240 ext_prepare_callback, void *); 245 ext_prepare_callback, void *);
241extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 246extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
44 handle, err); 44 handle, err);
45 } 45 }
46 else 46 else
47 brelse(bh); 47 bforget(bh);
48 return err; 48 return err;
49} 49}
50 50
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
60 handle, err); 60 handle, err);
61 } 61 }
62 else 62 else
63 brelse(bh); 63 bforget(bh);
64 return err; 64 return err;
65} 65}
66 66
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
89 ext4_journal_abort_handle(where, __func__, bh, 89 ext4_journal_abort_handle(where, __func__, bh,
90 handle, err); 90 handle, err);
91 } else { 91 } else {
92 mark_buffer_dirty(bh); 92 if (inode && bh)
93 mark_buffer_dirty_inode(bh, inode);
94 else
95 mark_buffer_dirty(bh);
93 if (inode && inode_needs_sync(inode)) { 96 if (inode && inode_needs_sync(inode)) {
94 sync_dirty_buffer(bh); 97 sync_dirty_buffer(bh);
95 if (buffer_req(bh) && !buffer_uptodate(bh)) { 98 if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..a2865980342f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
162int __ext4_journal_stop(const char *where, handle_t *handle); 162int __ext4_journal_stop(const char *where, handle_t *handle);
163 163
164#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) 164#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
165 165
166/* Note: Do not use this for NULL handles. This is only to determine if
167 * a properly allocated handle is using a journal or not. */
166static inline int ext4_handle_valid(handle_t *handle) 168static inline int ext4_handle_valid(handle_t *handle)
167{ 169{
168 if (handle == EXT4_NOJOURNAL_HANDLE) 170 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
169 return 0; 171 return 0;
170 return 1; 172 return 1;
171} 173}
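The ext4_handle_valid() change above replaces the single 0x1 sentinel with a whole reserved range: any pointer value below 4096 is treated as a fake no-journal handle, which leaves room to encode a nesting count in the pointer bits themselves. A hedged userspace sketch of that encoding; nojournal_nest() is a hypothetical helper, not part of the patch:

        #include <assert.h>
        #include <stdio.h>

        typedef struct handle handle_t;

        /* mirrors EXT4_NOJOURNAL_MAX_REF_COUNT */
        #define NOJOURNAL_MAX_REF_COUNT 4096UL

        /* Per the patch's comment: only meaningful for non-NULL,
         * properly allocated handles. */
        static int handle_valid(handle_t *handle)
        {
                return (unsigned long)handle >= NOJOURNAL_MAX_REF_COUNT;
        }

        /* Hypothetical helper (not in the patch) showing why a whole
         * range is reserved: the low pointer values can carry a nesting
         * refcount for no-journal handles. */
        static handle_t *nojournal_nest(handle_t *handle)
        {
                unsigned long ref = (unsigned long)handle;

                assert(ref + 1 < NOJOURNAL_MAX_REF_COUNT);
                return (handle_t *)(ref + 1);
        }

        int main(void)
        {
                handle_t *h = (handle_t *)1;    /* outermost fake handle */

                h = nojournal_nest(h);          /* nested start: refcount 2 */
                printf("valid=%d refcount=%lu\n", handle_valid(h),
                       (unsigned long)h);
                return 0;
        }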
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..10539e364283 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94} 94}
95 95
96static int ext4_ext_journal_restart(handle_t *handle, int needed) 96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode,
98 int needed)
97{ 99{
98 int err; 100 int err;
99 101
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
104 err = ext4_journal_extend(handle, needed); 106 err = ext4_journal_extend(handle, needed);
105 if (err <= 0) 107 if (err <= 0)
106 return err; 108 return err;
107 return ext4_journal_restart(handle, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /*
111 * We have dropped i_data_sem so someone might have cached again
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115
116 return err;
108} 117}
109 118
110/* 119/*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
220 return newblock; 229 return newblock;
221} 230}
222 231
223static int ext4_ext_space_block(struct inode *inode) 232static inline int ext4_ext_space_block(struct inode *inode, int check)
224{ 233{
225 int size; 234 int size;
226 235
227 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 236 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
228 / sizeof(struct ext4_extent); 237 / sizeof(struct ext4_extent);
238 if (!check) {
229#ifdef AGGRESSIVE_TEST 239#ifdef AGGRESSIVE_TEST
230 if (size > 6) 240 if (size > 6)
231 size = 6; 241 size = 6;
232#endif 242#endif
243 }
233 return size; 244 return size;
234} 245}
235 246
236static int ext4_ext_space_block_idx(struct inode *inode) 247static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
237{ 248{
238 int size; 249 int size;
239 250
240 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 251 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
241 / sizeof(struct ext4_extent_idx); 252 / sizeof(struct ext4_extent_idx);
253 if (!check) {
242#ifdef AGGRESSIVE_TEST 254#ifdef AGGRESSIVE_TEST
243 if (size > 5) 255 if (size > 5)
244 size = 5; 256 size = 5;
245#endif 257#endif
258 }
246 return size; 259 return size;
247} 260}
248 261
249static int ext4_ext_space_root(struct inode *inode) 262static inline int ext4_ext_space_root(struct inode *inode, int check)
250{ 263{
251 int size; 264 int size;
252 265
253 size = sizeof(EXT4_I(inode)->i_data); 266 size = sizeof(EXT4_I(inode)->i_data);
254 size -= sizeof(struct ext4_extent_header); 267 size -= sizeof(struct ext4_extent_header);
255 size /= sizeof(struct ext4_extent); 268 size /= sizeof(struct ext4_extent);
269 if (!check) {
256#ifdef AGGRESSIVE_TEST 270#ifdef AGGRESSIVE_TEST
257 if (size > 3) 271 if (size > 3)
258 size = 3; 272 size = 3;
259#endif 273#endif
274 }
260 return size; 275 return size;
261} 276}
262 277
263static int ext4_ext_space_root_idx(struct inode *inode) 278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
264{ 279{
265 int size; 280 int size;
266 281
267 size = sizeof(EXT4_I(inode)->i_data); 282 size = sizeof(EXT4_I(inode)->i_data);
268 size -= sizeof(struct ext4_extent_header); 283 size -= sizeof(struct ext4_extent_header);
269 size /= sizeof(struct ext4_extent_idx); 284 size /= sizeof(struct ext4_extent_idx);
285 if (!check) {
270#ifdef AGGRESSIVE_TEST 286#ifdef AGGRESSIVE_TEST
271 if (size > 4) 287 if (size > 4)
272 size = 4; 288 size = 4;
273#endif 289#endif
290 }
274 return size; 291 return size;
275} 292}
276 293
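The new check argument above distinguishes two callers: validation code (ext4_ext_max_entries(), in the next hunk) passes 1 to get the true on-disk capacity of a node, while allocation paths keep passing 0 so that AGGRESSIVE_TEST builds can artificially shrink nodes and exercise deep trees. A small sketch of the two answers for a 4096-byte block; the 12-byte struct sizes are hard-coded here as assumptions:

        #include <stdio.h>

        /* on-disk sizes of struct ext4_extent_header and
         * struct ext4_extent, assumed to be 12 bytes each */
        #define EXT_HDR_SIZE    12
        #define EXT_ENTRY_SIZE  12

        #define AGGRESSIVE_TEST         /* comment out to drop the cap */

        /* mirrors ext4_ext_space_block(inode, check) for one blocksize */
        static int space_block(unsigned int blocksize, int check)
        {
                int size = (blocksize - EXT_HDR_SIZE) / EXT_ENTRY_SIZE;

                if (!check) {
        #ifdef AGGRESSIVE_TEST
                        if (size > 6)
                                size = 6;   /* tiny nodes force deep trees */
        #endif
                }
                return size;
        }

        int main(void)
        {
                printf("allocation capacity: %d\n", space_block(4096, 0)); /* 6 */
                printf("validation capacity: %d\n", space_block(4096, 1)); /* 340 */
                return 0;
        }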
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
284 int lcap, icap, rcap, leafs, idxs, num; 301 int lcap, icap, rcap, leafs, idxs, num;
285 int newextents = blocks; 302 int newextents = blocks;
286 303
287 rcap = ext4_ext_space_root_idx(inode); 304 rcap = ext4_ext_space_root_idx(inode, 0);
288 lcap = ext4_ext_space_block(inode); 305 lcap = ext4_ext_space_block(inode, 0);
289 icap = ext4_ext_space_block_idx(inode); 306 icap = ext4_ext_space_block_idx(inode, 0);
290 307
291 /* number of new leaf blocks needed */ 308 /* number of new leaf blocks needed */
292 num = leafs = (newextents + lcap - 1) / lcap; 309 num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
311 328
312 if (depth == ext_depth(inode)) { 329 if (depth == ext_depth(inode)) {
313 if (depth == 0) 330 if (depth == 0)
314 max = ext4_ext_space_root(inode); 331 max = ext4_ext_space_root(inode, 1);
315 else 332 else
316 max = ext4_ext_space_root_idx(inode); 333 max = ext4_ext_space_root_idx(inode, 1);
317 } else { 334 } else {
318 if (depth == 0) 335 if (depth == 0)
319 max = ext4_ext_space_block(inode); 336 max = ext4_ext_space_block(inode, 1);
320 else 337 else
321 max = ext4_ext_space_block_idx(inode); 338 max = ext4_ext_space_block_idx(inode, 1);
322 } 339 }
323 340
324 return max; 341 return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
437 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 454 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
438 idx_pblock(path->p_idx)); 455 idx_pblock(path->p_idx));
439 } else if (path->p_ext) { 456 } else if (path->p_ext) {
440 ext_debug(" %d:%d:%llu ", 457 ext_debug(" %d:[%d]%d:%llu ",
441 le32_to_cpu(path->p_ext->ee_block), 458 le32_to_cpu(path->p_ext->ee_block),
459 ext4_ext_is_uninitialized(path->p_ext),
442 ext4_ext_get_actual_len(path->p_ext), 460 ext4_ext_get_actual_len(path->p_ext),
443 ext_pblock(path->p_ext)); 461 ext_pblock(path->p_ext));
444 } else 462 } else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
460 eh = path[depth].p_hdr; 478 eh = path[depth].p_hdr;
461 ex = EXT_FIRST_EXTENT(eh); 479 ex = EXT_FIRST_EXTENT(eh);
462 480
481 ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
482
463 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 483 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
464 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), 484 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
485 ext4_ext_is_uninitialized(ex),
465 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 486 ext4_ext_get_actual_len(ex), ext_pblock(ex));
466 } 487 }
467 ext_debug("\n"); 488 ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
580 } 601 }
581 602
582 path->p_ext = l - 1; 603 path->p_ext = l - 1;
583 ext_debug(" -> %d:%llu:%d ", 604 ext_debug(" -> %d:%llu:[%d]%d ",
584 le32_to_cpu(path->p_ext->ee_block), 605 le32_to_cpu(path->p_ext->ee_block),
585 ext_pblock(path->p_ext), 606 ext_pblock(path->p_ext),
607 ext4_ext_is_uninitialized(path->p_ext),
586 ext4_ext_get_actual_len(path->p_ext)); 608 ext4_ext_get_actual_len(path->p_ext));
587 609
588#ifdef CHECK_BINSEARCH 610#ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
612 eh->eh_depth = 0; 634 eh->eh_depth = 0;
613 eh->eh_entries = 0; 635 eh->eh_entries = 0;
614 eh->eh_magic = EXT4_EXT_MAGIC; 636 eh->eh_magic = EXT4_EXT_MAGIC;
615 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); 637 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
616 ext4_mark_inode_dirty(handle, inode); 638 ext4_mark_inode_dirty(handle, inode);
617 ext4_ext_invalidate_cache(inode); 639 ext4_ext_invalidate_cache(inode);
618 return 0; 640 return 0;
@@ -701,7 +723,7 @@ err:
701 * insert new index [@logical;@ptr] into the block at @curp; 723 * insert new index [@logical;@ptr] into the block at @curp;
702 * check where to insert: before @curp or after @curp 724 * check where to insert: before @curp or after @curp
703 */ 725 */
704static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 726int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
705 struct ext4_ext_path *curp, 727 struct ext4_ext_path *curp,
706 int logical, ext4_fsblk_t ptr) 728 int logical, ext4_fsblk_t ptr)
707{ 729{
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
837 859
838 neh = ext_block_hdr(bh); 860 neh = ext_block_hdr(bh);
839 neh->eh_entries = 0; 861 neh->eh_entries = 0;
840 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 862 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
841 neh->eh_magic = EXT4_EXT_MAGIC; 863 neh->eh_magic = EXT4_EXT_MAGIC;
842 neh->eh_depth = 0; 864 neh->eh_depth = 0;
843 ex = EXT_FIRST_EXTENT(neh); 865 ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
850 path[depth].p_ext++; 872 path[depth].p_ext++;
851 while (path[depth].p_ext <= 873 while (path[depth].p_ext <=
852 EXT_MAX_EXTENT(path[depth].p_hdr)) { 874 EXT_MAX_EXTENT(path[depth].p_hdr)) {
853 ext_debug("move %d:%llu:%d in new leaf %llu\n", 875 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
854 le32_to_cpu(path[depth].p_ext->ee_block), 876 le32_to_cpu(path[depth].p_ext->ee_block),
855 ext_pblock(path[depth].p_ext), 877 ext_pblock(path[depth].p_ext),
878 ext4_ext_is_uninitialized(path[depth].p_ext),
856 ext4_ext_get_actual_len(path[depth].p_ext), 879 ext4_ext_get_actual_len(path[depth].p_ext),
857 newblock); 880 newblock);
858 /*memmove(ex++, path[depth].p_ext++, 881 /*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
912 neh = ext_block_hdr(bh); 935 neh = ext_block_hdr(bh);
913 neh->eh_entries = cpu_to_le16(1); 936 neh->eh_entries = cpu_to_le16(1);
914 neh->eh_magic = EXT4_EXT_MAGIC; 937 neh->eh_magic = EXT4_EXT_MAGIC;
915 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 938 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
916 neh->eh_depth = cpu_to_le16(depth - i); 939 neh->eh_depth = cpu_to_le16(depth - i);
917 fidx = EXT_FIRST_INDEX(neh); 940 fidx = EXT_FIRST_INDEX(neh);
918 fidx->ei_block = border; 941 fidx->ei_block = border;
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1037 /* old root could have indexes or leaves 1060 /* old root could have indexes or leaves
1038 * so calculate e_max right way */ 1061 * so calculate e_max right way */
1039 if (ext_depth(inode)) 1062 if (ext_depth(inode))
1040 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 1063 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1041 else 1064 else
1042 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 1065 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1043 neh->eh_magic = EXT4_EXT_MAGIC; 1066 neh->eh_magic = EXT4_EXT_MAGIC;
1044 set_buffer_uptodate(bh); 1067 set_buffer_uptodate(bh);
1045 unlock_buffer(bh); 1068 unlock_buffer(bh);
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1054 goto out; 1077 goto out;
1055 1078
1056 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; 1079 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1057 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); 1080 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1058 curp->p_hdr->eh_entries = cpu_to_le16(1); 1081 curp->p_hdr->eh_entries = cpu_to_le16(1);
1059 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); 1082 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1060 1083
@@ -1563,7 +1586,7 @@ out:
1563 */ 1586 */
1564int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1565 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1566 struct ext4_extent *newext) 1589 struct ext4_extent *newext, int flag)
1567{ 1590{
1568 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1569 struct ext4_extent *ex, *fex; 1592 struct ext4_extent *ex, *fex;
@@ -1579,10 +1602,13 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1579 BUG_ON(path[depth].p_hdr == NULL); 1602 BUG_ON(path[depth].p_hdr == NULL);
1580 1603
1581 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1582 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1583 ext_debug("append %d block to %d:%d (from %llu)\n", 1606 && ext4_can_extents_be_merged(inode, ex, newext)) {
1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1608 ext4_ext_is_uninitialized(newext),
1584 ext4_ext_get_actual_len(newext), 1609 ext4_ext_get_actual_len(newext),
1585 le32_to_cpu(ex->ee_block), 1610 le32_to_cpu(ex->ee_block),
1611 ext4_ext_is_uninitialized(ex),
1586 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1612 ext4_ext_get_actual_len(ex), ext_pblock(ex));
1587 err = ext4_ext_get_access(handle, inode, path + depth); 1613 err = ext4_ext_get_access(handle, inode, path + depth);
1588 if (err) 1614 if (err)
@@ -1651,9 +1677,10 @@ has_space:
1651 1677
1652 if (!nearex) { 1678 if (!nearex) {
1653 /* there is no extent in this leaf, create first one */ 1679 /* there is no extent in this leaf, create first one */
1654 ext_debug("first extent in the leaf: %d:%llu:%d\n", 1680 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1655 le32_to_cpu(newext->ee_block), 1681 le32_to_cpu(newext->ee_block),
1656 ext_pblock(newext), 1682 ext_pblock(newext),
1683 ext4_ext_is_uninitialized(newext),
1657 ext4_ext_get_actual_len(newext)); 1684 ext4_ext_get_actual_len(newext));
1658 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1685 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1659 } else if (le32_to_cpu(newext->ee_block) 1686 } else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1690,11 @@ has_space:
1663 len = EXT_MAX_EXTENT(eh) - nearex; 1690 len = EXT_MAX_EXTENT(eh) - nearex;
1664 len = (len - 1) * sizeof(struct ext4_extent); 1691 len = (len - 1) * sizeof(struct ext4_extent);
1665 len = len < 0 ? 0 : len; 1692 len = len < 0 ? 0 : len;
1666 ext_debug("insert %d:%llu:%d after: nearest 0x%p, " 1693 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1667 "move %d from 0x%p to 0x%p\n", 1694 "move %d from 0x%p to 0x%p\n",
1668 le32_to_cpu(newext->ee_block), 1695 le32_to_cpu(newext->ee_block),
1669 ext_pblock(newext), 1696 ext_pblock(newext),
1697 ext4_ext_is_uninitialized(newext),
1670 ext4_ext_get_actual_len(newext), 1698 ext4_ext_get_actual_len(newext),
1671 nearex, len, nearex + 1, nearex + 2); 1699 nearex, len, nearex + 1, nearex + 2);
1672 memmove(nearex + 2, nearex + 1, len); 1700 memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1704,11 @@ has_space:
1676 BUG_ON(newext->ee_block == nearex->ee_block); 1704 BUG_ON(newext->ee_block == nearex->ee_block);
1677 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); 1705 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1678 len = len < 0 ? 0 : len; 1706 len = len < 0 ? 0 : len;
1679 ext_debug("insert %d:%llu:%d before: nearest 0x%p, " 1707 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1680 "move %d from 0x%p to 0x%p\n", 1708 "move %d from 0x%p to 0x%p\n",
1681 le32_to_cpu(newext->ee_block), 1709 le32_to_cpu(newext->ee_block),
1682 ext_pblock(newext), 1710 ext_pblock(newext),
1711 ext4_ext_is_uninitialized(newext),
1683 ext4_ext_get_actual_len(newext), 1712 ext4_ext_get_actual_len(newext),
1684 nearex, len, nearex + 1, nearex + 2); 1713 nearex, len, nearex + 1, nearex + 2);
1685 memmove(nearex + 1, nearex, len); 1714 memmove(nearex + 1, nearex, len);
@@ -1694,7 +1723,8 @@ has_space:
1694 1723
1695merge: 1724merge:
1696 /* try to merge extents to the right */ 1725 /* try to merge extents to the right */
1697 ext4_ext_try_to_merge(inode, path, nearex); 1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1727 ext4_ext_try_to_merge(inode, path, nearex);
1698 1728
1699 /* try to merge extents to the left */ 1729 /* try to merge extents to the left */
1700 1730
@@ -2094,7 +2124,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2094 else 2124 else
2095 uninitialized = 0; 2125 uninitialized = 0;
2096 2126
2097 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); 2127 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2128 uninitialized, ex_ee_len);
2098 path[depth].p_ext = ex; 2129 path[depth].p_ext = ex;
2099 2130
2100 a = ex_ee_block > start ? ex_ee_block : start; 2131 a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2169,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2138 } 2169 }
2139 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2170 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2140 2171
2141 err = ext4_ext_journal_restart(handle, credits); 2172 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2142 if (err) 2173 if (err)
2143 goto out; 2174 goto out;
2144 2175
@@ -2327,7 +2358,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2327 if (err == 0) { 2358 if (err == 0) {
2328 ext_inode_hdr(inode)->eh_depth = 0; 2359 ext_inode_hdr(inode)->eh_depth = 0;
2329 ext_inode_hdr(inode)->eh_max = 2360 ext_inode_hdr(inode)->eh_max =
2330 cpu_to_le16(ext4_ext_space_root(inode)); 2361 cpu_to_le16(ext4_ext_space_root(inode, 0));
2331 err = ext4_ext_dirty(handle, inode, path); 2362 err = ext4_ext_dirty(handle, inode, path);
2332 } 2363 }
2333 } 2364 }
@@ -2349,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
2349 */ 2380 */
2350 2381
2351 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2382 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2383#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2352 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2384 printk(KERN_INFO "EXT4-fs: file extents enabled");
2353#ifdef AGGRESSIVE_TEST 2385#ifdef AGGRESSIVE_TEST
2354 printk(", aggressive tests"); 2386 printk(", aggressive tests");
@@ -2360,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
2360 printk(", stats"); 2392 printk(", stats");
2361#endif 2393#endif
2362 printk("\n"); 2394 printk("\n");
2395#endif
2363#ifdef EXTENTS_STATS 2396#ifdef EXTENTS_STATS
2364 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2397 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2365 EXT4_SB(sb)->s_ext_min = 1 << 30; 2398 EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2461,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2461} 2494}
2462 2495
2463#define EXT4_EXT_ZERO_LEN 7 2496#define EXT4_EXT_ZERO_LEN 7
2464
2465/* 2497/*
2466 * This function is called by ext4_ext_get_blocks() if someone tries to write 2498 * This function is called by ext4_ext_get_blocks() if someone tries to write
2467 * to an uninitialized extent. It may result in splitting the uninitialized 2499 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2554,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2554 ex3->ee_block = cpu_to_le32(iblock); 2586 ex3->ee_block = cpu_to_le32(iblock);
2555 ext4_ext_store_pblock(ex3, newblock); 2587 ext4_ext_store_pblock(ex3, newblock);
2556 ex3->ee_len = cpu_to_le16(allocated); 2588 ex3->ee_len = cpu_to_le16(allocated);
2557 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2589 err = ext4_ext_insert_extent(handle, inode, path,
2590 ex3, 0);
2558 if (err == -ENOSPC) { 2591 if (err == -ENOSPC) {
2559 err = ext4_ext_zeroout(inode, &orig_ex); 2592 err = ext4_ext_zeroout(inode, &orig_ex);
2560 if (err) 2593 if (err)
@@ -2610,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2610 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2643 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2611 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2644 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2612 ext4_ext_mark_uninitialized(ex3); 2645 ext4_ext_mark_uninitialized(ex3);
2613 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2646 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2614 if (err == -ENOSPC) { 2647 if (err == -ENOSPC) {
2615 err = ext4_ext_zeroout(inode, &orig_ex); 2648 err = ext4_ext_zeroout(inode, &orig_ex);
2616 if (err) 2649 if (err)
@@ -2728,7 +2761,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2728 err = ext4_ext_dirty(handle, inode, path + depth); 2761 err = ext4_ext_dirty(handle, inode, path + depth);
2729 goto out; 2762 goto out;
2730insert: 2763insert:
2731 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2764 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2765 if (err == -ENOSPC) {
2766 err = ext4_ext_zeroout(inode, &orig_ex);
2767 if (err)
2768 goto fix_extent_len;
2769 /* update the extent length and mark as initialized */
2770 ex->ee_block = orig_ex.ee_block;
2771 ex->ee_len = orig_ex.ee_len;
2772 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2773 ext4_ext_dirty(handle, inode, path + depth);
2774 /* zero out the first half */
2775 return allocated;
2776 } else if (err)
2777 goto fix_extent_len;
2778out:
2779 ext4_ext_show_leaf(inode, path);
2780 return err ? err : allocated;
2781
2782fix_extent_len:
2783 ex->ee_block = orig_ex.ee_block;
2784 ex->ee_len = orig_ex.ee_len;
2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2786 ext4_ext_mark_uninitialized(ex);
2787 ext4_ext_dirty(handle, inode, path + depth);
2788 return err;
2789}
2790
2791/*
2792 * This function is called by ext4_ext_get_blocks() from
2793 * ext4_get_blocks_dio_write() when DIO is used to write
2794 * to an uninitialized extent.
2795 *
2796 * Writing to an uninitialized extent may result in splitting the
2797 * uninitialized extent into multiple initialized/uninitialized extents
2798 * (up to three). There are three possibilities:
2799 *   a> There is no split required: the entire extent stays uninitialized
2800 *   b> Split into two extents: the write happens at either end of the extent
2801 *   c> Split into three extents: someone is writing in the middle of the extent
2802 *
2803 * One or more index blocks may be needed if the extent tree grows after
2804 * the uninitialized extent split. To prevent ENOSPC from occurring when the
2805 * IO completes, we need to split the uninitialized extent before submitting
2806 * the DIO. The uninitialized extent will be split into at most three
2807 * uninitialized extents. After the IO completes, the part that was filled
2808 * is converted to initialized by the end_io callback function
2809 * via ext4_convert_unwritten_extents().
2810 */
2811static int ext4_split_unwritten_extents(handle_t *handle,
2812 struct inode *inode,
2813 struct ext4_ext_path *path,
2814 ext4_lblk_t iblock,
2815 unsigned int max_blocks,
2816 int flags)
2817{
2818 struct ext4_extent *ex, newex, orig_ex;
2819 struct ext4_extent *ex1 = NULL;
2820 struct ext4_extent *ex2 = NULL;
2821 struct ext4_extent *ex3 = NULL;
2822 struct ext4_extent_header *eh;
2823 ext4_lblk_t ee_block;
2824 unsigned int allocated, ee_len, depth;
2825 ext4_fsblk_t newblock;
2826 int err = 0;
2827 int ret = 0;
2828
2829	ext_debug("ext4_split_unwritten_extents: inode %lu, "
2830 "iblock %llu, max_blocks %u\n", inode->i_ino,
2831 (unsigned long long)iblock, max_blocks);
2832 depth = ext_depth(inode);
2833 eh = path[depth].p_hdr;
2834 ex = path[depth].p_ext;
2835 ee_block = le32_to_cpu(ex->ee_block);
2836 ee_len = ext4_ext_get_actual_len(ex);
2837 allocated = ee_len - (iblock - ee_block);
2838 newblock = iblock - ee_block + ext_pblock(ex);
2839 ex2 = ex;
2840 orig_ex.ee_block = ex->ee_block;
2841 orig_ex.ee_len = cpu_to_le16(ee_len);
2842 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2843
2844 /*
2845	 * if the entire uninitialized extent length is less than
2846	 * the size of the extent to write, there is no need to split
2847	 * the uninitialized extent
2848 */
2849 if (allocated <= max_blocks)
2850 return ret;
2851
2852 err = ext4_ext_get_access(handle, inode, path + depth);
2853 if (err)
2854 goto out;
2855 /* ex1: ee_block to iblock - 1 : uninitialized */
2856 if (iblock > ee_block) {
2857 ex1 = ex;
2858 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2859 ext4_ext_mark_uninitialized(ex1);
2860 ex2 = &newex;
2861 }
2862 /*
2863 * for sanity, update the length of the ex2 extent before
2864 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2865 * overlap of blocks.
2866 */
2867 if (!ex1 && allocated > max_blocks)
2868 ex2->ee_len = cpu_to_le16(max_blocks);
2869 /* ex3: to ee_block + ee_len : uninitialised */
2870 if (allocated > max_blocks) {
2871 unsigned int newdepth;
2872 ex3 = &newex;
2873 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2874 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2875 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2876 ext4_ext_mark_uninitialized(ex3);
2877 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2878 if (err == -ENOSPC) {
2879 err = ext4_ext_zeroout(inode, &orig_ex);
2880 if (err)
2881 goto fix_extent_len;
2882 /* update the extent length and mark as initialized */
2883 ex->ee_block = orig_ex.ee_block;
2884 ex->ee_len = orig_ex.ee_len;
2885 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2886 ext4_ext_dirty(handle, inode, path + depth);
2887 /* zeroed the full extent */
2888 /* blocks available from iblock */
2889 return allocated;
2890
2891 } else if (err)
2892 goto fix_extent_len;
2893 /*
2894 * The depth, and hence eh & ex might change
2895 * as part of the insert above.
2896 */
2897 newdepth = ext_depth(inode);
2898 /*
2899 * update the extent length after successful insert of the
2900 * split extent
2901 */
2902 orig_ex.ee_len = cpu_to_le16(ee_len -
2903 ext4_ext_get_actual_len(ex3));
2904 depth = newdepth;
2905 ext4_ext_drop_refs(path);
2906 path = ext4_ext_find_extent(inode, iblock, path);
2907 if (IS_ERR(path)) {
2908 err = PTR_ERR(path);
2909 goto out;
2910 }
2911 eh = path[depth].p_hdr;
2912 ex = path[depth].p_ext;
2913 if (ex2 != &newex)
2914 ex2 = ex;
2915
2916 err = ext4_ext_get_access(handle, inode, path + depth);
2917 if (err)
2918 goto out;
2919
2920 allocated = max_blocks;
2921 }
2922 /*
2923 * If there was a change of depth as part of the
2924 * insertion of ex3 above, we need to update the length
2925 * of the ex1 extent again here
2926 */
2927 if (ex1 && ex1 != ex) {
2928 ex1 = ex;
2929 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2930 ext4_ext_mark_uninitialized(ex1);
2931 ex2 = &newex;
2932 }
2933 /*
2934	 * ex2: iblock to iblock + max_blocks - 1: to be written by direct IO,
2935	 * still uninitialized.
2936 */
2937 ex2->ee_block = cpu_to_le32(iblock);
2938 ext4_ext_store_pblock(ex2, newblock);
2939 ex2->ee_len = cpu_to_le16(allocated);
2940 ext4_ext_mark_uninitialized(ex2);
2941 if (ex2 != ex)
2942 goto insert;
2943 /* Mark modified extent as dirty */
2944 err = ext4_ext_dirty(handle, inode, path + depth);
2945 ext_debug("out here\n");
2946 goto out;
2947insert:
2948 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2732 if (err == -ENOSPC) { 2949 if (err == -ENOSPC) {
2733 err = ext4_ext_zeroout(inode, &orig_ex); 2950 err = ext4_ext_zeroout(inode, &orig_ex);
2734 if (err) 2951 if (err)
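Stripped of tree manipulation and error handling, ext4_split_unwritten_extents() reduces to boundary arithmetic: the write window [iblock, iblock + max_blocks) carves the uninitialized extent [ee_block, ee_block + ee_len) into at most three pieces. A standalone sketch of just that arithmetic; no kernel structures, and the values in main() are examples:

        #include <stdio.h>

        typedef unsigned int lblk_t;

        /* Given an uninitialized extent [ee_block, ee_block + ee_len)
         * and a DIO write window [iblock, iblock + max_blocks), print
         * the pieces the split would produce. */
        static void split_plan(lblk_t ee_block, unsigned int ee_len,
                               lblk_t iblock, unsigned int max_blocks)
        {
                unsigned int allocated = ee_len - (iblock - ee_block);

                if (allocated <= max_blocks) {  /* case a: no split */
                        printf("no split needed\n");
                        return;
                }
                if (iblock > ee_block)          /* ex1: head, stays unwritten */
                        printf("ex1: %u..%u uninit\n", ee_block, iblock - 1);
                /* ex2: the window itself, written by DIO, converted at end_io */
                printf("ex2: %u..%u uninit (to be written)\n",
                       iblock, iblock + max_blocks - 1);
                /* ex3: tail, stays unwritten */
                printf("ex3: %u..%u uninit\n",
                       iblock + max_blocks, ee_block + ee_len - 1);
        }

        int main(void)
        {
                split_plan(100, 50, 110, 10);   /* case c: write in the middle */
                return 0;
        }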
@@ -2743,6 +2960,7 @@ insert:
2743 } else if (err) 2960 } else if (err)
2744 goto fix_extent_len; 2961 goto fix_extent_len;
2745out: 2962out:
2963 ext4_ext_show_leaf(inode, path);
2746 return err ? err : allocated; 2964 return err ? err : allocated;
2747 2965
2748fix_extent_len: 2966fix_extent_len:
@@ -2753,7 +2971,141 @@ fix_extent_len:
2753 ext4_ext_dirty(handle, inode, path + depth); 2971 ext4_ext_dirty(handle, inode, path + depth);
2754 return err; 2972 return err;
2755} 2973}
2974static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2975 struct inode *inode,
2976 struct ext4_ext_path *path)
2977{
2978 struct ext4_extent *ex;
2979 struct ext4_extent_header *eh;
2980 int depth;
2981 int err = 0;
2982 int ret = 0;
2983
2984 depth = ext_depth(inode);
2985 eh = path[depth].p_hdr;
2986 ex = path[depth].p_ext;
2987
2988 err = ext4_ext_get_access(handle, inode, path + depth);
2989 if (err)
2990 goto out;
2991 /* first mark the extent as initialized */
2992 ext4_ext_mark_initialized(ex);
2993
2994 /*
2995 * We have to see if it can be merged with the extent
2996 * on the left.
2997 */
2998 if (ex > EXT_FIRST_EXTENT(eh)) {
2999 /*
3000 * To merge left, pass "ex - 1" to try_to_merge(),
3001 * since it merges towards right _only_.
3002 */
3003 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3004 if (ret) {
3005 err = ext4_ext_correct_indexes(handle, inode, path);
3006 if (err)
3007 goto out;
3008 depth = ext_depth(inode);
3009 ex--;
3010 }
3011 }
3012 /*
3013	 * Try to merge towards the right.
3014 */
3015 ret = ext4_ext_try_to_merge(inode, path, ex);
3016 if (ret) {
3017 err = ext4_ext_correct_indexes(handle, inode, path);
3018 if (err)
3019 goto out;
3020 depth = ext_depth(inode);
3021 }
3022 /* Mark modified extent as dirty */
3023 err = ext4_ext_dirty(handle, inode, path + depth);
3024out:
3025 ext4_ext_show_leaf(inode, path);
3026 return err;
3027}
3028
3029static int
3030ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3031 ext4_lblk_t iblock, unsigned int max_blocks,
3032 struct ext4_ext_path *path, int flags,
3033 unsigned int allocated, struct buffer_head *bh_result,
3034 ext4_fsblk_t newblock)
3035{
3036 int ret = 0;
3037 int err = 0;
3038 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3039
3040	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3041 "block %llu, max_blocks %u, flags %d, allocated %u",
3042 inode->i_ino, (unsigned long long)iblock, max_blocks,
3043 flags, allocated);
3044 ext4_ext_show_leaf(inode, path);
3045
3046 /* DIO get_block() before submit the IO, split the extent */
3047 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3048 ret = ext4_split_unwritten_extents(handle,
3049 inode, path, iblock,
3050 max_blocks, flags);
3051 /* flag the io_end struct that we need convert when IO done */
3052 if (io)
3053 io->flag = DIO_AIO_UNWRITTEN;
3054 goto out;
3055 }
3056 /* DIO end_io complete, convert the filled extent to written */
3057 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3058 ret = ext4_convert_unwritten_extents_dio(handle, inode,
3059 path);
3060 goto out2;
3061 }
3062 /* buffered IO case */
3063 /*
3064	 * repeated fallocate creation request:
3065 * we already have an unwritten extent
3066 */
3067 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3068 goto map_out;
3069
3070 /* buffered READ or buffered write_begin() lookup */
3071 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3072 /*
3073 * We have blocks reserved already. We
3074 * return allocated blocks so that delalloc
3075 * won't do block reservation for us. But
3076 * the buffer head will be unmapped so that
3077 * a read from the block returns 0s.
3078 */
3079 set_buffer_unwritten(bh_result);
3080 goto out1;
3081 }
2756 3082
3083	/* buffered write, writepage time, convert */
3084 ret = ext4_ext_convert_to_initialized(handle, inode,
3085 path, iblock,
3086 max_blocks);
3087out:
3088 if (ret <= 0) {
3089 err = ret;
3090 goto out2;
3091 } else
3092 allocated = ret;
3093 set_buffer_new(bh_result);
3094map_out:
3095 set_buffer_mapped(bh_result);
3096out1:
3097 if (allocated > max_blocks)
3098 allocated = max_blocks;
3099 ext4_ext_show_leaf(inode, path);
3100 bh_result->b_bdev = inode->i_sb->s_bdev;
3101 bh_result->b_blocknr = newblock;
3102out2:
3103 if (path) {
3104 ext4_ext_drop_refs(path);
3105 kfree(path);
3106 }
3107 return err ? err : allocated;
3108}
2757/* 3109/*
2758 * Block allocation/map/preallocation routine for extents based files 3110 * Block allocation/map/preallocation routine for extents based files
2759 * 3111 *
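ext4_ext_handle_uninitialized_extents() above is essentially a dispatcher on the get_blocks flags. The sketch below restates its decision table in plain C; the flag names and values are illustrative stand-ins, not the kernel's EXT4_GET_BLOCKS_* definitions:

        #include <stdio.h>

        /* illustrative stand-ins for the EXT4_GET_BLOCKS_* flags */
        enum gb_flags {
                GB_CREATE          = 1,
                GB_UNINIT_EXT      = 2,
                GB_DIO_CREATE_EXT  = 4,
                GB_DIO_CONVERT_EXT = 8,
        };

        /* restates the dispatch in ext4_ext_handle_uninitialized_extents() */
        static const char *handle_uninit(int flags)
        {
                if (flags == GB_DIO_CREATE_EXT)
                        return "pre-IO: split extent, flag io_end for conversion";
                if (flags == GB_DIO_CONVERT_EXT)
                        return "end_io: mark written, merge with neighbours";
                if (flags & GB_UNINIT_EXT)
                        return "repeated fallocate: map the existing unwritten extent";
                if (!(flags & GB_CREATE))
                        return "lookup/read: report unwritten so reads see zeroes";
                return "buffered writepage: convert to initialized now";
        }

        int main(void)
        {
                printf("%s\n", handle_uninit(GB_DIO_CREATE_EXT));
                printf("%s\n", handle_uninit(0));
                return 0;
        }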
@@ -2784,9 +3136,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2784 int err = 0, depth, ret, cache_type; 3136 int err = 0, depth, ret, cache_type;
2785 unsigned int allocated = 0; 3137 unsigned int allocated = 0;
2786 struct ext4_allocation_request ar; 3138 struct ext4_allocation_request ar;
3139 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
2787 3140
2788 __clear_bit(BH_New, &bh_result->b_state); 3141 __clear_bit(BH_New, &bh_result->b_state);
2789 ext_debug("blocks %u/%u requested for inode %u\n", 3142 ext_debug("blocks %u/%u requested for inode %lu\n",
2790 iblock, max_blocks, inode->i_ino); 3143 iblock, max_blocks, inode->i_ino);
2791 3144
2792 /* check in cache */ 3145 /* check in cache */
@@ -2849,7 +3202,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2849 newblock = iblock - ee_block + ee_start; 3202 newblock = iblock - ee_block + ee_start;
2850 /* number of remaining blocks in the extent */ 3203 /* number of remaining blocks in the extent */
2851 allocated = ee_len - (iblock - ee_block); 3204 allocated = ee_len - (iblock - ee_block);
2852 ext_debug("%u fit into %lu:%d -> %llu\n", iblock, 3205 ext_debug("%u fit into %u:%d -> %llu\n", iblock,
2853 ee_block, ee_len, newblock); 3206 ee_block, ee_len, newblock);
2854 3207
2855 /* Do not put uninitialized extent in the cache */ 3208 /* Do not put uninitialized extent in the cache */
@@ -2859,33 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2859 EXT4_EXT_CACHE_EXTENT); 3212 EXT4_EXT_CACHE_EXTENT);
2860 goto out; 3213 goto out;
2861 } 3214 }
2862 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3215 ret = ext4_ext_handle_uninitialized_extents(handle,
2863 goto out; 3216 inode, iblock, max_blocks, path,
2864 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3217 flags, allocated, bh_result, newblock);
2865 if (allocated > max_blocks) 3218 return ret;
2866 allocated = max_blocks;
2867 /*
2868 * We have blocks reserved already. We
2869 * return allocated blocks so that delalloc
2870 * won't do block reservation for us. But
2871 * the buffer head will be unmapped so that
2872 * a read from the block returns 0s.
2873 */
2874 set_buffer_unwritten(bh_result);
2875 bh_result->b_bdev = inode->i_sb->s_bdev;
2876 bh_result->b_blocknr = newblock;
2877 goto out2;
2878 }
2879
2880 ret = ext4_ext_convert_to_initialized(handle, inode,
2881 path, iblock,
2882 max_blocks);
2883 if (ret <= 0) {
2884 err = ret;
2885 goto out2;
2886 } else
2887 allocated = ret;
2888 goto outnew;
2889 } 3219 }
2890 } 3220 }
2891 3221
@@ -2950,15 +3280,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2950 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3280 newblock = ext4_mb_new_blocks(handle, &ar, &err);
2951 if (!newblock) 3281 if (!newblock)
2952 goto out2; 3282 goto out2;
2953 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 3283 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
2954 ar.goal, newblock, allocated); 3284 ar.goal, newblock, allocated);
2955 3285
2956 /* try to insert new extent into found leaf and return */ 3286 /* try to insert new extent into found leaf and return */
2957 ext4_ext_store_pblock(&newex, newblock); 3287 ext4_ext_store_pblock(&newex, newblock);
2958 newex.ee_len = cpu_to_le16(ar.len); 3288 newex.ee_len = cpu_to_le16(ar.len);
2959 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ 3289 /* Mark uninitialized */
 3290	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
2960 ext4_ext_mark_uninitialized(&newex); 3291 ext4_ext_mark_uninitialized(&newex);
2961 err = ext4_ext_insert_extent(handle, inode, path, &newex); 3292 /*
 3293	 * An io_end structure was created for every async
 3294	 * direct IO write to the middle of the file.
 3295	 * To avoid an unnecessary conversion for every aio dio rewrite
 3296	 * to the middle of the file, we flag only the IO that really
 3297	 * needs the conversion.
3298 *
3299 */
3300 if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
3301 io->flag = DIO_AIO_UNWRITTEN;
3302 }
3303 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2962 if (err) { 3304 if (err) {
2963 /* free data blocks we just allocated */ 3305 /* free data blocks we just allocated */
2964 /* not a good idea to call discard here directly, 3306 /* not a good idea to call discard here directly,
@@ -2972,7 +3314,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2972 /* previous routine could use block we allocated */ 3314 /* previous routine could use block we allocated */
2973 newblock = ext_pblock(&newex); 3315 newblock = ext_pblock(&newex);
2974 allocated = ext4_ext_get_actual_len(&newex); 3316 allocated = ext4_ext_get_actual_len(&newex);
2975outnew:
2976 set_buffer_new(bh_result); 3317 set_buffer_new(bh_result);
2977 3318
2978 /* Cache only when it is _not_ an uninitialized extent */ 3319 /* Cache only when it is _not_ an uninitialized extent */
@@ -3171,6 +3512,63 @@ retry:
3171} 3512}
3172 3513
3173/* 3514/*
3515 * This function converts a range of blocks to written extents.
3516 * The caller of this function will pass the start offset and the size;
3517 * all unwritten extents within this range will be converted to
3518 * written extents.
3519 *
3520 * This function is called from the direct IO end_io callback
3521 * function, to convert the fallocated extents after the IO is completed.
3522 */
3523int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3524 loff_t len)
3525{
3526 handle_t *handle;
3527 ext4_lblk_t block;
3528 unsigned int max_blocks;
3529 int ret = 0;
3530 int ret2 = 0;
3531 struct buffer_head map_bh;
3532 unsigned int credits, blkbits = inode->i_blkbits;
3533
3534 block = offset >> blkbits;
3535 /*
3536	 * We can't just convert len to max_blocks: if, for example,
3537	 * blocksize = 4096, offset = 3072 and len = 2048, the range spans two blocks
3538 */
3539 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3540 - block;
3541 /*
3542 * credits to insert 1 extent into extent tree
3543 */
3544 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3545 while (ret >= 0 && ret < max_blocks) {
3546 block = block + ret;
3547 max_blocks = max_blocks - ret;
3548 handle = ext4_journal_start(inode, credits);
3549 if (IS_ERR(handle)) {
3550 ret = PTR_ERR(handle);
3551 break;
3552 }
3553 map_bh.b_state = 0;
3554 ret = ext4_get_blocks(handle, inode, block,
3555 max_blocks, &map_bh,
3556 EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3557 if (ret <= 0) {
3558 WARN_ON(ret <= 0);
3559 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3560 "returned error inode#%lu, block=%u, "
3561 "max_blocks=%u", __func__,
3562 inode->i_ino, block, max_blocks);
3563 }
3564 ext4_mark_inode_dirty(handle, inode);
3565 ret2 = ext4_journal_stop(handle);
 3566		if (ret <= 0 || ret2)
3567 break;
3568 }
3569 return ret > 0 ? ret2 : ret;
3570}
3571/*
3174 * Callback function called for each extent to gather FIEMAP information. 3572 * Callback function called for each extent to gather FIEMAP information.
3175 */ 3573 */
3176static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3574static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
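The max_blocks computation in ext4_convert_unwritten_extents() is worth spelling out: as its comment's own example notes, a 2048-byte range starting at offset 3072 in a 4096-byte-block file still straddles two blocks, so len >> blkbits would undercount. A runnable sketch of the same arithmetic, with block_align() standing in for EXT4_BLOCK_ALIGN():

        #include <stdio.h>

        /* round x up to the next block boundary, like EXT4_BLOCK_ALIGN() */
        static unsigned long long block_align(unsigned long long x,
                                              unsigned int blkbits)
        {
                unsigned long long mask = (1ULL << blkbits) - 1;

                return (x + mask) & ~mask;
        }

        int main(void)
        {
                unsigned int blkbits = 12;              /* 4096-byte blocks */
                unsigned long long offset = 3072, len = 2048;

                unsigned long long block = offset >> blkbits;
                unsigned long long max_blocks =
                        (block_align(len + offset, blkbits) >> blkbits) - block;

                /* len is half a block, but the range crosses a boundary,
                 * so two blocks must be converted (len >> blkbits == 0) */
                printf("block=%llu max_blocks=%llu\n", block, max_blocks);
                return 0;
        }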
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c6..9630583cef28 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
58ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 58ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
59 unsigned long nr_segs, loff_t pos) 59 unsigned long nr_segs, loff_t pos)
60{ 60{
61 struct file *file = iocb->ki_filp; 61 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
62 struct inode *inode = file->f_path.dentry->d_inode;
63 ssize_t ret;
64 int err;
65 62
66 /* 63 /*
67 * If we have encountered a bitmap-format file, the size limit 64 * If we have encountered a bitmap-format file, the size limit
@@ -81,56 +78,10 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
81 } 78 }
82 } 79 }
83 80
84 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 81 return generic_file_aio_write(iocb, iov, nr_segs, pos);
85 /*
86 * Skip flushing if there was an error, or if nothing was written.
87 */
88 if (ret <= 0)
89 return ret;
90
91 /*
92 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
93 * journalling then we need to make sure that we force the transaction
94 * to disk to keep all metadata uptodate synchronously.
95 */
96 if (file->f_flags & O_SYNC) {
97 /*
98 * If we are non-data-journaled, then the dirty data has
99 * already been flushed to backing store by generic_osync_inode,
100 * and the inode has been flushed too if there have been any
101 * modifications other than mere timestamp updates.
102 *
103 * Open question --- do we care about flushing timestamps too
104 * if the inode is IS_SYNC?
105 */
106 if (!ext4_should_journal_data(inode))
107 return ret;
108
109 goto force_commit;
110 }
111
112 /*
113 * So we know that there has been no forced data flush. If the inode
114 * is marked IS_SYNC, we need to force one ourselves.
115 */
116 if (!IS_SYNC(inode))
117 return ret;
118
119 /*
120 * Open question #2 --- should we force data to disk here too? If we
121 * don't, the only impact is that data=writeback filesystems won't
122 * flush data to disk automatically on IS_SYNC, only metadata (but
123 * historically, that is what ext2 has done.)
124 */
125
126force_commit:
127 err = ext4_force_commit(inode->i_sb);
128 if (err)
129 return err;
130 return ret;
131} 82}
132 83
133static struct vm_operations_struct ext4_file_vm_ops = { 84static const struct vm_operations_struct ext4_file_vm_ops = {
134 .fault = filemap_fault, 85 .fault = filemap_fault,
135 .page_mkwrite = ext4_page_mkwrite, 86 .page_mkwrite = ext4_page_mkwrite,
136}; 87};
@@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = {
207 .listxattr = ext4_listxattr, 158 .listxattr = ext4_listxattr,
208 .removexattr = generic_removexattr, 159 .removexattr = generic_removexattr,
209#endif 160#endif
210 .permission = ext4_permission, 161 .check_acl = ext4_check_acl,
211 .fallocate = ext4_fallocate, 162 .fallocate = ext4_fallocate,
212 .fiemap = ext4_fiemap, 163 .fiemap = ext4_fiemap,
213}; 164};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..2b1531266ee2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,18 +44,23 @@
44 * 44 *
45 * What we do is just kick off a commit and wait on it. This will snapshot the 45 * What we do is just kick off a commit and wait on it. This will snapshot the
46 * inode to disk. 46 * inode to disk.
47 *
48 * i_mutex lock is held when entering and exiting this function
47 */ 49 */
48 50
49int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
50{ 52{
51 struct inode *inode = dentry->d_inode; 53 struct inode *inode = dentry->d_inode;
52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 54 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
53 int ret = 0; 55 int err, ret = 0;
54 56
55 J_ASSERT(ext4_journal_current_handle() == NULL); 57 J_ASSERT(ext4_journal_current_handle() == NULL);
56 58
57 trace_ext4_sync_file(file, dentry, datasync); 59 trace_ext4_sync_file(file, dentry, datasync);
58 60
61 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0)
63 goto out;
59 /* 64 /*
60 * data=writeback: 65 * data=writeback:
61 * The caller's filemap_fdatawrite()/wait will sync the data. 66 * The caller's filemap_fdatawrite()/wait will sync the data.
@@ -79,6 +84,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
79 goto out; 84 goto out;
80 } 85 }
81 86
87 if (!journal)
88 ret = sync_mapping_buffers(inode->i_mapping);
89
82 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 90 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
83 goto out; 91 goto out;
84 92
@@ -91,10 +99,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
91 .sync_mode = WB_SYNC_ALL, 99 .sync_mode = WB_SYNC_ALL,
92 .nr_to_write = 0, /* sys_fsync did this */ 100 .nr_to_write = 0, /* sys_fsync did this */
93 }; 101 };
94 ret = sync_inode(inode, &wbc); 102 err = sync_inode(inode, &wbc);
95 if (journal && (journal->j_flags & JBD2_BARRIER)) 103 if (ret == 0)
96 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 104 ret = err;
97 } 105 }
98out: 106out:
107 if (journal && (journal->j_flags & JBD2_BARRIER))
108 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
99 return ret; 109 return ret;
100} 110}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1189 1189
1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1192 i, ext4_free_inodes_count(sb, gdp), x); 1192 (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
1193 bitmap_count += x; 1193 bitmap_count += x;
1194 } 1194 }
1195 brelse(bitmap_bh); 1195 brelse(bitmap_bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..5c5bc5dafff8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -192,11 +193,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 193 * so before we call here everything must be consistently dirtied against
193 * this transaction. 194 * this transaction.
194 */ 195 */
195static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 196 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
197 int nblocks)
196{ 198{
199 int ret;
200
201 /*
 202	 * Drop i_data_sem to avoid deadlock with ext4_get_blocks. At this
 203	 * moment, get_block can be called only for blocks inside i_size since
 204	 * the page cache has already been dropped and writes are blocked by
205 * i_mutex. So we can safely drop the i_data_sem here.
206 */
197 BUG_ON(EXT4_JOURNAL(inode) == NULL); 207 BUG_ON(EXT4_JOURNAL(inode) == NULL);
198 jbd_debug(2, "restarting handle %p\n", handle); 208 jbd_debug(2, "restarting handle %p\n", handle);
199 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 209 up_write(&EXT4_I(inode)->i_data_sem);
210 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
211 down_write(&EXT4_I(inode)->i_data_sem);
212
213 return ret;
200} 214}
201 215
202/* 216/*
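The pattern in ext4_truncate_restart_trans() above -- drop i_data_sem, restart the journal handle, retake the semaphore -- is also why the extents.c hunk earlier invalidates the extent cache after a restart: another task can repopulate the cache while the lock is down. A generic userspace sketch of the drop/retake shape, using a pthread rwlock as a stand-in for i_data_sem and a stub for the journal restart:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_rwlock_t i_data_sem = PTHREAD_RWLOCK_INITIALIZER;

        /* stand-in for ext4_journal_restart(); assumed to block until
         * journal space is available */
        static int journal_restart_sketch(void)
        {
                return 0;
        }

        /* mirrors ext4_truncate_restart_trans(): the write lock is
         * dropped around the potentially blocking restart to avoid
         * deadlocking against get_block paths that take i_data_sem */
        static int truncate_restart_sketch(void)
        {
                int ret;

                pthread_rwlock_unlock(&i_data_sem);     /* up_write() */
                ret = journal_restart_sketch();
                pthread_rwlock_wrlock(&i_data_sem);     /* down_write() */
                return ret;
        }

        int main(void)
        {
                pthread_rwlock_wrlock(&i_data_sem);
                printf("restart: %d\n", truncate_restart_sketch());
                pthread_rwlock_unlock(&i_data_sem);
                return 0;
        }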
@@ -341,9 +355,7 @@ static int ext4_block_to_path(struct inode *inode,
341 int n = 0; 355 int n = 0;
342 int final = 0; 356 int final = 0;
343 357
344 if (i_block < 0) { 358 if (i_block < direct_blocks) {
345 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
346 } else if (i_block < direct_blocks) {
347 offsets[n++] = i_block; 359 offsets[n++] = i_block;
348 final = direct_blocks; 360 final = direct_blocks;
349 } else if ((i_block -= direct_blocks) < indirect_blocks) { 361 } else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -551,15 +563,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
551 * 563 *
552 * Normally this function find the preferred place for block allocation, 564 * Normally this function find the preferred place for block allocation,
553 * returns it. 565 * returns it.
566 * Because this is only used for non-extent files, we limit the block nr
567 * to 32 bits.
554 */ 568 */
555static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 569static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
556 Indirect *partial) 570 Indirect *partial)
557{ 571{
572 ext4_fsblk_t goal;
573
558 /* 574 /*
559 * XXX need to get goal block from mballoc's data structures 575 * XXX need to get goal block from mballoc's data structures
560 */ 576 */
561 577
562 return ext4_find_near(inode, partial); 578 goal = ext4_find_near(inode, partial);
579 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
580 return goal;
563} 581}
564 582
565/** 583/**
@@ -640,6 +658,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
640 if (*err) 658 if (*err)
641 goto failed_out; 659 goto failed_out;
642 660
661 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
662
643 target -= count; 663 target -= count;
644 /* allocate blocks for indirect blocks */ 664 /* allocate blocks for indirect blocks */
645 while (index < indirect_blks && count) { 665 while (index < indirect_blks && count) {
@@ -674,6 +694,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
674 ar.flags = EXT4_MB_HINT_DATA; 694 ar.flags = EXT4_MB_HINT_DATA;
675 695
676 current_block = ext4_mb_new_blocks(handle, &ar, err); 696 current_block = ext4_mb_new_blocks(handle, &ar, err);
697 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
677 698
678 if (*err && (target == blks)) { 699 if (*err && (target == blks)) {
679 /* 700 /*
@@ -762,8 +783,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
762 BUFFER_TRACE(bh, "call get_create_access"); 783 BUFFER_TRACE(bh, "call get_create_access");
763 err = ext4_journal_get_create_access(handle, bh); 784 err = ext4_journal_get_create_access(handle, bh);
764 if (err) { 785 if (err) {
786 /* Don't brelse(bh) here; it's done in
787 * ext4_journal_forget() below */
765 unlock_buffer(bh); 788 unlock_buffer(bh);
766 brelse(bh);
767 goto failed; 789 goto failed;
768 } 790 }
769 791
@@ -1109,22 +1131,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1109 ext4_discard_preallocations(inode); 1131 ext4_discard_preallocations(inode);
1110} 1132}
1111 1133
1112static int check_block_validity(struct inode *inode, sector_t logical, 1134static int check_block_validity(struct inode *inode, const char *msg,
1113 sector_t phys, int len) 1135 sector_t logical, sector_t phys, int len)
1114{ 1136{
1115 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1137 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1116 ext4_error(inode->i_sb, "check_block_validity", 1138 ext4_error(inode->i_sb, msg,
1117 "inode #%lu logical block %llu mapped to %llu " 1139 "inode #%lu logical block %llu mapped to %llu "
1118 "(size %d)", inode->i_ino, 1140 "(size %d)", inode->i_ino,
1119 (unsigned long long) logical, 1141 (unsigned long long) logical,
1120 (unsigned long long) phys, len); 1142 (unsigned long long) phys, len);
1121 WARN_ON(1);
1122 return -EIO; 1143 return -EIO;
1123 } 1144 }
1124 return 0; 1145 return 0;
1125} 1146}
1126 1147
1127/* 1148/*
1149 * Return the number of contiguous dirty pages in a given inode
1150 * starting at page frame idx.
1151 */
1152static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1153 unsigned int max_pages)
1154{
1155 struct address_space *mapping = inode->i_mapping;
1156 pgoff_t index;
1157 struct pagevec pvec;
1158 pgoff_t num = 0;
1159 int i, nr_pages, done = 0;
1160
1161 if (max_pages == 0)
1162 return 0;
1163 pagevec_init(&pvec, 0);
1164 while (!done) {
1165 index = idx;
1166 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1167 PAGECACHE_TAG_DIRTY,
1168 (pgoff_t)PAGEVEC_SIZE);
1169 if (nr_pages == 0)
1170 break;
1171 for (i = 0; i < nr_pages; i++) {
1172 struct page *page = pvec.pages[i];
1173 struct buffer_head *bh, *head;
1174
1175 lock_page(page);
1176 if (unlikely(page->mapping != mapping) ||
1177 !PageDirty(page) ||
1178 PageWriteback(page) ||
1179 page->index != idx) {
1180 done = 1;
1181 unlock_page(page);
1182 break;
1183 }
1184 if (page_has_buffers(page)) {
1185 bh = head = page_buffers(page);
1186 do {
1187 if (!buffer_delay(bh) &&
1188 !buffer_unwritten(bh))
1189 done = 1;
1190 bh = bh->b_this_page;
1191 } while (!done && (bh != head));
1192 }
1193 unlock_page(page);
1194 if (done)
1195 break;
1196 idx++;
1197 num++;
1198 if (num >= max_pages)
1199 break;
1200 }
1201 pagevec_release(&pvec);
1202 }
1203 return num;
1204}
1205
1206/*
1128 * The ext4_get_blocks() function tries to look up the requested blocks, 1207 * The ext4_get_blocks() function tries to look up the requested blocks,
1129 * and returns if the blocks are already mapped. 1208 * and returns if the blocks are already mapped.
1130 * 1209 *
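ext4_num_dirty_pages() above walks the page cache a pagevec at a time, but its contract is simple: count contiguous dirty, non-writeback pages starting at idx, capped at max_pages. A toy sketch of that contract over a plain array; the real function additionally stops at pages whose buffers are neither delayed nor unwritten, which this model collapses into a single dirty flag:

        #include <stdbool.h>
        #include <stdio.h>

        /* count contiguous "dirty" pages from idx, capped at max_pages */
        static unsigned int num_dirty(const bool *dirty, unsigned int npages,
                                      unsigned int idx, unsigned int max_pages)
        {
                unsigned int num = 0;

                while (idx < npages && dirty[idx] && num < max_pages) {
                        num++;
                        idx++;
                }
                return num;
        }

        int main(void)
        {
                bool dirty[] = { true, true, true, false, true };

                printf("%u\n", num_dirty(dirty, 5, 0, 8));      /* -> 3 */
                return 0;
        }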
@@ -1155,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1155 clear_buffer_mapped(bh); 1234 clear_buffer_mapped(bh);
1156 clear_buffer_unwritten(bh); 1235 clear_buffer_unwritten(bh);
1157 1236
 1237	ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u, "
1238 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1239 (unsigned long)block);
1158 /* 1240 /*
1159 * Try to see if we can get the block without requesting a new 1241 * Try to see if we can get the block without requesting a new
1160 * file system block. 1242 * file system block.
@@ -1170,8 +1252,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1170 up_read((&EXT4_I(inode)->i_data_sem)); 1252 up_read((&EXT4_I(inode)->i_data_sem));
1171 1253
1172 if (retval > 0 && buffer_mapped(bh)) { 1254 if (retval > 0 && buffer_mapped(bh)) {
1173 int ret = check_block_validity(inode, block, 1255 int ret = check_block_validity(inode, "file system corruption",
1174 bh->b_blocknr, retval); 1256 block, bh->b_blocknr, retval);
1175 if (ret != 0) 1257 if (ret != 0)
1176 return ret; 1258 return ret;
1177 } 1259 }
@@ -1235,8 +1317,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1235 * i_data's format changing. Force the migrate 1317 * i_data's format changing. Force the migrate
1236 * to fail by clearing migrate flags 1318 * to fail by clearing migrate flags
1237 */ 1319 */
1238 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1320 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
1239 ~EXT4_EXT_MIGRATE;
1240 } 1321 }
1241 } 1322 }
1242 1323
@@ -1252,8 +1333,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1252 1333
1253 up_write((&EXT4_I(inode)->i_data_sem)); 1334 up_write((&EXT4_I(inode)->i_data_sem));
1254 if (retval > 0 && buffer_mapped(bh)) { 1335 if (retval > 0 && buffer_mapped(bh)) {
1255 int ret = check_block_validity(inode, block, 1336 int ret = check_block_validity(inode, "file system "
1256 bh->b_blocknr, retval); 1337 "corruption after allocation",
1338 block, bh->b_blocknr, retval);
1257 if (ret != 0) 1339 if (ret != 0)
1258 return ret; 1340 return ret;
1259 } 1341 }
@@ -1776,11 +1858,11 @@ repeat:
1776 1858
1777 if (ext4_claim_free_blocks(sbi, total)) { 1859 if (ext4_claim_free_blocks(sbi, total)) {
1778 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1860 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1861 vfs_dq_release_reservation_block(inode, total);
1779 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1862 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1780 yield(); 1863 yield();
1781 goto repeat; 1864 goto repeat;
1782 } 1865 }
1783 vfs_dq_release_reservation_block(inode, total);
1784 return -ENOSPC; 1866 return -ENOSPC;
1785 } 1867 }
1786 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1868 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1863,18 +1945,6 @@ static void ext4_da_page_release_reservation(struct page *page,
1863 * Delayed allocation stuff 1945 * Delayed allocation stuff
1864 */ 1946 */
1865 1947
1866struct mpage_da_data {
1867 struct inode *inode;
1868 sector_t b_blocknr; /* start block number of extent */
1869 size_t b_size; /* size of extent */
1870 unsigned long b_state; /* state of the extent */
1871 unsigned long first_page, next_page; /* extent of pages */
1872 struct writeback_control *wbc;
1873 int io_done;
1874 int pages_written;
1875 int retval;
1876};
1877
1878/* 1948/*
1879 * mpage_da_submit_io - walks through extent of pages and try to write 1949 * mpage_da_submit_io - walks through extent of pages and try to write
1880 * them with writepage() call back 1950 * them with writepage() call back
@@ -2084,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2084static void ext4_print_free_blocks(struct inode *inode) 2154static void ext4_print_free_blocks(struct inode *inode)
2085{ 2155{
2086 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2156 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2087 printk(KERN_EMERG "Total free blocks count %lld\n", 2157 printk(KERN_CRIT "Total free blocks count %lld\n",
2088 ext4_count_free_blocks(inode->i_sb)); 2158 ext4_count_free_blocks(inode->i_sb));
2089 printk(KERN_EMERG "Free/Dirty block details\n"); 2159 printk(KERN_CRIT "Free/Dirty block details\n");
2090 printk(KERN_EMERG "free_blocks=%lld\n", 2160 printk(KERN_CRIT "free_blocks=%lld\n",
2091 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2161 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2092 printk(KERN_EMERG "dirty_blocks=%lld\n", 2162 printk(KERN_CRIT "dirty_blocks=%lld\n",
2093 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2163 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2094 printk(KERN_EMERG "Block reservation details\n"); 2164 printk(KERN_CRIT "Block reservation details\n");
2095 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2165 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2096 EXT4_I(inode)->i_reserved_data_blocks); 2166 EXT4_I(inode)->i_reserved_data_blocks);
2097 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2167 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2098 EXT4_I(inode)->i_reserved_meta_blocks); 2168 EXT4_I(inode)->i_reserved_meta_blocks);
2099 return; 2169 return;
2100} 2170}
2101 2171
@@ -2181,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2181 * writepage and writepages will again try to write 2251 * writepage and writepages will again try to write
2182 * the same. 2252 * the same.
2183 */ 2253 */
2184 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2254 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2185 "at logical offset %llu with max blocks " 2255 "delayed block allocation failed for inode %lu at "
2186 "%zd with error %d\n", 2256 "logical offset %llu with max blocks %zd with "
2187 __func__, mpd->inode->i_ino, 2257 "error %d\n", mpd->inode->i_ino,
2188 (unsigned long long)next, 2258 (unsigned long long) next,
2189 mpd->b_size >> mpd->inode->i_blkbits, err); 2259 mpd->b_size >> mpd->inode->i_blkbits, err);
2190 printk(KERN_EMERG "This should not happen.!! " 2260 printk(KERN_CRIT "This should not happen!! "
2191 "Data will be lost\n"); 2261 "Data will be lost\n");
2192 if (err == -ENOSPC) { 2262 if (err == -ENOSPC) {
2193 ext4_print_free_blocks(mpd->inode); 2263 ext4_print_free_blocks(mpd->inode);
2194 } 2264 }
@@ -2329,7 +2399,7 @@ static int __mpage_da_writepage(struct page *page,
2329 /* 2399 /*
2330 * Rest of the page in the page_vec 2400 * Rest of the page in the page_vec
2331 * redirty them and skip them. We will 2401 * redirty them and skip them. We will
2332 * try to to write them again after 2402 * try to write them again after
2333 * starting a new transaction 2403 * starting a new transaction
2334 */ 2404 */
2335 redirty_page_for_writepage(wbc, page); 2405 redirty_page_for_writepage(wbc, page);
@@ -2735,8 +2805,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2735 int no_nrwrite_index_update; 2805 int no_nrwrite_index_update;
2736 int pages_written = 0; 2806 int pages_written = 0;
2737 long pages_skipped; 2807 long pages_skipped;
2808 unsigned int max_pages;
2738 int range_cyclic, cycled = 1, io_done = 0; 2809 int range_cyclic, cycled = 1, io_done = 0;
2739 int needed_blocks, ret = 0, nr_to_writebump = 0; 2810 int needed_blocks, ret = 0;
2811 long desired_nr_to_write, nr_to_writebump = 0;
2812 loff_t range_start = wbc->range_start;
2740 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2813 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2741 2814
2742 trace_ext4_da_writepages(inode, wbc); 2815 trace_ext4_da_writepages(inode, wbc);
@@ -2762,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2762 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2835 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2763 return -EROFS; 2836 return -EROFS;
2764 2837
2765 /*
2766 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2767 * This make sure small files blocks are allocated in
2768 * single attempt. This ensure that small files
2769 * get less fragmented.
2770 */
2771 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2772 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2773 wbc->nr_to_write = sbi->s_mb_stream_request;
2774 }
2775 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2838 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2776 range_whole = 1; 2839 range_whole = 1;
2777 2840
@@ -2786,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2786 } else 2849 } else
2787 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2850 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2788 2851
2852 /*
2853 * This works around two forms of stupidity. The first is in
2854 * the writeback code, which caps the maximum number of pages
2855 * written to be 1024 pages. This is wrong on multiple
 2856 * levels; different architectures have a different page size,
2857 * which changes the maximum amount of data which gets
2858 * written. Secondly, 4 megabytes is way too small. XFS
2859 * forces this value to be 16 megabytes by multiplying
2860 * nr_to_write parameter by four, and then relies on its
2861 * allocator to allocate larger extents to make them
2862 * contiguous. Unfortunately this brings us to the second
2863 * stupidity, which is that ext4's mballoc code only allocates
2864 * at most 2048 blocks. So we force contiguous writes up to
2865 * the number of dirty blocks in the inode, or
 2866 * sbi->s_max_writeback_mb_bump, whichever is smaller.
2867 */
2868 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2869 if (!range_cyclic && range_whole)
2870 desired_nr_to_write = wbc->nr_to_write * 8;
2871 else
2872 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2873 max_pages);
2874 if (desired_nr_to_write > max_pages)
2875 desired_nr_to_write = max_pages;
2876
2877 if (wbc->nr_to_write < desired_nr_to_write) {
2878 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2879 wbc->nr_to_write = desired_nr_to_write;
2880 }
2881
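The net effect of the block above is a clamp with a memo: nr_to_write is raised toward the smaller of the dirty-page estimate and the per-filesystem cap, and the bump is recorded so the exit path can subtract it again. A small sketch of that arithmetic, with illustrative constants:

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4K pages; illustrative */

/* Raise *nr_to_write toward min(dirty-page estimate, cap in MB) and
 * return the bump so the caller can subtract it again on exit. */
static long bump_nr_to_write(long *nr_to_write, long dirty_pages,
			     unsigned max_writeback_mb_bump)
{
	long max_pages = (long)max_writeback_mb_bump << (20 - PAGE_SHIFT);
	long desired = dirty_pages < max_pages ? dirty_pages : max_pages;
	long bump = 0;

	if (*nr_to_write < desired) {
		bump = desired - *nr_to_write;
		*nr_to_write = desired;
	}
	return bump;
}

int main(void)
{
	long nr = 1024;
	long bump = bump_nr_to_write(&nr, 3000, 128);

	printf("nr_to_write=%ld bump=%ld\n", nr, bump); /* 3000 and 1976 */
	return 0;
}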
2789 mpd.wbc = wbc; 2882 mpd.wbc = wbc;
2790 mpd.inode = mapping->host; 2883 mpd.inode = mapping->host;
2791 2884
@@ -2813,10 +2906,9 @@ retry:
2813 handle = ext4_journal_start(inode, needed_blocks); 2906 handle = ext4_journal_start(inode, needed_blocks);
2814 if (IS_ERR(handle)) { 2907 if (IS_ERR(handle)) {
2815 ret = PTR_ERR(handle); 2908 ret = PTR_ERR(handle);
2816 printk(KERN_CRIT "%s: jbd2_start: " 2909 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2817 "%ld pages, ino %lu; err %d\n", __func__, 2910 "%ld pages, ino %lu; err %d\n", __func__,
2818 wbc->nr_to_write, inode->i_ino, ret); 2911 wbc->nr_to_write, inode->i_ino, ret);
2819 dump_stack();
2820 goto out_writepages; 2912 goto out_writepages;
2821 } 2913 }
2822 2914
@@ -2850,6 +2942,7 @@ retry:
2850 mpd.io_done = 1; 2942 mpd.io_done = 1;
2851 ret = MPAGE_DA_EXTENT_TAIL; 2943 ret = MPAGE_DA_EXTENT_TAIL;
2852 } 2944 }
2945 trace_ext4_da_write_pages(inode, &mpd);
2853 wbc->nr_to_write -= mpd.pages_written; 2946 wbc->nr_to_write -= mpd.pages_written;
2854 2947
2855 ext4_journal_stop(handle); 2948 ext4_journal_stop(handle);
@@ -2887,9 +2980,10 @@ retry:
2887 goto retry; 2980 goto retry;
2888 } 2981 }
2889 if (pages_skipped != wbc->pages_skipped) 2982 if (pages_skipped != wbc->pages_skipped)
2890 printk(KERN_EMERG "This should not happen leaving %s " 2983 ext4_msg(inode->i_sb, KERN_CRIT,
2891 "with nr_to_write = %ld ret = %d\n", 2984 "This should not happen leaving %s "
2892 __func__, wbc->nr_to_write, ret); 2985 "with nr_to_write = %ld ret = %d\n",
2986 __func__, wbc->nr_to_write, ret);
2893 2987
2894 /* Update index */ 2988 /* Update index */
2895 index += pages_written; 2989 index += pages_written;
@@ -2904,7 +2998,9 @@ retry:
2904out_writepages: 2998out_writepages:
2905 if (!no_nrwrite_index_update) 2999 if (!no_nrwrite_index_update)
2906 wbc->no_nrwrite_index_update = 0; 3000 wbc->no_nrwrite_index_update = 0;
2907 wbc->nr_to_write -= nr_to_writebump; 3001 if (wbc->nr_to_write > nr_to_writebump)
3002 wbc->nr_to_write -= nr_to_writebump;
3003 wbc->range_start = range_start;
2908 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3004 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2909 return ret; 3005 return ret;
2910} 3006}
@@ -3117,6 +3213,8 @@ out:
3117 */ 3213 */
3118int ext4_alloc_da_blocks(struct inode *inode) 3214int ext4_alloc_da_blocks(struct inode *inode)
3119{ 3215{
3216 trace_ext4_alloc_da_blocks(inode);
3217
3120 if (!EXT4_I(inode)->i_reserved_data_blocks && 3218 if (!EXT4_I(inode)->i_reserved_data_blocks &&
3121 !EXT4_I(inode)->i_reserved_meta_blocks) 3219 !EXT4_I(inode)->i_reserved_meta_blocks)
3122 return 0; 3220 return 0;
@@ -3259,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3259} 3357}
3260 3358
3261/* 3359/*
3360 * O_DIRECT for ext3 (or indirect map) based files
3361 *
3262 * If the O_DIRECT write will extend the file then add this inode to the 3362 * If the O_DIRECT write will extend the file then add this inode to the
3263 * orphan list. So recovery will truncate it back to the original size 3363 * orphan list. So recovery will truncate it back to the original size
3264 * if the machine crashes during the write. 3364 * if the machine crashes during the write.
@@ -3267,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3267 * crashes then stale disk data _may_ be exposed inside the file. But current 3367 * crashes then stale disk data _may_ be exposed inside the file. But current
3268 * VFS code falls back into buffered path in that case so we are safe. 3368 * VFS code falls back into buffered path in that case so we are safe.
3269 */ 3369 */
3270static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3370static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3271 const struct iovec *iov, loff_t offset, 3371 const struct iovec *iov, loff_t offset,
3272 unsigned long nr_segs) 3372 unsigned long nr_segs)
3273{ 3373{
@@ -3278,6 +3378,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3278 ssize_t ret; 3378 ssize_t ret;
3279 int orphan = 0; 3379 int orphan = 0;
3280 size_t count = iov_length(iov, nr_segs); 3380 size_t count = iov_length(iov, nr_segs);
3381 int retries = 0;
3281 3382
3282 if (rw == WRITE) { 3383 if (rw == WRITE) {
3283 loff_t final_size = offset + count; 3384 loff_t final_size = offset + count;
@@ -3300,9 +3401,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3300 } 3401 }
3301 } 3402 }
3302 3403
3404retry:
3303 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3405 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3304 offset, nr_segs, 3406 offset, nr_segs,
3305 ext4_get_block, NULL); 3407 ext4_get_block, NULL);
3408 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3409 goto retry;
3306 3410
3307 if (orphan) { 3411 if (orphan) {
3308 int err; 3412 int err;
@@ -3341,6 +3445,359 @@ out:
3341 return ret; 3445 return ret;
3342} 3446}
3343 3447
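The retry: loop added above re-issues the whole direct IO when the allocator reports ENOSPC, bounded so a genuinely full filesystem still fails. A userspace sketch of the shape; do_dio() and should_retry() are stubs, and the real ext4_should_retry_alloc() additionally waits for a journal commit to free blocks:

#include <stdio.h>
#include <errno.h>

static int attempts_left = 2;	/* pretend space frees up after two tries */

static int do_dio(void)	/* stub for blockdev_direct_IO() */
{
	return attempts_left-- > 0 ? -ENOSPC : 42; /* 42 bytes written */
}

static int should_retry(int *retries)	/* stub for ext4_should_retry_alloc() */
{
	return (*retries)++ < 3;
}

int main(void)
{
	int retries = 0;
	int ret;

retry:
	ret = do_dio();
	if (ret == -ENOSPC && should_retry(&retries))
		goto retry;
	printf("ret=%d after %d retries\n", ret, retries);
	return 0;
}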
3448/* Maximum number of blocks we map for direct IO at once. */
3449
3450static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3451 struct buffer_head *bh_result, int create)
3452{
3453 handle_t *handle = NULL;
3454 int ret = 0;
3455 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3456 int dio_credits;
3457
3458 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3459 inode->i_ino, create);
 3460 /*
 3461 * The DIO VFS code passes create = 0 for writes to
 3462 * the middle of the file. It does this to avoid block
 3463 * allocation for holes, so that stale data is not
 3464 * exposed to a parallel buffered read (which does
 3465 * not hold the i_mutex lock) while the direct IO write
 3466 * has not completed. DIO requests on holes therefore
 3467 * fall back to buffered IO.
 3468 *
 3469 * For ext4 extent-based files, since we support fallocate
 3470 * and new extents are allocated as uninitialized, we can
 3471 * fallocate blocks for holes; a parallel buffered read
 3472 * then zeroes out the page when it reads a hole whose
 3473 * parallel DIO write has not yet completed.
 3474 *
 3475 * When we get here, we know it's a direct IO write to
 3476 * the middle of the file (< i_size),
 3477 * so it's safe to override the create flag from the VFS.
 3478 */
3479 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3480
3481 if (max_blocks > DIO_MAX_BLOCKS)
3482 max_blocks = DIO_MAX_BLOCKS;
3483 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3484 handle = ext4_journal_start(inode, dio_credits);
3485 if (IS_ERR(handle)) {
3486 ret = PTR_ERR(handle);
3487 goto out;
3488 }
3489 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3490 create);
3491 if (ret > 0) {
3492 bh_result->b_size = (ret << inode->i_blkbits);
3493 ret = 0;
3494 }
3495 ext4_journal_stop(handle);
3496out:
3497 return ret;
3498}
3499
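Two knobs shape each mapping request in the helper above: the block count is clamped to DIO_MAX_BLOCKS, and the journal transaction is sized from the clamped count. A sketch of that shaping; chunk_trans_blocks() below is a stand-in returning a made-up credit estimate, not ext4's formula:

#include <stdio.h>

#define DIO_MAX_BLOCKS 4096	/* per-call mapping cap, as in the patch */

static int chunk_trans_blocks(unsigned blocks)	/* illustrative formula */
{
	return 3 + blocks / 256;
}

int main(void)
{
	unsigned max_blocks = 10000;	/* a large DIO request */

	if (max_blocks > DIO_MAX_BLOCKS)
		max_blocks = DIO_MAX_BLOCKS;
	printf("map %u blocks with %d journal credits\n",
	       max_blocks, chunk_trans_blocks(max_blocks));
	return 0;
}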
3500static void ext4_free_io_end(ext4_io_end_t *io)
3501{
3502 BUG_ON(!io);
3503 iput(io->inode);
3504 kfree(io);
3505}
3506static void dump_aio_dio_list(struct inode * inode)
3507{
3508#ifdef EXT4_DEBUG
3509 struct list_head *cur, *before, *after;
3510 ext4_io_end_t *io, *io0, *io1;
3511
3512 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3513 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3514 return;
3515 }
3516
3517 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3518 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3519 cur = &io->list;
3520 before = cur->prev;
3521 io0 = container_of(before, ext4_io_end_t, list);
3522 after = cur->next;
3523 io1 = container_of(after, ext4_io_end_t, list);
3524
 3525 ext4_debug("io 0x%p from inode %lu, prev 0x%p, next 0x%p\n",
3526 io, inode->i_ino, io0, io1);
3527 }
3528#endif
3529}
3530
3531/*
3532 * check a range of space and convert unwritten extents to written.
3533 */
3534static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3535{
3536 struct inode *inode = io->inode;
3537 loff_t offset = io->offset;
3538 size_t size = io->size;
3539 int ret = 0;
3540
 3541 ext4_debug("end_aio_dio_nolock: io 0x%p from inode %lu, list->next 0x%p, "
3542 "list->prev 0x%p\n",
3543 io, inode->i_ino, io->list.next, io->list.prev);
3544
3545 if (list_empty(&io->list))
3546 return ret;
3547
3548 if (io->flag != DIO_AIO_UNWRITTEN)
3549 return ret;
3550
3551 if (offset + size <= i_size_read(inode))
3552 ret = ext4_convert_unwritten_extents(inode, offset, size);
3553
3554 if (ret < 0) {
 3555 printk(KERN_EMERG "%s: failed to convert unwritten "
 3556 "extents to written extents, error is %d; "
 3557 "io is still on inode %lu aio dio list\n",
3558 __func__, ret, inode->i_ino);
3559 return ret;
3560 }
3561
3562 /* clear the DIO AIO unwritten flag */
3563 io->flag = 0;
3564 return ret;
3565}
3566/*
 3567 * work on completed aio dio IO, to convert unwritten extents to written
3568 */
3569static void ext4_end_aio_dio_work(struct work_struct *work)
3570{
3571 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3572 struct inode *inode = io->inode;
3573 int ret = 0;
3574
3575 mutex_lock(&inode->i_mutex);
3576 ret = ext4_end_aio_dio_nolock(io);
3577 if (ret >= 0) {
3578 if (!list_empty(&io->list))
3579 list_del_init(&io->list);
3580 ext4_free_io_end(io);
3581 }
3582 mutex_unlock(&inode->i_mutex);
3583}
3584/*
3585 * This function is called from ext4_sync_file().
3586 *
3587 * When AIO DIO IO is completed, the work to convert unwritten
3588 * extents to written is queued on workqueue but may not get immediately
3589 * scheduled. When fsync is called, we need to ensure the
3590 * conversion is complete before fsync returns.
3591 * The inode keeps track of a list of completed AIO from DIO path
3592 * that might needs to do the conversion. This function walks through
3593 * the list and convert the related unwritten extents to written.
3594 */
3595int flush_aio_dio_completed_IO(struct inode *inode)
3596{
3597 ext4_io_end_t *io;
3598 int ret = 0;
3599 int ret2 = 0;
3600
3601 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3602 return ret;
3603
3604 dump_aio_dio_list(inode);
3605 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3606 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3607 ext4_io_end_t, list);
3608 /*
 3609 * Call ext4_end_aio_dio_nolock() to convert the completed
 3610 * IO to written.
 3611 *
 3612 * When ext4_sync_file() is called, run_queue() may already
 3613 * be about to flush the work corresponding to this io
 3614 * structure. It will be upset if it finds that the io
 3615 * structure for the work it is about to schedule was freed.
 3616 *
 3617 * Thus we need to keep the io structure valid here after the
 3618 * conversion has finished. The io structure has a flag to
 3619 * avoid double conversion, from both fsync and the background
 3620 * work queue.
3621 */
3622 ret = ext4_end_aio_dio_nolock(io);
3623 if (ret < 0)
3624 ret2 = ret;
3625 else
3626 list_del_init(&io->list);
3627 }
3628 return (ret2 < 0) ? ret2 : 0;
3629}
3630
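The drain loop above keeps the first conversion error but keeps walking, and only unlinks entries that converted cleanly. A userspace sketch with a singly linked list standing in for i_aio_dio_complete_list; unlike the kernel loop, the sketch explicitly steps past a failed entry so it always terminates:

#include <stdio.h>

struct io_end {
	struct io_end *next;
	int fails;		/* 1: conversion will fail */
};

static int convert(struct io_end *io)	/* stub for the nolock helper */
{
	return io->fails ? -5 /* -EIO */ : 0;
}

static int flush_completed(struct io_end **head)
{
	int ret2 = 0;

	while (*head) {
		int ret = convert(*head);

		if (ret < 0) {
			ret2 = ret;		/* remember first error */
			head = &(*head)->next;	/* step past, keep queued */
		} else {
			*head = (*head)->next;	/* unlink converted entry */
		}
	}
	return ret2;
}

int main(void)
{
	struct io_end c = { NULL, 0 }, b = { &c, 1 }, a = { &b, 0 };
	struct io_end *head = &a;

	printf("ret=%d\n", flush_completed(&head)); /* -5; b stays queued */
	return 0;
}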
3631static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3632{
3633 ext4_io_end_t *io = NULL;
3634
3635 io = kmalloc(sizeof(*io), GFP_NOFS);
3636
3637 if (io) {
3638 igrab(inode);
3639 io->inode = inode;
3640 io->flag = 0;
3641 io->offset = 0;
3642 io->size = 0;
3643 io->error = 0;
3644 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3645 INIT_LIST_HEAD(&io->list);
3646 }
3647
3648 return io;
3649}
3650
3651static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3652 ssize_t size, void *private)
3653{
3654 ext4_io_end_t *io_end = iocb->private;
3655 struct workqueue_struct *wq;
3656
 3657 ext_debug("ext4_end_io_dio(): io_end 0x%p "
 3658 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3659 iocb->private, io_end->inode->i_ino, iocb, offset,
3660 size);
3661 /* if not async direct IO or dio with 0 bytes write, just return */
3662 if (!io_end || !size)
3663 return;
3664
3665 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){
3667 ext4_free_io_end(io_end);
3668 iocb->private = NULL;
3669 return;
3670 }
3671
3672 io_end->offset = offset;
3673 io_end->size = size;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675
3676 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work);
3678
 3679 /* Add the io_end to the per-inode completed aio dio list */
3680 list_add_tail(&io_end->list,
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682 iocb->private = NULL;
3683}
3684/*
 3685 * For ext4 extent files, ext4 will do direct-io writes to holes,
 3686 * to preallocated extents, and to writes that extend the file, with
 3687 * no need to fall back to buffered IO.
 3688 *
 3689 * For holes, we fallocate those blocks and mark them as uninitialized.
 3690 * If those blocks were preallocated, we make sure they are split, but
 3691 * still keep the range to write as uninitialized.
 3692 *
 3693 * The unwritten extents will be converted to written when DIO completes.
 3694 * For async direct IO, since the IO may still be pending on return, we
 3695 * set up an end_io callback function, which will do the conversion
 3696 * when the async direct IO completes.
 3697 *
 3698 * If the O_DIRECT write will extend the file then add this inode to the
 3699 * orphan list, so recovery will truncate it back to the original size
 3700 * if the machine crashes during the write.
 3701 *
3702 */
3703static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704 const struct iovec *iov, loff_t offset,
3705 unsigned long nr_segs)
3706{
3707 struct file *file = iocb->ki_filp;
3708 struct inode *inode = file->f_mapping->host;
3709 ssize_t ret;
3710 size_t count = iov_length(iov, nr_segs);
3711
3712 loff_t final_size = offset + count;
3713 if (rw == WRITE && final_size <= inode->i_size) {
3714 /*
 3715 * We can do direct writes to holes and to fallocated ranges.
 3716 *
 3717 * Blocks allocated to fill a hole are marked as uninitialized
 3718 * to prevent a parallel buffered read from exposing stale data
 3719 * before the DIO has completed the data IO.
 3720 *
 3721 * As for previously fallocated extents, ext4 get_block
 3722 * will simply mark the buffer mapped but still
 3723 * keep the extents uninitialized.
 3724 *
 3725 * For the non-AIO case, we convert those unwritten extents
 3726 * to written after returning from blockdev_direct_IO.
 3727 *
 3728 * For async DIO, the conversion must be deferred until
 3729 * the IO is completed. The ext4 end_io callback function
 3730 * will be called to take care of the conversion work.
 3731 * Here, for the async case, we allocate an io_end structure to
 3732 * hook to the iocb.
3733 */
3734 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode);
3738 if (!iocb->private)
3739 return -ENOMEM;
3740 /*
3741 * we save the io structure for current async
3742 * direct IO, so that later ext4_get_blocks()
3743 * could flag the io structure whether there
3744 * is a unwritten extents needs to be converted
3745 * when IO is completed.
3746 */
3747 EXT4_I(inode)->cur_aio_dio = iocb->private;
3748 }
3749
3750 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs,
3753 ext4_get_block_dio_write,
3754 ext4_end_io_dio);
3755 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL;
3757 /*
 3758 * The io_end structure takes a reference to the inode;
 3759 * that structure needs to be destroyed and the
 3760 * reference to the inode needs to be dropped when the IO
 3761 * is complete, even for a 0 byte write or a failure.
 3762 *
 3763 * In the successful AIO DIO case, the io_end structure will be
 3764 * destroyed and the reference to the inode will be dropped
 3765 * after the end_io callback function is called.
 3766 *
 3767 * In the 0 byte write or error case, since the
 3768 * VFS direct IO won't invoke the end_io callback function,
 3769 * we need to free the io_end structure here.
3770 */
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL;
3774 } else if (ret > 0)
3775 /*
 3776 * for the non-AIO case, since the IO is already
 3777 * completed, we can do the conversion right here
3778 */
3779 ret = ext4_convert_unwritten_extents(inode,
3780 offset, ret);
3781 return ret;
3782 }
3783
 3784 /* for writes extending the file, we fall back to the old way */
3785 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3786}
3787
3788static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3789 const struct iovec *iov, loff_t offset,
3790 unsigned long nr_segs)
3791{
3792 struct file *file = iocb->ki_filp;
3793 struct inode *inode = file->f_mapping->host;
3794
3795 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3796 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3797
3798 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3799}
3800
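The new ext4_direct_IO() is now a pure dispatcher on the inode's extents flag. A trivial sketch of that split, with stubbed paths and the flag value taken from EXT4_EXTENTS_FL:

#include <stdio.h>

#define EXTENTS_FL 0x80000	/* value of EXT4_EXTENTS_FL */

static long ext_dio(void) { return 1; }	/* stub extent DIO path */
static long ind_dio(void) { return 2; }	/* stub indirect DIO path */

static long direct_io(unsigned int i_flags)
{
	if (i_flags & EXTENTS_FL)
		return ext_dio();
	return ind_dio();
}

int main(void)
{
	printf("%ld %ld\n", direct_io(EXTENTS_FL), direct_io(0)); /* 1 2 */
	return 0;
}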
3344/* 3801/*
3345 * Pages can be marked dirty completely asynchronously from ext4's journalling 3802 * Pages can be marked dirty completely asynchronously from ext4's journalling
3346 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3803 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3373,6 +3830,7 @@ static const struct address_space_operations ext4_ordered_aops = {
3373 .direct_IO = ext4_direct_IO, 3830 .direct_IO = ext4_direct_IO,
3374 .migratepage = buffer_migrate_page, 3831 .migratepage = buffer_migrate_page,
3375 .is_partially_uptodate = block_is_partially_uptodate, 3832 .is_partially_uptodate = block_is_partially_uptodate,
3833 .error_remove_page = generic_error_remove_page,
3376}; 3834};
3377 3835
3378static const struct address_space_operations ext4_writeback_aops = { 3836static const struct address_space_operations ext4_writeback_aops = {
@@ -3388,6 +3846,7 @@ static const struct address_space_operations ext4_writeback_aops = {
3388 .direct_IO = ext4_direct_IO, 3846 .direct_IO = ext4_direct_IO,
3389 .migratepage = buffer_migrate_page, 3847 .migratepage = buffer_migrate_page,
3390 .is_partially_uptodate = block_is_partially_uptodate, 3848 .is_partially_uptodate = block_is_partially_uptodate,
3849 .error_remove_page = generic_error_remove_page,
3391}; 3850};
3392 3851
3393static const struct address_space_operations ext4_journalled_aops = { 3852static const struct address_space_operations ext4_journalled_aops = {
@@ -3402,6 +3861,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3402 .invalidatepage = ext4_invalidatepage, 3861 .invalidatepage = ext4_invalidatepage,
3403 .releasepage = ext4_releasepage, 3862 .releasepage = ext4_releasepage,
3404 .is_partially_uptodate = block_is_partially_uptodate, 3863 .is_partially_uptodate = block_is_partially_uptodate,
3864 .error_remove_page = generic_error_remove_page,
3405}; 3865};
3406 3866
3407static const struct address_space_operations ext4_da_aops = { 3867static const struct address_space_operations ext4_da_aops = {
@@ -3418,6 +3878,7 @@ static const struct address_space_operations ext4_da_aops = {
3418 .direct_IO = ext4_direct_IO, 3878 .direct_IO = ext4_direct_IO,
3419 .migratepage = buffer_migrate_page, 3879 .migratepage = buffer_migrate_page,
3420 .is_partially_uptodate = block_is_partially_uptodate, 3880 .is_partially_uptodate = block_is_partially_uptodate,
3881 .error_remove_page = generic_error_remove_page,
3421}; 3882};
3422 3883
3423void ext4_set_aops(struct inode *inode) 3884void ext4_set_aops(struct inode *inode)
@@ -3659,7 +4120,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3659 ext4_handle_dirty_metadata(handle, inode, bh); 4120 ext4_handle_dirty_metadata(handle, inode, bh);
3660 } 4121 }
3661 ext4_mark_inode_dirty(handle, inode); 4122 ext4_mark_inode_dirty(handle, inode);
3662 ext4_journal_test_restart(handle, inode); 4123 ext4_truncate_restart_trans(handle, inode,
4124 blocks_for_truncate(inode));
3663 if (bh) { 4125 if (bh) {
3664 BUFFER_TRACE(bh, "retaking write access"); 4126 BUFFER_TRACE(bh, "retaking write access");
3665 ext4_journal_get_write_access(handle, bh); 4127 ext4_journal_get_write_access(handle, bh);
@@ -3870,7 +4332,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3870 return; 4332 return;
3871 if (try_to_extend_transaction(handle, inode)) { 4333 if (try_to_extend_transaction(handle, inode)) {
3872 ext4_mark_inode_dirty(handle, inode); 4334 ext4_mark_inode_dirty(handle, inode);
3873 ext4_journal_test_restart(handle, inode); 4335 ext4_truncate_restart_trans(handle, inode,
4336 blocks_for_truncate(inode));
3874 } 4337 }
3875 4338
3876 ext4_free_blocks(handle, inode, nr, 1, 1); 4339 ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +4421,7 @@ void ext4_truncate(struct inode *inode)
3958 if (!ext4_can_truncate(inode)) 4421 if (!ext4_can_truncate(inode))
3959 return; 4422 return;
3960 4423
3961 if (ei->i_disksize && inode->i_size == 0 && 4424 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3962 !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3963 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4425 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3964 4426
3965 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4427 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4581,8 +5043,7 @@ static int ext4_do_update_inode(handle_t *handle,
4581 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5043 if (ext4_inode_blocks_set(handle, raw_inode, ei))
4582 goto out_brelse; 5044 goto out_brelse;
4583 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5045 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4584 /* clear the migrate flag in the raw_inode */ 5046 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
4585 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
4586 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5047 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
4587 cpu_to_le32(EXT4_OS_HURD)) 5048 cpu_to_le32(EXT4_OS_HURD))
4588 raw_inode->i_file_acl_high = 5049 raw_inode->i_file_acl_high =
@@ -4684,19 +5145,40 @@ out_brelse:
4684 */ 5145 */
4685int ext4_write_inode(struct inode *inode, int wait) 5146int ext4_write_inode(struct inode *inode, int wait)
4686{ 5147{
5148 int err;
5149
4687 if (current->flags & PF_MEMALLOC) 5150 if (current->flags & PF_MEMALLOC)
4688 return 0; 5151 return 0;
4689 5152
4690 if (ext4_journal_current_handle()) { 5153 if (EXT4_SB(inode->i_sb)->s_journal) {
4691 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 5154 if (ext4_journal_current_handle()) {
4692 dump_stack(); 5155 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4693 return -EIO; 5156 dump_stack();
4694 } 5157 return -EIO;
5158 }
4695 5159
4696 if (!wait) 5160 if (!wait)
4697 return 0; 5161 return 0;
4698 5162
4699 return ext4_force_commit(inode->i_sb); 5163 err = ext4_force_commit(inode->i_sb);
5164 } else {
5165 struct ext4_iloc iloc;
5166
5167 err = ext4_get_inode_loc(inode, &iloc);
5168 if (err)
5169 return err;
5170 if (wait)
5171 sync_dirty_buffer(iloc.bh);
5172 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5173 ext4_error(inode->i_sb, __func__,
5174 "IO error syncing inode, "
5175 "inode=%lu, block=%llu",
5176 inode->i_ino,
5177 (unsigned long long)iloc.bh->b_blocknr);
5178 err = -EIO;
5179 }
5180 }
5181 return err;
4700} 5182}
4701 5183
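ext4_write_inode() now has two sync strategies: with a journal, a synchronous write means forcing a commit; without one, it writes the buffer holding the on-disk inode directly. A stub sketch of that dispatch (both sync helpers are illustrative):

#include <stdio.h>

static int force_commit(void)		/* stub for ext4_force_commit() */
{
	puts("commit journal");
	return 0;
}

static int sync_inode_buffer(void)	/* stub for sync_dirty_buffer() */
{
	puts("sync inode buffer");
	return 0;
}

static int write_inode(int has_journal, int wait)
{
	if (has_journal) {
		if (!wait)
			return 0;	/* nothing to do for async */
		return force_commit();
	}
	return sync_inode_buffer();
}

int main(void)
{
	write_inode(1, 1);	/* journalled: force a commit */
	write_inode(0, 1);	/* no journal: write the buffer itself */
	return 0;
}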
4702/* 5184/*
@@ -5134,27 +5616,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5134 */ 5616 */
5135void ext4_dirty_inode(struct inode *inode) 5617void ext4_dirty_inode(struct inode *inode)
5136{ 5618{
5137 handle_t *current_handle = ext4_journal_current_handle();
5138 handle_t *handle; 5619 handle_t *handle;
5139 5620
5140 if (!ext4_handle_valid(current_handle)) {
5141 ext4_mark_inode_dirty(current_handle, inode);
5142 return;
5143 }
5144
5145 handle = ext4_journal_start(inode, 2); 5621 handle = ext4_journal_start(inode, 2);
5146 if (IS_ERR(handle)) 5622 if (IS_ERR(handle))
5147 goto out; 5623 goto out;
5148 if (current_handle && 5624
5149 current_handle->h_transaction != handle->h_transaction) { 5625 ext4_mark_inode_dirty(handle, inode);
5150 /* This task has a transaction open against a different fs */ 5626
5151 printk(KERN_EMERG "%s: transactions do not match!\n",
5152 __func__);
5153 } else {
5154 jbd_debug(5, "marking dirty. outer handle=%p\n",
5155 current_handle);
5156 ext4_mark_inode_dirty(handle, inode);
5157 }
5158 ext4_journal_stop(handle); 5627 ext4_journal_stop(handle);
5159out: 5628out:
5160 return; 5629 return;
@@ -5281,12 +5750,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5281 else 5750 else
5282 len = PAGE_CACHE_SIZE; 5751 len = PAGE_CACHE_SIZE;
5283 5752
5753 lock_page(page);
5754 /*
 5755 * return if we have all the buffers mapped. This avoids
 5756 * the need to call write_begin/write_end, which does a
 5757 * journal_start/journal_stop that can block and take a
 5758 * long time
5759 */
5284 if (page_has_buffers(page)) { 5760 if (page_has_buffers(page)) {
5285 /* return if we have all the buffers mapped */
5286 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5761 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5287 ext4_bh_unmapped)) 5762 ext4_bh_unmapped)) {
5763 unlock_page(page);
5288 goto out_unlock; 5764 goto out_unlock;
5765 }
5289 } 5766 }
5767 unlock_page(page);
5290 /* 5768 /*
5291 * OK, we need to fill the hole... Do write_begin write_end 5769 * OK, we need to fill the hole... Do write_begin write_end
 5292 * to do block allocation/reservation. We are not holding 5770 * to do block allocation/reservation. We are not holding
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
243 me.donor_start, me.len, &me.moved_len); 243 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp); 244 fput(donor_filp);
245 245
246 if (!err) 246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
247 if (copy_to_user((struct move_extent *)arg, 247 return -EFAULT;
248 &me, sizeof(me))) 248
249 return -EFAULT;
250 return err; 249 return err;
251 } 250 }
252 251
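The ioctl fix above copies the result structure back to userspace unconditionally, so a partial moved_len survives even when the move itself failed, and EFAULT from the copy takes precedence. A userspace sketch with memcpy standing in for copy_to_user:

#include <stdio.h>
#include <string.h>
#include <errno.h>

struct move_extent {
	unsigned long long moved_len;
};

/* op_err is the status of the move itself; the result struct is
 * copied out regardless, and a copy failure wins. */
static int do_ioctl(struct move_extent *uarg, int op_err)
{
	struct move_extent me = { .moved_len = 7 }; /* partial progress */

	if (!uarg)			/* models copy_to_user() failing */
		return -EFAULT;
	memcpy(uarg, &me, sizeof(me));	/* models copy_to_user() */
	return op_err;
}

int main(void)
{
	struct move_extent me = { 0 };
	int err = do_ioctl(&me, -ENOSPC);

	printf("err=%d moved_len=%llu\n", err, me.moved_len); /* -28, 7 */
	return 0;
}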
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..bba12824defa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h>
25#include <trace/events/ext4.h> 26#include <trace/events/ext4.h>
26 27
27/* 28/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
622 623
623/* FIXME!! need more doc */ 624/* FIXME!! need more doc */
624static void ext4_mb_mark_free_simple(struct super_block *sb, 625static void ext4_mb_mark_free_simple(struct super_block *sb,
625 void *buddy, unsigned first, int len, 626 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
626 struct ext4_group_info *grp) 627 struct ext4_group_info *grp)
627{ 628{
628 struct ext4_sb_info *sbi = EXT4_SB(sb); 629 struct ext4_sb_info *sbi = EXT4_SB(sb);
629 unsigned short min; 630 ext4_grpblk_t min;
630 unsigned short max; 631 ext4_grpblk_t max;
631 unsigned short chunk; 632 ext4_grpblk_t chunk;
632 unsigned short border; 633 unsigned short border;
633 634
634 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 635 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
662 void *buddy, void *bitmap, ext4_group_t group) 663 void *buddy, void *bitmap, ext4_group_t group)
663{ 664{
664 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 665 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
665 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 666 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
666 unsigned short i = 0; 667 ext4_grpblk_t i = 0;
667 unsigned short first; 668 ext4_grpblk_t first;
668 unsigned short len; 669 ext4_grpblk_t len;
669 unsigned free = 0; 670 unsigned free = 0;
670 unsigned fragments = 0; 671 unsigned fragments = 0;
671 unsigned long long period = get_cycles(); 672 unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
743 char *data; 744 char *data;
744 char *bitmap; 745 char *bitmap;
745 746
746 mb_debug("init page %lu\n", page->index); 747 mb_debug(1, "init page %lu\n", page->index);
747 748
748 inode = page->mapping->host; 749 inode = page->mapping->host;
749 sb = inode->i_sb; 750 sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
822 set_bitmap_uptodate(bh[i]); 823 set_bitmap_uptodate(bh[i]);
823 bh[i]->b_end_io = end_buffer_read_sync; 824 bh[i]->b_end_io = end_buffer_read_sync;
824 submit_bh(READ, bh[i]); 825 submit_bh(READ, bh[i]);
825 mb_debug("read bitmap for group %u\n", first_group + i); 826 mb_debug(1, "read bitmap for group %u\n", first_group + i);
826 } 827 }
827 828
828 /* wait for I/O completion */ 829 /* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 if ((first_block + i) & 1) { 863 if ((first_block + i) & 1) {
863 /* this is block of buddy */ 864 /* this is block of buddy */
864 BUG_ON(incore == NULL); 865 BUG_ON(incore == NULL);
865 mb_debug("put buddy for group %u in page %lu/%x\n", 866 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
866 group, page->index, i * blocksize); 867 group, page->index, i * blocksize);
867 grinfo = ext4_get_group_info(sb, group); 868 grinfo = ext4_get_group_info(sb, group);
868 grinfo->bb_fragments = 0; 869 grinfo->bb_fragments = 0;
869 memset(grinfo->bb_counters, 0, 870 memset(grinfo->bb_counters, 0,
870 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 871 sizeof(*grinfo->bb_counters) *
872 (sb->s_blocksize_bits+2));
871 /* 873 /*
872 * incore got set to the group block bitmap below 874 * incore got set to the group block bitmap below
873 */ 875 */
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
878 } else { 880 } else {
879 /* this is block of bitmap */ 881 /* this is block of bitmap */
880 BUG_ON(incore != NULL); 882 BUG_ON(incore != NULL);
881 mb_debug("put bitmap for group %u in page %lu/%x\n", 883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
882 group, page->index, i * blocksize); 884 group, page->index, i * blocksize);
883 885
884 /* see comments in ext4_mb_put_pa() */ 886 /* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
908 return err; 910 return err;
909} 911}
910 912
913static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{
916
917 int ret = 0;
918 void *bitmap;
919 int blocks_per_page;
920 int block, pnum, poff;
921 int num_grp_locked = 0;
922 struct ext4_group_info *this_grp;
923 struct ext4_sb_info *sbi = EXT4_SB(sb);
924 struct inode *inode = sbi->s_buddy_cache;
925 struct page *page = NULL, *bitmap_page = NULL;
926
927 mb_debug(1, "init group %u\n", group);
928 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
929 this_grp = ext4_get_group_info(sb, group);
930 /*
931 * This ensures that we don't reinit the buddy cache
932 * page which map to the group from which we are already
933 * allocating. If we are looking at the buddy cache we would
934 * have taken a reference using ext4_mb_load_buddy and that
935 * would have taken the alloc_sem lock.
936 */
937 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
938 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
939 /*
940 * somebody initialized the group
941 * return without doing anything
942 */
943 ret = 0;
944 goto err;
945 }
946 /*
947 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks.
949 * So for each group we need two blocks.
950 */
951 block = group * 2;
952 pnum = block / blocks_per_page;
953 poff = block % blocks_per_page;
954 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
955 if (page) {
956 BUG_ON(page->mapping != inode->i_mapping);
957 ret = ext4_mb_init_cache(page, NULL);
958 if (ret) {
959 unlock_page(page);
960 goto err;
961 }
962 unlock_page(page);
963 }
964 if (page == NULL || !PageUptodate(page)) {
965 ret = -EIO;
966 goto err;
967 }
968 mark_page_accessed(page);
969 bitmap_page = page;
970 bitmap = page_address(page) + (poff * sb->s_blocksize);
971
972 /* init buddy cache */
973 block++;
974 pnum = block / blocks_per_page;
975 poff = block % blocks_per_page;
976 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
977 if (page == bitmap_page) {
978 /*
979 * If both the bitmap and buddy are in
980 * the same page we don't need to force
981 * init the buddy
982 */
983 unlock_page(page);
984 } else if (page) {
985 BUG_ON(page->mapping != inode->i_mapping);
986 ret = ext4_mb_init_cache(page, bitmap);
987 if (ret) {
988 unlock_page(page);
989 goto err;
990 }
991 unlock_page(page);
992 }
993 if (page == NULL || !PageUptodate(page)) {
994 ret = -EIO;
995 goto err;
996 }
997 mark_page_accessed(page);
998err:
999 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1000 if (bitmap_page)
1001 page_cache_release(bitmap_page);
1002 if (page)
1003 page_cache_release(page);
1004 return ret;
1005}
1006
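The buddy cache inode stores, per group, the block bitmap and the buddy data in two consecutive blocks, so group g lives at blocks 2g and 2g+1, each split into a (page, offset) pair. A sketch of that index math, assuming a 4K page over 1K blocks:

#include <stdio.h>

#define PAGE_SIZE 4096
#define BLOCKSIZE 1024

int main(void)
{
	int blocks_per_page = PAGE_SIZE / BLOCKSIZE;
	unsigned group = 5;
	int block = group * 2;		/* block bitmap for this group */

	printf("bitmap: page %d offset %d\n",
	       block / blocks_per_page, block % blocks_per_page);
	block++;			/* buddy data follows immediately */
	printf("buddy:  page %d offset %d\n",
	       block / blocks_per_page, block % blocks_per_page);
	return 0;
}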
911static noinline_for_stack int 1007static noinline_for_stack int
912ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
913 struct ext4_buddy *e4b) 1009 struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
922 struct ext4_sb_info *sbi = EXT4_SB(sb); 1018 struct ext4_sb_info *sbi = EXT4_SB(sb);
923 struct inode *inode = sbi->s_buddy_cache; 1019 struct inode *inode = sbi->s_buddy_cache;
924 1020
925 mb_debug("load group %u\n", group); 1021 mb_debug(1, "load group %u\n", group);
926 1022
927 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1023 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
928 grp = ext4_get_group_info(sb, group); 1024 grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
941 * groups mapped by the page is blocked 1037 * groups mapped by the page is blocked
942 * till we are done with allocation 1038 * till we are done with allocation
943 */ 1039 */
1040repeat_load_buddy:
944 down_read(e4b->alloc_semp); 1041 down_read(e4b->alloc_semp);
945 1042
1043 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
 1044 /* we need to check the group's need-init flag
 1045 * with alloc_semp held so that we can be sure
 1046 * that new blocks didn't get added to the group
 1047 * while we are loading the buddy cache
1048 */
1049 up_read(e4b->alloc_semp);
1050 /*
1051 * we need full data about the group
1052 * to make a good selection
1053 */
1054 ret = ext4_mb_init_group(sb, group);
1055 if (ret)
1056 return ret;
1057 goto repeat_load_buddy;
1058 }
1059
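The repeat_load_buddy pattern exists because a read lock cannot be upgraded: the need-init flag has to be re-checked with the lock held, since another task may initialize the group between our unlock and relock. A sketch using a pthread rwlock as a stand-in for the kernel rw_semaphore (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t alloc_sem = PTHREAD_RWLOCK_INITIALIZER;
static int group_needs_init = 1;

static void init_group(void)	/* stands in for ext4_mb_init_group() */
{
	pthread_rwlock_wrlock(&alloc_sem);
	group_needs_init = 0;
	pthread_rwlock_unlock(&alloc_sem);
}

static void load_buddy(void)
{
repeat:
	pthread_rwlock_rdlock(&alloc_sem);
	if (group_needs_init) {
		/* a read lock can't be upgraded: drop it, init, retry */
		pthread_rwlock_unlock(&alloc_sem);
		init_group();
		goto repeat;
	}
	printf("group initialized, buddy loaded\n");
	pthread_rwlock_unlock(&alloc_sem);
}

int main(void)
{
	load_buddy();
	return 0;
}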
946 /* 1060 /*
947 * the buddy cache inode stores the block bitmap 1061 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks. 1062 * and buddy information in consecutive blocks.
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1360 ac->alloc_semp = e4b->alloc_semp; 1474 ac->alloc_semp = e4b->alloc_semp;
1361 e4b->alloc_semp = NULL; 1475 e4b->alloc_semp = NULL;
1362 /* store last allocated for subsequent stream allocation */ 1476 /* store last allocated for subsequent stream allocation */
1363 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1477 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1364 spin_lock(&sbi->s_md_lock); 1478 spin_lock(&sbi->s_md_lock);
1365 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1479 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1366 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1480 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1837 1951
1838} 1952}
1839 1953
1840static noinline_for_stack
1841int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1842{
1843
1844 int ret;
1845 void *bitmap;
1846 int blocks_per_page;
1847 int block, pnum, poff;
1848 int num_grp_locked = 0;
1849 struct ext4_group_info *this_grp;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 struct inode *inode = sbi->s_buddy_cache;
1852 struct page *page = NULL, *bitmap_page = NULL;
1853
1854 mb_debug("init group %lu\n", group);
1855 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1856 this_grp = ext4_get_group_info(sb, group);
1857 /*
1858 * This ensures we don't add group
1859 * to this buddy cache via resize
1860 */
1861 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1862 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1863 /*
1864 * somebody initialized the group
1865 * return without doing anything
1866 */
1867 ret = 0;
1868 goto err;
1869 }
1870 /*
1871 * the buddy cache inode stores the block bitmap
1872 * and buddy information in consecutive blocks.
1873 * So for each group we need two blocks.
1874 */
1875 block = group * 2;
1876 pnum = block / blocks_per_page;
1877 poff = block % blocks_per_page;
1878 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1879 if (page) {
1880 BUG_ON(page->mapping != inode->i_mapping);
1881 ret = ext4_mb_init_cache(page, NULL);
1882 if (ret) {
1883 unlock_page(page);
1884 goto err;
1885 }
1886 unlock_page(page);
1887 }
1888 if (page == NULL || !PageUptodate(page)) {
1889 ret = -EIO;
1890 goto err;
1891 }
1892 mark_page_accessed(page);
1893 bitmap_page = page;
1894 bitmap = page_address(page) + (poff * sb->s_blocksize);
1895
1896 /* init buddy cache */
1897 block++;
1898 pnum = block / blocks_per_page;
1899 poff = block % blocks_per_page;
1900 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1901 if (page == bitmap_page) {
1902 /*
1903 * If both the bitmap and buddy are in
1904 * the same page we don't need to force
1905 * init the buddy
1906 */
1907 unlock_page(page);
1908 } else if (page) {
1909 BUG_ON(page->mapping != inode->i_mapping);
1910 ret = ext4_mb_init_cache(page, bitmap);
1911 if (ret) {
1912 unlock_page(page);
1913 goto err;
1914 }
1915 unlock_page(page);
1916 }
1917 if (page == NULL || !PageUptodate(page)) {
1918 ret = -EIO;
1919 goto err;
1920 }
1921 mark_page_accessed(page);
1922err:
1923 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1924 if (bitmap_page)
1925 page_cache_release(bitmap_page);
1926 if (page)
1927 page_cache_release(page);
1928 return ret;
1929}
1930
1931static noinline_for_stack int 1954static noinline_for_stack int
1932ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1955ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1933{ 1956{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1938 struct ext4_sb_info *sbi; 1961 struct ext4_sb_info *sbi;
1939 struct super_block *sb; 1962 struct super_block *sb;
1940 struct ext4_buddy e4b; 1963 struct ext4_buddy e4b;
1941 loff_t size, isize;
1942 1964
1943 sb = ac->ac_sb; 1965 sb = ac->ac_sb;
1944 sbi = EXT4_SB(sb); 1966 sbi = EXT4_SB(sb);
1945 ngroups = ext4_get_groups_count(sb); 1967 ngroups = ext4_get_groups_count(sb);
1968 /* non-extent files are limited to low blocks/groups */
1969 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
1970 ngroups = sbi->s_blockfile_groups;
1971
1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1972 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1947 1973
1948 /* first, try the goal */ 1974 /* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1974 } 2000 }
1975 2001
1976 bsbits = ac->ac_sb->s_blocksize_bits; 2002 bsbits = ac->ac_sb->s_blocksize_bits;
1977 /* if stream allocation is enabled, use global goal */
1978 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1979 isize = i_size_read(ac->ac_inode) >> bsbits;
1980 if (size < isize)
1981 size = isize;
1982 2003
1983 if (size < sbi->s_mb_stream_request && 2004 /* if stream allocation is enabled, use global goal */
1984 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 2005 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1985 /* TBD: may be hot point */ 2006 /* TBD: may be hot point */
1986 spin_lock(&sbi->s_md_lock); 2007 spin_lock(&sbi->s_md_lock);
1987 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2008 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1988 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2009 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1989 spin_unlock(&sbi->s_md_lock); 2010 spin_unlock(&sbi->s_md_lock);
1990 } 2011 }
2012
 1991 /* Let's just scan groups to find more or less suitable blocks */ 2013 /* Let's just scan groups to find more or less suitable blocks */
1992 cr = ac->ac_2order ? 0 : 1; 2014 cr = ac->ac_2order ? 0 : 1;
1993 /* 2015 /*
@@ -2015,27 +2037,6 @@ repeat:
2015 if (grp->bb_free == 0) 2037 if (grp->bb_free == 0)
2016 continue; 2038 continue;
2017 2039
2018 /*
2019 * if the group is already init we check whether it is
2020 * a good group and if not we don't load the buddy
2021 */
2022 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2023 /*
2024 * we need full data about the group
2025 * to make a good selection
2026 */
2027 err = ext4_mb_init_group(sb, group);
2028 if (err)
2029 goto out;
2030 }
2031
2032 /*
2033 * If the particular group doesn't satisfy our
2034 * criteria we continue with the next group
2035 */
2036 if (!ext4_mb_good_group(ac, group, cr))
2037 continue;
2038
2039 err = ext4_mb_load_buddy(sb, group, &e4b); 2040 err = ext4_mb_load_buddy(sb, group, &e4b);
2040 if (err) 2041 if (err)
2041 goto out; 2042 goto out;
@@ -2095,207 +2096,6 @@ out:
2095 return err; 2096 return err;
2096} 2097}
2097 2098
2098#ifdef EXT4_MB_HISTORY
2099struct ext4_mb_proc_session {
2100 struct ext4_mb_history *history;
2101 struct super_block *sb;
2102 int start;
2103 int max;
2104};
2105
2106static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2107 struct ext4_mb_history *hs,
2108 int first)
2109{
2110 if (hs == s->history + s->max)
2111 hs = s->history;
2112 if (!first && hs == s->history + s->start)
2113 return NULL;
2114 while (hs->orig.fe_len == 0) {
2115 hs++;
2116 if (hs == s->history + s->max)
2117 hs = s->history;
2118 if (hs == s->history + s->start)
2119 return NULL;
2120 }
2121 return hs;
2122}
2123
2124static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2125{
2126 struct ext4_mb_proc_session *s = seq->private;
2127 struct ext4_mb_history *hs;
2128 int l = *pos;
2129
2130 if (l == 0)
2131 return SEQ_START_TOKEN;
2132 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2133 if (!hs)
2134 return NULL;
2135 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2136 return hs;
2137}
2138
2139static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2140 loff_t *pos)
2141{
2142 struct ext4_mb_proc_session *s = seq->private;
2143 struct ext4_mb_history *hs = v;
2144
2145 ++*pos;
2146 if (v == SEQ_START_TOKEN)
2147 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2148 else
2149 return ext4_mb_history_skip_empty(s, ++hs, 0);
2150}
2151
2152static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2153{
2154 char buf[25], buf2[25], buf3[25], *fmt;
2155 struct ext4_mb_history *hs = v;
2156
2157 if (v == SEQ_START_TOKEN) {
2158 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2159 "%-5s %-2s %-5s %-5s %-5s %-6s\n",
2160 "pid", "inode", "original", "goal", "result", "found",
2161 "grps", "cr", "flags", "merge", "tail", "broken");
2162 return 0;
2163 }
2164
2165 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2166 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2167 "%-5u %-5s %-5u %-6u\n";
2168 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2169 hs->result.fe_start, hs->result.fe_len,
2170 hs->result.fe_logical);
2171 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2172 hs->orig.fe_start, hs->orig.fe_len,
2173 hs->orig.fe_logical);
2174 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
2175 hs->goal.fe_start, hs->goal.fe_len,
2176 hs->goal.fe_logical);
2177 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2178 hs->found, hs->groups, hs->cr, hs->flags,
2179 hs->merged ? "M" : "", hs->tail,
2180 hs->buddy ? 1 << hs->buddy : 0);
2181 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2182 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2183 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2184 hs->result.fe_start, hs->result.fe_len,
2185 hs->result.fe_logical);
2186 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2187 hs->orig.fe_start, hs->orig.fe_len,
2188 hs->orig.fe_logical);
2189 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2190 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2191 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2192 hs->result.fe_start, hs->result.fe_len);
2193 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2194 hs->pid, hs->ino, buf2);
2195 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2196 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2197 hs->result.fe_start, hs->result.fe_len);
2198 seq_printf(seq, "%-5u %-8u %-23s free\n",
2199 hs->pid, hs->ino, buf2);
2200 }
2201 return 0;
2202}
2203
2204static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2205{
2206}
2207
2208static struct seq_operations ext4_mb_seq_history_ops = {
2209 .start = ext4_mb_seq_history_start,
2210 .next = ext4_mb_seq_history_next,
2211 .stop = ext4_mb_seq_history_stop,
2212 .show = ext4_mb_seq_history_show,
2213};
2214
2215static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2216{
2217 struct super_block *sb = PDE(inode)->data;
2218 struct ext4_sb_info *sbi = EXT4_SB(sb);
2219 struct ext4_mb_proc_session *s;
2220 int rc;
2221 int size;
2222
2223 if (unlikely(sbi->s_mb_history == NULL))
2224 return -ENOMEM;
2225 s = kmalloc(sizeof(*s), GFP_KERNEL);
2226 if (s == NULL)
2227 return -ENOMEM;
2228 s->sb = sb;
2229 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2230 s->history = kmalloc(size, GFP_KERNEL);
2231 if (s->history == NULL) {
2232 kfree(s);
2233 return -ENOMEM;
2234 }
2235
2236 spin_lock(&sbi->s_mb_history_lock);
2237 memcpy(s->history, sbi->s_mb_history, size);
2238 s->max = sbi->s_mb_history_max;
2239 s->start = sbi->s_mb_history_cur % s->max;
2240 spin_unlock(&sbi->s_mb_history_lock);
2241
2242 rc = seq_open(file, &ext4_mb_seq_history_ops);
2243 if (rc == 0) {
2244 struct seq_file *m = (struct seq_file *)file->private_data;
2245 m->private = s;
2246 } else {
2247 kfree(s->history);
2248 kfree(s);
2249 }
2250 return rc;
2251
2252}
2253
2254static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2255{
2256 struct seq_file *seq = (struct seq_file *)file->private_data;
2257 struct ext4_mb_proc_session *s = seq->private;
2258 kfree(s->history);
2259 kfree(s);
2260 return seq_release(inode, file);
2261}
2262
2263static ssize_t ext4_mb_seq_history_write(struct file *file,
2264 const char __user *buffer,
2265 size_t count, loff_t *ppos)
2266{
2267 struct seq_file *seq = (struct seq_file *)file->private_data;
2268 struct ext4_mb_proc_session *s = seq->private;
2269 struct super_block *sb = s->sb;
2270 char str[32];
2271 int value;
2272
2273 if (count >= sizeof(str)) {
2274 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2275 "mb_history", (int)sizeof(str));
2276 return -EOVERFLOW;
2277 }
2278
2279 if (copy_from_user(str, buffer, count))
2280 return -EFAULT;
2281
2282 value = simple_strtol(str, NULL, 0);
2283 if (value < 0)
2284 return -ERANGE;
2285 EXT4_SB(sb)->s_mb_history_filter = value;
2286
2287 return count;
2288}
2289
2290static struct file_operations ext4_mb_seq_history_fops = {
2291 .owner = THIS_MODULE,
2292 .open = ext4_mb_seq_history_open,
2293 .read = seq_read,
2294 .write = ext4_mb_seq_history_write,
2295 .llseek = seq_lseek,
2296 .release = ext4_mb_seq_history_release,
2297};
2298
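For context, the mb_history /proc interface being removed above is a textbook seq_file "snapshot at open" setup: open() copies the live ring under its lock, parks the copy in seq->private, and release() undoes both allocations. A minimal sketch of the same pattern, with hypothetical names (snap_*, live_*) standing in for the mballoc-specific ones:

struct snap_session {
	void *copy;			/* private snapshot of the live buffer */
};

static int snap_open(struct inode *inode, struct file *file)
{
	struct snap_session *s;
	int rc;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s == NULL)
		return -ENOMEM;
	s->copy = kmalloc(live_size, GFP_KERNEL);
	if (s->copy == NULL) {
		kfree(s);
		return -ENOMEM;
	}

	spin_lock(&live_lock);		/* freeze a consistent view */
	memcpy(s->copy, live_buf, live_size);
	spin_unlock(&live_lock);

	rc = seq_open(file, &snap_seq_ops);
	if (rc == 0) {
		((struct seq_file *)file->private_data)->private = s;
	} else {
		kfree(s->copy);		/* undo on seq_open() failure */
		kfree(s);
	}
	return rc;
}

release() then frees both objects before calling seq_release(), exactly as ext4_mb_seq_history_release() does below.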
2299static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2099static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2300{ 2100{
2301 struct super_block *sb = seq->private; 2101 struct super_block *sb = seq->private;
@@ -2328,7 +2128,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2328 struct ext4_buddy e4b; 2128 struct ext4_buddy e4b;
2329 struct sg { 2129 struct sg {
2330 struct ext4_group_info info; 2130 struct ext4_group_info info;
2331 unsigned short counters[16]; 2131 ext4_grpblk_t counters[16];
2332 } sg; 2132 } sg;
2333 2133
2334 group--; 2134 group--;
@@ -2366,7 +2166,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2366{ 2166{
2367} 2167}
2368 2168
2369static struct seq_operations ext4_mb_seq_groups_ops = { 2169static const struct seq_operations ext4_mb_seq_groups_ops = {
2370 .start = ext4_mb_seq_groups_start, 2170 .start = ext4_mb_seq_groups_start,
2371 .next = ext4_mb_seq_groups_next, 2171 .next = ext4_mb_seq_groups_next,
2372 .stop = ext4_mb_seq_groups_stop, 2172 .stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2187,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2387 2187
2388} 2188}
2389 2189
2390static struct file_operations ext4_mb_seq_groups_fops = { 2190static const struct file_operations ext4_mb_seq_groups_fops = {
2391 .owner = THIS_MODULE, 2191 .owner = THIS_MODULE,
2392 .open = ext4_mb_seq_groups_open, 2192 .open = ext4_mb_seq_groups_open,
2393 .read = seq_read, 2193 .read = seq_read,
@@ -2395,82 +2195,6 @@ static struct file_operations ext4_mb_seq_groups_fops = {
2395 .release = seq_release, 2195 .release = seq_release,
2396}; 2196};
2397 2197
2398static void ext4_mb_history_release(struct super_block *sb)
2399{
2400 struct ext4_sb_info *sbi = EXT4_SB(sb);
2401
2402 if (sbi->s_proc != NULL) {
2403 remove_proc_entry("mb_groups", sbi->s_proc);
2404 if (sbi->s_mb_history_max)
2405 remove_proc_entry("mb_history", sbi->s_proc);
2406 }
2407 kfree(sbi->s_mb_history);
2408}
2409
2410static void ext4_mb_history_init(struct super_block *sb)
2411{
2412 struct ext4_sb_info *sbi = EXT4_SB(sb);
2413 int i;
2414
2415 if (sbi->s_proc != NULL) {
2416 if (sbi->s_mb_history_max)
2417 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2418 &ext4_mb_seq_history_fops, sb);
2419 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2420 &ext4_mb_seq_groups_fops, sb);
2421 }
2422
2423 sbi->s_mb_history_cur = 0;
2424 spin_lock_init(&sbi->s_mb_history_lock);
2425 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2426 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2427 /* if we can't allocate history, then we simply won't use it */
2428}
2429
2430static noinline_for_stack void
2431ext4_mb_store_history(struct ext4_allocation_context *ac)
2432{
2433 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2434 struct ext4_mb_history h;
2435
2436 if (sbi->s_mb_history == NULL)
2437 return;
2438
2439 if (!(ac->ac_op & sbi->s_mb_history_filter))
2440 return;
2441
2442 h.op = ac->ac_op;
2443 h.pid = current->pid;
2444 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2445 h.orig = ac->ac_o_ex;
2446 h.result = ac->ac_b_ex;
2447 h.flags = ac->ac_flags;
2448 h.found = ac->ac_found;
2449 h.groups = ac->ac_groups_scanned;
2450 h.cr = ac->ac_criteria;
2451 h.tail = ac->ac_tail;
2452 h.buddy = ac->ac_buddy;
2453 h.merged = 0;
2454 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2455 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2456 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2457 h.merged = 1;
2458 h.goal = ac->ac_g_ex;
2459 h.result = ac->ac_f_ex;
2460 }
2461
2462 spin_lock(&sbi->s_mb_history_lock);
2463 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2464 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2465 sbi->s_mb_history_cur = 0;
2466 spin_unlock(&sbi->s_mb_history_lock);
2467}
2468
2469#else
2470#define ext4_mb_history_release(sb)
2471#define ext4_mb_history_init(sb)
2472#endif
2473
2474 2198
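The store path removed above is a fixed-capacity ring logger: build the record on the stack, then publish it under a short spinlock hold, wrapping the cursor at capacity so the newest entries overwrite the oldest. The same idea in isolation (struct rec and the ring fields are placeholders, not mballoc names):

struct ring {
	struct rec *slots;	/* NULL when allocation failed: logging off */
	unsigned int cur;	/* next write index */
	unsigned int max;	/* capacity */
	spinlock_t lock;
};

static void ring_store(struct ring *r, const struct rec *rec)
{
	if (r->slots == NULL)
		return;			/* degrade gracefully, as mballoc did */
	spin_lock(&r->lock);
	r->slots[r->cur] = *rec;
	if (++r->cur >= r->max)
		r->cur = 0;		/* wrap: overwrite the oldest record */
	spin_unlock(&r->lock);
}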
2475/* Create and initialize ext4_group_info data for the given group. */ 2199/* Create and initialize ext4_group_info data for the given group. */
2476int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2200int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2532,7 +2256,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2532 2256
2533 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2257 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2534 init_rwsem(&meta_group_info[i]->alloc_sem); 2258 init_rwsem(&meta_group_info[i]->alloc_sem);
2535 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2259 meta_group_info[i]->bb_free_root.rb_node = NULL;
2536 2260
2537#ifdef DOUBLE_CHECK 2261#ifdef DOUBLE_CHECK
2538 { 2262 {
@@ -2558,26 +2282,15 @@ exit_meta_group_info:
2558 return -ENOMEM; 2282 return -ENOMEM;
2559} /* ext4_mb_add_groupinfo */ 2283} /* ext4_mb_add_groupinfo */
2560 2284
2561/*
2562 * Update an existing group.
2563 * This function is used for online resize
2564 */
2565void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2566{
2567 grp->bb_free += add;
2568}
2569
2570static int ext4_mb_init_backend(struct super_block *sb) 2285static int ext4_mb_init_backend(struct super_block *sb)
2571{ 2286{
2572 ext4_group_t ngroups = ext4_get_groups_count(sb); 2287 ext4_group_t ngroups = ext4_get_groups_count(sb);
2573 ext4_group_t i; 2288 ext4_group_t i;
2574 int metalen;
2575 struct ext4_sb_info *sbi = EXT4_SB(sb); 2289 struct ext4_sb_info *sbi = EXT4_SB(sb);
2576 struct ext4_super_block *es = sbi->s_es; 2290 struct ext4_super_block *es = sbi->s_es;
2577 int num_meta_group_infos; 2291 int num_meta_group_infos;
2578 int num_meta_group_infos_max; 2292 int num_meta_group_infos_max;
2579 int array_size; 2293 int array_size;
2580 struct ext4_group_info **meta_group_info;
2581 struct ext4_group_desc *desc; 2294 struct ext4_group_desc *desc;
2582 2295
2583 /* This is the number of blocks used by GDT */ 2296 /* This is the number of blocks used by GDT */
@@ -2622,22 +2335,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
2622 goto err_freesgi; 2335 goto err_freesgi;
2623 } 2336 }
2624 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2337 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2625
2626 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2627 for (i = 0; i < num_meta_group_infos; i++) {
2628 if ((i + 1) == num_meta_group_infos)
2629 metalen = sizeof(*meta_group_info) *
2630 (ngroups -
2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2632 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2633 if (meta_group_info == NULL) {
2634 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2635 "buddy group\n");
2636 goto err_freemeta;
2637 }
2638 sbi->s_group_info[i] = meta_group_info;
2639 }
2640
2641 for (i = 0; i < ngroups; i++) { 2338 for (i = 0; i < ngroups; i++) {
2642 desc = ext4_get_group_desc(sb, i, NULL); 2339 desc = ext4_get_group_desc(sb, i, NULL);
2643 if (desc == NULL) { 2340 if (desc == NULL) {
@@ -2655,7 +2352,6 @@ err_freebuddy:
2655 while (i-- > 0) 2352 while (i-- > 0)
2656 kfree(ext4_get_group_info(sb, i)); 2353 kfree(ext4_get_group_info(sb, i));
2657 i = num_meta_group_infos; 2354 i = num_meta_group_infos;
2658err_freemeta:
2659 while (i-- > 0) 2355 while (i-- > 0)
2660 kfree(sbi->s_group_info[i]); 2356 kfree(sbi->s_group_info[i]);
2661 iput(sbi->s_buddy_cache); 2357 iput(sbi->s_buddy_cache);
@@ -2672,14 +2368,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2672 unsigned max; 2368 unsigned max;
2673 int ret; 2369 int ret;
2674 2370
2675 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2371 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2676 2372
2677 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2373 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2678 if (sbi->s_mb_offsets == NULL) { 2374 if (sbi->s_mb_offsets == NULL) {
2679 return -ENOMEM; 2375 return -ENOMEM;
2680 } 2376 }
2681 2377
2682 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2378 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2683 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2379 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2684 if (sbi->s_mb_maxs == NULL) { 2380 if (sbi->s_mb_maxs == NULL) {
2685 kfree(sbi->s_mb_offsets); 2381 kfree(sbi->s_mb_offsets);
@@ -2717,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2717 sbi->s_mb_stats = MB_DEFAULT_STATS; 2413 sbi->s_mb_stats = MB_DEFAULT_STATS;
2718 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2414 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2719 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2415 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2720 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2721 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2416 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2722 2417
2723 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2418 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2735,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2735 spin_lock_init(&lg->lg_prealloc_lock); 2430 spin_lock_init(&lg->lg_prealloc_lock);
2736 } 2431 }
2737 2432
2738 ext4_mb_history_init(sb); 2433 if (sbi->s_proc)
2434 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2435 &ext4_mb_seq_groups_fops, sb);
2739 2436
2740 if (sbi->s_journal) 2437 if (sbi->s_journal)
2741 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2438 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2742
2743 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2744 return 0; 2439 return 0;
2745} 2440}
2746 2441
@@ -2758,7 +2453,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2758 kmem_cache_free(ext4_pspace_cachep, pa); 2453 kmem_cache_free(ext4_pspace_cachep, pa);
2759 } 2454 }
2760 if (count) 2455 if (count)
2761 mb_debug("mballoc: %u PAs left\n", count); 2456 mb_debug(1, "mballoc: %u PAs left\n", count);
2762 2457
2763} 2458}
2764 2459
@@ -2817,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
2817 } 2512 }
2818 2513
2819 free_percpu(sbi->s_locality_groups); 2514 free_percpu(sbi->s_locality_groups);
2820 ext4_mb_history_release(sb); 2515 if (sbi->s_proc)
2516 remove_proc_entry("mb_groups", sbi->s_proc);
2821 2517
2822 return 0; 2518 return 0;
2823} 2519}
@@ -2839,7 +2535,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2839 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2535 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2840 entry = list_entry(l, struct ext4_free_data, list); 2536 entry = list_entry(l, struct ext4_free_data, list);
2841 2537
2842 mb_debug("gonna free %u blocks in group %u (0x%p):", 2538 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2843 entry->count, entry->group, entry); 2539 entry->count, entry->group, entry);
2844 2540
2845 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2541 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2570,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2874 ext4_mb_release_desc(&e4b); 2570 ext4_mb_release_desc(&e4b);
2875 } 2571 }
2876 2572
2877 mb_debug("freed %u blocks in %u structures\n", count, count2); 2573 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2574}
2575
2576#ifdef CONFIG_EXT4_DEBUG
2577u8 mb_enable_debug __read_mostly;
2578
2579static struct dentry *debugfs_dir;
2580static struct dentry *debugfs_debug;
2581
2582static void __init ext4_create_debugfs_entry(void)
2583{
2584 debugfs_dir = debugfs_create_dir("ext4", NULL);
2585 if (debugfs_dir)
2586 debugfs_debug = debugfs_create_u8("mballoc-debug",
2587 S_IRUGO | S_IWUSR,
2588 debugfs_dir,
2589 &mb_enable_debug);
2590}
2591
2592static void ext4_remove_debugfs_entry(void)
2593{
2594 debugfs_remove(debugfs_debug);
2595 debugfs_remove(debugfs_dir);
2878} 2596}
2879 2597
2598#else
2599
2600static void __init ext4_create_debugfs_entry(void)
2601{
2602}
2603
2604static void ext4_remove_debugfs_entry(void)
2605{
2606}
2607
2608#endif
2609
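The new CONFIG_EXT4_DEBUG block above leans on two debugfs properties: debugfs_create_u8() exposes a byte that userspace can read and write directly, and debugfs_remove() is a no-op for a NULL dentry, so teardown needs no bookkeeping about which entries were actually created. The same pairing for a hypothetical module:

static struct dentry *dbg_dir;
static struct dentry *dbg_level;
static u8 dbg_enable;

static void __init my_debugfs_init(void)
{
	dbg_dir = debugfs_create_dir("mydrv", NULL);
	if (dbg_dir)
		dbg_level = debugfs_create_u8("debug", S_IRUGO | S_IWUSR,
					      dbg_dir, &dbg_enable);
}

static void my_debugfs_exit(void)
{
	debugfs_remove(dbg_level);	/* NULL-safe, no error tracking needed */
	debugfs_remove(dbg_dir);
}

Userspace then drives the knob with a plain write, e.g. echo 2 > /sys/kernel/debug/ext4/mballoc-debug for the entry created here.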
2880int __init init_ext4_mballoc(void) 2610int __init init_ext4_mballoc(void)
2881{ 2611{
2882 ext4_pspace_cachep = 2612 ext4_pspace_cachep =
@@ -2904,6 +2634,7 @@ int __init init_ext4_mballoc(void)
2904 kmem_cache_destroy(ext4_ac_cachep); 2634 kmem_cache_destroy(ext4_ac_cachep);
2905 return -ENOMEM; 2635 return -ENOMEM;
2906 } 2636 }
2637 ext4_create_debugfs_entry();
2907 return 0; 2638 return 0;
2908} 2639}
2909 2640
@@ -2917,6 +2648,7 @@ void exit_ext4_mballoc(void)
2917 kmem_cache_destroy(ext4_pspace_cachep); 2648 kmem_cache_destroy(ext4_pspace_cachep);
2918 kmem_cache_destroy(ext4_ac_cachep); 2649 kmem_cache_destroy(ext4_ac_cachep);
2919 kmem_cache_destroy(ext4_free_ext_cachep); 2650 kmem_cache_destroy(ext4_free_ext_cachep);
2651 ext4_remove_debugfs_entry();
2920} 2652}
2921 2653
2922 2654
@@ -3061,7 +2793,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3061 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 2793 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3062 else 2794 else
3063 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 2795 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3064 mb_debug("#%u: goal %u blocks for locality group\n", 2796 mb_debug(1, "#%u: goal %u blocks for locality group\n",
3065 current->pid, ac->ac_g_ex.fe_len); 2797 current->pid, ac->ac_g_ex.fe_len);
3066} 2798}
3067 2799
@@ -3180,23 +2912,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3180 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 2912 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3181 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 2913 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3182 2914
3183 /* skip PA normalized request doesn't overlap with */ 2915 /* skip PAs this normalized request doesn't overlap with */
3184 if (pa->pa_lstart >= end) { 2916 if (pa->pa_lstart >= end || pa_end <= start) {
3185 spin_unlock(&pa->pa_lock);
3186 continue;
3187 }
3188 if (pa_end <= start) {
3189 spin_unlock(&pa->pa_lock); 2917 spin_unlock(&pa->pa_lock);
3190 continue; 2918 continue;
3191 } 2919 }
3192 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 2920 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3193 2921
2922 /* adjust start or end to be adjacent to this pa */
3194 if (pa_end <= ac->ac_o_ex.fe_logical) { 2923 if (pa_end <= ac->ac_o_ex.fe_logical) {
3195 BUG_ON(pa_end < start); 2924 BUG_ON(pa_end < start);
3196 start = pa_end; 2925 start = pa_end;
3197 } 2926 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3198
3199 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3200 BUG_ON(pa->pa_lstart > end); 2927 BUG_ON(pa->pa_lstart > end);
3201 end = pa->pa_lstart; 2928 end = pa->pa_lstart;
3202 } 2929 }
@@ -3251,7 +2978,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3251 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 2978 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3252 } 2979 }
3253 2980
3254 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 2981 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3255 (unsigned) orig_size, (unsigned) start); 2982 (unsigned) orig_size, (unsigned) start);
3256} 2983}
3257 2984
@@ -3272,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3272 atomic_inc(&sbi->s_bal_breaks); 2999 atomic_inc(&sbi->s_bal_breaks);
3273 } 3000 }
3274 3001
3275 ext4_mb_store_history(ac); 3002 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3003 trace_ext4_mballoc_alloc(ac);
3004 else
3005 trace_ext4_mballoc_prealloc(ac);
3276} 3006}
3277 3007
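The role of ext4_mb_store_history() moves to tracepoints here: instead of an always-on in-kernel ring, records are emitted only while the corresponding ftrace event is enabled (e.g. through /sys/kernel/debug/tracing/events/ext4/). The real definitions live in include/trace/events/ext4.h; a simplified, hypothetical event of the same shape is declared roughly like this:

TRACE_EVENT(myfs_alloc,
	TP_PROTO(unsigned int group, unsigned int len),
	TP_ARGS(group, len),
	TP_STRUCT__entry(
		__field(unsigned int, group)
		__field(unsigned int, len)
	),
	TP_fast_assign(
		__entry->group = group;
		__entry->len = len;
	),
	TP_printk("group %u len %u", __entry->group, __entry->len)
);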
3278/* 3008/*
@@ -3300,7 +3030,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3300 BUG_ON(pa->pa_free < len); 3030 BUG_ON(pa->pa_free < len);
3301 pa->pa_free -= len; 3031 pa->pa_free -= len;
3302 3032
3303 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3033 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3304} 3034}
3305 3035
3306/* 3036/*
@@ -3324,7 +3054,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3324 * in on-disk bitmap -- see ext4_mb_release_context() 3054 * in on-disk bitmap -- see ext4_mb_release_context()
3325 * Other CPUs are prevented from allocating from this pa by lg_mutex 3055 * Other CPUs are prevented from allocating from this pa by lg_mutex
3326 */ 3056 */
3327 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3057 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3328} 3058}
3329 3059
3330/* 3060/*
@@ -3382,6 +3112,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3382 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3112 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3383 continue; 3113 continue;
3384 3114
3115 /* non-extent files can't have physical blocks past 2^32 */
3116 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
3117 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3118 continue;
3119
3385 /* found preallocated blocks, use them */ 3120 /* found preallocated blocks, use them */
3386 spin_lock(&pa->pa_lock); 3121 spin_lock(&pa->pa_lock);
3387 if (pa->pa_deleted == 0 && pa->pa_free) { 3122 if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3238,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3503 preallocated += len; 3238 preallocated += len;
3504 count++; 3239 count++;
3505 } 3240 }
3506 mb_debug("preallocated %u for group %u\n", preallocated, group); 3241 mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
3507} 3242}
3508 3243
3509static void ext4_mb_pa_callback(struct rcu_head *head) 3244static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3373,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3638 pa->pa_deleted = 0; 3373 pa->pa_deleted = 0;
3639 pa->pa_type = MB_INODE_PA; 3374 pa->pa_type = MB_INODE_PA;
3640 3375
3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3376 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3377 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3643 trace_ext4_mb_new_inode_pa(ac, pa); 3378 trace_ext4_mb_new_inode_pa(ac, pa);
3644 3379
@@ -3698,7 +3433,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3698 pa->pa_deleted = 0; 3433 pa->pa_deleted = 0;
3699 pa->pa_type = MB_GROUP_PA; 3434 pa->pa_type = MB_GROUP_PA;
3700 3435
3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3436 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3437 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3703 trace_ext4_mb_new_group_pa(ac, pa); 3438 trace_ext4_mb_new_group_pa(ac, pa);
3704 3439
@@ -3767,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3767 if (ac) { 3502 if (ac) {
3768 ac->ac_sb = sb; 3503 ac->ac_sb = sb;
3769 ac->ac_inode = pa->pa_inode; 3504 ac->ac_inode = pa->pa_inode;
3770 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3771 } 3505 }
3772 3506
3773 while (bit < end) { 3507 while (bit < end) {
@@ -3777,7 +3511,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3777 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3511 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3778 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3512 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3779 le32_to_cpu(sbi->s_es->s_first_data_block); 3513 le32_to_cpu(sbi->s_es->s_first_data_block);
3780 mb_debug(" free preallocated %u/%u in group %u\n", 3514 mb_debug(1, " free preallocated %u/%u in group %u\n",
3781 (unsigned) start, (unsigned) next - bit, 3515 (unsigned) start, (unsigned) next - bit,
3782 (unsigned) group); 3516 (unsigned) group);
3783 free += next - bit; 3517 free += next - bit;
@@ -3787,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3787 ac->ac_b_ex.fe_start = bit; 3521 ac->ac_b_ex.fe_start = bit;
3788 ac->ac_b_ex.fe_len = next - bit; 3522 ac->ac_b_ex.fe_len = next - bit;
3789 ac->ac_b_ex.fe_logical = 0; 3523 ac->ac_b_ex.fe_logical = 0;
3790 ext4_mb_store_history(ac); 3524 trace_ext4_mballoc_discard(ac);
3791 } 3525 }
3792 3526
3793 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3527 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3822,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3822 ext4_group_t group; 3556 ext4_group_t group;
3823 ext4_grpblk_t bit; 3557 ext4_grpblk_t bit;
3824 3558
3825 if (ac)
3826 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3827
3828 trace_ext4_mb_release_group_pa(ac, pa); 3559 trace_ext4_mb_release_group_pa(ac, pa);
3829 BUG_ON(pa->pa_deleted == 0); 3560 BUG_ON(pa->pa_deleted == 0);
3830 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3561 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3839,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3839 ac->ac_b_ex.fe_start = bit; 3570 ac->ac_b_ex.fe_start = bit;
3840 ac->ac_b_ex.fe_len = pa->pa_len; 3571 ac->ac_b_ex.fe_len = pa->pa_len;
3841 ac->ac_b_ex.fe_logical = 0; 3572 ac->ac_b_ex.fe_logical = 0;
3842 ext4_mb_store_history(ac); 3573 trace_ext4_mballoc_discard(ac);
3843 } 3574 }
3844 3575
3845 return 0; 3576 return 0;
@@ -3868,7 +3599,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3868 int busy = 0; 3599 int busy = 0;
3869 int free = 0; 3600 int free = 0;
3870 3601
3871 mb_debug("discard preallocation for group %u\n", group); 3602 mb_debug(1, "discard preallocation for group %u\n", group);
3872 3603
3873 if (list_empty(&grp->bb_prealloc_list)) 3604 if (list_empty(&grp->bb_prealloc_list))
3874 return 0; 3605 return 0;
@@ -3992,7 +3723,7 @@ void ext4_discard_preallocations(struct inode *inode)
3992 return; 3723 return;
3993 } 3724 }
3994 3725
3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 3726 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3996 trace_ext4_discard_preallocations(inode); 3727 trace_ext4_discard_preallocations(inode);
3997 3728
3998 INIT_LIST_HEAD(&list); 3729 INIT_LIST_HEAD(&list);
@@ -4097,7 +3828,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4097{ 3828{
4098 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 3829 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4099} 3830}
4100#ifdef MB_DEBUG 3831#ifdef CONFIG_EXT4_DEBUG
4101static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3832static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4102{ 3833{
4103 struct super_block *sb = ac->ac_sb; 3834 struct super_block *sb = ac->ac_sb;
@@ -4139,14 +3870,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4139 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3870 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4140 NULL, &start); 3871 NULL, &start);
4141 spin_unlock(&pa->pa_lock); 3872 spin_unlock(&pa->pa_lock);
4142 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 3873 printk(KERN_ERR "PA:%u:%d:%u \n", i,
4143 start, pa->pa_len); 3874 start, pa->pa_len);
4144 } 3875 }
4145 ext4_unlock_group(sb, i); 3876 ext4_unlock_group(sb, i);
4146 3877
4147 if (grp->bb_free == 0) 3878 if (grp->bb_free == 0)
4148 continue; 3879 continue;
4149 printk(KERN_ERR "%lu: %d/%d \n", 3880 printk(KERN_ERR "%u: %d/%d \n",
4150 i, grp->bb_free, grp->bb_fragments); 3881 i, grp->bb_free, grp->bb_fragments);
4151 } 3882 }
4152 printk(KERN_ERR "\n"); 3883 printk(KERN_ERR "\n");
@@ -4174,16 +3905,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4174 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3905 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4175 return; 3906 return;
4176 3907
3908 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3909 return;
3910
4177 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3911 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4178 isize = i_size_read(ac->ac_inode) >> bsbits; 3912 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4179 size = max(size, isize); 3913 >> bsbits;
4180 3914
4181 /* don't use group allocation for large files */ 3915 if ((size == isize) &&
4182 if (size >= sbi->s_mb_stream_request) 3916 !ext4_fs_is_busy(sbi) &&
3917 (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
3918 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4183 return; 3919 return;
3920 }
4184 3921
4185 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 3922 /* don't use group allocation for large files */
3923 size = max(size, isize);
3924 if (size >= sbi->s_mb_stream_request) {
3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4186 return; 3926 return;
3927 }
4187 3928
4188 BUG_ON(ac->ac_lg != NULL); 3929 BUG_ON(ac->ac_lg != NULL);
4189 /* 3930 /*
@@ -4246,7 +3987,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246 * locality group. this is a policy, actually */ 3987 * locality group. this is a policy, actually */
4247 ext4_mb_group_or_file(ac); 3988 ext4_mb_group_or_file(ac);
4248 3989
4249 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 3990 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4250 "left: %u/%u, right %u/%u to %swritable\n", 3991 "left: %u/%u, right %u/%u to %swritable\n",
4251 (unsigned) ar->len, (unsigned) ar->logical, 3992 (unsigned) ar->len, (unsigned) ar->logical,
4252 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 3993 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4009,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4268 struct ext4_prealloc_space *pa, *tmp; 4009 struct ext4_prealloc_space *pa, *tmp;
4269 struct ext4_allocation_context *ac; 4010 struct ext4_allocation_context *ac;
4270 4011
4271 mb_debug("discard locality group preallocation\n"); 4012 mb_debug(1, "discard locality group preallocation\n");
4272 4013
4273 INIT_LIST_HEAD(&discard_list); 4014 INIT_LIST_HEAD(&discard_list);
4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4015 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
@@ -4720,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4720 4461
4721 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4722 if (ac) { 4463 if (ac) {
4723 ac->ac_op = EXT4_MB_HISTORY_FREE;
4724 ac->ac_inode = inode; 4464 ac->ac_inode = inode;
4725 ac->ac_sb = sb; 4465 ac->ac_sb = sb;
4726 } 4466 }
@@ -4787,7 +4527,7 @@ do_more:
4787 ac->ac_b_ex.fe_group = block_group; 4527 ac->ac_b_ex.fe_group = block_group;
4788 ac->ac_b_ex.fe_start = bit; 4528 ac->ac_b_ex.fe_start = bit;
4789 ac->ac_b_ex.fe_len = count; 4529 ac->ac_b_ex.fe_len = count;
4790 ext4_mb_store_history(ac); 4530 trace_ext4_mballoc_free(ac);
4791 } 4531 }
4792 4532
4793 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,25 +37,23 @@
37 37
38/* 38/*
39 */ 39 */
40#define MB_DEBUG__ 40#ifdef CONFIG_EXT4_DEBUG
41#ifdef MB_DEBUG 41extern u8 mb_enable_debug;
42#define mb_debug(fmt, a...) printk(fmt, ##a) 42
43#define mb_debug(n, fmt, a...) \
44 do { \
45 if ((n) <= mb_enable_debug) { \
46 printk(KERN_DEBUG "(%s, %d): %s: ", \
47 __FILE__, __LINE__, __func__); \
48 printk(fmt, ## a); \
49 } \
50 } while (0)
43#else 51#else
44#define mb_debug(fmt, a...) 52#define mb_debug(n, fmt, a...)
45#endif 53#endif
46 54
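With the reworked macro each call site names a verbosity level, compared at runtime against mb_enable_debug (settable through the mballoc-debug debugfs entry added in mballoc.c), so chatty messages can stay compiled in but silent by default. Typical usage, taken from the call sites in this patch plus one hypothetical higher-level line:

	mb_debug(1, "discard preallocation for group %u\n", group);
	/* hypothetical: only printed once mballoc-debug is raised to 2 or more */
	mb_debug(2, "order-%d scan hit %u candidates\n", order, found);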
47/*
48 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
49 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
50 */
51#define EXT4_MB_HISTORY
52#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 55#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
53#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 56#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
54#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
55#define EXT4_MB_HISTORY_FREE 8 /* free */
56
57#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
58 EXT4_MB_HISTORY_PREALLOC)
59 57
60/* 58/*
61 * How long mballoc can look for a best extent (in found extents) 59 * How long mballoc can look for a best extent (in found extents)
@@ -76,7 +74,7 @@
76 * with 'ext4_mb_stats' allocator will collect stats that will be 74 * with 'ext4_mb_stats' allocator will collect stats that will be
77 * shown at umount. The collecting costs though! 75 * shown at umount. The collecting costs though!
78 */ 76 */
79#define MB_DEFAULT_STATS 1 77#define MB_DEFAULT_STATS 0
80 78
81/* 79/*
82 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 80 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -128,8 +126,8 @@ struct ext4_prealloc_space {
128 unsigned pa_deleted; 126 unsigned pa_deleted;
129 ext4_fsblk_t pa_pstart; /* phys. block */ 127 ext4_fsblk_t pa_pstart; /* phys. block */
130 ext4_lblk_t pa_lstart; /* log. block */ 128 ext4_lblk_t pa_lstart; /* log. block */
131 unsigned short pa_len; /* len of preallocated chunk */ 129 ext4_grpblk_t pa_len; /* len of preallocated chunk */
132 unsigned short pa_free; /* how many blocks are free */ 130 ext4_grpblk_t pa_free; /* how many blocks are free */
133 unsigned short pa_type; /* pa type. inode or group */ 131 unsigned short pa_type; /* pa type. inode or group */
134 spinlock_t *pa_obj_lock; 132 spinlock_t *pa_obj_lock;
135 struct inode *pa_inode; /* hack, for history only */ 133 struct inode *pa_inode; /* hack, for history only */
@@ -144,7 +142,7 @@ struct ext4_free_extent {
144 ext4_lblk_t fe_logical; 142 ext4_lblk_t fe_logical;
145 ext4_grpblk_t fe_start; 143 ext4_grpblk_t fe_start;
146 ext4_group_t fe_group; 144 ext4_group_t fe_group;
147 int fe_len; 145 ext4_grpblk_t fe_len;
148}; 146};
149 147
150/* 148/*
@@ -209,22 +207,6 @@ struct ext4_allocation_context {
209#define AC_STATUS_FOUND 2 207#define AC_STATUS_FOUND 2
210#define AC_STATUS_BREAK 3 208#define AC_STATUS_BREAK 3
211 209
212struct ext4_mb_history {
213 struct ext4_free_extent orig; /* orig allocation */
214 struct ext4_free_extent goal; /* goal allocation */
215 struct ext4_free_extent result; /* result allocation */
216 unsigned pid;
217 unsigned ino;
218 __u16 found; /* how many extents have been found */
219 __u16 groups; /* how many groups have been scanned */
220 __u16 tail; /* what tail broke some buddy */
221 __u16 buddy; /* buddy the tail ^^^ broke */
222 __u16 flags;
223 __u8 cr:3; /* which phase the result extent was found at */
224 __u8 op:4;
225 __u8 merged:1;
226};
227
228struct ext4_buddy { 210struct ext4_buddy {
229 struct page *bd_buddy_page; 211 struct page *bd_buddy_page;
230 void *bd_buddy; 212 void *bd_buddy;
@@ -239,13 +221,6 @@ struct ext4_buddy {
239#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
240#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
241 223
242#ifndef EXT4_MB_HISTORY
243static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
244{
245 return;
246}
247#endif
248
249#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
250 225
251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..a93d5b80f3e2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
75 goto err_out; 75 goto err_out;
76 } 76 }
77 } 77 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext); 78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 79err_out:
80 if (path) { 80 if (path) {
81 ext4_ext_drop_refs(path); 81 ext4_ext_drop_refs(path);
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
353 353
354 down_write(&EXT4_I(inode)->i_data_sem); 354 down_write(&EXT4_I(inode)->i_data_sem);
355 /* 355 /*
356 * if EXT4_EXT_MIGRATE is cleared a block allocation 356 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
357 * happened after we started the migrate. We need to 357 * happened after we started the migrate. We need to
358 * fail the migrate 358 * fail the migrate
359 */ 359 */
360 if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { 360 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
361 retval = -EAGAIN; 361 retval = -EAGAIN;
362 up_write(&EXT4_I(inode)->i_data_sem); 362 up_write(&EXT4_I(inode)->i_data_sem);
363 goto err_out; 363 goto err_out;
364 } else 364 } else
365 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 365 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
366 ~EXT4_EXT_MIGRATE;
367 /* 366 /*
368 * We have the extent map build with the tmp inode. 367 * We have the extent map build with the tmp inode.
369 * Now copy the i_data across 368 * Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
517 * when we add extents we extend the journal 516
518 */ 517 */
519 /* 518 /*
520 * Even though we take i_mutex we can still cause block allocation 519 * Even though we take i_mutex we can still cause block
521 * via mmap write to holes. If we have allocated new blocks we fail 520 * allocation via mmap write to holes. If we have allocated
522 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 521 * new blocks we fail migrate. New block allocation will
523 * The flag is updated with i_data_sem held to prevent racing with 522 * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
524 * block allocation. 523 * with i_data_sem held to prevent racing with block
524 * allocation.
525 */ 525 */
526 down_read((&EXT4_I(inode)->i_data_sem)); 526 down_read((&EXT4_I(inode)->i_data_sem));
527 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; 527 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
528 up_read((&EXT4_I(inode)->i_data_sem)); 528 up_read((&EXT4_I(inode)->i_data_sem));
529 529
530 handle = ext4_journal_start(inode, 1); 530 handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
618 tmp_inode->i_nlink = 0; 618 tmp_inode->i_nlink = 0;
619 619
620 ext4_journal_stop(handle); 620 ext4_journal_stop(handle);
621 621 unlock_new_inode(tmp_inode);
622 iput(tmp_inode); 622 iput(tmp_inode);
623 623
624 return retval; 624 return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..25b6b1457360 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
19#include "ext4_extents.h" 19#include "ext4_extents.h"
20#include "ext4.h" 20#include "ext4.h"
21 21
22#define get_ext_path(path, inode, block, ret) \ 22/**
23 do { \ 23 * get_ext_path - Find an extent path for designated logical block number.
24 path = ext4_ext_find_extent(inode, block, path); \ 24 *
25 if (IS_ERR(path)) { \ 25 * @inode: an inode which is searched
26 ret = PTR_ERR(path); \ 26 * @lblock: logical block number to find an extent path
27 path = NULL; \ 27 * @path: pointer to an extent path pointer (for output)
28 } \ 28 *
29 } while (0) 29 * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
30 * on failure.
31 */
32static inline int
33get_ext_path(struct inode *inode, ext4_lblk_t lblock,
34 struct ext4_ext_path **path)
35{
36 int ret = 0;
37
38 *path = ext4_ext_find_extent(inode, lblock, *path);
39 if (IS_ERR(*path)) {
40 ret = PTR_ERR(*path);
41 *path = NULL;
42 } else if ((*path)[ext_depth(inode)].p_ext == NULL)
43 ret = -ENODATA;
44
45 return ret;
46}
30 47
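Turning get_ext_path() from a statement macro into an inline function makes the error flow explicit at every call site and lets it report -ENODATA when no extent exists at the target depth. A typical caller now looks like this (mirroring the converted call sites later in this file; lblock stands for whatever logical block is being resolved):

static int lookup_example(struct inode *inode, ext4_lblk_t lblock)
{
	struct ext4_ext_path *path = NULL;
	int err;

	err = get_ext_path(inode, lblock, &path);
	if (err)
		goto out;	/* path stays NULL on failure */
	/* ... inspect path[ext_depth(inode)].p_ext ... */
out:
	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}
	return err;
}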
31/** 48/**
32 * copy_extent_status - Copy the extent's initialization status 49 * copy_extent_status - Copy the extent's initialization status
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
113} 130}
114 131
115/** 132/**
133 * mext_check_null_inode - NULL check for two inodes
134 *
135 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
136 */
137static int
138mext_check_null_inode(struct inode *inode1, struct inode *inode2,
139 const char *function)
140{
141 int ret = 0;
142
143 if (inode1 == NULL) {
144 ext4_error(inode2->i_sb, function,
145 "Both inodes should not be NULL: "
146 "inode1 NULL inode2 %lu", inode2->i_ino);
147 ret = -EIO;
148 } else if (inode2 == NULL) {
149 ext4_error(inode1->i_sb, function,
150 "Both inodes should not be NULL: "
151 "inode1 %lu inode2 NULL", inode1->i_ino);
152 ret = -EIO;
153 }
154 return ret;
155}
156
157/**
116 * mext_double_down_read - Acquire two inodes' read semaphore 158 * mext_double_down_read - Acquire two inodes' read semaphore
117 * 159 *
118 * @orig_inode: original inode structure 160 * @orig_inode: original inode structure
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
124{ 166{
125 struct inode *first = orig_inode, *second = donor_inode; 167 struct inode *first = orig_inode, *second = donor_inode;
126 168
127 BUG_ON(orig_inode == NULL || donor_inode == NULL);
128
129 /* 169 /*
130 * Use the inode number to provide the stable locking order instead 170 * Use the inode number to provide the stable locking order instead
131 * of its address, because the C language doesn't guarantee you can 171 * of its address, because the C language doesn't guarantee you can
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
152{ 192{
153 struct inode *first = orig_inode, *second = donor_inode; 193 struct inode *first = orig_inode, *second = donor_inode;
154 194
155 BUG_ON(orig_inode == NULL || donor_inode == NULL);
156
157 /* 195 /*
158 * Use the inode number to provide the stable locking order instead 196 * Use the inode number to provide the stable locking order instead
159 * of its address, because the C language doesn't guarantee you can 197 * of its address, because the C language doesn't guarantee you can
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
178static void 216static void
179mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) 217mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
180{ 218{
181 BUG_ON(orig_inode == NULL || donor_inode == NULL);
182
183 up_read(&EXT4_I(orig_inode)->i_data_sem); 219 up_read(&EXT4_I(orig_inode)->i_data_sem);
184 up_read(&EXT4_I(donor_inode)->i_data_sem); 220 up_read(&EXT4_I(donor_inode)->i_data_sem);
185} 221}
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
194static void 230static void
195mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) 231mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
196{ 232{
197 BUG_ON(orig_inode == NULL || donor_inode == NULL);
198
199 up_write(&EXT4_I(orig_inode)->i_data_sem); 233 up_write(&EXT4_I(orig_inode)->i_data_sem);
200 up_write(&EXT4_I(donor_inode)->i_data_sem); 234 up_write(&EXT4_I(donor_inode)->i_data_sem);
201} 235}
@@ -283,23 +317,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
283 } 317 }
284 318
285 if (new_flag) { 319 if (new_flag) {
286 get_ext_path(orig_path, orig_inode, eblock, err); 320 err = get_ext_path(orig_inode, eblock, &orig_path);
287 if (orig_path == NULL) 321 if (err)
288 goto out; 322 goto out;
289 323
290 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
291 orig_path, new_ext)) 325 orig_path, new_ext, 0))
292 goto out; 326 goto out;
293 } 327 }
294 328
295 if (end_flag) { 329 if (end_flag) {
296 get_ext_path(orig_path, orig_inode, 330 err = get_ext_path(orig_inode,
297 le32_to_cpu(end_ext->ee_block) - 1, err); 331 le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
298 if (orig_path == NULL) 332 if (err)
299 goto out; 333 goto out;
300 334
301 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
302 orig_path, end_ext)) 336 orig_path, end_ext, 0))
303 goto out; 337 goto out;
304 } 338 }
305out: 339out:
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
519 * oext |-----------| 553 * oext |-----------|
520 * new_ext |-------| 554 * new_ext |-------|
521 */ 555 */
522 BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); 556 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
557 ext4_error(orig_inode->i_sb, __func__,
558 "new_ext_end(%u) should be less than or equal to "
559 "oext->ee_block(%u) + oext_alen(%d) - 1",
560 new_ext_end, le32_to_cpu(oext->ee_block),
561 oext_alen);
562 ret = -EIO;
563 goto out;
564 }
523 565
524 /* 566 /*
525 * Case: new_ext is smaller than original extent 567 * Case: new_ext is smaller than original extent
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
543 585
544 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, 586 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
545 o_end, &start_ext, &new_ext, &end_ext); 587 o_end, &start_ext, &new_ext, &end_ext);
588out:
546 return ret; 589 return ret;
547} 590}
548 591
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
554 * @orig_off: block offset of original inode 597 * @orig_off: block offset of original inode
555 * @donor_off: block offset of donor inode 598 * @donor_off: block offset of donor inode
556 * @max_count: the maximum length of extents 599
600 *
601 * Return 0 on success, or a negative error value on failure.
557 */ 602 */
558static void 603static int
559mext_calc_swap_extents(struct ext4_extent *tmp_dext, 604mext_calc_swap_extents(struct ext4_extent *tmp_dext,
560 struct ext4_extent *tmp_oext, 605 struct ext4_extent *tmp_oext,
561 ext4_lblk_t orig_off, ext4_lblk_t donor_off, 606 ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
564 ext4_lblk_t diff, orig_diff; 609 ext4_lblk_t diff, orig_diff;
565 struct ext4_extent dext_old, oext_old; 610 struct ext4_extent dext_old, oext_old;
566 611
612 BUG_ON(orig_off != donor_off);
613
614 /* original and donor extents have to cover the same block offset */
615 if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
616 le32_to_cpu(tmp_oext->ee_block) +
617 ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
618 return -ENODATA;
619
620 if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
621 le32_to_cpu(tmp_dext->ee_block) +
622 ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
623 return -ENODATA;
624
567 dext_old = *tmp_dext; 625 dext_old = *tmp_dext;
568 oext_old = *tmp_oext; 626 oext_old = *tmp_oext;
569 627
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
591 649
592 copy_extent_status(&oext_old, tmp_dext); 650 copy_extent_status(&oext_old, tmp_dext);
593 copy_extent_status(&dext_old, tmp_oext); 651 copy_extent_status(&dext_old, tmp_oext);
652
653 return 0;
594} 654}
595 655
596/** 656/**
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
631 mext_double_down_write(orig_inode, donor_inode); 691 mext_double_down_write(orig_inode, donor_inode);
632 692
633 /* Get the original extent for the block "orig_off" */ 693 /* Get the original extent for the block "orig_off" */
634 get_ext_path(orig_path, orig_inode, orig_off, err); 694 err = get_ext_path(orig_inode, orig_off, &orig_path);
635 if (orig_path == NULL) 695 if (err)
636 goto out; 696 goto out;
637 697
638 /* Get the donor extent for the head */ 698 /* Get the donor extent for the head */
639 get_ext_path(donor_path, donor_inode, donor_off, err); 699 err = get_ext_path(donor_inode, donor_off, &donor_path);
640 if (donor_path == NULL) 700 if (err)
641 goto out; 701 goto out;
642 depth = ext_depth(orig_inode); 702 depth = ext_depth(orig_inode);
643 oext = orig_path[depth].p_ext; 703 oext = orig_path[depth].p_ext;
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
647 dext = donor_path[depth].p_ext; 707 dext = donor_path[depth].p_ext;
648 tmp_dext = *dext; 708 tmp_dext = *dext;
649 709
650 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 710 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
651 donor_off, count); 711 donor_off, count);
712 if (err)
713 goto out;
652 714
653 /* Loop for the donor extents */ 715 /* Loop for the donor extents */
654 while (1) { 716 while (1) {
655 /* The extent for donor must be found. */ 717 /* The extent for donor must be found. */
656 BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); 718 if (!dext) {
719 ext4_error(donor_inode->i_sb, __func__,
720 "The extent for donor must be found");
721 err = -EIO;
722 goto out;
723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
724 ext4_error(donor_inode->i_sb, __func__,
725 "Donor offset(%u) and the first block of donor "
726 "extent(%u) should be equal",
727 donor_off,
728 le32_to_cpu(tmp_dext.ee_block));
729 err = -EIO;
730 goto out;
731 }
657 732
658 /* Set donor extent to orig extent */ 733 /* Set donor extent to orig extent */
659 err = mext_leaf_block(handle, orig_inode, 734 err = mext_leaf_block(handle, orig_inode,
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
678 753
679 if (orig_path) 754 if (orig_path)
680 ext4_ext_drop_refs(orig_path); 755 ext4_ext_drop_refs(orig_path);
681 get_ext_path(orig_path, orig_inode, orig_off, err); 756 err = get_ext_path(orig_inode, orig_off, &orig_path);
682 if (orig_path == NULL) 757 if (err)
683 goto out; 758 goto out;
684 depth = ext_depth(orig_inode); 759 depth = ext_depth(orig_inode);
685 oext = orig_path[depth].p_ext; 760 oext = orig_path[depth].p_ext;
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 767
693 if (donor_path) 768 if (donor_path)
694 ext4_ext_drop_refs(donor_path); 769 ext4_ext_drop_refs(donor_path);
695 get_ext_path(donor_path, donor_inode, 770 err = get_ext_path(donor_inode, donor_off, &donor_path);
696 donor_off, err); 771 if (err)
697 if (donor_path == NULL)
698 goto out; 772 goto out;
699 depth = ext_depth(donor_inode); 773 depth = ext_depth(donor_inode);
700 dext = donor_path[depth].p_ext; 774 dext = donor_path[depth].p_ext;
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
705 } 779 }
706 tmp_dext = *dext; 780 tmp_dext = *dext;
707 781
708 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 782 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
709 donor_off, 783 donor_off, count - replaced_count);
710 count - replaced_count); 784 if (err)
785 goto out;
711 } 786 }
712 787
713out: 788out:
@@ -740,7 +815,7 @@ out:
740 * on success, or a negative error value on failure. 815 * on success, or a negative error value on failure.
741 */ 816 */
742static int 817static int
743move_extent_par_page(struct file *o_filp, struct inode *donor_inode, 818move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
744 pgoff_t orig_page_offset, int data_offset_in_page, 819 pgoff_t orig_page_offset, int data_offset_in_page,
745 int block_len_in_page, int uninit) 820 int block_len_in_page, int uninit)
746{ 821{
@@ -871,6 +946,7 @@ out:
871 if (PageLocked(page)) 946 if (PageLocked(page))
872 unlock_page(page); 947 unlock_page(page);
873 page_cache_release(page); 948 page_cache_release(page);
949 ext4_journal_stop(handle);
874 } 950 }
875out2: 951out2:
876 ext4_journal_stop(handle); 952 ext4_journal_stop(handle);
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode,
897 struct inode *donor_inode, __u64 orig_start, 973 struct inode *donor_inode, __u64 orig_start,
898 __u64 donor_start, __u64 *len, __u64 moved_len) 974 __u64 donor_start, __u64 *len, __u64 moved_len)
899{ 975{
976 ext4_lblk_t orig_blocks, donor_blocks;
977 unsigned int blkbits = orig_inode->i_blkbits;
978 unsigned int blocksize = 1 << blkbits;
979
900 /* Regular file check */ 980 /* Regular file check */
901 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { 981 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
902 ext4_debug("ext4 move extent: The argument files should be " 982 ext4_debug("ext4 move extent: The argument files should be "
@@ -921,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
921 return -EINVAL; 1001 return -EINVAL;
922 } 1002 }
923 1003
924 /* orig and donor should be different file */
925 if (orig_inode->i_ino == donor_inode->i_ino) {
926 ext4_debug("ext4 move extent: The argument files should not "
927 "be same file [ino:orig %lu, donor %lu]\n",
928 orig_inode->i_ino, donor_inode->i_ino);
929 return -EINVAL;
930 }
931
932 /* Ext4 move extent supports only extent based file */ 1004 /* Ext4 move extent supports only extent based file */
933 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 1005 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
934 ext4_debug("ext4 move extent: orig file is not extents " 1006 ext4_debug("ext4 move extent: orig file is not extents "
@@ -960,54 +1032,58 @@ mext_check_arguments(struct inode *orig_inode,
960 return -EINVAL; 1032 return -EINVAL;
961 } 1033 }
962 1034
963 if ((orig_start > MAX_DEFRAG_SIZE) || 1035 if ((orig_start > EXT_MAX_BLOCK) ||
964 (donor_start > MAX_DEFRAG_SIZE) || 1036 (donor_start > EXT_MAX_BLOCK) ||
965 (*len > MAX_DEFRAG_SIZE) || 1037 (*len > EXT_MAX_BLOCK) ||
966 (orig_start + *len > MAX_DEFRAG_SIZE)) { 1038 (orig_start + *len > EXT_MAX_BLOCK)) {
967 ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " 1039 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
968 "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, 1040 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
969 orig_inode->i_ino, donor_inode->i_ino); 1041 orig_inode->i_ino, donor_inode->i_ino);
970 return -EINVAL; 1042 return -EINVAL;
971 } 1043 }
972 1044
973 if (orig_inode->i_size > donor_inode->i_size) { 1045 if (orig_inode->i_size > donor_inode->i_size) {
974 if (orig_start >= donor_inode->i_size) { 1046 donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
1047 /* TODO: eliminate this artificial restriction */
1048 if (orig_start >= donor_blocks) {
975 ext4_debug("ext4 move extent: orig start offset " 1049 ext4_debug("ext4 move extent: orig start offset "
976 "[%llu] should be less than donor file size " 1050 "[%llu] should be less than donor file blocks "
977 "[%lld] [ino:orig %lu, donor_inode %lu]\n", 1051 "[%u] [ino:orig %lu, donor %lu]\n",
978 orig_start, donor_inode->i_size, 1052 orig_start, donor_blocks,
979 orig_inode->i_ino, donor_inode->i_ino); 1053 orig_inode->i_ino, donor_inode->i_ino);
980 return -EINVAL; 1054 return -EINVAL;
981 } 1055 }
982 1056
983 if (orig_start + *len > donor_inode->i_size) { 1057 /* TODO: eliminate this artificial restriction */
1058 if (orig_start + *len > donor_blocks) {
984 ext4_debug("ext4 move extent: End offset [%llu] should " 1059 ext4_debug("ext4 move extent: End offset [%llu] should "
985 "be less than donor file size [%lld]." 1060 "be less than donor file blocks [%u]."
986 "So adjust length from %llu to %lld " 1061 "So adjust length from %llu to %llu "
987 "[ino:orig %lu, donor %lu]\n", 1062 "[ino:orig %lu, donor %lu]\n",
988 orig_start + *len, donor_inode->i_size, 1063 orig_start + *len, donor_blocks,
989 *len, donor_inode->i_size - orig_start, 1064 *len, donor_blocks - orig_start,
990 orig_inode->i_ino, donor_inode->i_ino); 1065 orig_inode->i_ino, donor_inode->i_ino);
991 *len = donor_inode->i_size - orig_start; 1066 *len = donor_blocks - orig_start;
992 } 1067 }
993 } else { 1068 } else {
994 if (orig_start >= orig_inode->i_size) { 1069 orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
1070 if (orig_start >= orig_blocks) {
995 ext4_debug("ext4 move extent: start offset [%llu] " 1071 ext4_debug("ext4 move extent: start offset [%llu] "
996 "should be less than original file size " 1072 "should be less than original file blocks "
997 "[%lld] [inode:orig %lu, donor %lu]\n", 1073 "[%u] [ino:orig %lu, donor %lu]\n",
998 orig_start, orig_inode->i_size, 1074 orig_start, orig_blocks,
999 orig_inode->i_ino, donor_inode->i_ino); 1075 orig_inode->i_ino, donor_inode->i_ino);
1000 return -EINVAL; 1076 return -EINVAL;
1001 } 1077 }
1002 1078
1003 if (orig_start + *len > orig_inode->i_size) { 1079 if (orig_start + *len > orig_blocks) {
1004 ext4_debug("ext4 move extent: Adjust length " 1080 ext4_debug("ext4 move extent: Adjust length "
1005 "from %llu to %lld. Because it should be " 1081 "from %llu to %llu. Because it should be "
1006 "less than original file size " 1082 "less than original file blocks "
1007 "[ino:orig %lu, donor %lu]\n", 1083 "[ino:orig %lu, donor %lu]\n",
1008 *len, orig_inode->i_size - orig_start, 1084 *len, orig_blocks - orig_start,
1009 orig_inode->i_ino, donor_inode->i_ino); 1085 orig_inode->i_ino, donor_inode->i_ino);
1010 *len = orig_inode->i_size - orig_start; 1086 *len = orig_blocks - orig_start;
1011 } 1087 }
1012 } 1088 }
1013 1089
@@ -1027,18 +1103,23 @@ mext_check_arguments(struct inode *orig_inode,
1027 * @inode1: the inode structure 1103 * @inode1: the inode structure
1028 * @inode2: the inode structure 1104 * @inode2: the inode structure
1029 * 1105 *
1030 * Lock two inodes' i_mutex by i_ino order. This function is moved from 1106 * Lock two inodes' i_mutex by i_ino order.
1031 * fs/inode.c. 1107 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1032 */ 1108 */
1033static void 1109static int
1034mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1110mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1035{ 1111{
1036 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { 1112 int ret = 0;
1037 if (inode1) 1113
1038 mutex_lock(&inode1->i_mutex); 1114 BUG_ON(inode1 == NULL && inode2 == NULL);
1039 else if (inode2) 1115
1040 mutex_lock(&inode2->i_mutex); 1116 ret = mext_check_null_inode(inode1, inode2, __func__);
1041 return; 1117 if (ret < 0)
1118 goto out;
1119
1120 if (inode1 == inode2) {
1121 mutex_lock(&inode1->i_mutex);
1122 goto out;
1042 } 1123 }
1043 1124
1044 if (inode1->i_ino < inode2->i_ino) { 1125 if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1129,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1048 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1129 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1049 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1130 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1050 } 1131 }
1132
1133out:
1134 return ret;
1051} 1135}
1052 1136
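mext_inode_double_lock() sidesteps ABBA deadlocks by always taking the two i_mutexes in ascending inode-number order; i_ino is a stable key, unlike raw pointer values, for which (as the comment in this file notes) C guarantees no ordering across distinct objects. The idiom in isolation, as a hypothetical helper:

static void lock_two_inodes(struct inode *a, struct inode *b)
{
	if (a == b) {
		mutex_lock(&a->i_mutex);	/* same inode: one lock only */
		return;
	}
	if (a->i_ino > b->i_ino)
		swap(a, b);			/* 'a' now has the smaller ino */
	mutex_lock_nested(&a->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&b->i_mutex, I_MUTEX_CHILD);
}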
1053/** 1137/**
@@ -1056,17 +1140,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1056 * @inode1: the inode that is released first 1140 * @inode1: the inode that is released first
1057 * @inode2: the inode that is released second 1141 * @inode2: the inode that is released second
1058 * 1142 *
1059 * This function is moved from fs/inode.c. 1143 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1060 */ 1144 */
1061 1145
1062static void 1146static int
1063mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1147mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1064{ 1148{
1149 int ret = 0;
1150
1151 BUG_ON(inode1 == NULL && inode2 == NULL);
1152
1153 ret = mext_check_null_inode(inode1, inode2, __func__);
1154 if (ret < 0)
1155 goto out;
1156
1065 if (inode1) 1157 if (inode1)
1066 mutex_unlock(&inode1->i_mutex); 1158 mutex_unlock(&inode1->i_mutex);
1067 1159
1068 if (inode2 && inode2 != inode1) 1160 if (inode2 && inode2 != inode1)
1069 mutex_unlock(&inode2->i_mutex); 1161 mutex_unlock(&inode2->i_mutex);
1162
1163out:
1164 return ret;
1070} 1165}
1071 1166
1072/** 1167/**
@@ -1123,70 +1218,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1123 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; 1218 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1124 ext4_lblk_t rest_blocks; 1219 ext4_lblk_t rest_blocks;
1125 pgoff_t orig_page_offset = 0, seq_end_page; 1220 pgoff_t orig_page_offset = 0, seq_end_page;
1126 int ret, depth, last_extent = 0; 1221 int ret1, ret2, depth, last_extent = 0;
1127 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 1222 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1128 int data_offset_in_page; 1223 int data_offset_in_page;
1129 int block_len_in_page; 1224 int block_len_in_page;
1130 int uninit; 1225 int uninit;
1131 1226
1227 /* orig and donor should be different file */
1228 if (orig_inode->i_ino == donor_inode->i_ino) {
1229 ext4_debug("ext4 move extent: The argument files should not "
1230 "be same file [ino:orig %lu, donor %lu]\n",
1231 orig_inode->i_ino, donor_inode->i_ino);
1232 return -EINVAL;
1233 }
1234
1132 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1133 mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0)
1238 return ret1;
1134 1239
1135 mext_double_down_read(orig_inode, donor_inode); 1240 mext_double_down_read(orig_inode, donor_inode);
1136 /* Check the filesystem environment whether move_extent can be done */ 1241 /* Check the filesystem environment whether move_extent can be done */
1137 ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 1242 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
1138 donor_start, &len, *moved_len); 1243 donor_start, &len, *moved_len);
1139 mext_double_up_read(orig_inode, donor_inode); 1244 mext_double_up_read(orig_inode, donor_inode);
1140 if (ret) 1245 if (ret1)
1141 goto out2; 1246 goto out;
1142 1247
1143 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; 1248 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
1144 block_end = block_start + len - 1; 1249 block_end = block_start + len - 1;
1145 if (file_end < block_end) 1250 if (file_end < block_end)
1146 len -= block_end - file_end; 1251 len -= block_end - file_end;
1147 1252
1148 get_ext_path(orig_path, orig_inode, block_start, ret); 1253 ret1 = get_ext_path(orig_inode, block_start, &orig_path);
1149 if (orig_path == NULL) 1254 if (ret1)
1150 goto out2; 1255 goto out;
1151 1256
1152 /* Get path structure to check the hole */ 1257 /* Get path structure to check the hole */
1153 get_ext_path(holecheck_path, orig_inode, block_start, ret); 1258 ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
1154 if (holecheck_path == NULL) 1259 if (ret1)
1155 goto out; 1260 goto out;
1156 1261
1157 depth = ext_depth(orig_inode); 1262 depth = ext_depth(orig_inode);
1158 ext_cur = holecheck_path[depth].p_ext; 1263 ext_cur = holecheck_path[depth].p_ext;
1159 if (ext_cur == NULL) {
1160 ret = -EINVAL;
1161 goto out;
1162 }
1163 1264
1164 /* 1265 /*
1165 * Get proper extent whose ee_block is beyond block_start 1266 * Get proper starting location of block replacement if block_start was
1166 * if block_start was within the hole. 1267 * within the hole.
1167 */ 1268 */
1168 if (le32_to_cpu(ext_cur->ee_block) + 1269 if (le32_to_cpu(ext_cur->ee_block) +
1169 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { 1270 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
1271 /*
1272 * The hole exists between extents or the tail of
1273 * original file.
1274 */
1170 last_extent = mext_next_extent(orig_inode, 1275 last_extent = mext_next_extent(orig_inode,
1171 holecheck_path, &ext_cur); 1276 holecheck_path, &ext_cur);
1172 if (last_extent < 0) { 1277 if (last_extent < 0) {
1173 ret = last_extent; 1278 ret1 = last_extent;
1174 goto out; 1279 goto out;
1175 } 1280 }
1176 last_extent = mext_next_extent(orig_inode, orig_path, 1281 last_extent = mext_next_extent(orig_inode, orig_path,
1177 &ext_dummy); 1282 &ext_dummy);
1178 if (last_extent < 0) { 1283 if (last_extent < 0) {
1179 ret = last_extent; 1284 ret1 = last_extent;
1180 goto out; 1285 goto out;
1181 } 1286 }
1182 } 1287 seq_start = le32_to_cpu(ext_cur->ee_block);
1183 seq_start = block_start; 1288 } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
1289 /* The hole exists at the beginning of original file. */
1290 seq_start = le32_to_cpu(ext_cur->ee_block);
1291 else
1292 seq_start = block_start;
1184 1293
1185 /* No blocks within the specified range. */ 1294 /* No blocks within the specified range. */
1186 if (le32_to_cpu(ext_cur->ee_block) > block_end) { 1295 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
1187 ext4_debug("ext4 move extent: The specified range of file " 1296 ext4_debug("ext4 move extent: The specified range of file "
1188 "may be the hole\n"); 1297 "may be the hole\n");
1189 ret = -EINVAL; 1298 ret1 = -EINVAL;
1190 goto out; 1299 goto out;
1191 } 1300 }
1192 1301
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1206 last_extent = mext_next_extent(orig_inode, holecheck_path, 1315 last_extent = mext_next_extent(orig_inode, holecheck_path,
1207 &ext_cur); 1316 &ext_cur);
1208 if (last_extent < 0) { 1317 if (last_extent < 0) {
1209 ret = last_extent; 1318 ret1 = last_extent;
1210 break; 1319 break;
1211 } 1320 }
1212 add_blocks = ext4_ext_get_actual_len(ext_cur); 1321 add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1258 while (orig_page_offset <= seq_end_page) { 1367 while (orig_page_offset <= seq_end_page) {
1259 1368
1260 /* Swap original branches with new branches */ 1369 /* Swap original branches with new branches */
1261 ret = move_extent_par_page(o_filp, donor_inode, 1370 ret1 = move_extent_per_page(o_filp, donor_inode,
1262 orig_page_offset, 1371 orig_page_offset,
1263 data_offset_in_page, 1372 data_offset_in_page,
1264 block_len_in_page, uninit); 1373 block_len_in_page, uninit);
1265 if (ret < 0) 1374 if (ret1 < 0)
1266 goto out; 1375 goto out;
1267 orig_page_offset++; 1376 orig_page_offset++;
1268 /* Count how many blocks we have exchanged */ 1377 /* Count how many blocks we have exchanged */
1269 *moved_len += block_len_in_page; 1378 *moved_len += block_len_in_page;
1270 BUG_ON(*moved_len > len); 1379 if (*moved_len > len) {
1380 ext4_error(orig_inode->i_sb, __func__,
1381 "We replaced blocks too much! "
1382 "sum of replaced: %llu requested: %llu",
1383 *moved_len, len);
1384 ret1 = -EIO;
1385 goto out;
1386 }
1271 1387
1272 data_offset_in_page = 0; 1388 data_offset_in_page = 0;
1273 rest_blocks -= block_len_in_page; 1389 rest_blocks -= block_len_in_page;
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1280 /* Decrease buffer counter */ 1396 /* Decrease buffer counter */
1281 if (holecheck_path) 1397 if (holecheck_path)
1282 ext4_ext_drop_refs(holecheck_path); 1398 ext4_ext_drop_refs(holecheck_path);
1283 get_ext_path(holecheck_path, orig_inode, 1399 ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
1284 seq_start, ret); 1400 if (ret1)
1285 if (holecheck_path == NULL)
1286 break; 1401 break;
1287 depth = holecheck_path->p_depth; 1402 depth = holecheck_path->p_depth;
1288 1403
1289 /* Decrease buffer counter */ 1404 /* Decrease buffer counter */
1290 if (orig_path) 1405 if (orig_path)
1291 ext4_ext_drop_refs(orig_path); 1406 ext4_ext_drop_refs(orig_path);
1292 get_ext_path(orig_path, orig_inode, seq_start, ret); 1407 ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
1293 if (orig_path == NULL) 1408 if (ret1)
1294 break; 1409 break;
1295 1410
1296 ext_cur = holecheck_path[depth].p_ext; 1411 ext_cur = holecheck_path[depth].p_ext;
@@ -1307,14 +1422,13 @@ out:
1307 ext4_ext_drop_refs(holecheck_path); 1422 ext4_ext_drop_refs(holecheck_path);
1308 kfree(holecheck_path); 1423 kfree(holecheck_path);
1309 } 1424 }
1310out2:
1311 mext_inode_double_unlock(orig_inode, donor_inode);
1312 1425
1313 if (ret) 1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
1314 return ret;
1315 1427
1316 /* All of the specified blocks must be exchanged in succeed */ 1428 if (ret1)
1317 BUG_ON(*moved_len != len); 1429 return ret1;
1430 else if (ret2)
1431 return ret2;
1318 1432
1319 return 0; 1433 return 0;
1320} 1434}
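
Two related hardening moves in this hunk: the double-lock/unlock results are now checked (ret1/ret2), and the "replaced too many blocks" invariant is downgraded from BUG_ON(), which panics the machine, to ext4_error() plus -EIO so the filesystem can report the inconsistency and unwind. A rough sketch of that second pattern, with illustrative names:

	#include <errno.h>
	#include <stdio.h>

	static int account_moved(unsigned long long *moved,
				 unsigned long long step,
				 unsigned long long requested)
	{
		*moved += step;
		if (*moved > requested) {
			/* report the corruption and unwind, don't panic */
			fprintf(stderr, "moved %llu exceeds requested %llu\n",
				*moved, requested);
			return -EIO;
		}
		return 0;
	}
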
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16ff..7c8fe80bacdd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1518 return retval; 1518 return retval;
1519 1519
1520 if (blocks == 1 && !dx_fallback && 1520 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
1522 return make_indexed_dir(handle, dentry, inode, bh); 1522 retval = make_indexed_dir(handle, dentry, inode, bh);
1523 if (retval == -ENOSPC)
1524 brelse(bh);
1525 return retval;
1526 }
1523 brelse(bh); 1527 brelse(bh);
1524 } 1528 }
1525 bh = ext4_append(handle, dir, &block, &retval); 1529 bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1532 de = (struct ext4_dir_entry_2 *) bh->b_data;
1529 de->inode = 0; 1533 de->inode = 0;
1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1534 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1531 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1535 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1536 if (retval == -ENOSPC)
1537 brelse(bh);
1538 return retval;
1532} 1539}
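
Both ext4_add_entry() changes above fix the same buffer_head leak: add_dirent_to_buf() consumes bh on success but hands ownership back to the caller on -ENOSPC, so the caller must brelse() it on that path (the ext4_dx_add_entry hunk further down applies the same rule by only forgetting bh when ownership really transferred). A toy model of the convention, with made-up names:

	#include <errno.h>
	#include <stdlib.h>

	static int consume(char *buf, int has_room)
	{
		if (!has_room)
			return -ENOSPC;		/* caller keeps ownership */
		free(buf);			/* success: callee consumes buf */
		return 0;
	}

	static int caller(int has_room)
	{
		char *buf = calloc(1, 64);
		int ret;

		if (!buf)
			return -ENOMEM;
		ret = consume(buf, has_room);
		if (ret == -ENOSPC)
			free(buf);		/* the release these hunks add */
		return ret;
	}
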
1533 1540
1534/* 1541/*
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1590 goto cleanup; 1597 goto cleanup;
1591 node2 = (struct dx_node *)(bh2->b_data); 1598 node2 = (struct dx_node *)(bh2->b_data);
1592 entries2 = node2->entries; 1599 entries2 = node2->entries;
1600 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, 1601 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize); 1602 sb->s_blocksize);
1595 node2->fake.inode = 0;
1596 BUFFER_TRACE(frame->bh, "get_write_access"); 1603 BUFFER_TRACE(frame->bh, "get_write_access");
1597 err = ext4_journal_get_write_access(handle, frame->bh); 1604 err = ext4_journal_get_write_access(handle, frame->bh);
1598 if (err) 1605 if (err)
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1657 if (!de) 1664 if (!de)
1658 goto cleanup; 1665 goto cleanup;
1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1666 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1660 bh = NULL; 1667 if (err != -ENOSPC)
1668 bh = NULL;
1661 goto cleanup; 1669 goto cleanup;
1662 1670
1663journal_error: 1671journal_error:
@@ -2068,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2068 struct ext4_iloc iloc; 2076 struct ext4_iloc iloc;
2069 int err = 0; 2077 int err = 0;
2070 2078
2071 if (!ext4_handle_valid(handle)) 2079 /* ext4_handle_valid() assumes a valid handle_t pointer */
2080 if (handle && !ext4_handle_valid(handle))
2072 return 0; 2081 return 0;
2073 2082
2074 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2083 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2310,7 +2319,7 @@ static int ext4_link(struct dentry *old_dentry,
2310 struct inode *inode = old_dentry->d_inode; 2319 struct inode *inode = old_dentry->d_inode;
2311 int err, retries = 0; 2320 int err, retries = 0;
2312 2321
2313 if (EXT4_DIR_LINK_MAX(inode)) 2322 if (inode->i_nlink >= EXT4_LINK_MAX)
2314 return -EMLINK; 2323 return -EMLINK;
2315 2324
2316 /* 2325 /*
@@ -2413,7 +2422,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2413 goto end_rename; 2422 goto end_rename;
2414 retval = -EMLINK; 2423 retval = -EMLINK;
2415 if (!new_inode && new_dir != old_dir && 2424 if (!new_inode && new_dir != old_dir &&
2416 new_dir->i_nlink >= EXT4_LINK_MAX) 2425 EXT4_DIR_LINK_MAX(new_dir))
2417 goto end_rename; 2426 goto end_rename;
2418 } 2427 }
2419 if (!new_bh) { 2428 if (!new_bh) {
@@ -2536,7 +2545,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2536 .listxattr = ext4_listxattr, 2545 .listxattr = ext4_listxattr,
2537 .removexattr = generic_removexattr, 2546 .removexattr = generic_removexattr,
2538#endif 2547#endif
2539 .permission = ext4_permission, 2548 .check_acl = ext4_check_acl,
2540 .fiemap = ext4_fiemap, 2549 .fiemap = ext4_fiemap,
2541}; 2550};
2542 2551
@@ -2548,5 +2557,5 @@ const struct inode_operations ext4_special_inode_operations = {
2548 .listxattr = ext4_listxattr, 2557 .listxattr = ext4_listxattr,
2549 .removexattr = generic_removexattr, 2558 .removexattr = generic_removexattr,
2550#endif 2559#endif
2551 .permission = ext4_permission, 2560 .check_acl = ext4_check_acl,
2552}; 2561};
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
746 struct inode *inode = NULL; 746 struct inode *inode = NULL;
747 handle_t *handle; 747 handle_t *handle;
748 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 749 int err, err2;
751 750
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 751 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
856 * using the new disk blocks. 855 * using the new disk blocks.
857 */ 856 */
858 857
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
860 /* Update group descriptor block for new group */ 858 /* Update group descriptor block for new group */
861 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 859 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
862 gdb_off * EXT4_DESC_SIZE(sb)); 860 gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
875 * descriptor 873 * descriptor
876 */ 874 */
877 err = ext4_mb_add_groupinfo(sb, input->group, gdp); 875 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
878 if (err) { 876 if (err)
879 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
880 goto exit_journal; 877 goto exit_journal;
881 }
882 878
883 /* 879 /*
884 * Make the new blocks and inodes valid next. We do this before 880 * Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
920 916
921 /* Update the global fs size fields */ 917 /* Update the global fs size fields */
922 sbi->s_groups_count++; 918 sbi->s_groups_count++;
923 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
924 919
925 ext4_handle_dirty_metadata(handle, NULL, primary); 920 ext4_handle_dirty_metadata(handle, NULL, primary);
926 921
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..312211ee05af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,17 +45,11 @@
45#include "ext4_jbd2.h" 45#include "ext4_jbd2.h"
46#include "xattr.h" 46#include "xattr.h"
47#include "acl.h" 47#include "acl.h"
48#include "mballoc.h"
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
51 52
52static int default_mb_history_length = 1000;
53
54module_param_named(default_mb_history_length, default_mb_history_length,
55 int, 0644);
56MODULE_PARM_DESC(default_mb_history_length,
57 "Default number of entries saved for mb_history");
58
59struct proc_dir_entry *ext4_proc_root; 53struct proc_dir_entry *ext4_proc_root;
60static struct kset *ext4_kset; 54static struct kset *ext4_kset;
61 55
@@ -188,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
188 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 182 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
189} 183}
190 184
185
186/* Just increment the non-pointer handle value */
187static handle_t *ext4_get_nojournal(void)
188{
189 handle_t *handle = current->journal_info;
190 unsigned long ref_cnt = (unsigned long)handle;
191
192 BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
193
194 ref_cnt++;
195 handle = (handle_t *)ref_cnt;
196
197 current->journal_info = handle;
198 return handle;
199}
200
201
202/* Decrement the non-pointer handle value */
203static void ext4_put_nojournal(handle_t *handle)
204{
205 unsigned long ref_cnt = (unsigned long)handle;
206
207 BUG_ON(ref_cnt == 0);
208
209 ref_cnt--;
210 handle = (handle_t *)ref_cnt;
211
212 current->journal_info = handle;
213}
214
191/* 215/*
192 * Wrappers for jbd2_journal_start/end. 216 * Wrappers for jbd2_journal_start/end.
193 * 217 *
@@ -214,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
214 } 238 }
215 return jbd2_journal_start(journal, nblocks); 239 return jbd2_journal_start(journal, nblocks);
216 } 240 }
217 /* 241 return ext4_get_nojournal();
218 * We're not journaling, return the appropriate indication.
219 */
220 current->journal_info = EXT4_NOJOURNAL_HANDLE;
221 return current->journal_info;
222} 242}
223 243
224/* 244/*
@@ -234,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
234 int rc; 254 int rc;
235 255
236 if (!ext4_handle_valid(handle)) { 256 if (!ext4_handle_valid(handle)) {
237 /* 257 ext4_put_nojournal(handle);
238 * Do this here since we don't call jbd2_journal_stop() in
239 * no-journal mode.
240 */
241 current->journal_info = NULL;
242 return 0; 258 return 0;
243 } 259 }
244 sb = handle->h_transaction->t_journal->j_private; 260 sb = handle->h_transaction->t_journal->j_private;
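
ext4_get_nojournal()/ext4_put_nojournal() above never dereference the handle: in no-journal mode the pointer-typed current->journal_info is reused as a small nesting counter, which also keeps ext4_handle_valid() working, since any value below the maximum ref count is recognizably not a real pointer. A user-space sketch of the trick; the names and the ceiling are illustrative:

	#include <assert.h>
	#include <stdint.h>

	typedef void *handle_t;			/* opaque, pointer-sized slot */

	#define NOJOURNAL_MAX_REF 0xffffu	/* illustrative ceiling */

	/* the kernel stores the result back into current->journal_info */
	static handle_t get_nojournal(handle_t slot)
	{
		uintptr_t ref = (uintptr_t)slot;

		assert(ref < NOJOURNAL_MAX_REF);
		return (handle_t)(ref + 1);	/* "take a reference" */
	}

	static handle_t put_nojournal(handle_t slot)
	{
		uintptr_t ref = (uintptr_t)slot;

		assert(ref > 0);
		return (handle_t)(ref - 1);	/* "drop a reference" */
	}
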
@@ -344,7 +360,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
344 errstr = "Out of memory"; 360 errstr = "Out of memory";
345 break; 361 break;
346 case -EROFS: 362 case -EROFS:
347 if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) 363 if (!sb || (EXT4_SB(sb)->s_journal &&
364 EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
348 errstr = "Journal has aborted"; 365 errstr = "Journal has aborted";
349 else 366 else
350 errstr = "Readonly filesystem"; 367 errstr = "Readonly filesystem";
@@ -578,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
578 struct ext4_super_block *es = sbi->s_es; 595 struct ext4_super_block *es = sbi->s_es;
579 int i, err; 596 int i, err;
580 597
598 flush_workqueue(sbi->dio_unwritten_wq);
599 destroy_workqueue(sbi->dio_unwritten_wq);
600
581 lock_super(sb); 601 lock_super(sb);
582 lock_kernel(); 602 lock_kernel();
583 if (sb->s_dirt) 603 if (sb->s_dirt)
@@ -682,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
682 ei->i_allocated_meta_blocks = 0; 702 ei->i_allocated_meta_blocks = 0;
683 ei->i_delalloc_reserved_flag = 0; 703 ei->i_delalloc_reserved_flag = 0;
684 spin_lock_init(&(ei->i_block_reservation_lock)); 704 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL;
685 707
686 return &ei->vfs_inode; 708 return &ei->vfs_inode;
687} 709}
@@ -962,7 +984,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
962static ssize_t ext4_quota_write(struct super_block *sb, int type, 984static ssize_t ext4_quota_write(struct super_block *sb, int type,
963 const char *data, size_t len, loff_t off); 985 const char *data, size_t len, loff_t off);
964 986
965static struct dquot_operations ext4_quota_operations = { 987static const struct dquot_operations ext4_quota_operations = {
966 .initialize = dquot_initialize, 988 .initialize = dquot_initialize,
967 .drop = dquot_drop, 989 .drop = dquot_drop,
968 .alloc_space = dquot_alloc_space, 990 .alloc_space = dquot_alloc_space,
@@ -983,7 +1005,7 @@ static struct dquot_operations ext4_quota_operations = {
983 .destroy_dquot = dquot_destroy, 1005 .destroy_dquot = dquot_destroy,
984}; 1006};
985 1007
986static struct quotactl_ops ext4_qctl_operations = { 1008static const struct quotactl_ops ext4_qctl_operations = {
987 .quota_on = ext4_quota_on, 1009 .quota_on = ext4_quota_on,
988 .quota_off = vfs_quota_off, 1010 .quota_off = vfs_quota_off,
989 .quota_sync = vfs_quota_sync, 1011 .quota_sync = vfs_quota_sync,
@@ -1050,7 +1072,7 @@ enum {
1050 Opt_journal_update, Opt_journal_dev, 1072 Opt_journal_update, Opt_journal_dev,
1051 Opt_journal_checksum, Opt_journal_async_commit, 1073 Opt_journal_checksum, Opt_journal_async_commit,
1052 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1053 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, 1075 Opt_data_err_abort, Opt_data_err_ignore,
1054 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1055 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1056 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1097,7 +1119,6 @@ static const match_table_t tokens = {
1097 {Opt_data_writeback, "data=writeback"}, 1119 {Opt_data_writeback, "data=writeback"},
1098 {Opt_data_err_abort, "data_err=abort"}, 1120 {Opt_data_err_abort, "data_err=abort"},
1099 {Opt_data_err_ignore, "data_err=ignore"}, 1121 {Opt_data_err_ignore, "data_err=ignore"},
1100 {Opt_mb_history_length, "mb_history_length=%u"},
1101 {Opt_offusrjquota, "usrjquota="}, 1122 {Opt_offusrjquota, "usrjquota="},
1102 {Opt_usrjquota, "usrjquota=%s"}, 1123 {Opt_usrjquota, "usrjquota=%s"},
1103 {Opt_offgrpjquota, "grpjquota="}, 1124 {Opt_offgrpjquota, "grpjquota="},
@@ -1279,11 +1300,9 @@ static int parse_options(char *options, struct super_block *sb,
1279 *journal_devnum = option; 1300 *journal_devnum = option;
1280 break; 1301 break;
1281 case Opt_journal_checksum: 1302 case Opt_journal_checksum:
1282 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1303 break; /* Kept for backwards compatibility */
1283 break;
1284 case Opt_journal_async_commit: 1304 case Opt_journal_async_commit:
1285 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1305 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1286 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1287 break; 1306 break;
1288 case Opt_noload: 1307 case Opt_noload:
1289 set_opt(sbi->s_mount_opt, NOLOAD); 1308 set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1340,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
1340 case Opt_data_err_ignore: 1359 case Opt_data_err_ignore:
1341 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1360 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1342 break; 1361 break;
1343 case Opt_mb_history_length:
1344 if (match_int(&args[0], &option))
1345 return 0;
1346 if (option < 0)
1347 return 0;
1348 sbi->s_mb_history_max = option;
1349 break;
1350#ifdef CONFIG_QUOTA 1362#ifdef CONFIG_QUOTA
1351 case Opt_usrjquota: 1363 case Opt_usrjquota:
1352 qtype = USRQUOTA; 1364 qtype = USRQUOTA;
@@ -1646,13 +1658,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1646 EXT4_INODES_PER_GROUP(sb), 1658 EXT4_INODES_PER_GROUP(sb),
1647 sbi->s_mount_opt); 1659 sbi->s_mount_opt);
1648 1660
1649 if (EXT4_SB(sb)->s_journal) {
1650 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1651 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1652 "external", EXT4_SB(sb)->s_journal->j_devname);
1653 } else {
1654 ext4_msg(sb, KERN_INFO, "no journal");
1655 }
1656 return res; 1661 return res;
1657} 1662}
1658 1663
@@ -1695,12 +1700,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1695 gdp = ext4_get_group_desc(sb, i, NULL); 1700 gdp = ext4_get_group_desc(sb, i, NULL);
1696 1701
1697 flex_group = ext4_flex_group(sbi, i); 1702 flex_group = ext4_flex_group(sbi, i);
1698 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, 1703 atomic_add(ext4_free_inodes_count(sb, gdp),
1699 ext4_free_inodes_count(sb, gdp)); 1704 &sbi->s_flex_groups[flex_group].free_inodes);
1700 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, 1705 atomic_add(ext4_free_blks_count(sb, gdp),
1701 ext4_free_blks_count(sb, gdp)); 1706 &sbi->s_flex_groups[flex_group].free_blocks);
1702 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, 1707 atomic_add(ext4_used_dirs_count(sb, gdp),
1703 ext4_used_dirs_count(sb, gdp)); 1708 &sbi->s_flex_groups[flex_group].used_dirs);
1704 } 1709 }
1705 1710
1706 return 1; 1711 return 1;
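
The atomic_set() → atomic_add() change above matters because several block groups map onto one flex group: storing each group's counts would keep only the last group written, while adding accumulates the contributions of every member group. A C11 model with hypothetical names:

	#include <stdatomic.h>

	#define GROUPS_PER_FLEX 4

	struct flex_counters {
		atomic_long free_inodes;
	};

	static void fill_flex(struct flex_counters *fc, const long per_group[])
	{
		for (int i = 0; i < GROUPS_PER_FLEX; i++) {
			/* atomic_store() here would be the old, lossy behavior */
			atomic_fetch_add(&fc->free_inodes, per_group[i]);
		}
	}
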
@@ -2197,6 +2202,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2202EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2203EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2204EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2205EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2206
2201static struct attribute *ext4_attrs[] = { 2207static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2208 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2216,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2216 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2217 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2218 ATTR_LIST(mb_group_prealloc),
2219 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2220 NULL,
2214}; 2221};
2215 2222
@@ -2253,6 +2260,49 @@ static struct kobj_type ext4_ktype = {
2253 .release = ext4_sb_release, 2260 .release = ext4_sb_release,
2254}; 2261};
2255 2262
2263/*
2264 * Check whether this filesystem can be mounted based on
2265 * the features present and the RDONLY/RDWR mount requested.
2266 * Returns 1 if this filesystem can be mounted as requested,
2267 * 0 if it cannot be.
2268 */
2269static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2270{
2271 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
2272 ext4_msg(sb, KERN_ERR,
2273 "Couldn't mount because of "
2274 "unsupported optional features (%x)",
2275 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2276 ~EXT4_FEATURE_INCOMPAT_SUPP));
2277 return 0;
2278 }
2279
2280 if (readonly)
2281 return 1;
2282
2283 /* Check that feature set is OK for a read-write mount */
2284 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
2285 ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
2286 "unsupported optional features (%x)",
2287 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2288 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2289 return 0;
2290 }
2291 /*
2292 * Large file size enabled file system can only be mounted
2293 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
2294 */
2295 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2296 if (sizeof(blkcnt_t) < sizeof(u64)) {
2297 ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
2298 "cannot be mounted RDWR without "
2299 "CONFIG_LBDAF");
2300 return 0;
2301 }
2302 }
2303 return 1;
2304}
2305
2256static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2306static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2257 __releases(kernel_lock) 2307 __releases(kernel_lock)
2258 __acquires(kernel_lock) 2308 __acquires(kernel_lock)
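
ext4_feature_set_ok() centralizes the feature policy used at mount and remount time: unknown incompat bits block every mount, while unknown ro_compat bits block only read-write mounts. Condensed into a sketch — the mask values are made up:

	#include <stdbool.h>
	#include <stdint.h>

	#define SUPP_INCOMPAT	0x000001ffu	/* bits this code understands */
	#define SUPP_RO_COMPAT	0x0000007fu

	static bool feature_set_ok(uint32_t incompat, uint32_t ro_compat,
				   bool readonly)
	{
		if (incompat & ~SUPP_INCOMPAT)
			return false;	/* unknown incompat bit: no mount */
		if (readonly)
			return true;	/* ro mount tolerates unknown ro_compat */
		return !(ro_compat & ~SUPP_RO_COMPAT);
	}
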
@@ -2274,7 +2324,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2274 unsigned int db_count; 2324 unsigned int db_count;
2275 unsigned int i; 2325 unsigned int i;
2276 int needs_recovery, has_huge_files; 2326 int needs_recovery, has_huge_files;
2277 int features;
2278 __u64 blocks_count; 2327 __u64 blocks_count;
2279 int err; 2328 int err;
2280 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 2329 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2371,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2371 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2420 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2372 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2421 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2373 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2422 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2374 sbi->s_mb_history_max = default_mb_history_length;
2375 2423
2376 set_opt(sbi->s_mount_opt, BARRIER); 2424 set_opt(sbi->s_mount_opt, BARRIER);
2377 2425
@@ -2401,39 +2449,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2401 * previously didn't change the revision level when setting the flags, 2449 * previously didn't change the revision level when setting the flags,
2402 * so there is a chance incompat flags are set on a rev 0 filesystem. 2450 * so there is a chance incompat flags are set on a rev 0 filesystem.
2403 */ 2451 */
2404 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2452 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
2405 if (features) {
2406 ext4_msg(sb, KERN_ERR,
2407 "Couldn't mount because of "
2408 "unsupported optional features (%x)",
2409 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2410 ~EXT4_FEATURE_INCOMPAT_SUPP));
2411 goto failed_mount; 2453 goto failed_mount;
2412 } 2454
2413 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2414 if (!(sb->s_flags & MS_RDONLY) && features) {
2415 ext4_msg(sb, KERN_ERR,
2416 "Couldn't mount RDWR because of "
2417 "unsupported optional features (%x)",
2418 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2419 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2420 goto failed_mount;
2421 }
2422 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2423 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2424 if (has_huge_files) {
2425 /*
2426 * Large file size enabled file system can only be
2427 * mount if kernel is build with CONFIG_LBDAF
2428 */
2429 if (sizeof(root->i_blocks) < sizeof(u64) &&
2430 !(sb->s_flags & MS_RDONLY)) {
2431 ext4_msg(sb, KERN_ERR, "Filesystem with huge "
2432 "files cannot be mounted read-write "
2433 "without CONFIG_LBDAF");
2434 goto failed_mount;
2435 }
2436 }
2437 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 2455 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
2438 2456
2439 if (blocksize < EXT4_MIN_BLOCK_SIZE || 2457 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2487,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2469 } 2487 }
2470 } 2488 }
2471 2489
2490 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2491 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2472 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 2492 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
2473 has_huge_files); 2493 has_huge_files);
2474 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 2494 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2569,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2549 goto failed_mount; 2569 goto failed_mount;
2550 } 2570 }
2551 2571
2552 if (ext4_blocks_count(es) > 2572 /*
2553 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 2573 * Test whether we have more sectors than will fit in sector_t,
2574 * and whether the max offset is addressable by the page cache.
2575 */
2576 if ((ext4_blocks_count(es) >
2577 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
2578 (ext4_blocks_count(es) >
2579 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2554 ext4_msg(sb, KERN_ERR, "filesystem" 2580 ext4_msg(sb, KERN_ERR, "filesystem"
2555 " too large to mount safely"); 2581 " too large to mount safely on this system");
2556 if (sizeof(sector_t) < 8) 2582 if (sizeof(sector_t) < 8)
2557 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 2583 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2584 ret = -EFBIG;
2558 goto failed_mount; 2585 goto failed_mount;
2559 } 2586 }
2560 2587
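
The widened check above tests the block count against two independent ceilings: what sector_t can address (in 512-byte sectors) and what the page-cache index type pgoff_t can address. A small program sketching both limits for 4 KiB blocks; on 32-bit kernels without CONFIG_LBDAF both types narrow and the limits drop accordingly:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int blocksize_bits = 12;	/* 4 KiB blocks */
		unsigned int page_shift = 12;		/* PAGE_CACHE_SHIFT here */
		/* sector_t is 32-bit without CONFIG_LBDAF, 64-bit with it */
		uint64_t max_by_sector =
			(uint64_t)UINT32_MAX >> (blocksize_bits - 9);
		/* pgoff_t is 64-bit below; it is 32-bit on 32-bit kernels */
		uint64_t max_by_pgcache =
			UINT64_MAX >> (page_shift - blocksize_bits);

		printf("max blocks by sector_t:   %llu\n",
		       (unsigned long long)max_by_sector);
		printf("max blocks by page cache: %llu\n",
		       (unsigned long long)max_by_pgcache);
		return 0;
	}
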
@@ -2595,6 +2622,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2595 goto failed_mount; 2622 goto failed_mount;
2596 } 2623 }
2597 sbi->s_groups_count = blocks_count; 2624 sbi->s_groups_count = blocks_count;
2625 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
2626 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
2598 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2627 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2599 EXT4_DESC_PER_BLOCK(sb); 2628 EXT4_DESC_PER_BLOCK(sb);
2600 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 2629 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2656,6 +2685,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2656 } 2685 }
2657 2686
2658 sbi->s_stripe = ext4_get_stripe_size(sbi); 2687 sbi->s_stripe = ext4_get_stripe_size(sbi);
2688 sbi->s_max_writeback_mb_bump = 128;
2659 2689
2660 /* 2690 /*
2661 * set up enough so that it can read an inode 2691 * set up enough so that it can read an inode
@@ -2729,20 +2759,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2729 goto failed_mount4; 2759 goto failed_mount4;
2730 } 2760 }
2731 2761
2732 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2762 jbd2_journal_set_features(sbi->s_journal,
2733 jbd2_journal_set_features(sbi->s_journal, 2763 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2734 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 2764 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
2765 jbd2_journal_set_features(sbi->s_journal, 0, 0,
2735 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2766 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2736 } else if (test_opt(sb, JOURNAL_CHECKSUM)) { 2767 else
2737 jbd2_journal_set_features(sbi->s_journal,
2738 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2739 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 2768 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2740 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2769 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2741 } else {
2742 jbd2_journal_clear_features(sbi->s_journal,
2743 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2744 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2745 }
2746 2770
2747 /* We have now updated the journal if required, so we can 2771 /* We have now updated the journal if required, so we can
2748 * validate the data journaling mode. */ 2772 * validate the data journaling mode. */
@@ -2781,6 +2805,12 @@ no_journal:
2781 clear_opt(sbi->s_mount_opt, NOBH); 2805 clear_opt(sbi->s_mount_opt, NOBH);
2782 } 2806 }
2783 } 2807 }
2808 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2809 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2810 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2811 goto failed_mount_wq;
2812 }
2813
2784 /* 2814 /*
2785 * The jbd2_journal_load will have done any necessary log recovery, 2815 * The jbd2_journal_load will have done any necessary log recovery,
2786 * so we can safely mount the rest of the filesystem now. 2816 * so we can safely mount the rest of the filesystem now.
@@ -2832,12 +2862,12 @@ no_journal:
2832 "available"); 2862 "available");
2833 } 2863 }
2834 2864
2835 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2865 if (test_opt(sb, DELALLOC) &&
2866 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2836 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 2867 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2837 "requested data journaling mode"); 2868 "requested data journaling mode");
2838 clear_opt(sbi->s_mount_opt, DELALLOC); 2869 clear_opt(sbi->s_mount_opt, DELALLOC);
2839 } else if (test_opt(sb, DELALLOC)) 2870 }
2840 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2841 2871
2842 err = ext4_setup_system_zone(sb); 2872 err = ext4_setup_system_zone(sb);
2843 if (err) { 2873 if (err) {
@@ -2893,6 +2923,8 @@ cantfind_ext4:
2893 2923
2894failed_mount4: 2924failed_mount4:
2895 ext4_msg(sb, KERN_ERR, "mount failed"); 2925 ext4_msg(sb, KERN_ERR, "mount failed");
2926 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2927failed_mount_wq:
2896 ext4_release_system_zone(sb); 2928 ext4_release_system_zone(sb);
2897 if (sbi->s_journal) { 2929 if (sbi->s_journal) {
2898 jbd2_journal_destroy(sbi->s_journal); 2930 jbd2_journal_destroy(sbi->s_journal);
@@ -3147,9 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
3147 return -EINVAL; 3179 return -EINVAL;
3148 } 3180 }
3149 3181
3150 if (journal->j_flags & JBD2_BARRIER) 3182 if (!(journal->j_flags & JBD2_BARRIER))
3151 ext4_msg(sb, KERN_INFO, "barriers enabled");
3152 else
3153 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3183 ext4_msg(sb, KERN_INFO, "barriers disabled");
3154 3184
3155 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3185 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3208,7 +3238,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3208 clear_buffer_write_io_error(sbh); 3238 clear_buffer_write_io_error(sbh);
3209 set_buffer_uptodate(sbh); 3239 set_buffer_uptodate(sbh);
3210 } 3240 }
3211 es->s_wtime = cpu_to_le32(get_seconds()); 3241 /*
3242 * If the file system is mounted read-only, don't update the
3243 * superblock write time. This avoids updating the superblock
3244 * write time when we are mounting the root file system
3245 * read/only but we need to replay the journal; at that point,
3246 * for people who are east of GMT and who make their clock
3247 * tick in localtime for Windows bug-for-bug compatibility,
3248 * the clock is set in the future, and this will cause e2fsck
3249 * to complain and force a full file system check.
3250 */
3251 if (!(sb->s_flags & MS_RDONLY))
3252 es->s_wtime = cpu_to_le32(get_seconds());
3212 es->s_kbytes_written = 3253 es->s_kbytes_written =
3213 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3254 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3214 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3255 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3333,11 +3374,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3333{ 3374{
3334 int ret = 0; 3375 int ret = 0;
3335 tid_t target; 3376 tid_t target;
3377 struct ext4_sb_info *sbi = EXT4_SB(sb);
3336 3378
3337 trace_ext4_sync_fs(sb, wait); 3379 trace_ext4_sync_fs(sb, wait);
3338 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3380 flush_workqueue(sbi->dio_unwritten_wq);
3381 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
3339 if (wait) 3382 if (wait)
3340 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3383 jbd2_log_wait_commit(sbi->s_journal, target);
3341 } 3384 }
3342 return ret; 3385 return ret;
3343} 3386}
@@ -3477,18 +3520,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3477 if (sbi->s_journal) 3520 if (sbi->s_journal)
3478 ext4_mark_recovery_complete(sb, es); 3521 ext4_mark_recovery_complete(sb, es);
3479 } else { 3522 } else {
3480 int ret; 3523 /* Make sure we can mount this feature set readwrite */
3481 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3524 if (!ext4_feature_set_ok(sb, 0)) {
3482 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3483 ext4_msg(sb, KERN_WARNING, "couldn't "
3484 "remount RDWR because of unsupported "
3485 "optional features (%x)",
3486 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3487 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3488 err = -EROFS; 3525 err = -EROFS;
3489 goto restore_opts; 3526 goto restore_opts;
3490 } 3527 }
3491
3492 /* 3528 /*
3493 * Make sure the group descriptor checksums 3529 * Make sure the group descriptor checksums
3494 * are sane. If they aren't, refuse to remount r/w. 3530 * are sane. If they aren't, refuse to remount r/w.
@@ -3930,27 +3966,6 @@ static struct file_system_type ext4_fs_type = {
3930 .fs_flags = FS_REQUIRES_DEV, 3966 .fs_flags = FS_REQUIRES_DEV,
3931}; 3967};
3932 3968
3933#ifdef CONFIG_EXT4DEV_COMPAT
3934static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3935 const char *dev_name, void *data,struct vfsmount *mnt)
3936{
3937 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3938 "to mount using ext4\n", dev_name);
3939 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3940 "will go away by 2.6.31\n", dev_name);
3941 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3942}
3943
3944static struct file_system_type ext4dev_fs_type = {
3945 .owner = THIS_MODULE,
3946 .name = "ext4dev",
3947 .get_sb = ext4dev_get_sb,
3948 .kill_sb = kill_block_super,
3949 .fs_flags = FS_REQUIRES_DEV,
3950};
3951MODULE_ALIAS("ext4dev");
3952#endif
3953
3954static int __init init_ext4_fs(void) 3969static int __init init_ext4_fs(void)
3955{ 3970{
3956 int err; 3971 int err;
@@ -3975,13 +3990,6 @@ static int __init init_ext4_fs(void)
3975 err = register_filesystem(&ext4_fs_type); 3990 err = register_filesystem(&ext4_fs_type);
3976 if (err) 3991 if (err)
3977 goto out; 3992 goto out;
3978#ifdef CONFIG_EXT4DEV_COMPAT
3979 err = register_filesystem(&ext4dev_fs_type);
3980 if (err) {
3981 unregister_filesystem(&ext4_fs_type);
3982 goto out;
3983 }
3984#endif
3985 return 0; 3993 return 0;
3986out: 3994out:
3987 destroy_inodecache(); 3995 destroy_inodecache();
@@ -4000,9 +4008,6 @@ out4:
4000static void __exit exit_ext4_fs(void) 4008static void __exit exit_ext4_fs(void)
4001{ 4009{
4002 unregister_filesystem(&ext4_fs_type); 4010 unregister_filesystem(&ext4_fs_type);
4003#ifdef CONFIG_EXT4DEV_COMPAT
4004 unregister_filesystem(&ext4dev_fs_type);
4005#endif
4006 destroy_inodecache(); 4011 destroy_inodecache();
4007 exit_ext4_xattr(); 4012 exit_ext4_xattr();
4008 exit_ext4_mballoc(); 4013 exit_ext4_mballoc();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
810 get_bh(new_bh); 810 get_bh(new_bh);
811 } else { 811 } else {
812 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
813 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal, block;
814
815 goal = ext4_group_first_block_no(sb,
814 EXT4_I(inode)->i_block_group); 816 EXT4_I(inode)->i_block_group);
815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, 817
818 /* non-extent files can't have physical blocks past 2^32 */
819 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
820 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
821
822 block = ext4_new_meta_blocks(handle, inode,
816 goal, NULL, &error); 823 goal, NULL, &error);
817 if (error) 824 if (error)
818 goto cleanup; 825 goto cleanup;
826
827 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
828 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
829
819 ea_idebug(inode, "creating block %d", block); 830 ea_idebug(inode, "creating block %d", block);
820 831
821 new_bh = sb_getblk(sb, block); 832 new_bh = sb_getblk(sb, block);
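
The xattr hunk above clamps the allocation goal for non-extent (indirect-map) inodes, whose block pointers are only 32 bits wide, so the allocator hint can never point past what the file's metadata could reference. The clamp itself is a single mask; a sketch with an illustrative constant:

	#include <stdint.h>

	#define MAX_BLOCK_FILE_PHYS 0xffffffffULL	/* illustrative 2^32 - 1 */

	static uint64_t clamp_goal(uint64_t goal, int uses_extents)
	{
		if (!uses_extents)
			goal &= MAX_BLOCK_FILE_PHYS;	/* keep hint reachable */
		return goal;
	}
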
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index adb0e72a176d..7db0979c6b72 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323/* fat/misc.c */ 323/* fat/misc.c */
324extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 324extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
325 __attribute__ ((format (printf, 2, 3))) __cold; 325 __attribute__ ((format (printf, 2, 3))) __cold;
326extern void fat_clusters_flush(struct super_block *sb); 326extern int fat_clusters_flush(struct super_block *sb);
327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
329 __le16 __time, __le16 __date, u8 time_cs); 329 __le16 __time, __le16 __date, u8 time_cs);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95c..e8c159de236b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
176 176
177 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 177 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
178 mark_inode_dirty(inode); 178 mark_inode_dirty(inode);
179 if (IS_SYNC(inode)) 179 if (IS_SYNC(inode)) {
180 err = sync_page_range_nolock(inode, mapping, start, count); 180 int err2;
181
182 /*
183 * Opencode syncing since we don't have a file open to use
184 * standard fsync path.
185 */
186 err = filemap_fdatawrite_range(mapping, start,
187 start + count - 1);
188 err2 = sync_mapping_buffers(mapping);
189 if (!err)
190 err = err2;
191 err2 = write_inode_now(inode, 1);
192 if (!err)
193 err = err2;
194 if (!err) {
195 err = filemap_fdatawait_range(mapping, start,
196 start + count - 1);
197 }
198 }
181out: 199out:
182 return err; 200 return err;
183} 201}
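
The opencoded sequence above is start-writeback, sync the mapping's buffers, write the inode, then wait — the pieces of an fsync without a struct file to hand to the standard path. Roughly the same start/wait split is reachable from user space through sync_file_range(2), though that call covers file data only, not metadata; a hedged illustration:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	static int sync_range(int fd, off_t start, off_t count)
	{
		/* kick off writeback, like filemap_fdatawrite_range() */
		if (sync_file_range(fd, start, count, SYNC_FILE_RANGE_WRITE) < 0)
			return -1;
		/* then wait for it, like filemap_fdatawait_range() */
		return sync_file_range(fd, start, count,
				       SYNC_FILE_RANGE_WAIT_AFTER);
	}
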
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8970d8c49bb0..76b7961ab663 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb)
451 451
452static int fat_sync_fs(struct super_block *sb, int wait) 452static int fat_sync_fs(struct super_block *sb, int wait)
453{ 453{
454 lock_super(sb); 454 int err = 0;
455 fat_clusters_flush(sb);
456 sb->s_dirt = 0;
457 unlock_super(sb);
458 455
459 return 0; 456 if (sb->s_dirt) {
457 lock_super(sb);
458 sb->s_dirt = 0;
459 err = fat_clusters_flush(sb);
460 unlock_super(sb);
461 }
462
463 return err;
460} 464}
461 465
462static void fat_put_super(struct super_block *sb) 466static void fat_put_super(struct super_block *sb)
@@ -470,19 +474,11 @@ static void fat_put_super(struct super_block *sb)
470 474
471 iput(sbi->fat_inode); 475 iput(sbi->fat_inode);
472 476
473 if (sbi->nls_disk) { 477 unload_nls(sbi->nls_disk);
474 unload_nls(sbi->nls_disk); 478 unload_nls(sbi->nls_io);
475 sbi->nls_disk = NULL; 479
476 sbi->options.codepage = fat_default_codepage; 480 if (sbi->options.iocharset != fat_default_iocharset)
477 }
478 if (sbi->nls_io) {
479 unload_nls(sbi->nls_io);
480 sbi->nls_io = NULL;
481 }
482 if (sbi->options.iocharset != fat_default_iocharset) {
483 kfree(sbi->options.iocharset); 481 kfree(sbi->options.iocharset);
484 sbi->options.iocharset = fat_default_iocharset;
485 }
486 482
487 sb->s_fs_info = NULL; 483 sb->s_fs_info = NULL;
488 kfree(sbi); 484 kfree(sbi);
@@ -820,7 +816,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
820 seq_puts(m, ",shortname=mixed"); 816 seq_puts(m, ",shortname=mixed");
821 break; 817 break;
822 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: 818 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
823 /* seq_puts(m, ",shortname=lower"); */ 819 seq_puts(m, ",shortname=lower");
824 break; 820 break;
825 default: 821 default:
826 seq_puts(m, ",shortname=unknown"); 822 seq_puts(m, ",shortname=unknown");
@@ -971,7 +967,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
971 opts->codepage = fat_default_codepage; 967 opts->codepage = fat_default_codepage;
972 opts->iocharset = fat_default_iocharset; 968 opts->iocharset = fat_default_iocharset;
973 if (is_vfat) { 969 if (is_vfat) {
974 opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95; 970 opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
975 opts->rodir = 0; 971 opts->rodir = 0;
976 } else { 972 } else {
977 opts->shortname = 0; 973 opts->shortname = 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd7..0f55f5cb732f 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error);
43 43
44/* Flushes the number of free clusters on FAT32 */ 44/* Flushes the number of free clusters on FAT32 */
45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
46void fat_clusters_flush(struct super_block *sb) 46int fat_clusters_flush(struct super_block *sb)
47{ 47{
48 struct msdos_sb_info *sbi = MSDOS_SB(sb); 48 struct msdos_sb_info *sbi = MSDOS_SB(sb);
49 struct buffer_head *bh; 49 struct buffer_head *bh;
50 struct fat_boot_fsinfo *fsinfo; 50 struct fat_boot_fsinfo *fsinfo;
51 51
52 if (sbi->fat_bits != 32) 52 if (sbi->fat_bits != 32)
53 return; 53 return 0;
54 54
55 bh = sb_bread(sb, sbi->fsinfo_sector); 55 bh = sb_bread(sb, sbi->fsinfo_sector);
56 if (bh == NULL) { 56 if (bh == NULL) {
57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); 57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
58 return; 58 return -EIO;
59 } 59 }
60 60
61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data; 61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
@@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb)
74 mark_buffer_dirty(bh); 74 mark_buffer_dirty(bh);
75 } 75 }
76 brelse(bh); 76 brelse(bh);
77
78 return 0;
77} 79}
78 80
79/* 81/*
@@ -119,8 +121,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
119 MSDOS_I(inode)->i_start = new_dclus; 121 MSDOS_I(inode)->i_start = new_dclus;
120 MSDOS_I(inode)->i_logstart = new_dclus; 122 MSDOS_I(inode)->i_logstart = new_dclus;
121 /* 123 /*
122 * Since generic_osync_inode() synchronize later if 124 * Since generic_write_sync() synchronizes regular files later,
123 * this is not directory, we don't here. 125 * we sync here only directories.
124 */ 126 */
125 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { 127 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
126 ret = fat_sync_inode(inode); 128 ret = fat_sync_inode(inode);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index cb6e83557112..f565f24019b5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
499 int charlen; 499 int charlen;
500 500
501 if (utf8) { 501 if (utf8) {
502 int name_len = strlen(name); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 503 if (*outlen < 0)
504 *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); 504 return *outlen;
505 505 else if (*outlen > 255)
506 /*
507 * We stripped '.'s before and set len appropriately,
508 * but utf8s_to_utf16s doesn't care about len
509 */
510 *outlen -= (name_len - len);
511
512 if (*outlen > 255)
513 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
514 507
515 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
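
The vfat fix above passes the stripped length straight to utf8s_to_utf16s() and then range-checks the result against VFAT's 255-UTF-16-unit name limit, also rejecting conversion errors. A loose user-space analogue with iconv(3); the helper name and the LE byte order are assumptions:

	#include <iconv.h>
	#include <stddef.h>

	/* returns UTF-16 code units written, or -1 on error */
	static long utf8_to_utf16(const char *in, size_t inlen,
				  char *out, size_t outbytes)
	{
		iconv_t cd = iconv_open("UTF-16LE", "UTF-8");
		char *inp = (char *)in, *outp = out;
		size_t outleft = outbytes;
		long units;

		if (cd == (iconv_t)-1)
			return -1;
		if (iconv(cd, &inp, &inlen, &outp, &outleft) == (size_t)-1)
			units = -1;	/* invalid input or buffer full */
		else
			units = (long)(outbytes - outleft) / 2;
		iconv_close(cd);
		return units;
	}

A caller would then apply the same "> 255 units" rejection to the returned count.
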
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ae413086db97..fc089f2f7f56 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp)
263 return pid; 263 return pid;
264} 264}
265 265
266static int f_setown_ex(struct file *filp, unsigned long arg)
267{
268 struct f_owner_ex * __user owner_p = (void * __user)arg;
269 struct f_owner_ex owner;
270 struct pid *pid;
271 int type;
272 int ret;
273
274 ret = copy_from_user(&owner, owner_p, sizeof(owner));
275 if (ret)
276 return ret;
277
278 switch (owner.type) {
279 case F_OWNER_TID:
280 type = PIDTYPE_MAX;
281 break;
282
283 case F_OWNER_PID:
284 type = PIDTYPE_PID;
285 break;
286
287 case F_OWNER_GID:
288 type = PIDTYPE_PGID;
289 break;
290
291 default:
292 return -EINVAL;
293 }
294
295 rcu_read_lock();
296 pid = find_vpid(owner.pid);
297 if (owner.pid && !pid)
298 ret = -ESRCH;
299 else
300 ret = __f_setown(filp, pid, type, 1);
301 rcu_read_unlock();
302
303 return ret;
304}
305
306static int f_getown_ex(struct file *filp, unsigned long arg)
307{
308 struct f_owner_ex * __user owner_p = (void * __user)arg;
309 struct f_owner_ex owner;
310 int ret = 0;
311
312 read_lock(&filp->f_owner.lock);
313 owner.pid = pid_vnr(filp->f_owner.pid);
314 switch (filp->f_owner.pid_type) {
315 case PIDTYPE_MAX:
316 owner.type = F_OWNER_TID;
317 break;
318
319 case PIDTYPE_PID:
320 owner.type = F_OWNER_PID;
321 break;
322
323 case PIDTYPE_PGID:
324 owner.type = F_OWNER_GID;
325 break;
326
327 default:
328 WARN_ON(1);
329 ret = -EINVAL;
330 break;
331 }
332 read_unlock(&filp->f_owner.lock);
333
334 if (!ret)
335 ret = copy_to_user(owner_p, &owner, sizeof(owner));
336 return ret;
337}
338
266static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, 339static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
267 struct file *filp) 340 struct file *filp)
268{ 341{
@@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
313 case F_SETOWN: 386 case F_SETOWN:
314 err = f_setown(filp, arg, 1); 387 err = f_setown(filp, arg, 1);
315 break; 388 break;
389 case F_GETOWN_EX:
390 err = f_getown_ex(filp, arg);
391 break;
392 case F_SETOWN_EX:
393 err = f_setown_ex(filp, arg);
394 break;
316 case F_GETSIG: 395 case F_GETSIG:
317 err = filp->f_owner.signum; 396 err = filp->f_owner.signum;
318 break; 397 break;
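
From user space, the new commands take a struct f_owner_ex, letting SIGIO target a single thread (F_OWNER_TID) rather than a whole process or process group. A sketch of arming a descriptor this way on a kernel that carries this patch — the helper name is made up:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int own_fd_as_this_thread(int fd)
	{
		struct f_owner_ex owner = {
			.type = F_OWNER_TID,
			.pid  = (pid_t)syscall(SYS_gettid),
		};
		int flags;

		if (fcntl(fd, F_SETOWN_EX, &owner) < 0)
			return -1;
		/* O_ASYNC is what makes the kernel deliver SIGIO on this fd */
		flags = fcntl(fd, F_GETFL);
		if (flags < 0 || fcntl(fd, F_SETFL, flags | O_ASYNC) < 0)
			return -1;
		return 0;
	}
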
@@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p,
428 507
429static void send_sigio_to_task(struct task_struct *p, 508static void send_sigio_to_task(struct task_struct *p,
430 struct fown_struct *fown, 509 struct fown_struct *fown,
431 int fd, 510 int fd, int reason, int group)
432 int reason)
433{ 511{
434 /* 512 /*
435 * F_SETSIG can change ->signum lockless in parallel, make 513 * F_SETSIG can change ->signum lockless in parallel, make
@@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p,
461 else 539 else
462 si.si_band = band_table[reason - POLL_IN]; 540 si.si_band = band_table[reason - POLL_IN];
463 si.si_fd = fd; 541 si.si_fd = fd;
464 if (!group_send_sig_info(signum, &si, p)) 542 if (!do_send_sig_info(signum, &si, p, group))
465 break; 543 break;
466 /* fall-through: fall back on the old plain SIGIO signal */ 544 /* fall-through: fall back on the old plain SIGIO signal */
467 case 0: 545 case 0:
468 group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); 546 do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
469 } 547 }
470} 548}
471 549
@@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
474 struct task_struct *p; 552 struct task_struct *p;
475 enum pid_type type; 553 enum pid_type type;
476 struct pid *pid; 554 struct pid *pid;
555 int group = 1;
477 556
478 read_lock(&fown->lock); 557 read_lock(&fown->lock);
558
479 type = fown->pid_type; 559 type = fown->pid_type;
560 if (type == PIDTYPE_MAX) {
561 group = 0;
562 type = PIDTYPE_PID;
563 }
564
480 pid = fown->pid; 565 pid = fown->pid;
481 if (!pid) 566 if (!pid)
482 goto out_unlock_fown; 567 goto out_unlock_fown;
483 568
484 read_lock(&tasklist_lock); 569 read_lock(&tasklist_lock);
485 do_each_pid_task(pid, type, p) { 570 do_each_pid_task(pid, type, p) {
486 send_sigio_to_task(p, fown, fd, band); 571 send_sigio_to_task(p, fown, fd, band, group);
487 } while_each_pid_task(pid, type, p); 572 } while_each_pid_task(pid, type, p);
488 read_unlock(&tasklist_lock); 573 read_unlock(&tasklist_lock);
489 out_unlock_fown: 574 out_unlock_fown:
@@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
491} 576}
492 577
493static void send_sigurg_to_task(struct task_struct *p, 578static void send_sigurg_to_task(struct task_struct *p,
494 struct fown_struct *fown) 579 struct fown_struct *fown, int group)
495{ 580{
496 if (sigio_perm(p, fown, SIGURG)) 581 if (sigio_perm(p, fown, SIGURG))
497 group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); 582 do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
498} 583}
499 584
500int send_sigurg(struct fown_struct *fown) 585int send_sigurg(struct fown_struct *fown)
@@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown)
502 struct task_struct *p; 587 struct task_struct *p;
503 enum pid_type type; 588 enum pid_type type;
504 struct pid *pid; 589 struct pid *pid;
590 int group = 1;
505 int ret = 0; 591 int ret = 0;
506 592
507 read_lock(&fown->lock); 593 read_lock(&fown->lock);
594
508 type = fown->pid_type; 595 type = fown->pid_type;
596 if (type == PIDTYPE_MAX) {
597 group = 0;
598 type = PIDTYPE_PID;
599 }
600
509 pid = fown->pid; 601 pid = fown->pid;
510 if (!pid) 602 if (!pid)
511 goto out_unlock_fown; 603 goto out_unlock_fown;
@@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown)
514 606
515 read_lock(&tasklist_lock); 607 read_lock(&tasklist_lock);
516 do_each_pid_task(pid, type, p) { 608 do_each_pid_task(pid, type, p) {
517 send_sigurg_to_task(p, fown); 609 send_sigurg_to_task(p, fown, group);
518 } while_each_pid_task(pid, type, p); 610 } while_each_pid_task(pid, type, p);
519 read_unlock(&tasklist_lock); 611 read_unlock(&tasklist_lock);
520 out_unlock_fown: 612 out_unlock_fown:
diff --git a/fs/file_table.c b/fs/file_table.c
index 334ce39881f8..8eb44042e009 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
74 * Handle nr_files sysctl 74 * Handle nr_files sysctl
75 */ 75 */
76#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) 76#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
77int proc_nr_files(ctl_table *table, int write, struct file *filp, 77int proc_nr_files(ctl_table *table, int write,
78 void __user *buffer, size_t *lenp, loff_t *ppos) 78 void __user *buffer, size_t *lenp, loff_t *ppos)
79{ 79{
80 files_stat.nr_files = get_nr_files(); 80 files_stat.nr_files = get_nr_files();
81 return proc_dointvec(table, write, filp, buffer, lenp, ppos); 81 return proc_dointvec(table, write, buffer, lenp, ppos);
82} 82}
83#else 83#else
84int proc_nr_files(ctl_table *table, int write, struct file *filp, 84int proc_nr_files(ctl_table *table, int write,
85 void __user *buffer, size_t *lenp, loff_t *ppos) 85 void __user *buffer, size_t *lenp, loff_t *ppos)
86{ 86{
87 return -ENOSYS; 87 return -ENOSYS;
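The filp argument is dropped from the sysctl handler prototype throughout this merge, so handler bodies simply forward the remaining four arguments. A hedged sketch of a private handler against the new prototype (proc_nr_widgets, widgets_stat and count_widgets() are hypothetical names):

    static int proc_nr_widgets(ctl_table *table, int write,
                               void __user *buffer, size_t *lenp, loff_t *ppos)
    {
            /* refresh the cached counter before proc_dointvec copies it out */
            widgets_stat.nr_widgets = count_widgets();
            return proc_dointvec(table, write, buffer, lenp, ppos);
    }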
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..9d5360c4c2af 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,257 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/kthread.h>
23#include <linux/freezer.h>
22#include <linux/writeback.h> 24#include <linux/writeback.h>
23#include <linux/blkdev.h> 25#include <linux/blkdev.h>
24#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
25#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
26#include "internal.h" 28#include "internal.h"
27 29
30#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
28 31
29/** 32/*
 30 * writeback_acquire - attempt to get exclusive writeback access to a device 33 * We don't actually have pdflush, but this one is exported through /proc...
31 * @bdi: the device's backing_dev_info structure
32 *
33 * It is a waste of resources to have more than one pdflush thread blocked on
34 * a single request queue. Exclusion at the request_queue level is obtained
35 * via a flag in the request_queue's backing_dev_info.state.
36 *
37 * Non-request_queue-backed address_spaces will share default_backing_dev_info,
38 * unless they implement their own. Which is somewhat inefficient, as this
39 * may prevent concurrent writeback against multiple devices.
40 */ 34 */
41static int writeback_acquire(struct backing_dev_info *bdi) 35int nr_pdflush_threads;
36
37/*
38 * Passed into wb_writeback(), essentially a subset of writeback_control
39 */
40struct wb_writeback_args {
41 long nr_pages;
42 struct super_block *sb;
43 enum writeback_sync_modes sync_mode;
44 int for_kupdate:1;
45 int range_cyclic:1;
46 int for_background:1;
47};
48
49/*
50 * Work items for the bdi_writeback threads
51 */
52struct bdi_work {
53 struct list_head list; /* pending work list */
54 struct rcu_head rcu_head; /* for RCU free/clear of work */
55
56 unsigned long seen; /* threads that have seen this work */
57 atomic_t pending; /* number of threads still to do work */
58
59 struct wb_writeback_args args; /* writeback arguments */
60
61 unsigned long state; /* flag bits, see WS_* */
62};
63
64enum {
65 WS_USED_B = 0,
66 WS_ONSTACK_B,
67};
68
69#define WS_USED (1 << WS_USED_B)
70#define WS_ONSTACK (1 << WS_ONSTACK_B)
71
72static inline bool bdi_work_on_stack(struct bdi_work *work)
42{ 73{
43 return !test_and_set_bit(BDI_pdflush, &bdi->state); 74 return test_bit(WS_ONSTACK_B, &work->state);
75}
76
77static inline void bdi_work_init(struct bdi_work *work,
78 struct wb_writeback_args *args)
79{
80 INIT_RCU_HEAD(&work->rcu_head);
81 work->args = *args;
82 work->state = WS_USED;
44} 83}
45 84
46/** 85/**
47 * writeback_in_progress - determine whether there is writeback in progress 86 * writeback_in_progress - determine whether there is writeback in progress
48 * @bdi: the device's backing_dev_info structure. 87 * @bdi: the device's backing_dev_info structure.
49 * 88 *
50 * Determine whether there is writeback in progress against a backing device. 89 * Determine whether there is writeback waiting to be handled against a
90 * backing device.
51 */ 91 */
52int writeback_in_progress(struct backing_dev_info *bdi) 92int writeback_in_progress(struct backing_dev_info *bdi)
53{ 93{
54 return test_bit(BDI_pdflush, &bdi->state); 94 return !list_empty(&bdi->work_list);
55} 95}
56 96
57/** 97static void bdi_work_clear(struct bdi_work *work)
58 * writeback_release - relinquish exclusive writeback access against a device.
59 * @bdi: the device's backing_dev_info structure
60 */
61static void writeback_release(struct backing_dev_info *bdi)
62{ 98{
63 BUG_ON(!writeback_in_progress(bdi)); 99 clear_bit(WS_USED_B, &work->state);
64 clear_bit(BDI_pdflush, &bdi->state); 100 smp_mb__after_clear_bit();
101 /*
102 * work can have disappeared at this point. bit waitq functions
103 * should be able to tolerate this, provided bdi_sched_wait does
 104 * not dereference its pointer argument.
105 */
106 wake_up_bit(&work->state, WS_USED_B);
65} 107}
66 108
67static noinline void block_dump___mark_inode_dirty(struct inode *inode) 109static void bdi_work_free(struct rcu_head *head)
68{ 110{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 111 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
70 struct dentry *dentry;
71 const char *name = "?";
72 112
73 dentry = d_find_alias(inode); 113 if (!bdi_work_on_stack(work))
74 if (dentry) { 114 kfree(work);
75 spin_lock(&dentry->d_lock); 115 else
76 name = (const char *) dentry->d_name.name; 116 bdi_work_clear(work);
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87} 117}
88 118
89/** 119static void wb_work_complete(struct bdi_work *work)
90 * __mark_inode_dirty - internal function
91 * @inode: inode to mark
92 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
93 * Mark an inode as dirty. Callers should use mark_inode_dirty or
94 * mark_inode_dirty_sync.
95 *
96 * Put the inode on the super block's dirty list.
97 *
98 * CAREFUL! We mark it dirty unconditionally, but move it onto the
99 * dirty list only if it is hashed or if it refers to a blockdev.
100 * If it was not hashed, it will never be added to the dirty list
101 * even if it is later hashed, as it will have been marked dirty already.
102 *
103 * In short, make sure you hash any inodes _before_ you start marking
104 * them dirty.
105 *
106 * This function *must* be atomic for the I_DIRTY_PAGES case -
107 * set_page_dirty() is called under spinlock in several places.
108 *
109 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
110 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
111 * the kernel-internal blockdev inode represents the dirtying time of the
112 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
113 * page->mapping->host, so the page-dirtying time is recorded in the internal
114 * blockdev inode.
115 */
116void __mark_inode_dirty(struct inode *inode, int flags)
117{ 120{
118 struct super_block *sb = inode->i_sb; 121 const enum writeback_sync_modes sync_mode = work->args.sync_mode;
122 int onstack = bdi_work_on_stack(work);
119 123
120 /* 124 /*
121 * Don't do this for I_DIRTY_PAGES - that doesn't actually 125 * For allocated work, we can clear the done/seen bit right here.
122 * dirty the inode itself 126 * For on-stack work, we need to postpone both the clear and free
127 * to after the RCU grace period, since the stack could be invalidated
128 * as soon as bdi_work_clear() has done the wakeup.
123 */ 129 */
124 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 130 if (!onstack)
125 if (sb->s_op->dirty_inode) 131 bdi_work_clear(work);
126 sb->s_op->dirty_inode(inode); 132 if (sync_mode == WB_SYNC_NONE || onstack)
127 } 133 call_rcu(&work->rcu_head, bdi_work_free);
134}
128 135
136static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
137{
129 /* 138 /*
130 * make sure that changes are seen by all cpus before we test i_state 139 * The caller has retrieved the work arguments from this work,
131 * -- mikulas 140 * drop our reference. If this is the last ref, delete and free it
132 */ 141 */
133 smp_mb(); 142 if (atomic_dec_and_test(&work->pending)) {
143 struct backing_dev_info *bdi = wb->bdi;
134 144
135 /* avoid the locking if we can */ 145 spin_lock(&bdi->wb_lock);
136 if ((inode->i_state & flags) == flags) 146 list_del_rcu(&work->list);
137 return; 147 spin_unlock(&bdi->wb_lock);
138 148
139 if (unlikely(block_dump)) 149 wb_work_complete(work);
140 block_dump___mark_inode_dirty(inode); 150 }
151}
141 152
142 spin_lock(&inode_lock); 153static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
143 if ((inode->i_state & flags) != flags) { 154{
144 const int was_dirty = inode->i_state & I_DIRTY; 155 work->seen = bdi->wb_mask;
156 BUG_ON(!work->seen);
157 atomic_set(&work->pending, bdi->wb_cnt);
158 BUG_ON(!bdi->wb_cnt);
145 159
146 inode->i_state |= flags; 160 /*
161 * list_add_tail_rcu() contains the necessary barriers to
162 * make sure the above stores are seen before the item is
163 * noticed on the list
164 */
165 spin_lock(&bdi->wb_lock);
166 list_add_tail_rcu(&work->list, &bdi->work_list);
167 spin_unlock(&bdi->wb_lock);
147 168
148 /* 169 /*
149 * If the inode is being synced, just update its dirty state. 170 * If the default thread isn't there, make sure we add it. When
150 * The unlocker will place the inode on the appropriate 171 * it gets created and wakes up, we'll run this work.
151 * superblock list, based upon its state. 172 */
152 */ 173 if (unlikely(list_empty_careful(&bdi->wb_list)))
153 if (inode->i_state & I_SYNC) 174 wake_up_process(default_backing_dev_info.wb.task);
154 goto out; 175 else {
176 struct bdi_writeback *wb = &bdi->wb;
155 177
156 /* 178 if (wb->task)
157 * Only add valid (hashed) inodes to the superblock's 179 wake_up_process(wb->task);
158 * dirty list. Add blockdev inodes as well. 180 }
159 */ 181}
160 if (!S_ISBLK(inode->i_mode)) {
161 if (hlist_unhashed(&inode->i_hash))
162 goto out;
163 }
164 if (inode->i_state & (I_FREEING|I_CLEAR))
165 goto out;
166 182
167 /* 183/*
168 * If the inode was already on s_dirty/s_io/s_more_io, don't 184 * Used for on-stack allocated work items. The caller needs to wait until
169 * reposition it (that would break s_dirty time-ordering). 185 * the wb threads have acked the work before it's safe to continue.
170 */ 186 */
171 if (!was_dirty) { 187static void bdi_wait_on_work_clear(struct bdi_work *work)
172 inode->dirtied_when = jiffies; 188{
173 list_move(&inode->i_list, &sb->s_dirty); 189 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
174 } 190 TASK_UNINTERRUPTIBLE);
191}
192
193static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
194 struct wb_writeback_args *args)
195{
196 struct bdi_work *work;
197
198 /*
199 * This is WB_SYNC_NONE writeback, so if allocation fails just
200 * wakeup the thread for old dirty data writeback
201 */
202 work = kmalloc(sizeof(*work), GFP_ATOMIC);
203 if (work) {
204 bdi_work_init(work, args);
205 bdi_queue_work(bdi, work);
206 } else {
207 struct bdi_writeback *wb = &bdi->wb;
208
209 if (wb->task)
210 wake_up_process(wb->task);
175 } 211 }
176out:
177 spin_unlock(&inode_lock);
178} 212}
179 213
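The hand-off protocol above is the subtle part: `seen` names every flusher thread that must notice the work, each of them drops one reference from `pending` once it has taken the arguments, and whichever thread brings `pending` to zero unlinks the item and frees it (or, for on-stack work, clears WS_USED so the submitter's stack frame may go away). A userspace model of the last-consumer-frees pattern, with C11 atomics standing in for atomic_dec_and_test() — the kernel additionally defers the kfree() past an RCU grace period, because get_next_work_item() walks the list under rcu_read_lock():

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct work {
            atomic_int pending;     /* flusher threads still to consume this */
            long nr_pages;          /* the writeback arguments */
    };

    static void consume(struct work *w, int thread)
    {
            long nr = w->nr_pages;  /* copy the args before dropping the ref */

            if (atomic_fetch_sub(&w->pending, 1) == 1)
                    free(w);        /* kernel: list_del_rcu() + call_rcu() */
            printf("thread %d writes back %ld pages\n", thread, nr);
    }

    int main(void)
    {
            struct work *w = malloc(sizeof(*w));

            atomic_init(&w->pending, 2);    /* two flusher threads saw it */
            w->nr_pages = 1024;
            consume(w, 0);
            consume(w, 1);                  /* last ref: frees w */
            return 0;
    }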
180EXPORT_SYMBOL(__mark_inode_dirty); 214/**
215 * bdi_sync_writeback - start and wait for writeback
216 * @bdi: the backing device to write from
217 * @sb: write inodes from this super_block
218 *
219 * Description:
220 * This does WB_SYNC_ALL data integrity writeback and waits for the
221 * IO to complete. Callers must hold the sb s_umount semaphore for
222 * reading, to avoid having the super disappear before we are done.
223 */
224static void bdi_sync_writeback(struct backing_dev_info *bdi,
225 struct super_block *sb)
226{
227 struct wb_writeback_args args = {
228 .sb = sb,
229 .sync_mode = WB_SYNC_ALL,
230 .nr_pages = LONG_MAX,
231 .range_cyclic = 0,
232 };
233 struct bdi_work work;
181 234
182static int write_inode(struct inode *inode, int sync) 235 bdi_work_init(&work, &args);
236 work.state |= WS_ONSTACK;
237
238 bdi_queue_work(bdi, &work);
239 bdi_wait_on_work_clear(&work);
240}
241
242/**
243 * bdi_start_writeback - start writeback
244 * @bdi: the backing device to write from
245 * @nr_pages: the number of pages to write
246 *
247 * Description:
248 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 249 * started when this function returns; we make no guarantees on
250 * completion. Caller need not hold sb s_umount semaphore.
251 *
252 */
253void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
254 long nr_pages)
183{ 255{
184 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 256 struct wb_writeback_args args = {
185 return inode->i_sb->s_op->write_inode(inode, sync); 257 .sb = sb,
186 return 0; 258 .sync_mode = WB_SYNC_NONE,
259 .nr_pages = nr_pages,
260 .range_cyclic = 1,
261 };
262
263 /*
264 * We treat @nr_pages=0 as the special case to do background writeback,
265 * ie. to sync pages until the background dirty threshold is reached.
266 */
267 if (!nr_pages) {
268 args.nr_pages = LONG_MAX;
269 args.for_background = 1;
270 }
271
272 bdi_alloc_queue_work(bdi, &args);
187} 273}
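This gives callers two spellings of asynchronous writeback through the flusher thread — a fixed budget, or the nr_pages == 0 background mode described above (sketch; bdi stands for any registered backing_dev_info):

    bdi_start_writeback(bdi, NULL, 1024);   /* push roughly 1024 pages */
    bdi_start_writeback(bdi, NULL, 0);      /* write until below the
                                               background dirty threshold */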
188 274
189/* 275/*
@@ -191,31 +277,32 @@ static int write_inode(struct inode *inode, int sync)
191 * furthest end of its superblock's dirty-inode list. 277 * furthest end of its superblock's dirty-inode list.
192 * 278 *
193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 279 * Before stamping the inode's ->dirtied_when, we check to see whether it is
194 * already the most-recently-dirtied inode on the s_dirty list. If that is 280 * already the most-recently-dirtied inode on the b_dirty list. If that is
195 * the case then the inode must have been redirtied while it was being written 281 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 282 * out and we don't reset its dirtied_when.
197 */ 283 */
198static void redirty_tail(struct inode *inode) 284static void redirty_tail(struct inode *inode)
199{ 285{
200 struct super_block *sb = inode->i_sb; 286 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
201 287
202 if (!list_empty(&sb->s_dirty)) { 288 if (!list_empty(&wb->b_dirty)) {
203 struct inode *tail_inode; 289 struct inode *tail;
204 290
205 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 291 tail = list_entry(wb->b_dirty.next, struct inode, i_list);
206 if (time_before(inode->dirtied_when, 292 if (time_before(inode->dirtied_when, tail->dirtied_when))
207 tail_inode->dirtied_when))
208 inode->dirtied_when = jiffies; 293 inode->dirtied_when = jiffies;
209 } 294 }
210 list_move(&inode->i_list, &sb->s_dirty); 295 list_move(&inode->i_list, &wb->b_dirty);
211} 296}
212 297
213/* 298/*
214 * requeue inode for re-scanning after sb->s_io list is exhausted. 299 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 300 */
216static void requeue_io(struct inode *inode) 301static void requeue_io(struct inode *inode)
217{ 302{
218 list_move(&inode->i_list, &inode->i_sb->s_more_io); 303 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
304
305 list_move(&inode->i_list, &wb->b_more_io);
219} 306}
220 307
221static void inode_sync_complete(struct inode *inode) 308static void inode_sync_complete(struct inode *inode)
@@ -235,7 +322,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
235 * For inodes being constantly redirtied, dirtied_when can get stuck. 322 * For inodes being constantly redirtied, dirtied_when can get stuck.
236 * It _appears_ to be in the future, but is actually in distant past. 323 * It _appears_ to be in the future, but is actually in distant past.
237 * This test is necessary to prevent such wrapped-around relative times 324 * This test is necessary to prevent such wrapped-around relative times
238 * from permanently stopping the whole pdflush writeback. 325 * from permanently stopping the whole bdi writeback.
239 */ 326 */
240 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 327 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
241#endif 328#endif
@@ -249,33 +336,56 @@ static void move_expired_inodes(struct list_head *delaying_queue,
249 struct list_head *dispatch_queue, 336 struct list_head *dispatch_queue,
250 unsigned long *older_than_this) 337 unsigned long *older_than_this)
251{ 338{
339 LIST_HEAD(tmp);
340 struct list_head *pos, *node;
341 struct super_block *sb = NULL;
342 struct inode *inode;
343 int do_sb_sort = 0;
344
252 while (!list_empty(delaying_queue)) { 345 while (!list_empty(delaying_queue)) {
253 struct inode *inode = list_entry(delaying_queue->prev, 346 inode = list_entry(delaying_queue->prev, struct inode, i_list);
254 struct inode, i_list);
255 if (older_than_this && 347 if (older_than_this &&
256 inode_dirtied_after(inode, *older_than_this)) 348 inode_dirtied_after(inode, *older_than_this))
257 break; 349 break;
258 list_move(&inode->i_list, dispatch_queue); 350 if (sb && sb != inode->i_sb)
351 do_sb_sort = 1;
352 sb = inode->i_sb;
353 list_move(&inode->i_list, &tmp);
354 }
355
356 /* just one sb in list, splice to dispatch_queue and we're done */
357 if (!do_sb_sort) {
358 list_splice(&tmp, dispatch_queue);
359 return;
360 }
361
362 /* Move inodes from one superblock together */
363 while (!list_empty(&tmp)) {
364 inode = list_entry(tmp.prev, struct inode, i_list);
365 sb = inode->i_sb;
366 list_for_each_prev_safe(pos, node, &tmp) {
367 inode = list_entry(pos, struct inode, i_list);
368 if (inode->i_sb == sb)
369 list_move(&inode->i_list, dispatch_queue);
370 }
259 } 371 }
260} 372}
261 373
262/* 374/*
263 * Queue all expired dirty inodes for io, eldest first. 375 * Queue all expired dirty inodes for io, eldest first.
264 */ 376 */
265static void queue_io(struct super_block *sb, 377static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
266 unsigned long *older_than_this)
267{ 378{
268 list_splice_init(&sb->s_more_io, sb->s_io.prev); 379 list_splice_init(&wb->b_more_io, wb->b_io.prev);
269 move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); 380 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
270} 381}
271 382
272int sb_has_dirty_inodes(struct super_block *sb) 383static int write_inode(struct inode *inode, int sync)
273{ 384{
274 return !list_empty(&sb->s_dirty) || 385 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
275 !list_empty(&sb->s_io) || 386 return inode->i_sb->s_op->write_inode(inode, sync);
276 !list_empty(&sb->s_more_io); 387 return 0;
277} 388}
278EXPORT_SYMBOL(sb_has_dirty_inodes);
279 389
280/* 390/*
281 * Wait for writeback on an inode to complete. 391 * Wait for writeback on an inode to complete.
@@ -322,11 +432,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
322 if (inode->i_state & I_SYNC) { 432 if (inode->i_state & I_SYNC) {
323 /* 433 /*
324 * If this inode is locked for writeback and we are not doing 434 * If this inode is locked for writeback and we are not doing
325 * writeback-for-data-integrity, move it to s_more_io so that 435 * writeback-for-data-integrity, move it to b_more_io so that
 326 * writeback can proceed with the other inodes on s_io. 436 * writeback can proceed with the other inodes on b_io.
327 * 437 *
328 * We'll have another go at writing back this inode when we 438 * We'll have another go at writing back this inode when we
329 * completed a full scan of s_io. 439 * completed a full scan of b_io.
330 */ 440 */
331 if (!wait) { 441 if (!wait) {
332 requeue_io(inode); 442 requeue_io(inode);
@@ -366,16 +476,26 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
366 spin_lock(&inode_lock); 476 spin_lock(&inode_lock);
367 inode->i_state &= ~I_SYNC; 477 inode->i_state &= ~I_SYNC;
368 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 478 if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
369 if (!(inode->i_state & I_DIRTY) && 479 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
370 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 480 /*
481 * More pages get dirtied by a fast dirtier.
482 */
483 goto select_queue;
484 } else if (inode->i_state & I_DIRTY) {
485 /*
486 * At least XFS will redirty the inode during the
487 * writeback (delalloc) and on io completion (isize).
488 */
489 redirty_tail(inode);
490 } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
371 /* 491 /*
372 * We didn't write back all the pages. nfs_writepages() 492 * We didn't write back all the pages. nfs_writepages()
 373 * sometimes bails out without doing anything. Redirty 493 * sometimes bails out without doing anything. Redirty
374 * the inode; Move it from s_io onto s_more_io/s_dirty. 494 * the inode; Move it from b_io onto b_more_io/b_dirty.
375 */ 495 */
376 /* 496 /*
377 * akpm: if the caller was the kupdate function we put 497 * akpm: if the caller was the kupdate function we put
378 * this inode at the head of s_dirty so it gets first 498 * this inode at the head of b_dirty so it gets first
379 * consideration. Otherwise, move it to the tail, for 499 * consideration. Otherwise, move it to the tail, for
380 * the reasons described there. I'm not really sure 500 * the reasons described there. I'm not really sure
381 * how much sense this makes. Presumably I had a good 501 * how much sense this makes. Presumably I had a good
@@ -385,10 +505,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
385 if (wbc->for_kupdate) { 505 if (wbc->for_kupdate) {
386 /* 506 /*
387 * For the kupdate function we move the inode 507 * For the kupdate function we move the inode
388 * to s_more_io so it will get more writeout as 508 * to b_more_io so it will get more writeout as
389 * soon as the queue becomes uncongested. 509 * soon as the queue becomes uncongested.
390 */ 510 */
391 inode->i_state |= I_DIRTY_PAGES; 511 inode->i_state |= I_DIRTY_PAGES;
512select_queue:
392 if (wbc->nr_to_write <= 0) { 513 if (wbc->nr_to_write <= 0) {
393 /* 514 /*
394 * slice used up: queue for next turn 515 * slice used up: queue for next turn
@@ -411,12 +532,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
411 inode->i_state |= I_DIRTY_PAGES; 532 inode->i_state |= I_DIRTY_PAGES;
412 redirty_tail(inode); 533 redirty_tail(inode);
413 } 534 }
414 } else if (inode->i_state & I_DIRTY) {
415 /*
416 * Someone redirtied the inode while were writing back
417 * the pages.
418 */
419 redirty_tail(inode);
420 } else if (atomic_read(&inode->i_count)) { 535 } else if (atomic_read(&inode->i_count)) {
421 /* 536 /*
422 * The inode is clean, inuse 537 * The inode is clean, inuse
@@ -433,51 +548,96 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
433 return ret; 548 return ret;
434} 549}
435 550
551static void unpin_sb_for_writeback(struct super_block **psb)
552{
553 struct super_block *sb = *psb;
554
555 if (sb) {
556 up_read(&sb->s_umount);
557 put_super(sb);
558 *psb = NULL;
559 }
560}
561
436/* 562/*
437 * Write out a superblock's list of dirty inodes. A wait will be performed 563 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
438 * upon no inodes, all inodes or the final one, depending upon sync_mode. 564 * before calling writeback. So make sure that we do pin it, so it doesn't
439 * 565 * go away while we are writing inodes from it.
440 * If older_than_this is non-NULL, then only write out inodes which
441 * had their first dirtying at a time earlier than *older_than_this.
442 *
443 * If we're a pdflush thread, then implement pdflush collision avoidance
444 * against the entire list.
445 *
446 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
447 * This function assumes that the blockdev superblock's inodes are backed by
448 * a variety of queues, so all inodes are searched. For other superblocks,
449 * assume that all inodes are backed by the same queue.
450 * 566 *
 451 * FIXME: this linear search could get expensive with many filesystems. But 567 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
452 * how to fix? We need to go from an address_space to all inodes which share 568 * 1 if we failed.
453 * a queue with that address_space. (Easy: have a global "dirty superblocks"
454 * list).
455 *
456 * The inodes to be written are parked on sb->s_io. They are moved back onto
457 * sb->s_dirty as they are selected for writing. This way, none can be missed
458 * on the writer throttling path, and we get decent balancing between many
459 * throttled threads: we don't want them all piling up on inode_sync_wait.
460 */ 569 */
461void generic_sync_sb_inodes(struct super_block *sb, 570static int pin_sb_for_writeback(struct writeback_control *wbc,
571 struct inode *inode, struct super_block **psb)
572{
573 struct super_block *sb = inode->i_sb;
574
575 /*
576 * If this sb is already pinned, nothing more to do. If not and
577 * *psb is non-NULL, unpin the old one first
578 */
579 if (sb == *psb)
580 return 0;
581 else if (*psb)
582 unpin_sb_for_writeback(psb);
583
584 /*
585 * Caller must already hold the ref for this
586 */
587 if (wbc->sync_mode == WB_SYNC_ALL) {
588 WARN_ON(!rwsem_is_locked(&sb->s_umount));
589 return 0;
590 }
591
592 spin_lock(&sb_lock);
593 sb->s_count++;
594 if (down_read_trylock(&sb->s_umount)) {
595 if (sb->s_root) {
596 spin_unlock(&sb_lock);
597 goto pinned;
598 }
599 /*
600 * umounted, drop rwsem again and fall through to failure
601 */
602 up_read(&sb->s_umount);
603 }
604
605 sb->s_count--;
606 spin_unlock(&sb_lock);
607 return 1;
608pinned:
609 *psb = sb;
610 return 0;
611}
612
613static void writeback_inodes_wb(struct bdi_writeback *wb,
462 struct writeback_control *wbc) 614 struct writeback_control *wbc)
463{ 615{
616 struct super_block *sb = wbc->sb, *pin_sb = NULL;
617 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
464 const unsigned long start = jiffies; /* livelock avoidance */ 618 const unsigned long start = jiffies; /* livelock avoidance */
465 int sync = wbc->sync_mode == WB_SYNC_ALL;
466 619
467 spin_lock(&inode_lock); 620 spin_lock(&inode_lock);
468 if (!wbc->for_kupdate || list_empty(&sb->s_io))
469 queue_io(sb, wbc->older_than_this);
470 621
471 while (!list_empty(&sb->s_io)) { 622 if (!wbc->for_kupdate || list_empty(&wb->b_io))
472 struct inode *inode = list_entry(sb->s_io.prev, 623 queue_io(wb, wbc->older_than_this);
624
625 while (!list_empty(&wb->b_io)) {
626 struct inode *inode = list_entry(wb->b_io.prev,
473 struct inode, i_list); 627 struct inode, i_list);
474 struct address_space *mapping = inode->i_mapping;
475 struct backing_dev_info *bdi = mapping->backing_dev_info;
476 long pages_skipped; 628 long pages_skipped;
477 629
478 if (!bdi_cap_writeback_dirty(bdi)) { 630 /*
631 * super block given and doesn't match, skip this inode
632 */
633 if (sb && sb != inode->i_sb) {
479 redirty_tail(inode); 634 redirty_tail(inode);
480 if (sb_is_blkdev_sb(sb)) { 635 continue;
636 }
637
638 if (!bdi_cap_writeback_dirty(wb->bdi)) {
639 redirty_tail(inode);
640 if (is_blkdev_sb) {
481 /* 641 /*
482 * Dirty memory-backed blockdev: the ramdisk 642 * Dirty memory-backed blockdev: the ramdisk
483 * driver does this. Skip just this inode 643 * driver does this. Skip just this inode
@@ -497,21 +657,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
497 continue; 657 continue;
498 } 658 }
499 659
500 if (wbc->nonblocking && bdi_write_congested(bdi)) { 660 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
501 wbc->encountered_congestion = 1; 661 wbc->encountered_congestion = 1;
502 if (!sb_is_blkdev_sb(sb)) 662 if (!is_blkdev_sb)
503 break; /* Skip a congested fs */ 663 break; /* Skip a congested fs */
504 requeue_io(inode); 664 requeue_io(inode);
505 continue; /* Skip a congested blockdev */ 665 continue; /* Skip a congested blockdev */
506 } 666 }
507 667
508 if (wbc->bdi && bdi != wbc->bdi) {
509 if (!sb_is_blkdev_sb(sb))
510 break; /* fs has the wrong queue */
511 requeue_io(inode);
512 continue; /* blockdev has wrong queue */
513 }
514
515 /* 668 /*
516 * Was this inode dirtied after sync_sb_inodes was called? 669 * Was this inode dirtied after sync_sb_inodes was called?
517 * This keeps sync from extra jobs and livelock. 670 * This keeps sync from extra jobs and livelock.
@@ -519,16 +672,15 @@ void generic_sync_sb_inodes(struct super_block *sb,
519 if (inode_dirtied_after(inode, start)) 672 if (inode_dirtied_after(inode, start))
520 break; 673 break;
521 674
522 /* Is another pdflush already flushing this queue? */ 675 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
523 if (current_is_pdflush() && !writeback_acquire(bdi)) 676 requeue_io(inode);
524 break; 677 continue;
678 }
525 679
526 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 680 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
527 __iget(inode); 681 __iget(inode);
528 pages_skipped = wbc->pages_skipped; 682 pages_skipped = wbc->pages_skipped;
529 writeback_single_inode(inode, wbc); 683 writeback_single_inode(inode, wbc);
530 if (current_is_pdflush())
531 writeback_release(bdi);
532 if (wbc->pages_skipped != pages_skipped) { 684 if (wbc->pages_skipped != pages_skipped) {
533 /* 685 /*
534 * writeback is not making progress due to locked 686 * writeback is not making progress due to locked
@@ -544,144 +696,535 @@ void generic_sync_sb_inodes(struct super_block *sb,
544 wbc->more_io = 1; 696 wbc->more_io = 1;
545 break; 697 break;
546 } 698 }
547 if (!list_empty(&sb->s_more_io)) 699 if (!list_empty(&wb->b_more_io))
548 wbc->more_io = 1; 700 wbc->more_io = 1;
549 } 701 }
550 702
551 if (sync) { 703 unpin_sb_for_writeback(&pin_sb);
552 struct inode *inode, *old_inode = NULL;
553 704
705 spin_unlock(&inode_lock);
706 /* Leave any unwritten inodes on b_io */
707}
708
709void writeback_inodes_wbc(struct writeback_control *wbc)
710{
711 struct backing_dev_info *bdi = wbc->bdi;
712
713 writeback_inodes_wb(&bdi->wb, wbc);
714}
715
716/*
717 * The maximum number of pages to writeout in a single bdi flush/kupdate
718 * operation. We do this so we don't hold I_SYNC against an inode for
719 * enormous amounts of time, which would block a userspace task which has
720 * been forced to throttle against that inode. Also, the code reevaluates
 721 * the dirty state each time it has written this many pages.
722 */
723#define MAX_WRITEBACK_PAGES 1024
724
725static inline bool over_bground_thresh(void)
726{
727 unsigned long background_thresh, dirty_thresh;
728
729 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
730
731 return (global_page_state(NR_FILE_DIRTY) +
732 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
733}
734
735/*
736 * Explicit flushing or periodic writeback of "old" data.
737 *
738 * Define "old": the first time one of an inode's pages is dirtied, we mark the
739 * dirtying-time in the inode's address_space. So this periodic writeback code
740 * just walks the superblock inode list, writing back any inodes which are
741 * older than a specific point in time.
742 *
743 * Try to run once per dirty_writeback_interval. But if a writeback event
 744 * takes longer than one dirty_writeback_interval, then leave a
745 * one-second gap.
746 *
747 * older_than_this takes precedence over nr_to_write. So we'll only write back
748 * all dirty pages if they are all attached to "old" mappings.
749 */
750static long wb_writeback(struct bdi_writeback *wb,
751 struct wb_writeback_args *args)
752{
753 struct writeback_control wbc = {
754 .bdi = wb->bdi,
755 .sb = args->sb,
756 .sync_mode = args->sync_mode,
757 .older_than_this = NULL,
758 .for_kupdate = args->for_kupdate,
759 .range_cyclic = args->range_cyclic,
760 };
761 unsigned long oldest_jif;
762 long wrote = 0;
763 struct inode *inode;
764
765 if (wbc.for_kupdate) {
766 wbc.older_than_this = &oldest_jif;
767 oldest_jif = jiffies -
768 msecs_to_jiffies(dirty_expire_interval * 10);
769 }
770 if (!wbc.range_cyclic) {
771 wbc.range_start = 0;
772 wbc.range_end = LLONG_MAX;
773 }
774
775 for (;;) {
554 /* 776 /*
555 * Data integrity sync. Must wait for all pages under writeback, 777 * Stop writeback when nr_pages has been consumed
556 * because there may have been pages dirtied before our sync
557 * call, but which had writeout started before we write it out.
558 * In which case, the inode may not be on the dirty list, but
559 * we still have to wait for that writeout.
560 */ 778 */
561 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 779 if (args->nr_pages <= 0)
562 struct address_space *mapping; 780 break;
563
564 if (inode->i_state &
565 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
566 continue;
567 mapping = inode->i_mapping;
568 if (mapping->nrpages == 0)
569 continue;
570 __iget(inode);
571 spin_unlock(&inode_lock);
572 /*
573 * We hold a reference to 'inode' so it couldn't have
574 * been removed from s_inodes list while we dropped the
575 * inode_lock. We cannot iput the inode now as we can
576 * be holding the last reference and we cannot iput it
577 * under inode_lock. So we keep the reference and iput
578 * it later.
579 */
580 iput(old_inode);
581 old_inode = inode;
582 781
583 filemap_fdatawait(mapping); 782 /*
783 * For background writeout, stop when we are below the
784 * background dirty threshold
785 */
786 if (args->for_background && !over_bground_thresh())
787 break;
584 788
585 cond_resched(); 789 wbc.more_io = 0;
790 wbc.encountered_congestion = 0;
791 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
792 wbc.pages_skipped = 0;
793 writeback_inodes_wb(wb, &wbc);
794 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
795 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
586 796
587 spin_lock(&inode_lock); 797 /*
798 * If we consumed everything, see if we have more
799 */
800 if (wbc.nr_to_write <= 0)
801 continue;
802 /*
803 * Didn't write everything and we don't have more IO, bail
804 */
805 if (!wbc.more_io)
806 break;
807 /*
808 * Did we write something? Try for more
809 */
810 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
811 continue;
812 /*
813 * Nothing written. Wait for some inode to
814 * become available for writeback. Otherwise
815 * we'll just busyloop.
816 */
817 spin_lock(&inode_lock);
818 if (!list_empty(&wb->b_more_io)) {
819 inode = list_entry(wb->b_more_io.prev,
820 struct inode, i_list);
821 inode_wait_for_writeback(inode);
588 } 822 }
589 spin_unlock(&inode_lock); 823 spin_unlock(&inode_lock);
590 iput(old_inode); 824 }
591 } else
592 spin_unlock(&inode_lock);
593 825
594 return; /* Leave any unwritten inodes on s_io */ 826 return wrote;
595} 827}
596EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
597 828
598static void sync_sb_inodes(struct super_block *sb, 829/*
599 struct writeback_control *wbc) 830 * Return the next bdi_work struct that hasn't been processed by this
831 * wb thread yet. ->seen is initially set for each thread that exists
 832 * for this device; when a thread first notices a piece of work, it
833 * clears its bit. Depending on writeback type, the thread will notify
834 * completion on either receiving the work (WB_SYNC_NONE) or after
835 * it is done (WB_SYNC_ALL).
836 */
837static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
838 struct bdi_writeback *wb)
839{
840 struct bdi_work *work, *ret = NULL;
841
842 rcu_read_lock();
843
844 list_for_each_entry_rcu(work, &bdi->work_list, list) {
845 if (!test_bit(wb->nr, &work->seen))
846 continue;
847 clear_bit(wb->nr, &work->seen);
848
849 ret = work;
850 break;
851 }
852
853 rcu_read_unlock();
854 return ret;
855}
856
857static long wb_check_old_data_flush(struct bdi_writeback *wb)
858{
859 unsigned long expired;
860 long nr_pages;
861
862 expired = wb->last_old_flush +
863 msecs_to_jiffies(dirty_writeback_interval * 10);
864 if (time_before(jiffies, expired))
865 return 0;
866
867 wb->last_old_flush = jiffies;
868 nr_pages = global_page_state(NR_FILE_DIRTY) +
869 global_page_state(NR_UNSTABLE_NFS) +
870 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
871
872 if (nr_pages) {
873 struct wb_writeback_args args = {
874 .nr_pages = nr_pages,
875 .sync_mode = WB_SYNC_NONE,
876 .for_kupdate = 1,
877 .range_cyclic = 1,
878 };
879
880 return wb_writeback(wb, &args);
881 }
882
883 return 0;
884}
885
886/*
887 * Retrieve work items and do the writeback they describe
888 */
889long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
890{
891 struct backing_dev_info *bdi = wb->bdi;
892 struct bdi_work *work;
893 long wrote = 0;
894
895 while ((work = get_next_work_item(bdi, wb)) != NULL) {
896 struct wb_writeback_args args = work->args;
897
898 /*
899 * Override sync mode, in case we must wait for completion
900 */
901 if (force_wait)
902 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
903
904 /*
905 * If this isn't a data integrity operation, just notify
906 * that we have seen this work and we are now starting it.
907 */
908 if (args.sync_mode == WB_SYNC_NONE)
909 wb_clear_pending(wb, work);
910
911 wrote += wb_writeback(wb, &args);
912
913 /*
914 * This is a data integrity writeback, so only do the
915 * notification when we have completed the work.
916 */
917 if (args.sync_mode == WB_SYNC_ALL)
918 wb_clear_pending(wb, work);
919 }
920
921 /*
922 * Check for periodic writeback, kupdated() style
923 */
924 wrote += wb_check_old_data_flush(wb);
925
926 return wrote;
927}
928
929/*
930 * Handle writeback of dirty data for the device backed by this bdi. Also
931 * wakes up periodically and does kupdated style flushing.
932 */
933int bdi_writeback_task(struct bdi_writeback *wb)
934{
935 unsigned long last_active = jiffies;
936 unsigned long wait_jiffies = -1UL;
937 long pages_written;
938
939 while (!kthread_should_stop()) {
940 pages_written = wb_do_writeback(wb, 0);
941
942 if (pages_written)
943 last_active = jiffies;
944 else if (wait_jiffies != -1UL) {
945 unsigned long max_idle;
946
947 /*
948 * Longest period of inactivity that we tolerate. If we
949 * see dirty data again later, the task will get
950 * recreated automatically.
951 */
952 max_idle = max(5UL * 60 * HZ, wait_jiffies);
953 if (time_after(jiffies, max_idle + last_active))
954 break;
955 }
956
957 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
958 schedule_timeout_interruptible(wait_jiffies);
959 try_to_freeze();
960 }
961
962 return 0;
963}
964
965/*
966 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
967 * writeback, for integrity writeback see bdi_sync_writeback().
968 */
969static void bdi_writeback_all(struct super_block *sb, long nr_pages)
600{ 970{
601 generic_sync_sb_inodes(sb, wbc); 971 struct wb_writeback_args args = {
972 .sb = sb,
973 .nr_pages = nr_pages,
974 .sync_mode = WB_SYNC_NONE,
975 };
976 struct backing_dev_info *bdi;
977
978 rcu_read_lock();
979
980 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
981 if (!bdi_has_dirty_io(bdi))
982 continue;
983
984 bdi_alloc_queue_work(bdi, &args);
985 }
986
987 rcu_read_unlock();
602} 988}
603 989
604/* 990/*
605 * Start writeback of dirty pagecache data against all unlocked inodes. 991 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
992 * the whole world.
993 */
994void wakeup_flusher_threads(long nr_pages)
995{
996 if (nr_pages == 0)
997 nr_pages = global_page_state(NR_FILE_DIRTY) +
998 global_page_state(NR_UNSTABLE_NFS);
999 bdi_writeback_all(NULL, nr_pages);
1000}
1001
1002static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1003{
1004 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
1005 struct dentry *dentry;
1006 const char *name = "?";
1007
1008 dentry = d_find_alias(inode);
1009 if (dentry) {
1010 spin_lock(&dentry->d_lock);
1011 name = (const char *) dentry->d_name.name;
1012 }
1013 printk(KERN_DEBUG
1014 "%s(%d): dirtied inode %lu (%s) on %s\n",
1015 current->comm, task_pid_nr(current), inode->i_ino,
1016 name, inode->i_sb->s_id);
1017 if (dentry) {
1018 spin_unlock(&dentry->d_lock);
1019 dput(dentry);
1020 }
1021 }
1022}
1023
1024/**
1025 * __mark_inode_dirty - internal function
1026 * @inode: inode to mark
1027 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
1028 * Mark an inode as dirty. Callers should use mark_inode_dirty or
1029 * mark_inode_dirty_sync.
1030 *
1031 * Put the inode on the super block's dirty list.
1032 *
1033 * CAREFUL! We mark it dirty unconditionally, but move it onto the
1034 * dirty list only if it is hashed or if it refers to a blockdev.
1035 * If it was not hashed, it will never be added to the dirty list
1036 * even if it is later hashed, as it will have been marked dirty already.
606 * 1037 *
607 * Note: 1038 * In short, make sure you hash any inodes _before_ you start marking
608 * We don't need to grab a reference to superblock here. If it has non-empty 1039 * them dirty.
 609 * ->s_dirty it hasn't been killed yet and kill_super() won't proceed
 610 * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
 611 * empty. Since __sync_single_inode() regains inode_lock before it finally moves
 612 * inode from superblock lists we are OK.
613 * 1040 *
614 * If `older_than_this' is non-zero then only flush inodes which have a 1041 * This function *must* be atomic for the I_DIRTY_PAGES case -
615 * flushtime older than *older_than_this. 1042 * set_page_dirty() is called under spinlock in several places.
616 * 1043 *
617 * If `bdi' is non-zero then we will scan the first inode against each 1044 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
618 * superblock until we find the matching ones. One group will be the dirty 1045 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
619 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 1046 * the kernel-internal blockdev inode represents the dirtying time of the
620 * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not 1047 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
621 * super-efficient but we're about to do a ton of I/O... 1048 * page->mapping->host, so the page-dirtying time is recorded in the internal
1049 * blockdev inode.
622 */ 1050 */
623void 1051void __mark_inode_dirty(struct inode *inode, int flags)
624writeback_inodes(struct writeback_control *wbc)
625{ 1052{
626 struct super_block *sb; 1053 struct super_block *sb = inode->i_sb;
627 1054
628 might_sleep(); 1055 /*
629 spin_lock(&sb_lock); 1056 * Don't do this for I_DIRTY_PAGES - that doesn't actually
630restart: 1057 * dirty the inode itself
631 list_for_each_entry_reverse(sb, &super_blocks, s_list) { 1058 */
632 if (sb_has_dirty_inodes(sb)) { 1059 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
633 /* we're making our own get_super here */ 1060 if (sb->s_op->dirty_inode)
634 sb->s_count++; 1061 sb->s_op->dirty_inode(inode);
635 spin_unlock(&sb_lock); 1062 }
636 /* 1063
637 * If we can't get the readlock, there's no sense in 1064 /*
638 * waiting around, most of the time the FS is going to 1065 * make sure that changes are seen by all cpus before we test i_state
639 * be unmounted by the time it is released. 1066 * -- mikulas
640 */ 1067 */
641 if (down_read_trylock(&sb->s_umount)) { 1068 smp_mb();
642 if (sb->s_root) 1069
643 sync_sb_inodes(sb, wbc); 1070 /* avoid the locking if we can */
644 up_read(&sb->s_umount); 1071 if ((inode->i_state & flags) == flags)
1072 return;
1073
1074 if (unlikely(block_dump))
1075 block_dump___mark_inode_dirty(inode);
1076
1077 spin_lock(&inode_lock);
1078 if ((inode->i_state & flags) != flags) {
1079 const int was_dirty = inode->i_state & I_DIRTY;
1080
1081 inode->i_state |= flags;
1082
1083 /*
1084 * If the inode is being synced, just update its dirty state.
1085 * The unlocker will place the inode on the appropriate
1086 * superblock list, based upon its state.
1087 */
1088 if (inode->i_state & I_SYNC)
1089 goto out;
1090
1091 /*
1092 * Only add valid (hashed) inodes to the superblock's
1093 * dirty list. Add blockdev inodes as well.
1094 */
1095 if (!S_ISBLK(inode->i_mode)) {
1096 if (hlist_unhashed(&inode->i_hash))
1097 goto out;
1098 }
1099 if (inode->i_state & (I_FREEING|I_CLEAR))
1100 goto out;
1101
1102 /*
1103 * If the inode was already on b_dirty/b_io/b_more_io, don't
1104 * reposition it (that would break b_dirty time-ordering).
1105 */
1106 if (!was_dirty) {
1107 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1108 struct backing_dev_info *bdi = wb->bdi;
1109
1110 if (bdi_cap_writeback_dirty(bdi) &&
1111 !test_bit(BDI_registered, &bdi->state)) {
1112 WARN_ON(1);
1113 printk(KERN_ERR "bdi-%s not registered\n",
1114 bdi->name);
645 } 1115 }
646 spin_lock(&sb_lock); 1116
647 if (__put_super_and_need_restart(sb)) 1117 inode->dirtied_when = jiffies;
648 goto restart; 1118 list_move(&inode->i_list, &wb->b_dirty);
649 } 1119 }
650 if (wbc->nr_to_write <= 0)
651 break;
652 } 1120 }
653 spin_unlock(&sb_lock); 1121out:
1122 spin_unlock(&inode_lock);
654} 1123}
1124EXPORT_SYMBOL(__mark_inode_dirty);
655 1125
656/* 1126/*
657 * writeback and wait upon the filesystem's dirty inodes. The caller will 1127 * Write out a superblock's list of dirty inodes. A wait will be performed
658 * do this in two passes - one to write, and one to wait. 1128 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1129 *
1130 * If older_than_this is non-NULL, then only write out inodes which
1131 * had their first dirtying at a time earlier than *older_than_this.
659 * 1132 *
660 * A finite limit is set on the number of pages which will be written. 1133 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
661 * To prevent infinite livelock of sys_sync(). 1134 * This function assumes that the blockdev superblock's inodes are backed by
1135 * a variety of queues, so all inodes are searched. For other superblocks,
1136 * assume that all inodes are backed by the same queue.
662 * 1137 *
663 * We add in the number of potentially dirty inodes, because each inode write 1138 * The inodes to be written are parked on bdi->b_io. They are moved back onto
664 * can dirty pagecache in the underlying blockdev. 1139 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1140 * on the writer throttling path, and we get decent balancing between many
1141 * throttled threads: we don't want them all piling up on inode_sync_wait.
665 */ 1142 */
666void sync_inodes_sb(struct super_block *sb, int wait) 1143static void wait_sb_inodes(struct super_block *sb)
667{ 1144{
668 struct writeback_control wbc = { 1145 struct inode *inode, *old_inode = NULL;
669 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 1146
670 .range_start = 0, 1147 /*
671 .range_end = LLONG_MAX, 1148 * We need to be protected against the filesystem going from
672 }; 1149 * r/o to r/w or vice versa.
1150 */
1151 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1152
1153 spin_lock(&inode_lock);
673 1154
674 if (!wait) { 1155 /*
675 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1156 * Data integrity sync. Must wait for all pages under writeback,
676 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1157 * because there may have been pages dirtied before our sync
1158 * call, but which had writeout started before we write it out.
1159 * In which case, the inode may not be on the dirty list, but
1160 * we still have to wait for that writeout.
1161 */
1162 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1163 struct address_space *mapping;
677 1164
678 wbc.nr_to_write = nr_dirty + nr_unstable + 1165 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1166 continue;
1167 mapping = inode->i_mapping;
1168 if (mapping->nrpages == 0)
1169 continue;
1170 __iget(inode);
1171 spin_unlock(&inode_lock);
1172 /*
1173 * We hold a reference to 'inode' so it couldn't have
1174 * been removed from s_inodes list while we dropped the
1175 * inode_lock. We cannot iput the inode now as we can
1176 * be holding the last reference and we cannot iput it
1177 * under inode_lock. So we keep the reference and iput
1178 * it later.
1179 */
1180 iput(old_inode);
1181 old_inode = inode;
1182
1183 filemap_fdatawait(mapping);
1184
1185 cond_resched();
1186
1187 spin_lock(&inode_lock);
1188 }
1189 spin_unlock(&inode_lock);
1190 iput(old_inode);
1191}
1192
1193/**
1194 * writeback_inodes_sb - writeback dirty inodes from given super_block
1195 * @sb: the superblock
1196 *
1197 * Start writeback on some inodes on this super_block. No guarantees are made
1198 * on how many (if any) will be written, and this function does not wait
 1199 * for IO completion of submitted IO. The writeback is merely queued
 1200 * to the bdi flusher thread.
1201 */
1202void writeback_inodes_sb(struct super_block *sb)
1203{
1204 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1205 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1206 long nr_to_write;
1207
1208 nr_to_write = nr_dirty + nr_unstable +
679 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1209 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
680 } else
681 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
682 1210
683 sync_sb_inodes(sb, &wbc); 1211 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
684} 1212}
1213EXPORT_SYMBOL(writeback_inodes_sb);
1214
1215/**
1216 * sync_inodes_sb - sync sb inode pages
1217 * @sb: the superblock
1218 *
1219 * This function writes and waits on any dirty inode belonging to this
 1220 * super_block.
1221 */
1222void sync_inodes_sb(struct super_block *sb)
1223{
1224 bdi_sync_writeback(sb->s_bdi, sb);
1225 wait_sb_inodes(sb);
1226}
1227EXPORT_SYMBOL(sync_inodes_sb);
685 1228
686/** 1229/**
687 * write_inode_now - write an inode to disk 1230 * write_inode_now - write an inode to disk
@@ -737,57 +1280,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
737 return ret; 1280 return ret;
738} 1281}
739EXPORT_SYMBOL(sync_inode); 1282EXPORT_SYMBOL(sync_inode);
740
741/**
742 * generic_osync_inode - flush all dirty data for a given inode to disk
743 * @inode: inode to write
744 * @mapping: the address_space that should be flushed
745 * @what: what to write and wait upon
746 *
747 * This can be called by file_write functions for files which have the
748 * O_SYNC flag set, to flush dirty writes to disk.
749 *
750 * @what is a bitmask, specifying which part of the inode's data should be
751 * written and waited upon.
752 *
753 * OSYNC_DATA: i_mapping's dirty data
754 * OSYNC_METADATA: the buffers at i_mapping->private_list
755 * OSYNC_INODE: the inode itself
756 */
757
758int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
759{
760 int err = 0;
761 int need_write_inode_now = 0;
762 int err2;
763
764 if (what & OSYNC_DATA)
765 err = filemap_fdatawrite(mapping);
766 if (what & (OSYNC_METADATA|OSYNC_DATA)) {
767 err2 = sync_mapping_buffers(mapping);
768 if (!err)
769 err = err2;
770 }
771 if (what & OSYNC_DATA) {
772 err2 = filemap_fdatawait(mapping);
773 if (!err)
774 err = err2;
775 }
776
777 spin_lock(&inode_lock);
778 if ((inode->i_state & I_DIRTY) &&
779 ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
780 need_write_inode_now = 1;
781 spin_unlock(&inode_lock);
782
783 if (need_write_inode_now) {
784 err2 = write_inode_now(inode, 1);
785 if (!err)
786 err = err2;
787 }
788 else
789 inode_sync_wait(inode);
790
791 return err;
792}
793EXPORT_SYMBOL(generic_osync_inode);
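Taken together, writeback_inodes_sb() and sync_inodes_sb() replace the old generic_sync_sb_inodes() entry point. A sketch of how a filesystem's ->sync_fs() maps onto the pair (examplefs_sync_fs is a hypothetical name):

    static int examplefs_sync_fs(struct super_block *sb, int wait)
    {
            if (!wait) {
                    /* WB_SYNC_NONE: queue work for the bdi flusher and
                     * return; no completion guarantee. */
                    writeback_inodes_sb(sb);
                    return 0;
            }

            /* WB_SYNC_ALL: bdi_sync_writeback() runs under s_umount,
             * then wait_sb_inodes() waits out in-flight pages. */
            sync_inodes_sb(sb);
            return 0;
    }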
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
61 return simple_read_from_buffer(buf, len, ppos, tmp, size); 61 return simple_read_from_buffer(buf, len, ppos, tmp, size);
62} 62}
63 63
64static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
65 size_t len, loff_t *ppos, unsigned val)
66{
67 char tmp[32];
68 size_t size = sprintf(tmp, "%u\n", val);
69
70 return simple_read_from_buffer(buf, len, ppos, tmp, size);
71}
72
73static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
74 size_t count, loff_t *ppos, unsigned *val,
75 unsigned global_limit)
76{
77 unsigned long t;
78 char tmp[32];
79 unsigned limit = (1 << 16) - 1;
80 int err;
81
82 if (*ppos || count >= sizeof(tmp) - 1)
83 return -EINVAL;
84
85 if (copy_from_user(tmp, buf, count))
86 return -EINVAL;
87
88 tmp[count] = '\0';
89
90 err = strict_strtoul(tmp, 0, &t);
91 if (err)
92 return err;
93
94 if (!capable(CAP_SYS_ADMIN))
95 limit = min(limit, global_limit);
96
97 if (t > limit)
98 return -EINVAL;
99
100 *val = t;
101
102 return count;
103}
104
105static ssize_t fuse_conn_max_background_read(struct file *file,
106 char __user *buf, size_t len,
107 loff_t *ppos)
108{
109 struct fuse_conn *fc;
110 unsigned val;
111
112 fc = fuse_ctl_file_conn_get(file);
113 if (!fc)
114 return 0;
115
116 val = fc->max_background;
117 fuse_conn_put(fc);
118
119 return fuse_conn_limit_read(file, buf, len, ppos, val);
120}
121
122static ssize_t fuse_conn_max_background_write(struct file *file,
123 const char __user *buf,
124 size_t count, loff_t *ppos)
125{
126 unsigned val;
127 ssize_t ret;
128
129 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
130 max_user_bgreq);
131 if (ret > 0) {
132 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
133 if (fc) {
134 fc->max_background = val;
135 fuse_conn_put(fc);
136 }
137 }
138
139 return ret;
140}
141
142static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
143 char __user *buf, size_t len,
144 loff_t *ppos)
145{
146 struct fuse_conn *fc;
147 unsigned val;
148
149 fc = fuse_ctl_file_conn_get(file);
150 if (!fc)
151 return 0;
152
153 val = fc->congestion_threshold;
154 fuse_conn_put(fc);
155
156 return fuse_conn_limit_read(file, buf, len, ppos, val);
157}
158
159static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
160 const char __user *buf,
161 size_t count, loff_t *ppos)
162{
163 unsigned val;
164 ssize_t ret;
165
166 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
167 max_user_congthresh);
168 if (ret > 0) {
169 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
170 if (fc) {
171 fc->congestion_threshold = val;
172 fuse_conn_put(fc);
173 }
174 }
175
176 return ret;
177}
178
64static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
65 .open = nonseekable_open, 180 .open = nonseekable_open,
66 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
71 .read = fuse_conn_waiting_read, 186 .read = fuse_conn_waiting_read,
72}; 187};
73 188
189static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write,
193};
194
195static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write,
199};
200
74static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
75 struct fuse_conn *fc, 202 struct fuse_conn *fc,
76 const char *name, 203 const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
127 goto err; 254 goto err;
128 255
129 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, 256 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
130 NULL, &fuse_ctl_waiting_ops) || 257 NULL, &fuse_ctl_waiting_ops) ||
131 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, 258 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
132 NULL, &fuse_ctl_abort_ops)) 259 NULL, &fuse_ctl_abort_ops) ||
260 !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
261 1, NULL, &fuse_conn_max_background_ops) ||
262 !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
263 S_IFREG | 0600, 1, NULL,
264 &fuse_conn_congestion_threshold_ops))
133 goto err; 265 goto err;
134 266
135 return 0; 267 return 0;
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
156 d_drop(dentry); 288 d_drop(dentry);
157 dput(dentry); 289 dput(dentry);
158 } 290 }
159 fuse_control_sb->s_root->d_inode->i_nlink--; 291 drop_nlink(fuse_control_sb->s_root->d_inode);
160} 292}
161 293
162static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) 294static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
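With these two files, the former FUSE_MAX_BACKGROUND / FUSE_CONGESTION_THRESHOLD constants become per-connection tunables. A sketch of raising a connection's limit from userspace — the fusectl mount point /sys/fs/fuse/connections is the usual one, while the connection id 42 is illustrative; per fuse_conn_limit_write(), raising the value past max_user_bgreq needs CAP_SYS_ADMIN:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/fs/fuse/connections/42/max_background",
                          O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, "24\n", 3) != 3)  /* parsed by strict_strtoul() */
                    perror("write");
            close(fd);
            return 0;
    }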
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6484eb75acd6..51d9e33d634f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
250 250
251static void flush_bg_queue(struct fuse_conn *fc) 251static void flush_bg_queue(struct fuse_conn *fc)
252{ 252{
253 while (fc->active_background < FUSE_MAX_BACKGROUND && 253 while (fc->active_background < fc->max_background &&
254 !list_empty(&fc->bg_queue)) { 254 !list_empty(&fc->bg_queue)) {
255 struct fuse_req *req; 255 struct fuse_req *req;
256 256
@@ -280,11 +280,11 @@ __releases(&fc->lock)
280 list_del(&req->intr_entry); 280 list_del(&req->intr_entry);
281 req->state = FUSE_REQ_FINISHED; 281 req->state = FUSE_REQ_FINISHED;
282 if (req->background) { 282 if (req->background) {
283 if (fc->num_background == FUSE_MAX_BACKGROUND) { 283 if (fc->num_background == fc->max_background) {
284 fc->blocked = 0; 284 fc->blocked = 0;
285 wake_up_all(&fc->blocked_waitq); 285 wake_up_all(&fc->blocked_waitq);
286 } 286 }
287 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 287 if (fc->num_background == fc->congestion_threshold &&
288 fc->connected && fc->bdi_initialized) { 288 fc->connected && fc->bdi_initialized) {
289 clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); 289 clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
290 clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); 290 clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
410{ 410{
411 req->background = 1; 411 req->background = 1;
412 fc->num_background++; 412 fc->num_background++;
413 if (fc->num_background == FUSE_MAX_BACKGROUND) 413 if (fc->num_background == fc->max_background)
414 fc->blocked = 1; 414 fc->blocked = 1;
415 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 415 if (fc->num_background == fc->congestion_threshold &&
416 fc->bdi_initialized) { 416 fc->bdi_initialized) {
417 set_bdi_congested(&fc->bdi, BLK_RW_SYNC); 417 set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
418 set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); 418 set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
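
Note: both dev.c hunks swap the compile-time FUSE_MAX_BACKGROUND / FUSE_CONGESTION_THRESHOLD constants for the new per-connection fields. Reduced to its essentials, the policy looks like this (standalone C model, not kernel code; the set/clear_bdi_congested() calls are collapsed into a flag, and the decrement of num_background sits outside the hunk shown above):

	struct conn {
		unsigned num_background;	/* background requests in flight */
		unsigned max_background;	/* block submitters at this count */
		unsigned congestion_threshold;	/* mark the bdi congested here */
		int blocked;
		int congested;
	};

	/* mirrors fuse_request_send_nowait_locked() above */
	static void bg_enqueue(struct conn *c)
	{
		c->num_background++;
		if (c->num_background == c->max_background)
			c->blocked = 1;
		if (c->num_background == c->congestion_threshold)
			c->congested = 1;
	}

	/* mirrors the request_end() hunk above */
	static void bg_complete(struct conn *c)
	{
		if (c->num_background == c->max_background)
			c->blocked = 0;
		if (c->num_background == c->congestion_threshold)
			c->congested = 0;
		c->num_background--;
	}
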
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e703654e7f40..992f6c9410bb 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1276,14 +1276,9 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1276 return 0; 1276 return 0;
1277 1277
1278 if (attr->ia_valid & ATTR_SIZE) { 1278 if (attr->ia_valid & ATTR_SIZE) {
1279 unsigned long limit; 1279 err = inode_newsize_ok(inode, attr->ia_size);
1280 if (IS_SWAPFILE(inode)) 1280 if (err)
1281 return -ETXTBSY; 1281 return err;
1282 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1283 if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
1284 send_sig(SIGXFSZ, current, 0);
1285 return -EFBIG;
1286 }
1287 is_truncate = true; 1282 is_truncate = true;
1288 } 1283 }
1289 1284
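
Note: inode_newsize_ok() bundles the checks the removed lines open-coded: the RLIMIT_FSIZE test (with SIGXFSZ) when a file grows, and the swapfile test when it shrinks. A simplified model of its logic, an assumption drawn from the generic 2.6.32 helper with signal delivery left out:

	#include <errno.h>

	static int newsize_ok(long long oldsize, long long newsize,
			      long long fsize_limit, int is_swapfile)
	{
		if (newsize > oldsize) {
			/* growth: honour RLIMIT_FSIZE (real code also sends SIGXFSZ) */
			if (fsize_limit >= 0 && newsize > fsize_limit)
				return -EFBIG;
		} else if (is_swapfile) {
			/* shrink: never truncate an active swapfile */
			return -ETXTBSY;
		}
		return 0;
	}
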
@@ -1350,8 +1345,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1350 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 1345 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
1351 */ 1346 */
1352 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { 1347 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
1353 if (outarg.attr.size < oldsize) 1348 truncate_pagecache(inode, oldsize, outarg.attr.size);
1354 fuse_truncate(inode->i_mapping, outarg.attr.size);
1355 invalidate_inode_pages2(inode->i_mapping); 1349 invalidate_inode_pages2(inode->i_mapping);
1356 } 1350 }
1357 1351
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index cbc464043b6f..a3492f7d207c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1313,7 +1313,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1313 return 0; 1313 return 0;
1314} 1314}
1315 1315
1316static struct vm_operations_struct fuse_file_vm_ops = { 1316static const struct vm_operations_struct fuse_file_vm_ops = {
1317 .close = fuse_vma_close, 1317 .close = fuse_vma_close,
1318 .fault = filemap_fault, 1318 .fault = filemap_fault,
1319 .page_mkwrite = fuse_page_mkwrite, 1319 .page_mkwrite = fuse_page_mkwrite,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..01cc462ff45d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
25/** Max number of pages that can be used in a single read request */ 25/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 26#define FUSE_MAX_PAGES_PER_REQ 32
27 27
28/** Maximum number of outstanding background requests */
29#define FUSE_MAX_BACKGROUND 12
30
31/** Congestion starts at 75% of maximum */
32#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
33
34/** Bias for fi->writectr, meaning new writepages must not be sent */ 28/** Bias for fi->writectr, meaning new writepages must not be sent */
35#define FUSE_NOWRITE INT_MIN 29#define FUSE_NOWRITE INT_MIN
36 30
@@ -38,7 +32,7 @@
38#define FUSE_NAME_MAX 1024 32#define FUSE_NAME_MAX 1024
39 33
40/** Number of dentries for each connection in the control filesystem */ 34/** Number of dentries for each connection in the control filesystem */
41#define FUSE_CTL_NUM_DENTRIES 3 35#define FUSE_CTL_NUM_DENTRIES 5
42 36
43/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem 37/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
44 module will check permissions based on the file mode. Otherwise no 38 module will check permissions based on the file mode. Otherwise no
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list;
55/** Global mutex protecting fuse_conn_list and the control filesystem */ 49/** Global mutex protecting fuse_conn_list and the control filesystem */
56extern struct mutex fuse_mutex; 50extern struct mutex fuse_mutex;
57 51
52/** Module parameters */
53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh;
55
58/** FUSE inode */ 56/** FUSE inode */
59struct fuse_inode { 57struct fuse_inode {
60 /** Inode data */ 58 /** Inode data */
@@ -349,6 +347,12 @@ struct fuse_conn {
349 /** rbtree of fuse_files waiting for poll events indexed by ph */ 347 /** rbtree of fuse_files waiting for poll events indexed by ph */
350 struct rb_root polled_files; 348 struct rb_root polled_files;
351 349
350 /** Maximum number of outstanding background requests */
351 unsigned max_background;
352
353 /** Number of background requests at which congestion starts */
354 unsigned congestion_threshold;
355
352 /** Number of requests currently in the background */ 356 /** Number of requests currently in the background */
353 unsigned num_background; 357 unsigned num_background;
354 358
@@ -602,8 +606,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
602void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, 606void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
603 u64 attr_valid); 607 u64 attr_valid);
604 608
605void fuse_truncate(struct address_space *mapping, loff_t offset);
606
607/** 609/**
608 * Initialize the client device 610 * Initialize the client device
609 */ 611 */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..1a822ce2b24b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/moduleparam.h>
17#include <linux/parser.h> 18#include <linux/parser.h>
18#include <linux/statfs.h> 19#include <linux/statfs.h>
19#include <linux/random.h> 20#include <linux/random.h>
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep;
28struct list_head fuse_conn_list; 29struct list_head fuse_conn_list;
29DEFINE_MUTEX(fuse_mutex); 30DEFINE_MUTEX(fuse_mutex);
30 31
32static int set_global_limit(const char *val, struct kernel_param *kp);
33
34unsigned max_user_bgreq;
35module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
36 &max_user_bgreq, 0644);
37__MODULE_PARM_TYPE(max_user_bgreq, "uint");
38MODULE_PARM_DESC(max_user_bgreq,
39 "Global limit for the maximum number of backgrounded requests an "
40 "unprivileged user can set");
41
42unsigned max_user_congthresh;
43module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
44 &max_user_congthresh, 0644);
45__MODULE_PARM_TYPE(max_user_congthresh, "uint");
46MODULE_PARM_DESC(max_user_congthresh,
47 "Global limit for the maximum congestion threshold an "
48 "unprivileged user can set");
49
31#define FUSE_SUPER_MAGIC 0x65735546 50#define FUSE_SUPER_MAGIC 0x65735546
32 51
33#define FUSE_DEFAULT_BLKSIZE 512 52#define FUSE_DEFAULT_BLKSIZE 512
34 53
54/** Maximum number of outstanding background requests */
55#define FUSE_DEFAULT_MAX_BACKGROUND 12
56
57/** Congestion starts at 75% of maximum */
58#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
59
35struct fuse_mount_data { 60struct fuse_mount_data {
36 int fd; 61 int fd;
37 unsigned rootmode; 62 unsigned rootmode;
@@ -115,14 +140,6 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
115 return 0; 140 return 0;
116} 141}
117 142
118void fuse_truncate(struct address_space *mapping, loff_t offset)
119{
120 /* See vmtruncate() */
121 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
122 truncate_inode_pages(mapping, offset);
123 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
124}
125
126void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, 143void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
127 u64 attr_valid) 144 u64 attr_valid)
128{ 145{
@@ -180,8 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
180 spin_unlock(&fc->lock); 197 spin_unlock(&fc->lock);
181 198
182 if (S_ISREG(inode->i_mode) && oldsize != attr->size) { 199 if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
183 if (attr->size < oldsize) 200 truncate_pagecache(inode, oldsize, attr->size);
184 fuse_truncate(inode->i_mapping, attr->size);
185 invalidate_inode_pages2(inode->i_mapping); 201 invalidate_inode_pages2(inode->i_mapping);
186 } 202 }
187} 203}
@@ -517,6 +533,8 @@ void fuse_conn_init(struct fuse_conn *fc)
517 INIT_LIST_HEAD(&fc->bg_queue); 533 INIT_LIST_HEAD(&fc->bg_queue);
518 INIT_LIST_HEAD(&fc->entry); 534 INIT_LIST_HEAD(&fc->entry);
519 atomic_set(&fc->num_waiting, 0); 535 atomic_set(&fc->num_waiting, 0);
536 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
537 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
520 fc->khctr = 0; 538 fc->khctr = 0;
521 fc->polled_files = RB_ROOT; 539 fc->polled_files = RB_ROOT;
522 fc->reqctr = 0; 540 fc->reqctr = 0;
@@ -727,6 +745,54 @@ static const struct super_operations fuse_super_operations = {
727 .show_options = fuse_show_options, 745 .show_options = fuse_show_options,
728}; 746};
729 747
748static void sanitize_global_limit(unsigned *limit)
749{
750 if (*limit == 0)
751 *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
752 sizeof(struct fuse_req);
753
754 if (*limit >= 1 << 16)
755 *limit = (1 << 16) - 1;
756}
757
758static int set_global_limit(const char *val, struct kernel_param *kp)
759{
760 int rv;
761
762 rv = param_set_uint(val, kp);
763 if (rv)
764 return rv;
765
766 sanitize_global_limit((unsigned *)kp->arg);
767
768 return 0;
769}
770
771static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
772{
773 int cap_sys_admin = capable(CAP_SYS_ADMIN);
774
775 if (arg->minor < 13)
776 return;
777
778 sanitize_global_limit(&max_user_bgreq);
779 sanitize_global_limit(&max_user_congthresh);
780
781 if (arg->max_background) {
782 fc->max_background = arg->max_background;
783
784 if (!cap_sys_admin && fc->max_background > max_user_bgreq)
785 fc->max_background = max_user_bgreq;
786 }
787 if (arg->congestion_threshold) {
788 fc->congestion_threshold = arg->congestion_threshold;
789
790 if (!cap_sys_admin &&
791 fc->congestion_threshold > max_user_congthresh)
792 fc->congestion_threshold = max_user_congthresh;
793 }
794}
795
730static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 796static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
731{ 797{
732 struct fuse_init_out *arg = &req->misc.init_out; 798 struct fuse_init_out *arg = &req->misc.init_out;
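
Note: with the module parameters left at their default of 0, sanitize_global_limit() sizes the caps so the global pool of background requests can occupy roughly 1/8192 of physical memory. A worked example as standalone C; the 1 GiB and 400-byte figures are illustrative assumptions, not values from this patch:

	#include <stdio.h>

	int main(void)
	{
		unsigned long mem = 1UL << 30;	/* assume 1 GiB of RAM */
		unsigned long req = 400;	/* assumed sizeof(struct fuse_req) */
		unsigned limit = (mem >> 13) / req;

		if (limit >= 1 << 16)		/* same 65535 cap as the patch */
			limit = (1 << 16) - 1;
		printf("%u\n", limit);		/* prints 327 */
		return 0;
	}
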
@@ -736,6 +802,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
736 else { 802 else {
737 unsigned long ra_pages; 803 unsigned long ra_pages;
738 804
805 process_init_limits(fc, arg);
806
739 if (arg->minor >= 6) { 807 if (arg->minor >= 6) {
740 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; 808 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
741 if (arg->flags & FUSE_ASYNC_READ) 809 if (arg->flags & FUSE_ASYNC_READ)
@@ -801,6 +869,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
801{ 869{
802 int err; 870 int err;
803 871
872 fc->bdi.name = "fuse";
804 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 873 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
805 fc->bdi.unplug_io_fn = default_unplug_io_fn; 874 fc->bdi.unplug_io_fn = default_unplug_io_fn;
806 /* fuse does its own writeback accounting */ 875 /* fuse does its own writeback accounting */
@@ -893,6 +962,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
893 if (err) 962 if (err)
894 goto err_put_conn; 963 goto err_put_conn;
895 964
965 sb->s_bdi = &fc->bdi;
966
896 /* Handle umasking inside the fuse code */ 967 /* Handle umasking inside the fuse code */
897 if (sb->s_flags & MS_POSIXACL) 968 if (sb->s_flags & MS_POSIXACL)
898 fc->dont_mask = 1; 969 fc->dont_mask = 1;
@@ -1147,6 +1218,9 @@ static int __init fuse_init(void)
1147 if (res) 1218 if (res)
1148 goto err_sysfs_cleanup; 1219 goto err_sysfs_cleanup;
1149 1220
1221 sanitize_global_limit(&max_user_bgreq);
1222 sanitize_global_limit(&max_user_congthresh);
1223
1150 return 0; 1224 return 0;
1151 1225
1152 err_sysfs_cleanup: 1226 err_sysfs_cleanup:
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f738..21f7e46da4c0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS := -I$(src) 1EXTRA_CFLAGS := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
5 aops.o dentry.o export.o file.o \ 5 aops.o dentry.o export.o file.o \
6 ops_fstype.o ops_inode.o quota.o \ 6 ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d85..3fc4e3ac7d84 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,8 +19,7 @@
19#include "gfs2.h" 19#include "gfs2.h"
20#include "incore.h" 20#include "incore.h"
21#include "acl.h" 21#include "acl.h"
22#include "eaops.h" 22#include "xattr.h"
23#include "eattr.h"
24#include "glock.h" 23#include "glock.h"
25#include "inode.h" 24#include "inode.h"
26#include "meta_io.h" 25#include "meta_io.h"
@@ -31,8 +30,7 @@
31#define ACL_DEFAULT 0 30#define ACL_DEFAULT 0
32 31
33int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, 32int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
34 struct gfs2_ea_request *er, 33 struct gfs2_ea_request *er, int *remove, mode_t *mode)
35 int *remove, mode_t *mode)
36{ 34{
37 struct posix_acl *acl; 35 struct posix_acl *acl;
38 int error; 36 int error;
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
83 return 0; 81 return 0;
84} 82}
85 83
86static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, 84static int acl_get(struct gfs2_inode *ip, const char *name,
87 struct gfs2_ea_location *el, char **data, unsigned int *len) 85 struct posix_acl **acl, struct gfs2_ea_location *el,
86 char **datap, unsigned int *lenp)
88{ 87{
89 struct gfs2_ea_request er; 88 char *data;
90 struct gfs2_ea_location el_this; 89 unsigned int len;
91 int error; 90 int error;
92 91
92 el->el_bh = NULL;
93
93 if (!ip->i_eattr) 94 if (!ip->i_eattr)
94 return 0; 95 return 0;
95 96
96 memset(&er, 0, sizeof(struct gfs2_ea_request)); 97 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
97 if (access) {
98 er.er_name = GFS2_POSIX_ACL_ACCESS;
99 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
100 } else {
101 er.er_name = GFS2_POSIX_ACL_DEFAULT;
102 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
103 }
104 er.er_type = GFS2_EATYPE_SYS;
105
106 if (!el)
107 el = &el_this;
108
109 error = gfs2_ea_find(ip, &er, el);
110 if (error) 98 if (error)
111 return error; 99 return error;
112 if (!el->el_ea) 100 if (!el->el_ea)
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
114 if (!GFS2_EA_DATA_LEN(el->el_ea)) 102 if (!GFS2_EA_DATA_LEN(el->el_ea))
115 goto out; 103 goto out;
116 104
117 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); 105 len = GFS2_EA_DATA_LEN(el->el_ea);
118 er.er_data = kmalloc(er.er_data_len, GFP_NOFS); 106 data = kmalloc(len, GFP_NOFS);
119 error = -ENOMEM; 107 error = -ENOMEM;
120 if (!er.er_data) 108 if (!data)
121 goto out; 109 goto out;
122 110
123 error = gfs2_ea_get_copy(ip, el, er.er_data); 111 error = gfs2_ea_get_copy(ip, el, data, len);
124 if (error) 112 if (error < 0)
125 goto out_kfree; 113 goto out_kfree;
114 error = 0;
126 115
127 if (acl) { 116 if (acl) {
128 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); 117 *acl = posix_acl_from_xattr(data, len);
129 if (IS_ERR(*acl)) 118 if (IS_ERR(*acl))
130 error = PTR_ERR(*acl); 119 error = PTR_ERR(*acl);
131 } 120 }
132 121
133out_kfree: 122out_kfree:
134 if (error || !data) 123 if (error || !datap) {
135 kfree(er.er_data); 124 kfree(data);
136 else { 125 } else {
137 *data = er.er_data; 126 *datap = data;
138 *len = er.er_data_len; 127 *lenp = len;
139 } 128 }
140out: 129out:
141 if (error || el == &el_this)
142 brelse(el->el_bh);
143 return error; 130 return error;
144} 131}
145 132
@@ -153,10 +140,12 @@ out:
153 140
154int gfs2_check_acl(struct inode *inode, int mask) 141int gfs2_check_acl(struct inode *inode, int mask)
155{ 142{
143 struct gfs2_ea_location el;
156 struct posix_acl *acl = NULL; 144 struct posix_acl *acl = NULL;
157 int error; 145 int error;
158 146
159 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); 147 error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
148 brelse(el.el_bh);
160 if (error) 149 if (error)
161 return error; 150 return error;
162 151
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
196 185
197int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) 186int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
198{ 187{
188 struct gfs2_ea_location el;
199 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 189 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
200 struct posix_acl *acl = NULL, *clone; 190 struct posix_acl *acl = NULL, *clone;
201 struct gfs2_ea_request er;
202 mode_t mode = ip->i_inode.i_mode; 191 mode_t mode = ip->i_inode.i_mode;
192 char *data = NULL;
193 unsigned int len;
203 int error; 194 int error;
204 195
205 if (!sdp->sd_args.ar_posix_acl) 196 if (!sdp->sd_args.ar_posix_acl)
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
207 if (S_ISLNK(ip->i_inode.i_mode)) 198 if (S_ISLNK(ip->i_inode.i_mode))
208 return 0; 199 return 0;
209 200
210 memset(&er, 0, sizeof(struct gfs2_ea_request)); 201 error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
211 er.er_type = GFS2_EATYPE_SYS; 202 brelse(el.el_bh);
212
213 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
214 &er.er_data, &er.er_data_len);
215 if (error) 203 if (error)
216 return error; 204 return error;
217 if (!acl) { 205 if (!acl) {
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
229 acl = clone; 217 acl = clone;
230 218
231 if (S_ISDIR(ip->i_inode.i_mode)) { 219 if (S_ISDIR(ip->i_inode.i_mode)) {
232 er.er_name = GFS2_POSIX_ACL_DEFAULT; 220 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
233 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; 221 GFS2_POSIX_ACL_DEFAULT, data, len, 0);
234 error = gfs2_system_eaops.eo_set(ip, &er);
235 if (error) 222 if (error)
236 goto out; 223 goto out;
237 } 224 }
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
239 error = posix_acl_create_masq(acl, &mode); 226 error = posix_acl_create_masq(acl, &mode);
240 if (error < 0) 227 if (error < 0)
241 goto out; 228 goto out;
242 if (error > 0) { 229 if (error == 0)
243 er.er_name = GFS2_POSIX_ACL_ACCESS; 230 goto munge;
244 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
245 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
246 er.er_mode = mode;
247 er.er_flags = GFS2_ERF_MODE;
248 error = gfs2_system_eaops.eo_set(ip, &er);
249 if (error)
250 goto out;
251 } else
252 munge_mode(ip, mode);
253 231
232 posix_acl_to_xattr(acl, data, len);
233 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
234 GFS2_POSIX_ACL_ACCESS, data, len, 0);
235 if (error)
236 goto out;
237munge:
238 error = munge_mode(ip, mode);
254out: 239out:
255 posix_acl_release(acl); 240 posix_acl_release(acl);
256 kfree(er.er_data); 241 kfree(data);
257 return error; 242 return error;
258} 243}
259 244
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
265 unsigned int len; 250 unsigned int len;
266 int error; 251 int error;
267 252
268 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); 253 error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
269 if (error) 254 if (error)
270 return error; 255 goto out_brelse;
271 if (!acl) 256 if (!acl)
272 return gfs2_setattr_simple(ip, attr); 257 return gfs2_setattr_simple(ip, attr);
273 258
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
286 271
287out: 272out:
288 posix_acl_release(acl); 273 posix_acl_release(acl);
289 brelse(el.el_bh);
290 kfree(data); 274 kfree(data);
275out_brelse:
276 brelse(el.el_bh);
291 return error; 277 return error;
292} 278}
293 279
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7ebae9a4ecc0..694b5d48f036 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
1135 .direct_IO = gfs2_direct_IO, 1135 .direct_IO = gfs2_direct_IO,
1136 .migratepage = buffer_migrate_page, 1136 .migratepage = buffer_migrate_page,
1137 .is_partially_uptodate = block_is_partially_uptodate, 1137 .is_partially_uptodate = block_is_partially_uptodate,
1138 .error_remove_page = generic_error_remove_page,
1138}; 1139};
1139 1140
1140static const struct address_space_operations gfs2_ordered_aops = { 1141static const struct address_space_operations gfs2_ordered_aops = {
@@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
1151 .direct_IO = gfs2_direct_IO, 1152 .direct_IO = gfs2_direct_IO,
1152 .migratepage = buffer_migrate_page, 1153 .migratepage = buffer_migrate_page,
1153 .is_partially_uptodate = block_is_partially_uptodate, 1154 .is_partially_uptodate = block_is_partially_uptodate,
1155 .error_remove_page = generic_error_remove_page,
1154}; 1156};
1155 1157
1156static const struct address_space_operations gfs2_jdata_aops = { 1158static const struct address_space_operations gfs2_jdata_aops = {
@@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
1166 .invalidatepage = gfs2_invalidatepage, 1168 .invalidatepage = gfs2_invalidatepage,
1167 .releasepage = gfs2_releasepage, 1169 .releasepage = gfs2_releasepage,
1168 .is_partially_uptodate = block_is_partially_uptodate, 1170 .is_partially_uptodate = block_is_partially_uptodate,
1171 .error_remove_page = generic_error_remove_page,
1169}; 1172};
1170 1173
1171void gfs2_set_aops(struct inode *inode) 1174void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..91beddadd388 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
107 return 0; 107 return 0;
108} 108}
109 109
110static int gfs2_dentry_delete(struct dentry *dentry)
111{
112 struct gfs2_inode *ginode;
113
114 if (!dentry->d_inode)
115 return 0;
116
117 ginode = GFS2_I(dentry->d_inode);
118 if (!ginode->i_iopen_gh.gh_gl)
119 return 0;
120
121 if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
122 return 1;
123
124 return 0;
125}
126
110const struct dentry_operations gfs2_dops = { 127const struct dentry_operations gfs2_dops = {
111 .d_revalidate = gfs2_drevalidate, 128 .d_revalidate = gfs2_drevalidate,
112 .d_hash = gfs2_dhash, 129 .d_hash = gfs2_dhash,
130 .d_delete = gfs2_dentry_delete,
113}; 131};
114 132
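
Note: the new d_delete callback asks the dcache to discard a dentry on its final dput() whenever a demote is pending on the inode's iopen glock (i.e. another node wants the lock), instead of keeping the dentry cached. A toy model of that contract, with stand-in types rather than kernel ones:

	struct toy_dentry {
		int has_inode;		/* dentry is positive */
		int demote_pending;	/* GLF_DEMOTE set on the iopen glock */
	};

	/* nonzero return = delete on last dput() instead of caching */
	static int toy_dentry_delete(const struct toy_dentry *d)
	{
		return d->has_inode && d->demote_pending;
	}
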
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b37..000000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/capability.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "acl.h"
22#include "eaops.h"
23#include "eattr.h"
24#include "util.h"
25
26/**
27 * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
28 * @namep: ea name, possibly with type appended
29 *
30 * Returns: GFS2_EATYPE_XXX
31 */
32
33unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
34{
35 unsigned int type;
36
37 if (strncmp(name, "system.", 7) == 0) {
38 type = GFS2_EATYPE_SYS;
39 if (truncated_name)
40 *truncated_name = name + sizeof("system.") - 1;
41 } else if (strncmp(name, "user.", 5) == 0) {
42 type = GFS2_EATYPE_USR;
43 if (truncated_name)
44 *truncated_name = name + sizeof("user.") - 1;
45 } else if (strncmp(name, "security.", 9) == 0) {
46 type = GFS2_EATYPE_SECURITY;
47 if (truncated_name)
48 *truncated_name = name + sizeof("security.") - 1;
49 } else {
50 type = GFS2_EATYPE_UNUSED;
51 if (truncated_name)
52 *truncated_name = NULL;
53 }
54
55 return type;
56}
57
58static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
59{
60 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
61 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
62 !capable(CAP_SYS_ADMIN))
63 return -EPERM;
64
65 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
66 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
67 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
68 return -EOPNOTSUPP;
69
70 return gfs2_ea_get_i(ip, er);
71}
72
73static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
74{
75 int remove = 0;
76 int error;
77
78 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
79 if (!(er->er_flags & GFS2_ERF_MODE)) {
80 er->er_mode = ip->i_inode.i_mode;
81 er->er_flags |= GFS2_ERF_MODE;
82 }
83 error = gfs2_acl_validate_set(ip, 1, er,
84 &remove, &er->er_mode);
85 if (error)
86 return error;
87 error = gfs2_ea_set_i(ip, er);
88 if (error)
89 return error;
90 if (remove)
91 gfs2_ea_remove_i(ip, er);
92 return 0;
93
94 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
95 error = gfs2_acl_validate_set(ip, 0, er,
96 &remove, NULL);
97 if (error)
98 return error;
99 if (!remove)
100 error = gfs2_ea_set_i(ip, er);
101 else {
102 error = gfs2_ea_remove_i(ip, er);
103 if (error == -ENODATA)
104 error = 0;
105 }
106 return error;
107 }
108
109 return -EPERM;
110}
111
112static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
113{
114 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
115 int error = gfs2_acl_validate_remove(ip, 1);
116 if (error)
117 return error;
118
119 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
120 int error = gfs2_acl_validate_remove(ip, 0);
121 if (error)
122 return error;
123
124 } else
125 return -EPERM;
126
127 return gfs2_ea_remove_i(ip, er);
128}
129
130static const struct gfs2_eattr_operations gfs2_user_eaops = {
131 .eo_get = gfs2_ea_get_i,
132 .eo_set = gfs2_ea_set_i,
133 .eo_remove = gfs2_ea_remove_i,
134 .eo_name = "user",
135};
136
137const struct gfs2_eattr_operations gfs2_system_eaops = {
138 .eo_get = system_eo_get,
139 .eo_set = system_eo_set,
140 .eo_remove = system_eo_remove,
141 .eo_name = "system",
142};
143
144static const struct gfs2_eattr_operations gfs2_security_eaops = {
145 .eo_get = gfs2_ea_get_i,
146 .eo_set = gfs2_ea_set_i,
147 .eo_remove = gfs2_ea_remove_i,
148 .eo_name = "security",
149};
150
151const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
152 NULL,
153 &gfs2_user_eaops,
154 &gfs2_system_eaops,
155 &gfs2_security_eaops,
156};
157
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40d..000000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14struct gfs2_inode;
15
16struct gfs2_eattr_operations {
17 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
20 char *eo_name;
21};
22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24
25extern const struct gfs2_eattr_operations gfs2_system_eaops;
26
27extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef221716..d15876e9aa26 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
143} 143}
144 144
145static struct dentry *gfs2_get_dentry(struct super_block *sb, 145static struct dentry *gfs2_get_dentry(struct super_block *sb,
146 struct gfs2_inum_host *inum) 146 struct gfs2_inum_host *inum)
147{ 147{
148 struct gfs2_sbd *sdp = sb->s_fs_info; 148 struct gfs2_sbd *sdp = sb->s_fs_info;
149 struct gfs2_holder i_gh, ri_gh, rgd_gh; 149 struct gfs2_holder i_gh;
150 struct gfs2_rgrpd *rgd;
151 struct inode *inode; 150 struct inode *inode;
152 struct dentry *dentry; 151 struct dentry *dentry;
153 int error; 152 int error;
154 153
155 /* System files? */
156
157 inode = gfs2_ilookup(sb, inum->no_addr); 154 inode = gfs2_ilookup(sb, inum->no_addr);
158 if (inode) { 155 if (inode) {
159 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { 156 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
168 if (error) 165 if (error)
169 return ERR_PTR(error); 166 return ERR_PTR(error);
170 167
171 error = gfs2_rindex_hold(sdp, &ri_gh); 168 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
172 if (error) 169 if (error)
173 goto fail; 170 goto fail;
174 171
175 error = -EINVAL; 172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
176 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
177 if (!rgd)
178 goto fail_rindex;
179
180 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
181 if (error)
182 goto fail_rindex;
183
184 error = -ESTALE;
185 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
186 goto fail_rgd;
187
188 gfs2_glock_dq_uninit(&rgd_gh);
189 gfs2_glock_dq_uninit(&ri_gh);
190
191 inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
192 inum->no_addr,
193 0, 0);
194 if (IS_ERR(inode)) { 173 if (IS_ERR(inode)) {
195 error = PTR_ERR(inode); 174 error = PTR_ERR(inode);
196 goto fail; 175 goto fail;
@@ -224,13 +203,6 @@ out_inode:
224 if (!IS_ERR(dentry)) 203 if (!IS_ERR(dentry))
225 dentry->d_op = &gfs2_dops; 204 dentry->d_op = &gfs2_dops;
226 return dentry; 205 return dentry;
227
228fail_rgd:
229 gfs2_glock_dq_uninit(&rgd_gh);
230
231fail_rindex:
232 gfs2_glock_dq_uninit(&ri_gh);
233
234fail: 206fail:
235 gfs2_glock_dq_uninit(&i_gh); 207 gfs2_glock_dq_uninit(&i_gh);
236 return ERR_PTR(error); 208 return ERR_PTR(error);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f1..4eb308aa3234 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
38#include "rgrp.h" 38#include "rgrp.h"
39#include "trans.h" 39#include "trans.h"
40#include "util.h" 40#include "util.h"
41#include "eaops.h"
42 41
43/** 42/**
44 * gfs2_llseek - seek to a location in a file 43 * gfs2_llseek - seek to a location in a file
@@ -419,7 +418,7 @@ out:
419 return ret; 418 return ret;
420} 419}
421 420
422static struct vm_operations_struct gfs2_vm_ops = { 421static const struct vm_operations_struct gfs2_vm_ops = {
423 .fault = filemap_fault, 422 .fault = filemap_fault,
424 .page_mkwrite = gfs2_page_mkwrite, 423 .page_mkwrite = gfs2_page_mkwrite,
425}; 424};
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 61801ada36f0..6edb423f90b3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -406,6 +406,12 @@ struct gfs2_statfs_change_host {
406#define GFS2_DATA_WRITEBACK 1 406#define GFS2_DATA_WRITEBACK 1
407#define GFS2_DATA_ORDERED 2 407#define GFS2_DATA_ORDERED 2
408 408
409#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW
410#define GFS2_ERRORS_WITHDRAW 0
411#define GFS2_ERRORS_CONTINUE 1 /* placeholder for a future feature */
412#define GFS2_ERRORS_RO 2 /* placeholder for a future feature */
413#define GFS2_ERRORS_PANIC 3
414
409struct gfs2_args { 415struct gfs2_args {
410 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ 416 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
411 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 417 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
@@ -422,6 +428,7 @@ struct gfs2_args {
422 unsigned int ar_data:2; /* ordered/writeback */ 428 unsigned int ar_data:2; /* ordered/writeback */
423 unsigned int ar_meta:1; /* mount metafs */ 429 unsigned int ar_meta:1; /* mount metafs */
424 unsigned int ar_discard:1; /* discard requests */ 430 unsigned int ar_discard:1; /* discard requests */
431 unsigned int ar_errors:2; /* errors=withdraw | panic */
425 int ar_commit; /* Commit interval */ 432 int ar_commit; /* Commit interval */
426}; 433};
427 434
@@ -489,7 +496,6 @@ struct gfs2_sb_host {
489 */ 496 */
490 497
491struct lm_lockstruct { 498struct lm_lockstruct {
492 u32 ls_id;
493 unsigned int ls_jid; 499 unsigned int ls_jid;
494 unsigned int ls_first; 500 unsigned int ls_first;
495 unsigned int ls_first_done; 501 unsigned int ls_first_done;
@@ -541,18 +547,12 @@ struct gfs2_sbd {
541 struct dentry *sd_root_dir; 547 struct dentry *sd_root_dir;
542 548
543 struct inode *sd_jindex; 549 struct inode *sd_jindex;
544 struct inode *sd_inum_inode;
545 struct inode *sd_statfs_inode; 550 struct inode *sd_statfs_inode;
546 struct inode *sd_ir_inode;
547 struct inode *sd_sc_inode; 551 struct inode *sd_sc_inode;
548 struct inode *sd_qc_inode; 552 struct inode *sd_qc_inode;
549 struct inode *sd_rindex; 553 struct inode *sd_rindex;
550 struct inode *sd_quota_inode; 554 struct inode *sd_quota_inode;
551 555
552 /* Inum stuff */
553
554 struct mutex sd_inum_mutex;
555
556 /* StatFS stuff */ 556 /* StatFS stuff */
557 557
558 spinlock_t sd_statfs_spin; 558 spinlock_t sd_statfs_spin;
@@ -580,7 +580,6 @@ struct gfs2_sbd {
580 struct gfs2_holder sd_journal_gh; 580 struct gfs2_holder sd_journal_gh;
581 struct gfs2_holder sd_jinode_gh; 581 struct gfs2_holder sd_jinode_gh;
582 582
583 struct gfs2_holder sd_ir_gh;
584 struct gfs2_holder sd_sc_gh; 583 struct gfs2_holder sd_sc_gh;
585 struct gfs2_holder sd_qc_gh; 584 struct gfs2_holder sd_qc_gh;
586 585
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd723698..fb15d3b1f409 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
24#include "acl.h" 24#include "acl.h"
25#include "bmap.h" 25#include "bmap.h"
26#include "dir.h" 26#include "dir.h"
27#include "eattr.h" 27#include "xattr.h"
28#include "glock.h" 28#include "glock.h"
29#include "glops.h" 29#include "glops.h"
30#include "inode.h" 30#include "inode.h"
@@ -519,139 +519,6 @@ out:
519 return inode ? inode : ERR_PTR(error); 519 return inode ? inode : ERR_PTR(error);
520} 520}
521 521
522static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
523{
524 const struct gfs2_inum_range *str = buf;
525
526 ir->ir_start = be64_to_cpu(str->ir_start);
527 ir->ir_length = be64_to_cpu(str->ir_length);
528}
529
530static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
531{
532 struct gfs2_inum_range *str = buf;
533
534 str->ir_start = cpu_to_be64(ir->ir_start);
535 str->ir_length = cpu_to_be64(ir->ir_length);
536}
537
538static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
539{
540 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
541 struct buffer_head *bh;
542 struct gfs2_inum_range_host ir;
543 int error;
544
545 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
546 if (error)
547 return error;
548 mutex_lock(&sdp->sd_inum_mutex);
549
550 error = gfs2_meta_inode_buffer(ip, &bh);
551 if (error) {
552 mutex_unlock(&sdp->sd_inum_mutex);
553 gfs2_trans_end(sdp);
554 return error;
555 }
556
557 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
558
559 if (ir.ir_length) {
560 *formal_ino = ir.ir_start++;
561 ir.ir_length--;
562 gfs2_trans_add_bh(ip->i_gl, bh, 1);
563 gfs2_inum_range_out(&ir,
564 bh->b_data + sizeof(struct gfs2_dinode));
565 brelse(bh);
566 mutex_unlock(&sdp->sd_inum_mutex);
567 gfs2_trans_end(sdp);
568 return 0;
569 }
570
571 brelse(bh);
572
573 mutex_unlock(&sdp->sd_inum_mutex);
574 gfs2_trans_end(sdp);
575
576 return 1;
577}
578
579static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
580{
581 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
582 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
583 struct gfs2_holder gh;
584 struct buffer_head *bh;
585 struct gfs2_inum_range_host ir;
586 int error;
587
588 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
589 if (error)
590 return error;
591
592 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
593 if (error)
594 goto out;
595 mutex_lock(&sdp->sd_inum_mutex);
596
597 error = gfs2_meta_inode_buffer(ip, &bh);
598 if (error)
599 goto out_end_trans;
600
601 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
602
603 if (!ir.ir_length) {
604 struct buffer_head *m_bh;
605 u64 x, y;
606 __be64 z;
607
608 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
609 if (error)
610 goto out_brelse;
611
612 z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
613 x = y = be64_to_cpu(z);
614 ir.ir_start = x;
615 ir.ir_length = GFS2_INUM_QUANTUM;
616 x += GFS2_INUM_QUANTUM;
617 if (x < y)
618 gfs2_consist_inode(m_ip);
619 z = cpu_to_be64(x);
620 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
621 *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
622
623 brelse(m_bh);
624 }
625
626 *formal_ino = ir.ir_start++;
627 ir.ir_length--;
628
629 gfs2_trans_add_bh(ip->i_gl, bh, 1);
630 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
631
632out_brelse:
633 brelse(bh);
634out_end_trans:
635 mutex_unlock(&sdp->sd_inum_mutex);
636 gfs2_trans_end(sdp);
637out:
638 gfs2_glock_dq_uninit(&gh);
639 return error;
640}
641
642static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
643{
644 int error;
645
646 error = pick_formal_ino_1(sdp, inum);
647 if (error <= 0)
648 return error;
649
650 error = pick_formal_ino_2(sdp, inum);
651
652 return error;
653}
654
655/** 522/**
656 * create_ok - OK to create a new on-disk inode here? 523 * create_ok - OK to create a new on-disk inode here?
657 * @dip: Directory in which dinode is to be created 524 * @dip: Directory in which dinode is to be created
@@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
731 if (error) 598 if (error)
732 goto out_ipreserv; 599 goto out_ipreserv;
733 600
734 *no_addr = gfs2_alloc_di(dip, generation); 601 error = gfs2_alloc_di(dip, no_addr, generation);
735 602
736 gfs2_trans_end(sdp); 603 gfs2_trans_end(sdp);
737 604
@@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
924 size_t len; 791 size_t len;
925 void *value; 792 void *value;
926 char *name; 793 char *name;
927 struct gfs2_ea_request er;
928 794
929 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 795 err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
930 &name, &value, &len); 796 &name, &value, &len);
@@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
935 return err; 801 return err;
936 } 802 }
937 803
938 memset(&er, 0, sizeof(struct gfs2_ea_request)); 804 err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
939
940 er.er_type = GFS2_EATYPE_SECURITY;
941 er.er_name = name;
942 er.er_data = value;
943 er.er_name_len = strlen(name);
944 er.er_data_len = len;
945
946 err = gfs2_ea_set_i(ip, &er);
947
948 kfree(value); 805 kfree(value);
949 kfree(name); 806 kfree(name);
950 807
@@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
991 if (error) 848 if (error)
992 goto fail_gunlock; 849 goto fail_gunlock;
993 850
994 error = pick_formal_ino(sdp, &inum.no_formal_ino);
995 if (error)
996 goto fail_gunlock;
997
998 error = alloc_dinode(dip, &inum.no_addr, &generation); 851 error = alloc_dinode(dip, &inum.no_addr, &generation);
999 if (error) 852 if (error)
1000 goto fail_gunlock; 853 goto fail_gunlock;
854 inum.no_formal_ino = generation;
1001 855
1002 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, 856 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
1003 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); 857 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1008 if (error) 862 if (error)
1009 goto fail_gunlock2; 863 goto fail_gunlock2;
1010 864
1011 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), 865 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
1012 inum.no_addr, 866 inum.no_formal_ino, 0);
1013 inum.no_formal_ino, 0);
1014 if (IS_ERR(inode)) 867 if (IS_ERR(inode))
1015 goto fail_gunlock2; 868 goto fail_gunlock2;
1016 869
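
Note: with pick_formal_ino() and the inum-range machinery gone, the formal inode number is simply the generation number handed back by alloc_dinode(), and the rgrp.c hunk near the end of this diff keeps that counter from ever returning 0. A toy model of the combined behaviour (assumed semantics, not GFS2 code):

	static unsigned long long igeneration;	/* rd_igeneration stand-in */

	static unsigned long long next_formal_ino(void)
	{
		unsigned long long gen = igeneration++;

		if (gen == 0)	/* 0 is skipped, matching the rgrp.c check */
			gen = igeneration++;
		return gen;
	}
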
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd676..52fb6c048981 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
84 84
85 gfs2_tune_init(&sdp->sd_tune); 85 gfs2_tune_init(&sdp->sd_tune);
86 86
87 mutex_init(&sdp->sd_inum_mutex);
88 spin_lock_init(&sdp->sd_statfs_spin); 87 spin_lock_init(&sdp->sd_statfs_spin);
89 88
90 spin_lock_init(&sdp->sd_rindex_spin); 89 spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
833 if (error) 832 if (error)
834 goto fail; 833 goto fail;
835 834
836 /* Read in the master inode number inode */
837 sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
838 if (IS_ERR(sdp->sd_inum_inode)) {
839 error = PTR_ERR(sdp->sd_inum_inode);
840 fs_err(sdp, "can't read in inum inode: %d\n", error);
841 goto fail_journal;
842 }
843
844
845 /* Read in the master statfs inode */ 835 /* Read in the master statfs inode */
846 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); 836 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
847 if (IS_ERR(sdp->sd_statfs_inode)) { 837 if (IS_ERR(sdp->sd_statfs_inode)) {
848 error = PTR_ERR(sdp->sd_statfs_inode); 838 error = PTR_ERR(sdp->sd_statfs_inode);
849 fs_err(sdp, "can't read in statfs inode: %d\n", error); 839 fs_err(sdp, "can't read in statfs inode: %d\n", error);
850 goto fail_inum; 840 goto fail_journal;
851 } 841 }
852 842
853 /* Read in the resource index inode */ 843 /* Read in the resource index inode */
@@ -876,8 +866,6 @@ fail_rindex:
876 iput(sdp->sd_rindex); 866 iput(sdp->sd_rindex);
877fail_statfs: 867fail_statfs:
878 iput(sdp->sd_statfs_inode); 868 iput(sdp->sd_statfs_inode);
879fail_inum:
880 iput(sdp->sd_inum_inode);
881fail_journal: 869fail_journal:
882 init_journal(sdp, UNDO); 870 init_journal(sdp, UNDO);
883fail: 871fail:
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
905 return error; 893 return error;
906 } 894 }
907 895
908 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
909 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
910 if (IS_ERR(sdp->sd_ir_inode)) {
911 error = PTR_ERR(sdp->sd_ir_inode);
912 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
913 goto fail;
914 }
915
916 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); 896 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
917 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); 897 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
918 if (IS_ERR(sdp->sd_sc_inode)) { 898 if (IS_ERR(sdp->sd_sc_inode)) {
919 error = PTR_ERR(sdp->sd_sc_inode); 899 error = PTR_ERR(sdp->sd_sc_inode);
920 fs_err(sdp, "can't find local \"sc\" file: %d\n", error); 900 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
921 goto fail_ir_i; 901 goto fail;
922 } 902 }
923 903
924 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); 904 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
932 iput(pn); 912 iput(pn);
933 pn = NULL; 913 pn = NULL;
934 914
935 ip = GFS2_I(sdp->sd_ir_inode);
936 error = gfs2_glock_nq_init(ip->i_gl,
937 LM_ST_EXCLUSIVE, 0,
938 &sdp->sd_ir_gh);
939 if (error) {
940 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
941 goto fail_qc_i;
942 }
943
944 ip = GFS2_I(sdp->sd_sc_inode); 915 ip = GFS2_I(sdp->sd_sc_inode);
945 error = gfs2_glock_nq_init(ip->i_gl, 916 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
946 LM_ST_EXCLUSIVE, 0,
947 &sdp->sd_sc_gh); 917 &sdp->sd_sc_gh);
948 if (error) { 918 if (error) {
949 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); 919 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
950 goto fail_ir_gh; 920 goto fail_qc_i;
951 } 921 }
952 922
953 ip = GFS2_I(sdp->sd_qc_inode); 923 ip = GFS2_I(sdp->sd_qc_inode);
954 error = gfs2_glock_nq_init(ip->i_gl, 924 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
955 LM_ST_EXCLUSIVE, 0,
956 &sdp->sd_qc_gh); 925 &sdp->sd_qc_gh);
957 if (error) { 926 if (error) {
958 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); 927 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +934,10 @@ fail_qc_gh:
965 gfs2_glock_dq_uninit(&sdp->sd_qc_gh); 934 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
966fail_ut_gh: 935fail_ut_gh:
967 gfs2_glock_dq_uninit(&sdp->sd_sc_gh); 936 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
968fail_ir_gh:
969 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
970fail_qc_i: 937fail_qc_i:
971 iput(sdp->sd_qc_inode); 938 iput(sdp->sd_qc_inode);
972fail_ut_i: 939fail_ut_i:
973 iput(sdp->sd_sc_inode); 940 iput(sdp->sd_sc_inode);
974fail_ir_i:
975 iput(sdp->sd_ir_inode);
976fail: 941fail:
977 if (pn) 942 if (pn)
978 iput(pn); 943 iput(pn);
@@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1063 1028
1064 ls->ls_ops = lm; 1029 ls->ls_ops = lm;
1065 ls->ls_first = 1; 1030 ls->ls_first = 1;
1066 ls->ls_id = 0;
1067 1031
1068 for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { 1032 for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
1069 substring_t tmp[MAX_OPT_ARGS]; 1033 substring_t tmp[MAX_OPT_ARGS];
@@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1081 ls->ls_jid = option; 1045 ls->ls_jid = option;
1082 break; 1046 break;
1083 case Opt_id: 1047 case Opt_id:
1084 ret = match_int(&tmp[0], &option); 1048 /* Obsolete, but left for backward compat purposes */
1085 if (ret)
1086 goto hostdata_error;
1087 ls->ls_id = option;
1088 break; 1049 break;
1089 case Opt_first: 1050 case Opt_first:
1090 ret = match_int(&tmp[0], &option); 1051 ret = match_int(&tmp[0], &option);
@@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
1133 lm->lm_unmount(sdp); 1094 lm->lm_unmount(sdp);
1134} 1095}
1135 1096
1097void gfs2_online_uevent(struct gfs2_sbd *sdp)
1098{
1099 struct super_block *sb = sdp->sd_vfs;
1100 char ro[20];
1101 char spectator[20];
1102 char *envp[] = { ro, spectator, NULL };
1103 sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
1104 sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
1105 kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
1106}
1107
1136/** 1108/**
1137 * fill_super - Read in superblock 1109 * fill_super - Read in superblock
1138 * @sb: The VFS superblock 1110 * @sb: The VFS superblock
@@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1157 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; 1129 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1158 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; 1130 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1159 sdp->sd_args.ar_commit = 60; 1131 sdp->sd_args.ar_commit = 60;
1132 sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
1160 1133
1161 error = gfs2_mount_args(sdp, &sdp->sd_args, data); 1134 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1162 if (error) { 1135 if (error) {
@@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1174 sb->s_magic = GFS2_MAGIC; 1147 sb->s_magic = GFS2_MAGIC;
1175 sb->s_op = &gfs2_super_ops; 1148 sb->s_op = &gfs2_super_ops;
1176 sb->s_export_op = &gfs2_export_ops; 1149 sb->s_export_op = &gfs2_export_ops;
1150 sb->s_xattr = gfs2_xattr_handlers;
1177 sb->s_time_gran = 1; 1151 sb->s_time_gran = 1;
1178 sb->s_maxbytes = MAX_LFS_FILESIZE; 1152 sb->s_maxbytes = MAX_LFS_FILESIZE;
1179 1153
@@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1236 } 1210 }
1237 1211
1238 gfs2_glock_dq_uninit(&mount_gh); 1212 gfs2_glock_dq_uninit(&mount_gh);
1239 1213 gfs2_online_uevent(sdp);
1240 return 0; 1214 return 0;
1241 1215
1242fail_threads: 1216fail_threads:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99c..247436c10deb 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/utsname.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
17#include <linux/xattr.h> 16#include <linux/xattr.h>
18#include <linux/posix_acl.h> 17#include <linux/posix_acl.h>
@@ -26,8 +25,7 @@
26#include "acl.h" 25#include "acl.h"
27#include "bmap.h" 26#include "bmap.h"
28#include "dir.h" 27#include "dir.h"
29#include "eaops.h" 28#include "xattr.h"
30#include "eattr.h"
31#include "glock.h" 29#include "glock.h"
32#include "inode.h" 30#include "inode.h"
33#include "meta_io.h" 31#include "meta_io.h"
@@ -349,7 +347,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
349 347
350 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 348 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
351 if (error) 349 if (error)
352 goto out_rgrp; 350 goto out_gunlock;
353 351
354 error = gfs2_dir_del(dip, &dentry->d_name); 352 error = gfs2_dir_del(dip, &dentry->d_name);
355 if (error) 353 if (error)
@@ -1302,60 +1300,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
1302 const void *data, size_t size, int flags) 1300 const void *data, size_t size, int flags)
1303{ 1301{
1304 struct inode *inode = dentry->d_inode; 1302 struct inode *inode = dentry->d_inode;
1305 struct gfs2_ea_request er; 1303 struct gfs2_inode *ip = GFS2_I(inode);
1306 1304 struct gfs2_holder gh;
1307 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1305 int ret;
1308 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1309 if (er.er_type == GFS2_EATYPE_UNUSED)
1310 return -EOPNOTSUPP;
1311 er.er_data = (char *)data;
1312 er.er_name_len = strlen(er.er_name);
1313 er.er_data_len = size;
1314 er.er_flags = flags;
1315
1316 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1317 1306
1318 return gfs2_ea_set(GFS2_I(inode), &er); 1307 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1308 ret = gfs2_glock_nq(&gh);
1309 if (ret == 0) {
1310 ret = generic_setxattr(dentry, name, data, size, flags);
1311 gfs2_glock_dq(&gh);
1312 }
1313 gfs2_holder_uninit(&gh);
1314 return ret;
1319} 1315}
1320 1316
1321static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, 1317static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1322 void *data, size_t size) 1318 void *data, size_t size)
1323{ 1319{
1324 struct gfs2_ea_request er; 1320 struct inode *inode = dentry->d_inode;
1325 1321 struct gfs2_inode *ip = GFS2_I(inode);
1326 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1322 struct gfs2_holder gh;
1327 er.er_type = gfs2_ea_name2type(name, &er.er_name); 1323 int ret;
1328 if (er.er_type == GFS2_EATYPE_UNUSED)
1329 return -EOPNOTSUPP;
1330 er.er_data = data;
1331 er.er_name_len = strlen(er.er_name);
1332 er.er_data_len = size;
1333
1334 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1335}
1336
1337static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1338{
1339 struct gfs2_ea_request er;
1340
1341 memset(&er, 0, sizeof(struct gfs2_ea_request));
1342 er.er_data = (size) ? buffer : NULL;
1343 er.er_data_len = size;
1344 1324
1345 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); 1325 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1326 ret = gfs2_glock_nq(&gh);
1327 if (ret == 0) {
1328 ret = generic_getxattr(dentry, name, data, size);
1329 gfs2_glock_dq(&gh);
1330 }
1331 gfs2_holder_uninit(&gh);
1332 return ret;
1346} 1333}
1347 1334
1348static int gfs2_removexattr(struct dentry *dentry, const char *name) 1335static int gfs2_removexattr(struct dentry *dentry, const char *name)
1349{ 1336{
1350 struct gfs2_ea_request er; 1337 struct inode *inode = dentry->d_inode;
1351 1338 struct gfs2_inode *ip = GFS2_I(inode);
1352 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1339 struct gfs2_holder gh;
1353 er.er_type = gfs2_ea_name2type(name, &er.er_name); 1340 int ret;
1354 if (er.er_type == GFS2_EATYPE_UNUSED)
1355 return -EOPNOTSUPP;
1356 er.er_name_len = strlen(er.er_name);
1357 1341
1358 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); 1342 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1343 ret = gfs2_glock_nq(&gh);
1344 if (ret == 0) {
1345 ret = generic_removexattr(dentry, name);
1346 gfs2_glock_dq(&gh);
1347 }
1348 gfs2_holder_uninit(&gh);
1349 return ret;
1359} 1350}
1360 1351
1361static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1352static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
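
Note: all three xattr entry points now share one shape: take the inode glock, delegate to the generic VFS xattr code (which dispatches through the sb->s_xattr handlers installed in the ops_fstype.c hunk above), then drop the lock. A compilable sketch of that wrap pattern, with a pthread mutex standing in for the glock:

	#include <pthread.h>

	static pthread_mutex_t glock = PTHREAD_MUTEX_INITIALIZER;	/* glock stand-in */

	static int locked_xattr_op(int (*generic_op)(void *), void *arg)
	{
		int ret = pthread_mutex_lock(&glock);	/* gfs2_glock_nq() analogue */

		if (ret == 0) {
			ret = generic_op(arg);		/* generic_*xattr() analogue */
			pthread_mutex_unlock(&glock);	/* gfs2_glock_dq() analogue */
		}
		return ret;
	}
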
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..8f1cfb02a6cb 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -179,7 +179,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
179 * always aligned to a 64 bit boundary. 179 * always aligned to a 64 bit boundary.
180 * 180 *
181 * The size of the buffer is in bytes, but it is assumed that it is 181 * The size of the buffer is in bytes, but it is assumed that it is
182 * always ok to to read a complete multiple of 64 bits at the end 182 * always ok to read a complete multiple of 64 bits at the end
183 * of the block in case the end is not aligned to a natural boundary. 183 * of the block in case the end is not aligned to a natural boundary.
184 * 184 *
185 * Return: the block number (bitmap buffer scope) that was found 185 * Return: the block number (bitmap buffer scope) that was found
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
857 goto start_new_extent; 857 goto start_new_extent;
858 if ((start + nr_sects) != blk) { 858 if ((start + nr_sects) != blk) {
859 rv = blkdev_issue_discard(bdev, start, 859 rv = blkdev_issue_discard(bdev, start,
860 nr_sects, GFP_NOFS); 860 nr_sects, GFP_NOFS,
861 DISCARD_FL_BARRIER);
861 if (rv) 862 if (rv)
862 goto fail; 863 goto fail;
863 nr_sects = 0; 864 nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
871 } 872 }
872 } 873 }
873 if (nr_sects) { 874 if (nr_sects) {
874 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); 875 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
876 DISCARD_FL_BARRIER);
875 if (rv) 877 if (rv)
876 goto fail; 878 goto fail;
877 } 879 }
@@ -1256,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1256 * Returns: The block type (GFS2_BLKST_*) 1258 * Returns: The block type (GFS2_BLKST_*)
1257 */ 1259 */
1258 1260
1259unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) 1261static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1260{ 1262{
1261 struct gfs2_bitmap *bi = NULL; 1263 struct gfs2_bitmap *bi = NULL;
1262 u32 length, rgrp_block, buf_block; 1264 u32 length, rgrp_block, buf_block;
@@ -1459,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
1459 return 0; 1461 return 0;
1460} 1462}
1461 1463
1464static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
1465{
1466 struct gfs2_sbd *sdp = rgd->rd_sbd;
1467 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
1468 (unsigned long long)rgd->rd_addr);
1469 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1470 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1471 rgd->rd_flags |= GFS2_RDF_ERROR;
1472}
1473
1462/** 1474/**
1463 * gfs2_alloc_block - Allocate one or more blocks 1475 * gfs2_alloc_block - Allocate one or more blocks
1464 * @ip: the inode to allocate the block for 1476 * @ip: the inode to allocate the block for
@@ -1520,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1520 return 0; 1532 return 0;
1521 1533
1522rgrp_error: 1534rgrp_error:
1523 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", 1535 gfs2_rgrp_error(rgd);
1524 (unsigned long long)rgd->rd_addr);
1525 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1526 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1527 rgd->rd_flags |= GFS2_RDF_ERROR;
1528 return -EIO; 1536 return -EIO;
1529} 1537}
1530 1538
1531/** 1539/**
1532 * gfs2_alloc_di - Allocate a dinode 1540 * gfs2_alloc_di - Allocate a dinode
1533 * @dip: the directory that the inode is going in 1541 * @dip: the directory that the inode is going in
1542 * @bn: the block number which is allocated
1543 * @generation: the generation number of the inode
1534 * 1544 *
1535 * Returns: the block allocated 1545 * Returns: 0 on success or a negative errno on failure
1536 */ 1546 */
1537 1547
1538u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) 1548int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
1539{ 1549{
1540 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 1550 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1541 struct gfs2_alloc *al = dip->i_alloc; 1551 struct gfs2_alloc *al = dip->i_alloc;
@@ -1546,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1546 1556
1547 blk = rgblk_search(rgd, rgd->rd_last_alloc, 1557 blk = rgblk_search(rgd, rgd->rd_last_alloc,
1548 GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); 1558 GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
1549 BUG_ON(blk == BFITNOENT);
1550 1559
1551 rgd->rd_last_alloc = blk; 1560 /* Since all blocks are reserved in advance, this shouldn't happen */
1561 if (blk == BFITNOENT)
1562 goto rgrp_error;
1552 1563
1564 rgd->rd_last_alloc = blk;
1553 block = rgd->rd_data0 + blk; 1565 block = rgd->rd_data0 + blk;
1566 if (rgd->rd_free == 0)
1567 goto rgrp_error;
1554 1568
1555 gfs2_assert_withdraw(sdp, rgd->rd_free);
1556 rgd->rd_free--; 1569 rgd->rd_free--;
1557 rgd->rd_dinodes++; 1570 rgd->rd_dinodes++;
1558 *generation = rgd->rd_igeneration++; 1571 *generation = rgd->rd_igeneration++;
1572 if (*generation == 0)
1573 *generation = rgd->rd_igeneration++;
1559 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1574 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1560 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1575 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1561 1576
@@ -1568,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1568 rgd->rd_free_clone--; 1583 rgd->rd_free_clone--;
1569 spin_unlock(&sdp->sd_rindex_spin); 1584 spin_unlock(&sdp->sd_rindex_spin);
1570 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); 1585 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
1571 return block; 1586 *bn = block;
1587 return 0;
1588
1589rgrp_error:
1590 gfs2_rgrp_error(rgd);
1591 return -EIO;
1572} 1592}
1573 1593
1574/** 1594/**
@@ -1676,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1676} 1696}
1677 1697
1678/** 1698/**
1699 * gfs2_check_blk_type - Check the type of a block
1700 * @sdp: The superblock
1701 * @no_addr: The block number to check
1702 * @type: The block type we are looking for
1703 *
1704 * Returns: 0 if the block type matches the expected type
1705 * -ESTALE if it doesn't match
1706 * or a negative errno if something went wrong while checking
1707 */
1708
1709int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
1710{
1711 struct gfs2_rgrpd *rgd;
1712 struct gfs2_holder ri_gh, rgd_gh;
1713 int error;
1714
1715 error = gfs2_rindex_hold(sdp, &ri_gh);
1716 if (error)
1717 goto fail;
1718
1719 error = -EINVAL;
1720 rgd = gfs2_blk2rgrpd(sdp, no_addr);
1721 if (!rgd)
1722 goto fail_rindex;
1723
1724 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
1725 if (error)
1726 goto fail_rindex;
1727
1728 if (gfs2_get_block_type(rgd, no_addr) != type)
1729 error = -ESTALE;
1730
1731 gfs2_glock_dq_uninit(&rgd_gh);
1732fail_rindex:
1733 gfs2_glock_dq_uninit(&ri_gh);
1734fail:
1735 return error;
1736}
1737
1738/**
1679 * gfs2_rlist_add - add a RG to a list of RGs 1739 * gfs2_rlist_add - add a RG to a list of RGs
1680 * @sdp: the filesystem 1740 * @sdp: the filesystem
1681 * @rlist: the list of resource groups 1741 * @rlist: the list of resource groups
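
gfs2_check_blk_type() gets its first caller further down in this diff, in gfs2_delete_inode(), where it guards the delete path against racing with another node. A minimal sketch of that call site, with the variables assumed in scope as in the super.c hunk below:

	/* Bail out of the delete path unless the block is still marked
	 * GFS2_BLKST_UNLINKED in its resource group; -ESTALE means
	 * another node already handled (or reused) it. */
	error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
	if (error)
		goto out_truncate;
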
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e00..b4106ddaaa98 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
44 44
45extern void gfs2_inplace_release(struct gfs2_inode *ip); 45extern void gfs2_inplace_release(struct gfs2_inode *ip);
46 46
47extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
48
49extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 47extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
50extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); 48extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
51 49
52extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 50extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
53extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 51extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
54extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 52extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
55extern void gfs2_unlink_di(struct inode *inode); 53extern void gfs2_unlink_di(struct inode *inode);
54extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
55 unsigned int type);
56 56
57struct gfs2_rgrp_list { 57struct gfs2_rgrp_list {
58 unsigned int rl_rgrps; 58 unsigned int rl_rgrps;
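
The prototype change above means gfs2_alloc_di() now reports failure through its return value instead of returning the block number directly, so its callers in the inode-creation path have to migrate. A sketch of the before/after calling convention (caller-side names assumed):

	/* Old convention: the block number was the return value, and an
	 * allocation failure tripped a BUG_ON() inside the allocator. */
	u64 no_addr = gfs2_alloc_di(dip, &generation);

	/* New convention: 0 or -EIO comes back, the block number is an
	 * output parameter, and the error propagates cleanly. */
	error = gfs2_alloc_di(dip, &no_addr, &generation);
	if (error)
		goto fail;	/* hypothetical error label */

Failing with -EIO instead of BUG()ing lets the allocator fall through to the new gfs2_rgrp_error() path, which marks the resource group bad rather than crashing the node.
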
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f522bb017973..0ec3ec672de1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
38#include "trans.h" 38#include "trans.h"
39#include "util.h" 39#include "util.h"
40#include "sys.h" 40#include "sys.h"
41#include "eattr.h" 41#include "xattr.h"
42 42
43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) 43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
44 44
@@ -68,6 +68,8 @@ enum {
68 Opt_discard, 68 Opt_discard,
69 Opt_nodiscard, 69 Opt_nodiscard,
70 Opt_commit, 70 Opt_commit,
71 Opt_err_withdraw,
72 Opt_err_panic,
71 Opt_error, 73 Opt_error,
72}; 74};
73 75
@@ -97,6 +99,8 @@ static const match_table_t tokens = {
97 {Opt_discard, "discard"}, 99 {Opt_discard, "discard"},
98 {Opt_nodiscard, "nodiscard"}, 100 {Opt_nodiscard, "nodiscard"},
99 {Opt_commit, "commit=%d"}, 101 {Opt_commit, "commit=%d"},
102 {Opt_err_withdraw, "errors=withdraw"},
103 {Opt_err_panic, "errors=panic"},
100 {Opt_error, NULL} 104 {Opt_error, NULL}
101}; 105};
102 106
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
152 args->ar_localcaching = 1; 156 args->ar_localcaching = 1;
153 break; 157 break;
154 case Opt_debug: 158 case Opt_debug:
159 if (args->ar_errors == GFS2_ERRORS_PANIC) {
160 fs_info(sdp, "-o debug and -o errors=panic "
161 "are mutually exclusive.\n");
162 return -EINVAL;
163 }
155 args->ar_debug = 1; 164 args->ar_debug = 1;
156 break; 165 break;
157 case Opt_nodebug: 166 case Opt_nodebug:
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
205 return rv ? rv : -EINVAL; 214 return rv ? rv : -EINVAL;
206 } 215 }
207 break; 216 break;
217 case Opt_err_withdraw:
218 args->ar_errors = GFS2_ERRORS_WITHDRAW;
219 break;
220 case Opt_err_panic:
221 if (args->ar_debug) {
222 fs_info(sdp, "-o debug and -o errors=panic "
223 "are mutually exclusive.\n");
224 return -EINVAL;
225 }
226 args->ar_errors = GFS2_ERRORS_PANIC;
227 break;
208 case Opt_error: 228 case Opt_error:
209 default: 229 default:
210 fs_info(sdp, "invalid mount option: %s\n", o); 230 fs_info(sdp, "invalid mount option: %s\n", o);
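
Both option handlers above enforce the same invariant from whichever side is parsed second: -o debug implies BUG() on withdraw, which cannot coexist with errors=panic. A condensed sketch of the invariant being maintained:

	/* Sketch only: ar_debug and ar_errors == GFS2_ERRORS_PANIC must
	 * never both end up set, regardless of option order. */
	if (args->ar_debug && args->ar_errors == GFS2_ERRORS_PANIC) {
		fs_info(sdp, "-o debug and -o errors=panic "
			"are mutually exclusive.\n");
		return -EINVAL;
	}
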
@@ -768,7 +788,6 @@ restart:
768 /* Release stuff */ 788 /* Release stuff */
769 789
770 iput(sdp->sd_jindex); 790 iput(sdp->sd_jindex);
771 iput(sdp->sd_inum_inode);
772 iput(sdp->sd_statfs_inode); 791 iput(sdp->sd_statfs_inode);
773 iput(sdp->sd_rindex); 792 iput(sdp->sd_rindex);
774 iput(sdp->sd_quota_inode); 793 iput(sdp->sd_quota_inode);
@@ -779,10 +798,8 @@ restart:
779 if (!sdp->sd_args.ar_spectator) { 798 if (!sdp->sd_args.ar_spectator) {
780 gfs2_glock_dq_uninit(&sdp->sd_journal_gh); 799 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
781 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); 800 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
782 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
783 gfs2_glock_dq_uninit(&sdp->sd_sc_gh); 801 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
784 gfs2_glock_dq_uninit(&sdp->sd_qc_gh); 802 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
785 iput(sdp->sd_ir_inode);
786 iput(sdp->sd_sc_inode); 803 iput(sdp->sd_sc_inode);
787 iput(sdp->sd_qc_inode); 804 iput(sdp->sd_qc_inode);
788 } 805 }
@@ -1084,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1084 gt->gt_log_flush_secs = args.ar_commit; 1101 gt->gt_log_flush_secs = args.ar_commit;
1085 spin_unlock(&gt->gt_spin); 1102 spin_unlock(&gt->gt_spin);
1086 1103
1104 gfs2_online_uevent(sdp);
1087 return 0; 1105 return 0;
1088} 1106}
1089 1107
@@ -1225,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1225 lfsecs = sdp->sd_tune.gt_log_flush_secs; 1243 lfsecs = sdp->sd_tune.gt_log_flush_secs;
1226 if (lfsecs != 60) 1244 if (lfsecs != 60)
1227 seq_printf(s, ",commit=%d", lfsecs); 1245 seq_printf(s, ",commit=%d", lfsecs);
1246 if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
1247 const char *state;
1248
1249 switch (args->ar_errors) {
1250 case GFS2_ERRORS_WITHDRAW:
1251 state = "withdraw";
1252 break;
1253 case GFS2_ERRORS_PANIC:
1254 state = "panic";
1255 break;
1256 default:
1257 state = "unknown";
1258 break;
1259 }
1260 seq_printf(s, ",errors=%s", state);
1261 }
1228 return 0; 1262 return 0;
1229} 1263}
1230 1264
@@ -1252,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode)
1252 goto out; 1286 goto out;
1253 } 1287 }
1254 1288
1289 error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
1290 if (error)
1291 goto out_truncate;
1292
1255 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1293 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1256 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1294 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1257 error = gfs2_glock_nq(&ip->i_iopen_gh); 1295 error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 22e0417ed996..235db3682885 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
25 return x; 25 return x;
26} 26}
27 27
28void gfs2_jindex_free(struct gfs2_sbd *sdp); 28extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
29 29
30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); 30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
31 31
@@ -36,7 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp); 36 struct gfs2_inode **ipp);
37 37
38extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 38extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39 39extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
40extern int gfs2_statfs_init(struct gfs2_sbd *sdp); 40extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
41extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, 41extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
42 s64 dinodes); 42 s64 dinodes);
@@ -54,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[];
57 58
58#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
59 60
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a7cbfbd340c7..446329728d52 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/genhd.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
319 return ret; 320 return ret;
320} 321}
321 322
322static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
323{
324 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
325 return sprintf(buf, "%u\n", ls->ls_id);
326}
327
328static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) 323static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
329{ 324{
330 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 325 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -389,7 +384,6 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
389GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); 384GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
390GDLM_ATTR(block, 0644, block_show, block_store); 385GDLM_ATTR(block, 0644, block_show, block_store);
391GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 386GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
392GDLM_ATTR(id, 0444, lkid_show, NULL);
393GDLM_ATTR(jid, 0444, jid_show, NULL); 387GDLM_ATTR(jid, 0444, jid_show, NULL);
394GDLM_ATTR(first, 0444, lkfirst_show, NULL); 388GDLM_ATTR(first, 0444, lkfirst_show, NULL);
395GDLM_ATTR(first_done, 0444, first_done_show, NULL); 389GDLM_ATTR(first_done, 0444, first_done_show, NULL);
@@ -401,7 +395,6 @@ static struct attribute *lock_module_attrs[] = {
401 &gdlm_attr_proto_name.attr, 395 &gdlm_attr_proto_name.attr,
402 &gdlm_attr_block.attr, 396 &gdlm_attr_block.attr,
403 &gdlm_attr_withdraw.attr, 397 &gdlm_attr_withdraw.attr,
404 &gdlm_attr_id.attr,
405 &gdlm_attr_jid.attr, 398 &gdlm_attr_jid.attr,
406 &gdlm_attr_first.attr, 399 &gdlm_attr_first.attr,
407 &gdlm_attr_first_done.attr, 400 &gdlm_attr_first_done.attr,
@@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = {
519 512
520int gfs2_sys_fs_add(struct gfs2_sbd *sdp) 513int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
521{ 514{
515 struct super_block *sb = sdp->sd_vfs;
522 int error; 516 int error;
517 char ro[20];
518 char spectator[20];
519 char *envp[] = { ro, spectator, NULL };
520
521 sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
522 sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
523 523
524 sdp->sd_kobj.kset = gfs2_kset; 524 sdp->sd_kobj.kset = gfs2_kset;
525 error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, 525 error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
535 if (error) 535 if (error)
536 goto fail_tune; 536 goto fail_tune;
537 537
538 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); 538 error = sysfs_create_link(&sdp->sd_kobj,
539 &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
540 "device");
541 if (error)
542 goto fail_lock_module;
543
544 kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
539 return 0; 545 return 0;
540 546
547fail_lock_module:
548 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
541fail_tune: 549fail_tune:
542 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 550 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
543fail_reg: 551fail_reg:
@@ -549,12 +557,12 @@ fail:
549 557
550void gfs2_sys_fs_del(struct gfs2_sbd *sdp) 558void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
551{ 559{
560 sysfs_remove_link(&sdp->sd_kobj, "device");
552 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 561 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
553 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); 562 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
554 kobject_put(&sdp->sd_kobj); 563 kobject_put(&sdp->sd_kobj);
555} 564}
556 565
557
558static int gfs2_uevent(struct kset *kset, struct kobject *kobj, 566static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
559 struct kobj_uevent_env *env) 567 struct kobj_uevent_env *env)
560{ 568{
@@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
563 571
564 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 572 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
565 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 573 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
574 if (!sdp->sd_args.ar_spectator)
575 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
566 if (gfs2_uuid_valid(uuid)) { 576 if (gfs2_uuid_valid(uuid)) {
567 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" 577 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
568 "%02X%02X-%02X%02X%02X%02X%02X%02X", 578 "%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
578 .uevent = gfs2_uevent, 588 .uevent = gfs2_uevent,
579}; 589};
580 590
581
582int gfs2_sys_init(void) 591int gfs2_sys_init(void)
583{ 592{
584 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); 593 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
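
The gfs2_sys_fs_add() change above switches from kobject_uevent() to kobject_uevent_env() so the ADD event carries the mount mode, presumably for udev rules to match on. The idiom, condensed from the hunk (buffer sizes as in the diff):

	char ro[20];
	char spectator[20];
	char *envp[] = { ro, spectator, NULL };	/* NULL-terminated */

	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);

	/* Emits KOBJ_ADD with RDONLY= and SPECTATOR= appended. */
	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
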
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba0..f6a7efa34eb9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
38 const struct lm_lockops *lm = ls->ls_ops; 38 const struct lm_lockops *lm = ls->ls_ops;
39 va_list args; 39 va_list args;
40 40
41 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 41 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
42 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
42 return 0; 43 return 0;
43 44
44 va_start(args, fmt); 45 va_start(args, fmt);
45 vprintk(fmt, args); 46 vprintk(fmt, args);
46 va_end(args); 47 va_end(args);
47 48
48 fs_err(sdp, "about to withdraw this file system\n"); 49 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
49 BUG_ON(sdp->sd_args.ar_debug); 50 fs_err(sdp, "about to withdraw this file system\n");
51 BUG_ON(sdp->sd_args.ar_debug);
50 52
51 kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); 53 kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
52 54
53 if (lm->lm_unmount) { 55 if (lm->lm_unmount) {
54 fs_err(sdp, "telling LM to unmount\n"); 56 fs_err(sdp, "telling LM to unmount\n");
55 lm->lm_unmount(sdp); 57 lm->lm_unmount(sdp);
58 }
59 fs_err(sdp, "withdrawn\n");
60 dump_stack();
56 } 61 }
57 fs_err(sdp, "withdrawn\n"); 62
58 dump_stack(); 63 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
64 panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
59 65
60 return -1; 66 return -1;
61} 67}
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
93 gfs2_tune_get(sdp, gt_complain_secs) * HZ)) 99 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
94 return -2; 100 return -2;
95 101
96 printk(KERN_WARNING 102 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
97 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" 103 printk(KERN_WARNING
98 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 104 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
99 sdp->sd_fsname, assertion, 105 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
100 sdp->sd_fsname, function, file, line); 106 sdp->sd_fsname, assertion,
107 sdp->sd_fsname, function, file, line);
101 108
102 if (sdp->sd_args.ar_debug) 109 if (sdp->sd_args.ar_debug)
103 BUG(); 110 BUG();
104 else 111 else
105 dump_stack(); 112 dump_stack();
106 113
114 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
115 panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
116 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
117 sdp->sd_fsname, assertion,
118 sdp->sd_fsname, function, file, line);
119
107 sdp->sd_last_warning = jiffies; 120 sdp->sd_last_warning = jiffies;
108 121
109 return -1; 122 return -1;
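
With the errors= plumbing in place, gfs2_lm_withdraw() and gfs2_assert_warn_i() both become a dispatch on sdp->sd_args.ar_errors. Schematically (a sketch, not the verbatim functions):

	switch (sdp->sd_args.ar_errors) {
	case GFS2_ERRORS_WITHDRAW:
		/* Shut the fs down locally: KOBJ_OFFLINE uevent,
		 * lm_unmount, dump_stack(). */
		break;
	case GFS2_ERRORS_PANIC:
		/* Take the whole node down instead. */
		panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
	}
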
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c
index 07ea9529adda..8a0f8ef6ee27 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/xattr.c
@@ -18,8 +18,7 @@
18#include "gfs2.h" 18#include "gfs2.h"
19#include "incore.h" 19#include "incore.h"
20#include "acl.h" 20#include "acl.h"
21#include "eaops.h" 21#include "xattr.h"
22#include "eattr.h"
23#include "glock.h" 22#include "glock.h"
24#include "inode.h" 23#include "inode.h"
25#include "meta_io.h" 24#include "meta_io.h"
@@ -38,26 +37,32 @@
38 * Returns: 1 if the EA should be stuffed 37 * Returns: 1 if the EA should be stuffed
39 */ 38 */
40 39
41static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, 40static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
42 unsigned int *size) 41 unsigned int *size)
43{ 42{
44 *size = GFS2_EAREQ_SIZE_STUFFED(er); 43 unsigned int jbsize = sdp->sd_jbsize;
45 if (*size <= sdp->sd_jbsize) 44
45 /* Stuffed */
46 *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
47
48 if (*size <= jbsize)
46 return 1; 49 return 1;
47 50
48 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); 51 /* Unstuffed */
52 *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
53 (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
49 54
50 return 0; 55 return 0;
51} 56}
52 57
53static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) 58static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
54{ 59{
55 unsigned int size; 60 unsigned int size;
56 61
57 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) 62 if (dsize > GFS2_EA_MAX_DATA_LEN)
58 return -ERANGE; 63 return -ERANGE;
59 64
60 ea_calc_size(sdp, er, &size); 65 ea_calc_size(sdp, nsize, dsize, &size);
61 66
62 /* This can only happen with 512 byte blocks */ 67 /* This can only happen with 512 byte blocks */
63 if (size > sdp->sd_jbsize) 68 if (size > sdp->sd_jbsize)
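
A worked example may help with the stuffed/unstuffed split above. Assuming, for illustration only, a 16-byte struct gfs2_ea_header and sd_jbsize == 4072 (a 4KiB block minus a 24-byte metadata header):

	/* Stuffed: header + name + data, rounded up to 8 bytes.
	 * A 10-byte name with 100 bytes of data fits in-block. */
	size = ALIGN(16 + 10 + 100, 8);		/* 128 <= 4072: stuffed */

	/* Unstuffed: header + name + one __be64 pointer per data block.
	 * A 16KiB value needs DIV_ROUND_UP(16384, 4072) == 5 pointers. */
	size = ALIGN(16 + 10 + 8 * 5, 8);	/* 72 bytes of header space */
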
@@ -151,7 +156,9 @@ out:
151} 156}
152 157
153struct ea_find { 158struct ea_find {
154 struct gfs2_ea_request *ef_er; 159 int type;
160 const char *name;
161 size_t namel;
155 struct gfs2_ea_location *ef_el; 162 struct gfs2_ea_location *ef_el;
156}; 163};
157 164
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
160 void *private) 167 void *private)
161{ 168{
162 struct ea_find *ef = private; 169 struct ea_find *ef = private;
163 struct gfs2_ea_request *er = ef->ef_er;
164 170
165 if (ea->ea_type == GFS2_EATYPE_UNUSED) 171 if (ea->ea_type == GFS2_EATYPE_UNUSED)
166 return 0; 172 return 0;
167 173
168 if (ea->ea_type == er->er_type) { 174 if (ea->ea_type == ef->type) {
169 if (ea->ea_name_len == er->er_name_len && 175 if (ea->ea_name_len == ef->namel &&
170 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { 176 !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
171 struct gfs2_ea_location *el = ef->ef_el; 177 struct gfs2_ea_location *el = ef->ef_el;
172 get_bh(bh); 178 get_bh(bh);
173 el->el_bh = bh; 179 el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
180 return 0; 186 return 0;
181} 187}
182 188
183int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, 189int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
184 struct gfs2_ea_location *el) 190 struct gfs2_ea_location *el)
185{ 191{
186 struct ea_find ef; 192 struct ea_find ef;
187 int error; 193 int error;
188 194
189 ef.ef_er = er; 195 ef.type = type;
196 ef.name = name;
197 ef.namel = strlen(name);
190 ef.ef_el = el; 198 ef.ef_el = el;
191 199
192 memset(el, 0, sizeof(struct gfs2_ea_location)); 200 memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
344 unsigned int ei_size; 352 unsigned int ei_size;
345}; 353};
346 354
355static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
356{
357 switch (ea->ea_type) {
358 case GFS2_EATYPE_USR:
359 return 5 + ea->ea_name_len + 1;
360 case GFS2_EATYPE_SYS:
361 return 7 + ea->ea_name_len + 1;
362 case GFS2_EATYPE_SECURITY:
363 return 9 + ea->ea_name_len + 1;
364 default:
365 return 0;
366 }
367}
368
347static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, 369static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
348 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, 370 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
349 void *private) 371 void *private)
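
The constants 5, 7, and 9 in gfs2_ea_strlen() above are the lengths of the "user.", "system.", and "security." prefixes; the function returns the length of the full prefixed name plus its terminating NUL, which is what listxattr consumers count. For example:

	/* ea_type == GFS2_EATYPE_USR, ea_name_len == 3 ("foo"):
	 * strlen("user.") + 3 + 1 == 9 bytes, covering "user.foo\0". */
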
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
392} 414}
393 415
394/** 416/**
395 * gfs2_ea_list - 417 * gfs2_listxattr - List gfs2 extended attributes
396 * @ip: 418 * @dentry: The dentry whose inode we are interested in
397 * @er: 419 * @buffer: The buffer to write the results
420 * @size: The size of the buffer
398 * 421 *
399 * Returns: actual size of data on success, -errno on error 422 * Returns: actual size of data on success, -errno on error
400 */ 423 */
401 424
402int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) 425ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
403{ 426{
427 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
428 struct gfs2_ea_request er;
404 struct gfs2_holder i_gh; 429 struct gfs2_holder i_gh;
405 int error; 430 int error;
406 431
407 if (!er->er_data || !er->er_data_len) { 432 memset(&er, 0, sizeof(struct gfs2_ea_request));
408 er->er_data = NULL; 433 if (size) {
409 er->er_data_len = 0; 434 er.er_data = buffer;
435 er.er_data_len = size;
410 } 436 }
411 437
412 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 438 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
414 return error; 440 return error;
415 441
416 if (ip->i_eattr) { 442 if (ip->i_eattr) {
417 struct ea_list ei = { .ei_er = er, .ei_size = 0 }; 443 struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
418 444
419 error = ea_foreach(ip, ea_list_i, &ei); 445 error = ea_foreach(ip, ea_list_i, &ei);
420 if (!error) 446 if (!error)
@@ -491,84 +517,61 @@ out:
491} 517}
492 518
493int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, 519int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
494 char *data) 520 char *data, size_t size)
495{ 521{
522 int ret;
523 size_t len = GFS2_EA_DATA_LEN(el->el_ea);
524 if (len > size)
525 return -ERANGE;
526
496 if (GFS2_EA_IS_STUFFED(el->el_ea)) { 527 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
497 memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); 528 memcpy(data, GFS2_EA2DATA(el->el_ea), len);
498 return 0; 529 return len;
499 } else 530 }
500 return ea_get_unstuffed(ip, el->el_ea, data); 531 ret = ea_get_unstuffed(ip, el->el_ea, data);
532 if (ret < 0)
533 return ret;
534 return len;
501} 535}
502 536
503/** 537/**
504 * gfs2_ea_get_i - 538 * gfs2_xattr_get - Get a GFS2 extended attribute
505 * @ip: The GFS2 inode 539 * @inode: The inode
506 * @er: The request structure 540 * @type: The type of extended attribute
541 * @name: The name of the extended attribute
542 * @buffer: The buffer to write the result into
543 * @size: The size of the buffer
507 * 544 *
508 * Returns: actual size of data on success, -errno on error 545 * Returns: actual size of data on success, -errno on error
509 */ 546 */
510 547
511int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) 548int gfs2_xattr_get(struct inode *inode, int type, const char *name,
549 void *buffer, size_t size)
512{ 550{
551 struct gfs2_inode *ip = GFS2_I(inode);
513 struct gfs2_ea_location el; 552 struct gfs2_ea_location el;
514 int error; 553 int error;
515 554
516 if (!ip->i_eattr) 555 if (!ip->i_eattr)
517 return -ENODATA; 556 return -ENODATA;
557 if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
558 return -EINVAL;
518 559
519 error = gfs2_ea_find(ip, er, &el); 560 error = gfs2_ea_find(ip, type, name, &el);
520 if (error) 561 if (error)
521 return error; 562 return error;
522 if (!el.el_ea) 563 if (!el.el_ea)
523 return -ENODATA; 564 return -ENODATA;
524 565 if (size)
525 if (er->er_data_len) { 566 error = gfs2_ea_get_copy(ip, &el, buffer, size);
526 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) 567 else
527 error = -ERANGE;
528 else
529 error = gfs2_ea_get_copy(ip, &el, er->er_data);
530 }
531 if (!error)
532 error = GFS2_EA_DATA_LEN(el.el_ea); 568 error = GFS2_EA_DATA_LEN(el.el_ea);
533
534 brelse(el.el_bh); 569 brelse(el.el_bh);
535 570
536 return error; 571 return error;
537} 572}
538 573
539/** 574/**
540 * gfs2_ea_get -
541 * @ip: The GFS2 inode
542 * @er: The request structure
543 *
544 * Returns: actual size of data on success, -errno on error
545 */
546
547int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
548{
549 struct gfs2_holder i_gh;
550 int error;
551
552 if (!er->er_name_len ||
553 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
554 return -EINVAL;
555 if (!er->er_data || !er->er_data_len) {
556 er->er_data = NULL;
557 er->er_data_len = 0;
558 }
559
560 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
561 if (error)
562 return error;
563
564 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
565
566 gfs2_glock_dq_uninit(&i_gh);
567
568 return error;
569}
570
571/**
572 * ea_alloc_blk - allocates a new block for extended attributes. 575 * ea_alloc_blk - allocates a new block for extended attributes.
573 * @ip: A pointer to the inode that's getting extended attributes 576 * @ip: A pointer to the inode that's getting extended attributes
574 * @bhp: Pointer to pointer to a struct buffer_head 577 * @bhp: Pointer to pointer to a struct buffer_head
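
The new gfs2_xattr_get() above implements the usual xattr probe protocol: pass size == 0 to learn the value length, then call again with a buffer (getting -ERANGE if the value no longer fits). A hypothetical in-kernel caller, with the attribute name chosen purely for illustration:

	int len;
	char *buf;

	/* First pass: size == 0 returns the attribute's length. */
	len = gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, "selinux", NULL, 0);
	if (len < 0)
		return len;

	buf = kmalloc(len, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	/* Second pass: copy the value out. */
	len = gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, "selinux", buf, len);
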
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
713 716
714 error = gfs2_meta_inode_buffer(ip, &dibh); 717 error = gfs2_meta_inode_buffer(ip, &dibh);
715 if (!error) { 718 if (!error) {
716 if (er->er_flags & GFS2_ERF_MODE) {
717 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
718 (ip->i_inode.i_mode & S_IFMT) ==
719 (er->er_mode & S_IFMT));
720 ip->i_inode.i_mode = er->er_mode;
721 }
722 ip->i_inode.i_ctime = CURRENT_TIME; 719 ip->i_inode.i_ctime = CURRENT_TIME;
723 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 720 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
724 gfs2_dinode_out(ip, dibh->b_data); 721 gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
762 * Returns: errno 759 * Returns: errno
763 */ 760 */
764 761
765static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) 762static int ea_init(struct gfs2_inode *ip, int type, const char *name,
763 const void *data, size_t size)
766{ 764{
765 struct gfs2_ea_request er;
767 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; 766 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
768 unsigned int blks = 1; 767 unsigned int blks = 1;
769 768
770 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) 769 er.er_type = type;
771 blks += DIV_ROUND_UP(er->er_data_len, jbsize); 770 er.er_name = name;
771 er.er_name_len = strlen(name);
772 er.er_data = (void *)data;
773 er.er_data_len = size;
774
775 if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
776 blks += DIV_ROUND_UP(er.er_data_len, jbsize);
772 777
773 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); 778 return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
774} 779}
775 780
776static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) 781static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
848 error = gfs2_meta_inode_buffer(ip, &dibh); 853 error = gfs2_meta_inode_buffer(ip, &dibh);
849 if (error) 854 if (error)
850 goto out; 855 goto out;
851
852 if (er->er_flags & GFS2_ERF_MODE) {
853 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
854 (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
855 ip->i_inode.i_mode = er->er_mode;
856 }
857 ip->i_inode.i_ctime = CURRENT_TIME; 856 ip->i_inode.i_ctime = CURRENT_TIME;
858 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 857 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
859 gfs2_dinode_out(ip, dibh->b_data); 858 gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
894 int stuffed; 893 int stuffed;
895 int error; 894 int error;
896 895
897 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); 896 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
897 es->es_er->er_data_len, &size);
898 898
899 if (ea->ea_type == GFS2_EATYPE_UNUSED) { 899 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
900 if (GFS2_EA_REC_LEN(ea) < size) 900 if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1005,22 @@ out:
1005 return error; 1005 return error;
1006} 1006}
1007 1007
1008static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, 1008static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
1009 struct gfs2_ea_location *el) 1009 const void *value, size_t size, struct gfs2_ea_location *el)
1010{ 1010{
1011 struct gfs2_ea_request er;
1011 struct ea_set es; 1012 struct ea_set es;
1012 unsigned int blks = 2; 1013 unsigned int blks = 2;
1013 int error; 1014 int error;
1014 1015
1016 er.er_type = type;
1017 er.er_name = name;
1018 er.er_data = (void *)value;
1019 er.er_name_len = strlen(name);
1020 er.er_data_len = size;
1021
1015 memset(&es, 0, sizeof(struct ea_set)); 1022 memset(&es, 0, sizeof(struct ea_set));
1016 es.es_er = er; 1023 es.es_er = &er;
1017 es.es_el = el; 1024 es.es_el = el;
1018 1025
1019 error = ea_foreach(ip, ea_set_simple, &es); 1026 error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1024 1031
1025 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) 1032 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
1026 blks++; 1033 blks++;
1027 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) 1034 if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1028 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); 1035 blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1029 1036
1030 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); 1037 return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
1031} 1038}
1032 1039
1033static int ea_set_remove_unstuffed(struct gfs2_inode *ip, 1040static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1039 GFS2_EA2NEXT(el->el_prev) == el->el_ea); 1046 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1040 } 1047 }
1041 1048
1042 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); 1049 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1043}
1044
1045int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1046{
1047 struct gfs2_ea_location el;
1048 int error;
1049
1050 if (!ip->i_eattr) {
1051 if (er->er_flags & XATTR_REPLACE)
1052 return -ENODATA;
1053 return ea_init(ip, er);
1054 }
1055
1056 error = gfs2_ea_find(ip, er, &el);
1057 if (error)
1058 return error;
1059
1060 if (el.el_ea) {
1061 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1062 brelse(el.el_bh);
1063 return -EPERM;
1064 }
1065
1066 error = -EEXIST;
1067 if (!(er->er_flags & XATTR_CREATE)) {
1068 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1069 error = ea_set_i(ip, er, &el);
1070 if (!error && unstuffed)
1071 ea_set_remove_unstuffed(ip, &el);
1072 }
1073
1074 brelse(el.el_bh);
1075 } else {
1076 error = -ENODATA;
1077 if (!(er->er_flags & XATTR_REPLACE))
1078 error = ea_set_i(ip, er, NULL);
1079 }
1080
1081 return error;
1082}
1083
1084int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1085{
1086 struct gfs2_holder i_gh;
1087 int error;
1088
1089 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1090 return -EINVAL;
1091 if (!er->er_data || !er->er_data_len) {
1092 er->er_data = NULL;
1093 er->er_data_len = 0;
1094 }
1095 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1096 if (error)
1097 return error;
1098
1099 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1100 if (error)
1101 return error;
1102
1103 if (IS_IMMUTABLE(&ip->i_inode))
1104 error = -EPERM;
1105 else
1106 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1107
1108 gfs2_glock_dq_uninit(&i_gh);
1109
1110 return error;
1111} 1050}
1112 1051
1113static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) 1052static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1131 1070
1132 if (GFS2_EA_IS_LAST(ea)) 1071 if (GFS2_EA_IS_LAST(ea))
1133 prev->ea_flags |= GFS2_EAFLAG_LAST; 1072 prev->ea_flags |= GFS2_EAFLAG_LAST;
1134 } else 1073 } else {
1135 ea->ea_type = GFS2_EATYPE_UNUSED; 1074 ea->ea_type = GFS2_EATYPE_UNUSED;
1075 }
1136 1076
1137 error = gfs2_meta_inode_buffer(ip, &dibh); 1077 error = gfs2_meta_inode_buffer(ip, &dibh);
1138 if (!error) { 1078 if (!error) {
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1147 return error; 1087 return error;
1148} 1088}
1149 1089
1150int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) 1090/**
1091 * gfs2_xattr_remove - Remove a GFS2 extended attribute
1092 * @inode: The inode
1093 * @type: The type of the extended attribute
1094 * @name: The name of the extended attribute
1095 *
1096 * This is not called directly by the VFS since we use the (common)
1097 * scheme of making a "set with NULL data" mean a remove request. Note
1098 * that this is different from a set with zero length data.
1099 *
1100 * Returns: 0, or errno on failure
1101 */
1102
1103static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
1151{ 1104{
1105 struct gfs2_inode *ip = GFS2_I(inode);
1152 struct gfs2_ea_location el; 1106 struct gfs2_ea_location el;
1153 int error; 1107 int error;
1154 1108
1155 if (!ip->i_eattr) 1109 if (!ip->i_eattr)
1156 return -ENODATA; 1110 return -ENODATA;
1157 1111
1158 error = gfs2_ea_find(ip, er, &el); 1112 error = gfs2_ea_find(ip, type, name, &el);
1159 if (error) 1113 if (error)
1160 return error; 1114 return error;
1161 if (!el.el_ea) 1115 if (!el.el_ea)
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1164 if (GFS2_EA_IS_STUFFED(el.el_ea)) 1118 if (GFS2_EA_IS_STUFFED(el.el_ea))
1165 error = ea_remove_stuffed(ip, &el); 1119 error = ea_remove_stuffed(ip, &el);
1166 else 1120 else
1167 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 1121 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
1168 0);
1169 1122
1170 brelse(el.el_bh); 1123 brelse(el.el_bh);
1171 1124
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1173} 1126}
1174 1127
1175/** 1128/**
1176 * gfs2_ea_remove - sets (or creates or replaces) an extended attribute 1129 * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
1177 * @ip: pointer to the inode of the target file 1130 * @inode: The inode
1178 * @er: request information 1131 * @type: The type of the extended attribute
1132 * @name: The name of the extended attribute
1133 * @value: The value of the extended attribute (NULL for remove)
1134 * @size: The size of the @value argument
1135 * @flags: Create or Replace
1179 * 1136 *
1180 * Returns: errno 1137 * See gfs2_xattr_remove() for details of the removal of xattrs.
1138 *
1139 * Returns: 0 or errno on failure
1181 */ 1140 */
1182 1141
1183int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) 1142int gfs2_xattr_set(struct inode *inode, int type, const char *name,
1143 const void *value, size_t size, int flags)
1184{ 1144{
1185 struct gfs2_holder i_gh; 1145 struct gfs2_sbd *sdp = GFS2_SB(inode);
1146 struct gfs2_inode *ip = GFS2_I(inode);
1147 struct gfs2_ea_location el;
1148 unsigned int namel = strlen(name);
1186 int error; 1149 int error;
1187 1150
1188 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) 1151 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1189 return -EINVAL; 1152 return -EPERM;
1153 if (namel > GFS2_EA_MAX_NAME_LEN)
1154 return -ERANGE;
1190 1155
1191 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 1156 if (value == NULL)
1157 return gfs2_xattr_remove(inode, type, name);
1158
1159 if (ea_check_size(sdp, namel, size))
1160 return -ERANGE;
1161
1162 if (!ip->i_eattr) {
1163 if (flags & XATTR_REPLACE)
1164 return -ENODATA;
1165 return ea_init(ip, type, name, value, size);
1166 }
1167
1168 error = gfs2_ea_find(ip, type, name, &el);
1192 if (error) 1169 if (error)
1193 return error; 1170 return error;
1194 1171
1195 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) 1172 if (el.el_ea) {
1196 error = -EPERM; 1173 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1197 else 1174 brelse(el.el_bh);
1198 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); 1175 return -EPERM;
1176 }
1199 1177
1200 gfs2_glock_dq_uninit(&i_gh); 1178 error = -EEXIST;
1179 if (!(flags & XATTR_CREATE)) {
1180 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1181 error = ea_set_i(ip, type, name, value, size, &el);
1182 if (!error && unstuffed)
1183 ea_set_remove_unstuffed(ip, &el);
1184 }
1185
1186 brelse(el.el_bh);
1187 return error;
1188 }
1189
1190 error = -ENODATA;
1191 if (!(flags & XATTR_REPLACE))
1192 error = ea_set_i(ip, type, name, value, size, NULL);
1201 1193
1202 return error; 1194 return error;
1203} 1195}
@@ -1503,3 +1495,64 @@ out_alloc:
1503 return error; 1495 return error;
1504} 1496}
1505 1497
1498static int gfs2_xattr_user_get(struct inode *inode, const char *name,
1499 void *buffer, size_t size)
1500{
1501 return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
1502}
1503
1504static int gfs2_xattr_user_set(struct inode *inode, const char *name,
1505 const void *value, size_t size, int flags)
1506{
1507 return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
1508}
1509
1510static int gfs2_xattr_system_get(struct inode *inode, const char *name,
1511 void *buffer, size_t size)
1512{
1513 return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
1514}
1515
1516static int gfs2_xattr_system_set(struct inode *inode, const char *name,
1517 const void *value, size_t size, int flags)
1518{
1519 return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
1520}
1521
1522static int gfs2_xattr_security_get(struct inode *inode, const char *name,
1523 void *buffer, size_t size)
1524{
1525 return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
1526}
1527
1528static int gfs2_xattr_security_set(struct inode *inode, const char *name,
1529 const void *value, size_t size, int flags)
1530{
1531 return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
1532}
1533
1534static struct xattr_handler gfs2_xattr_user_handler = {
1535 .prefix = XATTR_USER_PREFIX,
1536 .get = gfs2_xattr_user_get,
1537 .set = gfs2_xattr_user_set,
1538};
1539
1540static struct xattr_handler gfs2_xattr_security_handler = {
1541 .prefix = XATTR_SECURITY_PREFIX,
1542 .get = gfs2_xattr_security_get,
1543 .set = gfs2_xattr_security_set,
1544};
1545
1546static struct xattr_handler gfs2_xattr_system_handler = {
1547 .prefix = XATTR_SYSTEM_PREFIX,
1548 .get = gfs2_xattr_system_get,
1549 .set = gfs2_xattr_system_set,
1550};
1551
1552struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler,
1556 NULL,
1557};
1558
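
The handler table above is what lets the file-level ops earlier in this diff call generic_getxattr()/generic_setxattr(): the VFS matches the name against each handler's prefix, strips it, and routes the suffix to the per-namespace thunk. Registration is a one-line assignment at mount time; a sketch, with the actual hookup assumed to live elsewhere in this series:

	/* Wire the table into the superblock so that a lookup of
	 * "user.foo" ends up in gfs2_xattr_user_get(inode, "foo", ...). */
	sb->s_xattr = gfs2_xattr_handlers;
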
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h
index c82dbe01d713..cbdfd7743733 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
19#define GFS2_EA_SIZE(ea) \ 19#define GFS2_EA_SIZE(ea) \
20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ 20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ 21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
22 (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) 22 (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
23 23
24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) 24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) 25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
27#define GFS2_EAREQ_SIZE_STUFFED(er) \ 27#define GFS2_EAREQ_SIZE_STUFFED(er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) 28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
29 29
30#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
31ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
32 sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
33
34#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) 30#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
35#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) 31#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
36 32
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
43#define GFS2_EA_BH2FIRST(bh) \ 39#define GFS2_EA_BH2FIRST(bh) \
44((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) 40((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
45 41
46#define GFS2_ERF_MODE 0x80000000
47
48struct gfs2_ea_request { 42struct gfs2_ea_request {
49 const char *er_name; 43 const char *er_name;
50 char *er_data; 44 char *er_data;
51 unsigned int er_name_len; 45 unsigned int er_name_len;
52 unsigned int er_data_len; 46 unsigned int er_data_len;
53 unsigned int er_type; /* GFS2_EATYPE_... */ 47 unsigned int er_type; /* GFS2_EATYPE_... */
54 int er_flags;
55 mode_t er_mode;
56}; 48};
57 49
58struct gfs2_ea_location { 50struct gfs2_ea_location {
@@ -61,40 +53,20 @@ struct gfs2_ea_location {
61 struct gfs2_ea_header *el_prev; 53 struct gfs2_ea_header *el_prev;
62}; 54};
63 55
64int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 56extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
65int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 57 void *buffer, size_t size);
66int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 58extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
67 59 const void *value, size_t size, int flags);
68int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); 60extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
69int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); 61extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
70int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
71int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
72
73int gfs2_ea_dealloc(struct gfs2_inode *ip);
74 62
75/* Exported to acl.c */ 63/* Exported to acl.c */
76 64
77int gfs2_ea_find(struct gfs2_inode *ip, 65extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
78 struct gfs2_ea_request *er, 66 struct gfs2_ea_location *el);
79 struct gfs2_ea_location *el); 67extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
80int gfs2_ea_get_copy(struct gfs2_inode *ip, 68 char *data, size_t size);
81 struct gfs2_ea_location *el, 69extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
82 char *data); 70 struct iattr *attr, char *data);
83int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
84 struct iattr *attr, char *data);
85
86static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
87{
88 switch (ea->ea_type) {
89 case GFS2_EATYPE_USR:
90 return 5 + ea->ea_name_len + 1;
91 case GFS2_EATYPE_SYS:
92 return 7 + ea->ea_name_len + 1;
93 case GFS2_EATYPE_SECURITY:
94 return 9 + ea->ea_name_len + 1;
95 default:
96 return 0;
97 }
98}
99 71
100#endif /* __EATTR_DOT_H__ */ 72#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 7b6165f25fbe..8bbe03c3f6d5 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -344,10 +344,8 @@ void hfs_mdb_put(struct super_block *sb)
344 brelse(HFS_SB(sb)->mdb_bh); 344 brelse(HFS_SB(sb)->mdb_bh);
345 brelse(HFS_SB(sb)->alt_mdb_bh); 345 brelse(HFS_SB(sb)->alt_mdb_bh);
346 346
347 if (HFS_SB(sb)->nls_io) 347 unload_nls(HFS_SB(sb)->nls_io);
348 unload_nls(HFS_SB(sb)->nls_io); 348 unload_nls(HFS_SB(sb)->nls_disk);
349 if (HFS_SB(sb)->nls_disk)
350 unload_nls(HFS_SB(sb)->nls_disk);
351 349
352 free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); 350 free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
353 kfree(HFS_SB(sb)); 351 kfree(HFS_SB(sb));
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c0759fe0855b..43022f3d5148 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -229,8 +229,7 @@ static void hfsplus_put_super(struct super_block *sb)
229 iput(HFSPLUS_SB(sb).alloc_file); 229 iput(HFSPLUS_SB(sb).alloc_file);
230 iput(HFSPLUS_SB(sb).hidden_dir); 230 iput(HFSPLUS_SB(sb).hidden_dir);
231 brelse(HFSPLUS_SB(sb).s_vhbh); 231 brelse(HFSPLUS_SB(sb).s_vhbh);
232 if (HFSPLUS_SB(sb).nls) 232 unload_nls(HFSPLUS_SB(sb).nls);
233 unload_nls(HFSPLUS_SB(sb).nls);
234 kfree(sb->s_fs_info); 233 kfree(sb->s_fs_info);
235 sb->s_fs_info = NULL; 234 sb->s_fs_info = NULL;
236 235
@@ -464,8 +463,7 @@ out:
464 463
465cleanup: 464cleanup:
466 hfsplus_put_super(sb); 465 hfsplus_put_super(sb);
467 if (nls) 466 unload_nls(nls);
468 unload_nls(nls);
469 return err; 467 return err;
470} 468}
471 469
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cb88dac8ccaa..87a1258953b8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,12 +31,10 @@
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/ima.h> 33#include <linux/ima.h>
34#include <linux/magic.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
37/* some random number */
38#define HUGETLBFS_MAGIC 0x958458f6
39
40static const struct super_operations hugetlbfs_ops; 38static const struct super_operations hugetlbfs_ops;
41static const struct address_space_operations hugetlbfs_aops; 39static const struct address_space_operations hugetlbfs_aops;
42const struct file_operations hugetlbfs_file_operations; 40const struct file_operations hugetlbfs_file_operations;
@@ -44,6 +42,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
44static const struct inode_operations hugetlbfs_inode_operations; 42static const struct inode_operations hugetlbfs_inode_operations;
45 43
46static struct backing_dev_info hugetlbfs_backing_dev_info = { 44static struct backing_dev_info hugetlbfs_backing_dev_info = {
45 .name = "hugetlbfs",
47 .ra_pages = 0, /* No readahead */ 46 .ra_pages = 0, /* No readahead */
48 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 47 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
49}; 48};
@@ -381,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode)
381 380
382static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock) 381static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
383{ 382{
384 struct super_block *sb = inode->i_sb; 383 if (generic_detach_inode(inode)) {
385 384 truncate_hugepages(inode, 0);
386 if (!hlist_unhashed(&inode->i_hash)) { 385 clear_inode(inode);
387 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 386 destroy_inode(inode);
388 list_move(&inode->i_list, &inode_unused);
389 inodes_stat.nr_unused++;
390 if (!sb || (sb->s_flags & MS_ACTIVE)) {
391 spin_unlock(&inode_lock);
392 return;
393 }
394 inode->i_state |= I_WILL_FREE;
395 spin_unlock(&inode_lock);
396 /*
397 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
398 * in our backing_dev_info.
399 */
400 write_inode_now(inode, 1);
401 spin_lock(&inode_lock);
402 inode->i_state &= ~I_WILL_FREE;
403 inodes_stat.nr_unused--;
404 hlist_del_init(&inode->i_hash);
405 } 387 }
406 list_del_init(&inode->i_list);
407 list_del_init(&inode->i_sb_list);
408 inode->i_state |= I_FREEING;
409 inodes_stat.nr_inodes--;
410 spin_unlock(&inode_lock);
411 truncate_hugepages(inode, 0);
412 clear_inode(inode);
413 destroy_inode(inode);
414} 388}
415 389
416static void hugetlbfs_drop_inode(struct inode *inode) 390static void hugetlbfs_drop_inode(struct inode *inode)
@@ -506,6 +480,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
506 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 480 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
507 INIT_LIST_HEAD(&inode->i_mapping->private_list); 481 INIT_LIST_HEAD(&inode->i_mapping->private_list);
508 info = HUGETLBFS_I(inode); 482 info = HUGETLBFS_I(inode);
483 /*
484 * The policy is initialized here even if we are creating a
485 * private inode because initialization simply creates an
486 * an empty rb tree and calls spin_lock_init(), later when we
487 * call mpol_free_shared_policy() it will just return because
488 * the rb tree will still be empty.
489 */
509 mpol_shared_policy_init(&info->policy, NULL); 490 mpol_shared_policy_init(&info->policy, NULL);
510 switch (mode & S_IFMT) { 491 switch (mode & S_IFMT) {
511 default: 492 default:
@@ -936,7 +917,7 @@ static int can_do_hugetlb_shm(void)
936} 917}
937 918
938struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, 919struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
939 struct user_struct **user) 920 struct user_struct **user, int creat_flags)
940{ 921{
941 int error = -ENOMEM; 922 int error = -ENOMEM;
942 struct file *file; 923 struct file *file;
@@ -948,7 +929,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
948 if (!hugetlbfs_vfsmount) 929 if (!hugetlbfs_vfsmount)
949 return ERR_PTR(-ENOENT); 930 return ERR_PTR(-ENOENT);
950 931
951 if (!can_do_hugetlb_shm()) { 932 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
952 *user = current_user(); 933 *user = current_user();
953 if (user_shm_lock(size, *user)) { 934 if (user_shm_lock(size, *user)) {
954 WARN_ONCE(1, 935 WARN_ONCE(1,
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..4d8e3be55976 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/rwsem.h>
17#include <linux/hash.h> 18#include <linux/hash.h>
18#include <linux/swap.h> 19#include <linux/swap.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -87,14 +88,18 @@ static struct hlist_head *inode_hashtable __read_mostly;
87DEFINE_SPINLOCK(inode_lock); 88DEFINE_SPINLOCK(inode_lock);
88 89
89/* 90/*
90 * iprune_mutex provides exclusion between the kswapd or try_to_free_pages 91 * iprune_sem provides exclusion between the kswapd or try_to_free_pages
91 * icache shrinking path, and the umount path. Without this exclusion, 92 * icache shrinking path, and the umount path. Without this exclusion,
92 * by the time prune_icache calls iput for the inode whose pages it has 93 * by the time prune_icache calls iput for the inode whose pages it has
93 * been invalidating, or by the time it calls clear_inode & destroy_inode 94 * been invalidating, or by the time it calls clear_inode & destroy_inode
94 * from its final dispose_list, the struct super_block they refer to 95 * from its final dispose_list, the struct super_block they refer to
95 * (for inode->i_sb->s_op) may already have been freed and reused. 96 * (for inode->i_sb->s_op) may already have been freed and reused.
97 *
98 * We make this an rwsem because the fastpath is icache shrinking. In
99 * some cases a filesystem may be doing a significant amount of work in
100 * its inode reclaim code, so this should improve parallelism.
96 */ 101 */
97static DEFINE_MUTEX(iprune_mutex); 102static DECLARE_RWSEM(iprune_sem);
98 103
99/* 104/*
100 * Statistics gathering.. 105 * Statistics gathering..
@@ -123,7 +128,7 @@ static void wake_up_inode(struct inode *inode)
123int inode_init_always(struct super_block *sb, struct inode *inode) 128int inode_init_always(struct super_block *sb, struct inode *inode)
124{ 129{
125 static const struct address_space_operations empty_aops; 130 static const struct address_space_operations empty_aops;
126 static struct inode_operations empty_iops; 131 static const struct inode_operations empty_iops;
127 static const struct file_operations empty_fops; 132 static const struct file_operations empty_fops;
128 struct address_space *const mapping = &inode->i_data; 133 struct address_space *const mapping = &inode->i_data;
129 134
@@ -182,9 +187,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
182 if (sb->s_bdev) { 187 if (sb->s_bdev) {
183 struct backing_dev_info *bdi; 188 struct backing_dev_info *bdi;
184 189
185 bdi = sb->s_bdev->bd_inode_backing_dev_info; 190 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
186 if (!bdi)
187 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
188 mapping->backing_dev_info = bdi; 191 mapping->backing_dev_info = bdi;
189 } 192 }
190 inode->i_private = NULL; 193 inode->i_private = NULL;
@@ -383,7 +386,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
383 /* 386 /*
384 * We can reschedule here without worrying about the list's 387 * We can reschedule here without worrying about the list's
385 * consistency because the per-sb list of inodes must not 388 * consistency because the per-sb list of inodes must not
386 * change during umount anymore, and because iprune_mutex keeps 389 * change during umount anymore, and because iprune_sem keeps
387 * shrink_icache_memory() away. 390 * shrink_icache_memory() away.
388 */ 391 */
389 cond_resched_lock(&inode_lock); 392 cond_resched_lock(&inode_lock);
@@ -422,7 +425,7 @@ int invalidate_inodes(struct super_block *sb)
422 int busy; 425 int busy;
423 LIST_HEAD(throw_away); 426 LIST_HEAD(throw_away);
424 427
425 mutex_lock(&iprune_mutex); 428 down_write(&iprune_sem);
426 spin_lock(&inode_lock); 429 spin_lock(&inode_lock);
427 inotify_unmount_inodes(&sb->s_inodes); 430 inotify_unmount_inodes(&sb->s_inodes);
428 fsnotify_unmount_inodes(&sb->s_inodes); 431 fsnotify_unmount_inodes(&sb->s_inodes);
@@ -430,7 +433,7 @@ int invalidate_inodes(struct super_block *sb)
430 spin_unlock(&inode_lock); 433 spin_unlock(&inode_lock);
431 434
432 dispose_list(&throw_away); 435 dispose_list(&throw_away);
433 mutex_unlock(&iprune_mutex); 436 up_write(&iprune_sem);
434 437
435 return busy; 438 return busy;
436} 439}
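
The iprune_mutex to iprune_sem conversion keeps umount exclusive while letting several icache shrinkers run concurrently, as the new comment above explains. A userspace model of that read/write split, with pthread_rwlock_t standing in for the rwsem (function names are illustrative):

#include <pthread.h>

static pthread_rwlock_t iprune = PTHREAD_RWLOCK_INITIALIZER;

static void shrink_icache(void)          /* fastpath: runs in parallel */
{
        pthread_rwlock_rdlock(&iprune);  /* kernel: down_read(&iprune_sem) */
        /* ... prune unused inodes, dispose_list() ... */
        pthread_rwlock_unlock(&iprune);  /* kernel: up_read() */
}

static void umount_invalidate(void)      /* must exclude every shrinker */
{
        pthread_rwlock_wrlock(&iprune);  /* kernel: down_write(&iprune_sem) */
        /* ... invalidate all inodes of the super block ... */
        pthread_rwlock_unlock(&iprune);  /* kernel: up_write() */
}

int main(void)
{
        shrink_icache();
        umount_invalidate();
        return 0;
}
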
@@ -469,7 +472,7 @@ static void prune_icache(int nr_to_scan)
469 int nr_scanned; 472 int nr_scanned;
470 unsigned long reap = 0; 473 unsigned long reap = 0;
471 474
472 mutex_lock(&iprune_mutex); 475 down_read(&iprune_sem);
473 spin_lock(&inode_lock); 476 spin_lock(&inode_lock);
474 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 477 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
475 struct inode *inode; 478 struct inode *inode;
@@ -511,7 +514,7 @@ static void prune_icache(int nr_to_scan)
511 spin_unlock(&inode_lock); 514 spin_unlock(&inode_lock);
512 515
513 dispose_list(&freeable); 516 dispose_list(&freeable);
514 mutex_unlock(&iprune_mutex); 517 up_read(&iprune_sem);
515} 518}
516 519
517/* 520/*
@@ -697,13 +700,15 @@ void unlock_new_inode(struct inode *inode)
697 } 700 }
698#endif 701#endif
699 /* 702 /*
700 * This is special! We do not need the spinlock 703 * This is special! We do not need the spinlock when clearing I_LOCK,
701 * when clearing I_LOCK, because we're guaranteed 704 * because we're guaranteed that nobody else tries to do anything about
702 * that nobody else tries to do anything about the 705 * the state of the inode when it is locked, as we just created it (so
703 * state of the inode when it is locked, as we 706 * there can be no old holders that haven't tested I_LOCK).
 704	 * just created it (so there can be no old holders 707	 * However, we must emit the memory barrier so that other CPUs reliably
705 * that haven't tested I_LOCK). 708 * see the clearing of I_LOCK after the other inode initialisation has
709 * completed.
706 */ 710 */
711 smp_mb();
707 WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); 712 WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
708 inode->i_state &= ~(I_LOCK|I_NEW); 713 inode->i_state &= ~(I_LOCK|I_NEW);
709 wake_up_inode(inode); 714 wake_up_inode(inode);
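
The smp_mb() added to unlock_new_inode() orders the inode initialisation before the store that clears I_LOCK, so any CPU that sees the flag cleared also sees a fully set-up inode. In portable C11 the same publish/observe guarantee is normally written as a release store paired with an acquire load; a compilable model of the idea (the kernel uses smp_mb() plus a plain store, and the waiter side supplies its own barrier, so this is an analogy rather than a transliteration):

#include <stdatomic.h>

struct obj {
        int payload;               /* stands in for the inode fields */
        atomic_int locked;         /* stands in for I_LOCK|I_NEW */
};

static void publish(struct obj *o)
{
        o->payload = 42;           /* initialisation ... */
        /* release: the store above cannot be reordered past this */
        atomic_store_explicit(&o->locked, 0, memory_order_release);
}

static int observe(struct obj *o)
{
        /* acquire: pairs with the release store in publish() */
        if (atomic_load_explicit(&o->locked, memory_order_acquire) == 0)
                return o->payload; /* guaranteed to read 42 */
        return -1;                 /* still locked */
}

int main(void)
{
        struct obj o = { 0, 1 };   /* starts "locked" */

        publish(&o);
        return observe(&o) == 42 ? 0 : 1;
}
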
@@ -1236,7 +1241,16 @@ void generic_delete_inode(struct inode *inode)
1236} 1241}
1237EXPORT_SYMBOL(generic_delete_inode); 1242EXPORT_SYMBOL(generic_delete_inode);
1238 1243
1239static void generic_forget_inode(struct inode *inode) 1244/**
1245 * generic_detach_inode - remove inode from inode lists
1246 * @inode: inode to remove
1247 *
1248 * Remove inode from inode lists, write it if it's dirty. This is just an
1249 * internal VFS helper exported for hugetlbfs. Do not use!
1250 *
1251 * Returns 1 if inode should be completely destroyed.
1252 */
1253int generic_detach_inode(struct inode *inode)
1240{ 1254{
1241 struct super_block *sb = inode->i_sb; 1255 struct super_block *sb = inode->i_sb;
1242 1256
@@ -1246,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode)
1246 inodes_stat.nr_unused++; 1260 inodes_stat.nr_unused++;
1247 if (sb->s_flags & MS_ACTIVE) { 1261 if (sb->s_flags & MS_ACTIVE) {
1248 spin_unlock(&inode_lock); 1262 spin_unlock(&inode_lock);
1249 return; 1263 return 0;
1250 } 1264 }
1251 WARN_ON(inode->i_state & I_NEW); 1265 WARN_ON(inode->i_state & I_NEW);
1252 inode->i_state |= I_WILL_FREE; 1266 inode->i_state |= I_WILL_FREE;
@@ -1264,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode)
1264 inode->i_state |= I_FREEING; 1278 inode->i_state |= I_FREEING;
1265 inodes_stat.nr_inodes--; 1279 inodes_stat.nr_inodes--;
1266 spin_unlock(&inode_lock); 1280 spin_unlock(&inode_lock);
1281 return 1;
1282}
1283EXPORT_SYMBOL_GPL(generic_detach_inode);
1284
1285static void generic_forget_inode(struct inode *inode)
1286{
1287 if (!generic_detach_inode(inode))
1288 return;
1267 if (inode->i_data.nrpages) 1289 if (inode->i_data.nrpages)
1268 truncate_inode_pages(&inode->i_data, 0); 1290 truncate_inode_pages(&inode->i_data, 0);
1269 clear_inode(inode); 1291 clear_inode(inode);
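
generic_forget_inode() is now a thin wrapper: generic_detach_inode() does the list and hash manipulation under inode_lock and reports whether teardown should go ahead. That return value is exactly what hugetlbfs needs (see the fs/hugetlbfs/inode.c hunk at the top of this diff), since only its page-teardown step differs from the generic path. A kernel-style sketch of a filesystem reusing the helper; this is not a standalone translation unit, and my_fs_truncate() is a hypothetical fs-specific routine:

static void my_fs_forget_inode(struct inode *inode)
{
        if (!generic_detach_inode(inode))
                return;                 /* inode kept: sb still active */
        my_fs_truncate(inode);          /* fs-specific page teardown */
        clear_inode(inode);
        destroy_inode(inode);
}
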
@@ -1394,31 +1416,31 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1394 struct inode *inode = dentry->d_inode; 1416 struct inode *inode = dentry->d_inode;
1395 struct timespec now; 1417 struct timespec now;
1396 1418
1397 if (mnt_want_write(mnt))
1398 return;
1399 if (inode->i_flags & S_NOATIME) 1419 if (inode->i_flags & S_NOATIME)
1400 goto out; 1420 return;
1401 if (IS_NOATIME(inode)) 1421 if (IS_NOATIME(inode))
1402 goto out; 1422 return;
1403 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) 1423 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1404 goto out; 1424 return;
1405 1425
1406 if (mnt->mnt_flags & MNT_NOATIME) 1426 if (mnt->mnt_flags & MNT_NOATIME)
1407 goto out; 1427 return;
1408 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1428 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1409 goto out; 1429 return;
1410 1430
1411 now = current_fs_time(inode->i_sb); 1431 now = current_fs_time(inode->i_sb);
1412 1432
1413 if (!relatime_need_update(mnt, inode, now)) 1433 if (!relatime_need_update(mnt, inode, now))
1414 goto out; 1434 return;
1415 1435
1416 if (timespec_equal(&inode->i_atime, &now)) 1436 if (timespec_equal(&inode->i_atime, &now))
1417 goto out; 1437 return;
1438
1439 if (mnt_want_write(mnt))
1440 return;
1418 1441
1419 inode->i_atime = now; 1442 inode->i_atime = now;
1420 mark_inode_dirty_sync(inode); 1443 mark_inode_dirty_sync(inode);
1421out:
1422 mnt_drop_write(mnt); 1444 mnt_drop_write(mnt);
1423} 1445}
1424EXPORT_SYMBOL(touch_atime); 1446EXPORT_SYMBOL(touch_atime);
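
The touch_atime() rewrite moves mnt_want_write() from the top of the function to just before the actual update: every cheap reject check (S_NOATIME, the mount flags, relatime, an unchanged atime) now runs first, so the common no-update case no longer takes and drops a write reference on the vfsmount. A compilable model of that check-cheap-predicates-before-acquiring-resources ordering (all names illustrative):

#include <stdio.h>

static int writers;                     /* models the mount write count */

static int  want_write(void) { writers++; return 0; }  /* can fail IRL */
static void drop_write(void) { writers--; }

static void touch_model(int noatime, int atime_current)
{
        if (noatime)            /* cheap checks first: no reference taken */
                return;
        if (atime_current)
                return;
        if (want_write())       /* only now pay for write access */
                return;
        /* ... update the timestamp, mark the inode dirty ... */
        drop_write();
}

int main(void)
{
        touch_model(1, 0);      /* noatime mount: fast path */
        touch_model(0, 0);      /* real update: ref taken and dropped */
        printf("writers=%d\n", writers);        /* prints 0 */
        return 0;
}
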
@@ -1439,34 +1461,37 @@ void file_update_time(struct file *file)
1439{ 1461{
1440 struct inode *inode = file->f_path.dentry->d_inode; 1462 struct inode *inode = file->f_path.dentry->d_inode;
1441 struct timespec now; 1463 struct timespec now;
1442 int sync_it = 0; 1464 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
1443 int err;
1444 1465
1466 /* First try to exhaust all avenues to not sync */
1445 if (IS_NOCMTIME(inode)) 1467 if (IS_NOCMTIME(inode))
1446 return; 1468 return;
1447 1469
1448 err = mnt_want_write_file(file);
1449 if (err)
1450 return;
1451
1452 now = current_fs_time(inode->i_sb); 1470 now = current_fs_time(inode->i_sb);
1453 if (!timespec_equal(&inode->i_mtime, &now)) { 1471 if (!timespec_equal(&inode->i_mtime, &now))
1454 inode->i_mtime = now; 1472 sync_it = S_MTIME;
1455 sync_it = 1;
1456 }
1457 1473
1458 if (!timespec_equal(&inode->i_ctime, &now)) { 1474 if (!timespec_equal(&inode->i_ctime, &now))
1459 inode->i_ctime = now; 1475 sync_it |= S_CTIME;
1460 sync_it = 1;
1461 }
1462 1476
1463 if (IS_I_VERSION(inode)) { 1477 if (IS_I_VERSION(inode))
1464 inode_inc_iversion(inode); 1478 sync_it |= S_VERSION;
1465 sync_it = 1; 1479
1466 } 1480 if (!sync_it)
1481 return;
1467 1482
1468 if (sync_it) 1483 /* Finally allowed to write? Takes lock. */
1469 mark_inode_dirty_sync(inode); 1484 if (mnt_want_write_file(file))
1485 return;
1486
1487 /* Only change inode inside the lock region */
1488 if (sync_it & S_VERSION)
1489 inode_inc_iversion(inode);
1490 if (sync_it & S_CTIME)
1491 inode->i_ctime = now;
1492 if (sync_it & S_MTIME)
1493 inode->i_mtime = now;
1494 mark_inode_dirty_sync(inode);
1470 mnt_drop_write(file->f_path.mnt); 1495 mnt_drop_write(file->f_path.mnt);
1471} 1496}
1472EXPORT_SYMBOL(file_update_time); 1497EXPORT_SYMBOL(file_update_time);
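
file_update_time() gets the same reordering plus a bitmask: it first records which of mtime, ctime and i_version would change, returns early when none would, and only then takes the write reference and applies every queued update inside it. A small compilable model of the decide-first, apply-under-the-lock pattern (the flag names mirror the patch's local enum; the rest is illustrative):

enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 };

struct ino { long mtime, ctime, version; };

static void update_time(struct ino *i, long now, int has_version)
{
        int sync_it = 0;

        if (i->mtime != now)            /* decide first, lock-free */
                sync_it |= S_MTIME;
        if (i->ctime != now)
                sync_it |= S_CTIME;
        if (has_version)
                sync_it |= S_VERSION;
        if (!sync_it)
                return;                 /* nothing to do: no write ref */

        /* mnt_want_write_file() would be taken here */
        if (sync_it & S_VERSION)
                i->version++;
        if (sync_it & S_CTIME)
                i->ctime = now;
        if (sync_it & S_MTIME)
                i->mtime = now;
        /* mark the inode dirty, then mnt_drop_write() */
}

int main(void)
{
        struct ino i = { 0, 0, 0 };

        update_time(&i, 5, 1);          /* sets mtime, ctime and version */
        return 0;
}
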
@@ -1594,7 +1619,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1594 else if (S_ISSOCK(mode)) 1619 else if (S_ISSOCK(mode))
1595 inode->i_fop = &bad_sock_fops; 1620 inode->i_fop = &bad_sock_fops;
1596 else 1621 else
1597 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", 1622 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
1598 mode); 1623 " inode %s:%lu\n", mode, inode->i_sb->s_id,
1624 inode->i_ino);
1599} 1625}
1600EXPORT_SYMBOL(init_special_inode); 1626EXPORT_SYMBOL(init_special_inode);
diff --git a/fs/internal.h b/fs/internal.h
index d55ef562f0bb..515175b8b72e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -57,6 +57,7 @@ extern int check_unsafe_exec(struct linux_binprm *);
57 * namespace.c 57 * namespace.c
58 */ 58 */
59extern int copy_mount_options(const void __user *, unsigned long *); 59extern int copy_mount_options(const void __user *, unsigned long *);
60extern int copy_mount_string(const void __user *, char **);
60 61
61extern void free_vfsmnt(struct vfsmount *); 62extern void free_vfsmnt(struct vfsmount *);
62extern struct vfsmount *alloc_vfsmnt(const char *); 63extern struct vfsmount *alloc_vfsmnt(const char *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5612880fcbe7..7b17a14396ff 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -162,20 +162,21 @@ EXPORT_SYMBOL(fiemap_check_flags);
162static int fiemap_check_ranges(struct super_block *sb, 162static int fiemap_check_ranges(struct super_block *sb,
163 u64 start, u64 len, u64 *new_len) 163 u64 start, u64 len, u64 *new_len)
164{ 164{
165 u64 maxbytes = (u64) sb->s_maxbytes;
166
165 *new_len = len; 167 *new_len = len;
166 168
167 if (len == 0) 169 if (len == 0)
168 return -EINVAL; 170 return -EINVAL;
169 171
170 if (start > sb->s_maxbytes) 172 if (start > maxbytes)
171 return -EFBIG; 173 return -EFBIG;
172 174
173 /* 175 /*
174 * Shrink request scope to what the fs can actually handle. 176 * Shrink request scope to what the fs can actually handle.
175 */ 177 */
176 if ((len > sb->s_maxbytes) || 178 if (len > maxbytes || (maxbytes - len) < start)
177 (sb->s_maxbytes - len) < start) 179 *new_len = maxbytes - start;
178 *new_len = sb->s_maxbytes - start;
179 180
180 return 0; 181 return 0;
181} 182}
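
fiemap_check_ranges() now caches the cast of s_maxbytes in a local u64 so all three comparisons happen at one unsigned width; the logic is unchanged: reject an empty or entirely out-of-range request, and clamp one that would run past what the filesystem can address. The clamp as a standalone function with a worked example (illustrative constants; the kernel returns -EINVAL and -EFBIG where this returns -1 and -2):

#include <inttypes.h>
#include <stdio.h>

static int check_ranges(uint64_t maxbytes, uint64_t start, uint64_t len,
                        uint64_t *new_len)
{
        *new_len = len;
        if (len == 0)
                return -1;                      /* -EINVAL */
        if (start > maxbytes)
                return -2;                      /* -EFBIG */
        if (len > maxbytes || maxbytes - len < start)
                *new_len = maxbytes - start;    /* shrink to what fits */
        return 0;
}

int main(void)
{
        uint64_t n;

        /* maxbytes 100: a request for bytes 90..149 is clamped to 90..99 */
        check_ranges(100, 90, 60, &n);
        printf("clamped len = %" PRIu64 "\n", n);       /* prints 10 */
        return 0;
}
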
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 85f96bc651c7..6b4dcd4f2943 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -46,10 +46,7 @@ static void isofs_put_super(struct super_block *sb)
46#ifdef CONFIG_JOLIET 46#ifdef CONFIG_JOLIET
47 lock_kernel(); 47 lock_kernel();
48 48
49 if (sbi->s_nls_iocharset) { 49 unload_nls(sbi->s_nls_iocharset);
50 unload_nls(sbi->s_nls_iocharset);
51 sbi->s_nls_iocharset = NULL;
52 }
53 50
54 unlock_kernel(); 51 unlock_kernel();
55#endif 52#endif
@@ -912,8 +909,7 @@ out_no_root:
912 printk(KERN_WARNING "%s: get root inode failed\n", __func__); 909 printk(KERN_WARNING "%s: get root inode failed\n", __func__);
913out_no_inode: 910out_no_inode:
914#ifdef CONFIG_JOLIET 911#ifdef CONFIG_JOLIET
915 if (sbi->s_nls_iocharset) 912 unload_nls(sbi->s_nls_iocharset);
916 unload_nls(sbi->s_nls_iocharset);
917#endif 913#endif
918 goto out_freesbi; 914 goto out_freesbi;
919out_no_read: 915out_no_read:
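
Both isofs hunks lean on unload_nls() having been made NULL-tolerant elsewhere in this merge, which turns the if (sbi->s_nls_iocharset) guards and the NULL reset into dead weight at every call site, the same way kfree(NULL) is a no-op. The pattern in a standalone sketch (unload() and struct table are illustrative):

#include <stddef.h>

struct table { int refcount; };

static void unload(struct table *t)
{
        if (!t)                 /* tolerate NULL: callers need no guard */
                return;
        t->refcount--;          /* ... real teardown would go here ... */
}

int main(void)
{
        struct table *cs = NULL;        /* e.g. option never set at mount */

        unload(cs);             /* safe without a check at the call site */
        return 0;
}
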
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
456{ 456{
457 transaction_t * transaction; 457 transaction_t * transaction;
458 tid_t first_tid; 458 tid_t first_tid;
459 unsigned long blocknr, freed; 459 unsigned int blocknr, freed;
460 460
461 if (is_journal_aborted(journal)) 461 if (is_journal_aborted(journal))
462 return 1; 462 return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
502 freed = freed + journal->j_last - journal->j_first; 502 freed = freed + journal->j_last - journal->j_first;
503 503
504 jbd_debug(1, 504 jbd_debug(1,
505 "Cleaning journal tail from %d to %d (offset %lu), " 505 "Cleaning journal tail from %d to %d (offset %u), "
506 "freeing %lu\n", 506 "freeing %u\n",
507 journal->j_tail_sequence, first_tid, blocknr, freed); 507 journal->j_tail_sequence, first_tid, blocknr, freed);
508 508
509 journal->j_free += freed; 509 journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
308 int bufs; 308 int bufs;
309 int flags; 309 int flags;
310 int err; 310 int err;
311 unsigned long blocknr; 311 unsigned int blocknr;
312 ktime_t start_time; 312 ktime_t start_time;
313 u64 commit_time; 313 u64 commit_time;
314 char *tagp = NULL; 314 char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
276int journal_write_metadata_buffer(transaction_t *transaction, 276int journal_write_metadata_buffer(transaction_t *transaction,
277 struct journal_head *jh_in, 277 struct journal_head *jh_in,
278 struct journal_head **jh_out, 278 struct journal_head **jh_out,
279 unsigned long blocknr) 279 unsigned int blocknr)
280{ 280{
281 int need_copy_out = 0; 281 int need_copy_out = 0;
282 int done_copy_out = 0; 282 int done_copy_out = 0;
@@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
567 * Log buffer allocation routines: 567 * Log buffer allocation routines:
568 */ 568 */
569 569
570int journal_next_log_block(journal_t *journal, unsigned long *retp) 570int journal_next_log_block(journal_t *journal, unsigned int *retp)
571{ 571{
572 unsigned long blocknr; 572 unsigned int blocknr;
573 573
574 spin_lock(&journal->j_state_lock); 574 spin_lock(&journal->j_state_lock);
575 J_ASSERT(journal->j_free > 1); 575 J_ASSERT(journal->j_free > 1);
@@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
590 * this is a no-op. If needed, we can use j_blk_offset - everything is 590 * this is a no-op. If needed, we can use j_blk_offset - everything is
591 * ready. 591 * ready.
592 */ 592 */
593int journal_bmap(journal_t *journal, unsigned long blocknr, 593int journal_bmap(journal_t *journal, unsigned int blocknr,
594 unsigned long *retp) 594 unsigned int *retp)
595{ 595{
596 int err = 0; 596 int err = 0;
597 unsigned long ret; 597 unsigned int ret;
598 598
599 if (journal->j_inode) { 599 if (journal->j_inode) {
600 ret = bmap(journal->j_inode, blocknr); 600 ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
604 char b[BDEVNAME_SIZE]; 604 char b[BDEVNAME_SIZE];
605 605
606 printk(KERN_ALERT "%s: journal block not found " 606 printk(KERN_ALERT "%s: journal block not found "
607 "at offset %lu on %s\n", 607 "at offset %u on %s\n",
608 __func__, 608 __func__,
609 blocknr, 609 blocknr,
610 bdevname(journal->j_dev, b)); 610 bdevname(journal->j_dev, b));
@@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
630struct journal_head *journal_get_descriptor_buffer(journal_t *journal) 630struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
631{ 631{
632 struct buffer_head *bh; 632 struct buffer_head *bh;
633 unsigned long blocknr; 633 unsigned int blocknr;
634 int err; 634 int err;
635 635
636 err = journal_next_log_block(journal, &blocknr); 636 err = journal_next_log_block(journal, &blocknr);
@@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
774 journal_t *journal = journal_init_common(); 774 journal_t *journal = journal_init_common();
775 int err; 775 int err;
776 int n; 776 int n;
777 unsigned long blocknr; 777 unsigned int blocknr;
778 778
779 if (!journal) 779 if (!journal)
780 return NULL; 780 return NULL;
@@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal)
846static int journal_reset(journal_t *journal) 846static int journal_reset(journal_t *journal)
847{ 847{
848 journal_superblock_t *sb = journal->j_superblock; 848 journal_superblock_t *sb = journal->j_superblock;
849 unsigned long first, last; 849 unsigned int first, last;
850 850
851 first = be32_to_cpu(sb->s_first); 851 first = be32_to_cpu(sb->s_first);
852 last = be32_to_cpu(sb->s_maxlen); 852 last = be32_to_cpu(sb->s_maxlen);
853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { 853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
854 printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n", 854 printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
855 first, last); 855 first, last);
856 journal_fail_superblock(journal); 856 journal_fail_superblock(journal);
857 return -EINVAL; 857 return -EINVAL;
@@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal)
885 **/ 885 **/
886int journal_create(journal_t *journal) 886int journal_create(journal_t *journal)
887{ 887{
888 unsigned long blocknr; 888 unsigned int blocknr;
889 struct buffer_head *bh; 889 struct buffer_head *bh;
890 journal_superblock_t *sb; 890 journal_superblock_t *sb;
891 int i, err; 891 int i, err;
@@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
969 if (sb->s_start == 0 && journal->j_tail_sequence == 969 if (sb->s_start == 0 && journal->j_tail_sequence ==
970 journal->j_transaction_sequence) { 970 journal->j_transaction_sequence) {
971 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 971 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
972 "(start %ld, seq %d, errno %d)\n", 972 "(start %u, seq %d, errno %d)\n",
973 journal->j_tail, journal->j_tail_sequence, 973 journal->j_tail, journal->j_tail_sequence,
974 journal->j_errno); 974 journal->j_errno);
975 goto out; 975 goto out;
976 } 976 }
977 977
978 spin_lock(&journal->j_state_lock); 978 spin_lock(&journal->j_state_lock);
979 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 979 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
980 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 980 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
981 981
982 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 982 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal)
1371{ 1371{
1372 int err = 0; 1372 int err = 0;
1373 transaction_t *transaction = NULL; 1373 transaction_t *transaction = NULL;
1374 unsigned long old_tail; 1374 unsigned int old_tail;
1375 1375
1376 spin_lock(&journal->j_state_lock); 1376 spin_lock(&journal->j_state_lock);
1377 1377
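
The long run of unsigned long to unsigned int conversions through fs/jbd tracks the on-disk format: JBD block numbers are stored as big-endian 32-bit fields (hence the be32_to_cpu() calls above), so a 64-bit unsigned long never bought extra range, only wasted space and %lu format strings. A sketch of the width match (the byte swap models be32_to_cpu() on a little-endian host; __builtin_bswap32 is a gcc/clang builtin):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t on_disk = 0x0000ff00;  /* a __be32 journal field */
        unsigned int blocknr;           /* now matches the field width */

        blocknr = __builtin_bswap32(on_disk);
        printf("sizeof(unsigned long)=%zu, sizeof(unsigned int)=%zu\n",
               sizeof(unsigned long), sizeof(blocknr));
        printf("block %u\n", blocknr);  /* plain %u, no casts */
        return 0;
}
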
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
70{ 70{
71 int err; 71 int err;
72 unsigned int max, nbufs, next; 72 unsigned int max, nbufs, next;
73 unsigned long blocknr; 73 unsigned int blocknr;
74 struct buffer_head *bh; 74 struct buffer_head *bh;
75 75
76 struct buffer_head * bufs[MAXBUF]; 76 struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset) 132 unsigned int offset)
133{ 133{
134 int err; 134 int err;
135 unsigned long blocknr; 135 unsigned int blocknr;
136 struct buffer_head *bh; 136 struct buffer_head *bh;
137 137
138 *bhp = NULL; 138 *bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
314 struct recovery_info *info, enum passtype pass) 314 struct recovery_info *info, enum passtype pass)
315{ 315{
316 unsigned int first_commit_ID, next_commit_ID; 316 unsigned int first_commit_ID, next_commit_ID;
317 unsigned long next_log_block; 317 unsigned int next_log_block;
318 int err, success = 0; 318 int err, success = 0;
319 journal_superblock_t * sb; 319 journal_superblock_t * sb;
320 journal_header_t * tmp; 320 journal_header_t * tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
367 if (tid_geq(next_commit_ID, info->end_transaction)) 367 if (tid_geq(next_commit_ID, info->end_transaction))
368 break; 368 break;
369 369
370 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", 370 jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
371 next_commit_ID, next_log_block, journal->j_last); 371 next_commit_ID, next_log_block, journal->j_last);
372 372
373 /* Skip over each chunk of the transaction looking 373 /* Skip over each chunk of the transaction looking
374 * either the next descriptor block or the final commit 374 * either the next descriptor block or the final commit
375 * record. */ 375 * record. */
376 376
377 jbd_debug(3, "JBD: checking block %ld\n", next_log_block); 377 jbd_debug(3, "JBD: checking block %u\n", next_log_block);
378 err = jread(&bh, journal, next_log_block); 378 err = jread(&bh, journal, next_log_block);
379 if (err) 379 if (err)
380 goto failed; 380 goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
429 tagp = &bh->b_data[sizeof(journal_header_t)]; 429 tagp = &bh->b_data[sizeof(journal_header_t)];
430 while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) 430 while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
431 <= journal->j_blocksize) { 431 <= journal->j_blocksize) {
432 unsigned long io_block; 432 unsigned int io_block;
433 433
434 tag = (journal_block_tag_t *) tagp; 434 tag = (journal_block_tag_t *) tagp;
435 flags = be32_to_cpu(tag->t_flags); 435 flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
443 success = err; 443 success = err;
444 printk (KERN_ERR 444 printk (KERN_ERR
445 "JBD: IO error %d recovering " 445 "JBD: IO error %d recovering "
446 "block %ld in log\n", 446 "block %u in log\n",
447 err, io_block); 447 err, io_block);
448 } else { 448 } else {
449 unsigned long blocknr; 449 unsigned int blocknr;
450 450
451 J_ASSERT(obh != NULL); 451 J_ASSERT(obh != NULL);
452 blocknr = be32_to_cpu(tag->t_blocknr); 452 blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
581 max = be32_to_cpu(header->r_count); 581 max = be32_to_cpu(header->r_count);
582 582
583 while (offset < max) { 583 while (offset < max) {
584 unsigned long blocknr; 584 unsigned int blocknr;
585 int err; 585 int err;
586 586
587 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); 587 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
101{ 101{
102 struct list_head hash; 102 struct list_head hash;
103 tid_t sequence; /* Used for recovery only */ 103 tid_t sequence; /* Used for recovery only */
104 unsigned long blocknr; 104 unsigned int blocknr;
105}; 105};
106 106
107 107
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
126/* Utility functions to maintain the revoke table */ 126/* Utility functions to maintain the revoke table */
127 127
128/* Borrowed from buffer.c: this is a tried and tested block hash function */ 128/* Borrowed from buffer.c: this is a tried and tested block hash function */
129static inline int hash(journal_t *journal, unsigned long block) 129static inline int hash(journal_t *journal, unsigned int block)
130{ 130{
131 struct jbd_revoke_table_s *table = journal->j_revoke; 131 struct jbd_revoke_table_s *table = journal->j_revoke;
132 int hash_shift = table->hash_shift; 132 int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
136 (block << (hash_shift - 12))) & (table->hash_size - 1); 136 (block << (hash_shift - 12))) & (table->hash_size - 1);
137} 137}
138 138
139static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, 139static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
140 tid_t seq) 140 tid_t seq)
141{ 141{
142 struct list_head *hash_list; 142 struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
166/* Find a revoke record in the journal's hash table. */ 166/* Find a revoke record in the journal's hash table. */
167 167
168static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, 168static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
169 unsigned long blocknr) 169 unsigned int blocknr)
170{ 170{
171 struct list_head *hash_list; 171 struct list_head *hash_list;
172 struct jbd_revoke_record_s *record; 172 struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
332 * by one. 332 * by one.
333 */ 333 */
334 334
335int journal_revoke(handle_t *handle, unsigned long blocknr, 335int journal_revoke(handle_t *handle, unsigned int blocknr,
336 struct buffer_head *bh_in) 336 struct buffer_head *bh_in)
337{ 337{
338 struct buffer_head *bh = NULL; 338 struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
401 } 401 }
402 } 402 }
403 403
404 jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); 404 jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
405 err = insert_revoke_hash(journal, blocknr, 405 err = insert_revoke_hash(journal, blocknr,
406 handle->h_transaction->t_tid); 406 handle->h_transaction->t_tid);
407 BUFFER_TRACE(bh_in, "exit"); 407 BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
644 */ 644 */
645 645
646int journal_set_revoke(journal_t *journal, 646int journal_set_revoke(journal_t *journal,
647 unsigned long blocknr, 647 unsigned int blocknr,
648 tid_t sequence) 648 tid_t sequence)
649{ 649{
650 struct jbd_revoke_record_s *record; 650 struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
668 */ 668 */
669 669
670int journal_test_revoke(journal_t *journal, 670int journal_test_revoke(journal_t *journal,
671 unsigned long blocknr, 671 unsigned int blocknr,
672 tid_t sequence) 672 tid_t sequence)
673{ 673{
674 struct jbd_revoke_record_s *record; 674 struct jbd_revoke_record_s *record;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
56 spin_lock_init(&transaction->t_handle_lock); 56 spin_lock_init(&transaction->t_handle_lock);
57 57
58 /* Set up the commit timer for the new transaction. */ 58 /* Set up the commit timer for the new transaction. */
59 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 59 journal->j_commit_timer.expires =
60 round_jiffies_up(transaction->t_expires);
60 add_timer(&journal->j_commit_timer); 61 add_timer(&journal->j_commit_timer);
61 62
62 J_ASSERT(journal->j_running_transaction == NULL); 63 J_ASSERT(journal->j_running_transaction == NULL);
@@ -228,6 +229,8 @@ repeat_locked:
228 __log_space_left(journal)); 229 __log_space_left(journal));
229 spin_unlock(&transaction->t_handle_lock); 230 spin_unlock(&transaction->t_handle_lock);
230 spin_unlock(&journal->j_state_lock); 231 spin_unlock(&journal->j_state_lock);
232
233 lock_map_acquire(&handle->h_lockdep_map);
231out: 234out:
232 if (unlikely(new_transaction)) /* It's usually NULL */ 235 if (unlikely(new_transaction)) /* It's usually NULL */
233 kfree(new_transaction); 236 kfree(new_transaction);
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
292 handle = ERR_PTR(err); 295 handle = ERR_PTR(err);
293 goto out; 296 goto out;
294 } 297 }
295
296 lock_map_acquire(&handle->h_lockdep_map);
297
298out: 298out:
299 return handle; 299 return handle;
300} 300}
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
416 __log_start_commit(journal, transaction->t_tid); 416 __log_start_commit(journal, transaction->t_tid);
417 spin_unlock(&journal->j_state_lock); 417 spin_unlock(&journal->j_state_lock);
418 418
419 lock_map_release(&handle->h_lockdep_map);
419 handle->h_buffer_credits = nblocks; 420 handle->h_buffer_credits = nblocks;
420 ret = start_this_handle(journal, handle); 421 ret = start_this_handle(journal, handle);
421 return ret; 422 return ret;
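
Moving lock_map_acquire() out of journal_start() and into start_this_handle(), with a matching lock_map_release() added to journal_restart(), means the lockdep annotation now covers every path that starts or restarts a handle, and a restart no longer looks like acquiring the same map recursively; the identical change lands in fs/jbd2/transaction.c below. A toy model of why the release must precede the re-acquire, with a depth counter standing in for lockdep's lock map (purely illustrative):

#include <assert.h>

static int map_depth;                   /* stands in for h_lockdep_map */

static void map_acquire(void)
{
        assert(map_depth == 0);         /* lockdep: no self-recursion */
        map_depth++;
}

static void map_release(void) { map_depth--; }

static void start_handle(void) { map_acquire(); }  /* acquire lives here */

static void restart_handle(void)
{
        map_release();                  /* drop first ... */
        start_handle();                 /* ... so the re-acquire is clean */
}

int main(void)
{
        start_handle();
        restart_handle();       /* would trip the assert without the
                                   release the patch adds */
        map_release();
        return 0;
}
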
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5d70b3e6d49b..ca0f5eb62b20 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -643,6 +643,7 @@ out:
643 643
644int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 644int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
645{ 645{
646 struct transaction_chp_stats_s *stats;
646 transaction_t *transaction; 647 transaction_t *transaction;
647 journal_t *journal; 648 journal_t *journal;
648 int ret = 0; 649 int ret = 0;
@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
679 680
680 /* OK, that was the last buffer for the transaction: we can now 681 /* OK, that was the last buffer for the transaction: we can now
681 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
683 stats = &transaction->t_chp_stats;
684 if (stats->cs_chp_time)
685 stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
686 jiffies);
687 trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
688 transaction->t_tid, stats);
682 689
683 __jbd2_journal_drop_transaction(journal, transaction); 690 __jbd2_journal_drop_transaction(journal, transaction);
684 kfree(transaction); 691 kfree(transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..d4cfd6d2779e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
27#include <linux/bio.h> 27#include <linux/bio.h>
28#include <linux/blkdev.h>
28#include <trace/events/jbd2.h> 29#include <trace/events/jbd2.h>
29 30
30/* 31/*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
133 bh->b_end_io = journal_end_buffer_io_sync; 134 bh->b_end_io = journal_end_buffer_io_sync;
134 135
135 if (journal->j_flags & JBD2_BARRIER && 136 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 137 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
138 set_buffer_ordered(bh); 139 set_buffer_ordered(bh);
139 barrier_done = 1; 140 barrier_done = 1;
140 } 141 }
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
220 .nr_to_write = mapping->nrpages * 2, 221 .nr_to_write = mapping->nrpages * 2,
221 .range_start = 0, 222 .range_start = 0,
222 .range_end = i_size_read(mapping->host), 223 .range_end = i_size_read(mapping->host),
223 .for_writepages = 1,
224 }; 224 };
225 225
226 ret = generic_writepages(mapping, &wbc); 226 ret = generic_writepages(mapping, &wbc);
@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
410 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG; 411 write_op = WRITE_SYNC_PLUG;
412 trace_jbd2_commit_locking(journal, commit_transaction); 412 trace_jbd2_commit_locking(journal, commit_transaction);
413 stats.u.run.rs_wait = commit_transaction->t_max_wait; 413 stats.run.rs_wait = commit_transaction->t_max_wait;
414 stats.u.run.rs_locked = jiffies; 414 stats.run.rs_locked = jiffies;
415 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 415 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
416 stats.u.run.rs_locked); 416 stats.run.rs_locked);
417 417
418 spin_lock(&commit_transaction->t_handle_lock); 418 spin_lock(&commit_transaction->t_handle_lock);
419 while (commit_transaction->t_updates) { 419 while (commit_transaction->t_updates) {
@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
486 jbd2_journal_switch_revoke_table(journal); 486 jbd2_journal_switch_revoke_table(journal);
487 487
488 trace_jbd2_commit_flushing(journal, commit_transaction); 488 trace_jbd2_commit_flushing(journal, commit_transaction);
489 stats.u.run.rs_flushing = jiffies; 489 stats.run.rs_flushing = jiffies;
490 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, 490 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
491 stats.u.run.rs_flushing); 491 stats.run.rs_flushing);
492 492
493 commit_transaction->t_state = T_FLUSH; 493 commit_transaction->t_state = T_FLUSH;
494 journal->j_committing_transaction = commit_transaction; 494 journal->j_committing_transaction = commit_transaction;
@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
523 spin_unlock(&journal->j_state_lock); 523 spin_unlock(&journal->j_state_lock);
524 524
525 trace_jbd2_commit_logging(journal, commit_transaction); 525 trace_jbd2_commit_logging(journal, commit_transaction);
526 stats.u.run.rs_logging = jiffies; 526 stats.run.rs_logging = jiffies;
527 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, 527 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
528 stats.u.run.rs_logging); 528 stats.run.rs_logging);
529 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; 529 stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
530 stats.u.run.rs_blocks_logged = 0; 530 stats.run.rs_blocks_logged = 0;
531 531
532 J_ASSERT(commit_transaction->t_nr_buffers <= 532 J_ASSERT(commit_transaction->t_nr_buffers <=
533 commit_transaction->t_outstanding_credits); 533 commit_transaction->t_outstanding_credits);
@@ -695,7 +695,7 @@ start_journal_io:
695 submit_bh(write_op, bh); 695 submit_bh(write_op, bh);
696 } 696 }
697 cond_resched(); 697 cond_resched();
698 stats.u.run.rs_blocks_logged += bufs; 698 stats.run.rs_blocks_logged += bufs;
699 699
700 /* Force a new descriptor to be generated next 700 /* Force a new descriptor to be generated next
701 time round the loop. */ 701 time round the loop. */
@@ -707,11 +707,13 @@ start_journal_io:
707 /* Done it all: now write the commit record asynchronously. */ 707 /* Done it all: now write the commit record asynchronously. */
708 708
709 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 709 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
711 err = journal_submit_commit_record(journal, commit_transaction, 711 err = journal_submit_commit_record(journal, commit_transaction,
712 &cbh, crc32_sum); 712 &cbh, crc32_sum);
713 if (err) 713 if (err)
714 __jbd2_journal_abort_hard(journal); 714 __jbd2_journal_abort_hard(journal);
715 if (journal->j_flags & JBD2_BARRIER)
716 blkdev_issue_flush(journal->j_dev, NULL);
715 } 717 }
716 718
717 /* 719 /*
@@ -834,7 +836,7 @@ wait_for_iobuf:
834 jbd_debug(3, "JBD: commit phase 5\n"); 836 jbd_debug(3, "JBD: commit phase 5\n");
835 837
836 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 838 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
837 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 839 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
838 err = journal_submit_commit_record(journal, commit_transaction, 840 err = journal_submit_commit_record(journal, commit_transaction,
839 &cbh, crc32_sum); 841 &cbh, crc32_sum);
840 if (err) 842 if (err)
@@ -986,33 +988,30 @@ restart_loop:
986 J_ASSERT(commit_transaction->t_state == T_COMMIT); 988 J_ASSERT(commit_transaction->t_state == T_COMMIT);
987 989
988 commit_transaction->t_start = jiffies; 990 commit_transaction->t_start = jiffies;
989 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, 991 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
990 commit_transaction->t_start); 992 commit_transaction->t_start);
991 993
992 /* 994 /*
993 * File the transaction for history 995 * File the transaction statistics
994 */ 996 */
995 stats.ts_type = JBD2_STATS_RUN;
996 stats.ts_tid = commit_transaction->t_tid; 997 stats.ts_tid = commit_transaction->t_tid;
997 stats.u.run.rs_handle_count = commit_transaction->t_handle_count; 998 stats.run.rs_handle_count = commit_transaction->t_handle_count;
998 spin_lock(&journal->j_history_lock); 999 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
999 memcpy(journal->j_history + journal->j_history_cur, &stats, 1000 commit_transaction->t_tid, &stats.run);
1000 sizeof(stats));
1001 if (++journal->j_history_cur == journal->j_history_max)
1002 journal->j_history_cur = 0;
1003 1001
1004 /* 1002 /*
1005 * Calculate overall stats 1003 * Calculate overall stats
1006 */ 1004 */
1005 spin_lock(&journal->j_history_lock);
1007 journal->j_stats.ts_tid++; 1006 journal->j_stats.ts_tid++;
1008 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; 1007 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1009 journal->j_stats.u.run.rs_running += stats.u.run.rs_running; 1008 journal->j_stats.run.rs_running += stats.run.rs_running;
1010 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; 1009 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1011 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; 1010 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1012 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; 1011 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1013 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; 1012 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1014 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; 1013 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1015 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; 1014 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1016 spin_unlock(&journal->j_history_lock); 1015 spin_unlock(&journal->j_history_lock);
1017 1016
1018 commit_transaction->t_state = T_FINISHED; 1017 commit_transaction->t_state = T_FINISHED;
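
Two things happen in the fs/jbd2/commit.c hunks: the per-journal history ring is gone, so stats.u.run becomes stats.run and the totals are fed to trace_jbd2_run_stats() instead of a memcpy into j_history; and with JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT the commit record is no longer a barrier write, so blkdev_issue_flush() pushes it out of the device's volatile cache before the transaction is treated as durable. The nearest userspace analogue of that flush is forcing a write to stable storage with fdatasync() (standard POSIX calls; the file name is illustrative):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("journal.img", O_WRONLY | O_CREAT, 0600);

        if (fd < 0)
                return 1;
        if (write(fd, "commit", 6) != 6)  /* analogue: commit record */
                return 1;
        fdatasync(fd);  /* analogue of blkdev_issue_flush(): do not
                           proceed until the record is durable */
        close(fd);
        return 0;
}
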
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..b0ab5219becb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
136 journal->j_task = current; 136 journal->j_task = current;
137 wake_up(&journal->j_wait_done_commit); 137 wake_up(&journal->j_wait_done_commit);
138 138
139 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
140 "commit interval %ld seconds\n", current->pid,
141 journal->j_devname, journal->j_commit_interval / HZ);
142
143 /* 139 /*
144 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
145 */ 141 */
@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
223{ 219{
224 struct task_struct *t; 220 struct task_struct *t;
225 221
226 t = kthread_run(kjournald2, journal, "kjournald2"); 222 t = kthread_run(kjournald2, journal, "jbd2/%s",
223 journal->j_devname);
227 if (IS_ERR(t)) 224 if (IS_ERR(t))
228 return PTR_ERR(t); 225 return PTR_ERR(t);
229 226
@@ -679,153 +676,6 @@ struct jbd2_stats_proc_session {
679 int max; 676 int max;
680}; 677};
681 678
682static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
683 struct transaction_stats_s *ts,
684 int first)
685{
686 if (ts == s->stats + s->max)
687 ts = s->stats;
688 if (!first && ts == s->stats + s->start)
689 return NULL;
690 while (ts->ts_type == 0) {
691 ts++;
692 if (ts == s->stats + s->max)
693 ts = s->stats;
694 if (ts == s->stats + s->start)
695 return NULL;
696 }
697 return ts;
698
699}
700
701static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
702{
703 struct jbd2_stats_proc_session *s = seq->private;
704 struct transaction_stats_s *ts;
705 int l = *pos;
706
707 if (l == 0)
708 return SEQ_START_TOKEN;
709 ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
710 if (!ts)
711 return NULL;
712 l--;
713 while (l) {
714 ts = jbd2_history_skip_empty(s, ++ts, 0);
715 if (!ts)
716 break;
717 l--;
718 }
719 return ts;
720}
721
722static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
723{
724 struct jbd2_stats_proc_session *s = seq->private;
725 struct transaction_stats_s *ts = v;
726
727 ++*pos;
728 if (v == SEQ_START_TOKEN)
729 return jbd2_history_skip_empty(s, s->stats + s->start, 1);
730 else
731 return jbd2_history_skip_empty(s, ++ts, 0);
732}
733
734static int jbd2_seq_history_show(struct seq_file *seq, void *v)
735{
736 struct transaction_stats_s *ts = v;
737 if (v == SEQ_START_TOKEN) {
738 seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
739 "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
740 "wait", "run", "lock", "flush", "log", "hndls",
741 "block", "inlog", "ctime", "write", "drop",
742 "close");
743 return 0;
744 }
745 if (ts->ts_type == JBD2_STATS_RUN)
746 seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
747 "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
748 jiffies_to_msecs(ts->u.run.rs_wait),
749 jiffies_to_msecs(ts->u.run.rs_running),
750 jiffies_to_msecs(ts->u.run.rs_locked),
751 jiffies_to_msecs(ts->u.run.rs_flushing),
752 jiffies_to_msecs(ts->u.run.rs_logging),
753 ts->u.run.rs_handle_count,
754 ts->u.run.rs_blocks,
755 ts->u.run.rs_blocks_logged);
756 else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
757 seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
758 "C", ts->ts_tid, " ",
759 jiffies_to_msecs(ts->u.chp.cs_chp_time),
760 ts->u.chp.cs_written, ts->u.chp.cs_dropped,
761 ts->u.chp.cs_forced_to_close);
762 else
763 J_ASSERT(0);
764 return 0;
765}
766
767static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
768{
769}
770
771static struct seq_operations jbd2_seq_history_ops = {
772 .start = jbd2_seq_history_start,
773 .next = jbd2_seq_history_next,
774 .stop = jbd2_seq_history_stop,
775 .show = jbd2_seq_history_show,
776};
777
778static int jbd2_seq_history_open(struct inode *inode, struct file *file)
779{
780 journal_t *journal = PDE(inode)->data;
781 struct jbd2_stats_proc_session *s;
782 int rc, size;
783
784 s = kmalloc(sizeof(*s), GFP_KERNEL);
785 if (s == NULL)
786 return -ENOMEM;
787 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
788 s->stats = kmalloc(size, GFP_KERNEL);
789 if (s->stats == NULL) {
790 kfree(s);
791 return -ENOMEM;
792 }
793 spin_lock(&journal->j_history_lock);
794 memcpy(s->stats, journal->j_history, size);
795 s->max = journal->j_history_max;
796 s->start = journal->j_history_cur % s->max;
797 spin_unlock(&journal->j_history_lock);
798
799 rc = seq_open(file, &jbd2_seq_history_ops);
800 if (rc == 0) {
801 struct seq_file *m = file->private_data;
802 m->private = s;
803 } else {
804 kfree(s->stats);
805 kfree(s);
806 }
807 return rc;
808
809}
810
811static int jbd2_seq_history_release(struct inode *inode, struct file *file)
812{
813 struct seq_file *seq = file->private_data;
814 struct jbd2_stats_proc_session *s = seq->private;
815
816 kfree(s->stats);
817 kfree(s);
818 return seq_release(inode, file);
819}
820
821static struct file_operations jbd2_seq_history_fops = {
822 .owner = THIS_MODULE,
823 .open = jbd2_seq_history_open,
824 .read = seq_read,
825 .llseek = seq_lseek,
826 .release = jbd2_seq_history_release,
827};
828
829static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) 679static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
830{ 680{
831 return *pos ? NULL : SEQ_START_TOKEN; 681 return *pos ? NULL : SEQ_START_TOKEN;
@@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
842 692
843 if (v != SEQ_START_TOKEN) 693 if (v != SEQ_START_TOKEN)
844 return 0; 694 return 0;
845 seq_printf(seq, "%lu transaction, each upto %u blocks\n", 695 seq_printf(seq, "%lu transaction, each up to %u blocks\n",
846 s->stats->ts_tid, 696 s->stats->ts_tid,
847 s->journal->j_max_transaction_buffers); 697 s->journal->j_max_transaction_buffers);
848 if (s->stats->ts_tid == 0) 698 if (s->stats->ts_tid == 0)
849 return 0; 699 return 0;
850 seq_printf(seq, "average: \n %ums waiting for transaction\n", 700 seq_printf(seq, "average: \n %ums waiting for transaction\n",
851 jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); 701 jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
852 seq_printf(seq, " %ums running transaction\n", 702 seq_printf(seq, " %ums running transaction\n",
853 jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); 703 jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
854 seq_printf(seq, " %ums transaction was being locked\n", 704 seq_printf(seq, " %ums transaction was being locked\n",
855 jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); 705 jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
856 seq_printf(seq, " %ums flushing data (in ordered mode)\n", 706 seq_printf(seq, " %ums flushing data (in ordered mode)\n",
857 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 707 jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
858 seq_printf(seq, " %ums logging transaction\n", 708 seq_printf(seq, " %ums logging transaction\n",
859 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 709 jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
860 seq_printf(seq, " %lluus average transaction commit time\n", 710 seq_printf(seq, " %lluus average transaction commit time\n",
861 div_u64(s->journal->j_average_commit_time, 1000)); 711 div_u64(s->journal->j_average_commit_time, 1000));
862 seq_printf(seq, " %lu handles per transaction\n", 712 seq_printf(seq, " %lu handles per transaction\n",
863 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 713 s->stats->run.rs_handle_count / s->stats->ts_tid);
864 seq_printf(seq, " %lu blocks per transaction\n", 714 seq_printf(seq, " %lu blocks per transaction\n",
865 s->stats->u.run.rs_blocks / s->stats->ts_tid); 715 s->stats->run.rs_blocks / s->stats->ts_tid);
866 seq_printf(seq, " %lu logged blocks per transaction\n", 716 seq_printf(seq, " %lu logged blocks per transaction\n",
867 s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); 717 s->stats->run.rs_blocks_logged / s->stats->ts_tid);
868 return 0; 718 return 0;
869} 719}
870 720
@@ -872,7 +722,7 @@ static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
872{ 722{
873} 723}
874 724
875static struct seq_operations jbd2_seq_info_ops = { 725static const struct seq_operations jbd2_seq_info_ops = {
876 .start = jbd2_seq_info_start, 726 .start = jbd2_seq_info_start,
877 .next = jbd2_seq_info_next, 727 .next = jbd2_seq_info_next,
878 .stop = jbd2_seq_info_stop, 728 .stop = jbd2_seq_info_stop,
@@ -920,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
920 return seq_release(inode, file); 770 return seq_release(inode, file);
921} 771}
922 772
923static struct file_operations jbd2_seq_info_fops = { 773static const struct file_operations jbd2_seq_info_fops = {
924 .owner = THIS_MODULE, 774 .owner = THIS_MODULE,
925 .open = jbd2_seq_info_open, 775 .open = jbd2_seq_info_open,
926 .read = seq_read, 776 .read = seq_read,
@@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
934{ 784{
935 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); 785 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
936 if (journal->j_proc_entry) { 786 if (journal->j_proc_entry) {
937 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
938 &jbd2_seq_history_fops, journal);
939 proc_create_data("info", S_IRUGO, journal->j_proc_entry, 787 proc_create_data("info", S_IRUGO, journal->j_proc_entry,
940 &jbd2_seq_info_fops, journal); 788 &jbd2_seq_info_fops, journal);
941 } 789 }
@@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
944static void jbd2_stats_proc_exit(journal_t *journal) 792static void jbd2_stats_proc_exit(journal_t *journal)
945{ 793{
946 remove_proc_entry("info", journal->j_proc_entry); 794 remove_proc_entry("info", journal->j_proc_entry);
947 remove_proc_entry("history", journal->j_proc_entry);
948 remove_proc_entry(journal->j_devname, proc_jbd2_stats); 795 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
949} 796}
950 797
951static void journal_init_stats(journal_t *journal)
952{
953 int size;
954
955 if (!proc_jbd2_stats)
956 return;
957
958 journal->j_history_max = 100;
959 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
960 journal->j_history = kzalloc(size, GFP_KERNEL);
961 if (!journal->j_history) {
962 journal->j_history_max = 0;
963 return;
964 }
965 spin_lock_init(&journal->j_history_lock);
966}
967
968/* 798/*
969 * Management for journal control blocks: functions to create and 799 * Management for journal control blocks: functions to create and
970 * destroy journal_t structures, and to initialise and read existing 800 * destroy journal_t structures, and to initialise and read existing
@@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void)
1009 goto fail; 839 goto fail;
1010 } 840 }
1011 841
1012 journal_init_stats(journal); 842 spin_lock_init(&journal->j_history_lock);
1013 843
1014 return journal; 844 return journal;
1015fail: 845fail:
@@ -1115,7 +945,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1115 while ((p = strchr(p, '/'))) 945 while ((p = strchr(p, '/')))
1116 *p = '!'; 946 *p = '!';
1117 p = journal->j_devname + strlen(journal->j_devname); 947 p = journal->j_devname + strlen(journal->j_devname);
1118 sprintf(p, ":%lu", journal->j_inode->i_ino); 948 sprintf(p, "-%lu", journal->j_inode->i_ino);
1119 jbd_debug(1, 949 jbd_debug(1,
1120 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 950 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1121 journal, inode->i_sb->s_id, inode->i_ino, 951 journal, inode->i_sb->s_id, inode->i_ino,
@@ -1187,6 +1017,12 @@ static int journal_reset(journal_t *journal)
1187 1017
1188 first = be32_to_cpu(sb->s_first); 1018 first = be32_to_cpu(sb->s_first);
1189 last = be32_to_cpu(sb->s_maxlen); 1019 last = be32_to_cpu(sb->s_maxlen);
1020 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1021 printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
1022 first, last);
1023 journal_fail_superblock(journal);
1024 return -EINVAL;
1025 }
1190 1026
1191 journal->j_first = first; 1027 journal->j_first = first;
1192 journal->j_last = last; 1028 journal->j_last = last;
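
The remaining fs/jbd2/journal.c hunks remove the /proc/fs/jbd2/<dev>/history machinery wholesale: the ring buffer, its seq_file iterator and journal_init_stats() all go, since the same numbers now flow through the trace_jbd2_run_stats() and trace_jbd2_checkpoint_stats() tracepoints seen earlier, leaving only the info file and a bare spin_lock_init(). Assuming the standard tracefs layout (the paths and the jbd2/jbd2_run_stats event name are inferred from the tracepoint, so treat them as assumptions), a minimal C consumer would be:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        /* Older kernels expose tracefs under /sys/kernel/debug/tracing;
         * newer ones also mount it at /sys/kernel/tracing. */
        int en = open("/sys/kernel/debug/tracing/events/jbd2/"
                      "jbd2_run_stats/enable", O_WRONLY);
        int tp = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);

        if (en < 0 || tp < 0)
                return 1;
        if (write(en, "1", 1) != 1)     /* switch the event on */
                return 1;
        while ((n = read(tp, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, (size_t)n, stdout);  /* stream records */
        return 0;
}
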
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
57 INIT_LIST_HEAD(&transaction->t_private_list); 57 INIT_LIST_HEAD(&transaction->t_private_list);
58 58
59 /* Set up the commit timer for the new transaction. */ 59 /* Set up the commit timer for the new transaction. */
60 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 60 journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
61 add_timer(&journal->j_commit_timer); 61 add_timer(&journal->j_commit_timer);
62 62
63 J_ASSERT(journal->j_running_transaction == NULL); 63 J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
238 __jbd2_log_space_left(journal)); 238 __jbd2_log_space_left(journal));
239 spin_unlock(&transaction->t_handle_lock); 239 spin_unlock(&transaction->t_handle_lock);
240 spin_unlock(&journal->j_state_lock); 240 spin_unlock(&journal->j_state_lock);
241
242 lock_map_acquire(&handle->h_lockdep_map);
241out: 243out:
242 if (unlikely(new_transaction)) /* It's usually NULL */ 244 if (unlikely(new_transaction)) /* It's usually NULL */
243 kfree(new_transaction); 245 kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
303 handle = ERR_PTR(err); 305 handle = ERR_PTR(err);
304 goto out; 306 goto out;
305 } 307 }
306
307 lock_map_acquire(&handle->h_lockdep_map);
308out: 308out:
309 return handle; 309 return handle;
310} 310}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
426 __jbd2_log_start_commit(journal, transaction->t_tid); 426 __jbd2_log_start_commit(journal, transaction->t_tid);
427 spin_unlock(&journal->j_state_lock); 427 spin_unlock(&journal->j_state_lock);
428 428
429 lock_map_release(&handle->h_lockdep_map);
429 handle->h_buffer_credits = nblocks; 430 handle->h_buffer_credits = nblocks;
430 ret = start_this_handle(journal, handle); 431 ret = start_this_handle(journal, handle);
431 return ret; 432 return ret;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218e..7edb62e97419 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
258 return rc; 258 return rc;
259} 259}
260 260
261static int jffs2_check_acl(struct inode *inode, int mask) 261int jffs2_check_acl(struct inode *inode, int mask)
262{ 262{
263 struct posix_acl *acl; 263 struct posix_acl *acl;
264 int rc; 264 int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
274 return -EAGAIN; 274 return -EAGAIN;
275} 275}
276 276
277int jffs2_permission(struct inode *inode, int mask)
278{
279 return generic_permission(inode, mask, jffs2_check_acl);
280}
281
282int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) 277int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
283{ 278{
284 struct posix_acl *acl, *clone; 279 struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f6..f0ba63e3c36b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
26 26
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29extern int jffs2_permission(struct inode *, int); 29extern int jffs2_check_acl(struct inode *, int);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
39#define jffs2_permission (NULL) 39#define jffs2_check_acl (NULL)
40#define jffs2_acl_chmod(inode) (0) 40#define jffs2_acl_chmod(inode) (0)
41#define jffs2_init_acl_pre(dir_i,inode,mode) (0) 41#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
42#define jffs2_init_acl_post(inode) (0) 42#define jffs2_init_acl_post(inode) (0)
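
[Note] The jffs2 ACL hunks (and the matching jfs hunks further down) are one half of a VFS interface change: instead of each filesystem defining a thin foo_permission() wrapper around generic_permission(), inode_operations gain a .check_acl callback and the VFS supplies the wrapper itself. The fs/namei.c hunks later in this diff add the other half; a paraphrased sketch of the resulting call path, not a verbatim copy of namei.c:

    /* VFS side, roughly what inode_permission() does after this series: */
    static int permission_sketch(struct inode *inode, int mask)
    {
            if (inode->i_op->permission)
                    return inode->i_op->permission(inode, mask);
            /* No ->permission: do the generic POSIX checks, consulting
             * the filesystem's ACLs through the new ->check_acl hook. */
            return generic_permission(inode, mask, inode->i_op->check_acl);
    }
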
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index e9580104b6ba..3ff50da94789 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -15,6 +15,7 @@
15#include <linux/completion.h> 15#include <linux/completion.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/kthread.h>
18#include "nodelist.h" 19#include "nodelist.h"
19 20
20 21
@@ -31,7 +32,7 @@ void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
31/* This must only ever be called when no GC thread is currently running */ 32/* This must only ever be called when no GC thread is currently running */
32int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) 33int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
33{ 34{
34 pid_t pid; 35 struct task_struct *tsk;
35 int ret = 0; 36 int ret = 0;
36 37
37 BUG_ON(c->gc_task); 38 BUG_ON(c->gc_task);
@@ -39,15 +40,16 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
39 init_completion(&c->gc_thread_start); 40 init_completion(&c->gc_thread_start);
40 init_completion(&c->gc_thread_exit); 41 init_completion(&c->gc_thread_exit);
41 42
42 pid = kernel_thread(jffs2_garbage_collect_thread, c, CLONE_FS|CLONE_FILES); 43 tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index);
43 if (pid < 0) { 44 if (IS_ERR(tsk)) {
44 printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %d\n", -pid); 45 printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk));
45 complete(&c->gc_thread_exit); 46 complete(&c->gc_thread_exit);
46 ret = pid; 47 ret = PTR_ERR(tsk);
47 } else { 48 } else {
48 /* Wait for it... */ 49 /* Wait for it... */
49 D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", pid)); 50 D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid));
50 wait_for_completion(&c->gc_thread_start); 51 wait_for_completion(&c->gc_thread_start);
52 ret = tsk->pid;
51 } 53 }
52 54
53 return ret; 55 return ret;
@@ -71,7 +73,6 @@ static int jffs2_garbage_collect_thread(void *_c)
71{ 73{
72 struct jffs2_sb_info *c = _c; 74 struct jffs2_sb_info *c = _c;
73 75
74 daemonize("jffs2_gcd_mtd%d", c->mtd->index);
75 allow_signal(SIGKILL); 76 allow_signal(SIGKILL);
76 allow_signal(SIGSTOP); 77 allow_signal(SIGSTOP);
77 allow_signal(SIGCONT); 78 allow_signal(SIGCONT);
@@ -107,6 +108,11 @@ static int jffs2_garbage_collect_thread(void *_c)
107 * the GC thread get there first. */ 108 * the GC thread get there first. */
108 schedule_timeout_interruptible(msecs_to_jiffies(50)); 109 schedule_timeout_interruptible(msecs_to_jiffies(50));
109 110
111 if (kthread_should_stop()) {
112 D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n"));
113 goto die;
114 }
115
110 /* Put_super will send a SIGKILL and then wait on the sem. 116 /* Put_super will send a SIGKILL and then wait on the sem.
111 */ 117 */
112 while (signal_pending(current) || freezing(current)) { 118 while (signal_pending(current) || freezing(current)) {
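
[Note] The background.c hunks are a standard kernel_thread()+daemonize() to kthread conversion: kthread_run() returns a struct task_struct * (or an ERR_PTR on failure) instead of a raw pid, names the thread itself, and the thread body polls kthread_should_stop() so it can be torn down with kthread_stop(). A generic sketch of the pattern, with hypothetical names (my_ctx is illustrative):

    #include <linux/kthread.h>
    #include <linux/err.h>

    static int worker_fn(void *data)
    {
            while (!kthread_should_stop()) {
                    /* ... do one unit of work, then sleep ... */
            }
            return 0;
    }

    static int start_worker(struct my_ctx *ctx)
    {
            struct task_struct *tsk;

            tsk = kthread_run(worker_fn, ctx, "my_worker/%d", ctx->id);
            if (IS_ERR(tsk))
                    return PTR_ERR(tsk);  /* nothing was created; just report */
            ctx->task = tsk;              /* keep it for kthread_stop() later */
            return 0;
    }
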
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4c..7aa4417e085f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
55 .rmdir = jffs2_rmdir, 55 .rmdir = jffs2_rmdir,
56 .mknod = jffs2_mknod, 56 .mknod = jffs2_mknod,
57 .rename = jffs2_rename, 57 .rename = jffs2_rename,
58 .permission = jffs2_permission, 58 .check_acl = jffs2_check_acl,
59 .setattr = jffs2_setattr, 59 .setattr = jffs2_setattr,
60 .setxattr = jffs2_setxattr, 60 .setxattr = jffs2_setxattr,
61 .getxattr = jffs2_getxattr, 61 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 23c947539864..b7b74e299142 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
56 56
57const struct inode_operations jffs2_file_inode_operations = 57const struct inode_operations jffs2_file_inode_operations =
58{ 58{
59 .permission = jffs2_permission, 59 .check_acl = jffs2_check_acl,
60 .setattr = jffs2_setattr, 60 .setattr = jffs2_setattr,
61 .setxattr = jffs2_setxattr, 61 .setxattr = jffs2_setxattr,
62 .getxattr = jffs2_getxattr, 62 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 9eff2bdae8a7..c082868910f2 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -39,13 +39,13 @@ int __init jffs2_create_slab_caches(void)
39 39
40 raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent", 40 raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent",
41 sizeof(struct jffs2_raw_dirent), 41 sizeof(struct jffs2_raw_dirent),
42 0, 0, NULL); 42 0, SLAB_HWCACHE_ALIGN, NULL);
43 if (!raw_dirent_slab) 43 if (!raw_dirent_slab)
44 goto err; 44 goto err;
45 45
46 raw_inode_slab = kmem_cache_create("jffs2_raw_inode", 46 raw_inode_slab = kmem_cache_create("jffs2_raw_inode",
47 sizeof(struct jffs2_raw_inode), 47 sizeof(struct jffs2_raw_inode),
48 0, 0, NULL); 48 0, SLAB_HWCACHE_ALIGN, NULL);
49 if (!raw_inode_slab) 49 if (!raw_inode_slab)
50 goto err; 50 goto err;
51 51
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0035c021395a..9a80e8e595d0 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -123,7 +123,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
123 return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); 123 return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
124} 124}
125 125
126static struct export_operations jffs2_export_ops = { 126static const struct export_operations jffs2_export_ops = {
127 .get_parent = jffs2_get_parent, 127 .get_parent = jffs2_get_parent,
128 .fh_to_dentry = jffs2_fh_to_dentry, 128 .fh_to_dentry = jffs2_fh_to_dentry,
129 .fh_to_parent = jffs2_fh_to_parent, 129 .fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad9..4ec11e8bda8c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
21{ 21{
22 .readlink = generic_readlink, 22 .readlink = generic_readlink,
23 .follow_link = jffs2_follow_link, 23 .follow_link = jffs2_follow_link,
24 .permission = jffs2_permission, 24 .check_acl = jffs2_check_acl,
25 .setattr = jffs2_setattr, 25 .setattr = jffs2_setattr,
26 .setxattr = jffs2_setxattr, 26 .setxattr = jffs2_setxattr,
27 .getxattr = jffs2_getxattr, 27 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index d9a721e6db70..5ef7bac265e5 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1268 if (!c->wbuf) 1268 if (!c->wbuf)
1269 return -ENOMEM; 1269 return -ENOMEM;
1270 1270
1271#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
1272 c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
1273 if (!c->wbuf_verify) {
1274 kfree(c->wbuf);
1275 return -ENOMEM;
1276 }
1277#endif
1271 return 0; 1278 return 0;
1272} 1279}
1273 1280
1274void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { 1281void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
1282#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
1283 kfree(c->wbuf_verify);
1284#endif
1275 kfree(c->wbuf); 1285 kfree(c->wbuf);
1276} 1286}
1277 1287
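
[Note] The wbuf.c hunks pair an optional verification buffer with the write buffer, and the setup follows the usual unwind rule: a later allocation failure frees whatever an earlier step already allocated, and the cleanup path mirrors the setup under the same #ifdef. In sketch form (illustrative names):

    a = kmalloc(len, GFP_KERNEL);
    if (!a)
            return -ENOMEM;
    b = kmalloc(len, GFP_KERNEL);
    if (!b) {
            kfree(a);       /* undo the step that already succeeded */
            return -ENOMEM;
    }
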
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a29c7c3e3fb8..d66477c34306 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,7 +114,7 @@ out:
114 return rc; 114 return rc;
115} 115}
116 116
117static int jfs_check_acl(struct inode *inode, int mask) 117int jfs_check_acl(struct inode *inode, int mask)
118{ 118{
119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); 119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
120 120
@@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
129 return -EAGAIN; 129 return -EAGAIN;
130} 130}
131 131
132int jfs_permission(struct inode *inode, int mask)
133{
134 return generic_permission(inode, mask, jfs_check_acl);
135}
136
137int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) 132int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
138{ 133{
139 struct posix_acl *acl = NULL; 134 struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3b..2b70fa78e4a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
96 .removexattr = jfs_removexattr, 96 .removexattr = jfs_removexattr,
97#ifdef CONFIG_JFS_POSIX_ACL 97#ifdef CONFIG_JFS_POSIX_ACL
98 .setattr = jfs_setattr, 98 .setattr = jfs_setattr,
99 .permission = jfs_permission, 99 .check_acl = jfs_check_acl,
100#endif 100#endif
101}; 101};
102 102
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a389..b07bd417ef85 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
20 20
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23int jfs_permission(struct inode *, int); 23int jfs_check_acl(struct inode *, int);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_setattr(struct dentry *, struct iattr *); 25int jfs_setattr(struct dentry *, struct iattr *);
26 26
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92a..c79a4270f083 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
1543 .removexattr = jfs_removexattr, 1543 .removexattr = jfs_removexattr,
1544#ifdef CONFIG_JFS_POSIX_ACL 1544#ifdef CONFIG_JFS_POSIX_ACL
1545 .setattr = jfs_setattr, 1545 .setattr = jfs_setattr,
1546 .permission = jfs_permission, 1546 .check_acl = jfs_check_acl,
1547#endif 1547#endif
1548}; 1548};
1549 1549
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 37e6dcda8fc8..2234c73fc577 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -178,13 +178,11 @@ static void jfs_put_super(struct super_block *sb)
178 rc = jfs_umount(sb); 178 rc = jfs_umount(sb);
179 if (rc) 179 if (rc)
180 jfs_err("jfs_umount failed with return code %d", rc); 180 jfs_err("jfs_umount failed with return code %d", rc);
181 if (sbi->nls_tab) 181
182 unload_nls(sbi->nls_tab); 182 unload_nls(sbi->nls_tab);
183 sbi->nls_tab = NULL;
184 183
185 truncate_inode_pages(sbi->direct_inode->i_mapping, 0); 184 truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
186 iput(sbi->direct_inode); 185 iput(sbi->direct_inode);
187 sbi->direct_inode = NULL;
188 186
189 kfree(sbi); 187 kfree(sbi);
190 188
@@ -347,8 +345,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
347 345
348 if (nls_map != (void *) -1) { 346 if (nls_map != (void *) -1) {
349 /* Discard old (if remount) */ 347 /* Discard old (if remount) */
350 if (sbi->nls_tab) 348 unload_nls(sbi->nls_tab);
351 unload_nls(sbi->nls_tab);
352 sbi->nls_tab = nls_map; 349 sbi->nls_tab = nls_map;
353 } 350 }
354 return 1; 351 return 1;
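
[Note] The jfs_put_super() and parse_options() hunks (and the ncpfs hunks later in this diff) all rely on the same NLS core change this merge pulls in: unload_nls() is now a no-op when passed NULL, so the `if (tab) unload_nls(tab);` guards can go. Sketch of the resulting idiom (my_sb_info is illustrative):

    #include <linux/nls.h>

    static void drop_nls_table(struct my_sb_info *sbi)
    {
            unload_nls(sbi->nls_tab);   /* safe even when nls_tab is NULL */
            sbi->nls_tab = NULL;        /* only needed if the field may be reused */
    }
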
diff --git a/fs/libfs.c b/fs/libfs.c
index dcec3d3ea64f..219576c52d80 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -527,14 +527,18 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
527 const void *from, size_t available) 527 const void *from, size_t available)
528{ 528{
529 loff_t pos = *ppos; 529 loff_t pos = *ppos;
530 size_t ret;
531
530 if (pos < 0) 532 if (pos < 0)
531 return -EINVAL; 533 return -EINVAL;
532 if (pos >= available) 534 if (pos >= available || !count)
533 return 0; 535 return 0;
534 if (count > available - pos) 536 if (count > available - pos)
535 count = available - pos; 537 count = available - pos;
536 if (copy_to_user(to, from + pos, count)) 538 ret = copy_to_user(to, from + pos, count);
539 if (ret == count)
537 return -EFAULT; 540 return -EFAULT;
541 count -= ret;
538 *ppos = pos + count; 542 *ppos = pos + count;
539 return count; 543 return count;
540} 544}
@@ -735,10 +739,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
735 if (copy_from_user(attr->set_buf, buf, size)) 739 if (copy_from_user(attr->set_buf, buf, size))
736 goto out; 740 goto out;
737 741
738 ret = len; /* claim we got the whole input */
739 attr->set_buf[size] = '\0'; 742 attr->set_buf[size] = '\0';
740 val = simple_strtol(attr->set_buf, NULL, 0); 743 val = simple_strtol(attr->set_buf, NULL, 0);
741 attr->set(attr->data, val); 744 ret = attr->set(attr->data, val);
745 if (ret == 0)
746 ret = len; /* on success, claim we got the whole input */
742out: 747out:
743 mutex_unlock(&attr->mutex); 748 mutex_unlock(&attr->mutex);
744 return ret; 749 return ret;
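
[Note] Two behavioural fixes in libfs.c: simple_read_from_buffer() now returns a short count when copy_to_user() copies only part of the range (returning -EFAULT only when nothing at all was copied, and treating a zero count as nothing to do), and simple_attr_write() propagates the ->set() callback's return value instead of unconditionally claiming the whole input was consumed. Callers need no change; for reference, a typical user of the first helper looks like this (hypothetical file, real API):

    static ssize_t my_read(struct file *file, char __user *buf,
                           size_t count, loff_t *ppos)
    {
            static const char msg[] = "hello\n";

            /* Copies at most count bytes of msg starting at *ppos and
             * advances *ppos by however much actually reached userspace. */
            return simple_read_from_buffer(buf, count, ppos, msg, sizeof(msg) - 1);
    }
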
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d351..fc9032dc8862 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
166 */ 166 */
167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
168 continue; 168 continue;
169 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) 169 if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
170 continue; 170 continue;
171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
172 continue; 172 continue;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4336adba952a..c81249fef11f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl)
458 nlm_put_lockowner(fl->fl_u.nfs_fl.owner); 458 nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
459} 459}
460 460
461static struct file_lock_operations nlmclnt_lock_ops = { 461static const struct file_lock_operations nlmclnt_lock_ops = {
462 .fl_copy_lock = nlmclnt_locks_copy_lock, 462 .fl_copy_lock = nlmclnt_locks_copy_lock,
463 .fl_release_private = nlmclnt_locks_release_private, 463 .fl_release_private = nlmclnt_locks_release_private,
464}; 464};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 99d737bd4325..4600c2037b8b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
87 return hash & (NLM_HOST_NRHASH - 1); 87 return hash & (NLM_HOST_NRHASH - 1);
88} 88}
89 89
90static void nlm_clear_port(struct sockaddr *sap)
91{
92 switch (sap->sa_family) {
93 case AF_INET:
94 ((struct sockaddr_in *)sap)->sin_port = 0;
95 break;
96 case AF_INET6:
97 ((struct sockaddr_in6 *)sap)->sin6_port = 0;
98 break;
99 }
100}
101
102/* 90/*
103 * Common host lookup routine for server & client 91 * Common host lookup routine for server & client
104 */ 92 */
@@ -123,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
123 */ 111 */
124 chain = &nlm_hosts[nlm_hash_address(ni->sap)]; 112 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
125 hlist_for_each_entry(host, pos, chain, h_hash) { 113 hlist_for_each_entry(host, pos, chain, h_hash) {
126 if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) 114 if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
127 continue; 115 continue;
128 116
129 /* See if we have an NSM handle for this client */ 117 /* See if we have an NSM handle for this client */
@@ -137,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
137 if (host->h_server != ni->server) 125 if (host->h_server != ni->server)
138 continue; 126 continue;
139 if (ni->server && 127 if (ni->server &&
140 !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) 128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
141 continue; 129 continue;
142 130
143 /* Move to head of hash chain. */ 131 /* Move to head of hash chain. */
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
177 host->h_addrbuf = nsm->sm_addrbuf; 165 host->h_addrbuf = nsm->sm_addrbuf;
178 memcpy(nlm_addr(host), ni->sap, ni->salen); 166 memcpy(nlm_addr(host), ni->sap, ni->salen);
179 host->h_addrlen = ni->salen; 167 host->h_addrlen = ni->salen;
180 nlm_clear_port(nlm_addr(host)); 168 rpc_set_port(nlm_addr(host), 0);
181 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
182 host->h_version = ni->version; 170 host->h_version = ni->version;
183 host->h_proto = ni->protocol; 171 host->h_proto = ni->protocol;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7fce1b525849..f956651d0f65 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
61 return (struct sockaddr *)&nsm->sm_addr; 61 return (struct sockaddr *)&nsm->sm_addr;
62} 62}
63 63
64static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
65 const size_t len)
66{
67 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
68 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
69}
70
71static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
72 const size_t len)
73{
74 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
75
76 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
77 snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
78 else if (sin6->sin6_scope_id != 0)
79 snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
80 sin6->sin6_scope_id);
81 else
82 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
83}
84
85static void nsm_display_address(const struct sockaddr *sap,
86 char *buf, const size_t len)
87{
88 switch (sap->sa_family) {
89 case AF_INET:
90 nsm_display_ipv4_address(sap, buf, len);
91 break;
92 case AF_INET6:
93 nsm_display_ipv6_address(sap, buf, len);
94 break;
95 default:
96 snprintf(buf, len, "unsupported address family");
97 break;
98 }
99}
100
101static struct rpc_clnt *nsm_create(void) 64static struct rpc_clnt *nsm_create(void)
102{ 65{
103 struct sockaddr_in sin = { 66 struct sockaddr_in sin = {
@@ -246,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
246 struct nsm_handle *nsm; 209 struct nsm_handle *nsm;
247 210
248 list_for_each_entry(nsm, &nsm_handles, sm_link) 211 list_for_each_entry(nsm, &nsm_handles, sm_link)
249 if (nlm_cmp_addr(nsm_addr(nsm), sap)) 212 if (rpc_cmp_addr(nsm_addr(nsm), sap))
250 return nsm; 213 return nsm;
251 return NULL; 214 return NULL;
252} 215}
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
307 memcpy(nsm_addr(new), sap, salen); 270 memcpy(nsm_addr(new), sap, salen);
308 new->sm_addrlen = salen; 271 new->sm_addrlen = salen;
309 nsm_init_private(new); 272 nsm_init_private(new);
310 nsm_display_address((const struct sockaddr *)&new->sm_addr, 273
311 new->sm_addrbuf, sizeof(new->sm_addrbuf)); 274 if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
275 sizeof(new->sm_addrbuf)) == 0)
276 (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
277 "unsupported address family");
312 memcpy(new->sm_name, hostname, hostname_len); 278 memcpy(new->sm_name, hostname, hostname_len);
313 new->sm_name[hostname_len] = '\0'; 279 new->sm_name[hostname_len] = '\0';
314 280
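
[Note] The lockd changes are a wholesale switch from private helpers to the shared sunrpc address utilities: nlm_cmp_addr() becomes rpc_cmp_addr(), nlm_clear_port() becomes rpc_set_port(sap, 0), and the hand-rolled nsm_display_*() printers collapse into rpc_ntop(), which returns the number of characters written and 0 on failure (e.g. an unsupported address family). Sketch of the replacement idiom (buffer size illustrative):

    #include <linux/sunrpc/clnt.h>  /* rpc_ntop(), rpc_cmp_addr(), rpc_set_port() */

    char buf[64];

    if (rpc_ntop(sap, buf, sizeof(buf)) == 0)
            snprintf(buf, sizeof(buf), "unsupported address family");
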
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e577a78d7bac..d1001790fa9a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
705 return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; 705 return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
706} 706}
707 707
708struct lock_manager_operations nlmsvc_lock_operations = { 708const struct lock_manager_operations nlmsvc_lock_operations = {
709 .fl_compare_owner = nlmsvc_same_owner, 709 .fl_compare_owner = nlmsvc_same_owner,
710 .fl_notify = nlmsvc_notify_blocked, 710 .fl_notify = nlmsvc_notify_blocked,
711 .fl_grant = nlmsvc_grant_deferred, 711 .fl_grant = nlmsvc_grant_deferred,
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 9e4d6aab611b..ad478da7ca63 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
417static int 417static int
418nlmsvc_match_ip(void *datap, struct nlm_host *host) 418nlmsvc_match_ip(void *datap, struct nlm_host *host)
419{ 419{
420 return nlm_cmp_addr(nlm_srcaddr(host), datap); 420 return rpc_cmp_addr(nlm_srcaddr(host), datap);
421} 421}
422 422
423/** 423/**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 0336f2beacde..b583ab0a4cbb 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/utsname.h>
12#include <linux/nfs.h> 11#include <linux/nfs.h>
13 12
14#include <linux/sunrpc/xdr.h> 13#include <linux/sunrpc/xdr.h>
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index e1d528653192..ad9dbbc9145d 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/utsname.h>
13#include <linux/nfs.h> 12#include <linux/nfs.h>
14 13
15#include <linux/sunrpc/xdr.h> 14#include <linux/sunrpc/xdr.h>
diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..a8794f233bc9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
434 return fl->fl_file == try->fl_file; 434 return fl->fl_file == try->fl_file;
435} 435}
436 436
437static struct lock_manager_operations lease_manager_ops = { 437static const struct lock_manager_operations lease_manager_ops = {
438 .fl_break = lease_break_callback, 438 .fl_break = lease_break_callback,
439 .fl_release_private = lease_release_private_callback, 439 .fl_release_private = lease_release_private_callback,
440 .fl_mylease = lease_mylease_callback, 440 .fl_mylease = lease_mylease_callback,
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
768 * give it the opportunity to lock the file. 768 * give it the opportunity to lock the file.
769 */ 769 */
770 if (found) 770 if (found)
771 cond_resched_bkl(); 771 cond_resched();
772 772
773find_conflict: 773find_conflict:
774 for_each_lock(inode, before) { 774 for_each_lock(inode, before) {
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1591 if (can_sleep) 1591 if (can_sleep)
1592 lock->fl_flags |= FL_SLEEP; 1592 lock->fl_flags |= FL_SLEEP;
1593 1593
1594 error = security_file_lock(filp, cmd); 1594 error = security_file_lock(filp, lock->fl_type);
1595 if (error) 1595 if (error)
1596 goto out_free; 1596 goto out_free;
1597 1597
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d407e7a0b6fe..6198731d7fcd 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -308,14 +308,18 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
308 struct inode *inode = (struct inode*)mapping->host; 308 struct inode *inode = (struct inode*)mapping->host;
309 char *kaddr = page_address(page); 309 char *kaddr = page_address(page);
310 loff_t pos = page_offset(page) + (char*)de - kaddr; 310 loff_t pos = page_offset(page) + (char*)de - kaddr;
311 unsigned len = minix_sb(inode->i_sb)->s_dirsize; 311 struct minix_sb_info *sbi = minix_sb(inode->i_sb);
312 unsigned len = sbi->s_dirsize;
312 int err; 313 int err;
313 314
314 lock_page(page); 315 lock_page(page);
315 err = __minix_write_begin(NULL, mapping, pos, len, 316 err = __minix_write_begin(NULL, mapping, pos, len,
316 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 317 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
317 if (err == 0) { 318 if (err == 0) {
318 de->inode = 0; 319 if (sbi->s_version == MINIX_V3)
320 ((minix3_dirent *) de)->inode = 0;
321 else
322 de->inode = 0;
319 err = dir_commit_chunk(page, pos, len); 323 err = dir_commit_chunk(page, pos, len);
320 } else { 324 } else {
321 unlock_page(page); 325 unlock_page(page);
@@ -440,7 +444,10 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
440 err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize, 444 err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize,
441 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 445 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
442 if (err == 0) { 446 if (err == 0) {
443 de->inode = inode->i_ino; 447 if (sbi->s_version == MINIX_V3)
448 ((minix3_dirent *) de)->inode = inode->i_ino;
449 else
450 de->inode = inode->i_ino;
444 err = dir_commit_chunk(page, pos, sbi->s_dirsize); 451 err = dir_commit_chunk(page, pos, sbi->s_dirsize);
445 } else { 452 } else {
446 unlock_page(page); 453 unlock_page(page);
@@ -470,7 +477,14 @@ ino_t minix_inode_by_name(struct dentry *dentry)
470 ino_t res = 0; 477 ino_t res = 0;
471 478
472 if (de) { 479 if (de) {
473 res = de->inode; 480 struct address_space *mapping = page->mapping;
481 struct inode *inode = mapping->host;
482 struct minix_sb_info *sbi = minix_sb(inode->i_sb);
483
484 if (sbi->s_version == MINIX_V3)
485 res = ((minix3_dirent *) de)->inode;
486 else
487 res = de->inode;
474 dir_put_page(page); 488 dir_put_page(page);
475 } 489 }
476 return res; 490 return res;
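
[Note] The minix hunks all fix the same latent bug: on a V3 filesystem the directory entry's inode field is 32 bits wide, so reading or writing it through struct minix_dir_entry silently truncates. As I recall the driver's definitions, the two on-disk layouts differ only in that field, which is why a cast at each access site is enough:

    struct minix_dir_entry {    /* V1/V2 on-disk entry */
            __u16 inode;
            char name[0];
    };

    struct minix3_dir_entry {   /* V3 on-disk entry: wider inode numbers */
            __u32 inode;
            char name[0];
    };

    /* hence: sbi->s_version == MINIX_V3 ?
     *        ((minix3_dirent *)de)->inode : de->inode */
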
diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..d11f404667e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
169EXPORT_SYMBOL(putname); 169EXPORT_SYMBOL(putname);
170#endif 170#endif
171 171
172 172/*
173/** 173 * This does basic POSIX ACL permission checking
174 * generic_permission - check for access rights on a Posix-like filesystem
175 * @inode: inode to check access rights for
176 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
177 * @check_acl: optional callback to check for Posix ACLs
178 *
179 * Used to check for read/write/execute permissions on a file.
180 * We use "fsuid" for this, letting us set arbitrary permissions
181 * for filesystem access without changing the "normal" uids which
182 * are used for other things..
183 */ 174 */
184int generic_permission(struct inode *inode, int mask, 175static int acl_permission_check(struct inode *inode, int mask,
185 int (*check_acl)(struct inode *inode, int mask)) 176 int (*check_acl)(struct inode *inode, int mask))
186{ 177{
187 umode_t mode = inode->i_mode; 178 umode_t mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
193 else { 184 else {
194 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 185 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
195 int error = check_acl(inode, mask); 186 int error = check_acl(inode, mask);
196 if (error == -EACCES) 187 if (error != -EAGAIN)
197 goto check_capabilities;
198 else if (error != -EAGAIN)
199 return error; 188 return error;
200 } 189 }
201 190
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
208 */ 197 */
209 if ((mask & ~mode) == 0) 198 if ((mask & ~mode) == 0)
210 return 0; 199 return 0;
200 return -EACCES;
201}
202
203/**
204 * generic_permission - check for access rights on a Posix-like filesystem
205 * @inode: inode to check access rights for
206 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
207 * @check_acl: optional callback to check for Posix ACLs
208 *
209 * Used to check for read/write/execute permissions on a file.
210 * We use "fsuid" for this, letting us set arbitrary permissions
211 * for filesystem access without changing the "normal" uids which
212 * are used for other things..
213 */
214int generic_permission(struct inode *inode, int mask,
215 int (*check_acl)(struct inode *inode, int mask))
216{
217 int ret;
218
219 /*
220 * Do the basic POSIX ACL permission checks.
221 */
222 ret = acl_permission_check(inode, mask, check_acl);
223 if (ret != -EACCES)
224 return ret;
211 225
212 check_capabilities:
213 /* 226 /*
214 * Read/write DACs are always overridable. 227 * Read/write DACs are always overridable.
215 * Executable DACs are overridable if at least one exec bit is set. 228 * Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
262 if (inode->i_op->permission) 275 if (inode->i_op->permission)
263 retval = inode->i_op->permission(inode, mask); 276 retval = inode->i_op->permission(inode, mask);
264 else 277 else
265 retval = generic_permission(inode, mask, NULL); 278 retval = generic_permission(inode, mask, inode->i_op->check_acl);
266 279
267 if (retval) 280 if (retval)
268 return retval; 281 return retval;
@@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
432 */ 445 */
433static int exec_permission_lite(struct inode *inode) 446static int exec_permission_lite(struct inode *inode)
434{ 447{
435 umode_t mode = inode->i_mode; 448 int ret;
436 449
437 if (inode->i_op->permission) 450 if (inode->i_op->permission) {
438 return -EAGAIN; 451 ret = inode->i_op->permission(inode, MAY_EXEC);
439 452 if (!ret)
440 if (current_fsuid() == inode->i_uid) 453 goto ok;
441 mode >>= 6; 454 return ret;
442 else if (in_group_p(inode->i_gid)) 455 }
443 mode >>= 3; 456 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
444 457 if (!ret)
445 if (mode & MAY_EXEC)
446 goto ok;
447
448 if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
449 goto ok;
450
451 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
452 goto ok; 458 goto ok;
453 459
454 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) 460 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
455 goto ok; 461 goto ok;
456 462
457 return -EACCES; 463 return ret;
458ok: 464ok:
459 return security_inode_permission(inode, MAY_EXEC); 465 return security_inode_permission(inode, MAY_EXEC);
460} 466}
@@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
853 859
854 nd->flags |= LOOKUP_CONTINUE; 860 nd->flags |= LOOKUP_CONTINUE;
855 err = exec_permission_lite(inode); 861 err = exec_permission_lite(inode);
856 if (err == -EAGAIN)
857 err = inode_permission(nd->path.dentry->d_inode,
858 MAY_EXEC);
859 if (!err)
860 err = ima_path_check(&nd->path, MAY_EXEC,
861 IMA_COUNT_UPDATE);
862 if (err) 862 if (err)
863 break; 863 break;
864 864
@@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag)
1533 if (error) 1533 if (error)
1534 return error; 1534 return error;
1535 1535
1536 error = ima_path_check(path, 1536 error = ima_path_check(path, acc_mode ?
1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), 1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1538 ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
1538 IMA_COUNT_UPDATE); 1539 IMA_COUNT_UPDATE);
1540
1539 if (error) 1541 if (error)
1540 return error; 1542 return error;
1541 /* 1543 /*
1542 * An append-only file must be opened in append mode for writing. 1544 * An append-only file must be opened in append mode for writing.
1543 */ 1545 */
1544 if (IS_APPEND(inode)) { 1546 if (IS_APPEND(inode)) {
1547 error = -EPERM;
1545 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1548 if ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1546 return -EPERM; 1549 goto err_out;
1547 if (flag & O_TRUNC) 1550 if (flag & O_TRUNC)
1548 return -EPERM; 1551 goto err_out;
1549 } 1552 }
1550 1553
1551 /* O_NOATIME can only be set by the owner or superuser */ 1554 /* O_NOATIME can only be set by the owner or superuser */
1552 if (flag & O_NOATIME) 1555 if (flag & O_NOATIME)
1553 if (!is_owner_or_cap(inode)) 1556 if (!is_owner_or_cap(inode)) {
1554 return -EPERM; 1557 error = -EPERM;
1558 goto err_out;
1559 }
1555 1560
1556 /* 1561 /*
1557 * Ensure there are no outstanding leases on the file. 1562 * Ensure there are no outstanding leases on the file.
1558 */ 1563 */
1559 error = break_lease(inode, flag); 1564 error = break_lease(inode, flag);
1560 if (error) 1565 if (error)
1561 return error; 1566 goto err_out;
1562 1567
1563 if (flag & O_TRUNC) { 1568 if (flag & O_TRUNC) {
1564 error = get_write_access(inode); 1569 error = get_write_access(inode);
1565 if (error) 1570 if (error)
1566 return error; 1571 goto err_out;
1567 1572
1568 /* 1573 /*
1569 * Refuse to truncate files with mandatory locks held on them. 1574 * Refuse to truncate files with mandatory locks held on them.
@@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag)
1581 } 1586 }
1582 put_write_access(inode); 1587 put_write_access(inode);
1583 if (error) 1588 if (error)
1584 return error; 1589 goto err_out;
1585 } else 1590 } else
1586 if (flag & FMODE_WRITE) 1591 if (flag & FMODE_WRITE)
1587 vfs_dq_init(inode); 1592 vfs_dq_init(inode);
1588 1593
1589 return 0; 1594 return 0;
1595err_out:
1596 ima_counts_put(path, acc_mode ?
1597 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1598 ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
1599 return error;
1590} 1600}
1591 1601
1592/* 1602/*
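
[Note] The may_open() rewrite funnels every failure after the ima_path_check() call through a single err_out label so the IMA open counters stay balanced: a successful ima_path_check(..., IMA_COUNT_UPDATE) bumps them, and ima_counts_put() with the same mask drops them again when the open is subsequently refused. The shape of the contract, in sketch form:

    mask = acc_mode ? acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)
                    : ACC_MODE(flag) & (MAY_READ | MAY_WRITE);

    error = ima_path_check(path, mask, IMA_COUNT_UPDATE);  /* counters++ */
    if (error)
            return error;       /* nothing was counted; plain return is fine */

    /* ... append-mode, O_NOATIME, lease and truncate checks ... */
    if (error)
            goto err_out;

    return 0;

    err_out:
            ima_counts_put(path, mask);                    /* counters-- */
            return error;
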
diff --git a/fs/namespace.c b/fs/namespace.c
index 7230787d18b0..bdc3cb4fd222 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,7 +1640,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1640{ 1640{
1641 struct vfsmount *mnt; 1641 struct vfsmount *mnt;
1642 1642
1643 if (!type || !memchr(type, 0, PAGE_SIZE)) 1643 if (!type)
1644 return -EINVAL; 1644 return -EINVAL;
1645 1645
1646 /* we need capabilities... */ 1646 /* we need capabilities... */
@@ -1871,6 +1871,23 @@ int copy_mount_options(const void __user * data, unsigned long *where)
1871 return 0; 1871 return 0;
1872} 1872}
1873 1873
1874int copy_mount_string(const void __user *data, char **where)
1875{
1876 char *tmp;
1877
1878 if (!data) {
1879 *where = NULL;
1880 return 0;
1881 }
1882
1883 tmp = strndup_user(data, PAGE_SIZE);
1884 if (IS_ERR(tmp))
1885 return PTR_ERR(tmp);
1886
1887 *where = tmp;
1888 return 0;
1889}
1890
1874/* 1891/*
1875 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to 1892 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
1876 * be given to the mount() call (ie: read-only, no-dev, no-suid etc). 1893 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1900,8 +1917,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1900 1917
1901 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) 1918 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
1902 return -EINVAL; 1919 return -EINVAL;
1903 if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
1904 return -EINVAL;
1905 1920
1906 if (data_page) 1921 if (data_page)
1907 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1922 ((char *)data_page)[PAGE_SIZE - 1] = 0;
@@ -2070,40 +2085,42 @@ EXPORT_SYMBOL(create_mnt_ns);
2070SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2085SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2071 char __user *, type, unsigned long, flags, void __user *, data) 2086 char __user *, type, unsigned long, flags, void __user *, data)
2072{ 2087{
2073 int retval; 2088 int ret;
2089 char *kernel_type;
2090 char *kernel_dir;
2091 char *kernel_dev;
2074 unsigned long data_page; 2092 unsigned long data_page;
2075 unsigned long type_page;
2076 unsigned long dev_page;
2077 char *dir_page;
2078 2093
2079 retval = copy_mount_options(type, &type_page); 2094 ret = copy_mount_string(type, &kernel_type);
2080 if (retval < 0) 2095 if (ret < 0)
2081 return retval; 2096 goto out_type;
2082 2097
2083 dir_page = getname(dir_name); 2098 kernel_dir = getname(dir_name);
2084 retval = PTR_ERR(dir_page); 2099 if (IS_ERR(kernel_dir)) {
2085 if (IS_ERR(dir_page)) 2100 ret = PTR_ERR(kernel_dir);
2086 goto out1; 2101 goto out_dir;
2102 }
2087 2103
2088 retval = copy_mount_options(dev_name, &dev_page); 2104 ret = copy_mount_string(dev_name, &kernel_dev);
2089 if (retval < 0) 2105 if (ret < 0)
2090 goto out2; 2106 goto out_dev;
2091 2107
2092 retval = copy_mount_options(data, &data_page); 2108 ret = copy_mount_options(data, &data_page);
2093 if (retval < 0) 2109 if (ret < 0)
2094 goto out3; 2110 goto out_data;
2095 2111
2096 retval = do_mount((char *)dev_page, dir_page, (char *)type_page, 2112 ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
2097 flags, (void *)data_page); 2113 (void *) data_page);
2098 free_page(data_page);
2099 2114
2100out3: 2115 free_page(data_page);
2101 free_page(dev_page); 2116out_data:
2102out2: 2117 kfree(kernel_dev);
2103 putname(dir_page); 2118out_dev:
2104out1: 2119 putname(kernel_dir);
2105 free_page(type_page); 2120out_dir:
2106 return retval; 2121 kfree(kernel_type);
2122out_type:
2123 return ret;
2107} 2124}
2108 2125
2109/* 2126/*
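
[Note] sys_mount() drops the page-sized copy_mount_options() buffers for the type and device strings in favour of copy_mount_string(), a thin NULL-tolerant wrapper around strndup_user(): a NULL user pointer yields a NULL kernel string (which is why do_new_mount() and do_mount() lose their memchr() bound checks), and anything else is duplicated with a PAGE_SIZE cap. Caller sketch, using the function as added above (utype is an illustrative __user pointer):

    char *kernel_type;
    int ret;

    ret = copy_mount_string(utype, &kernel_type);
    if (ret < 0)
            return ret;
    /* kernel_type is NULL (user passed NULL) or a NUL-terminated copy,
     * at most PAGE_SIZE bytes including the terminator */
    /* ... use kernel_type ... */
    kfree(kernel_type);         /* kfree(NULL) is a no-op */
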
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9c590722d87e..b8b5b30d53f0 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1241,7 +1241,7 @@ ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date)
1241 month = 2; 1241 month = 2;
1242 } else { 1242 } else {
1243 nl_day = (year & 3) || day <= 59 ? day : day - 1; 1243 nl_day = (year & 3) || day <= 59 ? day : day - 1;
1244 for (month = 0; month < 12; month++) 1244 for (month = 1; month < 12; month++)
1245 if (day_n[month] > nl_day) 1245 if (day_n[month] > nl_day)
1246 break; 1246 break;
1247 } 1247 }
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b99ce205b1bd..cf98da1be23e 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -746,16 +746,8 @@ static void ncp_put_super(struct super_block *sb)
746 746
747#ifdef CONFIG_NCPFS_NLS 747#ifdef CONFIG_NCPFS_NLS
748 /* unload the NLS charsets */ 748 /* unload the NLS charsets */
749 if (server->nls_vol) 749 unload_nls(server->nls_vol);
750 { 750 unload_nls(server->nls_io);
751 unload_nls(server->nls_vol);
752 server->nls_vol = NULL;
753 }
754 if (server->nls_io)
755 {
756 unload_nls(server->nls_io);
757 server->nls_io = NULL;
758 }
759#endif /* CONFIG_NCPFS_NLS */ 751#endif /* CONFIG_NCPFS_NLS */
760 752
761 if (server->info_filp) 753 if (server->info_filp)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index fa038df63ac8..0d58caf4a6e1 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -223,10 +223,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
223 oldset_io = server->nls_io; 223 oldset_io = server->nls_io;
224 server->nls_io = iocharset; 224 server->nls_io = iocharset;
225 225
226 if (oldset_cp) 226 unload_nls(oldset_cp);
227 unload_nls(oldset_cp); 227 unload_nls(oldset_io);
228 if (oldset_io)
229 unload_nls(oldset_io);
230 228
231 return 0; 229 return 0;
232} 230}
@@ -442,7 +440,7 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
442 if (dentry) { 440 if (dentry) {
443 struct inode* s_inode = dentry->d_inode; 441 struct inode* s_inode = dentry->d_inode;
444 442
445 if (inode) { 443 if (s_inode) {
446 NCP_FINFO(s_inode)->volNumber = vnum; 444 NCP_FINFO(s_inode)->volNumber = vnum;
447 NCP_FINFO(s_inode)->dirEntNum = de; 445 NCP_FINFO(s_inode)->dirEntNum = de;
448 NCP_FINFO(s_inode)->DosDirNum = dosde; 446 NCP_FINFO(s_inode)->DosDirNum = dosde;
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 5d8dcb9ee326..15458decdb8a 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -95,7 +95,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
95 return VM_FAULT_MAJOR; 95 return VM_FAULT_MAJOR;
96} 96}
97 97
98static struct vm_operations_struct ncp_file_mmap = 98static const struct vm_operations_struct ncp_file_mmap =
99{ 99{
100 .fault = ncp_file_mmap_fault, 100 .fault = ncp_file_mmap_fault,
101}; 101};
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 845159814de2..da7fda639eac 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
8 direct.o pagelist.o proc.o read.o symlink.o unlink.o \ 8 direct.o pagelist.o proc.o read.o symlink.o unlink.o \
9 write.o namespace.o mount_clnt.o 9 write.o namespace.o mount_clnt.o \
10 dns_resolve.o cache_lib.o
10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o 12nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o 13nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 000000000000..b4ffd0146ea6
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
1/*
2 * linux/fs/nfs/cache_lib.c
3 *
4 * Helper routines for the NFS client caches
5 *
6 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
7 */
8#include <linux/kmod.h>
9#include <linux/module.h>
10#include <linux/moduleparam.h>
11#include <linux/mount.h>
12#include <linux/namei.h>
13#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h>
15
16#include "cache_lib.h"
17
18#define NFS_CACHE_UPCALL_PATHLEN 256
19#define NFS_CACHE_UPCALL_TIMEOUT 15
20
21static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
22 "/sbin/nfs_cache_getent";
23static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
24
25module_param_string(cache_getent, nfs_cache_getent_prog,
26 sizeof(nfs_cache_getent_prog), 0600);
27MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
28module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
29MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
30 "the cache upcall is assumed to have failed");
31
32int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
33{
34 static char *envp[] = { "HOME=/",
35 "TERM=linux",
36 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
37 NULL
38 };
39 char *argv[] = {
40 nfs_cache_getent_prog,
41 cd->name,
42 entry_name,
43 NULL
44 };
45 int ret = -EACCES;
46
47 if (nfs_cache_getent_prog[0] == '\0')
48 goto out;
49 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
50 /*
51 * Disable the upcall mechanism if we're getting an ENOENT or
52 * EACCES error. The admin can re-enable it on the fly by using
53 * sysfs to set the 'cache_getent' parameter once the problem
54 * has been fixed.
55 */
56 if (ret == -ENOENT || ret == -EACCES)
57 nfs_cache_getent_prog[0] = '\0';
58out:
59 return ret > 0 ? 0 : ret;
60}
61
62/*
63 * Deferred request handling
64 */
65void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
66{
67 if (atomic_dec_and_test(&dreq->count))
68 kfree(dreq);
69}
70
71static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
72{
73 struct nfs_cache_defer_req *dreq;
74
75 dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
76
77 complete_all(&dreq->completion);
78 nfs_cache_defer_req_put(dreq);
79}
80
81static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
82{
83 struct nfs_cache_defer_req *dreq;
84
85 dreq = container_of(req, struct nfs_cache_defer_req, req);
86 dreq->deferred_req.revisit = nfs_dns_cache_revisit;
87 atomic_inc(&dreq->count);
88
89 return &dreq->deferred_req;
90}
91
92struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
93{
94 struct nfs_cache_defer_req *dreq;
95
96 dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
97 if (dreq) {
98 init_completion(&dreq->completion);
99 atomic_set(&dreq->count, 1);
100 dreq->req.defer = nfs_dns_cache_defer;
101 }
102 return dreq;
103}
104
105int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
106{
107 if (wait_for_completion_timeout(&dreq->completion,
108 nfs_cache_getent_timeout * HZ) == 0)
109 return -ETIMEDOUT;
110 return 0;
111}
112
113int nfs_cache_register(struct cache_detail *cd)
114{
115 struct nameidata nd;
116 struct vfsmount *mnt;
117 int ret;
118
119 mnt = rpc_get_mount();
120 if (IS_ERR(mnt))
121 return PTR_ERR(mnt);
122 ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
123 if (ret)
124 goto err;
125 ret = sunrpc_cache_register_pipefs(nd.path.dentry,
126 cd->name, 0600, cd);
127 path_put(&nd.path);
128 if (!ret)
129 return ret;
130err:
131 rpc_put_mount();
132 return ret;
133}
134
135void nfs_cache_unregister(struct cache_detail *cd)
136{
137 sunrpc_cache_unregister_pipefs(cd);
138 rpc_put_mount();
139}
140
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 000000000000..76f856e284e4
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
1/*
2 * Helper routines for the NFS client caches
3 *
4 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
5 */
6
7#include <linux/completion.h>
8#include <linux/sunrpc/cache.h>
9#include <asm/atomic.h>
10
11/*
12 * Deferred request handling
13 */
14struct nfs_cache_defer_req {
15 struct cache_req req;
16 struct cache_deferred_req deferred_req;
17 struct completion completion;
18 atomic_t count;
19};
20
21extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
22extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
23extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
24extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
25
26extern int nfs_cache_register(struct cache_detail *cd);
27extern void nfs_cache_unregister(struct cache_detail *cd);
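
[Note] cache_lib.c wires the sunrpc cache machinery up for client-side use: nfs_cache_upcall() runs a userspace helper (/sbin/nfs_cache_getent by default, tunable via the cache_getent module parameter), and the nfs_cache_defer_req helpers let a synchronous caller park on a completion until the cache is filled or the timeout fires. A sketch of how a lookup path might use them (hypothetical caller; fs/nfs/dns_resolve.c below is the real consumer):

    static int lookup_and_wait(struct cache_detail *cd, struct nfs_dns_ent *item)
    {
            struct nfs_cache_defer_req *dreq;
            int ret;

            dreq = nfs_cache_defer_req_alloc();
            if (dreq == NULL)
                    return -ENOMEM;

            ret = cache_check(cd, &item->h, &dreq->req); /* may queue an upcall */
            if (ret == -EAGAIN)
                    ret = nfs_cache_wait_for_upcall(dreq); /* -ETIMEDOUT after
                                                              cache_getent_timeout secs */
            nfs_cache_defer_req_put(dreq);
            return ret;
    }
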
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7f604c7941fb..293fa0528a6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
43unsigned int nfs_callback_set_tcpport; 43unsigned int nfs_callback_set_tcpport;
44unsigned short nfs_callback_tcpport; 44unsigned short nfs_callback_tcpport;
45unsigned short nfs_callback_tcpport6; 45unsigned short nfs_callback_tcpport6;
46static const int nfs_set_port_min = 0; 46#define NFS_CALLBACK_MAXPORTNR (65535U)
47static const int nfs_set_port_max = 65535;
48 47
49static int param_set_port(const char *val, struct kernel_param *kp) 48static int param_set_portnr(const char *val, struct kernel_param *kp)
50{ 49{
51 char *endp; 50 unsigned long num;
52 int num = simple_strtol(val, &endp, 0); 51 int ret;
53 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) 52
53 if (!val)
54 return -EINVAL;
55 ret = strict_strtoul(val, 0, &num);
56 if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
54 return -EINVAL; 57 return -EINVAL;
55 *((int *)kp->arg) = num; 58 *((unsigned int *)kp->arg) = num;
56 return 0; 59 return 0;
57} 60}
58 61
59module_param_call(callback_tcpport, param_set_port, param_get_int, 62static int param_get_portnr(char *buffer, struct kernel_param *kp)
60 &nfs_callback_set_tcpport, 0644); 63{
64 return param_get_uint(buffer, kp);
65}
66#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
67
68module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
61 69
62/* 70/*
63 * This is the NFSv4 callback kernel thread. 71 * This is the NFSv4 callback kernel thread.
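
[Note] The callback.c hunk replaces module_param_call() with a named parameter type: module_param_named(name, var, portnr, 0644) expands to references to param_set_portnr(), param_get_portnr() and param_check_portnr(), so defining those three creates a new bounds-checked parameter type. The same recipe works for any constrained integer parameter (sketch, illustrative names):

    static int param_set_u16ish(const char *val, struct kernel_param *kp)
    {
            unsigned long num;

            if (!val || strict_strtoul(val, 0, &num) || num > 65535)
                    return -EINVAL;
            *(unsigned int *)kp->arg = num;
            return 0;
    }

    static int param_get_u16ish(char *buffer, struct kernel_param *kp)
    {
            return param_get_uint(buffer, kp);
    }
    #define param_check_u16ish(name, p) __param_check(name, p, unsigned int)

    static unsigned int my_port = 2049;
    module_param_named(myport, my_port, u16ish, 0644);
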
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index e5a2dac5f715..76b0aa0f73bf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -222,7 +222,7 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
222 222
223 p = read_buf(xdr, len); 223 p = read_buf(xdr, len);
224 if (unlikely(p == NULL)) 224 if (unlikely(p == NULL))
225 return htonl(NFS4ERR_RESOURCE);; 225 return htonl(NFS4ERR_RESOURCE);
226 226
227 memcpy(sid->data, p, len); 227 memcpy(sid->data, p, len);
228 return 0; 228 return 0;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..63976c0ccc25 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -648,8 +648,6 @@ static int nfs_start_lockd(struct nfs_server *server)
648 .hostname = clp->cl_hostname, 648 .hostname = clp->cl_hostname,
649 .address = (struct sockaddr *)&clp->cl_addr, 649 .address = (struct sockaddr *)&clp->cl_addr,
650 .addrlen = clp->cl_addrlen, 650 .addrlen = clp->cl_addrlen,
651 .protocol = server->flags & NFS_MOUNT_TCP ?
652 IPPROTO_TCP : IPPROTO_UDP,
653 .nfs_version = clp->rpc_ops->version, 651 .nfs_version = clp->rpc_ops->version,
654 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 652 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
655 1 : 0, 653 1 : 0,
@@ -660,6 +658,14 @@ static int nfs_start_lockd(struct nfs_server *server)
660 if (server->flags & NFS_MOUNT_NONLM) 658 if (server->flags & NFS_MOUNT_NONLM)
661 return 0; 659 return 0;
662 660
661 switch (clp->cl_proto) {
662 default:
663 nlm_init.protocol = IPPROTO_TCP;
664 break;
665 case XPRT_TRANSPORT_UDP:
666 nlm_init.protocol = IPPROTO_UDP;
667 }
668
663 host = nlmclnt_init(&nlm_init); 669 host = nlmclnt_init(&nlm_init);
664 if (IS_ERR(host)) 670 if (IS_ERR(host))
665 return PTR_ERR(host); 671 return PTR_ERR(host);
@@ -787,7 +793,7 @@ static int nfs_init_server(struct nfs_server *server,
787 dprintk("--> nfs_init_server()\n"); 793 dprintk("--> nfs_init_server()\n");
788 794
789#ifdef CONFIG_NFS_V3 795#ifdef CONFIG_NFS_V3
790 if (data->flags & NFS_MOUNT_VER3) 796 if (data->version == 3)
791 cl_init.rpc_ops = &nfs_v3_clientops; 797 cl_init.rpc_ops = &nfs_v3_clientops;
792#endif 798#endif
793 799
@@ -809,6 +815,9 @@ static int nfs_init_server(struct nfs_server *server,
809 /* Initialise the client representation from the mount data */ 815 /* Initialise the client representation from the mount data */
810 server->flags = data->flags; 816 server->flags = data->flags;
811 server->options = data->options; 817 server->options = data->options;
818 server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
819 NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
820 NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
812 821
813 if (data->rsize) 822 if (data->rsize)
814 server->rsize = nfs_block_size(data->rsize, NULL); 823 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -879,6 +888,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
879 server->rsize = NFS_MAX_FILE_IO_SIZE; 888 server->rsize = NFS_MAX_FILE_IO_SIZE;
880 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 889 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
881 890
891 server->backing_dev_info.name = "nfs";
882 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; 892 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
883 893
884 if (server->wsize > max_rpc_payload) 894 if (server->wsize > max_rpc_payload)
@@ -929,10 +939,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
929 goto out_error; 939 goto out_error;
930 940
931 nfs_server_set_fsinfo(server, &fsinfo); 941 nfs_server_set_fsinfo(server, &fsinfo);
932 error = bdi_init(&server->backing_dev_info);
933 if (error)
934 goto out_error;
935
936 942
937 /* Get some general file system info */ 943 /* Get some general file system info */
938 if (server->namelen == 0) { 944 if (server->namelen == 0) {
@@ -964,6 +970,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
964 target->acdirmin = source->acdirmin; 970 target->acdirmin = source->acdirmin;
965 target->acdirmax = source->acdirmax; 971 target->acdirmax = source->acdirmax;
966 target->caps = source->caps; 972 target->caps = source->caps;
973 target->options = source->options;
967} 974}
968 975
969/* 976/*
@@ -991,6 +998,12 @@ static struct nfs_server *nfs_alloc_server(void)
991 return NULL; 998 return NULL;
992 } 999 }
993 1000
1001 if (bdi_init(&server->backing_dev_info)) {
1002 nfs_free_iostats(server->io_stats);
1003 kfree(server);
1004 return NULL;
1005 }
1006
994 return server; 1007 return server;
995} 1008}
996 1009
@@ -1074,10 +1087,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1074 (unsigned long long) server->fsid.major, 1087 (unsigned long long) server->fsid.major,
1075 (unsigned long long) server->fsid.minor); 1088 (unsigned long long) server->fsid.minor);
1076 1089
1077 BUG_ON(!server->nfs_client);
1078 BUG_ON(!server->nfs_client->rpc_ops);
1079 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1080
1081 spin_lock(&nfs_client_lock); 1090 spin_lock(&nfs_client_lock);
1082 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); 1091 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1083 list_add_tail(&server->master_link, &nfs_volume_list); 1092 list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1274,7 +1283,7 @@ static int nfs4_init_server(struct nfs_server *server,
1274 1283
1275 /* Initialise the client representation from the mount data */ 1284 /* Initialise the client representation from the mount data */
1276 server->flags = data->flags; 1285 server->flags = data->flags;
1277 server->caps |= NFS_CAP_ATOMIC_OPEN; 1286 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
1278 server->options = data->options; 1287 server->options = data->options;
1279 1288
1280 /* Get a client record */ 1289 /* Get a client record */
@@ -1359,10 +1368,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1359 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1368 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1360 server->namelen = NFS4_MAXNAMLEN; 1369 server->namelen = NFS4_MAXNAMLEN;
1361 1370
1362 BUG_ON(!server->nfs_client);
1363 BUG_ON(!server->nfs_client->rpc_ops);
1364 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1365
1366 spin_lock(&nfs_client_lock); 1371 spin_lock(&nfs_client_lock);
1367 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); 1372 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1368 list_add_tail(&server->master_link, &nfs_volume_list); 1373 list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1400,7 +1405,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1400 1405
1401 /* Initialise the client representation from the parent server */ 1406 /* Initialise the client representation from the parent server */
1402 nfs_server_copy_userdata(server, parent_server); 1407 nfs_server_copy_userdata(server, parent_server);
1403 server->caps |= NFS_CAP_ATOMIC_OPEN; 1408 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
1404 1409
1405 /* Get a client representation. 1410 /* Get a client representation.
1406 * Note: NFSv4 always uses TCP, */ 1411 * Note: NFSv4 always uses TCP, */
@@ -1533,7 +1538,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
1533static void nfs_server_list_stop(struct seq_file *p, void *v); 1538static void nfs_server_list_stop(struct seq_file *p, void *v);
1534static int nfs_server_list_show(struct seq_file *m, void *v); 1539static int nfs_server_list_show(struct seq_file *m, void *v);
1535 1540
1536static struct seq_operations nfs_server_list_ops = { 1541static const struct seq_operations nfs_server_list_ops = {
1537 .start = nfs_server_list_start, 1542 .start = nfs_server_list_start,
1538 .next = nfs_server_list_next, 1543 .next = nfs_server_list_next,
1539 .stop = nfs_server_list_stop, 1544 .stop = nfs_server_list_stop,
@@ -1554,7 +1559,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
1554static void nfs_volume_list_stop(struct seq_file *p, void *v); 1559static void nfs_volume_list_stop(struct seq_file *p, void *v);
1555static int nfs_volume_list_show(struct seq_file *m, void *v); 1560static int nfs_volume_list_show(struct seq_file *m, void *v);
1556 1561
1557static struct seq_operations nfs_volume_list_ops = { 1562static const struct seq_operations nfs_volume_list_ops = {
1558 .start = nfs_volume_list_start, 1563 .start = nfs_volume_list_start,
1559 .next = nfs_volume_list_next, 1564 .next = nfs_volume_list_next,
1560 .stop = nfs_volume_list_stop, 1565 .stop = nfs_volume_list_stop,
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e4e089a8f294..6c3210099d51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -934,9 +934,6 @@ out:
934 * back into its cache. We let the server do generic write 934 * back into its cache. We let the server do generic write
935 * parameter checking and report problems. 935 * parameter checking and report problems.
936 * 936 *
937 * We also avoid an unnecessary invocation of generic_osync_inode(),
938 * as it is fairly meaningless to sync the metadata of an NFS file.
939 *
940 * We eliminate local atime updates, see direct read above. 937 * We eliminate local atime updates, see direct read above.
941 * 938 *
942 * We avoid unnecessary page cache invalidations for normal cached 939 * We avoid unnecessary page cache invalidations for normal cached
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000000..f4d54ba97cc6
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,335 @@
1/*
2 * linux/fs/nfs/dns_resolve.c
3 *
4 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 * Resolves DNS hostnames into valid ip addresses
7 */
8
9#include <linux/hash.h>
10#include <linux/string.h>
11#include <linux/kmod.h>
12#include <linux/module.h>
13#include <linux/socket.h>
14#include <linux/seq_file.h>
15#include <linux/inet.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/sunrpc/cache.h>
18#include <linux/sunrpc/svcauth.h>
19
20#include "dns_resolve.h"
21#include "cache_lib.h"
22
23#define NFS_DNS_HASHBITS 4
24#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
25
26static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
27
28struct nfs_dns_ent {
29 struct cache_head h;
30
31 char *hostname;
32 size_t namelen;
33
34 struct sockaddr_storage addr;
35 size_t addrlen;
36};
37
38
39static void nfs_dns_ent_init(struct cache_head *cnew,
40 struct cache_head *ckey)
41{
42 struct nfs_dns_ent *new;
43 struct nfs_dns_ent *key;
44
45 new = container_of(cnew, struct nfs_dns_ent, h);
46 key = container_of(ckey, struct nfs_dns_ent, h);
47
48 kfree(new->hostname);
49 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
50 if (new->hostname) {
51 new->namelen = key->namelen;
52 memcpy(&new->addr, &key->addr, key->addrlen);
53 new->addrlen = key->addrlen;
54 } else {
55 new->namelen = 0;
56 new->addrlen = 0;
57 }
58}
59
60static void nfs_dns_ent_put(struct kref *ref)
61{
62 struct nfs_dns_ent *item;
63
64 item = container_of(ref, struct nfs_dns_ent, h.ref);
65 kfree(item->hostname);
66 kfree(item);
67}
68
69static struct cache_head *nfs_dns_ent_alloc(void)
70{
71 struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
72
73 if (item != NULL) {
74 item->hostname = NULL;
75 item->namelen = 0;
76 item->addrlen = 0;
77 return &item->h;
78 }
79 return NULL;
80};
81
82static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
83{
84 return hash_str(key->hostname, NFS_DNS_HASHBITS);
85}
86
87static void nfs_dns_request(struct cache_detail *cd,
88 struct cache_head *ch,
89 char **bpp, int *blen)
90{
91 struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
92
93 qword_add(bpp, blen, key->hostname);
94 (*bpp)[-1] = '\n';
95}
96
97static int nfs_dns_upcall(struct cache_detail *cd,
98 struct cache_head *ch)
99{
100 struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
101 int ret;
102
103 ret = nfs_cache_upcall(cd, key->hostname);
104 if (ret)
105 ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
106 return ret;
107}
108
109static int nfs_dns_match(struct cache_head *ca,
110 struct cache_head *cb)
111{
112 struct nfs_dns_ent *a;
113 struct nfs_dns_ent *b;
114
115 a = container_of(ca, struct nfs_dns_ent, h);
116 b = container_of(cb, struct nfs_dns_ent, h);
117
118 if (a->namelen == 0 || a->namelen != b->namelen)
119 return 0;
120 return memcmp(a->hostname, b->hostname, a->namelen) == 0;
121}
122
123static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
124 struct cache_head *h)
125{
126 struct nfs_dns_ent *item;
127 long ttl;
128
129 if (h == NULL) {
130 seq_puts(m, "# ip address hostname ttl\n");
131 return 0;
132 }
133 item = container_of(h, struct nfs_dns_ent, h);
134 ttl = (long)item->h.expiry_time - (long)get_seconds();
135 if (ttl < 0)
136 ttl = 0;
137
138 if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
139 char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
140
141 rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
142 seq_printf(m, "%15s ", buf);
143 } else
144 seq_puts(m, "<none> ");
145 seq_printf(m, "%15s %ld\n", item->hostname, ttl);
146 return 0;
147}
148
149struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
150 struct nfs_dns_ent *key)
151{
152 struct cache_head *ch;
153
154 ch = sunrpc_cache_lookup(cd,
155 &key->h,
156 nfs_dns_hash(key));
157 if (!ch)
158 return NULL;
159 return container_of(ch, struct nfs_dns_ent, h);
160}
161
162struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
163 struct nfs_dns_ent *new,
164 struct nfs_dns_ent *key)
165{
166 struct cache_head *ch;
167
168 ch = sunrpc_cache_update(cd,
169 &new->h, &key->h,
170 nfs_dns_hash(key));
171 if (!ch)
172 return NULL;
173 return container_of(ch, struct nfs_dns_ent, h);
174}
175
176static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
177{
178 char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
179 struct nfs_dns_ent key, *item;
180 unsigned long ttl;
181 ssize_t len;
182 int ret = -EINVAL;
183
184 if (buf[buflen-1] != '\n')
185 goto out;
186 buf[buflen-1] = '\0';
187
188 len = qword_get(&buf, buf1, sizeof(buf1));
189 if (len <= 0)
190 goto out;
191 key.addrlen = rpc_pton(buf1, len,
192 (struct sockaddr *)&key.addr,
193 sizeof(key.addr));
194
195 len = qword_get(&buf, buf1, sizeof(buf1));
196 if (len <= 0)
197 goto out;
198
199 key.hostname = buf1;
200 key.namelen = len;
201 memset(&key.h, 0, sizeof(key.h));
202
203 ttl = get_expiry(&buf);
204 if (ttl == 0)
205 goto out;
206 key.h.expiry_time = ttl + get_seconds();
207
208 ret = -ENOMEM;
209 item = nfs_dns_lookup(cd, &key);
210 if (item == NULL)
211 goto out;
212
213 if (key.addrlen == 0)
214 set_bit(CACHE_NEGATIVE, &key.h.flags);
215
216 item = nfs_dns_update(cd, &key, item);
217 if (item == NULL)
218 goto out;
219
220 ret = 0;
221 cache_put(&item->h, cd);
222out:
223 return ret;
224}
225
226static struct cache_detail nfs_dns_resolve = {
227 .owner = THIS_MODULE,
228 .hash_size = NFS_DNS_HASHTBL_SIZE,
229 .hash_table = nfs_dns_table,
230 .name = "dns_resolve",
231 .cache_put = nfs_dns_ent_put,
232 .cache_upcall = nfs_dns_upcall,
233 .cache_parse = nfs_dns_parse,
234 .cache_show = nfs_dns_show,
235 .match = nfs_dns_match,
236 .init = nfs_dns_ent_init,
237 .update = nfs_dns_ent_init,
238 .alloc = nfs_dns_ent_alloc,
239};
240
241static int do_cache_lookup(struct cache_detail *cd,
242 struct nfs_dns_ent *key,
243 struct nfs_dns_ent **item,
244 struct nfs_cache_defer_req *dreq)
245{
246 int ret = -ENOMEM;
247
248 *item = nfs_dns_lookup(cd, key);
249 if (*item) {
250 ret = cache_check(cd, &(*item)->h, &dreq->req);
251 if (ret)
252 *item = NULL;
253 }
254 return ret;
255}
256
257static int do_cache_lookup_nowait(struct cache_detail *cd,
258 struct nfs_dns_ent *key,
259 struct nfs_dns_ent **item)
260{
261 int ret = -ENOMEM;
262
263 *item = nfs_dns_lookup(cd, key);
264 if (!*item)
265 goto out_err;
266 ret = -ETIMEDOUT;
267 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
268 || (*item)->h.expiry_time < get_seconds()
269 || cd->flush_time > (*item)->h.last_refresh)
270 goto out_put;
271 ret = -ENOENT;
272 if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
273 goto out_put;
274 return 0;
275out_put:
276 cache_put(&(*item)->h, cd);
277out_err:
278 *item = NULL;
279 return ret;
280}
281
282static int do_cache_lookup_wait(struct cache_detail *cd,
283 struct nfs_dns_ent *key,
284 struct nfs_dns_ent **item)
285{
286 struct nfs_cache_defer_req *dreq;
287 int ret = -ENOMEM;
288
289 dreq = nfs_cache_defer_req_alloc();
290 if (!dreq)
291 goto out;
292 ret = do_cache_lookup(cd, key, item, dreq);
293 if (ret == -EAGAIN) {
294 ret = nfs_cache_wait_for_upcall(dreq);
295 if (!ret)
296 ret = do_cache_lookup_nowait(cd, key, item);
297 }
298 nfs_cache_defer_req_put(dreq);
299out:
300 return ret;
301}
302
303ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
304 struct sockaddr *sa, size_t salen)
305{
306 struct nfs_dns_ent key = {
307 .hostname = name,
308 .namelen = namelen,
309 };
310 struct nfs_dns_ent *item = NULL;
311 ssize_t ret;
312
313 ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
314 if (ret == 0) {
315 if (salen >= item->addrlen) {
316 memcpy(sa, &item->addr, item->addrlen);
317 ret = item->addrlen;
318 } else
319 ret = -EOVERFLOW;
320 cache_put(&item->h, &nfs_dns_resolve);
321 } else if (ret == -ENOENT)
322 ret = -ESRCH;
323 return ret;
324}
325
326int nfs_dns_resolver_init(void)
327{
328 return nfs_cache_register(&nfs_dns_resolve);
329}
330
331void nfs_dns_resolver_destroy(void)
332{
333 nfs_cache_unregister(&nfs_dns_resolve);
334}
335
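The resolver above is a sunrpc cache keyed on hostname: nfs_dns_resolve_name() performs the lookup, and when the entry is missing or stale the cache layer upcalls to userspace, which answers by writing a "<ip address> <hostname> <ttl in seconds>" line back into the cache channel for nfs_dns_parse() to decode (an empty address produces a negative entry). A hypothetical in-kernel caller might look like the sketch below; the hostname and the fallback to NFS_PORT are illustrative, not taken from this patch.

	struct sockaddr_storage ss;
	char name[] = "server.example.com";	/* illustrative */
	ssize_t len;

	len = nfs_dns_resolve_name(name, strlen(name),
				   (struct sockaddr *)&ss, sizeof(ss));
	if (len > 0)
		rpc_set_port((struct sockaddr *)&ss, NFS_PORT);
	else if (len == -ESRCH)
		;	/* negative entry: userspace could not resolve the name */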
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000000..a3f0938babf7
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,14 @@
1/*
 2 * Resolve DNS hostnames into valid IP addresses
3 */
4#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
5#define __LINUX_FS_NFS_DNS_RESOLVE_H
6
7#define NFS_DNS_HOSTNAME_MAXLEN (128)
8
9extern int nfs_dns_resolver_init(void);
10extern void nfs_dns_resolver_destroy(void);
11extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
12 struct sockaddr *sa, size_t salen);
13
14#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05062329b678..f5fdd39e037a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -59,7 +59,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
60static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); 60static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
61 61
62static struct vm_operations_struct nfs_file_vm_ops; 62static const struct vm_operations_struct nfs_file_vm_ops;
63 63
64const struct file_operations nfs_file_operations = { 64const struct file_operations nfs_file_operations = {
65 .llseek = nfs_file_llseek, 65 .llseek = nfs_file_llseek,
@@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
328} 328}
329 329
330/* 330/*
331 * Decide whether a read/modify/write cycle may be more efficient
 332 * than a modify/write/read cycle when writing to a page in the
333 * page cache.
334 *
335 * The modify/write/read cycle may occur if a page is read before
336 * being completely filled by the writer. In this situation, the
337 * page must be completely written to stable storage on the server
338 * before it can be refilled by reading in the page from the server.
339 * This can lead to expensive, small, FILE_SYNC mode writes being
340 * done.
341 *
342 * It may be more efficient to read the page first if the file is
343 * open for reading in addition to writing, the page is not marked
344 * as Uptodate, it is not dirty or waiting to be committed,
345 * indicating that it was previously allocated and then modified,
346 * that there were valid bytes of data in that range of the file,
347 * and that the new data won't completely replace the old data in
348 * that range of the file.
349 */
350static int nfs_want_read_modify_write(struct file *file, struct page *page,
351 loff_t pos, unsigned len)
352{
353 unsigned int pglen = nfs_page_length(page);
354 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
355 unsigned int end = offset + len;
356
357 if ((file->f_mode & FMODE_READ) && /* open for read? */
358 !PageUptodate(page) && /* Uptodate? */
359 !PagePrivate(page) && /* i/o request already? */
360 pglen && /* valid bytes of file? */
361 (end < pglen || offset)) /* replace all valid bytes? */
362 return 1;
363 return 0;
364}
365
366/*
331 * This does the "real" work of the write. We must allocate and lock the 367 * This does the "real" work of the write. We must allocate and lock the
332 * page to be sent back to the generic routine, which then copies the 368 * page to be sent back to the generic routine, which then copies the
333 * data from user space. 369 * data from user space.
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
340 struct page **pagep, void **fsdata) 376 struct page **pagep, void **fsdata)
341{ 377{
342 int ret; 378 int ret;
343 pgoff_t index; 379 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
344 struct page *page; 380 struct page *page;
345 index = pos >> PAGE_CACHE_SHIFT; 381 int once_thru = 0;
346 382
347 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", 383 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
348 file->f_path.dentry->d_parent->d_name.name, 384 file->f_path.dentry->d_parent->d_name.name,
349 file->f_path.dentry->d_name.name, 385 file->f_path.dentry->d_name.name,
350 mapping->host->i_ino, len, (long long) pos); 386 mapping->host->i_ino, len, (long long) pos);
351 387
388start:
352 /* 389 /*
353 * Prevent starvation issues if someone is doing a consistency 390 * Prevent starvation issues if someone is doing a consistency
354 * sync-to-disk 391 * sync-to-disk
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
367 if (ret) { 404 if (ret) {
368 unlock_page(page); 405 unlock_page(page);
369 page_cache_release(page); 406 page_cache_release(page);
407 } else if (!once_thru &&
408 nfs_want_read_modify_write(file, page, pos, len)) {
409 once_thru = 1;
410 ret = nfs_readpage(file, page);
411 page_cache_release(page);
412 if (!ret)
413 goto start;
370 } 414 }
371 return ret; 415 return ret;
372} 416}
@@ -479,7 +523,9 @@ const struct address_space_operations nfs_file_aops = {
479 .invalidatepage = nfs_invalidate_page, 523 .invalidatepage = nfs_invalidate_page,
480 .releasepage = nfs_release_page, 524 .releasepage = nfs_release_page,
481 .direct_IO = nfs_direct_IO, 525 .direct_IO = nfs_direct_IO,
526 .migratepage = nfs_migrate_page,
482 .launder_page = nfs_launder_page, 527 .launder_page = nfs_launder_page,
528 .error_remove_page = generic_error_remove_page,
483}; 529};
484 530
485/* 531/*
@@ -526,7 +572,7 @@ out_unlock:
526 return VM_FAULT_SIGBUS; 572 return VM_FAULT_SIGBUS;
527} 573}
528 574
529static struct vm_operations_struct nfs_file_vm_ops = { 575static const struct vm_operations_struct nfs_file_vm_ops = {
530 .fault = filemap_fault, 576 .fault = filemap_fault,
531 .page_mkwrite = nfs_vm_page_mkwrite, 577 .page_mkwrite = nfs_vm_page_mkwrite,
532}; 578};
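The five conditions in nfs_want_read_modify_write() compose a single predicate. The standalone userspace sketch below (the flags stand in for the kernel's FMODE_READ, PageUptodate and PagePrivate state, and the values are made up) shows that a 100-byte write at offset 10 into a fully valid but not-Uptodate page takes the read-first path, because the new data neither starts at the page boundary nor covers the whole valid range.

	#include <stdio.h>

	#define PAGE_CACHE_SIZE 4096u

	/* Userspace model of the heuristic above. */
	static int want_read_modify_write(int open_for_read, int uptodate,
					  int has_io, unsigned int pglen,
					  unsigned long long pos, unsigned int len)
	{
		unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
		unsigned int end = offset + len;

		return open_for_read && !uptodate && !has_io && pglen != 0 &&
		       (end < pglen || offset != 0);
	}

	int main(void)
	{
		/* 100 bytes at offset 10 into a page holding 4096 valid
		 * bytes: read the page first rather than force a small
		 * synchronous FILE_SYNC write later. */
		printf("%d\n", want_read_modify_write(1, 0, 0, 4096, 10, 100));
		return 0;
	}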
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 379be678cb7e..70fad69eb959 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -58,17 +58,34 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
58/* 58/*
59 * Get the cache cookie for an NFS superblock. We have to handle 59 * Get the cache cookie for an NFS superblock. We have to handle
60 * uniquification here because the cache doesn't do it for us. 60 * uniquification here because the cache doesn't do it for us.
61 *
62 * The default uniquifier is just an empty string, but it may be overridden
63 * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
64 * superblock across an automount point of some nature.
61 */ 65 */
62void nfs_fscache_get_super_cookie(struct super_block *sb, 66void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
63 struct nfs_parsed_mount_data *data) 67 struct nfs_clone_mount *mntdata)
64{ 68{
65 struct nfs_fscache_key *key, *xkey; 69 struct nfs_fscache_key *key, *xkey;
66 struct nfs_server *nfss = NFS_SB(sb); 70 struct nfs_server *nfss = NFS_SB(sb);
67 struct rb_node **p, *parent; 71 struct rb_node **p, *parent;
68 const char *uniq = data->fscache_uniq ?: "";
69 int diff, ulen; 72 int diff, ulen;
70 73
71 ulen = strlen(uniq); 74 if (uniq) {
75 ulen = strlen(uniq);
76 } else if (mntdata) {
77 struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
78 if (mnt_s->fscache_key) {
79 uniq = mnt_s->fscache_key->key.uniquifier;
80 ulen = mnt_s->fscache_key->key.uniq_len;
81 }
82 }
83
84 if (!uniq) {
85 uniq = "";
86 ulen = 1;
87 }
88
72 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); 89 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
73 if (!key) 90 if (!key)
74 return; 91 return;
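Under the new signature the uniquifier is chosen in order of preference: the caller's explicit string (the parsed 'fsc=' option), then the parent superblock's key when cloning across an automount, then the empty default. The call sites live in fs/nfs/super.c, which is not part of this excerpt, so the sketch below is an assumption about how they would look:

	/* Hypothetical call sites (sketch, not from this patch): */
	nfs_fscache_get_super_cookie(sb, data->fscache_uniq, NULL);	/* fresh mount */
	nfs_fscache_get_super_cookie(sb, NULL, mntdata);		/* automount clone */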
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 6e809bb0ff08..b9c572d0679f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -74,7 +74,8 @@ extern void nfs_fscache_get_client_cookie(struct nfs_client *);
74extern void nfs_fscache_release_client_cookie(struct nfs_client *); 74extern void nfs_fscache_release_client_cookie(struct nfs_client *);
75 75
76extern void nfs_fscache_get_super_cookie(struct super_block *, 76extern void nfs_fscache_get_super_cookie(struct super_block *,
77 struct nfs_parsed_mount_data *); 77 const char *,
78 struct nfs_clone_mount *);
78extern void nfs_fscache_release_super_cookie(struct super_block *); 79extern void nfs_fscache_release_super_cookie(struct super_block *);
79 80
80extern void nfs_fscache_init_inode_cookie(struct inode *); 81extern void nfs_fscache_init_inode_cookie(struct inode *);
@@ -173,7 +174,8 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
173 174
174static inline void nfs_fscache_get_super_cookie( 175static inline void nfs_fscache_get_super_cookie(
175 struct super_block *sb, 176 struct super_block *sb,
176 struct nfs_parsed_mount_data *data) 177 const char *uniq,
178 struct nfs_clone_mount *mntdata)
177{ 179{
178} 180}
179static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} 181static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 86147b0ab2cf..21a84d45916f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
101 101
102static unsigned int fnvhash32(const void *, size_t); 102static unsigned int fnvhash32(const void *, size_t);
103 103
104static struct rpc_pipe_ops idmap_upcall_ops = { 104static const struct rpc_pipe_ops idmap_upcall_ops = {
105 .upcall = idmap_pipe_upcall, 105 .upcall = idmap_pipe_upcall,
106 .downcall = idmap_pipe_downcall, 106 .downcall = idmap_pipe_downcall,
107 .destroy_msg = idmap_pipe_destroy_msg, 107 .destroy_msg = idmap_pipe_destroy_msg,
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp)
119 if (idmap == NULL) 119 if (idmap == NULL)
120 return -ENOMEM; 120 return -ENOMEM;
121 121
122 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", 122 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
123 idmap, &idmap_upcall_ops, 0); 123 "idmap", idmap, &idmap_upcall_ops, 0);
124 if (IS_ERR(idmap->idmap_dentry)) { 124 if (IS_ERR(idmap->idmap_dentry)) {
125 error = PTR_ERR(idmap->idmap_dentry); 125 error = PTR_ERR(idmap->idmap_dentry);
126 kfree(idmap); 126 kfree(idmap);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd7938eda6a8..faa091865ad0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h" 48#include "fscache.h"
49#include "dns_resolve.h"
49 50
50#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
51 52
@@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
286 /* We can't support update_atime(), since the server will reset it */ 287 /* We can't support update_atime(), since the server will reset it */
287 inode->i_flags |= S_NOATIME|S_NOCMTIME; 288 inode->i_flags |= S_NOATIME|S_NOCMTIME;
288 inode->i_mode = fattr->mode; 289 inode->i_mode = fattr->mode;
290 if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
291 && nfs_server_capable(inode, NFS_CAP_MODE))
292 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
293 | NFS_INO_INVALID_ACCESS
294 | NFS_INO_INVALID_ACL;
289 /* Why so? Because we want revalidate for devices/FIFOs, and 295 /* Why so? Because we want revalidate for devices/FIFOs, and
290 * that's precisely what we have in nfs_file_inode_operations. 296 * that's precisely what we have in nfs_file_inode_operations.
291 */ 297 */
@@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
330 nfsi->attr_gencount = fattr->gencount; 336 nfsi->attr_gencount = fattr->gencount;
331 if (fattr->valid & NFS_ATTR_FATTR_ATIME) 337 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
332 inode->i_atime = fattr->atime; 338 inode->i_atime = fattr->atime;
339 else if (nfs_server_capable(inode, NFS_CAP_ATIME))
340 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
333 if (fattr->valid & NFS_ATTR_FATTR_MTIME) 341 if (fattr->valid & NFS_ATTR_FATTR_MTIME)
334 inode->i_mtime = fattr->mtime; 342 inode->i_mtime = fattr->mtime;
343 else if (nfs_server_capable(inode, NFS_CAP_MTIME))
344 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
345 | NFS_INO_INVALID_DATA;
335 if (fattr->valid & NFS_ATTR_FATTR_CTIME) 346 if (fattr->valid & NFS_ATTR_FATTR_CTIME)
336 inode->i_ctime = fattr->ctime; 347 inode->i_ctime = fattr->ctime;
348 else if (nfs_server_capable(inode, NFS_CAP_CTIME))
349 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
350 | NFS_INO_INVALID_ACCESS
351 | NFS_INO_INVALID_ACL;
337 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) 352 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
338 nfsi->change_attr = fattr->change_attr; 353 nfsi->change_attr = fattr->change_attr;
354 else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
355 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
356 | NFS_INO_INVALID_DATA;
339 if (fattr->valid & NFS_ATTR_FATTR_SIZE) 357 if (fattr->valid & NFS_ATTR_FATTR_SIZE)
340 inode->i_size = nfs_size_to_loff_t(fattr->size); 358 inode->i_size = nfs_size_to_loff_t(fattr->size);
359 else
360 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
361 | NFS_INO_INVALID_DATA
362 | NFS_INO_REVAL_PAGECACHE;
341 if (fattr->valid & NFS_ATTR_FATTR_NLINK) 363 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
342 inode->i_nlink = fattr->nlink; 364 inode->i_nlink = fattr->nlink;
365 else if (nfs_server_capable(inode, NFS_CAP_NLINK))
366 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
343 if (fattr->valid & NFS_ATTR_FATTR_OWNER) 367 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
344 inode->i_uid = fattr->uid; 368 inode->i_uid = fattr->uid;
369 else if (nfs_server_capable(inode, NFS_CAP_OWNER))
370 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
371 | NFS_INO_INVALID_ACCESS
372 | NFS_INO_INVALID_ACL;
345 if (fattr->valid & NFS_ATTR_FATTR_GROUP) 373 if (fattr->valid & NFS_ATTR_FATTR_GROUP)
346 inode->i_gid = fattr->gid; 374 inode->i_gid = fattr->gid;
375 else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
376 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
377 | NFS_INO_INVALID_ACCESS
378 | NFS_INO_INVALID_ACL;
347 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) 379 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
348 inode->i_blocks = fattr->du.nfs2.blocks; 380 inode->i_blocks = fattr->du.nfs2.blocks;
349 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { 381 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -426,49 +458,21 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
426 */ 458 */
427static int nfs_vmtruncate(struct inode * inode, loff_t offset) 459static int nfs_vmtruncate(struct inode * inode, loff_t offset)
428{ 460{
429 if (i_size_read(inode) < offset) { 461 loff_t oldsize;
430 unsigned long limit; 462 int err;
431
432 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
433 if (limit != RLIM_INFINITY && offset > limit)
434 goto out_sig;
435 if (offset > inode->i_sb->s_maxbytes)
436 goto out_big;
437 spin_lock(&inode->i_lock);
438 i_size_write(inode, offset);
439 spin_unlock(&inode->i_lock);
440 } else {
441 struct address_space *mapping = inode->i_mapping;
442 463
443 /* 464 err = inode_newsize_ok(inode, offset);
444 * truncation of in-use swapfiles is disallowed - it would 465 if (err)
445 * cause subsequent swapout to scribble on the now-freed 466 goto out;
446 * blocks.
447 */
448 if (IS_SWAPFILE(inode))
449 return -ETXTBSY;
450 spin_lock(&inode->i_lock);
451 i_size_write(inode, offset);
452 spin_unlock(&inode->i_lock);
453 467
454 /* 468 spin_lock(&inode->i_lock);
455 * unmap_mapping_range is called twice, first simply for 469 oldsize = inode->i_size;
456 * efficiency so that truncate_inode_pages does fewer 470 i_size_write(inode, offset);
457 * single-page unmaps. However after this first call, and 471 spin_unlock(&inode->i_lock);
458 * before truncate_inode_pages finishes, it is possible for 472
459 * private pages to be COWed, which remain after 473 truncate_pagecache(inode, oldsize, offset);
460 * truncate_inode_pages finishes, hence the second 474out:
461 * unmap_mapping_range call must be made for correctness. 475 return err;
462 */
463 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
464 truncate_inode_pages(mapping, offset);
465 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
466 }
467 return 0;
468out_sig:
469 send_sig(SIGXFSZ, current, 0);
470out_big:
471 return -EFBIG;
472} 476}
473 477
474/** 478/**
@@ -1145,6 +1149,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1145 loff_t cur_isize, new_isize; 1149 loff_t cur_isize, new_isize;
1146 unsigned long invalid = 0; 1150 unsigned long invalid = 0;
1147 unsigned long now = jiffies; 1151 unsigned long now = jiffies;
1152 unsigned long save_cache_validity;
1148 1153
1149 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", 1154 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
1150 __func__, inode->i_sb->s_id, inode->i_ino, 1155 __func__, inode->i_sb->s_id, inode->i_ino,
@@ -1171,10 +1176,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1171 */ 1176 */
1172 nfsi->read_cache_jiffies = fattr->time_start; 1177 nfsi->read_cache_jiffies = fattr->time_start;
1173 1178
1174 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) 1179 save_cache_validity = nfsi->cache_validity;
1175 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR 1180 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
1176 | NFS_INO_INVALID_ATIME 1181 | NFS_INO_INVALID_ATIME
1177 | NFS_INO_REVAL_PAGECACHE); 1182 | NFS_INO_REVAL_FORCED
1183 | NFS_INO_REVAL_PAGECACHE);
1178 1184
1179 /* Do atomic weak cache consistency updates */ 1185 /* Do atomic weak cache consistency updates */
1180 nfs_wcc_update_inode(inode, fattr); 1186 nfs_wcc_update_inode(inode, fattr);
@@ -1189,7 +1195,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1189 nfs_force_lookup_revalidate(inode); 1195 nfs_force_lookup_revalidate(inode);
1190 nfsi->change_attr = fattr->change_attr; 1196 nfsi->change_attr = fattr->change_attr;
1191 } 1197 }
1192 } 1198 } else if (server->caps & NFS_CAP_CHANGE_ATTR)
1199 invalid |= save_cache_validity;
1193 1200
1194 if (fattr->valid & NFS_ATTR_FATTR_MTIME) { 1201 if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
1195 /* NFSv2/v3: Check if the mtime agrees */ 1202 /* NFSv2/v3: Check if the mtime agrees */
@@ -1201,7 +1208,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1201 nfs_force_lookup_revalidate(inode); 1208 nfs_force_lookup_revalidate(inode);
1202 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1209 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1203 } 1210 }
1204 } 1211 } else if (server->caps & NFS_CAP_MTIME)
1212 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1213 | NFS_INO_INVALID_DATA
1214 | NFS_INO_REVAL_PAGECACHE
1215 | NFS_INO_REVAL_FORCED);
1216
1205 if (fattr->valid & NFS_ATTR_FATTR_CTIME) { 1217 if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
1206 /* If ctime has changed we should definitely clear access+acl caches */ 1218 /* If ctime has changed we should definitely clear access+acl caches */
1207 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { 1219 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
@@ -1215,7 +1227,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1215 } 1227 }
1216 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1228 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1217 } 1229 }
1218 } 1230 } else if (server->caps & NFS_CAP_CTIME)
1231 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1232 | NFS_INO_INVALID_ACCESS
1233 | NFS_INO_INVALID_ACL
1234 | NFS_INO_REVAL_FORCED);
1219 1235
1220 /* Check if our cached file size is stale */ 1236 /* Check if our cached file size is stale */
1221 if (fattr->valid & NFS_ATTR_FATTR_SIZE) { 1237 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1231,30 +1247,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1231 dprintk("NFS: isize change on server for file %s/%ld\n", 1247 dprintk("NFS: isize change on server for file %s/%ld\n",
1232 inode->i_sb->s_id, inode->i_ino); 1248 inode->i_sb->s_id, inode->i_ino);
1233 } 1249 }
1234 } 1250 } else
1251 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1252 | NFS_INO_REVAL_PAGECACHE
1253 | NFS_INO_REVAL_FORCED);
1235 1254
1236 1255
1237 if (fattr->valid & NFS_ATTR_FATTR_ATIME) 1256 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
1238 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); 1257 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1258 else if (server->caps & NFS_CAP_ATIME)
1259 invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
1260 | NFS_INO_REVAL_FORCED);
1239 1261
1240 if (fattr->valid & NFS_ATTR_FATTR_MODE) { 1262 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1241 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { 1263 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1242 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1264 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1243 inode->i_mode = fattr->mode; 1265 inode->i_mode = fattr->mode;
1244 } 1266 }
1245 } 1267 } else if (server->caps & NFS_CAP_MODE)
1268 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1269 | NFS_INO_INVALID_ACCESS
1270 | NFS_INO_INVALID_ACL
1271 | NFS_INO_REVAL_FORCED);
1272
1246 if (fattr->valid & NFS_ATTR_FATTR_OWNER) { 1273 if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
1247 if (inode->i_uid != fattr->uid) { 1274 if (inode->i_uid != fattr->uid) {
1248 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1275 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1249 inode->i_uid = fattr->uid; 1276 inode->i_uid = fattr->uid;
1250 } 1277 }
1251 } 1278 } else if (server->caps & NFS_CAP_OWNER)
1279 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1280 | NFS_INO_INVALID_ACCESS
1281 | NFS_INO_INVALID_ACL
1282 | NFS_INO_REVAL_FORCED);
1283
1252 if (fattr->valid & NFS_ATTR_FATTR_GROUP) { 1284 if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
1253 if (inode->i_gid != fattr->gid) { 1285 if (inode->i_gid != fattr->gid) {
1254 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1286 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1255 inode->i_gid = fattr->gid; 1287 inode->i_gid = fattr->gid;
1256 } 1288 }
1257 } 1289 } else if (server->caps & NFS_CAP_OWNER_GROUP)
1290 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1291 | NFS_INO_INVALID_ACCESS
1292 | NFS_INO_INVALID_ACL
1293 | NFS_INO_REVAL_FORCED);
1258 1294
1259 if (fattr->valid & NFS_ATTR_FATTR_NLINK) { 1295 if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
1260 if (inode->i_nlink != fattr->nlink) { 1296 if (inode->i_nlink != fattr->nlink) {
@@ -1263,7 +1299,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1263 invalid |= NFS_INO_INVALID_DATA; 1299 invalid |= NFS_INO_INVALID_DATA;
1264 inode->i_nlink = fattr->nlink; 1300 inode->i_nlink = fattr->nlink;
1265 } 1301 }
1266 } 1302 } else if (server->caps & NFS_CAP_NLINK)
1303 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1304 | NFS_INO_REVAL_FORCED);
1267 1305
1268 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { 1306 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
1269 /* 1307 /*
@@ -1293,9 +1331,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1293 || S_ISLNK(inode->i_mode))) 1331 || S_ISLNK(inode->i_mode)))
1294 invalid &= ~NFS_INO_INVALID_DATA; 1332 invalid &= ~NFS_INO_INVALID_DATA;
1295 if (!nfs_have_delegation(inode, FMODE_READ) || 1333 if (!nfs_have_delegation(inode, FMODE_READ) ||
1296 (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) 1334 (save_cache_validity & NFS_INO_REVAL_FORCED))
1297 nfsi->cache_validity |= invalid; 1335 nfsi->cache_validity |= invalid;
1298 nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
1299 1336
1300 return 0; 1337 return 0;
1301 out_changed: 1338 out_changed:
@@ -1442,6 +1479,10 @@ static int __init init_nfs_fs(void)
1442{ 1479{
1443 int err; 1480 int err;
1444 1481
1482 err = nfs_dns_resolver_init();
1483 if (err < 0)
1484 goto out8;
1485
1445 err = nfs_fscache_register(); 1486 err = nfs_fscache_register();
1446 if (err < 0) 1487 if (err < 0)
1447 goto out7; 1488 goto out7;
@@ -1500,6 +1541,8 @@ out5:
1500out6: 1541out6:
1501 nfs_fscache_unregister(); 1542 nfs_fscache_unregister();
1502out7: 1543out7:
1544 nfs_dns_resolver_destroy();
1545out8:
1503 return err; 1546 return err;
1504} 1547}
1505 1548
@@ -1511,6 +1554,7 @@ static void __exit exit_nfs_fs(void)
1511 nfs_destroy_inodecache(); 1554 nfs_destroy_inodecache();
1512 nfs_destroy_nfspagecache(); 1555 nfs_destroy_nfspagecache();
1513 nfs_fscache_unregister(); 1556 nfs_fscache_unregister();
1557 nfs_dns_resolver_destroy();
1514#ifdef CONFIG_PROC_FS 1558#ifdef CONFIG_PROC_FS
1515 rpc_proc_unregister("nfs"); 1559 rpc_proc_unregister("nfs");
1516#endif 1560#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..e21b1bb9972f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -49,6 +49,11 @@ struct nfs_clone_mount {
49#define NFS_MAX_SECFLAVORS (12) 49#define NFS_MAX_SECFLAVORS (12)
50 50
51/* 51/*
52 * Value used if the user did not specify a port value.
53 */
54#define NFS_UNSPEC_PORT (-1)
55
56/*
52 * In-kernel mount arguments 57 * In-kernel mount arguments
53 */ 58 */
54struct nfs_parsed_mount_data { 59struct nfs_parsed_mount_data {
@@ -63,6 +68,7 @@ struct nfs_parsed_mount_data {
63 unsigned int auth_flavor_len; 68 unsigned int auth_flavor_len;
64 rpc_authflavor_t auth_flavors[1]; 69 rpc_authflavor_t auth_flavors[1];
65 char *client_address; 70 char *client_address;
71 unsigned int version;
66 unsigned int minorversion; 72 unsigned int minorversion;
67 char *fscache_uniq; 73 char *fscache_uniq;
68 74
@@ -71,7 +77,7 @@ struct nfs_parsed_mount_data {
71 size_t addrlen; 77 size_t addrlen;
72 char *hostname; 78 char *hostname;
73 u32 version; 79 u32 version;
74 unsigned short port; 80 int port;
75 unsigned short protocol; 81 unsigned short protocol;
76 } mount_server; 82 } mount_server;
77 83
@@ -80,7 +86,7 @@ struct nfs_parsed_mount_data {
80 size_t addrlen; 86 size_t addrlen;
81 char *hostname; 87 char *hostname;
82 char *export_path; 88 char *export_path;
83 unsigned short port; 89 int port;
84 unsigned short protocol; 90 unsigned short protocol;
85 } nfs_server; 91 } nfs_server;
86 92
@@ -102,6 +108,7 @@ struct nfs_mount_request {
102}; 108};
103 109
104extern int nfs_mount(struct nfs_mount_request *info); 110extern int nfs_mount(struct nfs_mount_request *info);
111extern void nfs_umount(const struct nfs_mount_request *info);
105 112
106/* client.c */ 113/* client.c */
107extern struct rpc_program nfs_program; 114extern struct rpc_program nfs_program;
@@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode);
213extern int nfs_wait_bit_killable(void *word); 220extern int nfs_wait_bit_killable(void *word);
214 221
215/* super.c */ 222/* super.c */
216void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
217extern struct file_system_type nfs_xdev_fs_type; 223extern struct file_system_type nfs_xdev_fs_type;
218#ifdef CONFIG_NFS_V4 224#ifdef CONFIG_NFS_V4
219extern struct file_system_type nfs4_xdev_fs_type; 225extern struct file_system_type nfs4_xdev_fs_type;
@@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
248 254
249/* write.c */ 255/* write.c */
250extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 256extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
257#ifdef CONFIG_MIGRATION
258extern int nfs_migrate_page(struct address_space *,
259 struct page *, struct page *);
260#else
261#define nfs_migrate_page NULL
262#endif
251 263
252/* nfs4proc.c */ 264/* nfs4proc.c */
253extern int _nfs4_call_sync(struct nfs_server *server, 265extern int _nfs4_call_sync(struct nfs_server *server,
@@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
368 return ((unsigned long)len + (unsigned long)base + 380 return ((unsigned long)len + (unsigned long)base +
369 PAGE_SIZE - 1) >> PAGE_SHIFT; 381 PAGE_SIZE - 1) >> PAGE_SHIFT;
370} 382}
371
372#define IPV6_SCOPE_DELIMITER '%'
373
374/*
375 * Set the port number in an address. Be agnostic about the address
376 * family.
377 */
378static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
379{
380 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
381 struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
382
383 switch (sap->sa_family) {
384 case AF_INET:
385 ap->sin_port = htons(port);
386 break;
387 case AF_INET6:
388 ap6->sin6_port = htons(port);
389 break;
390 }
391}
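The removed nfs_set_port() helper (and nfs_parse_ip_address() above) give way to the shared sunrpc routines: elsewhere in this patch, nfs4namespace.c and dns_resolve.c call rpc_set_port() and rpc_pton()/rpc_ntop() instead. For example:

	rpc_set_port(mountdata->addr, NFS_PORT);	/* was: nfs_set_port(...) */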
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..0adefc40cc89 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -209,6 +209,71 @@ out_mnt_err:
209 goto out; 209 goto out;
210} 210}
211 211
212/**
213 * nfs_umount - Notify a server that we have unmounted this export
214 * @info: pointer to umount request arguments
215 *
216 * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
217 * use UDP.
218 */
219void nfs_umount(const struct nfs_mount_request *info)
220{
221 static const struct rpc_timeout nfs_umnt_timeout = {
222 .to_initval = 1 * HZ,
223 .to_maxval = 3 * HZ,
224 .to_retries = 2,
225 };
226 struct rpc_create_args args = {
227 .protocol = IPPROTO_UDP,
228 .address = info->sap,
229 .addrsize = info->salen,
230 .timeout = &nfs_umnt_timeout,
231 .servername = info->hostname,
232 .program = &mnt_program,
233 .version = info->version,
234 .authflavor = RPC_AUTH_UNIX,
235 .flags = RPC_CLNT_CREATE_NOPING,
236 };
237 struct mountres result;
238 struct rpc_message msg = {
239 .rpc_argp = info->dirpath,
240 .rpc_resp = &result,
241 };
242 struct rpc_clnt *clnt;
243 int status;
244
245 if (info->noresvport)
246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
247
248 clnt = rpc_create(&args);
249 if (unlikely(IS_ERR(clnt)))
250 goto out_clnt_err;
251
252 dprintk("NFS: sending UMNT request for %s:%s\n",
253 (info->hostname ? info->hostname : "server"), info->dirpath);
254
255 if (info->version == NFS_MNT3_VERSION)
256 msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
257 else
258 msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
259
260 status = rpc_call_sync(clnt, &msg, 0);
261 rpc_shutdown_client(clnt);
262
263 if (unlikely(status < 0))
264 goto out_call_err;
265
266 return;
267
268out_clnt_err:
269 dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
270 PTR_ERR(clnt));
271 return;
272
273out_call_err:
274 dprintk("NFS: UMNT request failed, status=%d\n", status);
275}
276
212/* 277/*
213 * XDR encode/decode functions for MOUNT 278 * XDR encode/decode functions for MOUNT
214 */ 279 */
@@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
258 return -EIO; 323 return -EIO;
259 status = ntohl(*p); 324 status = ntohl(*p);
260 325
261 for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) { 326 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
262 if (mnt_errtbl[i].status == status) { 327 if (mnt_errtbl[i].status == status) {
263 res->errno = mnt_errtbl[i].errno; 328 res->errno = mnt_errtbl[i].errno;
264 return 0; 329 return 0;
@@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
309 return -EIO; 374 return -EIO;
310 status = ntohl(*p); 375 status = ntohl(*p);
311 376
312 for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) { 377 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
313 if (mnt3_errtbl[i].status == status) { 378 if (mnt3_errtbl[i].status == status) {
314 res->errno = mnt3_errtbl[i].errno; 379 res->errno = mnt3_errtbl[i].errno;
315 return 0; 380 return 0;
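The two loop-bound changes above fix an off-by-one: with '<=', the final iteration indexed one element past the end of the error table, reading out of bounds and potentially matching garbage. A self-contained illustration of the corrected bounded lookup, with made-up table entries:

	#include <errno.h>
	#include <stddef.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	struct mnt_err { unsigned int status; int errno_val; };

	static const struct mnt_err errtbl[] = {	/* illustrative entries */
		{ 1, EPERM }, { 2, ENOENT }, { 13, EACCES },
	};

	/* '<' visits indices 0..ARRAY_SIZE-1 only; the old '<=' form
	 * also dereferenced errtbl[ARRAY_SIZE], one element past the end. */
	static int status_to_errno(unsigned int status)
	{
		size_t i;

		for (i = 0; i < ARRAY_SIZE(errtbl); i++)
			if (errtbl[i].status == status)
				return -errtbl[i].errno_val;
		return -EIO;
	}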
@@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = {
407 .p_statidx = MOUNTPROC_MNT, 472 .p_statidx = MOUNTPROC_MNT,
408 .p_name = "MOUNT", 473 .p_name = "MOUNT",
409 }, 474 },
475 [MOUNTPROC_UMNT] = {
476 .p_proc = MOUNTPROC_UMNT,
477 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
478 .p_arglen = MNT_enc_dirpath_sz,
479 .p_statidx = MOUNTPROC_UMNT,
480 .p_name = "UMOUNT",
481 },
410}; 482};
411 483
412static struct rpc_procinfo mnt3_procedures[] = { 484static struct rpc_procinfo mnt3_procedures[] = {
@@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
419 .p_statidx = MOUNTPROC3_MNT, 491 .p_statidx = MOUNTPROC3_MNT,
420 .p_name = "MOUNT", 492 .p_name = "MOUNT",
421 }, 493 },
494 [MOUNTPROC3_UMNT] = {
495 .p_proc = MOUNTPROC3_UMNT,
496 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
497 .p_arglen = MNT_enc_dirpath_sz,
498 .p_statidx = MOUNTPROC3_UMNT,
499 .p_name = "UMOUNT",
500 },
422}; 501};
423 502
424 503
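A hypothetical caller tearing down a v3 mount would fill an nfs_mount_request and fire-and-forget, since the procedure is advisory and nfs_umount() returns void. The address, length and path below are placeholders, not values from this patch:

	struct nfs_mount_request info = {
		.sap      = (struct sockaddr *)&server_addr,	/* placeholder */
		.salen    = server_addrlen,			/* placeholder */
		.hostname = "server.example.com",
		.dirpath  = "/export/home",
		.version  = NFS_MNT3_VERSION,
	};

	nfs_umount(&info);	/* errors are logged via dprintk and dropped */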
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c862c9340f9a..5e078b222b4e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -13,7 +13,6 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/utsname.h>
17#include <linux/errno.h> 16#include <linux/errno.h>
18#include <linux/string.h> 17#include <linux/string.h>
19#include <linux/in.h> 18#include <linux/in.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d0cc5ce0edfe..3f8881d1a050 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/utsname.h>
11#include <linux/errno.h> 10#include <linux/errno.h>
12#include <linux/string.h> 11#include <linux/string.h>
13#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
@@ -299,7 +298,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
299 298
300/* 299/*
301 * Create a regular file. 300 * Create a regular file.
302 * For now, we don't implement O_EXCL.
303 */ 301 */
304static int 302static int
305nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 303nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 35869a4921f1..5fe5492fbd29 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -10,7 +10,6 @@
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/utsname.h>
14#include <linux/errno.h> 13#include <linux/errno.h>
15#include <linux/string.h> 14#include <linux/string.h>
16#include <linux/in.h> 15#include <linux/in.h>
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2a2a0a7143ad..2636c26d56fa 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -17,6 +17,7 @@
17#include <linux/inet.h> 17#include <linux/inet.h>
18#include "internal.h" 18#include "internal.h"
19#include "nfs4_fs.h" 19#include "nfs4_fs.h"
20#include "dns_resolve.h"
20 21
21#define NFSDBG_FACILITY NFSDBG_VFS 22#define NFSDBG_FACILITY NFSDBG_VFS
22 23
@@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
95 return 0; 96 return 0;
96} 97}
97 98
99static size_t nfs_parse_server_name(char *string, size_t len,
100 struct sockaddr *sa, size_t salen)
101{
102 ssize_t ret;
103
104 ret = rpc_pton(string, len, sa, salen);
105 if (ret == 0) {
106 ret = nfs_dns_resolve_name(string, len, sa, salen);
107 if (ret < 0)
108 ret = 0;
109 }
110 return ret;
111}
112
98static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, 113static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
99 char *page, char *page2, 114 char *page, char *page2,
100 const struct nfs4_fs_location *location) 115 const struct nfs4_fs_location *location)
@@ -121,11 +136,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
121 136
122 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) 137 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
123 continue; 138 continue;
124 nfs_parse_ip_address(buf->data, buf->len, 139 mountdata->addrlen = nfs_parse_server_name(buf->data,
125 mountdata->addr, &mountdata->addrlen); 140 buf->len,
126 if (mountdata->addr->sa_family == AF_UNSPEC) 141 mountdata->addr, mountdata->addrlen);
142 if (mountdata->addrlen == 0)
127 continue; 143 continue;
128 nfs_set_port(mountdata->addr, NFS_PORT); 144 rpc_set_port(mountdata->addr, NFS_PORT);
129 145
130 memcpy(page2, buf->data, buf->len); 146 memcpy(page2, buf->data, buf->len);
131 page2[buf->len] = '\0'; 147 page2[buf->len] = '\0';
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6917311f201c..ed7c269e2514 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -36,7 +36,6 @@
36 */ 36 */
37 37
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/utsname.h>
40#include <linux/delay.h> 39#include <linux/delay.h>
41#include <linux/errno.h> 40#include <linux/errno.h>
42#include <linux/string.h> 41#include <linux/string.h>
@@ -61,6 +60,8 @@
61#define NFS4_POLL_RETRY_MIN (HZ/10) 60#define NFS4_POLL_RETRY_MIN (HZ/10)
62#define NFS4_POLL_RETRY_MAX (15*HZ) 61#define NFS4_POLL_RETRY_MAX (15*HZ)
63 62
63#define NFS4_MAX_LOOP_ON_RECOVER (10)
64
64struct nfs4_opendata; 65struct nfs4_opendata;
65static int _nfs4_proc_open(struct nfs4_opendata *data); 66static int _nfs4_proc_open(struct nfs4_opendata *data);
66static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 67static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
@@ -426,17 +427,19 @@ out:
426static int nfs4_recover_session(struct nfs4_session *session) 427static int nfs4_recover_session(struct nfs4_session *session)
427{ 428{
428 struct nfs_client *clp = session->clp; 429 struct nfs_client *clp = session->clp;
430 unsigned int loop;
429 int ret; 431 int ret;
430 432
431 for (;;) { 433 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
432 ret = nfs4_wait_clnt_recover(clp); 434 ret = nfs4_wait_clnt_recover(clp);
433 if (ret != 0) 435 if (ret != 0)
434 return ret; 436 break;
435 if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) 437 if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
436 break; 438 break;
437 nfs4_schedule_state_manager(clp); 439 nfs4_schedule_state_manager(clp);
440 ret = -EIO;
438 } 441 }
439 return 0; 442 return ret;
440} 443}
441 444
442static int nfs41_setup_sequence(struct nfs4_session *session, 445static int nfs41_setup_sequence(struct nfs4_session *session,
@@ -1444,18 +1447,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1444static int nfs4_recover_expired_lease(struct nfs_server *server) 1447static int nfs4_recover_expired_lease(struct nfs_server *server)
1445{ 1448{
1446 struct nfs_client *clp = server->nfs_client; 1449 struct nfs_client *clp = server->nfs_client;
1450 unsigned int loop;
1447 int ret; 1451 int ret;
1448 1452
1449 for (;;) { 1453 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
1450 ret = nfs4_wait_clnt_recover(clp); 1454 ret = nfs4_wait_clnt_recover(clp);
1451 if (ret != 0) 1455 if (ret != 0)
1452 return ret; 1456 break;
1453 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && 1457 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1454 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) 1458 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1455 break; 1459 break;
1456 nfs4_schedule_state_recovery(clp); 1460 nfs4_schedule_state_recovery(clp);
1461 ret = -EIO;
1457 } 1462 }
1458 return 0; 1463 return ret;
1459} 1464}
1460 1465
1461/* 1466/*
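Both recovery loops now share the same shape: retry at most NFS4_MAX_LOOP_ON_RECOVER times, break out early once the relevant state bits clear, and surface -EIO instead of spinning forever when the state manager cannot make progress. In generic form (a sketch with stand-in helpers, not shared kernel code):

	int ret = -EIO;
	unsigned int loop;

	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
		ret = wait_for_recovery();	/* stand-in for nfs4_wait_clnt_recover() */
		if (ret != 0)
			break;			/* hard error: give up */
		if (!recovery_still_needed())	/* stand-in for the test_bit() checks */
			break;			/* done: ret is 0 from the wait */
		kick_state_manager();		/* stand-in for the schedule call */
		ret = -EIO;			/* value if the loop runs out */
	}
	return ret;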
@@ -1997,12 +2002,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1997 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2002 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1998 if (status == 0) { 2003 if (status == 0) {
1999 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); 2004 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
2005 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
2006 NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
2007 NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
2008 NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
2009 NFS_CAP_CTIME|NFS_CAP_MTIME);
2000 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) 2010 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
2001 server->caps |= NFS_CAP_ACLS; 2011 server->caps |= NFS_CAP_ACLS;
2002 if (res.has_links != 0) 2012 if (res.has_links != 0)
2003 server->caps |= NFS_CAP_HARDLINKS; 2013 server->caps |= NFS_CAP_HARDLINKS;
2004 if (res.has_symlinks != 0) 2014 if (res.has_symlinks != 0)
2005 server->caps |= NFS_CAP_SYMLINKS; 2015 server->caps |= NFS_CAP_SYMLINKS;
2016 if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
2017 server->caps |= NFS_CAP_FILEID;
2018 if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
2019 server->caps |= NFS_CAP_MODE;
2020 if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
2021 server->caps |= NFS_CAP_NLINK;
2022 if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
2023 server->caps |= NFS_CAP_OWNER;
2024 if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
2025 server->caps |= NFS_CAP_OWNER_GROUP;
2026 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
2027 server->caps |= NFS_CAP_ATIME;
2028 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
2029 server->caps |= NFS_CAP_CTIME;
2030 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
2031 server->caps |= NFS_CAP_MTIME;
2032
2006 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); 2033 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
2007 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; 2034 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
2008 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; 2035 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1434080aefeb..2ef4fecf3984 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl)
638 nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); 638 nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
639} 639}
640 640
641static struct file_lock_operations nfs4_fl_lock_ops = { 641static const struct file_lock_operations nfs4_fl_lock_ops = {
642 .fl_copy_lock = nfs4_fl_copy_lock, 642 .fl_copy_lock = nfs4_fl_copy_lock,
643 .fl_release_private = nfs4_fl_release_lock, 643 .fl_release_private = nfs4_fl_release_lock,
644}; 644};
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..83ad47cbdd8a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -39,7 +39,6 @@
39#include <linux/time.h> 39#include <linux/time.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/utsname.h>
43#include <linux/errno.h> 42#include <linux/errno.h>
44#include <linux/string.h> 43#include <linux/string.h>
45#include <linux/in.h> 44#include <linux/in.h>
@@ -702,29 +701,12 @@ struct compound_hdr {
702 u32 minorversion; 701 u32 minorversion;
703}; 702};
704 703
705/* 704static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
706 * START OF "GENERIC" ENCODE ROUTINES. 705{
707 * These may look a little ugly since they are imported from a "generic" 706 __be32 *p = xdr_reserve_space(xdr, nbytes);
708 * set of XDR encode/decode routines which are intended to be shared by 707 BUG_ON(!p);
709 * all of our NFSv4 implementations (OpenBSD, MacOS X...). 708 return p;
710 * 709}
711 * If the pain of reading these is too great, it should be a straightforward
712 * task to translate them into Linux-specific versions which are more
713 * consistent with the style used in NFSv2/v3...
714 */
715#define WRITE32(n) *p++ = htonl(n)
716#define WRITE64(n) do { \
717 *p++ = htonl((uint32_t)((n) >> 32)); \
718 *p++ = htonl((uint32_t)(n)); \
719} while (0)
720#define WRITEMEM(ptr,nbytes) do { \
721 p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
722} while (0)
723
724#define RESERVE_SPACE(nbytes) do { \
725 p = xdr_reserve_space(xdr, nbytes); \
726 BUG_ON(!p); \
727} while (0)
728 710
729static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 711static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
730{ 712{
@@ -749,12 +731,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
749 731
750 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); 732 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
751 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 733 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
752 RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); 734 p = reserve_space(xdr, 4 + hdr->taglen + 8);
753 WRITE32(hdr->taglen); 735 p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
754 WRITEMEM(hdr->tag, hdr->taglen); 736 *p++ = cpu_to_be32(hdr->minorversion);
755 WRITE32(hdr->minorversion);
756 hdr->nops_p = p; 737 hdr->nops_p = p;
757 WRITE32(hdr->nops); 738 *p = cpu_to_be32(hdr->nops);
758} 739}
759 740
760static void encode_nops(struct compound_hdr *hdr) 741static void encode_nops(struct compound_hdr *hdr)
@@ -829,55 +810,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
829 len += 16; 810 len += 16;
830 else if (iap->ia_valid & ATTR_MTIME) 811 else if (iap->ia_valid & ATTR_MTIME)
831 len += 4; 812 len += 4;
832 RESERVE_SPACE(len); 813 p = reserve_space(xdr, len);
833 814
834 /* 815 /*
835 * We write the bitmap length now, but leave the bitmap and the attribute 816 * We write the bitmap length now, but leave the bitmap and the attribute
836 * buffer length to be backfilled at the end of this routine. 817 * buffer length to be backfilled at the end of this routine.
837 */ 818 */
838 WRITE32(2); 819 *p++ = cpu_to_be32(2);
839 q = p; 820 q = p;
840 p += 3; 821 p += 3;
841 822
842 if (iap->ia_valid & ATTR_SIZE) { 823 if (iap->ia_valid & ATTR_SIZE) {
843 bmval0 |= FATTR4_WORD0_SIZE; 824 bmval0 |= FATTR4_WORD0_SIZE;
844 WRITE64(iap->ia_size); 825 p = xdr_encode_hyper(p, iap->ia_size);
845 } 826 }
846 if (iap->ia_valid & ATTR_MODE) { 827 if (iap->ia_valid & ATTR_MODE) {
847 bmval1 |= FATTR4_WORD1_MODE; 828 bmval1 |= FATTR4_WORD1_MODE;
848 WRITE32(iap->ia_mode & S_IALLUGO); 829 *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
849 } 830 }
850 if (iap->ia_valid & ATTR_UID) { 831 if (iap->ia_valid & ATTR_UID) {
851 bmval1 |= FATTR4_WORD1_OWNER; 832 bmval1 |= FATTR4_WORD1_OWNER;
852 WRITE32(owner_namelen); 833 p = xdr_encode_opaque(p, owner_name, owner_namelen);
853 WRITEMEM(owner_name, owner_namelen);
854 } 834 }
855 if (iap->ia_valid & ATTR_GID) { 835 if (iap->ia_valid & ATTR_GID) {
856 bmval1 |= FATTR4_WORD1_OWNER_GROUP; 836 bmval1 |= FATTR4_WORD1_OWNER_GROUP;
857 WRITE32(owner_grouplen); 837 p = xdr_encode_opaque(p, owner_group, owner_grouplen);
858 WRITEMEM(owner_group, owner_grouplen);
859 } 838 }
860 if (iap->ia_valid & ATTR_ATIME_SET) { 839 if (iap->ia_valid & ATTR_ATIME_SET) {
861 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 840 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
862 WRITE32(NFS4_SET_TO_CLIENT_TIME); 841 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
863 WRITE32(0); 842 *p++ = cpu_to_be32(0);
864 WRITE32(iap->ia_mtime.tv_sec); 843 *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
865 WRITE32(iap->ia_mtime.tv_nsec); 844 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
866 } 845 }
867 else if (iap->ia_valid & ATTR_ATIME) { 846 else if (iap->ia_valid & ATTR_ATIME) {
868 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 847 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
869 WRITE32(NFS4_SET_TO_SERVER_TIME); 848 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
870 } 849 }
871 if (iap->ia_valid & ATTR_MTIME_SET) { 850 if (iap->ia_valid & ATTR_MTIME_SET) {
872 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 851 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
873 WRITE32(NFS4_SET_TO_CLIENT_TIME); 852 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
874 WRITE32(0); 853 *p++ = cpu_to_be32(0);
875 WRITE32(iap->ia_mtime.tv_sec); 854 *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
876 WRITE32(iap->ia_mtime.tv_nsec); 855 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
877 } 856 }
878 else if (iap->ia_valid & ATTR_MTIME) { 857 else if (iap->ia_valid & ATTR_MTIME) {
879 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 858 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
880 WRITE32(NFS4_SET_TO_SERVER_TIME); 859 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
881 } 860 }
882 861
883 /* 862 /*
@@ -891,7 +870,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
891 len = (char *)p - (char *)q - 12; 870 len = (char *)p - (char *)q - 12;
892 *q++ = htonl(bmval0); 871 *q++ = htonl(bmval0);
893 *q++ = htonl(bmval1); 872 *q++ = htonl(bmval1);
894 *q++ = htonl(len); 873 *q = htonl(len);
895 874
896/* out: */ 875/* out: */
897} 876}
@@ -900,9 +879,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
900{ 879{
901 __be32 *p; 880 __be32 *p;
902 881
903 RESERVE_SPACE(8); 882 p = reserve_space(xdr, 8);
904 WRITE32(OP_ACCESS); 883 *p++ = cpu_to_be32(OP_ACCESS);
905 WRITE32(access); 884 *p = cpu_to_be32(access);
906 hdr->nops++; 885 hdr->nops++;
907 hdr->replen += decode_access_maxsz; 886 hdr->replen += decode_access_maxsz;
908} 887}
@@ -911,10 +890,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
911{ 890{
912 __be32 *p; 891 __be32 *p;
913 892
914 RESERVE_SPACE(8+NFS4_STATEID_SIZE); 893 p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
915 WRITE32(OP_CLOSE); 894 *p++ = cpu_to_be32(OP_CLOSE);
916 WRITE32(arg->seqid->sequence->counter); 895 *p++ = cpu_to_be32(arg->seqid->sequence->counter);
917 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 896 xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
918 hdr->nops++; 897 hdr->nops++;
919 hdr->replen += decode_close_maxsz; 898 hdr->replen += decode_close_maxsz;
920} 899}
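
Stateids are fixed-length blobs, so encode_close() swaps WRITEMEM() for xdr_encode_opaque_fixed(), which copies the bytes without a length prefix and returns the advanced cursor. A sketch of its plausible behavior (the padding handling is assumed from the XDR rules, not shown in this diff):

    #include <stdint.h>
    #include <string.h>

    /* Copy a fixed-size opaque (e.g. a 16-byte stateid): no length
     * word, zero padding out to the 4-byte XDR boundary, cursor
     * returned past the padding. */
    static uint32_t *xdr_encode_opaque_fixed(uint32_t *p, const void *data,
                                             unsigned int len)
    {
        memcpy(p, data, len);
        if (len & 3)
            memset((char *)p + len, 0, 4 - (len & 3));
        return p + ((len + 3) >> 2);
    }
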
@@ -923,10 +902,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
923{ 902{
924 __be32 *p; 903 __be32 *p;
925 904
926 RESERVE_SPACE(16); 905 p = reserve_space(xdr, 16);
927 WRITE32(OP_COMMIT); 906 *p++ = cpu_to_be32(OP_COMMIT);
928 WRITE64(args->offset); 907 p = xdr_encode_hyper(p, args->offset);
929 WRITE32(args->count); 908 *p = cpu_to_be32(args->count);
930 hdr->nops++; 909 hdr->nops++;
931 hdr->replen += decode_commit_maxsz; 910 hdr->replen += decode_commit_maxsz;
932} 911}
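
encode_commit() introduces the third recurring helper: xdr_encode_hyper(), which splits a 64-bit value into two big-endian XDR words (high word first) and returns the advanced cursor so calls chain, as in p = xdr_encode_hyper(p, args->offset). A userspace rendition:

    #include <stdint.h>
    #include <arpa/inet.h>

    /* Encode a 64-bit XDR "hyper" as two network-order 32-bit words. */
    static uint32_t *xdr_encode_hyper(uint32_t *p, uint64_t val)
    {
        *p++ = htonl((uint32_t)(val >> 32));
        *p++ = htonl((uint32_t)(val & 0xffffffffu));
        return p;
    }
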
@@ -935,30 +914,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
935{ 914{
936 __be32 *p; 915 __be32 *p;
937 916
938 RESERVE_SPACE(8); 917 p = reserve_space(xdr, 8);
939 WRITE32(OP_CREATE); 918 *p++ = cpu_to_be32(OP_CREATE);
940 WRITE32(create->ftype); 919 *p = cpu_to_be32(create->ftype);
941 920
942 switch (create->ftype) { 921 switch (create->ftype) {
943 case NF4LNK: 922 case NF4LNK:
944 RESERVE_SPACE(4); 923 p = reserve_space(xdr, 4);
945 WRITE32(create->u.symlink.len); 924 *p = cpu_to_be32(create->u.symlink.len);
946 xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); 925 xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
947 break; 926 break;
948 927
949 case NF4BLK: case NF4CHR: 928 case NF4BLK: case NF4CHR:
950 RESERVE_SPACE(8); 929 p = reserve_space(xdr, 8);
951 WRITE32(create->u.device.specdata1); 930 *p++ = cpu_to_be32(create->u.device.specdata1);
952 WRITE32(create->u.device.specdata2); 931 *p = cpu_to_be32(create->u.device.specdata2);
953 break; 932 break;
954 933
955 default: 934 default:
956 break; 935 break;
957 } 936 }
958 937
959 RESERVE_SPACE(4 + create->name->len); 938 encode_string(xdr, create->name->len, create->name->name);
960 WRITE32(create->name->len);
961 WRITEMEM(create->name->name, create->name->len);
962 hdr->nops++; 939 hdr->nops++;
963 hdr->replen += decode_create_maxsz; 940 hdr->replen += decode_create_maxsz;
964 941
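
For the trailing filename, encode_create() now defers to encode_string(), which presumably reserves 4 + len bytes and calls xdr_encode_opaque(): a length word, the bytes, and zero padding out to the word boundary, per the XDR variable-length opaque rules. A sketch of that opaque encoding (padding behavior assumed, not shown here):

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    /* Variable-length opaque: 4-byte length, data, zero padding. */
    static uint32_t *xdr_encode_opaque(uint32_t *p, const void *data,
                                       unsigned int len)
    {
        *p++ = htonl(len);
        memcpy(p, data, len);
        if (len & 3)
            memset((char *)p + len, 0, 4 - (len & 3));
        return p + ((len + 3) >> 2);
    }
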
@@ -969,10 +946,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
969{ 946{
970 __be32 *p; 947 __be32 *p;
971 948
972 RESERVE_SPACE(12); 949 p = reserve_space(xdr, 12);
973 WRITE32(OP_GETATTR); 950 *p++ = cpu_to_be32(OP_GETATTR);
974 WRITE32(1); 951 *p++ = cpu_to_be32(1);
975 WRITE32(bitmap); 952 *p = cpu_to_be32(bitmap);
976 hdr->nops++; 953 hdr->nops++;
977 hdr->replen += decode_getattr_maxsz; 954 hdr->replen += decode_getattr_maxsz;
978} 955}
@@ -981,11 +958,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
981{ 958{
982 __be32 *p; 959 __be32 *p;
983 960
984 RESERVE_SPACE(16); 961 p = reserve_space(xdr, 16);
985 WRITE32(OP_GETATTR); 962 *p++ = cpu_to_be32(OP_GETATTR);
986 WRITE32(2); 963 *p++ = cpu_to_be32(2);
987 WRITE32(bm0); 964 *p++ = cpu_to_be32(bm0);
988 WRITE32(bm1); 965 *p = cpu_to_be32(bm1);
989 hdr->nops++; 966 hdr->nops++;
990 hdr->replen += decode_getattr_maxsz; 967 hdr->replen += decode_getattr_maxsz;
991} 968}
@@ -1012,8 +989,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1012{ 989{
1013 __be32 *p; 990 __be32 *p;
1014 991
1015 RESERVE_SPACE(4); 992 p = reserve_space(xdr, 4);
1016 WRITE32(OP_GETFH); 993 *p = cpu_to_be32(OP_GETFH);
1017 hdr->nops++; 994 hdr->nops++;
1018 hdr->replen += decode_getfh_maxsz; 995 hdr->replen += decode_getfh_maxsz;
1019} 996}
@@ -1022,10 +999,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
1022{ 999{
1023 __be32 *p; 1000 __be32 *p;
1024 1001
1025 RESERVE_SPACE(8 + name->len); 1002 p = reserve_space(xdr, 8 + name->len);
1026 WRITE32(OP_LINK); 1003 *p++ = cpu_to_be32(OP_LINK);
1027 WRITE32(name->len); 1004 xdr_encode_opaque(p, name->name, name->len);
1028 WRITEMEM(name->name, name->len);
1029 hdr->nops++; 1005 hdr->nops++;
1030 hdr->replen += decode_link_maxsz; 1006 hdr->replen += decode_link_maxsz;
1031} 1007}
@@ -1052,27 +1028,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1052{ 1028{
1053 __be32 *p; 1029 __be32 *p;
1054 1030
1055 RESERVE_SPACE(32); 1031 p = reserve_space(xdr, 32);
1056 WRITE32(OP_LOCK); 1032 *p++ = cpu_to_be32(OP_LOCK);
1057 WRITE32(nfs4_lock_type(args->fl, args->block)); 1033 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
1058 WRITE32(args->reclaim); 1034 *p++ = cpu_to_be32(args->reclaim);
1059 WRITE64(args->fl->fl_start); 1035 p = xdr_encode_hyper(p, args->fl->fl_start);
1060 WRITE64(nfs4_lock_length(args->fl)); 1036 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1061 WRITE32(args->new_lock_owner); 1037 *p = cpu_to_be32(args->new_lock_owner);
1062 if (args->new_lock_owner){ 1038 if (args->new_lock_owner){
1063 RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); 1039 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
1064 WRITE32(args->open_seqid->sequence->counter); 1040 *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
1065 WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); 1041 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
1066 WRITE32(args->lock_seqid->sequence->counter); 1042 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
1067 WRITE64(args->lock_owner.clientid); 1043 p = xdr_encode_hyper(p, args->lock_owner.clientid);
1068 WRITE32(16); 1044 *p++ = cpu_to_be32(16);
1069 WRITEMEM("lock id:", 8); 1045 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1070 WRITE64(args->lock_owner.id); 1046 xdr_encode_hyper(p, args->lock_owner.id);
1071 } 1047 }
1072 else { 1048 else {
1073 RESERVE_SPACE(NFS4_STATEID_SIZE+4); 1049 p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
1074 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); 1050 p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
1075 WRITE32(args->lock_seqid->sequence->counter); 1051 *p = cpu_to_be32(args->lock_seqid->sequence->counter);
1076 } 1052 }
1077 hdr->nops++; 1053 hdr->nops++;
1078 hdr->replen += decode_lock_maxsz; 1054 hdr->replen += decode_lock_maxsz;
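
The owner string in encode_lock() (and in encode_lockt() below) is a fixed 16-byte opaque: the literal "lock id:" followed by the 64-bit lock owner id, which is why the hunk writes cpu_to_be32(16) before the two pieces. The layout spelled out (encode_lock_owner is a sketch name; the buffer is assumed large enough):

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    static uint32_t *encode_lock_owner(uint32_t *p, uint64_t id)
    {
        *p++ = htonl(16);                    /* opaque length         */
        memcpy(p, "lock id:", 8);            /* 8 bytes = 2 XDR words */
        p += 2;
        *p++ = htonl((uint32_t)(id >> 32));  /* high word of the id   */
        *p++ = htonl((uint32_t)id);          /* low word              */
        return p;
    }
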
@@ -1082,15 +1058,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
1082{ 1058{
1083 __be32 *p; 1059 __be32 *p;
1084 1060
1085 RESERVE_SPACE(52); 1061 p = reserve_space(xdr, 52);
1086 WRITE32(OP_LOCKT); 1062 *p++ = cpu_to_be32(OP_LOCKT);
1087 WRITE32(nfs4_lock_type(args->fl, 0)); 1063 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1088 WRITE64(args->fl->fl_start); 1064 p = xdr_encode_hyper(p, args->fl->fl_start);
1089 WRITE64(nfs4_lock_length(args->fl)); 1065 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1090 WRITE64(args->lock_owner.clientid); 1066 p = xdr_encode_hyper(p, args->lock_owner.clientid);
1091 WRITE32(16); 1067 *p++ = cpu_to_be32(16);
1092 WRITEMEM("lock id:", 8); 1068 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1093 WRITE64(args->lock_owner.id); 1069 xdr_encode_hyper(p, args->lock_owner.id);
1094 hdr->nops++; 1070 hdr->nops++;
1095 hdr->replen += decode_lockt_maxsz; 1071 hdr->replen += decode_lockt_maxsz;
1096} 1072}
@@ -1099,13 +1075,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1099{ 1075{
1100 __be32 *p; 1076 __be32 *p;
1101 1077
1102 RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); 1078 p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
1103 WRITE32(OP_LOCKU); 1079 *p++ = cpu_to_be32(OP_LOCKU);
1104 WRITE32(nfs4_lock_type(args->fl, 0)); 1080 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1105 WRITE32(args->seqid->sequence->counter); 1081 *p++ = cpu_to_be32(args->seqid->sequence->counter);
1106 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); 1082 p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
1107 WRITE64(args->fl->fl_start); 1083 p = xdr_encode_hyper(p, args->fl->fl_start);
1108 WRITE64(nfs4_lock_length(args->fl)); 1084 xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1109 hdr->nops++; 1085 hdr->nops++;
1110 hdr->replen += decode_locku_maxsz; 1086 hdr->replen += decode_locku_maxsz;
1111} 1087}
@@ -1115,10 +1091,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
1115 int len = name->len; 1091 int len = name->len;
1116 __be32 *p; 1092 __be32 *p;
1117 1093
1118 RESERVE_SPACE(8 + len); 1094 p = reserve_space(xdr, 8 + len);
1119 WRITE32(OP_LOOKUP); 1095 *p++ = cpu_to_be32(OP_LOOKUP);
1120 WRITE32(len); 1096 xdr_encode_opaque(p, name->name, len);
1121 WRITEMEM(name->name, len);
1122 hdr->nops++; 1097 hdr->nops++;
1123 hdr->replen += decode_lookup_maxsz; 1098 hdr->replen += decode_lookup_maxsz;
1124} 1099}
@@ -1127,21 +1102,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
1127{ 1102{
1128 __be32 *p; 1103 __be32 *p;
1129 1104
1130 RESERVE_SPACE(8); 1105 p = reserve_space(xdr, 8);
1131 switch (fmode & (FMODE_READ|FMODE_WRITE)) { 1106 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
1132 case FMODE_READ: 1107 case FMODE_READ:
1133 WRITE32(NFS4_SHARE_ACCESS_READ); 1108 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
1134 break; 1109 break;
1135 case FMODE_WRITE: 1110 case FMODE_WRITE:
1136 WRITE32(NFS4_SHARE_ACCESS_WRITE); 1111 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
1137 break; 1112 break;
1138 case FMODE_READ|FMODE_WRITE: 1113 case FMODE_READ|FMODE_WRITE:
1139 WRITE32(NFS4_SHARE_ACCESS_BOTH); 1114 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
1140 break; 1115 break;
1141 default: 1116 default:
1142 WRITE32(0); 1117 *p++ = cpu_to_be32(0);
1143 } 1118 }
1144 WRITE32(0); /* for linux, share_deny = 0 always */ 1119 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
1145} 1120}
1146 1121
1147static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1122static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1151,29 +1126,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1151 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, 1126 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
1152 * owner 4 = 32 1127 * owner 4 = 32
1153 */ 1128 */
1154 RESERVE_SPACE(8); 1129 p = reserve_space(xdr, 8);
1155 WRITE32(OP_OPEN); 1130 *p++ = cpu_to_be32(OP_OPEN);
1156 WRITE32(arg->seqid->sequence->counter); 1131 *p = cpu_to_be32(arg->seqid->sequence->counter);
1157 encode_share_access(xdr, arg->fmode); 1132 encode_share_access(xdr, arg->fmode);
1158 RESERVE_SPACE(28); 1133 p = reserve_space(xdr, 28);
1159 WRITE64(arg->clientid); 1134 p = xdr_encode_hyper(p, arg->clientid);
1160 WRITE32(16); 1135 *p++ = cpu_to_be32(16);
1161 WRITEMEM("open id:", 8); 1136 p = xdr_encode_opaque_fixed(p, "open id:", 8);
1162 WRITE64(arg->id); 1137 xdr_encode_hyper(p, arg->id);
1163} 1138}
1164 1139
1165static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1140static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
1166{ 1141{
1167 __be32 *p; 1142 __be32 *p;
1168 1143
1169 RESERVE_SPACE(4); 1144 p = reserve_space(xdr, 4);
1170 switch(arg->open_flags & O_EXCL) { 1145 switch(arg->open_flags & O_EXCL) {
1171 case 0: 1146 case 0:
1172 WRITE32(NFS4_CREATE_UNCHECKED); 1147 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
1173 encode_attrs(xdr, arg->u.attrs, arg->server); 1148 encode_attrs(xdr, arg->u.attrs, arg->server);
1174 break; 1149 break;
1175 default: 1150 default:
1176 WRITE32(NFS4_CREATE_EXCLUSIVE); 1151 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
1177 encode_nfs4_verifier(xdr, &arg->u.verifier); 1152 encode_nfs4_verifier(xdr, &arg->u.verifier);
1178 } 1153 }
1179} 1154}
@@ -1182,14 +1157,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1182{ 1157{
1183 __be32 *p; 1158 __be32 *p;
1184 1159
1185 RESERVE_SPACE(4); 1160 p = reserve_space(xdr, 4);
1186 switch (arg->open_flags & O_CREAT) { 1161 switch (arg->open_flags & O_CREAT) {
1187 case 0: 1162 case 0:
1188 WRITE32(NFS4_OPEN_NOCREATE); 1163 *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
1189 break; 1164 break;
1190 default: 1165 default:
1191 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); 1166 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1192 WRITE32(NFS4_OPEN_CREATE); 1167 *p = cpu_to_be32(NFS4_OPEN_CREATE);
1193 encode_createmode(xdr, arg); 1168 encode_createmode(xdr, arg);
1194 } 1169 }
1195} 1170}
@@ -1198,16 +1173,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
1198{ 1173{
1199 __be32 *p; 1174 __be32 *p;
1200 1175
1201 RESERVE_SPACE(4); 1176 p = reserve_space(xdr, 4);
1202 switch (delegation_type) { 1177 switch (delegation_type) {
1203 case 0: 1178 case 0:
1204 WRITE32(NFS4_OPEN_DELEGATE_NONE); 1179 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
1205 break; 1180 break;
1206 case FMODE_READ: 1181 case FMODE_READ:
1207 WRITE32(NFS4_OPEN_DELEGATE_READ); 1182 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
1208 break; 1183 break;
1209 case FMODE_WRITE|FMODE_READ: 1184 case FMODE_WRITE|FMODE_READ:
1210 WRITE32(NFS4_OPEN_DELEGATE_WRITE); 1185 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
1211 break; 1186 break;
1212 default: 1187 default:
1213 BUG(); 1188 BUG();
@@ -1218,8 +1193,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
1218{ 1193{
1219 __be32 *p; 1194 __be32 *p;
1220 1195
1221 RESERVE_SPACE(4); 1196 p = reserve_space(xdr, 4);
1222 WRITE32(NFS4_OPEN_CLAIM_NULL); 1197 *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
1223 encode_string(xdr, name->len, name->name); 1198 encode_string(xdr, name->len, name->name);
1224} 1199}
1225 1200
@@ -1227,8 +1202,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
1227{ 1202{
1228 __be32 *p; 1203 __be32 *p;
1229 1204
1230 RESERVE_SPACE(4); 1205 p = reserve_space(xdr, 4);
1231 WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); 1206 *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
1232 encode_delegation_type(xdr, type); 1207 encode_delegation_type(xdr, type);
1233} 1208}
1234 1209
@@ -1236,9 +1211,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1236{ 1211{
1237 __be32 *p; 1212 __be32 *p;
1238 1213
1239 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1214 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1240 WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); 1215 *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
1241 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1216 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
1242 encode_string(xdr, name->len, name->name); 1217 encode_string(xdr, name->len, name->name);
1243} 1218}
1244 1219
@@ -1267,10 +1242,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
1267{ 1242{
1268 __be32 *p; 1243 __be32 *p;
1269 1244
1270 RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); 1245 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1271 WRITE32(OP_OPEN_CONFIRM); 1246 *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
1272 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1247 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1273 WRITE32(arg->seqid->sequence->counter); 1248 *p = cpu_to_be32(arg->seqid->sequence->counter);
1274 hdr->nops++; 1249 hdr->nops++;
1275 hdr->replen += decode_open_confirm_maxsz; 1250 hdr->replen += decode_open_confirm_maxsz;
1276} 1251}
@@ -1279,10 +1254,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
1279{ 1254{
1280 __be32 *p; 1255 __be32 *p;
1281 1256
1282 RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); 1257 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1283 WRITE32(OP_OPEN_DOWNGRADE); 1258 *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
1284 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1259 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1285 WRITE32(arg->seqid->sequence->counter); 1260 *p = cpu_to_be32(arg->seqid->sequence->counter);
1286 encode_share_access(xdr, arg->fmode); 1261 encode_share_access(xdr, arg->fmode);
1287 hdr->nops++; 1262 hdr->nops++;
1288 hdr->replen += decode_open_downgrade_maxsz; 1263 hdr->replen += decode_open_downgrade_maxsz;
@@ -1294,10 +1269,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
1294 int len = fh->size; 1269 int len = fh->size;
1295 __be32 *p; 1270 __be32 *p;
1296 1271
1297 RESERVE_SPACE(8 + len); 1272 p = reserve_space(xdr, 8 + len);
1298 WRITE32(OP_PUTFH); 1273 *p++ = cpu_to_be32(OP_PUTFH);
1299 WRITE32(len); 1274 xdr_encode_opaque(p, fh->data, len);
1300 WRITEMEM(fh->data, len);
1301 hdr->nops++; 1275 hdr->nops++;
1302 hdr->replen += decode_putfh_maxsz; 1276 hdr->replen += decode_putfh_maxsz;
1303} 1277}
@@ -1306,8 +1280,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1306{ 1280{
1307 __be32 *p; 1281 __be32 *p;
1308 1282
1309 RESERVE_SPACE(4); 1283 p = reserve_space(xdr, 4);
1310 WRITE32(OP_PUTROOTFH); 1284 *p = cpu_to_be32(OP_PUTROOTFH);
1311 hdr->nops++; 1285 hdr->nops++;
1312 hdr->replen += decode_putrootfh_maxsz; 1286 hdr->replen += decode_putrootfh_maxsz;
1313} 1287}
@@ -1317,26 +1291,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1317 nfs4_stateid stateid; 1291 nfs4_stateid stateid;
1318 __be32 *p; 1292 __be32 *p;
1319 1293
1320 RESERVE_SPACE(NFS4_STATEID_SIZE); 1294 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1321 if (ctx->state != NULL) { 1295 if (ctx->state != NULL) {
1322 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); 1296 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
1323 WRITEMEM(stateid.data, NFS4_STATEID_SIZE); 1297 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1324 } else 1298 } else
1325 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1299 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
1326} 1300}
1327 1301
1328static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) 1302static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1329{ 1303{
1330 __be32 *p; 1304 __be32 *p;
1331 1305
1332 RESERVE_SPACE(4); 1306 p = reserve_space(xdr, 4);
1333 WRITE32(OP_READ); 1307 *p = cpu_to_be32(OP_READ);
1334 1308
1335 encode_stateid(xdr, args->context); 1309 encode_stateid(xdr, args->context);
1336 1310
1337 RESERVE_SPACE(12); 1311 p = reserve_space(xdr, 12);
1338 WRITE64(args->offset); 1312 p = xdr_encode_hyper(p, args->offset);
1339 WRITE32(args->count); 1313 *p = cpu_to_be32(args->count);
1340 hdr->nops++; 1314 hdr->nops++;
1341 hdr->replen += decode_read_maxsz; 1315 hdr->replen += decode_read_maxsz;
1342} 1316}
@@ -1349,20 +1323,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1349 }; 1323 };
1350 __be32 *p; 1324 __be32 *p;
1351 1325
1352 RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); 1326 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
1353 WRITE32(OP_READDIR); 1327 *p++ = cpu_to_be32(OP_READDIR);
1354 WRITE64(readdir->cookie); 1328 p = xdr_encode_hyper(p, readdir->cookie);
1355 WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); 1329 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
1356 WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ 1330 *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */
1357 WRITE32(readdir->count); 1331 *p++ = cpu_to_be32(readdir->count);
1358 WRITE32(2); 1332 *p++ = cpu_to_be32(2);
1359 /* Switch to mounted_on_fileid if the server supports it */ 1333 /* Switch to mounted_on_fileid if the server supports it */
1360 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) 1334 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1361 attrs[0] &= ~FATTR4_WORD0_FILEID; 1335 attrs[0] &= ~FATTR4_WORD0_FILEID;
1362 else 1336 else
1363 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 1337 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1364 WRITE32(attrs[0] & readdir->bitmask[0]); 1338 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1365 WRITE32(attrs[1] & readdir->bitmask[1]); 1339 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1366 hdr->nops++; 1340 hdr->nops++;
1367 hdr->replen += decode_readdir_maxsz; 1341 hdr->replen += decode_readdir_maxsz;
1368 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1342 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1378,8 +1352,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
1378{ 1352{
1379 __be32 *p; 1353 __be32 *p;
1380 1354
1381 RESERVE_SPACE(4); 1355 p = reserve_space(xdr, 4);
1382 WRITE32(OP_READLINK); 1356 *p = cpu_to_be32(OP_READLINK);
1383 hdr->nops++; 1357 hdr->nops++;
1384 hdr->replen += decode_readlink_maxsz; 1358 hdr->replen += decode_readlink_maxsz;
1385} 1359}
@@ -1388,10 +1362,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
1388{ 1362{
1389 __be32 *p; 1363 __be32 *p;
1390 1364
1391 RESERVE_SPACE(8 + name->len); 1365 p = reserve_space(xdr, 8 + name->len);
1392 WRITE32(OP_REMOVE); 1366 *p++ = cpu_to_be32(OP_REMOVE);
1393 WRITE32(name->len); 1367 xdr_encode_opaque(p, name->name, name->len);
1394 WRITEMEM(name->name, name->len);
1395 hdr->nops++; 1368 hdr->nops++;
1396 hdr->replen += decode_remove_maxsz; 1369 hdr->replen += decode_remove_maxsz;
1397} 1370}
@@ -1400,14 +1373,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
1400{ 1373{
1401 __be32 *p; 1374 __be32 *p;
1402 1375
1403 RESERVE_SPACE(8 + oldname->len); 1376 p = reserve_space(xdr, 4);
1404 WRITE32(OP_RENAME); 1377 *p = cpu_to_be32(OP_RENAME);
1405 WRITE32(oldname->len); 1378 encode_string(xdr, oldname->len, oldname->name);
1406 WRITEMEM(oldname->name, oldname->len); 1379 encode_string(xdr, newname->len, newname->name);
1407
1408 RESERVE_SPACE(4 + newname->len);
1409 WRITE32(newname->len);
1410 WRITEMEM(newname->name, newname->len);
1411 hdr->nops++; 1380 hdr->nops++;
1412 hdr->replen += decode_rename_maxsz; 1381 hdr->replen += decode_rename_maxsz;
1413} 1382}
@@ -1416,9 +1385,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
1416{ 1385{
1417 __be32 *p; 1386 __be32 *p;
1418 1387
1419 RESERVE_SPACE(12); 1388 p = reserve_space(xdr, 12);
1420 WRITE32(OP_RENEW); 1389 *p++ = cpu_to_be32(OP_RENEW);
1421 WRITE64(client_stateid->cl_clientid); 1390 xdr_encode_hyper(p, client_stateid->cl_clientid);
1422 hdr->nops++; 1391 hdr->nops++;
1423 hdr->replen += decode_renew_maxsz; 1392 hdr->replen += decode_renew_maxsz;
1424} 1393}
@@ -1428,8 +1397,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1428{ 1397{
1429 __be32 *p; 1398 __be32 *p;
1430 1399
1431 RESERVE_SPACE(4); 1400 p = reserve_space(xdr, 4);
1432 WRITE32(OP_RESTOREFH); 1401 *p = cpu_to_be32(OP_RESTOREFH);
1433 hdr->nops++; 1402 hdr->nops++;
1434 hdr->replen += decode_restorefh_maxsz; 1403 hdr->replen += decode_restorefh_maxsz;
1435} 1404}
@@ -1439,16 +1408,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1439{ 1408{
1440 __be32 *p; 1409 __be32 *p;
1441 1410
1442 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1411 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1443 WRITE32(OP_SETATTR); 1412 *p++ = cpu_to_be32(OP_SETATTR);
1444 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1413 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
1445 RESERVE_SPACE(2*4); 1414 p = reserve_space(xdr, 2*4);
1446 WRITE32(1); 1415 *p++ = cpu_to_be32(1);
1447 WRITE32(FATTR4_WORD0_ACL); 1416 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1448 if (arg->acl_len % 4) 1417 if (arg->acl_len % 4)
1449 return -EINVAL; 1418 return -EINVAL;
1450 RESERVE_SPACE(4); 1419 p = reserve_space(xdr, 4);
1451 WRITE32(arg->acl_len); 1420 *p = cpu_to_be32(arg->acl_len);
1452 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1421 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1453 hdr->nops++; 1422 hdr->nops++;
1454 hdr->replen += decode_setacl_maxsz; 1423 hdr->replen += decode_setacl_maxsz;
@@ -1460,8 +1429,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1460{ 1429{
1461 __be32 *p; 1430 __be32 *p;
1462 1431
1463 RESERVE_SPACE(4); 1432 p = reserve_space(xdr, 4);
1464 WRITE32(OP_SAVEFH); 1433 *p = cpu_to_be32(OP_SAVEFH);
1465 hdr->nops++; 1434 hdr->nops++;
1466 hdr->replen += decode_savefh_maxsz; 1435 hdr->replen += decode_savefh_maxsz;
1467} 1436}
@@ -1470,9 +1439,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
1470{ 1439{
1471 __be32 *p; 1440 __be32 *p;
1472 1441
1473 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1442 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1474 WRITE32(OP_SETATTR); 1443 *p++ = cpu_to_be32(OP_SETATTR);
1475 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); 1444 xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
1476 hdr->nops++; 1445 hdr->nops++;
1477 hdr->replen += decode_setattr_maxsz; 1446 hdr->replen += decode_setattr_maxsz;
1478 encode_attrs(xdr, arg->iap, server); 1447 encode_attrs(xdr, arg->iap, server);
@@ -1482,17 +1451,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1482{ 1451{
1483 __be32 *p; 1452 __be32 *p;
1484 1453
1485 RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); 1454 p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
1486 WRITE32(OP_SETCLIENTID); 1455 *p++ = cpu_to_be32(OP_SETCLIENTID);
1487 WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); 1456 xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
1488 1457
1489 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); 1458 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
1490 RESERVE_SPACE(4); 1459 p = reserve_space(xdr, 4);
1491 WRITE32(setclientid->sc_prog); 1460 *p = cpu_to_be32(setclientid->sc_prog);
1492 encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); 1461 encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
1493 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1462 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1494 RESERVE_SPACE(4); 1463 p = reserve_space(xdr, 4);
1495 WRITE32(setclientid->sc_cb_ident); 1464 *p = cpu_to_be32(setclientid->sc_cb_ident);
1496 hdr->nops++; 1465 hdr->nops++;
1497 hdr->replen += decode_setclientid_maxsz; 1466 hdr->replen += decode_setclientid_maxsz;
1498} 1467}
@@ -1501,10 +1470,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
1501{ 1470{
1502 __be32 *p; 1471 __be32 *p;
1503 1472
1504 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); 1473 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
1505 WRITE32(OP_SETCLIENTID_CONFIRM); 1474 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
1506 WRITE64(client_state->cl_clientid); 1475 p = xdr_encode_hyper(p, client_state->cl_clientid);
1507 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1476 xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1508 hdr->nops++; 1477 hdr->nops++;
1509 hdr->replen += decode_setclientid_confirm_maxsz; 1478 hdr->replen += decode_setclientid_confirm_maxsz;
1510} 1479}
@@ -1513,15 +1482,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1513{ 1482{
1514 __be32 *p; 1483 __be32 *p;
1515 1484
1516 RESERVE_SPACE(4); 1485 p = reserve_space(xdr, 4);
1517 WRITE32(OP_WRITE); 1486 *p = cpu_to_be32(OP_WRITE);
1518 1487
1519 encode_stateid(xdr, args->context); 1488 encode_stateid(xdr, args->context);
1520 1489
1521 RESERVE_SPACE(16); 1490 p = reserve_space(xdr, 16);
1522 WRITE64(args->offset); 1491 p = xdr_encode_hyper(p, args->offset);
1523 WRITE32(args->stable); 1492 *p++ = cpu_to_be32(args->stable);
1524 WRITE32(args->count); 1493 *p = cpu_to_be32(args->count);
1525 1494
1526 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1495 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1527 hdr->nops++; 1496 hdr->nops++;
@@ -1532,10 +1501,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
1532{ 1501{
1533 __be32 *p; 1502 __be32 *p;
1534 1503
1535 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1504 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1536 1505
1537 WRITE32(OP_DELEGRETURN); 1506 *p++ = cpu_to_be32(OP_DELEGRETURN);
1538 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1507 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
1539 hdr->nops++; 1508 hdr->nops++;
1540 hdr->replen += decode_delegreturn_maxsz; 1509 hdr->replen += decode_delegreturn_maxsz;
1541} 1510}
@@ -1548,16 +1517,16 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1548{ 1517{
1549 __be32 *p; 1518 __be32 *p;
1550 1519
1551 RESERVE_SPACE(4 + sizeof(args->verifier->data)); 1520 p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
1552 WRITE32(OP_EXCHANGE_ID); 1521 *p++ = cpu_to_be32(OP_EXCHANGE_ID);
1553 WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); 1522 xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
1554 1523
1555 encode_string(xdr, args->id_len, args->id); 1524 encode_string(xdr, args->id_len, args->id);
1556 1525
1557 RESERVE_SPACE(12); 1526 p = reserve_space(xdr, 12);
1558 WRITE32(args->flags); 1527 *p++ = cpu_to_be32(args->flags);
1559 WRITE32(0); /* zero length state_protect4_a */ 1528 *p++ = cpu_to_be32(0); /* zero length state_protect4_a */
1560 WRITE32(0); /* zero length implementation id array */ 1529 *p = cpu_to_be32(0); /* zero length implementation id array */
1561 hdr->nops++; 1530 hdr->nops++;
1562 hdr->replen += decode_exchange_id_maxsz; 1531 hdr->replen += decode_exchange_id_maxsz;
1563} 1532}
@@ -1571,55 +1540,43 @@ static void encode_create_session(struct xdr_stream *xdr,
1571 uint32_t len; 1540 uint32_t len;
1572 struct nfs_client *clp = args->client; 1541 struct nfs_client *clp = args->client;
1573 1542
1574 RESERVE_SPACE(4); 1543 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1575 WRITE32(OP_CREATE_SESSION); 1544 clp->cl_ipaddr);
1576
1577 RESERVE_SPACE(8);
1578 WRITE64(clp->cl_ex_clid);
1579 1545
1580 RESERVE_SPACE(8); 1546 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
1581 WRITE32(clp->cl_seqid); /*Sequence id */ 1547 *p++ = cpu_to_be32(OP_CREATE_SESSION);
1582 WRITE32(args->flags); /*flags */ 1548 p = xdr_encode_hyper(p, clp->cl_ex_clid);
1549 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1550 *p++ = cpu_to_be32(args->flags); /*flags */
1583 1551
1584 RESERVE_SPACE(2*28); /* 2 channel_attrs */
1585 /* Fore Channel */ 1552 /* Fore Channel */
1586 WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ 1553 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1587 WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ 1554 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
1588 WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ 1555 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
1589 WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1556 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1590 WRITE32(args->fc_attrs.max_ops); /* max operations */ 1557 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
1591 WRITE32(args->fc_attrs.max_reqs); /* max requests */ 1558 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
1592 WRITE32(0); /* rdmachannel_attrs */ 1559 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
1593 1560
1594 /* Back Channel */ 1561 /* Back Channel */
1595 WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ 1562 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1596 WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ 1563 *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */
1597 WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ 1564 *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */
1598 WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1565 *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1599 WRITE32(args->bc_attrs.max_ops); /* max operations */ 1566 *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */
1600 WRITE32(args->bc_attrs.max_reqs); /* max requests */ 1567 *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */
1601 WRITE32(0); /* rdmachannel_attrs */ 1568 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
1602 1569
1603 RESERVE_SPACE(4); 1570 *p++ = cpu_to_be32(args->cb_program); /* cb_program */
1604 WRITE32(args->cb_program); /* cb_program */ 1571 *p++ = cpu_to_be32(1);
1605 1572 *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
1606 RESERVE_SPACE(4); /* # of security flavors */
1607 WRITE32(1);
1608
1609 RESERVE_SPACE(4);
1610 WRITE32(RPC_AUTH_UNIX); /* auth_sys */
1611 1573
1612 /* authsys_parms rfc1831 */ 1574 /* authsys_parms rfc1831 */
1613 RESERVE_SPACE(4); 1575 *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
1614 WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ 1576 p = xdr_encode_opaque(p, machine_name, len);
1615 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1577 *p++ = cpu_to_be32(0); /* UID */
1616 clp->cl_ipaddr); 1578 *p++ = cpu_to_be32(0); /* GID */
1617 RESERVE_SPACE(16 + len); 1579 *p = cpu_to_be32(0); /* No more gids */
1618 WRITE32(len);
1619 WRITEMEM(machine_name, len);
1620 WRITE32(0); /* UID */
1621 WRITE32(0); /* GID */
1622 WRITE32(0); /* No more gids */
1623 hdr->nops++; 1580 hdr->nops++;
1624 hdr->replen += decode_create_session_maxsz; 1581 hdr->replen += decode_create_session_maxsz;
1625} 1582}
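
encode_create_session() shows the payoff of the conversion: eight separate RESERVE_SPACE() calls collapse into a single reserve_space(xdr, 20 + 2*28 + 20 + len + 12). The terms line up with the fields the hunk encodes; note the machine-name bytes are reserved unpadded, which works if the underlying xdr_reserve_space() rounds requests up to a word boundary. A worked breakdown (field widths read off the hunk; the grouping names are mine):

    #include <stdio.h>

    int main(void)
    {
        unsigned int len  = 12;                 /* sample machine-name length    */
        unsigned int hdr  = 4 + 8 + 4 + 4;      /* op, clientid, seqid, flags    */
        unsigned int chan = 7 * 4;              /* one channel_attrs: 7 words    */
        unsigned int auth = 4 + 4 + 4 + 4 + 4;  /* cb_program, flavor count,
                                                   AUTH_UNIX, stamp, name length */
        unsigned int tail = 4 + 4 + 4;          /* uid, gid, gid-count           */

        /* 20 + 2*28 + 20 + len + 12, as reserved in the hunk */
        printf("reserve %u bytes\n", hdr + 2 * chan + auth + len + tail);
        return 0;
    }
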
@@ -1629,9 +1586,9 @@ static void encode_destroy_session(struct xdr_stream *xdr,
1629 struct compound_hdr *hdr) 1586 struct compound_hdr *hdr)
1630{ 1587{
1631 __be32 *p; 1588 __be32 *p;
1632 RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); 1589 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
1633 WRITE32(OP_DESTROY_SESSION); 1590 *p++ = cpu_to_be32(OP_DESTROY_SESSION);
1634 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1591 xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1635 hdr->nops++; 1592 hdr->nops++;
1636 hdr->replen += decode_destroy_session_maxsz; 1593 hdr->replen += decode_destroy_session_maxsz;
1637} 1594}
@@ -1655,8 +1612,8 @@ static void encode_sequence(struct xdr_stream *xdr,
1655 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); 1612 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1656 slot = tp->slots + args->sa_slotid; 1613 slot = tp->slots + args->sa_slotid;
1657 1614
1658 RESERVE_SPACE(4); 1615 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
1659 WRITE32(OP_SEQUENCE); 1616 *p++ = cpu_to_be32(OP_SEQUENCE);
1660 1617
1661 /* 1618 /*
1662 * Sessionid + seqid + slotid + max slotid + cache_this 1619 * Sessionid + seqid + slotid + max slotid + cache_this
@@ -1670,12 +1627,11 @@ static void encode_sequence(struct xdr_stream *xdr,
1670 ((u32 *)session->sess_id.data)[3], 1627 ((u32 *)session->sess_id.data)[3],
1671 slot->seq_nr, args->sa_slotid, 1628 slot->seq_nr, args->sa_slotid,
1672 tp->highest_used_slotid, args->sa_cache_this); 1629 tp->highest_used_slotid, args->sa_cache_this);
1673 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); 1630 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1674 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1631 *p++ = cpu_to_be32(slot->seq_nr);
1675 WRITE32(slot->seq_nr); 1632 *p++ = cpu_to_be32(args->sa_slotid);
1676 WRITE32(args->sa_slotid); 1633 *p++ = cpu_to_be32(tp->highest_used_slotid);
1677 WRITE32(tp->highest_used_slotid); 1634 *p = cpu_to_be32(args->sa_cache_this);
1678 WRITE32(args->sa_cache_this);
1679 hdr->nops++; 1635 hdr->nops++;
1680 hdr->replen += decode_sequence_maxsz; 1636 hdr->replen += decode_sequence_maxsz;
1681#endif /* CONFIG_NFS_V4_1 */ 1637#endif /* CONFIG_NFS_V4_1 */
@@ -2466,68 +2422,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
2466} 2422}
2467#endif /* CONFIG_NFS_V4_1 */ 2423#endif /* CONFIG_NFS_V4_1 */
2468 2424
2469/* 2425static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
2470 * START OF "GENERIC" DECODE ROUTINES. 2426{
2471 * These may look a little ugly since they are imported from a "generic" 2427 dprintk("nfs: %s: prematurely hit end of receive buffer. "
2472 * set of XDR encode/decode routines which are intended to be shared by 2428 "Remaining buffer length is %tu words.\n",
2473 * all of our NFSv4 implementations (OpenBSD, MacOS X...). 2429 func, xdr->end - xdr->p);
2474 * 2430}
2475 * If the pain of reading these is too great, it should be a straightforward
2476 * task to translate them into Linux-specific versions which are more
2477 * consistent with the style used in NFSv2/v3...
2478 */
2479#define READ32(x) (x) = ntohl(*p++)
2480#define READ64(x) do { \
2481 (x) = (u64)ntohl(*p++) << 32; \
2482 (x) |= ntohl(*p++); \
2483} while (0)
2484#define READTIME(x) do { \
2485 p++; \
2486 (x.tv_sec) = ntohl(*p++); \
2487 (x.tv_nsec) = ntohl(*p++); \
2488} while (0)
2489#define COPYMEM(x,nbytes) do { \
2490 memcpy((x), p, nbytes); \
2491 p += XDR_QUADLEN(nbytes); \
2492} while (0)
2493
2494#define READ_BUF(nbytes) do { \
2495 p = xdr_inline_decode(xdr, nbytes); \
2496 if (unlikely(!p)) { \
2497 dprintk("nfs: %s: prematurely hit end of receive" \
2498 " buffer\n", __func__); \
2499 dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
2500 __func__, xdr->p, nbytes, xdr->end); \
2501 return -EIO; \
2502 } \
2503} while (0)
2504 2431
2505static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) 2432static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
2506{ 2433{
2507 __be32 *p; 2434 __be32 *p;
2508 2435
2509 READ_BUF(4); 2436 p = xdr_inline_decode(xdr, 4);
2510 READ32(*len); 2437 if (unlikely(!p))
2511 READ_BUF(*len); 2438 goto out_overflow;
2439 *len = be32_to_cpup(p);
2440 p = xdr_inline_decode(xdr, *len);
2441 if (unlikely(!p))
2442 goto out_overflow;
2512 *string = (char *)p; 2443 *string = (char *)p;
2513 return 0; 2444 return 0;
2445out_overflow:
2446 print_overflow_msg(__func__, xdr);
2447 return -EIO;
2514} 2448}
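
The decode side mirrors the encode conversion: the old READ_BUF() macro, with its buried return -EIO, becomes an explicit xdr_inline_decode() call whose NULL result jumps to a shared out_overflow label, so every exit path through print_overflow_msg() is visible in the function body. A self-contained analogue of the new shape (struct xdr_buf and decode_two_words are sketch names):

    #include <stdint.h>
    #include <stdio.h>
    #include <errno.h>
    #include <arpa/inet.h>

    struct xdr_buf { uint32_t *p, *end; };

    /* Hand back a pointer to nbytes of received data, or NULL when
     * the receive buffer runs dry (as xdr_inline_decode() does). */
    static uint32_t *xdr_inline_decode(struct xdr_buf *xdr, unsigned int nbytes)
    {
        uint32_t *p = xdr->p;
        if ((char *)p + nbytes > (char *)xdr->end)
            return NULL;
        xdr->p += (nbytes + 3) >> 2;
        return p;
    }

    static int decode_two_words(struct xdr_buf *xdr, uint32_t *a, uint32_t *b)
    {
        uint32_t *p = xdr_inline_decode(xdr, 8);
        if (!p)
            goto out_overflow;
        *a = ntohl(*p++);        /* be32_to_cpup() in kernel code */
        *b = ntohl(*p);
        return 0;
    out_overflow:
        fprintf(stderr, "%s: prematurely hit end of receive buffer\n",
                __func__);
        return -EIO;
    }
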
2515 2449
2516static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 2450static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2517{ 2451{
2518 __be32 *p; 2452 __be32 *p;
2519 2453
2520 READ_BUF(8); 2454 p = xdr_inline_decode(xdr, 8);
2521 READ32(hdr->status); 2455 if (unlikely(!p))
2522 READ32(hdr->taglen); 2456 goto out_overflow;
2457 hdr->status = be32_to_cpup(p++);
2458 hdr->taglen = be32_to_cpup(p);
2523 2459
2524 READ_BUF(hdr->taglen + 4); 2460 p = xdr_inline_decode(xdr, hdr->taglen + 4);
2461 if (unlikely(!p))
2462 goto out_overflow;
2525 hdr->tag = (char *)p; 2463 hdr->tag = (char *)p;
2526 p += XDR_QUADLEN(hdr->taglen); 2464 p += XDR_QUADLEN(hdr->taglen);
2527 READ32(hdr->nops); 2465 hdr->nops = be32_to_cpup(p);
2528 if (unlikely(hdr->nops < 1)) 2466 if (unlikely(hdr->nops < 1))
2529 return nfs4_stat_to_errno(hdr->status); 2467 return nfs4_stat_to_errno(hdr->status);
2530 return 0; 2468 return 0;
2469out_overflow:
2470 print_overflow_msg(__func__, xdr);
2471 return -EIO;
2531} 2472}
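
decode_compound_hdr() skips the variable-length tag with p += XDR_QUADLEN(hdr->taglen) before reading nops from the word that follows. XDR_QUADLEN() (from sunrpc/xdr.h) rounds a byte count up to 4-byte XDR words:

    #include <assert.h>

    #define XDR_QUADLEN(l) (((l) + 3) >> 2)

    int main(void)
    {
        assert(XDR_QUADLEN(0) == 0);
        assert(XDR_QUADLEN(1) == 1);   /* 1..4 bytes -> 1 word  */
        assert(XDR_QUADLEN(5) == 2);   /* 5..8 bytes -> 2 words */
        return 0;
    }
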
2532 2473
2533static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 2474static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
@@ -2536,18 +2477,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2536 uint32_t opnum; 2477 uint32_t opnum;
2537 int32_t nfserr; 2478 int32_t nfserr;
2538 2479
2539 READ_BUF(8); 2480 p = xdr_inline_decode(xdr, 8);
2540 READ32(opnum); 2481 if (unlikely(!p))
2482 goto out_overflow;
2483 opnum = be32_to_cpup(p++);
2541 if (opnum != expected) { 2484 if (opnum != expected) {
2542 dprintk("nfs: Server returned operation" 2485 dprintk("nfs: Server returned operation"
2543 " %d but we issued a request for %d\n", 2486 " %d but we issued a request for %d\n",
2544 opnum, expected); 2487 opnum, expected);
2545 return -EIO; 2488 return -EIO;
2546 } 2489 }
2547 READ32(nfserr); 2490 nfserr = be32_to_cpup(p);
2548 if (nfserr != NFS_OK) 2491 if (nfserr != NFS_OK)
2549 return nfs4_stat_to_errno(nfserr); 2492 return nfs4_stat_to_errno(nfserr);
2550 return 0; 2493 return 0;
2494out_overflow:
2495 print_overflow_msg(__func__, xdr);
2496 return -EIO;
2551} 2497}
2552 2498
2553/* Dummy routine */ 2499/* Dummy routine */
@@ -2557,8 +2503,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
2557 unsigned int strlen; 2503 unsigned int strlen;
2558 char *str; 2504 char *str;
2559 2505
2560 READ_BUF(12); 2506 p = xdr_inline_decode(xdr, 12);
2561 return decode_opaque_inline(xdr, &strlen, &str); 2507 if (likely(p))
2508 return decode_opaque_inline(xdr, &strlen, &str);
2509 print_overflow_msg(__func__, xdr);
2510 return -EIO;
2562} 2511}
2563 2512
2564static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) 2513static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -2566,27 +2515,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2566 uint32_t bmlen; 2515 uint32_t bmlen;
2567 __be32 *p; 2516 __be32 *p;
2568 2517
2569 READ_BUF(4); 2518 p = xdr_inline_decode(xdr, 4);
2570 READ32(bmlen); 2519 if (unlikely(!p))
2520 goto out_overflow;
2521 bmlen = be32_to_cpup(p);
2571 2522
2572 bitmap[0] = bitmap[1] = 0; 2523 bitmap[0] = bitmap[1] = 0;
2573 READ_BUF((bmlen << 2)); 2524 p = xdr_inline_decode(xdr, (bmlen << 2));
2525 if (unlikely(!p))
2526 goto out_overflow;
2574 if (bmlen > 0) { 2527 if (bmlen > 0) {
2575 READ32(bitmap[0]); 2528 bitmap[0] = be32_to_cpup(p++);
2576 if (bmlen > 1) 2529 if (bmlen > 1)
2577 READ32(bitmap[1]); 2530 bitmap[1] = be32_to_cpup(p);
2578 } 2531 }
2579 return 0; 2532 return 0;
2533out_overflow:
2534 print_overflow_msg(__func__, xdr);
2535 return -EIO;
2580} 2536}
2581 2537
2582static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) 2538static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
2583{ 2539{
2584 __be32 *p; 2540 __be32 *p;
2585 2541
2586 READ_BUF(4); 2542 p = xdr_inline_decode(xdr, 4);
2587 READ32(*attrlen); 2543 if (unlikely(!p))
2544 goto out_overflow;
2545 *attrlen = be32_to_cpup(p);
2588 *savep = xdr->p; 2546 *savep = xdr->p;
2589 return 0; 2547 return 0;
2548out_overflow:
2549 print_overflow_msg(__func__, xdr);
2550 return -EIO;
2590} 2551}
2591 2552
2592static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) 2553static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -2609,8 +2570,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2609 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2570 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
2610 return -EIO; 2571 return -EIO;
2611 if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { 2572 if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
2612 READ_BUF(4); 2573 p = xdr_inline_decode(xdr, 4);
2613 READ32(*type); 2574 if (unlikely(!p))
2575 goto out_overflow;
2576 *type = be32_to_cpup(p);
2614 if (*type < NF4REG || *type > NF4NAMEDATTR) { 2577 if (*type < NF4REG || *type > NF4NAMEDATTR) {
2615 dprintk("%s: bad type %d\n", __func__, *type); 2578 dprintk("%s: bad type %d\n", __func__, *type);
2616 return -EIO; 2579 return -EIO;
@@ -2620,6 +2583,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2620 } 2583 }
2621 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); 2584 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
2622 return ret; 2585 return ret;
2586out_overflow:
2587 print_overflow_msg(__func__, xdr);
2588 return -EIO;
2623} 2589}
2624 2590
2625static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2591static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -2631,14 +2597,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2631 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2597 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
2632 return -EIO; 2598 return -EIO;
2633 if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { 2599 if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
2634 READ_BUF(8); 2600 p = xdr_inline_decode(xdr, 8);
2635 READ64(*change); 2601 if (unlikely(!p))
2602 goto out_overflow;
2603 xdr_decode_hyper(p, change);
2636 bitmap[0] &= ~FATTR4_WORD0_CHANGE; 2604 bitmap[0] &= ~FATTR4_WORD0_CHANGE;
2637 ret = NFS_ATTR_FATTR_CHANGE; 2605 ret = NFS_ATTR_FATTR_CHANGE;
2638 } 2606 }
2639 dprintk("%s: change attribute=%Lu\n", __func__, 2607 dprintk("%s: change attribute=%Lu\n", __func__,
2640 (unsigned long long)*change); 2608 (unsigned long long)*change);
2641 return ret; 2609 return ret;
2610out_overflow:
2611 print_overflow_msg(__func__, xdr);
2612 return -EIO;
2642} 2613}
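
decode_attr_change() is the first of many attribute decoders to use xdr_decode_hyper(), the read-side twin of xdr_encode_hyper(): it rebuilds a 64-bit value from two big-endian words and returns the advanced cursor, so the fsid hunk further down can chain p = xdr_decode_hyper(p, &fsid->major). A userspace rendition:

    #include <stdint.h>
    #include <arpa/inet.h>

    /* Decode a 64-bit XDR "hyper" from two network-order 32-bit words. */
    static uint32_t *xdr_decode_hyper(uint32_t *p, uint64_t *valp)
    {
        *valp  = (uint64_t)ntohl(*p++) << 32;
        *valp |= ntohl(*p++);
        return p;
    }
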
2643 2614
2644static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2615static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -2650,13 +2621,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2650 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2621 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
2651 return -EIO; 2622 return -EIO;
2652 if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { 2623 if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
2653 READ_BUF(8); 2624 p = xdr_inline_decode(xdr, 8);
2654 READ64(*size); 2625 if (unlikely(!p))
2626 goto out_overflow;
2627 xdr_decode_hyper(p, size);
2655 bitmap[0] &= ~FATTR4_WORD0_SIZE; 2628 bitmap[0] &= ~FATTR4_WORD0_SIZE;
2656 ret = NFS_ATTR_FATTR_SIZE; 2629 ret = NFS_ATTR_FATTR_SIZE;
2657 } 2630 }
2658 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); 2631 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
2659 return ret; 2632 return ret;
2633out_overflow:
2634 print_overflow_msg(__func__, xdr);
2635 return -EIO;
2660} 2636}
2661 2637
2662static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2638static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2667,12 +2643,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
2667 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) 2643 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
2668 return -EIO; 2644 return -EIO;
2669 if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { 2645 if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
2670 READ_BUF(4); 2646 p = xdr_inline_decode(xdr, 4);
2671 READ32(*res); 2647 if (unlikely(!p))
2648 goto out_overflow;
2649 *res = be32_to_cpup(p);
2672 bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; 2650 bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
2673 } 2651 }
2674 dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); 2652 dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
2675 return 0; 2653 return 0;
2654out_overflow:
2655 print_overflow_msg(__func__, xdr);
2656 return -EIO;
2676} 2657}
2677 2658
2678static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2659static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2683,12 +2664,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2683 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) 2664 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
2684 return -EIO; 2665 return -EIO;
2685 if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { 2666 if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
2686 READ_BUF(4); 2667 p = xdr_inline_decode(xdr, 4);
2687 READ32(*res); 2668 if (unlikely(!p))
2669 goto out_overflow;
2670 *res = be32_to_cpup(p);
2688 bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; 2671 bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
2689 } 2672 }
2690 dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); 2673 dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
2691 return 0; 2674 return 0;
2675out_overflow:
2676 print_overflow_msg(__func__, xdr);
2677 return -EIO;
2692} 2678}
2693 2679
2694static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2680static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -2701,9 +2687,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2701 if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) 2687 if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
2702 return -EIO; 2688 return -EIO;
2703 if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { 2689 if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
2704 READ_BUF(16); 2690 p = xdr_inline_decode(xdr, 16);
2705 READ64(fsid->major); 2691 if (unlikely(!p))
2706 READ64(fsid->minor); 2692 goto out_overflow;
2693 p = xdr_decode_hyper(p, &fsid->major);
2694 xdr_decode_hyper(p, &fsid->minor);
2707 bitmap[0] &= ~FATTR4_WORD0_FSID; 2695 bitmap[0] &= ~FATTR4_WORD0_FSID;
2708 ret = NFS_ATTR_FATTR_FSID; 2696 ret = NFS_ATTR_FATTR_FSID;
2709 } 2697 }
@@ -2711,6 +2699,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2711 (unsigned long long)fsid->major, 2699 (unsigned long long)fsid->major,
2712 (unsigned long long)fsid->minor); 2700 (unsigned long long)fsid->minor);
2713 return ret; 2701 return ret;
2702out_overflow:
2703 print_overflow_msg(__func__, xdr);
2704 return -EIO;
2714} 2705}
2715 2706
2716static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2707static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2721,12 +2712,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
2721 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) 2712 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
2722 return -EIO; 2713 return -EIO;
2723 if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { 2714 if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
2724 READ_BUF(4); 2715 p = xdr_inline_decode(xdr, 4);
2725 READ32(*res); 2716 if (unlikely(!p))
2717 goto out_overflow;
2718 *res = be32_to_cpup(p);
2726 bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; 2719 bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
2727 } 2720 }
2728 dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); 2721 dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
2729 return 0; 2722 return 0;
2723out_overflow:
2724 print_overflow_msg(__func__, xdr);
2725 return -EIO;
2730} 2726}
2731 2727
2732static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2728static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2737,12 +2733,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2737 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) 2733 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
2738 return -EIO; 2734 return -EIO;
2739 if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { 2735 if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
2740 READ_BUF(4); 2736 p = xdr_inline_decode(xdr, 4);
2741 READ32(*res); 2737 if (unlikely(!p))
2738 goto out_overflow;
2739 *res = be32_to_cpup(p);
2742 bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; 2740 bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
2743 } 2741 }
2744 dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); 2742 dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
2745 return 0; 2743 return 0;
2744out_overflow:
2745 print_overflow_msg(__func__, xdr);
2746 return -EIO;
2746} 2747}
2747 2748
2748static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2749static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2754,13 +2755,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2754 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2755 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
2755 return -EIO; 2756 return -EIO;
2756 if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { 2757 if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
2757 READ_BUF(8); 2758 p = xdr_inline_decode(xdr, 8);
2758 READ64(*fileid); 2759 if (unlikely(!p))
2760 goto out_overflow;
2761 xdr_decode_hyper(p, fileid);
2759 bitmap[0] &= ~FATTR4_WORD0_FILEID; 2762 bitmap[0] &= ~FATTR4_WORD0_FILEID;
2760 ret = NFS_ATTR_FATTR_FILEID; 2763 ret = NFS_ATTR_FATTR_FILEID;
2761 } 2764 }
2762 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2765 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2763 return ret; 2766 return ret;
2767out_overflow:
2768 print_overflow_msg(__func__, xdr);
2769 return -EIO;
2764} 2770}
2765 2771
2766static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2772static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2772,13 +2778,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2772 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2778 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
2773 return -EIO; 2779 return -EIO;
2774 if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { 2780 if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
2775 READ_BUF(8); 2781 p = xdr_inline_decode(xdr, 8);
2776 READ64(*fileid); 2782 if (unlikely(!p))
2783 goto out_overflow;
2784 xdr_decode_hyper(p, fileid);
2777 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 2785 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2778 ret = NFS_ATTR_FATTR_FILEID; 2786 ret = NFS_ATTR_FATTR_FILEID;
2779 } 2787 }
2780 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2788 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2781 return ret; 2789 return ret;
2790out_overflow:
2791 print_overflow_msg(__func__, xdr);
2792 return -EIO;
2782} 2793}
2783 2794
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2790,12 +2801,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
-		READ_BUF(8);
-		READ64(*res);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2807,12 +2823,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
-		READ_BUF(8);
-		READ64(*res);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2824,12 +2845,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
-		READ_BUF(8);
-		READ64(*res);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -2838,8 +2864,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	__be32 *p;
 	int status = 0;
 
-	READ_BUF(4);
-	READ32(n);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	n = be32_to_cpup(p);
 	if (n == 0)
 		goto root_path;
 	dprintk("path ");
@@ -2873,6 +2901,9 @@ out_eio:
 	dprintk(" status %d", status);
 	status = -EIO;
 	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -2890,8 +2921,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	status = decode_pathname(xdr, &res->fs_path);
 	if (unlikely(status != 0))
 		goto out;
-	READ_BUF(4);
-	READ32(n);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	n = be32_to_cpup(p);
 	if (n <= 0)
 		goto out_eio;
 	res->nlocations = 0;
@@ -2899,8 +2932,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		u32 m;
 		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
-		READ_BUF(4);
-		READ32(m);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		m = be32_to_cpup(p);
 
 		loc->nservers = 0;
 		dprintk("%s: servers ", __func__);
@@ -2939,6 +2974,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 out:
 	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
 out_eio:
 	status = -EIO;
 	goto out;
@@ -2953,12 +2990,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
-		READ_BUF(8);
-		READ64(*res);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -2970,12 +3012,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
-		READ_BUF(4);
-		READ32(*maxlink);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*maxlink = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -2987,12 +3034,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
-		READ_BUF(4);
-		READ32(*maxname);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*maxname = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3005,8 +3057,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
 		uint64_t maxread;
-		READ_BUF(8);
-		READ64(maxread);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
 		*res = (uint32_t)maxread;
@@ -3014,6 +3068,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	}
 	dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3026,8 +3083,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
 		uint64_t maxwrite;
-		READ_BUF(8);
-		READ64(maxwrite);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
 		*res = (uint32_t)maxwrite;
@@ -3035,6 +3094,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	}
 	dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3047,14 +3109,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
-		READ_BUF(4);
-		READ32(tmp);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		tmp = be32_to_cpup(p);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
 		ret = NFS_ATTR_FATTR_MODE;
 	}
 	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3066,16 +3133,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
-		READ_BUF(4);
-		READ32(*nlink);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		*nlink = be32_to_cpup(p);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
 	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_client *clp, uint32_t *uid, int may_sleep)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3085,10 +3158,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
-		READ_BUF(4);
-		READ32(len);
-		READ_BUF(len);
-		if (len < XDR_MAX_NETOBJ) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (!may_sleep) {
+			/* do nothing */
+		} else if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
 			else
@@ -3101,9 +3180,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	}
 	dprintk("%s: uid=%d\n", __func__, (int)*uid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_client *clp, uint32_t *gid, int may_sleep)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3113,10 +3196,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
-		READ_BUF(4);
-		READ32(len);
-		READ_BUF(len);
-		if (len < XDR_MAX_NETOBJ) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (!may_sleep) {
+			/* do nothing */
+		} else if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
 			else
@@ -3129,6 +3218,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	}
 	dprintk("%s: gid=%d\n", __func__, (int)*gid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -3143,9 +3235,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 	if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
 		dev_t tmp;
 
-		READ_BUF(8);
-		READ32(major);
-		READ32(minor);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		major = be32_to_cpup(p++);
+		minor = be32_to_cpup(p);
 		tmp = MKDEV(major, minor);
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
@@ -3154,6 +3248,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 	}
 	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3165,12 +3262,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
-		READ_BUF(8);
-		READ64(*res);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3182,12 +3284,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
-		READ_BUF(8);
-		READ64(*res);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3199,12 +3306,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
-		READ_BUF(8);
-		READ64(*res);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -3216,14 +3328,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
-		READ_BUF(8);
-		READ64(*used);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
 	dprintk("%s: space used=%Lu\n", __func__,
 			(unsigned long long)*used);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -3232,12 +3349,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	uint64_t sec;
 	uint32_t nsec;
 
-	READ_BUF(12);
-	READ64(sec);
-	READ32(nsec);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &sec);
+	nsec = be32_to_cpup(p);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -3315,11 +3437,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 {
 	__be32 *p;
 
-	READ_BUF(20);
-	READ32(cinfo->atomic);
-	READ64(cinfo->before);
-	READ64(cinfo->after);
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
+	cinfo->atomic = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &cinfo->before);
+	xdr_decode_hyper(p, &cinfo->after);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
@@ -3331,40 +3458,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	status = decode_op_hdr(xdr, OP_ACCESS);
 	if (status)
 		return status;
-	READ_BUF(8);
-	READ32(supp);
-	READ32(acc);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	supp = be32_to_cpup(p++);
+	acc = be32_to_cpup(p);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
-static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
 {
 	__be32 *p;
+
+	p = xdr_inline_decode(xdr, len);
+	if (likely(p)) {
+		memcpy(buf, p, len);
+		return 0;
+	}
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+}
+
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
 	int status;
 
 	status = decode_op_hdr(xdr, OP_CLOSE);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
+}
+
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+	return decode_opaque_fixed(xdr, verifier, 8);
 }
 
 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_COMMIT);
-	if (status)
-		return status;
-	READ_BUF(8);
-	COPYMEM(res->verf->verifier, 8);
-	return 0;
+	if (!status)
+		status = decode_verifier(xdr, res->verf->verifier);
+	return status;
 }
 
 static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
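The decode_opaque_fixed() helper introduced in the hunk above folds each READ_BUF()/COPYMEM() pair into a single bounds-checked copy, so fixed-size fields (stateids, verifiers, and later in this diff session ids) become one-line wrappers. A self-contained sketch of the same shape; the toy_* names are illustrative and the sizes mirror NFS4_STATEID_SIZE (16) and the 8-byte write verifier:

#include <stdint.h>
#include <string.h>

/* Repeated from the earlier sketch so this compiles on its own. */
struct toy_xdr {
	const uint8_t *p;
	const uint8_t *end;
};

static const uint8_t *toy_inline_decode(struct toy_xdr *xdr, size_t nbytes)
{
	const uint8_t *p = xdr->p;

	if ((size_t)(xdr->end - p) < nbytes)
		return NULL;
	xdr->p = p + nbytes;
	return p;
}

/* One bounds-checked copy replaces each READ_BUF()/COPYMEM() pair. */
static int toy_decode_opaque_fixed(struct toy_xdr *xdr, void *buf, size_t len)
{
	const uint8_t *p = toy_inline_decode(xdr, len);

	if (p) {
		memcpy(buf, p, len);
		return 0;
	}
	return -1;	/* overflow */
}

/* Fixed-size fields collapse to one-liners, in the style of
 * decode_stateid() and decode_verifier() above. */
#define TOY_STATEID_SIZE	16
#define TOY_VERIFIER_SIZE	8

static int toy_decode_stateid(struct toy_xdr *xdr, uint8_t *sid)
{
	return toy_decode_opaque_fixed(xdr, sid, TOY_STATEID_SIZE);
}

static int toy_decode_verifier(struct toy_xdr *xdr, uint8_t *vf)
{
	return toy_decode_opaque_fixed(xdr, vf, TOY_VERIFIER_SIZE);
}

int main(void)
{
	uint8_t wire[TOY_STATEID_SIZE] = { 1, 2, 3 };	/* rest zero */
	uint8_t sid[TOY_STATEID_SIZE], vf[TOY_VERIFIER_SIZE];
	struct toy_xdr xdr = { wire, wire + sizeof(wire) };

	if (toy_decode_stateid(&xdr, sid))
		return 1;
	/* No bytes left, so the verifier decode must fail cleanly. */
	return toy_decode_verifier(&xdr, vf) ? 0 : 1;
}

Note also the inverted tests at the converted call sites: the old sequence, "if (status) return status;" followed by an unconditional copy, becomes "if (!status) status = decode_stateid(...);", so a failed op header skips the stateid read without needing a second conditional.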
@@ -3378,10 +3527,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 		return status;
 	if ((status = decode_change_info(xdr, cinfo)))
 		return status;
-	READ_BUF(4);
-	READ32(bmlen);
-	READ_BUF(bmlen << 2);
-	return 0;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	bmlen = be32_to_cpup(p);
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
@@ -3466,7 +3621,8 @@ xdr_error:
 	return status;
 }
 
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		const struct nfs_server *server, int may_sleep)
 {
 	__be32 *savep;
 	uint32_t attrlen,
@@ -3538,12 +3694,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+	status = decode_attr_owner(xdr, bitmap, server->nfs_client,
+			&fattr->uid, may_sleep);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+	status = decode_attr_group(xdr, bitmap, server->nfs_client,
+			&fattr->gid, may_sleep);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
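The new may_sleep argument, passed as !RPC_IS_ASYNC(rqstp->rq_task) from every nfs4_xdr_dec_*() call site later in this diff, gates the idmapper lookups inside decode_attr_owner() and decode_attr_group(): mapping an owner string to a uid can block on an upcall, which is not safe when the reply is decoded in a non-sleeping context. The string is still consumed from the stream either way, so the decode position stays correct. A self-contained schematic of that gate; map_name_to_uid() is a hypothetical stand-in for the blocking upcall:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_xdr {	/* as in the earlier sketches */
	const uint8_t *p;
	const uint8_t *end;
};

static const uint8_t *toy_inline_decode(struct toy_xdr *xdr, size_t nbytes)
{
	const uint8_t *p = xdr->p;

	if ((size_t)(xdr->end - p) < nbytes)
		return NULL;
	xdr->p = p + nbytes;
	return p;
}

/* Hypothetical stand-in for the idmapper upcall, which may block. */
static int map_name_to_uid(const char *name, size_t len, uint32_t *uid)
{
	*uid = (len == 4 && memcmp(name, "root", 4) == 0) ? 0 : 1000;
	return 0;
}

static int toy_decode_owner(struct toy_xdr *xdr, uint32_t *uid, int may_sleep)
{
	const uint8_t *p = toy_inline_decode(xdr, 4);
	uint32_t be, len;

	if (!p)
		return -1;
	memcpy(&be, p, 4);
	len = ntohl(be);
	p = toy_inline_decode(xdr, len);	/* always consume the name */
	if (!p)
		return -1;
	if (!may_sleep)
		return 0;	/* non-sleeping context: skip the mapping */
	return map_name_to_uid((const char *)p, len, uid);
}

int main(void)
{
	uint8_t buf[8] = { 0, 0, 0, 4, 'r', 'o', 'o', 't' };
	struct toy_xdr xdr = { buf, buf + sizeof(buf) };
	uint32_t uid = (uint32_t)-1;

	toy_decode_owner(&xdr, &uid, 1);	/* sync context: maps "root" */
	printf("uid=%u\n", uid);
	return 0;
}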
@@ -3633,14 +3791,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 	if (status)
 		return status;
 
-	READ_BUF(4);
-	READ32(len);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
-	READ_BUF(len);
-	COPYMEM(fh->data, len);
+	p = xdr_inline_decode(xdr, len);
+	if (unlikely(!p))
+		goto out_overflow;
+	memcpy(fh->data, p, len);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3662,10 +3827,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	__be32 *p;
 	uint32_t namelen, type;
 
-	READ_BUF(32);
-	READ64(offset);
-	READ64(length);
-	READ32(type);
+	p = xdr_inline_decode(xdr, 32);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &offset);
+	p = xdr_decode_hyper(p, &length);
+	type = be32_to_cpup(p++);
 	if (fl != NULL) {
 		fl->fl_start = (loff_t)offset;
 		fl->fl_end = fl->fl_start + (loff_t)length - 1;
@@ -3676,23 +3843,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 		fl->fl_type = F_RDLCK;
 		fl->fl_pid = 0;
 	}
-	READ64(clientid);
-	READ32(namelen);
-	READ_BUF(namelen);
-	return -NFS4ERR_DENIED;
+	p = xdr_decode_hyper(p, &clientid);
+	namelen = be32_to_cpup(p);
+	p = xdr_inline_decode(xdr, namelen);
+	if (likely(p))
+		return -NFS4ERR_DENIED;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCK);
 	if (status == -EIO)
 		goto out;
 	if (status == 0) {
-		READ_BUF(NFS4_STATEID_SIZE);
-		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
+		status = decode_stateid(xdr, &res->stateid);
+		if (unlikely(status))
+			goto out;
 	} else if (status == -NFS4ERR_DENIED)
 		status = decode_lock_denied(xdr, NULL);
 	if (res->open_seqid != NULL)
@@ -3713,16 +3884,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
 
 static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCKU);
 	if (status != -EIO)
 		nfs_increment_lock_seqid(status, res->seqid);
-	if (status == 0) {
-		READ_BUF(NFS4_STATEID_SIZE);
-		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	}
+	if (status == 0)
+		status = decode_stateid(xdr, &res->stateid);
 	return status;
 }
 
@@ -3737,34 +3905,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;
 
-	READ_BUF(12);
-	READ32(limit_type);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
+	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
-		READ64(*maxsize);
+		xdr_decode_hyper(p, maxsize);
 		break;
 	case 2:
-		READ32(nblocks);
-		READ32(blocksize);
+		nblocks = be32_to_cpup(p++);
+		blocksize = be32_to_cpup(p);
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 {
 	__be32 *p;
 	uint32_t delegation_type;
+	int status;
 
-	READ_BUF(4);
-	READ32(delegation_type);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	delegation_type = be32_to_cpup(p);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
 		return 0;
 	}
-	READ_BUF(NFS4_STATEID_SIZE+4);
-	COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
-	READ32(res->do_recall);
+	status = decode_stateid(xdr, &res->delegation);
+	if (unlikely(status))
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->do_recall = be32_to_cpup(p);
 
 	switch (delegation_type) {
 	case NFS4_OPEN_DELEGATE_READ:
@@ -3776,6 +3956,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 		return -EIO;
 	}
 	return decode_ace(xdr, NULL, res->server->nfs_client);
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3787,23 +3970,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	status = decode_op_hdr(xdr, OP_OPEN);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	if (unlikely(status))
 		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 
 	decode_change_info(xdr, &res->cinfo);
 
-	READ_BUF(8);
-	READ32(res->rflags);
-	READ32(bmlen);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->rflags = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	if (bmlen > 10)
 		goto xdr_error;
 
-	READ_BUF(bmlen << 2);
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (unlikely(!p))
+		goto out_overflow;
 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
 	for (i = 0; i < savewords; ++i)
-		READ32(res->attrset[i]);
+		res->attrset[i] = be32_to_cpup(p++);
 	for (; i < NFS4_BITMAP_SIZE; i++)
 		res->attrset[i] = 0;
 
@@ -3811,36 +3998,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 xdr_error:
 	dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
 	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_putfh(struct xdr_stream *xdr)
@@ -3863,9 +4047,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	status = decode_op_hdr(xdr, OP_READ);
 	if (status)
 		return status;
-	READ_BUF(8);
-	READ32(eof);
-	READ32(count);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	eof = be32_to_cpup(p++);
+	count = be32_to_cpup(p);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
@@ -3878,6 +4064,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	res->eof = eof;
 	res->count = count;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -3892,17 +4081,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	int status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
-	if (status)
+	if (!status)
+		status = decode_verifier(xdr, readdir->verifier.data);
+	if (unlikely(status))
 		return status;
-	READ_BUF(8);
-	COPYMEM(readdir->verifier.data, 8);
 	dprintk("%s: verifier = %08x:%08x\n",
 			__func__,
 			((u32 *)readdir->verifier.data)[0],
 			((u32 *)readdir->verifier.data)[1]);
 
 
-	hdrlen = (char *) p - (char *) iov->iov_base;
+	hdrlen = (char *) xdr->p - (char *) iov->iov_base;
 	recvd = rcvbuf->len - hdrlen;
 	if (pglen > recvd)
 		pglen = recvd;
@@ -3990,8 +4179,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 		return status;
 
 	/* Convert length of symlink */
-	READ_BUF(4);
-	READ32(len);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
@@ -4015,6 +4206,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	kaddr[len+rcvbuf->page_base] = '\0';
 	kunmap_atomic(kaddr, KM_USER0);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -4112,10 +4306,16 @@ static int decode_setattr(struct xdr_stream *xdr)
 	status = decode_op_hdr(xdr, OP_SETATTR);
 	if (status)
 		return status;
-	READ_BUF(4);
-	READ32(bmlen);
-	READ_BUF(bmlen << 2);
-	return 0;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	bmlen = be32_to_cpup(p);
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
@@ -4124,35 +4324,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	uint32_t opnum;
 	int32_t nfserr;
 
-	READ_BUF(8);
-	READ32(opnum);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	opnum = be32_to_cpup(p++);
 	if (opnum != OP_SETCLIENTID) {
 		dprintk("nfs: decode_setclientid: Server returned operation"
 			" %d\n", opnum);
 		return -EIO;
 	}
-	READ32(nfserr);
+	nfserr = be32_to_cpup(p);
 	if (nfserr == NFS_OK) {
-		READ_BUF(8 + NFS4_VERIFIER_SIZE);
-		READ64(clp->cl_clientid);
-		COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
+		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+		if (unlikely(!p))
+			goto out_overflow;
+		p = xdr_decode_hyper(p, &clp->cl_clientid);
+		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
 		/* skip netid string */
-		READ_BUF(4);
-		READ32(len);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 
 		/* skip uaddr string */
-		READ_BUF(4);
-		READ32(len);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		return -NFSERR_CLID_INUSE;
 	} else
 		return nfs4_stat_to_errno(nfserr);
 
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -4169,11 +4384,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 	if (status)
 		return status;
 
-	READ_BUF(16);
-	READ32(res->count);
-	READ32(res->verf->committed);
-	COPYMEM(res->verf->verifier, 8);
+	p = xdr_inline_decode(xdr, 16);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->count = be32_to_cpup(p++);
+	res->verf->committed = be32_to_cpup(p++);
+	memcpy(res->verf->verifier, p, 8);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_delegreturn(struct xdr_stream *xdr)
@@ -4187,6 +4407,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	uint32_t dummy;
+	char *dummy_str;
 	int status;
 	struct nfs_client *clp = res->client;
 
@@ -4194,36 +4415,45 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	if (status)
 		return status;
 
-	READ_BUF(8);
-	READ64(clp->cl_ex_clid);
-	READ_BUF(12);
-	READ32(clp->cl_seqid);
-	READ32(clp->cl_exchange_flags);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	xdr_decode_hyper(p, &clp->cl_ex_clid);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
+	clp->cl_seqid = be32_to_cpup(p++);
+	clp->cl_exchange_flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
-	READ32(dummy);
+	dummy = be32_to_cpup(p);
 	if (dummy != SP4_NONE)
 		return -EIO;
 
 	/* Throw away minor_id */
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 
 	/* Throw away Major id */
-	READ_BUF(4);
-	READ32(dummy);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	/* Throw away server_scope */
-	READ_BUF(4);
-	READ32(dummy);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	/* Throw away Implementation id array */
-	READ_BUF(4);
-	READ32(dummy);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -4232,22 +4462,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	__be32 *p;
 	u32 nr_attrs;
 
-	READ_BUF(28);
-	READ32(attrs->headerpadsz);
-	READ32(attrs->max_rqst_sz);
-	READ32(attrs->max_resp_sz);
-	READ32(attrs->max_resp_sz_cached);
-	READ32(attrs->max_ops);
-	READ32(attrs->max_reqs);
-	READ32(nr_attrs);
+	p = xdr_inline_decode(xdr, 28);
+	if (unlikely(!p))
+		goto out_overflow;
+	attrs->headerpadsz = be32_to_cpup(p++);
+	attrs->max_rqst_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz_cached = be32_to_cpup(p++);
+	attrs->max_ops = be32_to_cpup(p++);
+	attrs->max_reqs = be32_to_cpup(p++);
+	nr_attrs = be32_to_cpup(p);
 	if (unlikely(nr_attrs > 1)) {
 		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
 			__func__, nr_attrs);
 		return -EINVAL;
 	}
-	if (nr_attrs == 1)
-		READ_BUF(4); /* skip rdma_attrs */
+	if (nr_attrs == 1) {
+		p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+		if (unlikely(!p))
+			goto out_overflow;
+	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+{
+	return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
 }
 
 static int decode_create_session(struct xdr_stream *xdr,
@@ -4259,24 +4502,26 @@ static int decode_create_session(struct xdr_stream *xdr,
 	struct nfs4_session *session = clp->cl_session;
 
 	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
-
-	if (status)
+	if (!status)
+		status = decode_sessionid(xdr, &session->sess_id);
+	if (unlikely(status))
 		return status;
 
-	/* sessionid */
-	READ_BUF(NFS4_MAX_SESSIONID_LEN);
-	COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
-
 	/* seqid, flags */
-	READ_BUF(8);
-	READ32(clp->cl_seqid);
-	READ32(session->flags);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	clp->cl_seqid = be32_to_cpup(p++);
+	session->flags = be32_to_cpup(p);
 
 	/* Channel attributes */
 	status = decode_chan_attrs(xdr, &session->fc_attrs);
 	if (!status)
 		status = decode_chan_attrs(xdr, &session->bc_attrs);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -4300,7 +4545,9 @@ static int decode_sequence(struct xdr_stream *xdr,
 		return 0;
 
 	status = decode_op_hdr(xdr, OP_SEQUENCE);
-	if (status)
+	if (!status)
+		status = decode_sessionid(xdr, &id);
+	if (unlikely(status))
 		goto out_err;
 
 	/*
@@ -4309,36 +4556,43 @@ static int decode_sequence(struct xdr_stream *xdr,
 	 */
 	status = -ESERVERFAULT;
 
-	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
-	READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
-	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
 	if (memcmp(id.data, res->sr_session->sess_id.data,
 	    NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("%s Invalid session id\n", __func__);
 		goto out_err;
 	}
+
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
+
 	/* seqid */
-	READ32(dummy);
+	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
+	dummy = be32_to_cpup(p++);
 	if (dummy != slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out_err;
 	}
 	/* slot id */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != res->sr_slotid) {
 		dprintk("%s Invalid slot id\n", __func__);
 		goto out_err;
 	}
 	/* highest slot id - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	/* target highest slot id - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	/* result flags - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p);
 	status = 0;
 out_err:
 	res->sr_status = status;
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	status = -EIO;
+	goto out_err;
 #else /* CONFIG_NFS_V4_1 */
 	return 0;
 #endif /* CONFIG_NFS_V4_1 */
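The SEQUENCE decoder above keeps its validation order under the new helpers: the session id is read and compared first, then the remaining five XDR words (seqid, slot id, highest slot id, target highest slot id, result flags) are reserved with a single 20-byte xdr_inline_decode(), and an overflow routes through out_overflow so sr_status is still set on the way out through out_err. A compact, self-contained sketch of that reply check; the names and the expected-values struct are illustrative:

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

struct toy_xdr {	/* as in the earlier sketches */
	const uint8_t *p;
	const uint8_t *end;
};

static const uint8_t *toy_inline_decode(struct toy_xdr *xdr, size_t nbytes)
{
	const uint8_t *p = xdr->p;

	if ((size_t)(xdr->end - p) < nbytes)
		return NULL;
	xdr->p = p + nbytes;
	return p;
}

#define TOY_SESSIONID_LEN	16

struct toy_seq_expect {
	uint8_t sess_id[TOY_SESSIONID_LEN];
	uint32_t seq_nr;
	uint32_t slotid;
};

static int toy_decode_sequence(struct toy_xdr *xdr,
			       const struct toy_seq_expect *want)
{
	const uint8_t *p;
	uint32_t be;

	/* 1. Session id first, via the fixed-size opaque helper shape. */
	p = toy_inline_decode(xdr, TOY_SESSIONID_LEN);
	if (!p || memcmp(p, want->sess_id, TOY_SESSIONID_LEN))
		return -1;
	/* 2. Then seqid through result flags, five words in one reserve. */
	p = toy_inline_decode(xdr, 20);
	if (!p)
		return -1;
	memcpy(&be, p, 4);
	if (ntohl(be) != want->seq_nr)		/* seqid */
		return -1;
	memcpy(&be, p + 4, 4);
	if (ntohl(be) != want->slotid)		/* slot id */
		return -1;
	/* Highest/target slot id and result flags: read, not processed. */
	return 0;
}

int main(void)
{
	uint8_t wire[TOY_SESSIONID_LEN + 20] = { 0 };
	struct toy_seq_expect want = { { 0 }, 0, 0 };
	struct toy_xdr xdr = { wire, wire + sizeof(wire) };

	return toy_decode_sequence(&xdr, &want);	/* 0: reply accepted */
}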
@@ -4370,7 +4624,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
 	status = decode_open_downgrade(&xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4397,7 +4652,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
 	status = decode_access(&xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4424,7 +4680,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) != 0)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server);
+	status = decode_getfattr(&xdr, res->fattr, res->server
+			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4448,7 +4705,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if ((status = decode_putrootfh(&xdr)) != 0)
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server);
+		status = decode_getfattr(&xdr, res->fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4473,7 +4731,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
 		goto out;
 	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
 		goto out;
-	decode_getfattr(&xdr, &res->dir_attr, res->server);
+	decode_getfattr(&xdr, &res->dir_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
out:
 	return status;
 }
@@ -4503,11 +4762,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
 	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server);
+	decode_getfattr(&xdr, res->old_fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4540,11 +4801,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
 	 * Note order: OP_LINK leaves the directory as the current
 	 * filehandle.
 	 */
-	if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->dir_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4573,11 +4836,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->fattr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_fattr, res->server);
+	decode_getfattr(&xdr, res->dir_fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4609,7 +4874,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server);
+	status = decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4716,7 +4982,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	 *	an ESTALE error. Shouldn't be a problem,
 	 *	though, since fattr->valid will remain unset.
 	 */
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4748,11 +5015,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
 		goto out;
 	if (decode_getfh(&xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->f_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if (decode_restorefh(&xdr) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server);
+	decode_getfattr(&xdr, res->dir_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4800,7 +5069,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	status = decode_open(&xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->f_attr, res->server);
+	decode_getfattr(&xdr, res->f_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4827,7 +5097,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
 	status = decode_setattr(&xdr);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -5001,7 +5272,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
 	status = decode_write(&xdr, res);
5002 if (status) 5273 if (status)
5003 goto out; 5274 goto out;
5004 decode_getfattr(&xdr, res->fattr, res->server); 5275 decode_getfattr(&xdr, res->fattr, res->server,
5276 !RPC_IS_ASYNC(rqstp->rq_task));
5005 if (!status) 5277 if (!status)
5006 status = res->count; 5278 status = res->count;
5007out: 5279out:
@@ -5030,7 +5302,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
5030 status = decode_commit(&xdr, res); 5302 status = decode_commit(&xdr, res);
5031 if (status) 5303 if (status)
5032 goto out; 5304 goto out;
5033 decode_getfattr(&xdr, res->fattr, res->server); 5305 decode_getfattr(&xdr, res->fattr, res->server,
5306 !RPC_IS_ASYNC(rqstp->rq_task));
5034out: 5307out:
5035 return status; 5308 return status;
5036} 5309}
@@ -5194,7 +5467,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5194 if (status != 0) 5467 if (status != 0)
5195 goto out; 5468 goto out;
5196 status = decode_delegreturn(&xdr); 5469 status = decode_delegreturn(&xdr);
5197 decode_getfattr(&xdr, res->fattr, res->server); 5470 decode_getfattr(&xdr, res->fattr, res->server,
5471 !RPC_IS_ASYNC(rqstp->rq_task));
5198out: 5472out:
5199 return status; 5473 return status;
5200} 5474}
@@ -5222,7 +5496,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
5222 goto out; 5496 goto out;
5223 xdr_enter_page(&xdr, PAGE_SIZE); 5497 xdr_enter_page(&xdr, PAGE_SIZE);
5224 status = decode_getfattr(&xdr, &res->fs_locations->fattr, 5498 status = decode_getfattr(&xdr, &res->fs_locations->fattr,
5225 res->fs_locations->server); 5499 res->fs_locations->server,
5500 !RPC_IS_ASYNC(req->rq_task));
5226out: 5501out:
5227 return status; 5502 return status;
5228} 5503}
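The fourth argument threaded through every decoder above is a "may sleep"
flag. A minimal sketch of the idea, assuming a helper along these lines
(the name and signature are illustrative, not from the patch): attribute
decoding can require an idmapper upcall to map "user@domain" strings to
numeric ids, which may block, and rpciod callbacks must never sleep, so
asynchronous decodes pass !RPC_IS_ASYNC(task) == 0 and defer the mapping.

	/* illustrative only: how a decoder might honour the flag */
	static int decode_owner(const struct nfs_server *server,
				const char *name, size_t namelen,
				__u32 *uid, int may_sleep)
	{
		if (!may_sleep)
			return -EAGAIN;	/* pick the value up via a later, blocking GETATTR */
		return nfs_map_name_to_uid(server->nfs_client, name, namelen, uid);
	}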
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7be72d90d49d..ef583854d8d0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -32,7 +32,6 @@
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/time.h> 33#include <linux/time.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/utsname.h>
36#include <linux/errno.h> 35#include <linux/errno.h>
37#include <linux/string.h> 36#include <linux/string.h>
38#include <linux/in.h> 37#include <linux/in.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0b4cbdc60abd..29786d3b9326 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -73,7 +73,7 @@ enum {
73 Opt_cto, Opt_nocto, 73 Opt_cto, Opt_nocto,
74 Opt_ac, Opt_noac, 74 Opt_ac, Opt_noac,
75 Opt_lock, Opt_nolock, 75 Opt_lock, Opt_nolock,
76 Opt_v2, Opt_v3, 76 Opt_v2, Opt_v3, Opt_v4,
77 Opt_udp, Opt_tcp, Opt_rdma, 77 Opt_udp, Opt_tcp, Opt_rdma,
78 Opt_acl, Opt_noacl, 78 Opt_acl, Opt_noacl,
79 Opt_rdirplus, Opt_nordirplus, 79 Opt_rdirplus, Opt_nordirplus,
@@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = {
127 { Opt_nolock, "nolock" }, 127 { Opt_nolock, "nolock" },
128 { Opt_v2, "v2" }, 128 { Opt_v2, "v2" },
129 { Opt_v3, "v3" }, 129 { Opt_v3, "v3" },
130 { Opt_v4, "v4" },
130 { Opt_udp, "udp" }, 131 { Opt_udp, "udp" },
131 { Opt_tcp, "tcp" }, 132 { Opt_tcp, "tcp" },
132 { Opt_rdma, "rdma" }, 133 { Opt_rdma, "rdma" },
@@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = {
158 { Opt_mountvers, "mountvers=%s" }, 159 { Opt_mountvers, "mountvers=%s" },
159 { Opt_nfsvers, "nfsvers=%s" }, 160 { Opt_nfsvers, "nfsvers=%s" },
160 { Opt_nfsvers, "vers=%s" }, 161 { Opt_nfsvers, "vers=%s" },
161 { Opt_minorversion, "minorversion=%u" }, 162 { Opt_minorversion, "minorversion=%s" },
162 163
163 { Opt_sec, "sec=%s" }, 164 { Opt_sec, "sec=%s" },
164 { Opt_proto, "proto=%s" }, 165 { Opt_proto, "proto=%s" },
@@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = {
272}; 273};
273 274
274#ifdef CONFIG_NFS_V4 275#ifdef CONFIG_NFS_V4
276static int nfs4_validate_text_mount_data(void *options,
277 struct nfs_parsed_mount_data *args, const char *dev_name);
278static int nfs4_try_mount(int flags, const char *dev_name,
279 struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
275static int nfs4_get_sb(struct file_system_type *fs_type, 280static int nfs4_get_sb(struct file_system_type *fs_type,
276 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 281 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
277static int nfs4_remote_get_sb(struct file_system_type *fs_type, 282static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -723,6 +728,27 @@ static void nfs_umount_begin(struct super_block *sb)
723 unlock_kernel(); 728 unlock_kernel();
724} 729}
725 730
731static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags)
732{
733 struct nfs_parsed_mount_data *data;
734
735 data = kzalloc(sizeof(*data), GFP_KERNEL);
736 if (data) {
737 data->flags = flags;
738 data->rsize = NFS_MAX_FILE_IO_SIZE;
739 data->wsize = NFS_MAX_FILE_IO_SIZE;
740 data->acregmin = NFS_DEF_ACREGMIN;
741 data->acregmax = NFS_DEF_ACREGMAX;
742 data->acdirmin = NFS_DEF_ACDIRMIN;
743 data->acdirmax = NFS_DEF_ACDIRMAX;
744 data->nfs_server.port = NFS_UNSPEC_PORT;
745 data->auth_flavors[0] = RPC_AUTH_UNIX;
746 data->auth_flavor_len = 1;
747 data->minorversion = 0;
748 }
749 return data;
750}
751
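A sketch of how a caller is expected to use the new helper (error
handling abbreviated); the flag argument seeds data->flags, and every
other field gets the defaults set above:

	struct nfs_parsed_mount_data *data;

	data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
	if (data == NULL)
		return -ENOMEM;
	/* ... parse options into *data, kfree(data) when done ... */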
726/* 752/*
727 * Sanity-check a server address provided by the mount command. 753 * Sanity-check a server address provided by the mount command.
728 * 754 *
@@ -742,127 +768,23 @@ static int nfs_verify_server_address(struct sockaddr *addr)
742 } 768 }
743 } 769 }
744 770
771 dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
745 return 0; 772 return 0;
746} 773}
747 774
748static void nfs_parse_ipv4_address(char *string, size_t str_len,
749 struct sockaddr *sap, size_t *addr_len)
750{
751 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
752 u8 *addr = (u8 *)&sin->sin_addr.s_addr;
753
754 if (str_len <= INET_ADDRSTRLEN) {
755 dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
756 (int)str_len, string);
757
758 sin->sin_family = AF_INET;
759 *addr_len = sizeof(*sin);
760 if (in4_pton(string, str_len, addr, '\0', NULL))
761 return;
762 }
763
764 sap->sa_family = AF_UNSPEC;
765 *addr_len = 0;
766}
767
768#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
769static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
770 const char *delim,
771 struct sockaddr_in6 *sin6)
772{
773 char *p;
774 size_t len;
775
776 if ((string + str_len) == delim)
777 return 1;
778
779 if (*delim != IPV6_SCOPE_DELIMITER)
780 return 0;
781
782 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
783 return 0;
784
785 len = (string + str_len) - delim - 1;
786 p = kstrndup(delim + 1, len, GFP_KERNEL);
787 if (p) {
788 unsigned long scope_id = 0;
789 struct net_device *dev;
790
791 dev = dev_get_by_name(&init_net, p);
792 if (dev != NULL) {
793 scope_id = dev->ifindex;
794 dev_put(dev);
795 } else {
796 if (strict_strtoul(p, 10, &scope_id) == 0) {
797 kfree(p);
798 return 0;
799 }
800 }
801
802 kfree(p);
803
804 sin6->sin6_scope_id = scope_id;
805 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
806 return 1;
807 }
808
809 return 0;
810}
811
812static void nfs_parse_ipv6_address(char *string, size_t str_len,
813 struct sockaddr *sap, size_t *addr_len)
814{
815 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
816 u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
817 const char *delim;
818
819 if (str_len <= INET6_ADDRSTRLEN) {
820 dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
821 (int)str_len, string);
822
823 sin6->sin6_family = AF_INET6;
824 *addr_len = sizeof(*sin6);
825 if (in6_pton(string, str_len, addr,
826 IPV6_SCOPE_DELIMITER, &delim) != 0) {
827 if (nfs_parse_ipv6_scope_id(string, str_len,
828 delim, sin6) != 0)
829 return;
830 }
831 }
832
833 sap->sa_family = AF_UNSPEC;
834 *addr_len = 0;
835}
836#else
837static void nfs_parse_ipv6_address(char *string, size_t str_len,
838 struct sockaddr *sap, size_t *addr_len)
839{
840 sap->sa_family = AF_UNSPEC;
841 *addr_len = 0;
842}
843#endif
844
845/* 775/*
846 * Construct a sockaddr based on the contents of a string that contains 776 * Select between a default port value and a user-specified port value.
847 * an IP address in presentation format. 777 * If a zero value is set, then autobind will be used.
848 *
849 * If there is a problem constructing the new sockaddr, set the address
850 * family to AF_UNSPEC.
851 */ 778 */
852void nfs_parse_ip_address(char *string, size_t str_len, 779static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
853 struct sockaddr *sap, size_t *addr_len) 780 const unsigned short default_port)
854{ 781{
855 unsigned int i, colons; 782 unsigned short port = default_port;
856 783
857 colons = 0; 784 if (parsed_port != NFS_UNSPEC_PORT)
858 for (i = 0; i < str_len; i++) 785 port = parsed_port;
859 if (string[i] == ':')
860 colons++;
861 786
862 if (colons >= 2) 787 rpc_set_port(sap, port);
863 nfs_parse_ipv6_address(string, str_len, sap, addr_len);
864 else
865 nfs_parse_ipv4_address(string, str_len, sap, addr_len);
866} 788}
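Two of the converted call sites in this patch, annotated as a reading
aid (no further changes implied):

	/* mount protocol: default port 0 lets the rpcbind client autobind */
	nfs_set_default_port(request.sap, args->mount_server.port, 0);

	/* NFSv4 text mounts: fall back to the well-known port, NFS_PORT (2049) */
	nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);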
867 789
868/* 790/*
@@ -904,8 +826,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
904 826
905/* 827/*
906 * Parse the value of the 'sec=' option. 828 * Parse the value of the 'sec=' option.
907 *
908 * The flavor_len setting is for v4 mounts.
909 */ 829 */
910static int nfs_parse_security_flavors(char *value, 830static int nfs_parse_security_flavors(char *value,
911 struct nfs_parsed_mount_data *mnt) 831 struct nfs_parsed_mount_data *mnt)
@@ -916,53 +836,43 @@ static int nfs_parse_security_flavors(char *value,
916 836
917 switch (match_token(value, nfs_secflavor_tokens, args)) { 837 switch (match_token(value, nfs_secflavor_tokens, args)) {
918 case Opt_sec_none: 838 case Opt_sec_none:
919 mnt->auth_flavor_len = 0;
920 mnt->auth_flavors[0] = RPC_AUTH_NULL; 839 mnt->auth_flavors[0] = RPC_AUTH_NULL;
921 break; 840 break;
922 case Opt_sec_sys: 841 case Opt_sec_sys:
923 mnt->auth_flavor_len = 0;
924 mnt->auth_flavors[0] = RPC_AUTH_UNIX; 842 mnt->auth_flavors[0] = RPC_AUTH_UNIX;
925 break; 843 break;
926 case Opt_sec_krb5: 844 case Opt_sec_krb5:
927 mnt->auth_flavor_len = 1;
928 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; 845 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
929 break; 846 break;
930 case Opt_sec_krb5i: 847 case Opt_sec_krb5i:
931 mnt->auth_flavor_len = 1;
932 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; 848 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
933 break; 849 break;
934 case Opt_sec_krb5p: 850 case Opt_sec_krb5p:
935 mnt->auth_flavor_len = 1;
936 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; 851 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
937 break; 852 break;
938 case Opt_sec_lkey: 853 case Opt_sec_lkey:
939 mnt->auth_flavor_len = 1;
940 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; 854 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
941 break; 855 break;
942 case Opt_sec_lkeyi: 856 case Opt_sec_lkeyi:
943 mnt->auth_flavor_len = 1;
944 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; 857 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
945 break; 858 break;
946 case Opt_sec_lkeyp: 859 case Opt_sec_lkeyp:
947 mnt->auth_flavor_len = 1;
948 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; 860 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
949 break; 861 break;
950 case Opt_sec_spkm: 862 case Opt_sec_spkm:
951 mnt->auth_flavor_len = 1;
952 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; 863 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
953 break; 864 break;
954 case Opt_sec_spkmi: 865 case Opt_sec_spkmi:
955 mnt->auth_flavor_len = 1;
956 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; 866 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
957 break; 867 break;
958 case Opt_sec_spkmp: 868 case Opt_sec_spkmp:
959 mnt->auth_flavor_len = 1;
960 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; 869 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
961 break; 870 break;
962 default: 871 default:
963 return 0; 872 return 0;
964 } 873 }
965 874
875 mnt->auth_flavor_len = 1;
966 return 1; 876 return 1;
967} 877}
968 878
@@ -1001,7 +911,6 @@ static int nfs_parse_mount_options(char *raw,
1001 while ((p = strsep(&raw, ",")) != NULL) { 911 while ((p = strsep(&raw, ",")) != NULL) {
1002 substring_t args[MAX_OPT_ARGS]; 912 substring_t args[MAX_OPT_ARGS];
1003 unsigned long option; 913 unsigned long option;
1004 int int_option;
1005 int token; 914 int token;
1006 915
1007 if (!*p) 916 if (!*p)
@@ -1047,10 +956,18 @@ static int nfs_parse_mount_options(char *raw,
1047 break; 956 break;
1048 case Opt_v2: 957 case Opt_v2:
1049 mnt->flags &= ~NFS_MOUNT_VER3; 958 mnt->flags &= ~NFS_MOUNT_VER3;
959 mnt->version = 2;
1050 break; 960 break;
1051 case Opt_v3: 961 case Opt_v3:
1052 mnt->flags |= NFS_MOUNT_VER3; 962 mnt->flags |= NFS_MOUNT_VER3;
963 mnt->version = 3;
964 break;
965#ifdef CONFIG_NFS_V4
966 case Opt_v4:
967 mnt->flags &= ~NFS_MOUNT_VER3;
968 mnt->version = 4;
1053 break; 969 break;
970#endif
1054 case Opt_udp: 971 case Opt_udp:
1055 mnt->flags &= ~NFS_MOUNT_TCP; 972 mnt->flags &= ~NFS_MOUNT_TCP;
1056 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 973 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1264,20 +1181,33 @@ static int nfs_parse_mount_options(char *raw,
1264 switch (option) { 1181 switch (option) {
1265 case NFS2_VERSION: 1182 case NFS2_VERSION:
1266 mnt->flags &= ~NFS_MOUNT_VER3; 1183 mnt->flags &= ~NFS_MOUNT_VER3;
1184 mnt->version = 2;
1267 break; 1185 break;
1268 case NFS3_VERSION: 1186 case NFS3_VERSION:
1269 mnt->flags |= NFS_MOUNT_VER3; 1187 mnt->flags |= NFS_MOUNT_VER3;
1188 mnt->version = 3;
1189 break;
1190#ifdef CONFIG_NFS_V4
1191 case NFS4_VERSION:
1192 mnt->flags &= ~NFS_MOUNT_VER3;
1193 mnt->version = 4;
1270 break; 1194 break;
1195#endif
1271 default: 1196 default:
1272 goto out_invalid_value; 1197 goto out_invalid_value;
1273 } 1198 }
1274 break; 1199 break;
1275 case Opt_minorversion: 1200 case Opt_minorversion:
1276 if (match_int(args, &int_option)) 1201 string = match_strdup(args);
1277 return 0; 1202 if (string == NULL)
1278 if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION) 1203 goto out_nomem;
1279 return 0; 1204 rc = strict_strtoul(string, 10, &option);
1280 mnt->minorversion = int_option; 1205 kfree(string);
1206 if (rc != 0)
1207 goto out_invalid_value;
1208 if (option > NFS4_MAX_MINOR_VERSION)
1209 goto out_invalid_value;
1210 mnt->minorversion = option;
1281 break; 1211 break;
1282 1212
1283 /* 1213 /*
@@ -1352,11 +1282,14 @@ static int nfs_parse_mount_options(char *raw,
1352 string = match_strdup(args); 1282 string = match_strdup(args);
1353 if (string == NULL) 1283 if (string == NULL)
1354 goto out_nomem; 1284 goto out_nomem;
1355 nfs_parse_ip_address(string, strlen(string), 1285 mnt->nfs_server.addrlen =
1356 (struct sockaddr *) 1286 rpc_pton(string, strlen(string),
1357 &mnt->nfs_server.address, 1287 (struct sockaddr *)
1358 &mnt->nfs_server.addrlen); 1288 &mnt->nfs_server.address,
1289 sizeof(mnt->nfs_server.address));
1359 kfree(string); 1290 kfree(string);
1291 if (mnt->nfs_server.addrlen == 0)
1292 goto out_invalid_address;
1360 break; 1293 break;
1361 case Opt_clientaddr: 1294 case Opt_clientaddr:
1362 string = match_strdup(args); 1295 string = match_strdup(args);
@@ -1376,11 +1309,14 @@ static int nfs_parse_mount_options(char *raw,
1376 string = match_strdup(args); 1309 string = match_strdup(args);
1377 if (string == NULL) 1310 if (string == NULL)
1378 goto out_nomem; 1311 goto out_nomem;
1379 nfs_parse_ip_address(string, strlen(string), 1312 mnt->mount_server.addrlen =
1380 (struct sockaddr *) 1313 rpc_pton(string, strlen(string),
1381 &mnt->mount_server.address, 1314 (struct sockaddr *)
1382 &mnt->mount_server.addrlen); 1315 &mnt->mount_server.address,
1316 sizeof(mnt->mount_server.address));
1383 kfree(string); 1317 kfree(string);
1318 if (mnt->mount_server.addrlen == 0)
1319 goto out_invalid_address;
1384 break; 1320 break;
1385 case Opt_lookupcache: 1321 case Opt_lookupcache:
1386 string = match_strdup(args); 1322 string = match_strdup(args);
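Both address options now go through the shared sunrpc parser instead of
the private IPv4/IPv6 helpers deleted above. A hedged usage sketch (the
literals are examples only):

	struct sockaddr_storage ss;
	size_t salen;

	salen = rpc_pton("192.168.1.7", 11, (struct sockaddr *)&ss, sizeof(ss));
	if (salen == 0)
		goto out_invalid_address;	/* neither valid IPv4 nor IPv6 */
	/* link-local IPv6 with a scope id, e.g. "fe80::1%eth0", also parses */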
@@ -1432,8 +1368,11 @@ static int nfs_parse_mount_options(char *raw,
1432 1368
1433 return 1; 1369 return 1;
1434 1370
1371out_invalid_address:
1372 printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
1373 return 0;
1435out_invalid_value: 1374out_invalid_value:
1436 printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p); 1375 printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
1437 return 0; 1376 return 0;
1438out_nomem: 1377out_nomem:
1439 printk(KERN_INFO "NFS: not enough memory to parse option\n"); 1378 printk(KERN_INFO "NFS: not enough memory to parse option\n");
@@ -1445,13 +1384,60 @@ out_security_failure:
1445} 1384}
1446 1385
1447/* 1386/*
1387 * Match the requested auth flavors with the list returned by
1388 * the server. Returns zero and sets the mount's authentication
1389 * flavor on success; returns -EACCES if server does not support
1390 * the requested flavor.
1391 */
1392static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
1393 struct nfs_mount_request *request)
1394{
1395 unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
1396
1397 /*
1398 * Certain releases of Linux's mountd return an empty
1399 * flavor list. To prevent behavioral regression with
1400 * these servers (i.e. rejecting mounts that used to
1401 * succeed), revert to pre-2.6.32 behavior (no checking)
1402 * if the returned flavor list is empty.
1403 */
1404 if (server_authlist_len == 0)
1405 return 0;
1406
1407 /*
1408 * We avoid sophisticated negotiating here, as there are
1409 * plenty of cases where we can get it wrong, providing
1410 * either too little or too much security.
1411 *
1412 * RFC 2623, section 2.7 suggests we SHOULD prefer the
1413 * flavor listed first. However, some servers list
1414 * AUTH_NULL first. Our caller plants AUTH_SYS, the
1415 * preferred default, in args->auth_flavors[0] if the user
1416 * didn't specify a sec= mount option.
1417 */
1418 for (i = 0; i < args->auth_flavor_len; i++)
1419 for (j = 0; j < server_authlist_len; j++)
1420 if (args->auth_flavors[i] == request->auth_flavs[j]) {
1421 dfprintk(MOUNT, "NFS: using auth flavor %d\n",
1422 request->auth_flavs[j]);
1423 args->auth_flavors[0] = request->auth_flavs[j];
1424 return 0;
1425 }
1426
1427 dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
1428 nfs_umount(request);
1429 return -EACCES;
1430}
1431
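A worked example with made-up values: suppose the server's MNTv3 reply
lists AUTH_NULL ahead of AUTH_UNIX.

	rpc_authflavor_t server_authlist[] = { RPC_AUTH_NULL, RPC_AUTH_UNIX };
	/* args->auth_flavor_len == 1 and args->auth_flavors[0] == RPC_AUTH_UNIX
	 * (the planted default): the walk honours the client's ordering, so
	 * nfs_walk_authlist() returns 0 and the mount proceeds with AUTH_SYS,
	 * not with the AUTH_NULL the server happened to list first. */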
1432/*
1448 * Use the remote server's MOUNT service to request the NFS file handle 1433 * Use the remote server's MOUNT service to request the NFS file handle
1449 * corresponding to the provided path. 1434 * corresponding to the provided path.
1450 */ 1435 */
1451static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1436static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1452 struct nfs_fh *root_fh) 1437 struct nfs_fh *root_fh)
1453{ 1438{
1454 unsigned int auth_flavor_len = 0; 1439 rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
1440 unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
1455 struct nfs_mount_request request = { 1441 struct nfs_mount_request request = {
1456 .sap = (struct sockaddr *) 1442 .sap = (struct sockaddr *)
1457 &args->mount_server.address, 1443 &args->mount_server.address,
@@ -1459,15 +1445,19 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1459 .protocol = args->mount_server.protocol, 1445 .protocol = args->mount_server.protocol,
1460 .fh = root_fh, 1446 .fh = root_fh,
1461 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1447 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1462 .auth_flav_len = &auth_flavor_len, 1448 .auth_flav_len = &server_authlist_len,
1449 .auth_flavs = server_authlist,
1463 }; 1450 };
1464 int status; 1451 int status;
1465 1452
1466 if (args->mount_server.version == 0) { 1453 if (args->mount_server.version == 0) {
1467 if (args->flags & NFS_MOUNT_VER3) 1454 switch (args->version) {
1468 args->mount_server.version = NFS_MNT3_VERSION; 1455 default:
1469 else 1456 args->mount_server.version = NFS_MNT3_VERSION;
1470 args->mount_server.version = NFS_MNT_VERSION; 1457 break;
1458 case 2:
1459 args->mount_server.version = NFS_MNT_VERSION;
1460 }
1471 } 1461 }
1472 request.version = args->mount_server.version; 1462 request.version = args->mount_server.version;
1473 1463
@@ -1485,23 +1475,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1485 args->mount_server.addrlen = args->nfs_server.addrlen; 1475 args->mount_server.addrlen = args->nfs_server.addrlen;
1486 } 1476 }
1487 request.salen = args->mount_server.addrlen; 1477 request.salen = args->mount_server.addrlen;
1488 1478 nfs_set_default_port(request.sap, args->mount_server.port, 0);
1489 /*
1490 * autobind will be used if mount_server.port == 0
1491 */
1492 nfs_set_port(request.sap, args->mount_server.port);
1493 1479
1494 /* 1480 /*
1495 * Now ask the mount server to map our export path 1481 * Now ask the mount server to map our export path
1496 * to a file handle. 1482 * to a file handle.
1497 */ 1483 */
1498 status = nfs_mount(&request); 1484 status = nfs_mount(&request);
1499 if (status == 0) 1485 if (status != 0) {
1500 return 0; 1486 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1487 request.hostname, status);
1488 return status;
1489 }
1501 1490
1502 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", 1491 /*
1503 request.hostname, status); 1492 * MNTv1 (NFSv2) does not support auth flavor negotiation.
1504 return status; 1493 */
1494 if (args->mount_server.version != NFS_MNT3_VERSION)
1495 return 0;
1496 return nfs_walk_authlist(args, &request);
1505} 1497}
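The restructured tail of nfs_try_mount() also encodes a protocol fact
worth spelling out: only MNTv3 (used for NFSv3) returns an auth flavor
list to walk, so MNTv1 mounts return success as soon as nfs_mount()
gives back the root file handle.

	/* reading aid: the version selection above, in table form
	 *   args->version == 2  ->  NFS_MNT_VERSION  (MNTv1, no negotiation)
	 *   anything else       ->  NFS_MNT3_VERSION (MNTv3, walk authlist)
	 */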
1506 1498
1507static int nfs_parse_simple_hostname(const char *dev_name, 1499static int nfs_parse_simple_hostname(const char *dev_name,
@@ -1661,22 +1653,11 @@ static int nfs_validate_mount_data(void *options,
1661 const char *dev_name) 1653 const char *dev_name)
1662{ 1654{
1663 struct nfs_mount_data *data = (struct nfs_mount_data *)options; 1655 struct nfs_mount_data *data = (struct nfs_mount_data *)options;
1656 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
1664 1657
1665 if (data == NULL) 1658 if (data == NULL)
1666 goto out_no_data; 1659 goto out_no_data;
1667 1660
1668 args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
1669 args->rsize = NFS_MAX_FILE_IO_SIZE;
1670 args->wsize = NFS_MAX_FILE_IO_SIZE;
1671 args->acregmin = NFS_DEF_ACREGMIN;
1672 args->acregmax = NFS_DEF_ACREGMAX;
1673 args->acdirmin = NFS_DEF_ACDIRMIN;
1674 args->acdirmax = NFS_DEF_ACDIRMAX;
1675 args->mount_server.port = 0; /* autobind unless user sets port */
1676 args->nfs_server.port = 0; /* autobind unless user sets port */
1677 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1678 args->auth_flavors[0] = RPC_AUTH_UNIX;
1679
1680 switch (data->version) { 1661 switch (data->version) {
1681 case 1: 1662 case 1:
1682 data->namlen = 0; 1663 data->namlen = 0;
@@ -1697,8 +1678,11 @@ static int nfs_validate_mount_data(void *options,
1697 if (data->root.size > NFS3_FHSIZE || data->root.size == 0) 1678 if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
1698 goto out_invalid_fh; 1679 goto out_invalid_fh;
1699 mntfh->size = data->root.size; 1680 mntfh->size = data->root.size;
1700 } else 1681 args->version = 3;
1682 } else {
1701 mntfh->size = NFS2_FHSIZE; 1683 mntfh->size = NFS2_FHSIZE;
1684 args->version = 2;
1685 }
1702 1686
1703 1687
1704 memcpy(mntfh->data, data->root.data, mntfh->size); 1688 memcpy(mntfh->data, data->root.data, mntfh->size);
@@ -1720,15 +1704,15 @@ static int nfs_validate_mount_data(void *options,
1720 args->acdirmin = data->acdirmin; 1704 args->acdirmin = data->acdirmin;
1721 args->acdirmax = data->acdirmax; 1705 args->acdirmax = data->acdirmax;
1722 1706
1723 memcpy(&args->nfs_server.address, &data->addr, 1707 memcpy(sap, &data->addr, sizeof(data->addr));
1724 sizeof(data->addr));
1725 args->nfs_server.addrlen = sizeof(data->addr); 1708 args->nfs_server.addrlen = sizeof(data->addr);
1726 if (!nfs_verify_server_address((struct sockaddr *) 1709 if (!nfs_verify_server_address(sap))
1727 &args->nfs_server.address))
1728 goto out_no_address; 1710 goto out_no_address;
1729 1711
1730 if (!(data->flags & NFS_MOUNT_TCP)) 1712 if (!(data->flags & NFS_MOUNT_TCP))
1731 args->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1713 args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1714 else
1715 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1732 /* N.B. caller will free nfs_server.hostname in all cases */ 1716 /* N.B. caller will free nfs_server.hostname in all cases */
1733 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); 1717 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
1734 args->namlen = data->namlen; 1718 args->namlen = data->namlen;
@@ -1772,12 +1756,18 @@ static int nfs_validate_mount_data(void *options,
1772 if (nfs_parse_mount_options((char *)options, args) == 0) 1756 if (nfs_parse_mount_options((char *)options, args) == 0)
1773 return -EINVAL; 1757 return -EINVAL;
1774 1758
1775 if (!nfs_verify_server_address((struct sockaddr *) 1759 if (!nfs_verify_server_address(sap))
1776 &args->nfs_server.address))
1777 goto out_no_address; 1760 goto out_no_address;
1778 1761
1779 nfs_set_port((struct sockaddr *)&args->nfs_server.address, 1762 if (args->version == 4)
1780 args->nfs_server.port); 1763#ifdef CONFIG_NFS_V4
1764 return nfs4_validate_text_mount_data(options,
1765 args, dev_name);
1766#else
1767 goto out_v4_not_compiled;
1768#endif
1769
1770 nfs_set_default_port(sap, args->nfs_server.port, 0);
1781 1771
1782 nfs_set_mount_transport_protocol(args); 1772 nfs_set_mount_transport_protocol(args);
1783 1773
@@ -1800,7 +1790,7 @@ static int nfs_validate_mount_data(void *options,
1800 } 1790 }
1801 1791
1802#ifndef CONFIG_NFS_V3 1792#ifndef CONFIG_NFS_V3
1803 if (args->flags & NFS_MOUNT_VER3) 1793 if (args->version == 3)
1804 goto out_v3_not_compiled; 1794 goto out_v3_not_compiled;
1805#endif /* !CONFIG_NFS_V3 */ 1795#endif /* !CONFIG_NFS_V3 */
1806 1796
@@ -1825,6 +1815,12 @@ out_v3_not_compiled:
1825 return -EPROTONOSUPPORT; 1815 return -EPROTONOSUPPORT;
1826#endif /* !CONFIG_NFS_V3 */ 1816#endif /* !CONFIG_NFS_V3 */
1827 1817
1818#ifndef CONFIG_NFS_V4
1819out_v4_not_compiled:
1820 dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
1821 return -EPROTONOSUPPORT;
1822#endif /* !CONFIG_NFS_V4 */
1823
1828out_nomem: 1824out_nomem:
1829 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); 1825 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
1830 return -ENOMEM; 1826 return -ENOMEM;
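The net effect for a text-based mount (the options string here is
hypothetical):

	/* mount -t nfs -o v4,clientaddr=192.168.1.5 server:/export /mnt */

nfs_validate_mount_data() now notices args->version == 4 and, when
CONFIG_NFS_V4 is set, hands the request to
nfs4_validate_text_mount_data(); without it, the mount fails cleanly
with -EPROTONOSUPPORT through the new out_v4_not_compiled label.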
@@ -1934,6 +1930,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
1934 if (server->flags & NFS_MOUNT_NOAC) 1930 if (server->flags & NFS_MOUNT_NOAC)
1935 sb->s_flags |= MS_SYNCHRONOUS; 1931 sb->s_flags |= MS_SYNCHRONOUS;
1936 1932
1933 sb->s_bdi = &server->backing_dev_info;
1934
1937 nfs_super_set_maxbytes(sb, server->maxfilesize); 1935 nfs_super_set_maxbytes(sb, server->maxfilesize);
1938} 1936}
1939 1937
@@ -1950,7 +1948,7 @@ static void nfs_fill_super(struct super_block *sb,
1950 if (data->bsize) 1948 if (data->bsize)
1951 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); 1949 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
1952 1950
1953 if (server->flags & NFS_MOUNT_VER3) { 1951 if (server->nfs_client->rpc_ops->version == 3) {
1954 /* The VFS shouldn't apply the umask to mode bits. We will do 1952 /* The VFS shouldn't apply the umask to mode bits. We will do
1955 * so ourselves when necessary. 1953 * so ourselves when necessary.
1956 */ 1954 */
@@ -1974,7 +1972,7 @@ static void nfs_clone_super(struct super_block *sb,
1974 sb->s_blocksize = old_sb->s_blocksize; 1972 sb->s_blocksize = old_sb->s_blocksize;
1975 sb->s_maxbytes = old_sb->s_maxbytes; 1973 sb->s_maxbytes = old_sb->s_maxbytes;
1976 1974
1977 if (server->flags & NFS_MOUNT_VER3) { 1975 if (server->nfs_client->rpc_ops->version == 3) {
1978 /* The VFS shouldn't apply the umask to mode bits. We will do 1976 /* The VFS shouldn't apply the umask to mode bits. We will do
1979 * so ourselves when necessary. 1977 * so ourselves when necessary.
1980 */ 1978 */
@@ -2108,7 +2106,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2108 }; 2106 };
2109 int error = -ENOMEM; 2107 int error = -ENOMEM;
2110 2108
2111 data = kzalloc(sizeof(*data), GFP_KERNEL); 2109 data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
2112 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2110 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
2113 if (data == NULL || mntfh == NULL) 2111 if (data == NULL || mntfh == NULL)
2114 goto out_free_fh; 2112 goto out_free_fh;
@@ -2120,6 +2118,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2120 if (error < 0) 2118 if (error < 0)
2121 goto out; 2119 goto out;
2122 2120
2121#ifdef CONFIG_NFS_V4
2122 if (data->version == 4) {
2123 error = nfs4_try_mount(flags, dev_name, data, mnt);
2124 kfree(data->client_address);
2125 goto out;
2126 }
2127#endif /* CONFIG_NFS_V4 */
2128
2123 /* Get a volume representation */ 2129 /* Get a volume representation */
2124 server = nfs_create_server(data, mntfh); 2130 server = nfs_create_server(data, mntfh);
2125 if (IS_ERR(server)) { 2131 if (IS_ERR(server)) {
@@ -2150,7 +2156,8 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2150 if (!s->s_root) { 2156 if (!s->s_root) {
2151 /* initial superblock/root creation */ 2157 /* initial superblock/root creation */
2152 nfs_fill_super(s, data); 2158 nfs_fill_super(s, data);
2153 nfs_fscache_get_super_cookie(s, data); 2159 nfs_fscache_get_super_cookie(
2160 s, data ? data->fscache_uniq : NULL, NULL);
2154 } 2161 }
2155 2162
2156 mntroot = nfs_get_root(s, mntfh); 2163 mntroot = nfs_get_root(s, mntfh);
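The changed call sites imply a widened helper, presumably along these
lines (an assumption to be checked against fs/nfs/fscache.h, not part
of this hunk):

	void nfs_fscache_get_super_cookie(struct super_block *sb,
					  const char *uniq,
					  struct nfs_clone_mount *mntdata);

Fresh mounts pass the user-supplied fsc= uniquifier and no clone data;
the clone and referral paths further down pass the nfs_clone_mount
instead.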
@@ -2196,8 +2203,8 @@ static void nfs_kill_super(struct super_block *s)
2196{ 2203{
2197 struct nfs_server *server = NFS_SB(s); 2204 struct nfs_server *server = NFS_SB(s);
2198 2205
2199 bdi_unregister(&server->backing_dev_info);
2200 kill_anon_super(s); 2206 kill_anon_super(s);
2207 bdi_unregister(&server->backing_dev_info);
2201 nfs_fscache_release_super_cookie(s); 2208 nfs_fscache_release_super_cookie(s);
2202 nfs_free_server(server); 2209 nfs_free_server(server);
2203} 2210}
@@ -2251,6 +2258,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2251 if (!s->s_root) { 2258 if (!s->s_root) {
2252 /* initial superblock/root creation */ 2259 /* initial superblock/root creation */
2253 nfs_clone_super(s, data->sb); 2260 nfs_clone_super(s, data->sb);
2261 nfs_fscache_get_super_cookie(s, NULL, data);
2254 } 2262 }
2255 2263
2256 mntroot = nfs_get_root(s, data->fh); 2264 mntroot = nfs_get_root(s, data->fh);
@@ -2317,6 +2325,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
2317 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); 2325 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
2318} 2326}
2319 2327
2328static int nfs4_validate_text_mount_data(void *options,
2329 struct nfs_parsed_mount_data *args,
2330 const char *dev_name)
2331{
2332 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
2333
2334 nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
2335
2336 nfs_validate_transport_protocol(args);
2337
2338 nfs4_validate_mount_flags(args);
2339
2340 if (args->version != 4) {
2341 dfprintk(MOUNT,
2342 "NFS4: Illegal mount version\n");
2343 return -EINVAL;
2344 }
2345
2346 if (args->auth_flavor_len > 1) {
2347 dfprintk(MOUNT,
2348 "NFS4: Too many RPC auth flavours specified\n");
2349 return -EINVAL;
2350 }
2351
2352 if (args->client_address == NULL) {
2353 dfprintk(MOUNT,
2354 "NFS4: mount program didn't pass callback address\n");
2355 return -EINVAL;
2356 }
2357
2358 return nfs_parse_devname(dev_name,
2359 &args->nfs_server.hostname,
2360 NFS4_MAXNAMLEN,
2361 &args->nfs_server.export_path,
2362 NFS4_MAXPATHLEN);
2363}
2364
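What the validator above expects from a text-based v4 mount, with
hypothetical values:

	/* dev_name "server:/export" is split by nfs_parse_devname() into
	 * hostname "server" and export_path "/export"; a missing
	 * clientaddr= option (normally supplied by mount(8) for the
	 * callback channel) fails the mount with -EINVAL before any RPC
	 * is sent. */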
2320/* 2365/*
2321 * Validate NFSv4 mount options 2366 * Validate NFSv4 mount options
2322 */ 2367 */
@@ -2324,36 +2369,24 @@ static int nfs4_validate_mount_data(void *options,
2324 struct nfs_parsed_mount_data *args, 2369 struct nfs_parsed_mount_data *args,
2325 const char *dev_name) 2370 const char *dev_name)
2326{ 2371{
2327 struct sockaddr_in *ap; 2372 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
2328 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; 2373 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
2329 char *c; 2374 char *c;
2330 2375
2331 if (data == NULL) 2376 if (data == NULL)
2332 goto out_no_data; 2377 goto out_no_data;
2333 2378
2334 args->rsize = NFS_MAX_FILE_IO_SIZE; 2379 args->version = 4;
2335 args->wsize = NFS_MAX_FILE_IO_SIZE;
2336 args->acregmin = NFS_DEF_ACREGMIN;
2337 args->acregmax = NFS_DEF_ACREGMAX;
2338 args->acdirmin = NFS_DEF_ACDIRMIN;
2339 args->acdirmax = NFS_DEF_ACDIRMAX;
2340 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
2341 args->auth_flavors[0] = RPC_AUTH_UNIX;
2342 args->auth_flavor_len = 0;
2343 args->minorversion = 0;
2344
2345 switch (data->version) { 2380 switch (data->version) {
2346 case 1: 2381 case 1:
2347 ap = (struct sockaddr_in *)&args->nfs_server.address;
2348 if (data->host_addrlen > sizeof(args->nfs_server.address)) 2382 if (data->host_addrlen > sizeof(args->nfs_server.address))
2349 goto out_no_address; 2383 goto out_no_address;
2350 if (data->host_addrlen == 0) 2384 if (data->host_addrlen == 0)
2351 goto out_no_address; 2385 goto out_no_address;
2352 args->nfs_server.addrlen = data->host_addrlen; 2386 args->nfs_server.addrlen = data->host_addrlen;
2353 if (copy_from_user(ap, data->host_addr, data->host_addrlen)) 2387 if (copy_from_user(sap, data->host_addr, data->host_addrlen))
2354 return -EFAULT; 2388 return -EFAULT;
2355 if (!nfs_verify_server_address((struct sockaddr *) 2389 if (!nfs_verify_server_address(sap))
2356 &args->nfs_server.address))
2357 goto out_no_address; 2390 goto out_no_address;
2358 2391
2359 if (data->auth_flavourlen) { 2392 if (data->auth_flavourlen) {
@@ -2399,39 +2432,14 @@ static int nfs4_validate_mount_data(void *options,
2399 nfs_validate_transport_protocol(args); 2432 nfs_validate_transport_protocol(args);
2400 2433
2401 break; 2434 break;
2402 default: { 2435 default:
2403 int status;
2404
2405 if (nfs_parse_mount_options((char *)options, args) == 0) 2436 if (nfs_parse_mount_options((char *)options, args) == 0)
2406 return -EINVAL; 2437 return -EINVAL;
2407 2438
2408 if (!nfs_verify_server_address((struct sockaddr *) 2439 if (!nfs_verify_server_address(sap))
2409 &args->nfs_server.address))
2410 return -EINVAL; 2440 return -EINVAL;
2411 2441
2412 nfs_set_port((struct sockaddr *)&args->nfs_server.address, 2442 return nfs4_validate_text_mount_data(options, args, dev_name);
2413 args->nfs_server.port);
2414
2415 nfs_validate_transport_protocol(args);
2416
2417 nfs4_validate_mount_flags(args);
2418
2419 if (args->auth_flavor_len > 1)
2420 goto out_inval_auth;
2421
2422 if (args->client_address == NULL)
2423 goto out_no_client_address;
2424
2425 status = nfs_parse_devname(dev_name,
2426 &args->nfs_server.hostname,
2427 NFS4_MAXNAMLEN,
2428 &args->nfs_server.export_path,
2429 NFS4_MAXPATHLEN);
2430 if (status < 0)
2431 return status;
2432
2433 break;
2434 }
2435 } 2443 }
2436 2444
2437 return 0; 2445 return 0;
@@ -2448,10 +2456,6 @@ out_inval_auth:
2448out_no_address: 2456out_no_address:
2449 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); 2457 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
2450 return -EINVAL; 2458 return -EINVAL;
2451
2452out_no_client_address:
2453 dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n");
2454 return -EINVAL;
2455} 2459}
2456 2460
2457/* 2461/*
@@ -2507,7 +2511,8 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2507 if (!s->s_root) { 2511 if (!s->s_root) {
2508 /* initial superblock/root creation */ 2512 /* initial superblock/root creation */
2509 nfs4_fill_super(s); 2513 nfs4_fill_super(s);
2510 nfs_fscache_get_super_cookie(s, data); 2514 nfs_fscache_get_super_cookie(
2515 s, data ? data->fscache_uniq : NULL, NULL);
2511 } 2516 }
2512 2517
2513 mntroot = nfs4_get_root(s, mntfh); 2518 mntroot = nfs4_get_root(s, mntfh);
@@ -2618,6 +2623,34 @@ out_err:
2618 return ret; 2623 return ret;
2619} 2624}
2620 2625
2626static int nfs4_try_mount(int flags, const char *dev_name,
2627 struct nfs_parsed_mount_data *data,
2628 struct vfsmount *mnt)
2629{
2630 char *export_path;
2631 struct vfsmount *root_mnt;
2632 int error;
2633
2634 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
2635
2636 export_path = data->nfs_server.export_path;
2637 data->nfs_server.export_path = "/";
2638 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
2639 data->nfs_server.hostname);
2640 data->nfs_server.export_path = export_path;
2641
2642 error = PTR_ERR(root_mnt);
2643 if (IS_ERR(root_mnt))
2644 goto out;
2645
2646 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2647
2648out:
2649 dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error,
2650 error != 0 ? " [error]" : "");
2651 return error;
2652}
2653
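An illustrative flow for the helper above: given dev_name
"server:/export/home" (hypothetical), data->nfs_server.export_path is
temporarily swapped to "/" so that the server's pseudo-root is mounted
first; nfs_follow_remote_path() then walks down to "/export/home",
crossing any referrals or submounts on the way as ordinary automounts,
before the export_path pointer is restored for the caller.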
2621/* 2654/*
2622 * Get the superblock for an NFS4 mountpoint 2655 * Get the superblock for an NFS4 mountpoint
2623 */ 2656 */
@@ -2625,11 +2658,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2625 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2658 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2626{ 2659{
2627 struct nfs_parsed_mount_data *data; 2660 struct nfs_parsed_mount_data *data;
2628 char *export_path;
2629 struct vfsmount *root_mnt;
2630 int error = -ENOMEM; 2661 int error = -ENOMEM;
2631 2662
2632 data = kzalloc(sizeof(*data), GFP_KERNEL); 2663 data = nfs_alloc_parsed_mount_data(0);
2633 if (data == NULL) 2664 if (data == NULL)
2634 goto out_free_data; 2665 goto out_free_data;
2635 2666
@@ -2638,17 +2669,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2638 if (error < 0) 2669 if (error < 0)
2639 goto out; 2670 goto out;
2640 2671
2641 export_path = data->nfs_server.export_path; 2672 error = nfs4_try_mount(flags, dev_name, data, mnt);
2642 data->nfs_server.export_path = "/";
2643 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
2644 data->nfs_server.hostname);
2645 data->nfs_server.export_path = export_path;
2646
2647 error = PTR_ERR(root_mnt);
2648 if (IS_ERR(root_mnt))
2649 goto out;
2650
2651 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2652 2673
2653out: 2674out:
2654 kfree(data->client_address); 2675 kfree(data->client_address);
@@ -2724,6 +2745,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2724 if (!s->s_root) { 2745 if (!s->s_root) {
2725 /* initial superblock/root creation */ 2746 /* initial superblock/root creation */
2726 nfs4_clone_super(s, data->sb); 2747 nfs4_clone_super(s, data->sb);
2748 nfs_fscache_get_super_cookie(s, NULL, data);
2727 } 2749 }
2728 2750
2729 mntroot = nfs4_get_root(s, data->fh); 2751 mntroot = nfs4_get_root(s, data->fh);
@@ -2805,6 +2827,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2805 if (!s->s_root) { 2827 if (!s->s_root) {
2806 /* initial superblock/root creation */ 2828 /* initial superblock/root creation */
2807 nfs4_fill_super(s); 2829 nfs4_fill_super(s);
2830 nfs_fscache_get_super_cookie(s, NULL, data);
2808 } 2831 }
2809 2832
2810 mntroot = nfs4_get_root(s, &mntfh); 2833 mntroot = nfs4_get_root(s, &mntfh);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a34fae21fe10..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/migrate.h>
16 17
17#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
18#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
@@ -26,6 +27,7 @@
26#include "internal.h" 27#include "internal.h"
27#include "iostat.h" 28#include "iostat.h"
28#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h"
29 31
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 32#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 33
@@ -218,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page)
218 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 220 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
219} 221}
220 222
221/* 223static struct nfs_page *nfs_find_and_lock_request(struct page *page)
222 * Find an associated nfs write request, and prepare to flush it out
223 * May return an error if the user signalled nfs_wait_on_request().
224 */
225static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
226 struct page *page)
227{ 224{
228 struct inode *inode = page->mapping->host; 225 struct inode *inode = page->mapping->host;
229 struct nfs_page *req; 226 struct nfs_page *req;
230 int ret; 227 int ret;
231 228
232 spin_lock(&inode->i_lock); 229 spin_lock(&inode->i_lock);
233 for(;;) { 230 for (;;) {
234 req = nfs_page_find_request_locked(page); 231 req = nfs_page_find_request_locked(page);
235 if (req == NULL) { 232 if (req == NULL)
236 spin_unlock(&inode->i_lock); 233 break;
237 return 0;
238 }
239 if (nfs_set_page_tag_locked(req)) 234 if (nfs_set_page_tag_locked(req))
240 break; 235 break;
241 /* Note: If we hold the page lock, as is the case in nfs_writepage, 236 /* Note: If we hold the page lock, as is the case in nfs_writepage,
@@ -247,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
247 ret = nfs_wait_on_request(req); 242 ret = nfs_wait_on_request(req);
248 nfs_release_request(req); 243 nfs_release_request(req);
249 if (ret != 0) 244 if (ret != 0)
250 return ret; 245 return ERR_PTR(ret);
251 spin_lock(&inode->i_lock); 246 spin_lock(&inode->i_lock);
252 } 247 }
253 if (test_bit(PG_CLEAN, &req->wb_flags)) {
254 spin_unlock(&inode->i_lock);
255 BUG();
256 }
257 if (nfs_set_page_writeback(page) != 0) {
258 spin_unlock(&inode->i_lock);
259 BUG();
260 }
261 spin_unlock(&inode->i_lock); 248 spin_unlock(&inode->i_lock);
249 return req;
250}
251
252/*
253 * Find an associated nfs write request, and prepare to flush it out
254 * May return an error if the user signalled nfs_wait_on_request().
255 */
256static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
257 struct page *page)
258{
259 struct nfs_page *req;
260 int ret = 0;
261
262 req = nfs_find_and_lock_request(page);
263 if (!req)
264 goto out;
265 ret = PTR_ERR(req);
266 if (IS_ERR(req))
267 goto out;
268
269 ret = nfs_set_page_writeback(page);
270 BUG_ON(ret != 0);
271 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
272
262 if (!nfs_pageio_add_request(pgio, req)) { 273 if (!nfs_pageio_add_request(pgio, req)) {
263 nfs_redirty_request(req); 274 nfs_redirty_request(req);
264 return pgio->pg_error; 275 ret = pgio->pg_error;
265 } 276 }
266 return 0; 277out:
278 return ret;
267} 279}
268 280
269static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 281static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
@@ -1478,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1478 .nr_to_write = LONG_MAX, 1490 .nr_to_write = LONG_MAX,
1479 .range_start = 0, 1491 .range_start = 0,
1480 .range_end = LLONG_MAX, 1492 .range_end = LLONG_MAX,
1481 .for_writepages = 1,
1482 }; 1493 };
1483 1494
1484 return __nfs_write_mapping(mapping, &wbc, how); 1495 return __nfs_write_mapping(mapping, &wbc, how);
@@ -1580,6 +1591,41 @@ int nfs_wb_page(struct inode *inode, struct page* page)
1580 return nfs_wb_page_priority(inode, page, FLUSH_STABLE); 1591 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1581} 1592}
1582 1593
1594#ifdef CONFIG_MIGRATION
1595int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1596 struct page *page)
1597{
1598 struct nfs_page *req;
1599 int ret;
1600
1601 if (PageFsCache(page))
1602 nfs_fscache_release_page(page, GFP_KERNEL);
1603
1604 req = nfs_find_and_lock_request(page);
1605 ret = PTR_ERR(req);
1606 if (IS_ERR(req))
1607 goto out;
1608
1609 ret = migrate_page(mapping, newpage, page);
1610 if (!req)
1611 goto out;
1612 if (ret)
1613 goto out_unlock;
1614 page_cache_get(newpage);
1615 req->wb_page = newpage;
1616 SetPagePrivate(newpage);
1617 set_page_private(newpage, page_private(page));
1618 ClearPagePrivate(page);
1619 set_page_private(page, 0);
1620 page_cache_release(page);
1621out_unlock:
1622 nfs_clear_page_tag_locked(req);
1623 nfs_release_request(req);
1624out:
1625 return ret;
1626}
1627#endif
1628
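The matching wiring lives in the file's address_space_operations
(assumed from the surrounding series rather than shown in this hunk):

	#ifdef CONFIG_MIGRATION
		.migratepage	= nfs_migrate_page,
	#endif

so that a dirty NFS page carrying a pending nfs_page request can be
moved during compaction or memory hot-remove instead of pinning the old
page until writeback completes.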
1583int __init nfs_init_writepagecache(void) 1629int __init nfs_init_writepagecache(void)
1584{ 1630{
1585 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1631 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 5573508f707f..36fcabbf5186 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
34 int flags = nfsexp_flags(rqstp, exp); 34 int flags = nfsexp_flags(rqstp, exp);
35 int ret; 35 int ret;
36 36
37 validate_process_creds();
38
37 /* discard any old override before preparing the new set */ 39 /* discard any old override before preparing the new set */
38 revert_creds(get_cred(current->real_cred)); 40 revert_creds(get_cred(current->real_cred));
39 new = prepare_creds(); 41 new = prepare_creds();
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
86 else 88 else
87 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 89 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
88 new->cap_permitted); 90 new->cap_permitted);
91 validate_process_creds();
89 put_cred(override_creds(new)); 92 put_cred(override_creds(new));
90 put_cred(new); 93 put_cred(new);
94 validate_process_creds();
91 return 0; 95 return 0;
92 96
93oom: 97oom:
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b92a27629fb7..c1c9e035d4a4 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd,
85 (*bpp)[-1] = '\n'; 85 (*bpp)[-1] = '\n';
86} 86}
87 87
88static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
89{
90 return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
91}
92
88static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); 93static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
89static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); 94static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
90static struct cache_detail svc_expkey_cache; 95static struct cache_detail svc_expkey_cache;
@@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = {
259 .hash_table = expkey_table, 264 .hash_table = expkey_table,
260 .name = "nfsd.fh", 265 .name = "nfsd.fh",
261 .cache_put = expkey_put, 266 .cache_put = expkey_put,
262 .cache_request = expkey_request, 267 .cache_upcall = expkey_upcall,
263 .cache_parse = expkey_parse, 268 .cache_parse = expkey_parse,
264 .cache_show = expkey_show, 269 .cache_show = expkey_show,
265 .match = expkey_match, 270 .match = expkey_match,
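The same conversion is applied to both caches in this file: the sunrpc
cache core now invokes a ->cache_upcall method rather than calling
->cache_request directly, and sunrpc_cache_pipe_upcall() preserves the
old pipe-based behaviour while letting individual caches substitute a
different upcall mechanism. The one-line wrapper pattern:

	static int example_upcall(struct cache_detail *cd, struct cache_head *h)
	{
		return sunrpc_cache_pipe_upcall(cd, h, example_request);
	}

(The example_* names are placeholders for the expkey and svc_export
pairs above and below.)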
@@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd,
355 (*bpp)[-1] = '\n'; 360 (*bpp)[-1] = '\n';
356} 361}
357 362
363static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
364{
365 return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
366}
367
358static struct svc_export *svc_export_update(struct svc_export *new, 368static struct svc_export *svc_export_update(struct svc_export *new,
359 struct svc_export *old); 369 struct svc_export *old);
360static struct svc_export *svc_export_lookup(struct svc_export *); 370static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = {
724 .hash_table = export_table, 734 .hash_table = export_table,
725 .name = "nfsd.export", 735 .name = "nfsd.export",
726 .cache_put = svc_export_put, 736 .cache_put = svc_export_put,
727 .cache_request = svc_export_request, 737 .cache_upcall = svc_export_upcall,
728 .cache_parse = svc_export_parse, 738 .cache_parse = svc_export_parse,
729 .cache_show = svc_export_show, 739 .cache_show = svc_export_show,
730 .match = svc_export_match, 740 .match = svc_export_match,
@@ -1331,6 +1341,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1331 if (rv) 1341 if (rv)
1332 goto out; 1342 goto out;
1333 rv = check_nfsd_access(exp, rqstp); 1343 rv = check_nfsd_access(exp, rqstp);
1344 if (rv)
1345 fh_put(fhp);
1334out: 1346out:
1335 exp_put(exp); 1347 exp_put(exp);
1336 return rv; 1348 return rv;
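A note on the one-line fix above: fh_compose() earlier in
exp_pseudoroot() takes references into *fhp, and callers treat an error
return as "fhp untouched", so failing check_nfsd_access() without the
added fh_put() would appear to leak those dentry/export references.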
@@ -1505,7 +1517,7 @@ static int e_show(struct seq_file *m, void *p)
1505 return svc_export_show(m, &svc_export_cache, cp); 1517 return svc_export_show(m, &svc_export_cache, cp);
1506} 1518}
1507 1519
1508struct seq_operations nfs_exports_op = { 1520const struct seq_operations nfs_exports_op = {
1509 .start = e_start, 1521 .start = e_start,
1510 .next = e_next, 1522 .next = e_next,
1511 .stop = e_stop, 1523 .stop = e_stop,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 01d4ec1c88e0..edf926e1062f 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -814,17 +814,6 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
814 return p; 814 return p;
815} 815}
816 816
817static __be32 *
818encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
819 struct svc_fh *fhp)
820{
821 p = encode_post_op_attr(cd->rqstp, p, fhp);
822 *p++ = xdr_one; /* yes, a file handle follows */
823 p = encode_fh(p, fhp);
824 fh_put(fhp);
825 return p;
826}
827
828static int 817static int
829compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, 818compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
830 const char *name, int namlen) 819 const char *name, int namlen)
@@ -836,29 +825,54 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
836 dparent = cd->fh.fh_dentry; 825 dparent = cd->fh.fh_dentry;
837 exp = cd->fh.fh_export; 826 exp = cd->fh.fh_export;
838 827
839 fh_init(fhp, NFS3_FHSIZE);
840 if (isdotent(name, namlen)) { 828 if (isdotent(name, namlen)) {
841 if (namlen == 2) { 829 if (namlen == 2) {
842 dchild = dget_parent(dparent); 830 dchild = dget_parent(dparent);
843 if (dchild == dparent) { 831 if (dchild == dparent) {
844 /* filesystem root - cannot return filehandle for ".." */ 832 /* filesystem root - cannot return filehandle for ".." */
845 dput(dchild); 833 dput(dchild);
846 return 1; 834 return -ENOENT;
847 } 835 }
848 } else 836 } else
849 dchild = dget(dparent); 837 dchild = dget(dparent);
850 } else 838 } else
851 dchild = lookup_one_len(name, dparent, namlen); 839 dchild = lookup_one_len(name, dparent, namlen);
852 if (IS_ERR(dchild)) 840 if (IS_ERR(dchild))
853 return 1; 841 return -ENOENT;
854 if (d_mountpoint(dchild) || 842 rv = -ENOENT;
855 fh_compose(fhp, exp, dchild, &cd->fh) != 0 || 843 if (d_mountpoint(dchild))
856 !dchild->d_inode) 844 goto out;
857 rv = 1; 845 rv = fh_compose(fhp, exp, dchild, &cd->fh);
846 if (rv)
847 goto out;
848 if (!dchild->d_inode)
849 goto out;
850 rv = 0;
851out:
858 dput(dchild); 852 dput(dchild);
859 return rv; 853 return rv;
860} 854}
861 855
856__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
857{
858 struct svc_fh fh;
859 int err;
860
861 fh_init(&fh, NFS3_FHSIZE);
862 err = compose_entry_fh(cd, &fh, name, namlen);
863 if (err) {
864 *p++ = 0;
865 *p++ = 0;
866 goto out;
867 }
868 p = encode_post_op_attr(cd->rqstp, p, &fh);
869 *p++ = xdr_one; /* yes, a file handle follows */
870 p = encode_fh(p, &fh);
871out:
872 fh_put(&fh);
873 return p;
874}
875
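The XDR shape this preserves, stated informally: on any failure to
compose the child's file handle, the entry still decodes cleanly,
because the readdirplus baggage degrades to

	*p++ = 0;	/* post_op_attr: attributes do not follow */
	*p++ = 0;	/* name_handle:  no file handle follows   */

rather than the two fields being omitted outright.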
862/* 876/*
863 * Encode a directory entry. This one works for both normal readdir 877 * Encode a directory entry. This one works for both normal readdir
864 * and readdirplus. 878 * and readdirplus.
@@ -929,16 +943,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
929 943
930 p = encode_entry_baggage(cd, p, name, namlen, ino); 944 p = encode_entry_baggage(cd, p, name, namlen, ino);
931 945
932 /* throw in readdirplus baggage */ 946 if (plus)
933 if (plus) { 947 p = encode_entryplus_baggage(cd, p, name, namlen);
934 struct svc_fh fh;
935
936 if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
937 *p++ = 0;
938 *p++ = 0;
939 } else
940 p = encode_entryplus_baggage(cd, p, &fh);
941 }
942 num_entry_words = p - cd->buffer; 948 num_entry_words = p - cd->buffer;
943 } else if (cd->rqstp->rq_respages[pn+1] != NULL) { 949 } else if (cd->rqstp->rq_respages[pn+1] != NULL) {
944 /* temporarily encode entry into next page, then move back to 950 /* temporarily encode entry into next page, then move back to
@@ -951,17 +957,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
951 957
952 p1 = encode_entry_baggage(cd, p1, name, namlen, ino); 958 p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
953 959
954 /* throw in readdirplus baggage */ 960 if (plus)
955 if (plus) { 961 p = encode_entryplus_baggage(cd, p1, name, namlen);
956 struct svc_fh fh;
957
958 if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
959 /* zero out the filehandle */
960 *p1++ = 0;
961 *p1++ = 0;
962 } else
963 p1 = encode_entryplus_baggage(cd, p1, &fh);
964 }
965 962
966 /* determine entry word length and lengths to go in pages */ 963 /* determine entry word length and lengths to go in pages */
967 num_entry_words = p1 - tmp; 964 num_entry_words = p1 - tmp;
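For context on the two zero words written above when no filehandle can be composed: in NFSv3 READDIRPLUS each entry carries a post_op_attr and a post_op_fh3, and both open with an XDR boolean. A minimal userspace sketch of that encode-or-zero layout (illustrative only, not the kernel encoder; encode_entry_trailer is a made-up name):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* post_op_attr and post_op_fh3 each begin with a "follows" boolean;
 * when compose_entry_fh fails, both are encoded as FALSE (0). */
static uint32_t *encode_entry_trailer(uint32_t *p, int have_fh)
{
	if (!have_fh) {
		*p++ = htonl(0);	/* attributes_follow = FALSE */
		*p++ = htonl(0);	/* handle_follows = FALSE */
		return p;
	}
	*p++ = htonl(1);		/* attributes_follow = TRUE */
	/* ... fattr3 words would be encoded here ... */
	*p++ = htonl(1);		/* handle_follows = TRUE */
	/* ... nfs_fh3 words would be encoded here ... */
	return p;
}

int main(void)
{
	uint32_t buf[8];
	uint32_t *end = encode_entry_trailer(buf, 0);

	printf("failure case encodes %d words\n", (int)(end - buf));
	return 0;
}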
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 54b8b4140c8f..725d02f210e2 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -321,7 +321,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
321 deny = ~pas.group & pas.other; 321 deny = ~pas.group & pas.other;
322 if (deny) { 322 if (deny) {
323 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; 323 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
324 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; 324 ace->flag = eflag;
325 ace->access_mask = deny_mask_from_posix(deny, flags); 325 ace->access_mask = deny_mask_from_posix(deny, flags);
326 ace->whotype = NFS4_ACL_WHO_GROUP; 326 ace->whotype = NFS4_ACL_WHO_GROUP;
327 ace++; 327 ace++;
@@ -335,7 +335,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
335 if (deny) { 335 if (deny) {
336 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; 336 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
337 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; 337 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
338 ace->access_mask = mask_from_posix(deny, flags); 338 ace->access_mask = deny_mask_from_posix(deny, flags);
339 ace->whotype = NFS4_ACL_WHO_NAMED; 339 ace->whotype = NFS4_ACL_WHO_NAMED;
340 ace->who = pa->e_id; 340 ace->who = pa->e_id;
341 ace++; 341 ace++;
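A worked example of the deny computation these two hunks correct (my illustration, not kernel code): NFSv4 ACLs are evaluated first-match, so any right that "other" grants beyond the group mask must be explicitly denied to group members first, using the deny variant of the mask conversion:

#include <stdio.h>

int main(void)
{
	/* POSIX mode bits: group = r-- (04), other = rw- (06) */
	unsigned group = 04, other = 06;
	unsigned deny  = ~group & other & 07;	/* as in: ~pas.group & pas.other */

	/* deny = 0b010: group members must be denied write before the
	 * EVERYONE@ ALLOW ace would grant it to them */
	printf("deny mask: %o\n", deny);	/* prints 2 */
	return 0;
}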
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3fd23f7aceca..24e8d78f8dde 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -43,25 +43,30 @@
43#include <linux/sunrpc/xdr.h> 43#include <linux/sunrpc/xdr.h>
44#include <linux/sunrpc/svc.h> 44#include <linux/sunrpc/svc.h>
45#include <linux/sunrpc/clnt.h> 45#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/svcsock.h>
46#include <linux/nfsd/nfsd.h> 47#include <linux/nfsd/nfsd.h>
47#include <linux/nfsd/state.h> 48#include <linux/nfsd/state.h>
48#include <linux/sunrpc/sched.h> 49#include <linux/sunrpc/sched.h>
49#include <linux/nfs4.h> 50#include <linux/nfs4.h>
51#include <linux/sunrpc/xprtsock.h>
50 52
51#define NFSDDBG_FACILITY NFSDDBG_PROC 53#define NFSDDBG_FACILITY NFSDDBG_PROC
52 54
53#define NFSPROC4_CB_NULL 0 55#define NFSPROC4_CB_NULL 0
54#define NFSPROC4_CB_COMPOUND 1 56#define NFSPROC4_CB_COMPOUND 1
57#define NFS4_STATEID_SIZE 16
55 58
56/* Index of predefined Linux callback client operations */ 59/* Index of predefined Linux callback client operations */
57 60
58enum { 61enum {
59 NFSPROC4_CLNT_CB_NULL = 0, 62 NFSPROC4_CLNT_CB_NULL = 0,
60 NFSPROC4_CLNT_CB_RECALL, 63 NFSPROC4_CLNT_CB_RECALL,
64 NFSPROC4_CLNT_CB_SEQUENCE,
61}; 65};
62 66
63enum nfs_cb_opnum4 { 67enum nfs_cb_opnum4 {
64 OP_CB_RECALL = 4, 68 OP_CB_RECALL = 4,
69 OP_CB_SEQUENCE = 11,
65}; 70};
66 71
67#define NFS4_MAXTAGLEN 20 72#define NFS4_MAXTAGLEN 20
@@ -70,17 +75,29 @@ enum nfs_cb_opnum4 {
70#define NFS4_dec_cb_null_sz 0 75#define NFS4_dec_cb_null_sz 0
71#define cb_compound_enc_hdr_sz 4 76#define cb_compound_enc_hdr_sz 4
72#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) 77#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
78#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
79#define cb_sequence_enc_sz (sessionid_sz + 4 + \
80 1 /* no referring calls list yet */)
81#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
82
73#define op_enc_sz 1 83#define op_enc_sz 1
74#define op_dec_sz 2 84#define op_dec_sz 2
75#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) 85#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
76#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) 86#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2)
77#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ 87#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
88 cb_sequence_enc_sz + \
78 1 + enc_stateid_sz + \ 89 1 + enc_stateid_sz + \
79 enc_nfs4_fh_sz) 90 enc_nfs4_fh_sz)
80 91
81#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 92#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
93 cb_sequence_dec_sz + \
82 op_dec_sz) 94 op_dec_sz)
83 95
96struct nfs4_rpc_args {
97 void *args_op;
98 struct nfsd4_cb_sequence args_seq;
99};
100
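A note on the bookkeeping above: the _sz macros count 4-byte XDR words, while RESERVE_SPACE() in the encoders below takes bytes and rounds up to whole words. A quick check of the arithmetic, assuming NFS4_MAX_SESSIONID_LEN is 16 (the RFC 5661 sessionid4 size) and XDR_QUADLEN as defined by sunrpc:

#include <stdio.h>

#define NFS4_MAX_SESSIONID_LEN	16		/* sessionid4, RFC 5661 */
#define XDR_QUADLEN(l)		(((l) + 3) >> 2)

int main(void)
{
	int sessionid_sz = NFS4_MAX_SESSIONID_LEN >> 2;
	int reserve = 1 + NFS4_MAX_SESSIONID_LEN + 20;	/* see encode_cb_sequence */

	printf("sessionid_sz = %d words\n", sessionid_sz);	/* 4 */
	printf("reserve %d bytes -> %d words\n",
	       reserve, XDR_QUADLEN(reserve));			/* 37 -> 10 */
	return 0;
}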
84/* 101/*
85* Generic encode routines from fs/nfs/nfs4xdr.c 102* Generic encode routines from fs/nfs/nfs4xdr.c
86*/ 103*/
@@ -137,11 +154,13 @@ xdr_error: \
137} while (0) 154} while (0)
138 155
139struct nfs4_cb_compound_hdr { 156struct nfs4_cb_compound_hdr {
140 int status; 157 /* args */
141 u32 ident; 158 u32 ident; /* minorversion 0 only */
142 u32 nops; 159 u32 nops;
143 __be32 *nops_p; 160 __be32 *nops_p;
144 u32 minorversion; 161 u32 minorversion;
162 /* res */
163 int status;
145 u32 taglen; 164 u32 taglen;
146 char *tag; 165 char *tag;
147}; 166};
@@ -238,6 +257,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
238 hdr->nops++; 257 hdr->nops++;
239} 258}
240 259
260static void
261encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
262 struct nfs4_cb_compound_hdr *hdr)
263{
264 __be32 *p;
265
266 if (hdr->minorversion == 0)
267 return;
268
269 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
270
271 WRITE32(OP_CB_SEQUENCE);
272 WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
273 WRITE32(args->cbs_clp->cl_cb_seq_nr);
274 WRITE32(0); /* slotid, always 0 */
275 WRITE32(0); /* highest slotid always 0 */
276 WRITE32(0); /* cachethis always 0 */
277 WRITE32(0); /* FIXME: support referring_call_lists */
278 hdr->nops++;
279}
280
241static int 281static int
242nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 282nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
243{ 283{
@@ -249,15 +289,19 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
249} 289}
250 290
251static int 291static int
252nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args) 292nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
293 struct nfs4_rpc_args *rpc_args)
253{ 294{
254 struct xdr_stream xdr; 295 struct xdr_stream xdr;
296 struct nfs4_delegation *args = rpc_args->args_op;
255 struct nfs4_cb_compound_hdr hdr = { 297 struct nfs4_cb_compound_hdr hdr = {
256 .ident = args->dl_ident, 298 .ident = args->dl_ident,
299 .minorversion = rpc_args->args_seq.cbs_minorversion,
257 }; 300 };
258 301
259 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 302 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
260 encode_cb_compound_hdr(&xdr, &hdr); 303 encode_cb_compound_hdr(&xdr, &hdr);
304 encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
261 encode_cb_recall(&xdr, args, &hdr); 305 encode_cb_recall(&xdr, args, &hdr);
262 encode_cb_nops(&hdr); 306 encode_cb_nops(&hdr);
263 return 0; 307 return 0;
@@ -299,6 +343,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
299 return 0; 343 return 0;
300} 344}
301 345
346/*
347 * Our current back channel implementation supports a single backchannel
348 * with a single slot.
349 */
350static int
351decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
352 struct rpc_rqst *rqstp)
353{
354 struct nfs4_sessionid id;
355 int status;
356 u32 dummy;
357 __be32 *p;
358
359 if (res->cbs_minorversion == 0)
360 return 0;
361
362 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
363 if (status)
364 return status;
365
366 /*
367 * If the server returns different values for sessionID, slotID or
368 * sequence number, the server is looney tunes.
369 */
370 status = -ESERVERFAULT;
371
372 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
373 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
374 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
375 if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
376 NFS4_MAX_SESSIONID_LEN)) {
377 dprintk("%s Invalid session id\n", __func__);
378 goto out;
379 }
380 READ32(dummy);
381 if (dummy != res->cbs_clp->cl_cb_seq_nr) {
382 dprintk("%s Invalid sequence number\n", __func__);
383 goto out;
384 }
385 READ32(dummy); /* slotid must be 0 */
386 if (dummy != 0) {
387 dprintk("%s Invalid slotid\n", __func__);
388 goto out;
389 }
390 /* FIXME: process highest slotid and target highest slotid */
391 status = 0;
392out:
393 return status;
394}
395
396
302static int 397static int
303nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) 398nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
304{ 399{
@@ -306,7 +401,8 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
306} 401}
307 402
308static int 403static int
309nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p) 404nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
405 struct nfsd4_cb_sequence *seq)
310{ 406{
311 struct xdr_stream xdr; 407 struct xdr_stream xdr;
312 struct nfs4_cb_compound_hdr hdr; 408 struct nfs4_cb_compound_hdr hdr;
@@ -316,6 +412,11 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
316 status = decode_cb_compound_hdr(&xdr, &hdr); 412 status = decode_cb_compound_hdr(&xdr, &hdr);
317 if (status) 413 if (status)
318 goto out; 414 goto out;
415 if (seq) {
416 status = decode_cb_sequence(&xdr, seq, rqstp);
417 if (status)
418 goto out;
419 }
319 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); 420 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
320out: 421out:
321 return status; 422 return status;
@@ -377,16 +478,15 @@ static int max_cb_time(void)
377 478
378int setup_callback_client(struct nfs4_client *clp) 479int setup_callback_client(struct nfs4_client *clp)
379{ 480{
380 struct sockaddr_in addr;
381 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 481 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
382 struct rpc_timeout timeparms = { 482 struct rpc_timeout timeparms = {
383 .to_initval = max_cb_time(), 483 .to_initval = max_cb_time(),
384 .to_retries = 0, 484 .to_retries = 0,
385 }; 485 };
386 struct rpc_create_args args = { 486 struct rpc_create_args args = {
387 .protocol = IPPROTO_TCP, 487 .protocol = XPRT_TRANSPORT_TCP,
388 .address = (struct sockaddr *)&addr, 488 .address = (struct sockaddr *) &cb->cb_addr,
389 .addrsize = sizeof(addr), 489 .addrsize = cb->cb_addrlen,
390 .timeout = &timeparms, 490 .timeout = &timeparms,
391 .program = &cb_program, 491 .program = &cb_program,
392 .prognumber = cb->cb_prog, 492 .prognumber = cb->cb_prog,
@@ -399,13 +499,10 @@ int setup_callback_client(struct nfs4_client *clp)
399 499
400 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 500 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
401 return -EINVAL; 501 return -EINVAL;
402 502 if (cb->cb_minorversion) {
403 /* Initialize address */ 503 args.bc_xprt = clp->cl_cb_xprt;
404 memset(&addr, 0, sizeof(addr)); 504 args.protocol = XPRT_TRANSPORT_BC_TCP;
405 addr.sin_family = AF_INET; 505 }
406 addr.sin_port = htons(cb->cb_port);
407 addr.sin_addr.s_addr = htonl(cb->cb_addr);
408
409 /* Create RPC client */ 506 /* Create RPC client */
410 client = rpc_create(&args); 507 client = rpc_create(&args);
411 if (IS_ERR(client)) { 508 if (IS_ERR(client)) {
@@ -439,42 +536,29 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
439 .rpc_call_done = nfsd4_cb_probe_done, 536 .rpc_call_done = nfsd4_cb_probe_done,
440}; 537};
441 538
442static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb) 539static struct rpc_cred *callback_cred;
443{
444 struct auth_cred acred = {
445 .machine_cred = 1
446 };
447 540
448 /* 541int set_callback_cred(void)
449 * Note in the gss case this doesn't actually have to wait for a 542{
450 * gss upcall (or any calls to the client); this just creates a 543 callback_cred = rpc_lookup_machine_cred();
451 * non-uptodate cred which the rpc state machine will fill in with 544 if (!callback_cred)
452 * a refresh_upcall later. 545 return -ENOMEM;
453 */ 546 return 0;
454 return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
455 RPCAUTH_LOOKUP_NEW);
456} 547}
457 548
549
458void do_probe_callback(struct nfs4_client *clp) 550void do_probe_callback(struct nfs4_client *clp)
459{ 551{
460 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 552 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
461 struct rpc_message msg = { 553 struct rpc_message msg = {
462 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 554 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
463 .rpc_argp = clp, 555 .rpc_argp = clp,
556 .rpc_cred = callback_cred
464 }; 557 };
465 struct rpc_cred *cred;
466 int status; 558 int status;
467 559
468 cred = lookup_cb_cred(cb);
469 if (IS_ERR(cred)) {
470 status = PTR_ERR(cred);
471 goto out;
472 }
473 cb->cb_cred = cred;
474 msg.rpc_cred = cb->cb_cred;
475 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, 560 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
476 &nfsd4_cb_probe_ops, (void *)clp); 561 &nfsd4_cb_probe_ops, (void *)clp);
477out:
478 if (status) { 562 if (status) {
479 warn_no_callback_path(clp, status); 563 warn_no_callback_path(clp, status);
480 put_nfs4_client(clp); 564 put_nfs4_client(clp);
@@ -503,11 +587,95 @@ nfsd4_probe_callback(struct nfs4_client *clp)
503 do_probe_callback(clp); 587 do_probe_callback(clp);
504} 588}
505 589
590/*
591 * There's currently a single callback channel slot.
592 * If the slot is available, then mark it busy. Otherwise, put the
593 * thread to sleep on the callback RPC wait queue.
594 */
595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
596 struct rpc_task *task)
597{
598 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
599 u32 *ptr = (u32 *)clp->cl_sessionid.data;
600 int status = 0;
601
602 dprintk("%s: %u:%u:%u:%u\n", __func__,
603 ptr[0], ptr[1], ptr[2], ptr[3]);
604
605 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
606 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
607 dprintk("%s slot is busy\n", __func__);
608 status = -EAGAIN;
609 goto out;
610 }
611
612 /*
613 * We'll need the clp during XDR encoding and decoding,
614 * and the sequence during decoding to verify the reply
615 */
616 args->args_seq.cbs_clp = clp;
617 task->tk_msg.rpc_resp = &args->args_seq;
618
619out:
620 dprintk("%s status=%d\n", __func__, status);
621 return status;
622}
623
624/*
625 * TODO: cb_sequence should support referring call lists, cachethis, multiple
626 * slots, and mark callback channel down on communication errors.
627 */
628static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
629{
630 struct nfs4_delegation *dp = calldata;
631 struct nfs4_client *clp = dp->dl_client;
632 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
633 u32 minorversion = clp->cl_cb_conn.cb_minorversion;
634 int status = 0;
635
636 args->args_seq.cbs_minorversion = minorversion;
637 if (minorversion) {
638 status = nfsd41_cb_setup_sequence(clp, task);
639 if (status) {
640 if (status != -EAGAIN) {
641 /* terminate rpc task */
642 task->tk_status = status;
643 task->tk_action = NULL;
644 }
645 return;
646 }
647 }
648 rpc_call_start(task);
649}
650
651static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
652{
653 struct nfs4_delegation *dp = calldata;
654 struct nfs4_client *clp = dp->dl_client;
655
656 dprintk("%s: minorversion=%d\n", __func__,
657 clp->cl_cb_conn.cb_minorversion);
658
659 if (clp->cl_cb_conn.cb_minorversion) {
660 /* No need for lock, access serialized in nfsd4_cb_prepare */
661 ++clp->cl_cb_seq_nr;
662 clear_bit(0, &clp->cl_cb_slot_busy);
663 rpc_wake_up_next(&clp->cl_cb_waitq);
664 dprintk("%s: freed slot, new seqid=%d\n", __func__,
665 clp->cl_cb_seq_nr);
666
667 /* We're done looking into the sequence information */
668 task->tk_msg.rpc_resp = NULL;
669 }
670}
671
506static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 672static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
507{ 673{
508 struct nfs4_delegation *dp = calldata; 674 struct nfs4_delegation *dp = calldata;
509 struct nfs4_client *clp = dp->dl_client; 675 struct nfs4_client *clp = dp->dl_client;
510 676
677 nfsd4_cb_done(task, calldata);
678
511 switch (task->tk_status) { 679 switch (task->tk_status) {
512 case -EIO: 680 case -EIO:
513 /* Network partition? */ 681 /* Network partition? */
@@ -520,16 +688,19 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
520 break; 688 break;
521 default: 689 default:
522 /* success, or error we can't handle */ 690 /* success, or error we can't handle */
523 return; 691 goto done;
524 } 692 }
525 if (dp->dl_retries--) { 693 if (dp->dl_retries--) {
526 rpc_delay(task, 2*HZ); 694 rpc_delay(task, 2*HZ);
527 task->tk_status = 0; 695 task->tk_status = 0;
528 rpc_restart_call(task); 696 rpc_restart_call(task);
697 return;
529 } else { 698 } else {
530 atomic_set(&clp->cl_cb_conn.cb_set, 0); 699 atomic_set(&clp->cl_cb_conn.cb_set, 0);
531 warn_no_callback_path(clp, task->tk_status); 700 warn_no_callback_path(clp, task->tk_status);
532 } 701 }
702done:
703 kfree(task->tk_msg.rpc_argp);
533} 704}
534 705
535static void nfsd4_cb_recall_release(void *calldata) 706static void nfsd4_cb_recall_release(void *calldata)
@@ -542,6 +713,7 @@ static void nfsd4_cb_recall_release(void *calldata)
542} 713}
543 714
544static const struct rpc_call_ops nfsd4_cb_recall_ops = { 715static const struct rpc_call_ops nfsd4_cb_recall_ops = {
716 .rpc_call_prepare = nfsd4_cb_prepare,
545 .rpc_call_done = nfsd4_cb_recall_done, 717 .rpc_call_done = nfsd4_cb_recall_done,
546 .rpc_release = nfsd4_cb_recall_release, 718 .rpc_release = nfsd4_cb_recall_release,
547}; 719};
@@ -554,17 +726,24 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
554{ 726{
555 struct nfs4_client *clp = dp->dl_client; 727 struct nfs4_client *clp = dp->dl_client;
556 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; 728 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
729 struct nfs4_rpc_args *args;
557 struct rpc_message msg = { 730 struct rpc_message msg = {
558 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 731 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
559 .rpc_argp = dp, 732 .rpc_cred = callback_cred
560 .rpc_cred = clp->cl_cb_conn.cb_cred
561 }; 733 };
562 int status; 734 int status = -ENOMEM;
563 735
736 args = kzalloc(sizeof(*args), GFP_KERNEL);
737 if (!args)
738 goto out;
739 args->args_op = dp;
740 msg.rpc_argp = args;
564 dp->dl_retries = 1; 741 dp->dl_retries = 1;
565 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 742 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
566 &nfsd4_cb_recall_ops, dp); 743 &nfsd4_cb_recall_ops, dp);
744out:
567 if (status) { 745 if (status) {
746 kfree(args);
568 put_nfs4_client(clp); 747 put_nfs4_client(clp);
569 nfs4_put_delegation(dp); 748 nfs4_put_delegation(dp);
570 } 749 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5b398421b051..ba2c199592fd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -38,7 +38,6 @@
38#include <linux/init.h> 38#include <linux/init.h>
39 39
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/utsname.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
@@ -146,6 +145,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
146} 145}
147 146
148static int 147static int
148idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
149{
150 return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
151}
152
153static int
149idtoname_match(struct cache_head *ca, struct cache_head *cb) 154idtoname_match(struct cache_head *ca, struct cache_head *cb)
150{ 155{
151 struct ent *a = container_of(ca, struct ent, h); 156 struct ent *a = container_of(ca, struct ent, h);
@@ -175,10 +180,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
175} 180}
176 181
177static void 182static void
178warn_no_idmapd(struct cache_detail *detail) 183warn_no_idmapd(struct cache_detail *detail, int has_died)
179{ 184{
180 printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", 185 printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n",
181 detail->last_close? "died" : "not been started"); 186 has_died ? "died" : "not been started");
182} 187}
183 188
184 189
@@ -192,7 +197,7 @@ static struct cache_detail idtoname_cache = {
192 .hash_table = idtoname_table, 197 .hash_table = idtoname_table,
193 .name = "nfs4.idtoname", 198 .name = "nfs4.idtoname",
194 .cache_put = ent_put, 199 .cache_put = ent_put,
195 .cache_request = idtoname_request, 200 .cache_upcall = idtoname_upcall,
196 .cache_parse = idtoname_parse, 201 .cache_parse = idtoname_parse,
197 .cache_show = idtoname_show, 202 .cache_show = idtoname_show,
198 .warn_no_listener = warn_no_idmapd, 203 .warn_no_listener = warn_no_idmapd,
@@ -325,6 +330,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
325} 330}
326 331
327static int 332static int
333nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
334{
335 return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
336}
337
338static int
328nametoid_match(struct cache_head *ca, struct cache_head *cb) 339nametoid_match(struct cache_head *ca, struct cache_head *cb)
329{ 340{
330 struct ent *a = container_of(ca, struct ent, h); 341 struct ent *a = container_of(ca, struct ent, h);
@@ -363,7 +374,7 @@ static struct cache_detail nametoid_cache = {
363 .hash_table = nametoid_table, 374 .hash_table = nametoid_table,
364 .name = "nfs4.nametoid", 375 .name = "nfs4.nametoid",
365 .cache_put = ent_put, 376 .cache_put = ent_put,
366 .cache_request = nametoid_request, 377 .cache_upcall = nametoid_upcall,
367 .cache_parse = nametoid_parse, 378 .cache_parse = nametoid_parse,
368 .cache_show = nametoid_show, 379 .cache_show = nametoid_show,
369 .warn_no_listener = warn_no_idmapd, 380 .warn_no_listener = warn_no_idmapd,
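The pattern in both idmap hunks is the same: the cache_detail now exposes a single .cache_upcall op, and the old request formatter is bound to the generic sunrpc_cache_pipe_upcall(). A standalone model of that adapter (stand-in types and a simplified pipe function, not the real sunrpc API):

#include <stdio.h>

struct cache_detail;			/* opaque stand-ins */
struct cache_head;

typedef void (*request_fn)(struct cache_detail *, struct cache_head *);

/* generic path: format the request, then queue it to the userspace pipe */
static int cache_pipe_upcall(struct cache_detail *cd, struct cache_head *ch,
			     request_fn request)
{
	request(cd, ch);
	return 0;
}

static void idtoname_request(struct cache_detail *cd, struct cache_head *ch)
{
	printf("format an id->name upcall\n");
}

/* per-cache .cache_upcall just binds its formatter to the generic path */
static int idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
{
	return cache_pipe_upcall(cd, ch, idtoname_request);
}

int main(void)
{
	return idtoname_upcall(NULL, NULL);
}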
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7c8801769a3c..bebc0c2e1b0a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -68,7 +68,6 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
68 u32 *bmval, u32 *writable) 68 u32 *bmval, u32 *writable)
69{ 69{
70 struct dentry *dentry = cstate->current_fh.fh_dentry; 70 struct dentry *dentry = cstate->current_fh.fh_dentry;
71 struct svc_export *exp = cstate->current_fh.fh_export;
72 71
73 /* 72 /*
74 * Check whether the attributes are supported by the NFSv4 server or not. 73 *
@@ -80,17 +79,13 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
80 return nfserr_attrnotsupp; 79 return nfserr_attrnotsupp;
81 80
82 /* 81 /*
83 * Check FATTR4_WORD0_ACL & FATTR4_WORD0_FS_LOCATIONS can be supported 82 * Check FATTR4_WORD0_ACL can be supported
84 * in the current environment or not. 83 * in the current environment or not.
85 */ 84 */
86 if (bmval[0] & FATTR4_WORD0_ACL) { 85 if (bmval[0] & FATTR4_WORD0_ACL) {
87 if (!IS_POSIXACL(dentry->d_inode)) 86 if (!IS_POSIXACL(dentry->d_inode))
88 return nfserr_attrnotsupp; 87 return nfserr_attrnotsupp;
89 } 88 }
90 if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) {
91 if (exp->ex_fslocs.locations == NULL)
92 return nfserr_attrnotsupp;
93 }
94 89
95 /* 90 /*
96 * According to spec, read-only attributes return ERR_INVAL. 91 * According to spec, read-only attributes return ERR_INVAL.
@@ -123,6 +118,35 @@ nfsd4_check_open_attributes(struct svc_rqst *rqstp,
123 return status; 118 return status;
124} 119}
125 120
121static int
122is_create_with_attrs(struct nfsd4_open *open)
123{
124 return open->op_create == NFS4_OPEN_CREATE
125 && (open->op_createmode == NFS4_CREATE_UNCHECKED
126 || open->op_createmode == NFS4_CREATE_GUARDED
127 || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
128}
129
130/*
131 * If an error occurs when setting the acl, just clear the acl bit
132 * in the returned attr bitmap.
133 */
134static void
135do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
136 struct nfs4_acl *acl, u32 *bmval)
137{
138 __be32 status;
139
140 status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
141 if (status)
142 /*
143 * We should probably fail the whole open at this point,
144 * but we've already created the file, so it's too late;
145 * so this seems the least of evils:
146 */
147 bmval[0] &= ~FATTR4_WORD0_ACL;
148}
149
126static inline void 150static inline void
127fh_dup2(struct svc_fh *dst, struct svc_fh *src) 151fh_dup2(struct svc_fh *dst, struct svc_fh *src)
128{ 152{
@@ -206,6 +230,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
206 if (status) 230 if (status)
207 goto out; 231 goto out;
208 232
233 if (is_create_with_attrs(open) && open->op_acl != NULL)
234 do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval);
235
209 set_change_info(&open->op_cinfo, current_fh); 236 set_change_info(&open->op_cinfo, current_fh);
210 fh_dup2(current_fh, &resfh); 237 fh_dup2(current_fh, &resfh);
211 238
@@ -536,12 +563,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
536 status = nfserr_badtype; 563 status = nfserr_badtype;
537 } 564 }
538 565
539 if (!status) { 566 if (status)
540 fh_unlock(&cstate->current_fh); 567 goto out;
541 set_change_info(&create->cr_cinfo, &cstate->current_fh); 568
542 fh_dup2(&cstate->current_fh, &resfh); 569 if (create->cr_acl != NULL)
543 } 570 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
571 create->cr_bmval);
544 572
573 fh_unlock(&cstate->current_fh);
574 set_change_info(&create->cr_cinfo, &cstate->current_fh);
575 fh_dup2(&cstate->current_fh, &resfh);
576out:
545 fh_put(&resfh); 577 fh_put(&resfh);
546 return status; 578 return status;
547} 579}
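The failure policy do_set_nfs4_acl() implements is just a bitmap clear: the file is already created, so a failed setacl is reported by dropping the ACL bit from the returned attribute mask. A sketch of that bit manipulation (ACL is attribute 12 in RFC 3530's word-0 bitmap; the values here are for illustration):

#include <stdio.h>

#define FATTR4_WORD0_ACL	(1u << 12)	/* attribute 12, RFC 3530 */

int main(void)
{
	unsigned bmval0 = FATTR4_WORD0_ACL | (1u << 4);	/* acl + size */

	bmval0 &= ~FATTR4_WORD0_ACL;	/* setacl failed: report it unset */
	printf("bmval[0] = %#x\n", bmval0);	/* 0x10 */
	return 0;
}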
@@ -947,34 +979,6 @@ static struct nfsd4_operation nfsd4_ops[];
947static const char *nfsd4_op_name(unsigned opnum); 979static const char *nfsd4_op_name(unsigned opnum);
948 980
949/* 981/*
950 * This is a replay of a compound for which no cache entry pages
951 * were used. Encode the sequence operation, and if cachethis is FALSE
952 * encode the uncache rep error on the next operation.
953 */
954static __be32
955nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
956 struct nfsd4_compoundres *resp)
957{
958 struct nfsd4_op *op;
959
960 dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
961 resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
962
963 /* Encode the replayed sequence operation */
964 BUG_ON(resp->opcnt != 1);
965 op = &args->ops[resp->opcnt - 1];
966 nfsd4_encode_operation(resp, op);
967
968 /*return nfserr_retry_uncached_rep in next operation. */
969 if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
970 op = &args->ops[resp->opcnt++];
971 op->status = nfserr_retry_uncached_rep;
972 nfsd4_encode_operation(resp, op);
973 }
974 return op->status;
975}
976
977/*
978 * Enforce NFSv4.1 COMPOUND ordering rules. 982 * Enforce NFSv4.1 COMPOUND ordering rules.
979 * 983 *
980 * TODO: 984 * TODO:
@@ -1083,13 +1087,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1083 BUG_ON(op->status == nfs_ok); 1087 BUG_ON(op->status == nfs_ok);
1084 1088
1085encode_op: 1089encode_op:
1086 /* Only from SEQUENCE or CREATE_SESSION */ 1090 /* Only from SEQUENCE */
1087 if (resp->cstate.status == nfserr_replay_cache) { 1091 if (resp->cstate.status == nfserr_replay_cache) {
1088 dprintk("%s NFS4.1 replay from cache\n", __func__); 1092 dprintk("%s NFS4.1 replay from cache\n", __func__);
1089 if (nfsd4_not_cached(resp)) 1093 status = op->status;
1090 status = nfsd4_enc_uncached_replay(args, resp);
1091 else
1092 status = op->status;
1093 goto out; 1094 goto out;
1094 } 1095 }
1095 if (op->status == nfserr_replay_me) { 1096 if (op->status == nfserr_replay_me) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 980a216a48c8..2153f9bdbebd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -55,6 +55,7 @@
55#include <linux/lockd/bind.h> 55#include <linux/lockd/bind.h>
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h> 57#include <linux/sunrpc/svcauth_gss.h>
58#include <linux/sunrpc/clnt.h>
58 59
59#define NFSDDBG_FACILITY NFSDDBG_PROC 60#define NFSDDBG_FACILITY NFSDDBG_PROC
60 61
@@ -413,36 +414,65 @@ gen_sessionid(struct nfsd4_session *ses)
413} 414}
414 415
415/* 416/*
416 * Give the client the number of slots it requests bound by 417 * The protocol defines ca_maxresponsesize_cached to include the size of
417 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. 418 * the rpc header, but all we need to cache is the data starting after
419 * the end of the initial SEQUENCE operation--the rest we regenerate
420 * each time. Therefore we can advertise a ca_maxresponsesize_cached
421 * value that is the number of bytes in our cache plus a few additional
422 * bytes. In order to stay on the safe side, and not promise more than
423 * we can cache, those additional bytes must be the minimum possible: 24
424 * bytes of rpc header (xid through accept state, with AUTH_NULL
425 * verifier), 12 for the compound header (with zero-length tag), and 44
426 * for the SEQUENCE op response:
427 */
428#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
429
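Field by field, the 24 + 12 + 44 above decomposes as follows (my annotation of the comment's arithmetic, per RFC 5531 and RFC 5661):

#include <stdio.h>

int main(void)
{
	int rpc_hdr  = 4 + 4 + 4	/* xid, msg_type, reply_stat */
		     + 4 + 4		/* AUTH_NULL verifier: flavor, length */
		     + 4;		/* accept_stat                  = 24 */
	int compound = 4 + 4 + 4;	/* status, zero-length tag, opcnt = 12 */
	int sequence = 4 + 4		/* opcode, status */
		     + 16		/* sessionid */
		     + 4 + 4 + 4 + 4	/* seqid, slotid, highest, target */
		     + 4;		/* status flags                 = 44 */

	printf("NFSD_MIN_HDR_SEQ_SZ = %d\n", rpc_hdr + compound + sequence);
	return 0;
}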
430/*
431 * Give the client the number of ca_maxresponsesize_cached slots it
432 * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
433 * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
434 * than NFSD_MAX_SLOTS_PER_SESSION.
418 * 435 *
419 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we 436 * If we run out of reserved DRC memory we should (up to a point)
420 * should (up to a point) re-negotiate active sessions and reduce their 437 * re-negotiate active sessions and reduce their slot usage to make
421 * slot usage to make room for new connections. For now we just fail the 438 * room for new connections. For now we just fail the create session.
422 * create session.
423 */ 439 */
424static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) 440static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
425{ 441{
426 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; 442 int mem, size = fchan->maxresp_cached;
427 443
428 if (fchan->maxreqs < 1) 444 if (fchan->maxreqs < 1)
429 return nfserr_inval; 445 return nfserr_inval;
430 else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
431 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
432 446
433 spin_lock(&nfsd_serv->sv_lock); 447 if (size < NFSD_MIN_HDR_SEQ_SZ)
434 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) 448 size = NFSD_MIN_HDR_SEQ_SZ;
435 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; 449 size -= NFSD_MIN_HDR_SEQ_SZ;
436 nfsd_serv->sv_drc_pages_used += np; 450 if (size > NFSD_SLOT_CACHE_SIZE)
437 spin_unlock(&nfsd_serv->sv_lock); 451 size = NFSD_SLOT_CACHE_SIZE;
452
453 /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
454 mem = fchan->maxreqs * size;
455 if (mem > NFSD_MAX_MEM_PER_SESSION) {
456 fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
457 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
458 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
459 mem = fchan->maxreqs * size;
460 }
438 461
439 if (np <= 0) { 462 spin_lock(&nfsd_drc_lock);
440 status = nfserr_resource; 463 /* bound the total session drc memory usage */
441 fchan->maxreqs = 0; 464 if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
442 } else 465 fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
443 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; 466 mem = fchan->maxreqs * size;
467 }
468 nfsd_drc_mem_used += mem;
469 spin_unlock(&nfsd_drc_lock);
444 470
445 return status; 471 if (fchan->maxreqs == 0)
472 return nfserr_serverfault;
473
474 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
475 return 0;
446} 476}
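A walk-through of the clamping above with illustrative numbers (NFSD_SLOT_CACHE_SIZE = 1024 and NFSD_MAX_SLOTS_PER_SESSION = 160 are assumed here to match the 2.6.32-era headers; treat them, and the client's request, as examples):

#include <stdio.h>

#define NFSD_MIN_HDR_SEQ_SZ		(24 + 12 + 44)
#define NFSD_SLOT_CACHE_SIZE		1024	/* assumed */
#define NFSD_MAX_SLOTS_PER_SESSION	160	/* assumed */
#define NFSD_MAX_MEM_PER_SESSION \
	(NFSD_MAX_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)

int main(void)
{
	int maxreqs = 32, maxresp_cached = 4096;	/* client's request */
	int size = maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;/* 4096 - 80 = 4016 */

	if (size > NFSD_SLOT_CACHE_SIZE)
		size = NFSD_SLOT_CACHE_SIZE;		/* clamp to 1024 */
	if (maxreqs * size > NFSD_MAX_MEM_PER_SESSION)
		maxreqs = NFSD_MAX_MEM_PER_SESSION / size;

	/* 32 slots * 1024 cached bytes; advertise 1024 + 80 back */
	printf("%d slots, maxresp_cached = %d\n",
	       maxreqs, size + NFSD_MIN_HDR_SEQ_SZ);
	return 0;
}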
447 477
448/* 478/*
@@ -466,36 +496,41 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
466 fchan->maxresp_sz = maxcount; 496 fchan->maxresp_sz = maxcount;
467 session_fchan->maxresp_sz = fchan->maxresp_sz; 497 session_fchan->maxresp_sz = fchan->maxresp_sz;
468 498
469 /* Set the max response cached size our default which is
470 * a multiple of PAGE_SIZE and small */
471 session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
472 fchan->maxresp_cached = session_fchan->maxresp_cached;
473
474 /* Use the client's maxops if possible */ 499 /* Use the client's maxops if possible */
475 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 500 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
476 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 501 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
477 session_fchan->maxops = fchan->maxops; 502 session_fchan->maxops = fchan->maxops;
478 503
479 /* try to use the client requested number of slots */
480 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
481 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
482
483 /* FIXME: Error means no more DRC pages so the server should 504 /* FIXME: Error means no more DRC pages so the server should
484 * recover pages from existing sessions. For now fail session 505 * recover pages from existing sessions. For now fail session
485 * creation. 506 * creation.
486 */ 507 */
487 status = set_forechannel_maxreqs(fchan); 508 status = set_forechannel_drc_size(fchan);
488 509
510 session_fchan->maxresp_cached = fchan->maxresp_cached;
489 session_fchan->maxreqs = fchan->maxreqs; 511 session_fchan->maxreqs = fchan->maxreqs;
512
513 dprintk("%s status %d\n", __func__, status);
490 return status; 514 return status;
491} 515}
492 516
517static void
518free_session_slots(struct nfsd4_session *ses)
519{
520 int i;
521
522 for (i = 0; i < ses->se_fchannel.maxreqs; i++)
523 kfree(ses->se_slots[i]);
524}
525
493static int 526static int
494alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, 527alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
495 struct nfsd4_create_session *cses) 528 struct nfsd4_create_session *cses)
496{ 529{
497 struct nfsd4_session *new, tmp; 530 struct nfsd4_session *new, tmp;
498 int idx, status = nfserr_resource, slotsize; 531 struct nfsd4_slot *sp;
532 int idx, slotsize, cachesize, i;
533 int status;
499 534
500 memset(&tmp, 0, sizeof(tmp)); 535 memset(&tmp, 0, sizeof(tmp));
501 536
@@ -506,14 +541,27 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
506 if (status) 541 if (status)
507 goto out; 542 goto out;
508 543
509 /* allocate struct nfsd4_session and slot table in one piece */ 544 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
510 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot); 545 + sizeof(struct nfsd4_session) > PAGE_SIZE);
546
547 status = nfserr_serverfault;
548 /* allocate struct nfsd4_session and slot table pointers in one piece */
549 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
511 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); 550 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
512 if (!new) 551 if (!new)
513 goto out; 552 goto out;
514 553
515 memcpy(new, &tmp, sizeof(*new)); 554 memcpy(new, &tmp, sizeof(*new));
516 555
556 /* allocate each struct nfsd4_slot and data cache in one piece */
557 cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
558 for (i = 0; i < new->se_fchannel.maxreqs; i++) {
559 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
560 if (!sp)
561 goto out_free;
562 new->se_slots[i] = sp;
563 }
564
517 new->se_client = clp; 565 new->se_client = clp;
518 gen_sessionid(new); 566 gen_sessionid(new);
519 idx = hash_sessionid(&new->se_sessionid); 567 idx = hash_sessionid(&new->se_sessionid);
@@ -530,6 +578,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
530 status = nfs_ok; 578 status = nfs_ok;
531out: 579out:
532 return status; 580 return status;
581out_free:
582 free_session_slots(new);
583 kfree(new);
584 goto out;
533} 585}
534 586
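The two "in one piece" allocations above rely on the C99 flexible-array-member idiom: the session ends in a table of slot pointers, and each slot ends in its reply-cache buffer. A userspace sketch with hypothetical types:

#include <stdlib.h>

struct slot {
	size_t	datalen;
	char	data[];			/* like sl_data: trailing cache */
};

struct session {
	int		maxreqs;
	struct slot	*slots[];	/* like se_slots: pointer table */
};

static struct session *alloc_session(int maxreqs, size_t cachesize)
{
	struct session *ses;
	int i;

	ses = calloc(1, sizeof(*ses) + maxreqs * sizeof(struct slot *));
	if (!ses)
		return NULL;
	ses->maxreqs = maxreqs;
	for (i = 0; i < maxreqs; i++) {
		ses->slots[i] = calloc(1, sizeof(struct slot) + cachesize);
		if (!ses->slots[i]) {
			while (i--)		/* unwind on failure */
				free(ses->slots[i]);
			free(ses);
			return NULL;
		}
	}
	return ses;
}

int main(void)
{
	struct session *ses = alloc_session(4, 1024);

	return ses ? 0 : 1;
}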
535/* caller must hold sessionid_lock */ 587/* caller must hold sessionid_lock */
@@ -572,19 +624,16 @@ release_session(struct nfsd4_session *ses)
572 nfsd4_put_session(ses); 624 nfsd4_put_session(ses);
573} 625}
574 626
575static void nfsd4_release_respages(struct page **respages, short resused);
576
577void 627void
578free_session(struct kref *kref) 628free_session(struct kref *kref)
579{ 629{
580 struct nfsd4_session *ses; 630 struct nfsd4_session *ses;
581 int i;
582 631
583 ses = container_of(kref, struct nfsd4_session, se_ref); 632 ses = container_of(kref, struct nfsd4_session, se_ref);
584 for (i = 0; i < ses->se_fchannel.maxreqs; i++) { 633 spin_lock(&nfsd_drc_lock);
585 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; 634 nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE;
586 nfsd4_release_respages(e->ce_respages, e->ce_resused); 635 spin_unlock(&nfsd_drc_lock);
587 } 636 free_session_slots(ses);
588 kfree(ses); 637 kfree(ses);
589} 638}
590 639
@@ -647,18 +696,14 @@ shutdown_callback_client(struct nfs4_client *clp)
647 clp->cl_cb_conn.cb_client = NULL; 696 clp->cl_cb_conn.cb_client = NULL;
648 rpc_shutdown_client(clnt); 697 rpc_shutdown_client(clnt);
649 } 698 }
650 if (clp->cl_cb_conn.cb_cred) {
651 put_rpccred(clp->cl_cb_conn.cb_cred);
652 clp->cl_cb_conn.cb_cred = NULL;
653 }
654} 699}
655 700
656static inline void 701static inline void
657free_client(struct nfs4_client *clp) 702free_client(struct nfs4_client *clp)
658{ 703{
659 shutdown_callback_client(clp); 704 shutdown_callback_client(clp);
660 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, 705 if (clp->cl_cb_xprt)
661 clp->cl_slot.sl_cache_entry.ce_resused); 706 svc_xprt_put(clp->cl_cb_xprt);
662 if (clp->cl_cred.cr_group_info) 707 if (clp->cl_cred.cr_group_info)
663 put_group_info(clp->cl_cred.cr_group_info); 708 put_group_info(clp->cl_cred.cr_group_info);
664 kfree(clp->cl_principal); 709 kfree(clp->cl_principal);
@@ -714,25 +759,6 @@ expire_client(struct nfs4_client *clp)
714 put_nfs4_client(clp); 759 put_nfs4_client(clp);
715} 760}
716 761
717static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
718{
719 struct nfs4_client *clp;
720
721 clp = alloc_client(name);
722 if (clp == NULL)
723 return NULL;
724 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
725 atomic_set(&clp->cl_count, 1);
726 atomic_set(&clp->cl_cb_conn.cb_set, 0);
727 INIT_LIST_HEAD(&clp->cl_idhash);
728 INIT_LIST_HEAD(&clp->cl_strhash);
729 INIT_LIST_HEAD(&clp->cl_openowners);
730 INIT_LIST_HEAD(&clp->cl_delegations);
731 INIT_LIST_HEAD(&clp->cl_sessions);
732 INIT_LIST_HEAD(&clp->cl_lru);
733 return clp;
734}
735
736static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 762static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
737{ 763{
738 memcpy(target->cl_verifier.data, source->data, 764 memcpy(target->cl_verifier.data, source->data,
@@ -795,6 +821,46 @@ static void gen_confirm(struct nfs4_client *clp)
795 *p++ = i++; 821 *p++ = i++;
796} 822}
797 823
824static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
825 struct svc_rqst *rqstp, nfs4_verifier *verf)
826{
827 struct nfs4_client *clp;
828 struct sockaddr *sa = svc_addr(rqstp);
829 char *princ;
830
831 clp = alloc_client(name);
832 if (clp == NULL)
833 return NULL;
834
835 princ = svc_gss_principal(rqstp);
836 if (princ) {
837 clp->cl_principal = kstrdup(princ, GFP_KERNEL);
838 if (clp->cl_principal == NULL) {
839 free_client(clp);
840 return NULL;
841 }
842 }
843
844 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
845 atomic_set(&clp->cl_count, 1);
846 atomic_set(&clp->cl_cb_conn.cb_set, 0);
847 INIT_LIST_HEAD(&clp->cl_idhash);
848 INIT_LIST_HEAD(&clp->cl_strhash);
849 INIT_LIST_HEAD(&clp->cl_openowners);
850 INIT_LIST_HEAD(&clp->cl_delegations);
851 INIT_LIST_HEAD(&clp->cl_sessions);
852 INIT_LIST_HEAD(&clp->cl_lru);
853 clear_bit(0, &clp->cl_cb_slot_busy);
854 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
855 copy_verf(clp, verf);
856 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
857 clp->cl_flavor = rqstp->rq_flavor;
858 copy_cred(&clp->cl_cred, &rqstp->rq_cred);
859 gen_confirm(clp);
860
861 return clp;
862}
863
798static int check_name(struct xdr_netobj name) 864static int check_name(struct xdr_netobj name)
799{ 865{
800 if (name.len == 0) 866 if (name.len == 0)
@@ -902,93 +968,40 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
902 return NULL; 968 return NULL;
903} 969}
904 970
905/* a helper function for parse_callback */
906static int
907parse_octet(unsigned int *lenp, char **addrp)
908{
909 unsigned int len = *lenp;
910 char *p = *addrp;
911 int n = -1;
912 char c;
913
914 for (;;) {
915 if (!len)
916 break;
917 len--;
918 c = *p++;
919 if (c == '.')
920 break;
921 if ((c < '0') || (c > '9')) {
922 n = -1;
923 break;
924 }
925 if (n < 0)
926 n = 0;
927 n = (n * 10) + (c - '0');
928 if (n > 255) {
929 n = -1;
930 break;
931 }
932 }
933 *lenp = len;
934 *addrp = p;
935 return n;
936}
937
938/* parse and set the setclientid ipv4 callback address */
939static int
940parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
941{
942 int temp = 0;
943 u32 cbaddr = 0;
944 u16 cbport = 0;
945 u32 addrlen = addr_len;
946 char *addr = addr_val;
947 int i, shift;
948
949 /* ipaddress */
950 shift = 24;
951 for(i = 4; i > 0 ; i--) {
952 if ((temp = parse_octet(&addrlen, &addr)) < 0) {
953 return 0;
954 }
955 cbaddr |= (temp << shift);
956 if (shift > 0)
957 shift -= 8;
958 }
959 *cbaddrp = cbaddr;
960
961 /* port */
962 shift = 8;
963 for(i = 2; i > 0 ; i--) {
964 if ((temp = parse_octet(&addrlen, &addr)) < 0) {
965 return 0;
966 }
967 cbport |= (temp << shift);
968 if (shift > 0)
969 shift -= 8;
970 }
971 *cbportp = cbport;
972 return 1;
973}
974
975static void 971static void
976gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) 972gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
977{ 973{
978 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 974 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
979 975 unsigned short expected_family;
980 /* Currently, we only support tcp for the callback channel */ 976
981 if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) 977 /* Currently, we only support tcp and tcp6 for the callback channel */
978 if (se->se_callback_netid_len == 3 &&
979 !memcmp(se->se_callback_netid_val, "tcp", 3))
980 expected_family = AF_INET;
981 else if (se->se_callback_netid_len == 4 &&
982 !memcmp(se->se_callback_netid_val, "tcp6", 4))
983 expected_family = AF_INET6;
984 else
982 goto out_err; 985 goto out_err;
983 986
984 if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, 987 cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
985 &cb->cb_addr, &cb->cb_port))) 988 se->se_callback_addr_len,
989 (struct sockaddr *) &cb->cb_addr,
990 sizeof(cb->cb_addr));
991
992 if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
986 goto out_err; 993 goto out_err;
994
995 if (cb->cb_addr.ss_family == AF_INET6)
996 ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
997
987 cb->cb_minorversion = 0; 998 cb->cb_minorversion = 0;
988 cb->cb_prog = se->se_callback_prog; 999 cb->cb_prog = se->se_callback_prog;
989 cb->cb_ident = se->se_callback_ident; 1000 cb->cb_ident = se->se_callback_ident;
990 return; 1001 return;
991out_err: 1002out_err:
1003 cb->cb_addr.ss_family = AF_UNSPEC;
1004 cb->cb_addrlen = 0;
992 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " 1005 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
993 "will not receive delegations\n", 1006 "will not receive delegations\n",
994 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); 1007 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
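For reference, rpc_uaddr2sockaddr() consumes the RPC universal address format that the removed parse_ipv4()/parse_octet() pair handled by hand: for netid "tcp" the uaddr is a dotted quad followed by two port octets (RFC 5665). Example values (mine):

#include <stdio.h>

int main(void)
{
	/* "192.0.2.1.8.1" -> host 192.0.2.1, port 8 * 256 + 1 */
	unsigned port_hi = 8, port_lo = 1;	/* trailing uaddr octets */

	printf("port = %u\n", port_hi * 256 + port_lo);	/* 2049 */
	return 0;
}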
@@ -996,175 +1009,87 @@ out_err:
996 return; 1009 return;
997} 1010}
998 1011
999void
1000nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
1001{
1002 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1003
1004 resp->cstate.statp = statp;
1005}
1006
1007/* 1012/*
1008 * Dereference the result pages. 1013 * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
1009 */ 1014 */
1010static void 1015void
1011nfsd4_release_respages(struct page **respages, short resused) 1016nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1012{ 1017{
1013 int i; 1018 struct nfsd4_slot *slot = resp->cstate.slot;
1019 unsigned int base;
1014 1020
1015 dprintk("--> %s\n", __func__); 1021 dprintk("--> %s slot %p\n", __func__, slot);
1016 for (i = 0; i < resused; i++) {
1017 if (!respages[i])
1018 continue;
1019 put_page(respages[i]);
1020 respages[i] = NULL;
1021 }
1022}
1023 1022
1024static void 1023 slot->sl_opcnt = resp->opcnt;
1025nfsd4_copy_pages(struct page **topages, struct page **frompages, short count) 1024 slot->sl_status = resp->cstate.status;
1026{
1027 int i;
1028 1025
1029 for (i = 0; i < count; i++) { 1026 if (nfsd4_not_cached(resp)) {
1030 topages[i] = frompages[i]; 1027 slot->sl_datalen = 0;
1031 if (!topages[i]) 1028 return;
1032 continue;
1033 get_page(topages[i]);
1034 } 1029 }
1030 slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap;
1031 base = (char *)resp->cstate.datap -
1032 (char *)resp->xbuf->head[0].iov_base;
1033 if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data,
1034 slot->sl_datalen))
1035 WARN("%s: sessions DRC could not cache compound\n", __func__);
1036 return;
1035} 1037}
1036 1038
1037/* 1039/*
1038 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous 1040 * Encode the replay sequence operation from the slot values.
1039 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total 1041 * If cachethis is FALSE encode the uncached rep error on the next
1040 * length of the XDR response is less than se_fmaxresp_cached 1042 * operation which sets resp->p and increments resp->opcnt for
1041 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a 1043 * nfs4svc_encode_compoundres.
1042 * of the reply (e.g. readdir).
1043 * 1044 *
1044 * Store the base and length of the rq_req.head[0] page
1045 * of the NFSv4.1 data, just past the rpc header.
1046 */ 1045 */
1047void 1046static __be32
1048nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) 1047nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
1048 struct nfsd4_compoundres *resp)
1049{ 1049{
1050 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; 1050 struct nfsd4_op *op;
1051 struct svc_rqst *rqstp = resp->rqstp; 1051 struct nfsd4_slot *slot = resp->cstate.slot;
1052 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1053 struct nfsd4_op *op = &args->ops[resp->opcnt];
1054 struct kvec *resv = &rqstp->rq_res.head[0];
1055
1056 dprintk("--> %s entry %p\n", __func__, entry);
1057
1058 /* Don't cache a failed OP_SEQUENCE. */
1059 if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
1060 return;
1061 1052
1062 nfsd4_release_respages(entry->ce_respages, entry->ce_resused); 1053 dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__,
1063 entry->ce_opcnt = resp->opcnt; 1054 resp->opcnt, resp->cstate.slot->sl_cachethis);
1064 entry->ce_status = resp->cstate.status;
1065 1055
1066 /* 1056 /* Encode the replayed sequence operation */
1067 * Don't need a page to cache just the sequence operation - the slot 1057 op = &args->ops[resp->opcnt - 1];
1068 * does this for us! 1058 nfsd4_encode_operation(resp, op);
1069 */
1070 1059
1071 if (nfsd4_not_cached(resp)) { 1060 /* Return nfserr_retry_uncached_rep in next operation. */
1072 entry->ce_resused = 0; 1061 if (args->opcnt > 1 && slot->sl_cachethis == 0) {
1073 entry->ce_rpchdrlen = 0; 1062 op = &args->ops[resp->opcnt++];
1074 dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__, 1063 op->status = nfserr_retry_uncached_rep;
1075 resp->cstate.slot->sl_cache_entry.ce_cachethis); 1064 nfsd4_encode_operation(resp, op);
1076 return;
1077 }
1078 entry->ce_resused = rqstp->rq_resused;
1079 if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
1080 entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
1081 nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
1082 entry->ce_resused);
1083 entry->ce_datav.iov_base = resp->cstate.statp;
1084 entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
1085 (char *)page_address(rqstp->rq_respages[0]));
1086 /* Current request rpc header length*/
1087 entry->ce_rpchdrlen = (char *)resp->cstate.statp -
1088 (char *)page_address(rqstp->rq_respages[0]);
1089}
1090
1091/*
1092 * We keep the rpc header, but take the nfs reply from the replycache.
1093 */
1094static int
1095nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
1096 struct nfsd4_cache_entry *entry)
1097{
1098 struct svc_rqst *rqstp = resp->rqstp;
1099 struct kvec *resv = &resp->rqstp->rq_res.head[0];
1100 int len;
1101
1102 /* Current request rpc header length*/
1103 len = (char *)resp->cstate.statp -
1104 (char *)page_address(rqstp->rq_respages[0]);
1105 if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
1106 dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
1107 entry->ce_datav.iov_len);
1108 return 0;
1109 } 1065 }
1110 /* copy the cached reply nfsd data past the current rpc header */ 1066 return op->status;
1111 memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
1112 entry->ce_datav.iov_len);
1113 resv->iov_len = len + entry->ce_datav.iov_len;
1114 return 1;
1115} 1067}
1116 1068
1117/* 1069/*
1118 * Keep the first page of the replay. Copy the NFSv4.1 data from the first 1070 * The sequence operation is not cached because we can use the slot and
1119 * cached page. Replace any further replay pages from the cache. 1071 * session values.
1120 */ 1072 */
1121__be32 1073__be32
1122nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 1074nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1123 struct nfsd4_sequence *seq) 1075 struct nfsd4_sequence *seq)
1124{ 1076{
1125 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; 1077 struct nfsd4_slot *slot = resp->cstate.slot;
1126 __be32 status; 1078 __be32 status;
1127 1079
1128 dprintk("--> %s entry %p\n", __func__, entry); 1080 dprintk("--> %s slot %p\n", __func__, slot);
1129
1130 /*
1131 * If this is just the sequence operation, we did not keep
1132 * a page in the cache entry because we can just use the
1133 * slot info stored in struct nfsd4_sequence that was checked
1134 * against the slot in nfsd4_sequence().
1135 *
1136 * This occurs when seq->cachethis is FALSE, or when the client
1137 * session inactivity timer fires and a solo sequence operation
1138 * is sent (lease renewal).
1139 */
1140 if (seq && nfsd4_not_cached(resp)) {
1141 seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
1142 return nfs_ok;
1143 }
1144
1145 if (!nfsd41_copy_replay_data(resp, entry)) {
1146 /*
1147 * Not enough room to use the replay rpc header, send the
1148 * cached header. Release all the allocated result pages.
1149 */
1150 svc_free_res_pages(resp->rqstp);
1151 nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
1152 entry->ce_resused);
1153 } else {
1154 /* Release all but the first allocated result page */
1155 1081
1156 resp->rqstp->rq_resused--; 1082 /* Either returns 0 or nfserr_retry_uncached */
1157 svc_free_res_pages(resp->rqstp); 1083 status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
1084 if (status == nfserr_retry_uncached_rep)
1085 return status;
1158 1086
1159 nfsd4_copy_pages(&resp->rqstp->rq_respages[1], 1087 /* The sequence operation has been encoded, cstate->datap set. */
1160 &entry->ce_respages[1], 1088 memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen);
1161 entry->ce_resused - 1);
1162 }
1163 1089
1164 resp->rqstp->rq_resused = entry->ce_resused; 1090 resp->opcnt = slot->sl_opcnt;
1165 resp->opcnt = entry->ce_opcnt; 1091 resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen);
1166 resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen; 1092 status = slot->sl_status;
1167 status = entry->ce_status;
1168 1093
1169 return status; 1094 return status;
1170} 1095}
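Taken together, nfsd4_store_cache_entry() and nfsd4_replay_cache_entry() now cache only the bytes encoded after the SEQUENCE reply and splice them back on replay. A userspace model of that round trip (flat buffers stand in for the xdr_buf pages; the names are mine):

#include <stdio.h>
#include <string.h>

struct slot {
	int	sl_opcnt;
	size_t	sl_datalen;
	char	sl_data[1024];		/* per-slot reply cache */
};

/* store: remember everything encoded past the SEQUENCE op */
static void store(struct slot *s, const char *datap, const char *p)
{
	s->sl_datalen = p - datap;
	memcpy(s->sl_data, datap, s->sl_datalen);
}

/* replay: SEQUENCE is re-encoded from slot state, then the cached
 * bytes are copied in after it */
static char *replay(const struct slot *s, char *datap)
{
	memcpy(datap, s->sl_data, s->sl_datalen);
	return datap + s->sl_datalen;
}

int main(void)
{
	struct slot s = { .sl_opcnt = 2 };
	char reply[64] = "PUTFH|GETATTR result bytes";
	char out[64];

	store(&s, reply, reply + strlen(reply));
	printf("replayed %d bytes\n", (int)(replay(&s, out) - out));
	return 0;
}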
@@ -1194,13 +1119,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1194 int status; 1119 int status;
1195 unsigned int strhashval; 1120 unsigned int strhashval;
1196 char dname[HEXDIR_LEN]; 1121 char dname[HEXDIR_LEN];
1122 char addr_str[INET6_ADDRSTRLEN];
1197 nfs4_verifier verf = exid->verifier; 1123 nfs4_verifier verf = exid->verifier;
1198 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; 1124 struct sockaddr *sa = svc_addr(rqstp);
1199 1125
1126 rpc_ntop(sa, addr_str, sizeof(addr_str));
1200 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " 1127 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
1201 " ip_addr=%u flags %x, spa_how %d\n", 1128 "ip_addr=%s flags %x, spa_how %d\n",
1202 __func__, rqstp, exid, exid->clname.len, exid->clname.data, 1129 __func__, rqstp, exid, exid->clname.len, exid->clname.data,
1203 ip_addr, exid->flags, exid->spa_how); 1130 addr_str, exid->flags, exid->spa_how);
1204 1131
1205 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) 1132 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
1206 return nfserr_inval; 1133 return nfserr_inval;
@@ -1281,28 +1208,23 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1281 1208
1282out_new: 1209out_new:
1283 /* Normal case */ 1210 /* Normal case */
1284 new = create_client(exid->clname, dname); 1211 new = create_client(exid->clname, dname, rqstp, &verf);
1285 if (new == NULL) { 1212 if (new == NULL) {
1286 status = nfserr_resource; 1213 status = nfserr_serverfault;
1287 goto out; 1214 goto out;
1288 } 1215 }
1289 1216
1290 copy_verf(new, &verf);
1291 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1292 new->cl_addr = ip_addr;
1293 gen_clid(new); 1217 gen_clid(new);
1294 gen_confirm(new);
1295 add_to_unconfirmed(new, strhashval); 1218 add_to_unconfirmed(new, strhashval);
1296out_copy: 1219out_copy:
1297 exid->clientid.cl_boot = new->cl_clientid.cl_boot; 1220 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1298 exid->clientid.cl_id = new->cl_clientid.cl_id; 1221 exid->clientid.cl_id = new->cl_clientid.cl_id;
1299 1222
1300 new->cl_slot.sl_seqid = 0;
1301 exid->seqid = 1; 1223 exid->seqid = 1;
1302 nfsd4_set_ex_flags(new, exid); 1224 nfsd4_set_ex_flags(new, exid);
1303 1225
1304 dprintk("nfsd4_exchange_id seqid %d flags %x\n", 1226 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1305 new->cl_slot.sl_seqid, new->cl_exchange_flags); 1227 new->cl_cs_slot.sl_seqid, new->cl_exchange_flags);
1306 status = nfs_ok; 1228 status = nfs_ok;
1307 1229
1308out: 1230out:
@@ -1313,40 +1235,60 @@ error:
1313} 1235}
1314 1236
1315static int 1237static int
1316check_slot_seqid(u32 seqid, struct nfsd4_slot *slot) 1238check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
1317{ 1239{
1318 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid, 1240 dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
1319 slot->sl_seqid); 1241 slot_seqid);
1320 1242
1321 /* The slot is in use, and no response has been sent. */ 1243 /* The slot is in use, and no response has been sent. */
1322 if (slot->sl_inuse) { 1244 if (slot_inuse) {
1323 if (seqid == slot->sl_seqid) 1245 if (seqid == slot_seqid)
1324 return nfserr_jukebox; 1246 return nfserr_jukebox;
1325 else 1247 else
1326 return nfserr_seq_misordered; 1248 return nfserr_seq_misordered;
1327 } 1249 }
1328 /* Normal */ 1250 /* Normal */
1329 if (likely(seqid == slot->sl_seqid + 1)) 1251 if (likely(seqid == slot_seqid + 1))
1330 return nfs_ok; 1252 return nfs_ok;
1331 /* Replay */ 1253 /* Replay */
1332 if (seqid == slot->sl_seqid) 1254 if (seqid == slot_seqid)
1333 return nfserr_replay_cache; 1255 return nfserr_replay_cache;
1334 /* Wraparound */ 1256 /* Wraparound */
1335 if (seqid == 1 && (slot->sl_seqid + 1) == 0) 1257 if (seqid == 1 && (slot_seqid + 1) == 0)
1336 return nfs_ok; 1258 return nfs_ok;
1337 /* Misordered replay or misordered new request */ 1259 /* Misordered replay or misordered new request */
1338 return nfserr_seq_misordered; 1260 return nfserr_seq_misordered;
1339} 1261}
1340 1262
1263/*
1264 * Cache the create session result into the create session single DRC
1265 * slot cache by saving the xdr structure. sl_seqid has been set.
1266 * Do this for solo or embedded create session operations.
1267 */
1268static void
1269nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
1270 struct nfsd4_clid_slot *slot, int nfserr)
1271{
1272 slot->sl_status = nfserr;
1273 memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
1274}
1275
1276static __be32
1277nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
1278 struct nfsd4_clid_slot *slot)
1279{
1280 memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses));
1281 return slot->sl_status;
1282}
1283
 __be32
 nfsd4_create_session(struct svc_rqst *rqstp,
 		     struct nfsd4_compound_state *cstate,
 		     struct nfsd4_create_session *cr_ses)
 {
-	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
-	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
-	struct nfsd4_slot *slot = NULL;
+	struct nfsd4_clid_slot *cs_slot = NULL;
 	int status = 0;
 
 	nfs4_lock_state();
@@ -1354,40 +1296,38 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	conf = find_confirmed_client(&cr_ses->clientid);
 
 	if (conf) {
-		slot = &conf->cl_slot;
-		status = check_slot_seqid(cr_ses->seqid, slot);
+		cs_slot = &conf->cl_cs_slot;
+		status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
 		if (status == nfserr_replay_cache) {
 			dprintk("Got a create_session replay! seqid= %d\n",
-				slot->sl_seqid);
-			cstate->slot = slot;
-			cstate->status = status;
+				cs_slot->sl_seqid);
 			/* Return the cached reply status */
-			status = nfsd4_replay_cache_entry(resp, NULL);
+			status = nfsd4_replay_create_session(cr_ses, cs_slot);
 			goto out;
-		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+		} else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
 			status = nfserr_seq_misordered;
 			dprintk("Sequence misordered!\n");
 			dprintk("Expected seqid= %d but got seqid= %d\n",
-				slot->sl_seqid, cr_ses->seqid);
+				cs_slot->sl_seqid, cr_ses->seqid);
 			goto out;
 		}
-		conf->cl_slot.sl_seqid++;
+		cs_slot->sl_seqid++;
 	} else if (unconf) {
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
-		    (ip_addr != unconf->cl_addr)) {
+		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
 			status = nfserr_clid_inuse;
 			goto out;
 		}
 
-		slot = &unconf->cl_slot;
-		status = check_slot_seqid(cr_ses->seqid, slot);
+		cs_slot = &unconf->cl_cs_slot;
+		status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
 		if (status) {
 			/* an unconfirmed replay returns misordered */
 			status = nfserr_seq_misordered;
-			goto out;
+			goto out_cache;
 		}
 
-		slot->sl_seqid++; /* from 0 to 1 */
+		cs_slot->sl_seqid++; /* from 0 to 1 */
 		move_to_confirmed(unconf);
 
 		/*
@@ -1396,6 +1336,19 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cr_ses->flags &= ~SESSION4_PERSIST;
 		cr_ses->flags &= ~SESSION4_RDMA;
 
+		if (cr_ses->flags & SESSION4_BACK_CHAN) {
+			unconf->cl_cb_xprt = rqstp->rq_xprt;
+			svc_xprt_get(unconf->cl_cb_xprt);
+			rpc_copy_addr(
+				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
+				sa);
+			unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+			unconf->cl_cb_conn.cb_minorversion =
+				cstate->minorversion;
+			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
+			unconf->cl_cb_seq_nr = 1;
+			nfsd4_probe_callback(unconf);
+		}
 		conf = unconf;
 	} else {
 		status = nfserr_stale_clientid;
@@ -1408,12 +1361,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
-	cr_ses->seqid = slot->sl_seqid;
+	cr_ses->seqid = cs_slot->sl_seqid;
 
-	slot->sl_inuse = true;
-	cstate->slot = slot;
-	/* Ensure a page is used for the cache */
-	slot->sl_cache_entry.ce_cachethis = 1;
+out_cache:
+	/* cache solo and embedded create sessions under the state lock */
+	nfsd4_cache_create_session(cr_ses, cs_slot, status);
 out:
 	nfs4_unlock_state();
 	dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1478,18 +1430,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (seq->slotid >= session->se_fchannel.maxreqs)
 		goto out;
 
-	slot = &session->se_slots[seq->slotid];
+	slot = session->se_slots[seq->slotid];
 	dprintk("%s: slotid %d\n", __func__, seq->slotid);
 
-	status = check_slot_seqid(seq->seqid, slot);
+	/* We do not negotiate the number of slots yet, so set the
+	 * maxslots to the session maxreqs which is used to encode
+	 * sr_highest_slotid and the sr_target_slot id to maxslots */
+	seq->maxslots = session->se_fchannel.maxreqs;
+
+	status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse);
 	if (status == nfserr_replay_cache) {
 		cstate->slot = slot;
 		cstate->session = session;
 		/* Return the cached reply status and set cstate->status
-		 * for nfsd4_svc_encode_compoundres processing */
+		 * for nfsd4_proc_compound processing */
 		status = nfsd4_replay_cache_entry(resp, seq);
 		cstate->status = nfserr_replay_cache;
-		goto replay_cache;
+		goto out;
 	}
 	if (status)
 		goto out;
@@ -1497,23 +1454,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	/* Success! bump slot seqid */
 	slot->sl_inuse = true;
 	slot->sl_seqid = seq->seqid;
-	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
-	/* Always set the cache entry cachethis for solo sequence */
-	if (nfsd4_is_solo_sequence(resp))
-		slot->sl_cache_entry.ce_cachethis = 1;
+	slot->sl_cachethis = seq->cachethis;
 
 	cstate->slot = slot;
 	cstate->session = session;
 
-replay_cache:
-	/* Renew the clientid on success and on replay.
-	 * Hold a session reference until done processing the compound:
+	/* Hold a session reference until done processing the compound:
 	 * nfsd4_put_session called only if the cstate slot is set.
 	 */
-	renew_client(session->se_client);
 	nfsd4_get_session(session);
 out:
 	spin_unlock(&sessionid_lock);
+	/* Renew the clientid on success and on replay */
+	if (cstate->session) {
+		nfs4_lock_state();
+		renew_client(session->se_client);
+		nfs4_unlock_state();
+	}
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
 }
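Moving renew_client() below spin_unlock(&sessionid_lock) follows the usual pattern: pin the object with a reference while the spinlock is held, drop the spinlock, then take the sleeping state lock for the renewal. A condensed user-space sketch of that ordering, with all names hypothetical:

#include <pthread.h>
#include <stdio.h>

struct client { long last_renewed; };
struct session { int refcount; struct client *clp; };

static pthread_spinlock_t sessionid_lock;	/* may not be held while sleeping */
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

static void process_sequence(struct session *s, long now)
{
	pthread_spin_lock(&sessionid_lock);
	/* ... slot lookup and seqid checks happen here ... */
	s->refcount++;				/* pin the session under the spinlock */
	pthread_spin_unlock(&sessionid_lock);

	/* The sleeping lock is taken only after the spinlock is dropped. */
	pthread_mutex_lock(&state_lock);
	s->clp->last_renewed = now;		/* the renew_client() analogue */
	pthread_mutex_unlock(&state_lock);
}

int main(void)
{
	struct client c = { 0 };
	struct session s = { .refcount = 0, .clp = &c };

	pthread_spin_init(&sessionid_lock, PTHREAD_PROCESS_PRIVATE);
	process_sequence(&s, 12345);
	printf("refcount=%d renewed=%ld\n", s.refcount, c.last_renewed);
	return 0;
}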
@@ -1522,7 +1479,7 @@ __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
-	struct sockaddr_in *sin = svc_addr_in(rqstp);
+	struct sockaddr *sa = svc_addr(rqstp);
 	struct xdr_netobj clname = {
 		.len = setclid->se_namelen,
 		.data = setclid->se_name,
@@ -1531,7 +1488,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	unsigned int strhashval;
 	struct nfs4_client *conf, *unconf, *new;
 	__be32 status;
-	char *princ;
 	char dname[HEXDIR_LEN];
 
 	if (!check_name(clname))
@@ -1554,8 +1510,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
 		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
-			dprintk("NFSD: setclientid: string in use by client"
-				" at %pI4\n", &conf->cl_addr);
+			char addr_str[INET6_ADDRSTRLEN];
+			rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
+				 sizeof(addr_str));
+			dprintk("NFSD: setclientid: string in use by client "
+				"at %s\n", addr_str);
 			goto out;
 		}
 	}
@@ -1573,7 +1532,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 */
 		if (unconf)
 			expire_client(unconf);
-		new = create_client(clname, dname);
+		new = create_client(clname, dname, rqstp, &clverifier);
 		if (new == NULL)
 			goto out;
 		gen_clid(new);
@@ -1590,7 +1549,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 */
 			expire_client(unconf);
 		}
-		new = create_client(clname, dname);
+		new = create_client(clname, dname, rqstp, &clverifier);
 		if (new == NULL)
 			goto out;
 		copy_clid(new, conf);
@@ -1600,7 +1559,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 * probable client reboot; state will be removed if
 		 * confirmed.
 		 */
-		new = create_client(clname, dname);
+		new = create_client(clname, dname, rqstp, &clverifier);
 		if (new == NULL)
 			goto out;
 		gen_clid(new);
@@ -1611,25 +1570,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 * confirmed.
 		 */
 		expire_client(unconf);
-		new = create_client(clname, dname);
+		new = create_client(clname, dname, rqstp, &clverifier);
 		if (new == NULL)
 			goto out;
 		gen_clid(new);
 	}
-	copy_verf(new, &clverifier);
-	new->cl_addr = sin->sin_addr.s_addr;
-	new->cl_flavor = rqstp->rq_flavor;
-	princ = svc_gss_principal(rqstp);
-	if (princ) {
-		new->cl_principal = kstrdup(princ, GFP_KERNEL);
-		if (new->cl_principal == NULL) {
-			free_client(new);
-			goto out;
-		}
-	}
-	copy_cred(&new->cl_cred, &rqstp->rq_cred);
-	gen_confirm(new);
-	gen_callback(new, setclid);
+	gen_callback(new, setclid, rpc_get_scope_id(sa));
 	add_to_unconfirmed(new, strhashval);
 	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
 	setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1651,7 +1597,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			 struct nfsd4_compound_state *cstate,
 			 struct nfsd4_setclientid_confirm *setclientid_confirm)
 {
-	struct sockaddr_in *sin = svc_addr_in(rqstp);
+	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
 	nfs4_verifier confirm = setclientid_confirm->sc_confirm;
 	clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -1670,9 +1616,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	unconf = find_unconfirmed_client(clid);
 
 	status = nfserr_clid_inuse;
-	if (conf && conf->cl_addr != sin->sin_addr.s_addr)
+	if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
 		goto out;
-	if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
+	if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
 		goto out;
 
 	/*
@@ -2163,7 +2109,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 	return -EAGAIN;
 }
 
-static struct lock_manager_operations nfsd_lease_mng_ops = {
+static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.fl_break = nfsd_break_deleg_cb,
 	.fl_release_private = nfsd_release_deleg_cb,
 	.fl_copy_lock = nfsd_copy_lock_deleg_cb,
@@ -3368,7 +3314,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
 
 /* Hack!: For now, we're defining this just so we can use a pointer to it
  * as a unique cookie to identify our (NFSv4's) posix locks. */
-static struct lock_manager_operations nfsd_posix_mng_ops = {
+static const struct lock_manager_operations nfsd_posix_mng_ops = {
 };
 
 static inline void
@@ -4072,7 +4018,7 @@ set_max_delegations(void)
 
 /* initialization to perform when the nfsd service is started: */
 
-static void
+static int
 __nfs4_state_start(void)
 {
 	unsigned long grace_time;
@@ -4084,19 +4030,26 @@ __nfs4_state_start(void)
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
 	       grace_time/HZ);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
+	if (laundry_wq == NULL)
+		return -ENOMEM;
 	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
 	set_max_delegations();
+	return set_callback_cred();
 }
 
-void
+int
 nfs4_state_start(void)
 {
+	int ret;
+
 	if (nfs4_init)
-		return;
+		return 0;
 	nfsd4_load_reboot_recovery_data();
-	__nfs4_state_start();
+	ret = __nfs4_state_start();
+	if (ret)
+		return ret;
 	nfs4_init = 1;
-	return;
+	return 0;
 }
 
 time_t
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2dcc7feaa6ff..0fbd50cee1f6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1599,7 +1599,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
 static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
 {
 	struct svc_fh tmp_fh;
-	char *path, *rootpath;
+	char *path = NULL, *rootpath;
+	size_t rootlen;
 
 	fh_init(&tmp_fh, NFS4_FHSIZE);
 	*stat = exp_pseudoroot(rqstp, &tmp_fh);
@@ -1609,14 +1610,18 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
 
 	path = exp->ex_pathname;
 
-	if (strncmp(path, rootpath, strlen(rootpath))) {
+	rootlen = strlen(rootpath);
+	if (strncmp(path, rootpath, rootlen)) {
 		dprintk("nfsd: fs_locations failed;"
 			"%s is not contained in %s\n", path, rootpath);
 		*stat = nfserr_notsupp;
-		return NULL;
+		path = NULL;
+		goto out;
 	}
-
-	return path + strlen(rootpath);
+	path += rootlen;
+out:
+	fh_put(&tmp_fh);
+	return path;
 }
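The rewrite above routes both the error path and the success path through a single out: label so that fh_put(&tmp_fh) runs on every exit, closing a filehandle leak. The same single-exit cleanup idiom in isolation (a plain malloc'd buffer standing in for the filehandle):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns a copy of path with prefix stripped, or NULL; buf is always freed. */
static char *strip_prefix(const char *path, const char *prefix)
{
	char *result = NULL;
	char *buf = malloc(strlen(path) + 1);	/* stands in for tmp_fh */
	if (buf == NULL)
		goto out;
	strcpy(buf, path);

	size_t plen = strlen(prefix);
	if (strncmp(buf, prefix, plen) != 0)
		goto out;			/* error path still frees buf */

	result = strdup(buf + plen);
out:
	free(buf);				/* single cleanup point */
	return result;
}

int main(void)
{
	char *p = strip_prefix("/exports/data", "/exports");
	printf("%s\n", p ? p : "(null)");
	free(p);
	return 0;
}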
 
 /*
@@ -1793,11 +1798,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			goto out_nfserr;
 		}
 	}
-	if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
-		if (exp->ex_fslocs.locations == NULL) {
-			bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
-		}
-	}
 	if ((buflen -= 16) < 0)
 		goto out_resource;
 
@@ -1825,8 +1825,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			goto out_resource;
 		if (!aclsupport)
 			word0 &= ~FATTR4_WORD0_ACL;
-		if (!exp->ex_fslocs.locations)
-			word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
 		if (!word2) {
 			WRITE32(2);
 			WRITE32(word0);
@@ -3064,6 +3062,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
 	WRITE32(0);
 
 	ADJUST_ARGS();
+	resp->cstate.datap = p; /* DRC cache data pointer */
 	return 0;
 }
 
@@ -3166,7 +3165,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
 		return status;
 
 	session = resp->cstate.session;
-	if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
+	if (session == NULL || slot->sl_cachethis == 0)
 		return status;
 
 	if (resp->opcnt >= args->opcnt)
@@ -3291,6 +3290,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	/*
 	 * All that remains is to write the tag and operation count...
 	 */
+	struct nfsd4_compound_state *cs = &resp->cstate;
 	struct kvec *iov;
 	p = resp->tagp;
 	*p++ = htonl(resp->taglen);
@@ -3304,17 +3304,11 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 		iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
-	if (nfsd4_has_session(&resp->cstate)) {
-		if (resp->cstate.status == nfserr_replay_cache &&
-				!nfsd4_not_cached(resp)) {
-			iov->iov_len = resp->cstate.iovlen;
-		} else {
-			nfsd4_store_cache_entry(resp);
-			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
-			resp->cstate.slot->sl_inuse = 0;
-		}
-		if (resp->cstate.session)
-			nfsd4_put_session(resp->cstate.session);
+	if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
+		nfsd4_store_cache_entry(resp);
+		dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+		resp->cstate.slot->sl_inuse = false;
+		nfsd4_put_session(resp->cstate.session);
 	}
 	return 1;
 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6d0847562d87..5c01fc148ce8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -37,6 +37,7 @@
 #include <linux/nfsd/xdr.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/lockd.h>
+#include <linux/sunrpc/clnt.h>
 
 #include <asm/uaccess.h>
 #include <net/ipv6.h>
@@ -173,12 +174,13 @@ static const struct file_operations exports_operations = {
 };
 
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
+extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
 
-static struct file_operations pool_stats_operations = {
+static const struct file_operations pool_stats_operations = {
 	.open		= nfsd_pool_stats_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= nfsd_pool_stats_release,
 	.owner		= THIS_MODULE,
 };
 
@@ -490,22 +492,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
  *
  * Input:
  *	buf:		'\n'-terminated C string containing a
- *			presentation format IPv4 address
+ *			presentation format IP address
  *	size:		length of C string in @buf
  * Output:
  *	On success:	returns zero if all specified locks were released;
  *			returns one if one or more locks were not released
  *	On error:	return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in
  */
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
-	struct sockaddr_in sin = {
-		.sin_family = AF_INET,
-	};
-	int b1, b2, b3, b4;
-	char c;
+	struct sockaddr_storage address;
+	struct sockaddr *sap = (struct sockaddr *)&address;
+	size_t salen = sizeof(address);
 	char *fo_path;
 
 	/* sanity check */
@@ -519,14 +517,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 	if (qword_get(&buf, fo_path, size) < 0)
 		return -EINVAL;
 
-	/* get ipv4 address */
-	if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
-		return -EINVAL;
-	if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
+	if (rpc_pton(fo_path, size, sap, salen) == 0)
 		return -EINVAL;
-	sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
 
-	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
+	return nlmsvc_unlock_all_by_ip(sap);
 }
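rpc_pton() accepts both IPv4 and IPv6 presentation addresses, which is what allows the hand-rolled %u.%u.%u.%u parsing to be deleted. Outside the kernel, the equivalent generalization is inet_pton() tried once per address family; a sketch:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

/* Parse an IPv4 or IPv6 presentation address into *ss; returns 0 on failure. */
static int pton_any(const char *src, struct sockaddr_storage *ss)
{
	memset(ss, 0, sizeof(*ss));

	struct sockaddr_in *sin = (struct sockaddr_in *)ss;
	if (inet_pton(AF_INET, src, &sin->sin_addr) == 1) {
		sin->sin_family = AF_INET;
		return 1;
	}

	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
	if (inet_pton(AF_INET6, src, &sin6->sin6_addr) == 1) {
		sin6->sin6_family = AF_INET6;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct sockaddr_storage ss;
	printf("%d %d %d\n",
	       pton_any("192.0.2.1", &ss),
	       pton_any("2001:db8::1", &ss),
	       pton_any("not-an-address", &ss));
	return 0;
}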
 
 /**
@@ -783,10 +777,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 		size -= len;
 		mesg += len;
 	}
-
-	mutex_unlock(&nfsd_mutex);
-	return (mesg-buf);
-
+	rv = mesg - buf;
 out_free:
 	kfree(nthreads);
 	mutex_unlock(&nfsd_mutex);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8847f3fbfc1e..01965b2f3a76 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -397,44 +397,51 @@ static inline void _fh_update_old(struct dentry *dentry,
 	fh->ofh_dirino = 0;
 }
 
-__be32
-fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
-	   struct svc_fh *ref_fh)
+static bool is_root_export(struct svc_export *exp)
 {
-	/* ref_fh is a reference file handle.
-	 * if it is non-null and for the same filesystem, then we should compose
-	 * a filehandle which is of the same version, where possible.
-	 * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
-	 * Then create a 32byte filehandle using nfs_fhbase_old
-	 *
-	 */
+	return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
+}
 
-	u8 version;
-	u8 fsid_type = 0;
-	struct inode * inode = dentry->d_inode;
-	struct dentry *parent = dentry->d_parent;
-	__u32 *datap;
-	dev_t ex_dev = exp->ex_path.dentry->d_inode->i_sb->s_dev;
-	int root_export = (exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root);
+static struct super_block *exp_sb(struct svc_export *exp)
+{
+	return exp->ex_path.dentry->d_inode->i_sb;
+}
 
-	dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
-		MAJOR(ex_dev), MINOR(ex_dev),
-		(long) exp->ex_path.dentry->d_inode->i_ino,
-		parent->d_name.name, dentry->d_name.name,
-		(inode ? inode->i_ino : 0));
+static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
+{
+	switch (fsid_type) {
+	case FSID_DEV:
+		if (!old_valid_dev(exp_sb(exp)->s_dev))
+			return 0;
+		/* FALL THROUGH */
+	case FSID_MAJOR_MINOR:
+	case FSID_ENCODE_DEV:
+		return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV;
+	case FSID_NUM:
+		return exp->ex_flags & NFSEXP_FSID;
+	case FSID_UUID8:
+	case FSID_UUID16:
+		if (!is_root_export(exp))
+			return 0;
+		/* fall through */
+	case FSID_UUID4_INUM:
+	case FSID_UUID16_INUM:
+		return exp->ex_uuid != NULL;
+	}
+	return 1;
+}
 
-	/* Choose filehandle version and fsid type based on
-	 * the reference filehandle (if it is in the same export)
-	 * or the export options.
-	 */
- retry:
+static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh)
+{
+	u8 version;
+	u8 fsid_type;
+retry:
 	version = 1;
 	if (ref_fh && ref_fh->fh_export == exp) {
 		version = ref_fh->fh_handle.fh_version;
 		fsid_type = ref_fh->fh_handle.fh_fsid_type;
 
-		if (ref_fh == fhp)
-			fh_put(ref_fh);
 		ref_fh = NULL;
 
 		switch (version) {
@@ -447,58 +454,66 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 			goto retry;
 		}
 
-		/* Need to check that this type works for this
-		 * export point.  As the fsid -> filesystem mapping
-		 * was guided by user-space, there is no guarantee
-		 * that the filesystem actually supports that fsid
-		 * type. If it doesn't we loop around again without
-		 * ref_fh set.
+		/*
+		 * As the fsid -> filesystem mapping was guided by
+		 * user-space, there is no guarantee that the filesystem
+		 * actually supports that fsid type. If it doesn't we
+		 * loop around again without ref_fh set.
 		 */
-		switch(fsid_type) {
-		case FSID_DEV:
-			if (!old_valid_dev(ex_dev))
-				goto retry;
-			/* FALL THROUGH */
-		case FSID_MAJOR_MINOR:
-		case FSID_ENCODE_DEV:
-			if (!(exp->ex_path.dentry->d_inode->i_sb->s_type->fs_flags
-			      & FS_REQUIRES_DEV))
-				goto retry;
-			break;
-		case FSID_NUM:
-			if (! (exp->ex_flags & NFSEXP_FSID))
-				goto retry;
-			break;
-		case FSID_UUID8:
-		case FSID_UUID16:
-			if (!root_export)
-				goto retry;
-			/* fall through */
-		case FSID_UUID4_INUM:
-		case FSID_UUID16_INUM:
-			if (exp->ex_uuid == NULL)
-				goto retry;
-			break;
-		}
+		if (!fsid_type_ok_for_exp(fsid_type, exp))
+			goto retry;
 	} else if (exp->ex_flags & NFSEXP_FSID) {
 		fsid_type = FSID_NUM;
 	} else if (exp->ex_uuid) {
 		if (fhp->fh_maxsize >= 64) {
-			if (root_export)
+			if (is_root_export(exp))
 				fsid_type = FSID_UUID16;
 			else
 				fsid_type = FSID_UUID16_INUM;
 		} else {
-			if (root_export)
+			if (is_root_export(exp))
 				fsid_type = FSID_UUID8;
 			else
 				fsid_type = FSID_UUID4_INUM;
 		}
-	} else if (!old_valid_dev(ex_dev))
+	} else if (!old_valid_dev(exp_sb(exp)->s_dev))
 		/* for newer device numbers, we must use a newer fsid format */
 		fsid_type = FSID_ENCODE_DEV;
 	else
 		fsid_type = FSID_DEV;
+	fhp->fh_handle.fh_version = version;
+	if (version)
+		fhp->fh_handle.fh_fsid_type = fsid_type;
+}
+
+__be32
+fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+	   struct svc_fh *ref_fh)
+{
+	/* ref_fh is a reference file handle.
+	 * if it is non-null and for the same filesystem, then we should compose
+	 * a filehandle which is of the same version, where possible.
+	 * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
+	 * Then create a 32byte filehandle using nfs_fhbase_old
+	 *
+	 */
+
+	struct inode * inode = dentry->d_inode;
+	struct dentry *parent = dentry->d_parent;
+	__u32 *datap;
+	dev_t ex_dev = exp_sb(exp)->s_dev;
+
+	dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
+		MAJOR(ex_dev), MINOR(ex_dev),
+		(long) exp->ex_path.dentry->d_inode->i_ino,
+		parent->d_name.name, dentry->d_name.name,
+		(inode ? inode->i_ino : 0));
+
+	/* Choose filehandle version and fsid type based on
+	 * the reference filehandle (if it is in the same export)
+	 * or the export options.
+	 */
+	set_version_and_fsid_type(fhp, exp, ref_fh);
 
 	if (ref_fh == fhp)
 		fh_put(ref_fh);
@@ -516,7 +531,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 	fhp->fh_export = exp;
 	cache_get(&exp->h);
 
-	if (version == 0xca) {
+	if (fhp->fh_handle.fh_version == 0xca) {
 		/* old style filehandle please */
 		memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
 		fhp->fh_handle.fh_size = NFS_FHSIZE;
@@ -530,22 +545,22 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 		_fh_update_old(dentry, exp, &fhp->fh_handle);
 	} else {
 		int len;
-		fhp->fh_handle.fh_version = 1;
 		fhp->fh_handle.fh_auth_type = 0;
 		datap = fhp->fh_handle.fh_auth+0;
-		fhp->fh_handle.fh_fsid_type = fsid_type;
-		mk_fsid(fsid_type, datap, ex_dev,
+		mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev,
 			exp->ex_path.dentry->d_inode->i_ino,
 			exp->ex_fsid, exp->ex_uuid);
 
-		len = key_len(fsid_type);
+		len = key_len(fhp->fh_handle.fh_fsid_type);
 		datap += len/4;
 		fhp->fh_handle.fh_size = 4 + len;
 
 		if (inode)
 			_fh_update(fhp, exp, dentry);
-		if (fhp->fh_handle.fh_fileid_type == 255)
+		if (fhp->fh_handle.fh_fileid_type == 255) {
+			fh_put(fhp);
 			return nfserr_opnotsupp;
+		}
 	}
 
 	return 0;
@@ -639,8 +654,7 @@ enum fsid_source fsid_source(struct svc_fh *fhp)
 	case FSID_DEV:
 	case FSID_ENCODE_DEV:
 	case FSID_MAJOR_MINOR:
-		if (fhp->fh_export->ex_path.dentry->d_inode->i_sb->s_type->fs_flags
-		    & FS_REQUIRES_DEV)
+		if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV)
 			return FSIDSOURCE_DEV;
 		break;
 	case FSID_NUM:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 492c79b7800b..67ea83eedd43 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -34,6 +34,7 @@
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/bind.h>
 #include <linux/nfsacl.h>
+#include <linux/seq_file.h>
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
@@ -66,6 +67,16 @@ struct timeval nfssvc_boot;
 DEFINE_MUTEX(nfsd_mutex);
 struct svc_serv *nfsd_serv;
 
+/*
+ * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
+ * nfsd_drc_max_pages limits the total amount of memory available for
+ * version 4.1 DRC caches.
+ * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
+ */
+spinlock_t nfsd_drc_lock;
+unsigned int nfsd_drc_max_mem;
+unsigned int nfsd_drc_mem_used;
+
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat nfsd_acl_svcstats;
 static struct svc_version *nfsd_acl_version[] = {
@@ -235,13 +246,12 @@ void nfsd_reset_versions(void)
  */
 static void set_max_drc(void)
 {
-	/* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
-	#define NFSD_DRC_SIZE_SHIFT	7
-	nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
-						>> NFSD_DRC_SIZE_SHIFT;
-	nfsd_serv->sv_drc_pages_used = 0;
-	dprintk("%s svc_drc_max_pages %u\n", __func__,
-		nfsd_serv->sv_drc_max_pages);
+	#define NFSD_DRC_SIZE_SHIFT	10
+	nfsd_drc_max_mem = (nr_free_buffer_pages()
+					>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
+	nfsd_drc_mem_used = 0;
+	spin_lock_init(&nfsd_drc_lock);
+	dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
 }
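With the new shift of 10, the cap reserves 1/1024 of the free buffer pages and records it in bytes rather than pages. A quick arithmetic check of the formula with assumed example values:

#include <stdio.h>

int main(void)
{
	unsigned long free_pages = 1048576UL;	/* example: 4 GiB of 4 KiB pages */
	unsigned long page_size  = 4096UL;
	unsigned int  shift      = 10;		/* NFSD_DRC_SIZE_SHIFT */

	unsigned long max_mem = (free_pages >> shift) * page_size;
	printf("DRC cap: %lu bytes (%lu KiB)\n", max_mem, max_mem / 1024);
	/* 1048576 >> 10 = 1024 pages, so 1024 * 4096 = 4 MiB */
	return 0;
}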
 
 int nfsd_create_serv(void)
@@ -401,7 +411,9 @@ nfsd_svc(unsigned short port, int nrservs)
 	error = nfsd_racache_init(2*nrservs);
 	if (error<0)
 		goto out;
-	nfs4_state_start();
+	error = nfs4_state_start();
+	if (error)
+		goto out;
 
 	nfsd_reset_versions();
 
@@ -496,7 +508,9 @@ nfsd(void *vrqstp)
 		/* Lock the export hash tables for reading. */
 		exp_readlock();
 
+		validate_process_creds();
 		svc_process(rqstp);
+		validate_process_creds();
 
 		/* Unlock export hash tables */
 		exp_readunlock();
@@ -567,10 +581,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 		+ rqstp->rq_res.head[0].iov_len;
 	rqstp->rq_res.head[0].iov_len += sizeof(__be32);
 
-	/* NFSv4.1 DRC requires statp */
-	if (rqstp->rq_vers == 4)
-		nfsd4_set_statp(rqstp, statp);
-
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
 	nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -605,7 +615,25 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 
 int nfsd_pool_stats_open(struct inode *inode, struct file *file)
 {
-	if (nfsd_serv == NULL)
+	int ret;
+	mutex_lock(&nfsd_mutex);
+	if (nfsd_serv == NULL) {
+		mutex_unlock(&nfsd_mutex);
 		return -ENODEV;
-	return svc_pool_stats_open(nfsd_serv, file);
+	}
+	/* bump up the pseudo refcount while traversing */
+	svc_get(nfsd_serv);
+	ret = svc_pool_stats_open(nfsd_serv, file);
+	mutex_unlock(&nfsd_mutex);
+	return ret;
+}
+
+int nfsd_pool_stats_release(struct inode *inode, struct file *file)
+{
+	int ret = seq_release(inode, file);
+	mutex_lock(&nfsd_mutex);
+	/* this function really, really should have been called svc_put() */
+	svc_destroy(nfsd_serv);
+	mutex_unlock(&nfsd_mutex);
+	return ret;
 }
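The open/release pair above brackets the lifetime of the seq_file with a reference on nfsd_serv, both taken and dropped under nfsd_mutex, so the serv cannot be freed while the stats file is open. The shape of that pattern in a user-space sketch (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

struct serv { int refcount; };

static pthread_mutex_t serv_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct serv *global_serv;

static int stats_open(struct serv **out)
{
	int ret = -1;
	pthread_mutex_lock(&serv_mutex);
	if (global_serv != NULL) {
		global_serv->refcount++;	/* svc_get(): pin across the open file */
		*out = global_serv;
		ret = 0;
	}
	pthread_mutex_unlock(&serv_mutex);
	return ret;
}

static void stats_release(struct serv *s)
{
	pthread_mutex_lock(&serv_mutex);
	if (--s->refcount == 0)			/* svc_destroy() drops the reference */
		printf("last reference: freeing serv\n");
	pthread_mutex_unlock(&serv_mutex);
}

int main(void)
{
	struct serv s = { .refcount = 1 };	/* server's own reference */
	struct serv *h = NULL;

	global_serv = &s;
	if (stats_open(&h) == 0) {
		global_serv = NULL;		/* server torn down while file is open */
		s.refcount--;			/* teardown drops its own reference */
		stats_release(h);		/* reader's release frees the last one */
	}
	return 0;
}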
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23341c1063bc..a293f0273263 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -89,6 +89,12 @@ struct raparm_hbucket {
 #define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
 static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
 
+static inline int
+nfsd_v4client(struct svc_rqst *rq)
+{
+	return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
+}
+
 /*
  * Called from nfsd_lookup and encode_dirent. Check if we have crossed
  * a mount point.
@@ -115,7 +121,8 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 		path_put(&path);
 		goto out;
 	}
-	if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
+	if (nfsd_v4client(rqstp) ||
+		(exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
 		/* successfully crossed mount point */
 		/*
 		 * This is subtle: path.dentry is *not* on path.mnt
@@ -684,6 +691,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	__be32	err;
 	int	host_err;
 
+	validate_process_creds();
+
 	/*
 	 * If we get here, then the client has already done an "open",
 	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -740,6 +749,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 out_nfserr:
 	err = nfserrno(host_err);
 out:
+	validate_process_creds();
 	return err;
 }
 
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 72da095d4009..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,6 @@
 config NILFS2_FS
 	tristate "NILFS2 file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	select CRC32
 	help
 	  NILFS2 is a log-structured file system (LFS) supporting continuous
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 99d58a028b94..08834df6ec68 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
 	return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
 }
 
+/**
+ * nilfs_bmap_lookup_at_level - find a data block or node block
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ * @ptrp: place to store the value associated to @key
+ *
+ * Description: nilfs_bmap_lookup_at_level() finds a record whose key
+ * matches @key in the block at @level of the bmap.
+ *
+ * Return Value: On success, 0 is returned and the record associated with @key
+ * is stored in the place pointed by @ptrp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
 int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
 			       __u64 *ptrp)
 {
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
 	return ret;
 }
 
-/**
- * nilfs_bmap_lookup - find a record
- * @bmap: bmap
- * @key: key
- * @recp: pointer to record
- *
- * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
- * @bmap.
- *
- * Return Value: On success, 0 is returned and the record associated with @key
- * is stored in the place pointed by @recp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-ENOENT - A record associated with @key does not exist.
- */
-int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
-		      unsigned long key,
-		      unsigned long *recp)
-{
-	__u64 ptr;
-	int ret;
-
-	/* XXX: use macro for level 1 */
-	ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
-	if (recp != NULL)
-		*recp = ptr;
-	return ret;
-}
-
 static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
 	__u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
@@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
 		(entries_per_group / NILFS_BMAP_GROUP_DIV);
 }
 
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
-			       union nilfs_bmap_ptr_req *req)
-{
-	return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
-			       union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
-			      union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
-		       sector_t blocknr)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-	int ret;
-
-	ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
-	if (likely(!ret))
-		nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
-	return ret;
-}
-
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *req)
-{
-	return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
-			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-}
-
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
-			    union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
-		      sector_t blocknr)
-{
-	return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
-}
-
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
-{
-	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
-}
-
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
-				union nilfs_bmap_ptr_req *oldreq,
-				union nilfs_bmap_ptr_req *newreq)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-	int ret;
-
-	ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
-	if (ret < 0)
-		return ret;
-	ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
-	if (ret < 0)
-		nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-
-	return ret;
-}
-
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
-				union nilfs_bmap_ptr_req *oldreq,
-				union nilfs_bmap_ptr_req *newreq)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-
-	nilfs_dat_commit_end(dat, &oldreq->bpr_req,
-			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-	nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
-}
-
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
-			       union nilfs_bmap_ptr_req *oldreq,
-			       union nilfs_bmap_ptr_req *newreq)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-
-	nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-	nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
-}
-
 static struct lock_class_key nilfs_bmap_dat_lock_key;
 static struct lock_class_key nilfs_bmap_mdt_lock_key;
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b2890cdcef12..9980d7dbab91 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -28,6 +28,7 @@
 #include <linux/buffer_head.h>
 #include <linux/nilfs2_fs.h>
 #include "alloc.h"
+#include "dat.h"
 
 #define NILFS_BMAP_INVALID_PTR	0
 
@@ -141,7 +142,6 @@ struct nilfs_bmap {
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
@@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 
 
+static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
+				    __u64 *ptr)
+{
+	return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
+}
+
 /*
  * Internal use only
  */
 struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
-			      union nilfs_bmap_ptr_req *);
 
 static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
-					       union nilfs_bmap_ptr_req *req)
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		return nilfs_bmap_prepare_alloc_v(bmap, req);
+	if (dat)
+		return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
 	/* ignore target ptr */
 	req->bpr_ptr = bmap->b_last_allocated_ptr++;
 	return 0;
 }
 
 static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
-					       union nilfs_bmap_ptr_req *req)
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_commit_alloc_v(bmap, req);
+	if (dat)
+		nilfs_dat_commit_alloc(dat, &req->bpr_req);
 }
 
 static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
-					      union nilfs_bmap_ptr_req *req)
+					      union nilfs_bmap_ptr_req *req,
+					      struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_abort_alloc_v(bmap, req);
+	if (dat)
+		nilfs_dat_abort_alloc(dat, &req->bpr_req);
 	else
 		bmap->b_last_allocated_ptr--;
 }
 
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-
 static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
-					     union nilfs_bmap_ptr_req *req)
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
 {
-	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_bmap_prepare_end_v(bmap, req) : 0;
+	return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
 }
 
 static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
-					     union nilfs_bmap_ptr_req *req)
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_commit_end_v(bmap, req);
+	if (dat)
+		nilfs_dat_commit_end(dat, &req->bpr_req,
+				     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
 
 static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
-					    union nilfs_bmap_ptr_req *req)
+					    union nilfs_bmap_ptr_req *req,
+					    struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_abort_end_v(bmap, req);
+	if (dat)
+		nilfs_dat_abort_end(dat, &req->bpr_req);
 }
 
-int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
-		       sector_t);
-int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
-
-
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 			      const struct buffer_head *);
 
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
 
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
-				union nilfs_bmap_ptr_req *,
-				union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
-				union nilfs_bmap_ptr_req *,
-				union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *,
-			       union nilfs_bmap_ptr_req *);
-
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
 
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 7e0b61be212e..5941958f1e47 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -36,6 +36,7 @@
 
 void nilfs_btnode_cache_init_once(struct address_space *btnc)
 {
+	memset(btnc, 0, sizeof(*btnc));
 	INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
 	spin_lock_init(&btnc->tree_lock);
 	INIT_LIST_HEAD(&btnc->private_list);
@@ -46,7 +47,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
 	INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
 }
 
-static struct address_space_operations def_btnode_aops = {
+static const struct address_space_operations def_btnode_aops = {
 	.sync_page		= block_sync_page,
 };
 
@@ -209,6 +210,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
 	 * We cannot call radix_tree_preload for the kernels older
 	 * than 2.6.23, because it is not exported for modules.
 	 */
+retry:
 	err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (err)
 		goto failed_unlock;
@@ -219,7 +221,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
 			(unsigned long long)oldkey,
 			(unsigned long long)newkey);
 
-retry:
 	spin_lock_irq(&btnc->tree_lock);
 	err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
 	spin_unlock_irq(&btnc->tree_lock);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index aa412724b64e..e25b507a474f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void)
 	kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
-static inline struct nilfs_btree_path *
-nilfs_btree_alloc_path(const struct nilfs_btree *btree)
+static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
-	return (struct nilfs_btree_path *)
-		kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+	return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
 }
 
-static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
-					 struct nilfs_btree_path *path)
+static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
 {
 	kmem_cache_free(nilfs_btree_path_cache, path);
 }
 
-static void nilfs_btree_init_path(const struct nilfs_btree *btree,
-				  struct nilfs_btree_path *path)
+static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 {
 	int level;
 
@@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree,
 	}
 }
 
-static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
-				   struct nilfs_btree_path *path)
+static void nilfs_btree_release_path(struct nilfs_btree_path *path)
 {
 	int level;
 
-	for (level = NILFS_BTREE_LEVEL_DATA;
-	     level < NILFS_BTREE_LEVEL_MAX;
-	     level++) {
-		if (path[level].bp_bh != NULL) {
-			brelse(path[level].bp_bh);
-			path[level].bp_bh = NULL;
-		}
-		/* sib_bh is released or deleted by prepare or commit
-		 * operations. */
-		path[level].bp_sib_bh = NULL;
-		path[level].bp_index = 0;
-		path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
-		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
-		path[level].bp_op = NULL;
-	}
+	for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
+	     level++)
+		brelse(path[level].bp_bh);
 }
 
 /*
@@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
148} 131}
149 132
150static inline int 133static inline int
151nilfs_btree_node_get_flags(const struct nilfs_btree *btree, 134nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
152 const struct nilfs_btree_node *node)
153{ 135{
154 return node->bn_flags; 136 return node->bn_flags;
155} 137}
156 138
157static inline void 139static inline void
158nilfs_btree_node_set_flags(struct nilfs_btree *btree, 140nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
159 struct nilfs_btree_node *node,
160 int flags)
161{ 141{
162 node->bn_flags = flags; 142 node->bn_flags = flags;
163} 143}
164 144
165static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, 145static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
166 const struct nilfs_btree_node *node)
167{ 146{
168 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; 147 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
169} 148}
170 149
171static inline int 150static inline int
172nilfs_btree_node_get_level(const struct nilfs_btree *btree, 151nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
173 const struct nilfs_btree_node *node)
174{ 152{
175 return node->bn_level; 153 return node->bn_level;
176} 154}
177 155
178static inline void 156static inline void
179nilfs_btree_node_set_level(struct nilfs_btree *btree, 157nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
180 struct nilfs_btree_node *node,
181 int level)
182{ 158{
183 node->bn_level = level; 159 node->bn_level = level;
184} 160}
185 161
186static inline int 162static inline int
187nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, 163nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
188 const struct nilfs_btree_node *node)
189{ 164{
190 return le16_to_cpu(node->bn_nchildren); 165 return le16_to_cpu(node->bn_nchildren);
191} 166}
192 167
193static inline void 168static inline void
194nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, 169nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
195 struct nilfs_btree_node *node,
196 int nchildren)
197{ 170{
198 node->bn_nchildren = cpu_to_le16(nchildren); 171 node->bn_nchildren = cpu_to_le16(nchildren);
199} 172}
200 173
201static inline int 174static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
202nilfs_btree_node_size(const struct nilfs_btree *btree)
203{ 175{
204 return 1 << btree->bt_bmap.b_inode->i_blkbits; 176 return 1 << btree->bt_bmap.b_inode->i_blkbits;
205} 177}
206 178
207static inline int 179static inline int
208nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, 180nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
209 const struct nilfs_btree_node *node) 181 const struct nilfs_btree *btree)
210{ 182{
211 return nilfs_btree_node_root(btree, node) ? 183 return nilfs_btree_node_root(node) ?
212 NILFS_BTREE_ROOT_NCHILDREN_MIN : 184 NILFS_BTREE_ROOT_NCHILDREN_MIN :
213 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); 185 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
214} 186}
215 187
216static inline int 188static inline int
217nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, 189nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
218 const struct nilfs_btree_node *node) 190 const struct nilfs_btree *btree)
219{ 191{
220 return nilfs_btree_node_root(btree, node) ? 192 return nilfs_btree_node_root(node) ?
221 NILFS_BTREE_ROOT_NCHILDREN_MAX : 193 NILFS_BTREE_ROOT_NCHILDREN_MAX :
222 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); 194 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
223} 195}
224 196
225static inline __le64 * 197static inline __le64 *
226nilfs_btree_node_dkeys(const struct nilfs_btree *btree, 198nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
227 const struct nilfs_btree_node *node)
228{ 199{
229 return (__le64 *)((char *)(node + 1) + 200 return (__le64 *)((char *)(node + 1) +
230 (nilfs_btree_node_root(btree, node) ? 201 (nilfs_btree_node_root(node) ?
231 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); 202 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
232} 203}
233 204
234static inline __le64 * 205static inline __le64 *
235nilfs_btree_node_dptrs(const struct nilfs_btree *btree, 206nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
236 const struct nilfs_btree_node *node) 207 const struct nilfs_btree *btree)
237{ 208{
238 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + 209 return (__le64 *)(nilfs_btree_node_dkeys(node) +
239 nilfs_btree_node_nchildren_max(btree, node)); 210 nilfs_btree_node_nchildren_max(node, btree));
240} 211}
241 212
242static inline __u64 213static inline __u64
243nilfs_btree_node_get_key(const struct nilfs_btree *btree, 214nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
244 const struct nilfs_btree_node *node, int index)
245{ 215{
246 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + 216 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
247 index));
248} 217}
249 218
250static inline void 219static inline void
251nilfs_btree_node_set_key(struct nilfs_btree *btree, 220nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
252 struct nilfs_btree_node *node, int index, __u64 key)
253{ 221{
254 *(nilfs_btree_node_dkeys(btree, node) + index) = 222 *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
255 nilfs_bmap_key_to_dkey(key);
256} 223}
257 224
258static inline __u64 225static inline __u64
259nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, 226nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
260 const struct nilfs_btree_node *node, 227 const struct nilfs_btree_node *node, int index)
261 int index)
262{ 228{
263 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + 229 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
264 index)); 230 index));
265} 231}
266 232
267static inline void 233static inline void
268nilfs_btree_node_set_ptr(struct nilfs_btree *btree, 234nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
269 struct nilfs_btree_node *node, 235 struct nilfs_btree_node *node, int index, __u64 ptr)
270 int index,
271 __u64 ptr)
272{ 236{
273 *(nilfs_btree_node_dptrs(btree, node) + index) = 237 *(nilfs_btree_node_dptrs(node, btree) + index) =
274 nilfs_bmap_ptr_to_dptr(ptr); 238 nilfs_bmap_ptr_to_dptr(ptr);
275} 239}
276 240
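The node accessors lose their struct nilfs_btree argument wherever it went unused; the few that genuinely need the block size (nchildren_min/max and the dkeys/dptrs layout helpers) flip the parameter order to (node, btree) so the primary operand comes first. The accessors also keep the on-disk byte order in one place: bn_nchildren is stored little-endian and converted on every access. A minimal sketch of that pattern, assuming a simplified layout (the real struct nilfs_btree_node differs):

    #include <linux/types.h>
    #include <asm/byteorder.h>

    struct disk_node {
        __u8   level;
        __u8   flags;
        __le16 nchildren;       /* little-endian on disk */
    };

    static inline int disk_node_get_nchildren(const struct disk_node *n)
    {
        return le16_to_cpu(n->nchildren);
    }

    static inline void disk_node_set_nchildren(struct disk_node *n, int v)
    {
        n->nchildren = cpu_to_le16(v);
    }

Keeping the conversions behind accessors lets the rest of btree.c manipulate plain host-order ints.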
@@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
283 __le64 *dptrs; 247 __le64 *dptrs;
284 int i; 248 int i;
285 249
286 nilfs_btree_node_set_flags(btree, node, flags); 250 nilfs_btree_node_set_flags(node, flags);
287 nilfs_btree_node_set_level(btree, node, level); 251 nilfs_btree_node_set_level(node, level);
288 nilfs_btree_node_set_nchildren(btree, node, nchildren); 252 nilfs_btree_node_set_nchildren(node, nchildren);
289 253
290 dkeys = nilfs_btree_node_dkeys(btree, node); 254 dkeys = nilfs_btree_node_dkeys(node);
291 dptrs = nilfs_btree_node_dptrs(btree, node); 255 dptrs = nilfs_btree_node_dptrs(node, btree);
292 for (i = 0; i < nchildren; i++) { 256 for (i = 0; i < nchildren; i++) {
293 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); 257 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
294 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); 258 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
@@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
305 __le64 *ldptrs, *rdptrs; 269 __le64 *ldptrs, *rdptrs;
306 int lnchildren, rnchildren; 270 int lnchildren, rnchildren;
307 271
308 ldkeys = nilfs_btree_node_dkeys(btree, left); 272 ldkeys = nilfs_btree_node_dkeys(left);
309 ldptrs = nilfs_btree_node_dptrs(btree, left); 273 ldptrs = nilfs_btree_node_dptrs(left, btree);
310 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 274 lnchildren = nilfs_btree_node_get_nchildren(left);
311 275
312 rdkeys = nilfs_btree_node_dkeys(btree, right); 276 rdkeys = nilfs_btree_node_dkeys(right);
313 rdptrs = nilfs_btree_node_dptrs(btree, right); 277 rdptrs = nilfs_btree_node_dptrs(right, btree);
314 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 278 rnchildren = nilfs_btree_node_get_nchildren(right);
315 279
316 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); 280 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
317 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); 281 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
@@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
320 284
321 lnchildren += n; 285 lnchildren += n;
322 rnchildren -= n; 286 rnchildren -= n;
323 nilfs_btree_node_set_nchildren(btree, left, lnchildren); 287 nilfs_btree_node_set_nchildren(left, lnchildren);
324 nilfs_btree_node_set_nchildren(btree, right, rnchildren); 288 nilfs_btree_node_set_nchildren(right, rnchildren);
325} 289}
326 290
327/* Assume that the buffer heads corresponding to left and right are locked. */ 291/* Assume that the buffer heads corresponding to left and right are locked. */
@@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
334 __le64 *ldptrs, *rdptrs; 298 __le64 *ldptrs, *rdptrs;
335 int lnchildren, rnchildren; 299 int lnchildren, rnchildren;
336 300
337 ldkeys = nilfs_btree_node_dkeys(btree, left); 301 ldkeys = nilfs_btree_node_dkeys(left);
338 ldptrs = nilfs_btree_node_dptrs(btree, left); 302 ldptrs = nilfs_btree_node_dptrs(left, btree);
339 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 303 lnchildren = nilfs_btree_node_get_nchildren(left);
340 304
341 rdkeys = nilfs_btree_node_dkeys(btree, right); 305 rdkeys = nilfs_btree_node_dkeys(right);
342 rdptrs = nilfs_btree_node_dptrs(btree, right); 306 rdptrs = nilfs_btree_node_dptrs(right, btree);
343 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 307 rnchildren = nilfs_btree_node_get_nchildren(right);
344 308
345 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); 309 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
346 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); 310 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
@@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
349 313
350 lnchildren -= n; 314 lnchildren -= n;
351 rnchildren += n; 315 rnchildren += n;
352 nilfs_btree_node_set_nchildren(btree, left, lnchildren); 316 nilfs_btree_node_set_nchildren(left, lnchildren);
353 nilfs_btree_node_set_nchildren(btree, right, rnchildren); 317 nilfs_btree_node_set_nchildren(right, rnchildren);
354} 318}
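nilfs_btree_node_move_left()/..._move_right() shift n key/pointer pairs between adjacent siblings and then fix both child counts. Copies into the destination node use memcpy (distinct nodes never overlap); compaction within the source must use memmove because its source and destination ranges do overlap. A sketch of the left-move data motion, with names taken from the hunks above:

    memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));  /* append left */
    memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
    memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));  /* close gap */
    memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));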
355 319
356/* Assume that the buffer head corresponding to node is locked. */ 320/* Assume that the buffer head corresponding to node is locked. */
@@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
362 __le64 *dptrs; 326 __le64 *dptrs;
363 int nchildren; 327 int nchildren;
364 328
365 dkeys = nilfs_btree_node_dkeys(btree, node); 329 dkeys = nilfs_btree_node_dkeys(node);
366 dptrs = nilfs_btree_node_dptrs(btree, node); 330 dptrs = nilfs_btree_node_dptrs(node, btree);
367 nchildren = nilfs_btree_node_get_nchildren(btree, node); 331 nchildren = nilfs_btree_node_get_nchildren(node);
368 if (index < nchildren) { 332 if (index < nchildren) {
369 memmove(dkeys + index + 1, dkeys + index, 333 memmove(dkeys + index + 1, dkeys + index,
370 (nchildren - index) * sizeof(*dkeys)); 334 (nchildren - index) * sizeof(*dkeys));
@@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
374 dkeys[index] = nilfs_bmap_key_to_dkey(key); 338 dkeys[index] = nilfs_bmap_key_to_dkey(key);
375 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); 339 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
376 nchildren++; 340 nchildren++;
377 nilfs_btree_node_set_nchildren(btree, node, nchildren); 341 nilfs_btree_node_set_nchildren(node, nchildren);
378} 342}
379 343
380/* Assume that the buffer head corresponding to node is locked. */ 344/* Assume that the buffer head corresponding to node is locked. */
@@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
388 __le64 *dptrs; 352 __le64 *dptrs;
389 int nchildren; 353 int nchildren;
390 354
391 dkeys = nilfs_btree_node_dkeys(btree, node); 355 dkeys = nilfs_btree_node_dkeys(node);
392 dptrs = nilfs_btree_node_dptrs(btree, node); 356 dptrs = nilfs_btree_node_dptrs(node, btree);
393 key = nilfs_bmap_dkey_to_key(dkeys[index]); 357 key = nilfs_bmap_dkey_to_key(dkeys[index]);
394 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); 358 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
395 nchildren = nilfs_btree_node_get_nchildren(btree, node); 359 nchildren = nilfs_btree_node_get_nchildren(node);
396 if (keyp != NULL) 360 if (keyp != NULL)
397 *keyp = key; 361 *keyp = key;
398 if (ptrp != NULL) 362 if (ptrp != NULL)
@@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
405 (nchildren - index - 1) * sizeof(*dptrs)); 369 (nchildren - index - 1) * sizeof(*dptrs));
406 } 370 }
407 nchildren--; 371 nchildren--;
408 nilfs_btree_node_set_nchildren(btree, node, nchildren); 372 nilfs_btree_node_set_nchildren(node, nchildren);
409} 373}
410 374
411static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, 375static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
412 const struct nilfs_btree_node *node,
413 __u64 key, int *indexp) 376 __u64 key, int *indexp)
414{ 377{
415 __u64 nkey; 378 __u64 nkey;
@@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
417 380
418 /* binary search */ 381 /* binary search */
419 low = 0; 382 low = 0;
420 high = nilfs_btree_node_get_nchildren(btree, node) - 1; 383 high = nilfs_btree_node_get_nchildren(node) - 1;
421 index = 0; 384 index = 0;
422 s = 0; 385 s = 0;
423 while (low <= high) { 386 while (low <= high) {
424 index = (low + high) / 2; 387 index = (low + high) / 2;
425 nkey = nilfs_btree_node_get_key(btree, node, index); 388 nkey = nilfs_btree_node_get_key(node, index);
426 if (nkey == key) { 389 if (nkey == key) {
427 s = 0; 390 s = 0;
428 goto out; 391 goto out;
@@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
436 } 399 }
437 400
438 /* adjust index */ 401 /* adjust index */
439 if (nilfs_btree_node_get_level(btree, node) > 402 if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
440 NILFS_BTREE_LEVEL_NODE_MIN) { 403 if (s > 0 && index > 0)
441 if ((s > 0) && (index > 0))
442 index--; 404 index--;
443 } else if (s < 0) 405 } else if (s < 0)
444 index++; 406 index++;
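nilfs_btree_node_lookup() is a plain binary search over the node's sorted keys, with a final adjustment that differs by level: in an internal node a miss must step back to the child whose range covers the key, while at the leaf level a miss yields the insertion slot. A self-contained sketch of the same logic over a host-order key array (illustrative function, not the on-disk accessors):

    static int node_lookup_sketch(const __u64 *keys, int nchildren,
                                  int internal, __u64 key, int *indexp)
    {
        int low = 0, high = nchildren - 1, index = 0, s = 0;

        while (low <= high) {
            index = (low + high) / 2;
            if (keys[index] == key) {
                s = 0;
                goto out;
            } else if (keys[index] < key) {
                s = -1;
                low = index + 1;
            } else {
                s = 1;
                high = index - 1;
            }
        }
        if (internal) {                 /* descend into the covering child */
            if (s > 0 && index > 0)
                index--;
        } else if (s < 0)               /* leaf: insertion point follows index */
            index++;
    out:
        *indexp = index;
        return s == 0;                  /* nonzero iff the key was found */
    }

The return value feeds the found flag in nilfs_btree_do_lookup(), which keeps descending with index 0 once an exact match has been seen.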
@@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree)
456} 418}
457 419
458static inline struct nilfs_btree_node * 420static inline struct nilfs_btree_node *
459nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, 421nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
460 const struct nilfs_btree_path *path,
461 int level)
462{ 422{
463 return (struct nilfs_btree_node *)path[level].bp_bh->b_data; 423 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
464} 424}
465 425
466static inline struct nilfs_btree_node * 426static inline struct nilfs_btree_node *
467nilfs_btree_get_sib_node(const struct nilfs_btree *btree, 427nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
468 const struct nilfs_btree_path *path,
469 int level)
470{ 428{
471 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; 429 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
472} 430}
473 431
474static inline int nilfs_btree_height(const struct nilfs_btree *btree) 432static inline int nilfs_btree_height(const struct nilfs_btree *btree)
475{ 433{
476 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) 434 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
477 + 1;
478} 435}
479 436
480static inline struct nilfs_btree_node * 437static inline struct nilfs_btree_node *
@@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
484{ 441{
485 return (level == nilfs_btree_height(btree) - 1) ? 442 return (level == nilfs_btree_height(btree) - 1) ?
486 nilfs_btree_get_root(btree) : 443 nilfs_btree_get_root(btree) :
487 nilfs_btree_get_nonroot_node(btree, path, level); 444 nilfs_btree_get_nonroot_node(path, level);
488} 445}
489 446
490static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, 447static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
@@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
496 int level, index, found, ret; 453 int level, index, found, ret;
497 454
498 node = nilfs_btree_get_root(btree); 455 node = nilfs_btree_get_root(btree);
499 level = nilfs_btree_node_get_level(btree, node); 456 level = nilfs_btree_node_get_level(node);
500 if ((level < minlevel) || 457 if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
501 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
502 return -ENOENT; 458 return -ENOENT;
503 459
504 found = nilfs_btree_node_lookup(btree, node, key, &index); 460 found = nilfs_btree_node_lookup(node, key, &index);
505 ptr = nilfs_btree_node_get_ptr(btree, node, index); 461 ptr = nilfs_btree_node_get_ptr(btree, node, index);
506 path[level].bp_bh = NULL; 462 path[level].bp_bh = NULL;
507 path[level].bp_index = index; 463 path[level].bp_index = index;
@@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
510 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 466 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
511 if (ret < 0) 467 if (ret < 0)
512 return ret; 468 return ret;
513 node = nilfs_btree_get_nonroot_node(btree, path, level); 469 node = nilfs_btree_get_nonroot_node(path, level);
514 BUG_ON(level != nilfs_btree_node_get_level(btree, node)); 470 BUG_ON(level != nilfs_btree_node_get_level(node));
515 if (!found) 471 if (!found)
516 found = nilfs_btree_node_lookup(btree, node, key, 472 found = nilfs_btree_node_lookup(node, key, &index);
517 &index);
518 else 473 else
519 index = 0; 474 index = 0;
520 if (index < nilfs_btree_node_nchildren_max(btree, node)) 475 if (index < nilfs_btree_node_nchildren_max(node, btree))
521 ptr = nilfs_btree_node_get_ptr(btree, node, index); 476 ptr = nilfs_btree_node_get_ptr(btree, node, index);
522 else { 477 else {
523 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); 478 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
@@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
544 int index, level, ret; 499 int index, level, ret;
545 500
546 node = nilfs_btree_get_root(btree); 501 node = nilfs_btree_get_root(btree);
547 index = nilfs_btree_node_get_nchildren(btree, node) - 1; 502 index = nilfs_btree_node_get_nchildren(node) - 1;
548 if (index < 0) 503 if (index < 0)
549 return -ENOENT; 504 return -ENOENT;
550 level = nilfs_btree_node_get_level(btree, node); 505 level = nilfs_btree_node_get_level(node);
551 ptr = nilfs_btree_node_get_ptr(btree, node, index); 506 ptr = nilfs_btree_node_get_ptr(btree, node, index);
552 path[level].bp_bh = NULL; 507 path[level].bp_bh = NULL;
553 path[level].bp_index = index; 508 path[level].bp_index = index;
@@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
556 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 511 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
557 if (ret < 0) 512 if (ret < 0)
558 return ret; 513 return ret;
559 node = nilfs_btree_get_nonroot_node(btree, path, level); 514 node = nilfs_btree_get_nonroot_node(path, level);
560 BUG_ON(level != nilfs_btree_node_get_level(btree, node)); 515 BUG_ON(level != nilfs_btree_node_get_level(node));
561 index = nilfs_btree_node_get_nchildren(btree, node) - 1; 516 index = nilfs_btree_node_get_nchildren(node) - 1;
562 ptr = nilfs_btree_node_get_ptr(btree, node, index); 517 ptr = nilfs_btree_node_get_ptr(btree, node, index);
563 path[level].bp_index = index; 518 path[level].bp_index = index;
564 } 519 }
565 520
566 if (keyp != NULL) 521 if (keyp != NULL)
567 *keyp = nilfs_btree_node_get_key(btree, node, index); 522 *keyp = nilfs_btree_node_get_key(node, index);
568 if (ptrp != NULL) 523 if (ptrp != NULL)
569 *ptrp = ptr; 524 *ptrp = ptr;
570 525
@@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
580 int ret; 535 int ret;
581 536
582 btree = (struct nilfs_btree *)bmap; 537 btree = (struct nilfs_btree *)bmap;
583 path = nilfs_btree_alloc_path(btree); 538 path = nilfs_btree_alloc_path();
584 if (path == NULL) 539 if (path == NULL)
585 return -ENOMEM; 540 return -ENOMEM;
586 nilfs_btree_init_path(btree, path); 541 nilfs_btree_init_path(path);
587 542
588 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 543 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
589 544
590 if (ptrp != NULL) 545 if (ptrp != NULL)
591 *ptrp = ptr; 546 *ptrp = ptr;
592 547
593 nilfs_btree_clear_path(btree, path); 548 nilfs_btree_release_path(path);
594 nilfs_btree_free_path(btree, path); 549 nilfs_btree_free_path(path);
595 550
596 return ret; 551 return ret;
597} 552}
@@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
608 int level = NILFS_BTREE_LEVEL_NODE_MIN; 563 int level = NILFS_BTREE_LEVEL_NODE_MIN;
609 int ret, cnt, index, maxlevel; 564 int ret, cnt, index, maxlevel;
610 565
611 path = nilfs_btree_alloc_path(btree); 566 path = nilfs_btree_alloc_path();
612 if (path == NULL) 567 if (path == NULL)
613 return -ENOMEM; 568 return -ENOMEM;
614 nilfs_btree_init_path(btree, path); 569 nilfs_btree_init_path(path);
615 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 570 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
616 if (ret < 0) 571 if (ret < 0)
617 goto out; 572 goto out;
@@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
631 node = nilfs_btree_get_node(btree, path, level); 586 node = nilfs_btree_get_node(btree, path, level);
632 index = path[level].bp_index + 1; 587 index = path[level].bp_index + 1;
633 for (;;) { 588 for (;;) {
634 while (index < nilfs_btree_node_get_nchildren(btree, node)) { 589 while (index < nilfs_btree_node_get_nchildren(node)) {
635 if (nilfs_btree_node_get_key(btree, node, index) != 590 if (nilfs_btree_node_get_key(node, index) !=
636 key + cnt) 591 key + cnt)
637 goto end; 592 goto end;
638 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 593 ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
@@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
653 /* look-up right sibling node */ 608 /* look-up right sibling node */
654 node = nilfs_btree_get_node(btree, path, level + 1); 609 node = nilfs_btree_get_node(btree, path, level + 1);
655 index = path[level + 1].bp_index + 1; 610 index = path[level + 1].bp_index + 1;
656 if (index >= nilfs_btree_node_get_nchildren(btree, node) || 611 if (index >= nilfs_btree_node_get_nchildren(node) ||
657 nilfs_btree_node_get_key(btree, node, index) != key + cnt) 612 nilfs_btree_node_get_key(node, index) != key + cnt)
658 break; 613 break;
659 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 614 ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
660 path[level + 1].bp_index = index; 615 path[level + 1].bp_index = index;
@@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
664 ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); 619 ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
665 if (ret < 0) 620 if (ret < 0)
666 goto out; 621 goto out;
667 node = nilfs_btree_get_nonroot_node(btree, path, level); 622 node = nilfs_btree_get_nonroot_node(path, level);
668 index = 0; 623 index = 0;
669 path[level].bp_index = index; 624 path[level].bp_index = index;
670 } 625 }
@@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
672 *ptrp = ptr; 627 *ptrp = ptr;
673 ret = cnt; 628 ret = cnt;
674 out: 629 out:
675 nilfs_btree_clear_path(btree, path); 630 nilfs_btree_release_path(path);
676 nilfs_btree_free_path(btree, path); 631 nilfs_btree_free_path(path);
677 return ret; 632 return ret;
678} 633}
679 634
@@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
685 do { 640 do {
686 lock_buffer(path[level].bp_bh); 641 lock_buffer(path[level].bp_bh);
687 nilfs_btree_node_set_key( 642 nilfs_btree_node_set_key(
688 btree, 643 nilfs_btree_get_nonroot_node(path, level),
689 nilfs_btree_get_nonroot_node(
690 btree, path, level),
691 path[level].bp_index, key); 644 path[level].bp_index, key);
692 if (!buffer_dirty(path[level].bp_bh)) 645 if (!buffer_dirty(path[level].bp_bh))
693 nilfs_btnode_mark_dirty(path[level].bp_bh); 646 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
698 651
699 /* root */ 652 /* root */
700 if (level == nilfs_btree_height(btree) - 1) { 653 if (level == nilfs_btree_height(btree) - 1) {
701 nilfs_btree_node_set_key(btree, 654 nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
702 nilfs_btree_get_root(btree),
703 path[level].bp_index, key); 655 path[level].bp_index, key);
704 } 656 }
705} 657}
@@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
712 664
713 if (level < nilfs_btree_height(btree) - 1) { 665 if (level < nilfs_btree_height(btree) - 1) {
714 lock_buffer(path[level].bp_bh); 666 lock_buffer(path[level].bp_bh);
715 node = nilfs_btree_get_nonroot_node(btree, path, level); 667 node = nilfs_btree_get_nonroot_node(path, level);
716 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 668 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
717 path[level].bp_index); 669 path[level].bp_index);
718 if (!buffer_dirty(path[level].bp_bh)) 670 if (!buffer_dirty(path[level].bp_bh))
@@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
721 673
722 if (path[level].bp_index == 0) 674 if (path[level].bp_index == 0)
723 nilfs_btree_promote_key(btree, path, level + 1, 675 nilfs_btree_promote_key(btree, path, level + 1,
724 nilfs_btree_node_get_key( 676 nilfs_btree_node_get_key(node,
725 btree, node, 0)); 677 0));
726 } else { 678 } else {
727 node = nilfs_btree_get_root(btree); 679 node = nilfs_btree_get_root(btree);
728 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 680 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
@@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
740 lock_buffer(path[level].bp_bh); 692 lock_buffer(path[level].bp_bh);
741 lock_buffer(path[level].bp_sib_bh); 693 lock_buffer(path[level].bp_sib_bh);
742 694
743 node = nilfs_btree_get_nonroot_node(btree, path, level); 695 node = nilfs_btree_get_nonroot_node(path, level);
744 left = nilfs_btree_get_sib_node(btree, path, level); 696 left = nilfs_btree_get_sib_node(path, level);
745 nchildren = nilfs_btree_node_get_nchildren(btree, node); 697 nchildren = nilfs_btree_node_get_nchildren(node);
746 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 698 lnchildren = nilfs_btree_node_get_nchildren(left);
747 move = 0; 699 move = 0;
748 700
749 n = (nchildren + lnchildren + 1) / 2 - lnchildren; 701 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
764 unlock_buffer(path[level].bp_sib_bh); 716 unlock_buffer(path[level].bp_sib_bh);
765 717
766 nilfs_btree_promote_key(btree, path, level + 1, 718 nilfs_btree_promote_key(btree, path, level + 1,
767 nilfs_btree_node_get_key(btree, node, 0)); 719 nilfs_btree_node_get_key(node, 0));
768 720
769 if (move) { 721 if (move) {
770 brelse(path[level].bp_bh); 722 brelse(path[level].bp_bh);
@@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
791 lock_buffer(path[level].bp_bh); 743 lock_buffer(path[level].bp_bh);
792 lock_buffer(path[level].bp_sib_bh); 744 lock_buffer(path[level].bp_sib_bh);
793 745
794 node = nilfs_btree_get_nonroot_node(btree, path, level); 746 node = nilfs_btree_get_nonroot_node(path, level);
795 right = nilfs_btree_get_sib_node(btree, path, level); 747 right = nilfs_btree_get_sib_node(path, level);
796 nchildren = nilfs_btree_node_get_nchildren(btree, node); 748 nchildren = nilfs_btree_node_get_nchildren(node);
797 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 749 rnchildren = nilfs_btree_node_get_nchildren(right);
798 move = 0; 750 move = 0;
799 751
800 n = (nchildren + rnchildren + 1) / 2 - rnchildren; 752 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
816 768
817 path[level + 1].bp_index++; 769 path[level + 1].bp_index++;
818 nilfs_btree_promote_key(btree, path, level + 1, 770 nilfs_btree_promote_key(btree, path, level + 1,
819 nilfs_btree_node_get_key(btree, right, 0)); 771 nilfs_btree_node_get_key(right, 0));
820 path[level + 1].bp_index--; 772 path[level + 1].bp_index--;
821 773
822 if (move) { 774 if (move) {
823 brelse(path[level].bp_bh); 775 brelse(path[level].bp_bh);
824 path[level].bp_bh = path[level].bp_sib_bh; 776 path[level].bp_bh = path[level].bp_sib_bh;
825 path[level].bp_sib_bh = NULL; 777 path[level].bp_sib_bh = NULL;
826 path[level].bp_index -= 778 path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
827 nilfs_btree_node_get_nchildren(btree, node);
828 path[level + 1].bp_index++; 779 path[level + 1].bp_index++;
829 } else { 780 } else {
830 brelse(path[level].bp_sib_bh); 781 brelse(path[level].bp_sib_bh);
@@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
846 lock_buffer(path[level].bp_bh); 797 lock_buffer(path[level].bp_bh);
847 lock_buffer(path[level].bp_sib_bh); 798 lock_buffer(path[level].bp_sib_bh);
848 799
849 node = nilfs_btree_get_nonroot_node(btree, path, level); 800 node = nilfs_btree_get_nonroot_node(path, level);
850 right = nilfs_btree_get_sib_node(btree, path, level); 801 right = nilfs_btree_get_sib_node(path, level);
851 nchildren = nilfs_btree_node_get_nchildren(btree, node); 802 nchildren = nilfs_btree_node_get_nchildren(node);
852 move = 0; 803 move = 0;
853 804
854 n = (nchildren + 1) / 2; 805 n = (nchildren + 1) / 2;
@@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
867 unlock_buffer(path[level].bp_bh); 818 unlock_buffer(path[level].bp_bh);
868 unlock_buffer(path[level].bp_sib_bh); 819 unlock_buffer(path[level].bp_sib_bh);
869 820
870 newkey = nilfs_btree_node_get_key(btree, right, 0); 821 newkey = nilfs_btree_node_get_key(right, 0);
871 newptr = path[level].bp_newreq.bpr_ptr; 822 newptr = path[level].bp_newreq.bpr_ptr;
872 823
873 if (move) { 824 if (move) {
874 path[level].bp_index -= 825 path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
875 nilfs_btree_node_get_nchildren(btree, node);
876 nilfs_btree_node_insert(btree, right, *keyp, *ptrp, 826 nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
877 path[level].bp_index); 827 path[level].bp_index);
878 828
879 *keyp = nilfs_btree_node_get_key(btree, right, 0); 829 *keyp = nilfs_btree_node_get_key(right, 0);
880 *ptrp = path[level].bp_newreq.bpr_ptr; 830 *ptrp = path[level].bp_newreq.bpr_ptr;
881 831
882 brelse(path[level].bp_bh); 832 brelse(path[level].bp_bh);
@@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
885 } else { 835 } else {
886 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 836 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
887 837
888 *keyp = nilfs_btree_node_get_key(btree, right, 0); 838 *keyp = nilfs_btree_node_get_key(right, 0);
889 *ptrp = path[level].bp_newreq.bpr_ptr; 839 *ptrp = path[level].bp_newreq.bpr_ptr;
890 840
891 brelse(path[level].bp_sib_bh); 841 brelse(path[level].bp_sib_bh);
@@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
905 lock_buffer(path[level].bp_sib_bh); 855 lock_buffer(path[level].bp_sib_bh);
906 856
907 root = nilfs_btree_get_root(btree); 857 root = nilfs_btree_get_root(btree);
908 child = nilfs_btree_get_sib_node(btree, path, level); 858 child = nilfs_btree_get_sib_node(path, level);
909 859
910 n = nilfs_btree_node_get_nchildren(btree, root); 860 n = nilfs_btree_node_get_nchildren(root);
911 861
912 nilfs_btree_node_move_right(btree, root, child, n); 862 nilfs_btree_node_move_right(btree, root, child, n);
913 nilfs_btree_node_set_level(btree, root, level + 1); 863 nilfs_btree_node_set_level(root, level + 1);
914 864
915 if (!buffer_dirty(path[level].bp_sib_bh)) 865 if (!buffer_dirty(path[level].bp_sib_bh))
916 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 866 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
922 872
923 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 873 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
924 874
925 *keyp = nilfs_btree_node_get_key(btree, child, 0); 875 *keyp = nilfs_btree_node_get_key(child, 0);
926 *ptrp = path[level].bp_newreq.bpr_ptr; 876 *ptrp = path[level].bp_newreq.bpr_ptr;
927} 877}
928 878
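nilfs_btree_grow() raises the tree height without relocating the root: the root node lives inside the bmap area of the inode rather than in its own disk block, so it cannot simply be swapped for a new node. Instead, all of its entries are pushed down into a freshly allocated child and the root's level is bumped, as the core of the hunk above shows:

    n = nilfs_btree_node_get_nchildren(root);
    nilfs_btree_node_move_right(btree, root, child, n);    /* empty the root */
    nilfs_btree_node_set_level(root, level + 1);           /* height grows by one */

nilfs_btree_shrink() later in this diff is the inverse, pulling an only child's entries back into the root when deletions make the extra level unnecessary.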
@@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
990 struct nilfs_btree_node *node, *parent, *sib; 940 struct nilfs_btree_node *node, *parent, *sib;
991 __u64 sibptr; 941 __u64 sibptr;
992 int pindex, level, ret; 942 int pindex, level, ret;
943 struct inode *dat = NULL;
993 944
994 stats->bs_nblocks = 0; 945 stats->bs_nblocks = 0;
995 level = NILFS_BTREE_LEVEL_DATA; 946 level = NILFS_BTREE_LEVEL_DATA;
996 947
997 /* allocate a new ptr for data block */ 948 /* allocate a new ptr for data block */
998 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) 949 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
999 path[level].bp_newreq.bpr_ptr = 950 path[level].bp_newreq.bpr_ptr =
1000 nilfs_btree_find_target_v(btree, path, key); 951 nilfs_btree_find_target_v(btree, path, key);
952 dat = nilfs_bmap_get_dat(&btree->bt_bmap);
953 }
1001 954
1002 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 955 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
1003 &path[level].bp_newreq); 956 &path[level].bp_newreq, dat);
1004 if (ret < 0) 957 if (ret < 0)
1005 goto err_out_data; 958 goto err_out_data;
1006 959
1007 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 960 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1008 level < nilfs_btree_height(btree) - 1; 961 level < nilfs_btree_height(btree) - 1;
1009 level++) { 962 level++) {
1010 node = nilfs_btree_get_nonroot_node(btree, path, level); 963 node = nilfs_btree_get_nonroot_node(path, level);
1011 if (nilfs_btree_node_get_nchildren(btree, node) < 964 if (nilfs_btree_node_get_nchildren(node) <
1012 nilfs_btree_node_nchildren_max(btree, node)) { 965 nilfs_btree_node_nchildren_max(node, btree)) {
1013 path[level].bp_op = nilfs_btree_do_insert; 966 path[level].bp_op = nilfs_btree_do_insert;
1014 stats->bs_nblocks++; 967 stats->bs_nblocks++;
1015 goto out; 968 goto out;
@@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1026 if (ret < 0) 979 if (ret < 0)
1027 goto err_out_child_node; 980 goto err_out_child_node;
1028 sib = (struct nilfs_btree_node *)bh->b_data; 981 sib = (struct nilfs_btree_node *)bh->b_data;
1029 if (nilfs_btree_node_get_nchildren(btree, sib) < 982 if (nilfs_btree_node_get_nchildren(sib) <
1030 nilfs_btree_node_nchildren_max(btree, sib)) { 983 nilfs_btree_node_nchildren_max(sib, btree)) {
1031 path[level].bp_sib_bh = bh; 984 path[level].bp_sib_bh = bh;
1032 path[level].bp_op = nilfs_btree_carry_left; 985 path[level].bp_op = nilfs_btree_carry_left;
1033 stats->bs_nblocks++; 986 stats->bs_nblocks++;
@@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1038 991
1039 /* right sibling */ 992 /* right sibling */
1040 if (pindex < 993 if (pindex <
1041 nilfs_btree_node_get_nchildren(btree, parent) - 1) { 994 nilfs_btree_node_get_nchildren(parent) - 1) {
1042 sibptr = nilfs_btree_node_get_ptr(btree, parent, 995 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1043 pindex + 1); 996 pindex + 1);
1044 ret = nilfs_btree_get_block(btree, sibptr, &bh); 997 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1045 if (ret < 0) 998 if (ret < 0)
1046 goto err_out_child_node; 999 goto err_out_child_node;
1047 sib = (struct nilfs_btree_node *)bh->b_data; 1000 sib = (struct nilfs_btree_node *)bh->b_data;
1048 if (nilfs_btree_node_get_nchildren(btree, sib) < 1001 if (nilfs_btree_node_get_nchildren(sib) <
1049 nilfs_btree_node_nchildren_max(btree, sib)) { 1002 nilfs_btree_node_nchildren_max(sib, btree)) {
1050 path[level].bp_sib_bh = bh; 1003 path[level].bp_sib_bh = bh;
1051 path[level].bp_op = nilfs_btree_carry_right; 1004 path[level].bp_op = nilfs_btree_carry_right;
1052 stats->bs_nblocks++; 1005 stats->bs_nblocks++;
@@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1059 path[level].bp_newreq.bpr_ptr = 1012 path[level].bp_newreq.bpr_ptr =
1060 path[level - 1].bp_newreq.bpr_ptr + 1; 1013 path[level - 1].bp_newreq.bpr_ptr + 1;
1061 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1014 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
1062 &path[level].bp_newreq); 1015 &path[level].bp_newreq, dat);
1063 if (ret < 0) 1016 if (ret < 0)
1064 goto err_out_child_node; 1017 goto err_out_child_node;
1065 ret = nilfs_btree_get_new_block(btree, 1018 ret = nilfs_btree_get_new_block(btree,
@@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1081 1034
1082 /* root */ 1035 /* root */
1083 node = nilfs_btree_get_root(btree); 1036 node = nilfs_btree_get_root(btree);
1084 if (nilfs_btree_node_get_nchildren(btree, node) < 1037 if (nilfs_btree_node_get_nchildren(node) <
1085 nilfs_btree_node_nchildren_max(btree, node)) { 1038 nilfs_btree_node_nchildren_max(node, btree)) {
1086 path[level].bp_op = nilfs_btree_do_insert; 1039 path[level].bp_op = nilfs_btree_do_insert;
1087 stats->bs_nblocks++; 1040 stats->bs_nblocks++;
1088 goto out; 1041 goto out;
@@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1091 /* grow */ 1044 /* grow */
1092 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; 1045 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
1093 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1046 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
1094 &path[level].bp_newreq); 1047 &path[level].bp_newreq, dat);
1095 if (ret < 0) 1048 if (ret < 0)
1096 goto err_out_child_node; 1049 goto err_out_child_node;
1097 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, 1050 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1119 1072
1120 /* error */ 1073 /* error */
1121 err_out_curr_node: 1074 err_out_curr_node:
1122 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); 1075 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
1076 dat);
1123 err_out_child_node: 1077 err_out_child_node:
1124 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { 1078 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1125 nilfs_btnode_delete(path[level].bp_sib_bh); 1079 nilfs_btnode_delete(path[level].bp_sib_bh);
1126 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, 1080 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
1127 &path[level].bp_newreq); 1081 &path[level].bp_newreq, dat);
1128 1082
1129 } 1083 }
1130 1084
1131 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); 1085 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
1086 dat);
1132 err_out_data: 1087 err_out_data:
1133 *levelp = level; 1088 *levelp = level;
1134 stats->bs_nblocks = 0; 1089 stats->bs_nblocks = 0;
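nilfs_btree_prepare_insert() walks from the leaf toward the root, choosing the cheapest rebalancing strategy per level and recording it in path[level].bp_op for the commit phase; stats->bs_nblocks counts the blocks the operation will dirty, and any failure unwinds every reserved pointer through nilfs_bmap_abort_alloc_ptr(). The per-level decision, with illustrative predicates standing in for the nchildren comparisons in the hunks above:

    if (node_has_room(node))
        path[level].bp_op = nilfs_btree_do_insert;      /* in-place insert */
    else if (left_sib_has_room(sib))
        path[level].bp_op = nilfs_btree_carry_left;     /* shed entries left */
    else if (right_sib_has_room(sib))
        path[level].bp_op = nilfs_btree_carry_right;
    else
        path[level].bp_op = nilfs_btree_split;          /* new node block */

Only split (and the final grow at the root) needs a fresh node block, which is why those branches reserve one more pointer in sequence (bp_newreq.bpr_ptr + 1) as the loop ascends.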
@@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1139 struct nilfs_btree_path *path, 1094 struct nilfs_btree_path *path,
1140 int maxlevel, __u64 key, __u64 ptr) 1095 int maxlevel, __u64 key, __u64 ptr)
1141{ 1096{
1097 struct inode *dat = NULL;
1142 int level; 1098 int level;
1143 1099
1144 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1100 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1145 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; 1101 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1146 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) 1102 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
1147 nilfs_btree_set_target_v(btree, key, ptr); 1103 nilfs_btree_set_target_v(btree, key, ptr);
1104 dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1105 }
1148 1106
1149 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1107 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1150 nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, 1108 nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
1151 &path[level - 1].bp_newreq); 1109 &path[level - 1].bp_newreq, dat);
1152 path[level].bp_op(btree, path, level, &key, &ptr); 1110 path[level].bp_op(btree, path, level, &key, &ptr);
1153 } 1111 }
1154 1112
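The recurring change through these hunks is the new struct inode *dat parameter: the DAT inode is now resolved once per top-level operation — nilfs_bmap_get_dat() when the bmap uses virtual block numbers (NILFS_BMAP_USE_VBN), NULL for direct block numbers — and threaded into the prepare/commit/abort pointer helpers instead of each helper re-deriving it. The lifecycle, as a sketch in which do_update() and req are illustrative stand-ins (req corresponds to &path[level].bp_newreq):

    struct inode *dat = NULL;

    if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
        dat = nilfs_bmap_get_dat(&btree->bt_bmap);

    err = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, req, dat);
    if (err)
        return err;
    if (do_update() < 0) {
        nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, req, dat);   /* undo */
        return -EIO;
    }
    nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, req, dat);      /* finalize */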
@@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1164 int level, ret; 1122 int level, ret;
1165 1123
1166 btree = (struct nilfs_btree *)bmap; 1124 btree = (struct nilfs_btree *)bmap;
1167 path = nilfs_btree_alloc_path(btree); 1125 path = nilfs_btree_alloc_path();
1168 if (path == NULL) 1126 if (path == NULL)
1169 return -ENOMEM; 1127 return -ENOMEM;
1170 nilfs_btree_init_path(btree, path); 1128 nilfs_btree_init_path(path);
1171 1129
1172 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1130 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1173 NILFS_BTREE_LEVEL_NODE_MIN); 1131 NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1184 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1142 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1185 1143
1186 out: 1144 out:
1187 nilfs_btree_clear_path(btree, path); 1145 nilfs_btree_release_path(path);
1188 nilfs_btree_free_path(btree, path); 1146 nilfs_btree_free_path(path);
1189 return ret; 1147 return ret;
1190} 1148}
1191 1149
@@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1197 1155
1198 if (level < nilfs_btree_height(btree) - 1) { 1156 if (level < nilfs_btree_height(btree) - 1) {
1199 lock_buffer(path[level].bp_bh); 1157 lock_buffer(path[level].bp_bh);
1200 node = nilfs_btree_get_nonroot_node(btree, path, level); 1158 node = nilfs_btree_get_nonroot_node(path, level);
1201 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1159 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1202 path[level].bp_index); 1160 path[level].bp_index);
1203 if (!buffer_dirty(path[level].bp_bh)) 1161 if (!buffer_dirty(path[level].bp_bh))
@@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1205 unlock_buffer(path[level].bp_bh); 1163 unlock_buffer(path[level].bp_bh);
1206 if (path[level].bp_index == 0) 1164 if (path[level].bp_index == 0)
1207 nilfs_btree_promote_key(btree, path, level + 1, 1165 nilfs_btree_promote_key(btree, path, level + 1,
1208 nilfs_btree_node_get_key(btree, node, 0)); 1166 nilfs_btree_node_get_key(node, 0));
1209 } else { 1167 } else {
1210 node = nilfs_btree_get_root(btree); 1168 node = nilfs_btree_get_root(btree);
1211 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1169 nilfs_btree_node_delete(btree, node, keyp, ptrp,
@@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1225 lock_buffer(path[level].bp_bh); 1183 lock_buffer(path[level].bp_bh);
1226 lock_buffer(path[level].bp_sib_bh); 1184 lock_buffer(path[level].bp_sib_bh);
1227 1185
1228 node = nilfs_btree_get_nonroot_node(btree, path, level); 1186 node = nilfs_btree_get_nonroot_node(path, level);
1229 left = nilfs_btree_get_sib_node(btree, path, level); 1187 left = nilfs_btree_get_sib_node(path, level);
1230 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1188 nchildren = nilfs_btree_node_get_nchildren(node);
1231 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 1189 lnchildren = nilfs_btree_node_get_nchildren(left);
1232 1190
1233 n = (nchildren + lnchildren) / 2 - nchildren; 1191 n = (nchildren + lnchildren) / 2 - nchildren;
1234 1192
@@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1243 unlock_buffer(path[level].bp_sib_bh); 1201 unlock_buffer(path[level].bp_sib_bh);
1244 1202
1245 nilfs_btree_promote_key(btree, path, level + 1, 1203 nilfs_btree_promote_key(btree, path, level + 1,
1246 nilfs_btree_node_get_key(btree, node, 0)); 1204 nilfs_btree_node_get_key(node, 0));
1247 1205
1248 brelse(path[level].bp_sib_bh); 1206 brelse(path[level].bp_sib_bh);
1249 path[level].bp_sib_bh = NULL; 1207 path[level].bp_sib_bh = NULL;
@@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1262 lock_buffer(path[level].bp_bh); 1220 lock_buffer(path[level].bp_bh);
1263 lock_buffer(path[level].bp_sib_bh); 1221 lock_buffer(path[level].bp_sib_bh);
1264 1222
1265 node = nilfs_btree_get_nonroot_node(btree, path, level); 1223 node = nilfs_btree_get_nonroot_node(path, level);
1266 right = nilfs_btree_get_sib_node(btree, path, level); 1224 right = nilfs_btree_get_sib_node(path, level);
1267 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1225 nchildren = nilfs_btree_node_get_nchildren(node);
1268 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 1226 rnchildren = nilfs_btree_node_get_nchildren(right);
1269 1227
1270 n = (nchildren + rnchildren) / 2 - nchildren; 1228 n = (nchildren + rnchildren) / 2 - nchildren;
1271 1229
@@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1281 1239
1282 path[level + 1].bp_index++; 1240 path[level + 1].bp_index++;
1283 nilfs_btree_promote_key(btree, path, level + 1, 1241 nilfs_btree_promote_key(btree, path, level + 1,
1284 nilfs_btree_node_get_key(btree, right, 0)); 1242 nilfs_btree_node_get_key(right, 0));
1285 path[level + 1].bp_index--; 1243 path[level + 1].bp_index--;
1286 1244
1287 brelse(path[level].bp_sib_bh); 1245 brelse(path[level].bp_sib_bh);
@@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1300 lock_buffer(path[level].bp_bh); 1258 lock_buffer(path[level].bp_bh);
1301 lock_buffer(path[level].bp_sib_bh); 1259 lock_buffer(path[level].bp_sib_bh);
1302 1260
1303 node = nilfs_btree_get_nonroot_node(btree, path, level); 1261 node = nilfs_btree_get_nonroot_node(path, level);
1304 left = nilfs_btree_get_sib_node(btree, path, level); 1262 left = nilfs_btree_get_sib_node(path, level);
1305 1263
1306 n = nilfs_btree_node_get_nchildren(btree, node); 1264 n = nilfs_btree_node_get_nchildren(node);
1307 1265
1308 nilfs_btree_node_move_left(btree, left, node, n); 1266 nilfs_btree_node_move_left(btree, left, node, n);
1309 1267
@@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1316 nilfs_btnode_delete(path[level].bp_bh); 1274 nilfs_btnode_delete(path[level].bp_bh);
1317 path[level].bp_bh = path[level].bp_sib_bh; 1275 path[level].bp_bh = path[level].bp_sib_bh;
1318 path[level].bp_sib_bh = NULL; 1276 path[level].bp_sib_bh = NULL;
1319 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); 1277 path[level].bp_index += nilfs_btree_node_get_nchildren(left);
1320} 1278}
1321 1279
1322static void nilfs_btree_concat_right(struct nilfs_btree *btree, 1280static void nilfs_btree_concat_right(struct nilfs_btree *btree,
@@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1331 lock_buffer(path[level].bp_bh); 1289 lock_buffer(path[level].bp_bh);
1332 lock_buffer(path[level].bp_sib_bh); 1290 lock_buffer(path[level].bp_sib_bh);
1333 1291
1334 node = nilfs_btree_get_nonroot_node(btree, path, level); 1292 node = nilfs_btree_get_nonroot_node(path, level);
1335 right = nilfs_btree_get_sib_node(btree, path, level); 1293 right = nilfs_btree_get_sib_node(path, level);
1336 1294
1337 n = nilfs_btree_node_get_nchildren(btree, right); 1295 n = nilfs_btree_node_get_nchildren(right);
1338 1296
1339 nilfs_btree_node_move_left(btree, node, right, n); 1297 nilfs_btree_node_move_left(btree, node, right, n);
1340 1298
@@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1360 1318
1361 lock_buffer(path[level].bp_bh); 1319 lock_buffer(path[level].bp_bh);
1362 root = nilfs_btree_get_root(btree); 1320 root = nilfs_btree_get_root(btree);
1363 child = nilfs_btree_get_nonroot_node(btree, path, level); 1321 child = nilfs_btree_get_nonroot_node(path, level);
1364 1322
1365 nilfs_btree_node_delete(btree, root, NULL, NULL, 0); 1323 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1366 nilfs_btree_node_set_level(btree, root, level); 1324 nilfs_btree_node_set_level(root, level);
1367 n = nilfs_btree_node_get_nchildren(btree, child); 1325 n = nilfs_btree_node_get_nchildren(child);
1368 nilfs_btree_node_move_left(btree, root, child, n); 1326 nilfs_btree_node_move_left(btree, root, child, n);
1369 unlock_buffer(path[level].bp_bh); 1327 unlock_buffer(path[level].bp_bh);
1370 1328
@@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1376static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, 1334static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1377 struct nilfs_btree_path *path, 1335 struct nilfs_btree_path *path,
1378 int *levelp, 1336 int *levelp,
1379 struct nilfs_bmap_stats *stats) 1337 struct nilfs_bmap_stats *stats,
1338 struct inode *dat)
1380{ 1339{
1381 struct buffer_head *bh; 1340 struct buffer_head *bh;
1382 struct nilfs_btree_node *node, *parent, *sib; 1341 struct nilfs_btree_node *node, *parent, *sib;
@@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1388 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1347 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1389 level < nilfs_btree_height(btree) - 1; 1348 level < nilfs_btree_height(btree) - 1;
1390 level++) { 1349 level++) {
1391 node = nilfs_btree_get_nonroot_node(btree, path, level); 1350 node = nilfs_btree_get_nonroot_node(path, level);
1392 path[level].bp_oldreq.bpr_ptr = 1351 path[level].bp_oldreq.bpr_ptr =
1393 nilfs_btree_node_get_ptr(btree, node, 1352 nilfs_btree_node_get_ptr(btree, node,
1394 path[level].bp_index); 1353 path[level].bp_index);
1395 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1354 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
1396 &path[level].bp_oldreq); 1355 &path[level].bp_oldreq, dat);
1397 if (ret < 0) 1356 if (ret < 0)
1398 goto err_out_child_node; 1357 goto err_out_child_node;
1399 1358
1400 if (nilfs_btree_node_get_nchildren(btree, node) > 1359 if (nilfs_btree_node_get_nchildren(node) >
1401 nilfs_btree_node_nchildren_min(btree, node)) { 1360 nilfs_btree_node_nchildren_min(node, btree)) {
1402 path[level].bp_op = nilfs_btree_do_delete; 1361 path[level].bp_op = nilfs_btree_do_delete;
1403 stats->bs_nblocks++; 1362 stats->bs_nblocks++;
1404 goto out; 1363 goto out;
@@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1415 if (ret < 0) 1374 if (ret < 0)
1416 goto err_out_curr_node; 1375 goto err_out_curr_node;
1417 sib = (struct nilfs_btree_node *)bh->b_data; 1376 sib = (struct nilfs_btree_node *)bh->b_data;
1418 if (nilfs_btree_node_get_nchildren(btree, sib) > 1377 if (nilfs_btree_node_get_nchildren(sib) >
1419 nilfs_btree_node_nchildren_min(btree, sib)) { 1378 nilfs_btree_node_nchildren_min(sib, btree)) {
1420 path[level].bp_sib_bh = bh; 1379 path[level].bp_sib_bh = bh;
1421 path[level].bp_op = nilfs_btree_borrow_left; 1380 path[level].bp_op = nilfs_btree_borrow_left;
1422 stats->bs_nblocks++; 1381 stats->bs_nblocks++;
@@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1428 /* continue; */ 1387 /* continue; */
1429 } 1388 }
1430 } else if (pindex < 1389 } else if (pindex <
1431 nilfs_btree_node_get_nchildren(btree, parent) - 1) { 1390 nilfs_btree_node_get_nchildren(parent) - 1) {
1432 /* right sibling */ 1391 /* right sibling */
1433 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1392 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1434 pindex + 1); 1393 pindex + 1);
@@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1436 if (ret < 0) 1395 if (ret < 0)
1437 goto err_out_curr_node; 1396 goto err_out_curr_node;
1438 sib = (struct nilfs_btree_node *)bh->b_data; 1397 sib = (struct nilfs_btree_node *)bh->b_data;
1439 if (nilfs_btree_node_get_nchildren(btree, sib) > 1398 if (nilfs_btree_node_get_nchildren(sib) >
1440 nilfs_btree_node_nchildren_min(btree, sib)) { 1399 nilfs_btree_node_nchildren_min(sib, btree)) {
1441 path[level].bp_sib_bh = bh; 1400 path[level].bp_sib_bh = bh;
1442 path[level].bp_op = nilfs_btree_borrow_right; 1401 path[level].bp_op = nilfs_btree_borrow_right;
1443 stats->bs_nblocks++; 1402 stats->bs_nblocks++;
@@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1452 /* no siblings */ 1411 /* no siblings */
1453 /* the only child of the root node */ 1412 /* the only child of the root node */
1454 WARN_ON(level != nilfs_btree_height(btree) - 2); 1413 WARN_ON(level != nilfs_btree_height(btree) - 2);
1455 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= 1414 if (nilfs_btree_node_get_nchildren(node) - 1 <=
1456 NILFS_BTREE_ROOT_NCHILDREN_MAX) { 1415 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1457 path[level].bp_op = nilfs_btree_shrink; 1416 path[level].bp_op = nilfs_btree_shrink;
1458 stats->bs_nblocks += 2; 1417 stats->bs_nblocks += 2;
@@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1471 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); 1430 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1472 1431
1473 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1432 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
1474 &path[level].bp_oldreq); 1433 &path[level].bp_oldreq, dat);
1475 if (ret < 0) 1434 if (ret < 0)
1476 goto err_out_child_node; 1435 goto err_out_child_node;
1477 1436
@@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1486 1445
1487 /* error */ 1446 /* error */
1488 err_out_curr_node: 1447 err_out_curr_node:
1489 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); 1448 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
1490 err_out_child_node: 1449 err_out_child_node:
1491 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { 1450 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1492 brelse(path[level].bp_sib_bh); 1451 brelse(path[level].bp_sib_bh);
1493 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, 1452 nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
1494 &path[level].bp_oldreq); 1453 &path[level].bp_oldreq, dat);
1495 } 1454 }
1496 *levelp = level; 1455 *levelp = level;
1497 stats->bs_nblocks = 0; 1456 stats->bs_nblocks = 0;
@@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1500 1459
1501static void nilfs_btree_commit_delete(struct nilfs_btree *btree, 1460static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1502 struct nilfs_btree_path *path, 1461 struct nilfs_btree_path *path,
1503 int maxlevel) 1462 int maxlevel, struct inode *dat)
1504{ 1463{
1505 int level; 1464 int level;
1506 1465
1507 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1466 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1508 nilfs_bmap_commit_end_ptr(&btree->bt_bmap, 1467 nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
1509 &path[level].bp_oldreq); 1468 &path[level].bp_oldreq, dat);
1510 path[level].bp_op(btree, path, level, NULL, NULL); 1469 path[level].bp_op(btree, path, level, NULL, NULL);
1511 } 1470 }
1512 1471
@@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1520 struct nilfs_btree *btree; 1479 struct nilfs_btree *btree;
1521 struct nilfs_btree_path *path; 1480 struct nilfs_btree_path *path;
1522 struct nilfs_bmap_stats stats; 1481 struct nilfs_bmap_stats stats;
1482 struct inode *dat;
1523 int level, ret; 1483 int level, ret;
1524 1484
1525 btree = (struct nilfs_btree *)bmap; 1485 btree = (struct nilfs_btree *)bmap;
1526 path = nilfs_btree_alloc_path(btree); 1486 path = nilfs_btree_alloc_path();
1527 if (path == NULL) 1487 if (path == NULL)
1528 return -ENOMEM; 1488 return -ENOMEM;
1529 nilfs_btree_init_path(btree, path); 1489 nilfs_btree_init_path(path);
1530 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1490 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1531 NILFS_BTREE_LEVEL_NODE_MIN); 1491 NILFS_BTREE_LEVEL_NODE_MIN);
1532 if (ret < 0) 1492 if (ret < 0)
1533 goto out; 1493 goto out;
1534 1494
1535 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); 1495
1496 dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
1497 nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
1498
1499 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
1536 if (ret < 0) 1500 if (ret < 0)
1537 goto out; 1501 goto out;
1538 nilfs_btree_commit_delete(btree, path, level); 1502 nilfs_btree_commit_delete(btree, path, level, dat);
1539 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1503 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1540 1504
1541out: 1505out:
1542 nilfs_btree_clear_path(btree, path); 1506 nilfs_btree_release_path(path);
1543 nilfs_btree_free_path(btree, path); 1507 nilfs_btree_free_path(path);
1544 return ret; 1508 return ret;
1545} 1509}
1546 1510
@@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1551 int ret; 1515 int ret;
1552 1516
1553 btree = (struct nilfs_btree *)bmap; 1517 btree = (struct nilfs_btree *)bmap;
1554 path = nilfs_btree_alloc_path(btree); 1518 path = nilfs_btree_alloc_path();
1555 if (path == NULL) 1519 if (path == NULL)
1556 return -ENOMEM; 1520 return -ENOMEM;
1557 nilfs_btree_init_path(btree, path); 1521 nilfs_btree_init_path(path);
1558 1522
1559 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1523 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1560 1524
1561 nilfs_btree_clear_path(btree, path); 1525 nilfs_btree_release_path(path);
1562 nilfs_btree_free_path(btree, path); 1526 nilfs_btree_free_path(path);
1563 1527
1564 return ret; 1528 return ret;
1565} 1529}
@@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1581 node = root; 1545 node = root;
1582 break; 1546 break;
1583 case 3: 1547 case 3:
1584 nchildren = nilfs_btree_node_get_nchildren(btree, root); 1548 nchildren = nilfs_btree_node_get_nchildren(root);
1585 if (nchildren > 1) 1549 if (nchildren > 1)
1586 return 0; 1550 return 0;
1587 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1551 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
@@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1594 return 0; 1558 return 0;
1595 } 1559 }
1596 1560
1597 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1561 nchildren = nilfs_btree_node_get_nchildren(node);
1598 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); 1562 maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
1599 nextmaxkey = (nchildren > 1) ? 1563 nextmaxkey = (nchildren > 1) ?
1600 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; 1564 nilfs_btree_node_get_key(node, nchildren - 2) : 0;
1601 if (bh != NULL) 1565 if (bh != NULL)
1602 brelse(bh); 1566 brelse(bh);
1603 1567
@@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1623 node = root; 1587 node = root;
1624 break; 1588 break;
1625 case 3: 1589 case 3:
1626 nchildren = nilfs_btree_node_get_nchildren(btree, root); 1590 nchildren = nilfs_btree_node_get_nchildren(root);
1627 WARN_ON(nchildren > 1); 1591 WARN_ON(nchildren > 1);
1628 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1592 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1629 ret = nilfs_btree_get_block(btree, ptr, &bh); 1593 ret = nilfs_btree_get_block(btree, ptr, &bh);
@@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1636 return -EINVAL; 1600 return -EINVAL;
1637 } 1601 }
1638 1602
1639 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1603 nchildren = nilfs_btree_node_get_nchildren(node);
1640 if (nchildren < nitems) 1604 if (nchildren < nitems)
1641 nitems = nchildren; 1605 nitems = nchildren;
1642 dkeys = nilfs_btree_node_dkeys(btree, node); 1606 dkeys = nilfs_btree_node_dkeys(node);
1643 dptrs = nilfs_btree_node_dptrs(btree, node); 1607 dptrs = nilfs_btree_node_dptrs(node, btree);
1644 for (i = 0; i < nitems; i++) { 1608 for (i = 0; i < nitems; i++) {
1645 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); 1609 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1646 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); 1610 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
@@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1660 struct nilfs_bmap_stats *stats) 1624 struct nilfs_bmap_stats *stats)
1661{ 1625{
1662 struct buffer_head *bh; 1626 struct buffer_head *bh;
1663 struct nilfs_btree *btree; 1627 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1628 struct inode *dat = NULL;
1664 int ret; 1629 int ret;
1665 1630
1666 btree = (struct nilfs_btree *)bmap;
1667 stats->bs_nblocks = 0; 1631 stats->bs_nblocks = 0;
1668 1632
1669 /* for data */ 1633 /* for data */
1670 /* cannot find near ptr */ 1634 /* cannot find near ptr */
1671 if (NILFS_BMAP_USE_VBN(bmap)) 1635 if (NILFS_BMAP_USE_VBN(bmap)) {
1672 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); 1636 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
1637 dat = nilfs_bmap_get_dat(bmap);
1638 }
1673 1639
1674 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); 1640 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
1675 if (ret < 0) 1641 if (ret < 0)
1676 return ret; 1642 return ret;
1677 1643
@@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1679 stats->bs_nblocks++; 1645 stats->bs_nblocks++;
1680 if (nreq != NULL) { 1646 if (nreq != NULL) {
1681 nreq->bpr_ptr = dreq->bpr_ptr + 1; 1647 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1682 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); 1648 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
1683 if (ret < 0) 1649 if (ret < 0)
1684 goto err_out_dreq; 1650 goto err_out_dreq;
1685 1651
@@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1696 1662
1697 /* error */ 1663 /* error */
1698 err_out_nreq: 1664 err_out_nreq:
1699 nilfs_bmap_abort_alloc_ptr(bmap, nreq); 1665 nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
1700 err_out_dreq: 1666 err_out_dreq:
1701 nilfs_bmap_abort_alloc_ptr(bmap, dreq); 1667 nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
1702 stats->bs_nblocks = 0; 1668 stats->bs_nblocks = 0;
1703 return ret; 1669 return ret;
1704 1670
@@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1713 union nilfs_bmap_ptr_req *nreq, 1679 union nilfs_bmap_ptr_req *nreq,
1714 struct buffer_head *bh) 1680 struct buffer_head *bh)
1715{ 1681{
1716 struct nilfs_btree *btree; 1682 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1717 struct nilfs_btree_node *node; 1683 struct nilfs_btree_node *node;
1684 struct inode *dat;
1718 __u64 tmpptr; 1685 __u64 tmpptr;
1719 1686
1720 /* free resources */ 1687 /* free resources */
@@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1725 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1692 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1726 1693
1727 /* convert and insert */ 1694 /* convert and insert */
1728 btree = (struct nilfs_btree *)bmap; 1695 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
1729 nilfs_btree_init(bmap); 1696 nilfs_btree_init(bmap);
1730 if (nreq != NULL) { 1697 if (nreq != NULL) {
1731 nilfs_bmap_commit_alloc_ptr(bmap, dreq); 1698 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
1732 nilfs_bmap_commit_alloc_ptr(bmap, nreq); 1699 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
1733 1700
1734 /* create child node at level 1 */ 1701 /* create child node at level 1 */
1735 lock_buffer(bh); 1702 lock_buffer(bh);
@@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1751 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1718 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1752 2, 1, &keys[0], &tmpptr); 1719 2, 1, &keys[0], &tmpptr);
1753 } else { 1720 } else {
1754 nilfs_bmap_commit_alloc_ptr(bmap, dreq); 1721 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
1755 1722
1756 /* create root node at level 1 */ 1723 /* create root node at level 1 */
1757 node = nilfs_btree_get_root(btree); 1724 node = nilfs_btree_get_root(btree);
@@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1822 1789
1823static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, 1790static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1824 struct nilfs_btree_path *path, 1791 struct nilfs_btree_path *path,
1825 int level) 1792 int level, struct inode *dat)
1826{ 1793{
1827 struct nilfs_btree_node *parent; 1794 struct nilfs_btree_node *parent;
1828 int ret; 1795 int ret;
@@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1832 nilfs_btree_node_get_ptr(btree, parent, 1799 nilfs_btree_node_get_ptr(btree, parent,
1833 path[level + 1].bp_index); 1800 path[level + 1].bp_index);
1834 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; 1801 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1835 ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, 1802 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
1836 &path[level].bp_oldreq, 1803 &path[level].bp_newreq.bpr_req);
1837 &path[level].bp_newreq);
1838 if (ret < 0) 1804 if (ret < 0)
1839 return ret; 1805 return ret;
1840 1806
@@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1846 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1812 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1847 &path[level].bp_ctxt); 1813 &path[level].bp_ctxt);
1848 if (ret < 0) { 1814 if (ret < 0) {
1849 nilfs_bmap_abort_update_v(&btree->bt_bmap, 1815 nilfs_dat_abort_update(dat,
1850 &path[level].bp_oldreq, 1816 &path[level].bp_oldreq.bpr_req,
1851 &path[level].bp_newreq); 1817 &path[level].bp_newreq.bpr_req);
1852 return ret; 1818 return ret;
1853 } 1819 }
1854 } 1820 }
@@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1858 1824
1859static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, 1825static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1860 struct nilfs_btree_path *path, 1826 struct nilfs_btree_path *path,
1861 int level) 1827 int level, struct inode *dat)
1862{ 1828{
1863 struct nilfs_btree_node *parent; 1829 struct nilfs_btree_node *parent;
1864 1830
1865 nilfs_bmap_commit_update_v(&btree->bt_bmap, 1831 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
1866 &path[level].bp_oldreq, 1832 &path[level].bp_newreq.bpr_req,
1867 &path[level].bp_newreq); 1833 btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
1868 1834
1869 if (buffer_nilfs_node(path[level].bp_bh)) { 1835 if (buffer_nilfs_node(path[level].bp_bh)) {
1870 nilfs_btnode_commit_change_key( 1836 nilfs_btnode_commit_change_key(
@@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1881 1847
1882static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, 1848static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1883 struct nilfs_btree_path *path, 1849 struct nilfs_btree_path *path,
1884 int level) 1850 int level, struct inode *dat)
1885{ 1851{
1886 nilfs_bmap_abort_update_v(&btree->bt_bmap, 1852 nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
1887 &path[level].bp_oldreq, 1853 &path[level].bp_newreq.bpr_req);
1888 &path[level].bp_newreq);
1889 if (buffer_nilfs_node(path[level].bp_bh)) 1854 if (buffer_nilfs_node(path[level].bp_bh))
1890 nilfs_btnode_abort_change_key( 1855 nilfs_btnode_abort_change_key(
1891 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1856 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1894 1859
1895static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, 1860static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1896 struct nilfs_btree_path *path, 1861 struct nilfs_btree_path *path,
1897 int minlevel, 1862 int minlevel, int *maxlevelp,
1898 int *maxlevelp) 1863 struct inode *dat)
1899{ 1864{
1900 int level, ret; 1865 int level, ret;
1901 1866
1902 level = minlevel; 1867 level = minlevel;
1903 if (!buffer_nilfs_volatile(path[level].bp_bh)) { 1868 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1904 ret = nilfs_btree_prepare_update_v(btree, path, level); 1869 ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
1905 if (ret < 0) 1870 if (ret < 0)
1906 return ret; 1871 return ret;
1907 } 1872 }
@@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1909 !buffer_dirty(path[level].bp_bh)) { 1874 !buffer_dirty(path[level].bp_bh)) {
1910 1875
1911 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); 1876 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1912 ret = nilfs_btree_prepare_update_v(btree, path, level); 1877 ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
1913 if (ret < 0) 1878 if (ret < 0)
1914 goto out; 1879 goto out;
1915 } 1880 }
@@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1921 /* error */ 1886 /* error */
1922 out: 1887 out:
1923 while (--level > minlevel) 1888 while (--level > minlevel)
1924 nilfs_btree_abort_update_v(btree, path, level); 1889 nilfs_btree_abort_update_v(btree, path, level, dat);
1925 if (!buffer_nilfs_volatile(path[level].bp_bh)) 1890 if (!buffer_nilfs_volatile(path[level].bp_bh))
1926 nilfs_btree_abort_update_v(btree, path, level); 1891 nilfs_btree_abort_update_v(btree, path, level, dat);
1927 return ret; 1892 return ret;
1928} 1893}
1929 1894
1930static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, 1895static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1931 struct nilfs_btree_path *path, 1896 struct nilfs_btree_path *path,
1932 int minlevel, 1897 int minlevel, int maxlevel,
1933 int maxlevel, 1898 struct buffer_head *bh,
1934 struct buffer_head *bh) 1899 struct inode *dat)
1935{ 1900{
1936 int level; 1901 int level;
1937 1902
1938 if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) 1903 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1939 nilfs_btree_commit_update_v(btree, path, minlevel); 1904 nilfs_btree_commit_update_v(btree, path, minlevel, dat);
1940 1905
1941 for (level = minlevel + 1; level <= maxlevel; level++) 1906 for (level = minlevel + 1; level <= maxlevel; level++)
1942 nilfs_btree_commit_update_v(btree, path, level); 1907 nilfs_btree_commit_update_v(btree, path, level, dat);
1943} 1908}
1944 1909
1945static int nilfs_btree_propagate_v(struct nilfs_btree *btree, 1910static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1946 struct nilfs_btree_path *path, 1911 struct nilfs_btree_path *path,
1947 int level, 1912 int level, struct buffer_head *bh)
1948 struct buffer_head *bh)
1949{ 1913{
1950 int maxlevel, ret; 1914 int maxlevel, ret;
1951 struct nilfs_btree_node *parent; 1915 struct nilfs_btree_node *parent;
1916 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1952 __u64 ptr; 1917 __u64 ptr;
1953 1918
1954 get_bh(bh); 1919 get_bh(bh);
1955 path[level].bp_bh = bh; 1920 path[level].bp_bh = bh;
1956 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); 1921 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
1922 dat);
1957 if (ret < 0) 1923 if (ret < 0)
1958 goto out; 1924 goto out;
1959 1925
@@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1961 parent = nilfs_btree_get_node(btree, path, level + 1); 1927 parent = nilfs_btree_get_node(btree, path, level + 1);
1962 ptr = nilfs_btree_node_get_ptr(btree, parent, 1928 ptr = nilfs_btree_node_get_ptr(btree, parent,
1963 path[level + 1].bp_index); 1929 path[level + 1].bp_index);
1964 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); 1930 ret = nilfs_dat_mark_dirty(dat, ptr);
1965 if (ret < 0) 1931 if (ret < 0)
1966 goto out; 1932 goto out;
1967 } 1933 }
1968 1934
1969 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); 1935 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
1970 1936
1971 out: 1937 out:
1972 brelse(path[level].bp_bh); 1938 brelse(path[level].bp_bh);
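The prepare/commit/abort split above is worth noting: nilfs_btree_prepare_propagate_v() prepares each level bottom-up and, on failure, unwinds every level it had already prepared, in reverse. A minimal user-space sketch of that unwind pattern, with purely illustrative names and level bounds:

#include <errno.h>

#define MAXLEVEL 8

static int  prepare_level(int lv) { return lv < MAXLEVEL ? 0 : -ENOMEM; }
static void abort_level(int lv)   { (void)lv; }

static int prepare_propagate(int minlevel, int *maxlevelp)
{
	int level, ret = 0;

	for (level = minlevel; level < MAXLEVEL; level++) {
		ret = prepare_level(level);
		if (ret < 0)
			goto out;               /* unwind below */
	}
	*maxlevelp = level - 1;
	return 0;
out:
	while (--level >= minlevel)
		abort_level(level);             /* reverse order; the failed level is skipped */
	return ret;
}

int main(void) { int max = 0; return prepare_propagate(1, &max); }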
@@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1986 WARN_ON(!buffer_dirty(bh)); 1952 WARN_ON(!buffer_dirty(bh));
1987 1953
1988 btree = (struct nilfs_btree *)bmap; 1954 btree = (struct nilfs_btree *)bmap;
1989 path = nilfs_btree_alloc_path(btree); 1955 path = nilfs_btree_alloc_path();
1990 if (path == NULL) 1956 if (path == NULL)
1991 return -ENOMEM; 1957 return -ENOMEM;
1992 nilfs_btree_init_path(btree, path); 1958 nilfs_btree_init_path(path);
1993 1959
1994 if (buffer_nilfs_node(bh)) { 1960 if (buffer_nilfs_node(bh)) {
1995 node = (struct nilfs_btree_node *)bh->b_data; 1961 node = (struct nilfs_btree_node *)bh->b_data;
1996 key = nilfs_btree_node_get_key(btree, node, 0); 1962 key = nilfs_btree_node_get_key(node, 0);
1997 level = nilfs_btree_node_get_level(btree, node); 1963 level = nilfs_btree_node_get_level(node);
1998 } else { 1964 } else {
1999 key = nilfs_bmap_data_get_key(bmap, bh); 1965 key = nilfs_bmap_data_get_key(bmap, bh);
2000 level = NILFS_BTREE_LEVEL_DATA; 1966 level = NILFS_BTREE_LEVEL_DATA;
@@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
2013 nilfs_btree_propagate_p(btree, path, level, bh); 1979 nilfs_btree_propagate_p(btree, path, level, bh);
2014 1980
2015 out: 1981 out:
2016 nilfs_btree_clear_path(btree, path); 1982 nilfs_btree_release_path(path);
2017 nilfs_btree_free_path(btree, path); 1983 nilfs_btree_free_path(path);
2018 1984
2019 return ret; 1985 return ret;
2020} 1986}
@@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
2022static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, 1988static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
2023 struct buffer_head *bh) 1989 struct buffer_head *bh)
2024{ 1990{
2025 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); 1991 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
2026} 1992}
2027 1993
2028static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, 1994static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
@@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
2037 2003
2038 get_bh(bh); 2004 get_bh(bh);
2039 node = (struct nilfs_btree_node *)bh->b_data; 2005 node = (struct nilfs_btree_node *)bh->b_data;
2040 key = nilfs_btree_node_get_key(btree, node, 0); 2006 key = nilfs_btree_node_get_key(node, 0);
2041 level = nilfs_btree_node_get_level(btree, node); 2007 level = nilfs_btree_node_get_level(node);
2042 list_for_each(head, &lists[level]) { 2008 list_for_each(head, &lists[level]) {
2043 cbh = list_entry(head, struct buffer_head, b_assoc_buffers); 2009 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
2044 cnode = (struct nilfs_btree_node *)cbh->b_data; 2010 cnode = (struct nilfs_btree_node *)cbh->b_data;
2045 ckey = nilfs_btree_node_get_key(btree, cnode, 0); 2011 ckey = nilfs_btree_node_get_key(cnode, 0);
2046 if (key < ckey) 2012 if (key < ckey)
2047 break; 2013 break;
2048 } 2014 }
@@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2120 nilfs_btree_node_set_ptr(btree, parent, 2086 nilfs_btree_node_set_ptr(btree, parent,
2121 path[level + 1].bp_index, blocknr); 2087 path[level + 1].bp_index, blocknr);
2122 2088
2123 key = nilfs_btree_node_get_key(btree, parent, 2089 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2124 path[level + 1].bp_index);
2125 /* on-disk format */ 2090 /* on-disk format */
2126 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2091 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2127 binfo->bi_dat.bi_level = level; 2092 binfo->bi_dat.bi_level = level;
@@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2137 union nilfs_binfo *binfo) 2102 union nilfs_binfo *binfo)
2138{ 2103{
2139 struct nilfs_btree_node *parent; 2104 struct nilfs_btree_node *parent;
2105 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
2140 __u64 key; 2106 __u64 key;
2141 __u64 ptr; 2107 __u64 ptr;
2142 union nilfs_bmap_ptr_req req; 2108 union nilfs_bmap_ptr_req req;
@@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2146 ptr = nilfs_btree_node_get_ptr(btree, parent, 2112 ptr = nilfs_btree_node_get_ptr(btree, parent,
2147 path[level + 1].bp_index); 2113 path[level + 1].bp_index);
2148 req.bpr_ptr = ptr; 2114 req.bpr_ptr = ptr;
2149 ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); 2115 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
2150 if (unlikely(ret < 0)) 2116 if (ret < 0)
2151 return ret; 2117 return ret;
2118 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
2152 2119
2153 key = nilfs_btree_node_get_key(btree, parent, 2120 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2154 path[level + 1].bp_index);
2155 /* on-disk format */ 2121 /* on-disk format */
2156 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 2122 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2157 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2123 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2171 int level, ret; 2137 int level, ret;
2172 2138
2173 btree = (struct nilfs_btree *)bmap; 2139 btree = (struct nilfs_btree *)bmap;
2174 path = nilfs_btree_alloc_path(btree); 2140 path = nilfs_btree_alloc_path();
2175 if (path == NULL) 2141 if (path == NULL)
2176 return -ENOMEM; 2142 return -ENOMEM;
2177 nilfs_btree_init_path(btree, path); 2143 nilfs_btree_init_path(path);
2178 2144
2179 if (buffer_nilfs_node(*bh)) { 2145 if (buffer_nilfs_node(*bh)) {
2180 node = (struct nilfs_btree_node *)(*bh)->b_data; 2146 node = (struct nilfs_btree_node *)(*bh)->b_data;
2181 key = nilfs_btree_node_get_key(btree, node, 0); 2147 key = nilfs_btree_node_get_key(node, 0);
2182 level = nilfs_btree_node_get_level(btree, node); 2148 level = nilfs_btree_node_get_level(node);
2183 } else { 2149 } else {
2184 key = nilfs_bmap_data_get_key(bmap, *bh); 2150 key = nilfs_bmap_data_get_key(bmap, *bh);
2185 level = NILFS_BTREE_LEVEL_DATA; 2151 level = NILFS_BTREE_LEVEL_DATA;
@@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2196 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2162 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2197 2163
2198 out: 2164 out:
2199 nilfs_btree_clear_path(btree, path); 2165 nilfs_btree_release_path(path);
2200 nilfs_btree_free_path(btree, path); 2166 nilfs_btree_free_path(path);
2201 2167
2202 return ret; 2168 return ret;
2203} 2169}
@@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2207 sector_t blocknr, 2173 sector_t blocknr,
2208 union nilfs_binfo *binfo) 2174 union nilfs_binfo *binfo)
2209{ 2175{
2210 struct nilfs_btree *btree;
2211 struct nilfs_btree_node *node; 2176 struct nilfs_btree_node *node;
2212 __u64 key; 2177 __u64 key;
2213 int ret; 2178 int ret;
2214 2179
2215 btree = (struct nilfs_btree *)bmap; 2180 ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
2216 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); 2181 blocknr);
2217 if (ret < 0) 2182 if (ret < 0)
2218 return ret; 2183 return ret;
2219 2184
2220 if (buffer_nilfs_node(*bh)) { 2185 if (buffer_nilfs_node(*bh)) {
2221 node = (struct nilfs_btree_node *)(*bh)->b_data; 2186 node = (struct nilfs_btree_node *)(*bh)->b_data;
2222 key = nilfs_btree_node_get_key(btree, node, 0); 2187 key = nilfs_btree_node_get_key(node, 0);
2223 } else 2188 } else
2224 key = nilfs_bmap_data_get_key(bmap, *bh); 2189 key = nilfs_bmap_data_get_key(bmap, *bh);
2225 2190
@@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2239 int ret; 2204 int ret;
2240 2205
2241 btree = (struct nilfs_btree *)bmap; 2206 btree = (struct nilfs_btree *)bmap;
2242 path = nilfs_btree_alloc_path(btree); 2207 path = nilfs_btree_alloc_path();
2243 if (path == NULL) 2208 if (path == NULL)
2244 return -ENOMEM; 2209 return -ENOMEM;
2245 nilfs_btree_init_path(btree, path); 2210 nilfs_btree_init_path(path);
2246 2211
2247 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2212 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2248 if (ret < 0) { 2213 if (ret < 0) {
@@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2262 nilfs_bmap_set_dirty(&btree->bt_bmap); 2227 nilfs_bmap_set_dirty(&btree->bt_bmap);
2263 2228
2264 out: 2229 out:
2265 nilfs_btree_clear_path(btree, path); 2230 nilfs_btree_release_path(path);
2266 nilfs_btree_free_path(btree, path); 2231 nilfs_btree_free_path(path);
2267 return ret; 2232 return ret;
2268} 2233}
2269 2234
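Across these btree.c hunks the refactor follows one pattern: the DAT inode is resolved once at the top of each operation (NULL when the bmap does not use virtual block numbers) and passed explicitly to the prepare/commit/abort helpers, instead of each helper re-deriving it from the bmap. A minimal user-space sketch of that context-threading style, with hypothetical names:

#include <stddef.h>
#include <stdio.h>

struct dat { const char *name; };           /* stands in for the DAT inode */

static struct dat global_dat = { "dat" };

/* helpers take the context explicitly instead of re-deriving it */
static int  prepare_end_ptr(struct dat *dat) { (void)dat; return 0; }
static void commit_end_ptr(struct dat *dat)
{
	if (dat)
		printf("end virtual block via %s\n", dat->name);
}

static int delete_op(int use_vbn)
{
	/* resolve the optional context once, at the top of the operation */
	struct dat *dat = use_vbn ? &global_dat : NULL;
	int ret = prepare_end_ptr(dat);

	if (ret < 0)
		return ret;
	commit_end_ptr(dat);
	return 0;
}

int main(void) { return delete_op(1); }

Threading the context down keeps every error path using the same pointer the prepare step saw, which is what makes the abort calls in the hunks above safe.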
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index aec942cf79e3..1c6cfb59128d 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
815 void *kaddr; 815 void *kaddr;
816 int ret; 816 int ret;
817 817
818 if (cno == 0) 818 /* CP number is invalid if it's zero or larger than the
819 return -ENOENT; /* checkpoint number 0 is invalid */ 819 largest existing one. */
820 if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
821 return -ENOENT;
820 down_read(&NILFS_MDT(cpfile)->mi_sem); 822 down_read(&NILFS_MDT(cpfile)->mi_sem);
821 823
822 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); 824 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
824 goto out; 826 goto out;
825 kaddr = kmap_atomic(bh->b_page, KM_USER0); 827 kaddr = kmap_atomic(bh->b_page, KM_USER0);
826 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); 828 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
827 ret = nilfs_checkpoint_snapshot(cp); 829 if (nilfs_checkpoint_invalid(cp))
830 ret = -ENOENT;
831 else
832 ret = nilfs_checkpoint_snapshot(cp);
828 kunmap_atomic(kaddr, KM_USER0); 833 kunmap_atomic(kaddr, KM_USER0);
829 brelse(bh); 834 brelse(bh);
830 835
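The cpfile hunk adds two cheap validity checks before any block I/O: checkpoint numbers at or beyond the next one to be created are rejected up front, and an allocated but invalidated checkpoint entry also yields -ENOENT. A self-contained sketch of the same check order, assuming a flat table stands in for the checkpoint blocks:

#include <errno.h>
#include <stdint.h>

struct cp { int invalid; int snapshot; };

static int is_snapshot(const struct cp *table, uint64_t next_cno, uint64_t cno)
{
	if (cno == 0 || cno >= next_cno)  /* zero and not-yet-created numbers */
		return -ENOENT;
	if (table[cno].invalid)           /* allocated block, but a dead entry */
		return -ENOENT;
	return table[cno].snapshot;
}

int main(void)
{
	const struct cp table[4] = { {1, 0}, {0, 0}, {0, 1}, {0, 0} };

	return is_snapshot(table, 3, 2) == 1 ? 0 : 1;   /* cno 2 is a snapshot */
}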
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 788a45950197..debea896e701 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -27,8 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h> 28#include <linux/nilfs2_fs.h>
29 29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32 30
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, 31int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **, 32 struct nilfs_checkpoint **,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8927ca27e6f7..1ff8e15bd36b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
109 nilfs_palloc_commit_free_entry(dat, req); 109 nilfs_palloc_commit_free_entry(dat, req);
110} 110}
111 111
112void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
113{
114 nilfs_dat_abort_entry(dat, req);
115 nilfs_palloc_abort_free_entry(dat, req);
116}
117
118int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) 112int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
119{ 113{
120 int ret; 114 int ret;
@@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
140 nilfs_dat_commit_entry(dat, req); 134 nilfs_dat_commit_entry(dat, req);
141} 135}
142 136
143void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
144{
145 nilfs_dat_abort_entry(dat, req);
146}
147
148int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) 137int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
149{ 138{
150 struct nilfs_dat_entry *entry; 139 struct nilfs_dat_entry *entry;
@@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
222 nilfs_dat_abort_entry(dat, req); 211 nilfs_dat_abort_entry(dat, req);
223} 212}
224 213
214int nilfs_dat_prepare_update(struct inode *dat,
215 struct nilfs_palloc_req *oldreq,
216 struct nilfs_palloc_req *newreq)
217{
218 int ret;
219
220 ret = nilfs_dat_prepare_end(dat, oldreq);
221 if (!ret) {
222 ret = nilfs_dat_prepare_alloc(dat, newreq);
223 if (ret < 0)
224 nilfs_dat_abort_end(dat, oldreq);
225 }
226 return ret;
227}
228
229void nilfs_dat_commit_update(struct inode *dat,
230 struct nilfs_palloc_req *oldreq,
231 struct nilfs_palloc_req *newreq, int dead)
232{
233 nilfs_dat_commit_end(dat, oldreq, dead);
234 nilfs_dat_commit_alloc(dat, newreq);
235}
236
237void nilfs_dat_abort_update(struct inode *dat,
238 struct nilfs_palloc_req *oldreq,
239 struct nilfs_palloc_req *newreq)
240{
241 nilfs_dat_abort_end(dat, oldreq);
242 nilfs_dat_abort_alloc(dat, newreq);
243}
244
225/** 245/**
226 * nilfs_dat_mark_dirty - 246 * nilfs_dat_mark_dirty -
227 * @dat: DAT file inode 247 * @dat: DAT file inode
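nilfs_dat_prepare_update() composes two preparations into one transaction-like step: if ending the old entry succeeds but allocating the new one fails, the first half is rolled back, so callers only ever see both halves prepared or neither. A minimal user-space sketch of that compose-with-rollback pattern; the step functions are hypothetical:

#include <errno.h>
#include <stdio.h>

static int  prepare_end(int req)   { printf("prepare end %d\n", req); return 0; }
static int  prepare_alloc(int req) { return req >= 0 ? 0 : -ENOMEM; }
static void abort_end(int req)     { printf("abort end %d\n", req); }
static void commit_end(int req)    { printf("commit end %d\n", req); }
static void commit_alloc(int req)  { printf("commit alloc %d\n", req); }

static int prepare_update(int oldreq, int newreq)
{
	int ret = prepare_end(oldreq);

	if (!ret) {
		ret = prepare_alloc(newreq);
		if (ret < 0)
			abort_end(oldreq);  /* roll back the half that succeeded */
	}
	return ret;
}

static void commit_update(int oldreq, int newreq)
{
	commit_end(oldreq);                 /* both halves, or neither */
	commit_alloc(newreq);
}

int main(void)
{
	if (prepare_update(1, 2) == 0)
		commit_update(1, 2);
	return 0;
}

commit_update() and abort_update() then apply both halves together, which is why the btree and direct callers above could drop their own pairing logic.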
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d328b81eead4..406070d3ff49 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -27,7 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31 30
32struct nilfs_palloc_req; 31struct nilfs_palloc_req;
33 32
@@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); 38int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, 39void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t); 40 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); 41int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); 42void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); 43void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
44int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
45 struct nilfs_palloc_req *);
46void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
47 struct nilfs_palloc_req *, int);
48void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
49 struct nilfs_palloc_req *);
46 50
47int nilfs_dat_mark_dirty(struct inode *, __u64); 51int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t); 52int nilfs_dat_freev(struct inode *, __u64 *, size_t);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04cf071..e097099bfc8f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
697 return 0; 697 return 0;
698} 698}
699 699
700struct file_operations nilfs_dir_operations = { 700const struct file_operations nilfs_dir_operations = {
701 .llseek = generic_file_llseek, 701 .llseek = generic_file_llseek,
702 .read = generic_read_dir, 702 .read = generic_read_dir,
703 .readdir = nilfs_readdir, 703 .readdir = nilfs_readdir,
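Marking nilfs_dir_operations const is part of a tree-wide cleanup: an operations table whose members are fixed at build time can be placed in read-only data, and the compiler rejects accidental writes. A small sketch of the idiom:

#include <stdio.h>

struct file_ops {
	int (*open)(const char *name);
};

static int my_open(const char *name) { printf("open %s\n", name); return 0; }

/* const: the table can live in .rodata, and assignments to it no longer compile */
static const struct file_ops dir_ops = {
	.open = my_open,
};

int main(void) { return dir_ops.open("."); }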
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 342d9765df8d..d369ac718277 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
125 direct->d_bmap.b_last_allocated_ptr = ptr; 125 direct->d_bmap.b_last_allocated_ptr = ptr;
126} 126}
127 127
128static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
129 __u64 key,
130 union nilfs_bmap_ptr_req *req,
131 struct nilfs_bmap_stats *stats)
132{
133 int ret;
134
135 if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
136 req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
137 ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
138 if (ret < 0)
139 return ret;
140
141 stats->bs_nblocks = 1;
142 return 0;
143}
144
145static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
146 union nilfs_bmap_ptr_req *req,
147 __u64 key, __u64 ptr)
148{
149 struct buffer_head *bh;
150
151 /* ptr must be a pointer to a buffer head. */
152 bh = (struct buffer_head *)((unsigned long)ptr);
153 set_buffer_nilfs_volatile(bh);
154
155 nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
156 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
157
158 if (!nilfs_bmap_dirty(&direct->d_bmap))
159 nilfs_bmap_set_dirty(&direct->d_bmap);
160
161 if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
162 nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
163}
164
165static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 128static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
166{ 129{
167 struct nilfs_direct *direct; 130 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
168 union nilfs_bmap_ptr_req req; 131 union nilfs_bmap_ptr_req req;
169 struct nilfs_bmap_stats stats; 132 struct inode *dat = NULL;
133 struct buffer_head *bh;
170 int ret; 134 int ret;
171 135
172 direct = (struct nilfs_direct *)bmap;
173 if (key > NILFS_DIRECT_KEY_MAX) 136 if (key > NILFS_DIRECT_KEY_MAX)
174 return -ENOENT; 137 return -ENOENT;
175 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) 138 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
176 return -EEXIST; 139 return -EEXIST;
177 140
178 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); 141 if (NILFS_BMAP_USE_VBN(bmap)) {
179 if (ret < 0) 142 req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
180 return ret; 143 dat = nilfs_bmap_get_dat(bmap);
181 nilfs_direct_commit_insert(direct, &req, key, ptr); 144 }
182 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 145 ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
146 if (!ret) {
147 /* ptr must be a pointer to a buffer head. */
148 bh = (struct buffer_head *)((unsigned long)ptr);
149 set_buffer_nilfs_volatile(bh);
183 150
184 return 0; 151 nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
185} 152 nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
186 153
187static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, 154 if (!nilfs_bmap_dirty(bmap))
188 union nilfs_bmap_ptr_req *req, 155 nilfs_bmap_set_dirty(bmap);
189 __u64 key,
190 struct nilfs_bmap_stats *stats)
191{
192 int ret;
193 156
194 req->bpr_ptr = nilfs_direct_get_ptr(direct, key); 157 if (NILFS_BMAP_USE_VBN(bmap))
195 ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); 158 nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
196 if (!ret)
197 stats->bs_nblocks = 1;
198 return ret;
199}
200 159
201static void nilfs_direct_commit_delete(struct nilfs_direct *direct, 160 nilfs_bmap_add_blocks(bmap, 1);
202 union nilfs_bmap_ptr_req *req, 161 }
203 __u64 key) 162 return ret;
204{
205 nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
206 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
207} 163}
208 164
209static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) 165static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
210{ 166{
211 struct nilfs_direct *direct; 167 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
212 union nilfs_bmap_ptr_req req; 168 union nilfs_bmap_ptr_req req;
213 struct nilfs_bmap_stats stats; 169 struct inode *dat;
214 int ret; 170 int ret;
215 171
216 direct = (struct nilfs_direct *)bmap; 172 if (key > NILFS_DIRECT_KEY_MAX ||
217 if ((key > NILFS_DIRECT_KEY_MAX) ||
218 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) 173 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
219 return -ENOENT; 174 return -ENOENT;
220 175
221 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); 176 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
222 if (ret < 0) 177 req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
223 return ret;
224 nilfs_direct_commit_delete(direct, &req, key);
225 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
226 178
227 return 0; 179 ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
180 if (!ret) {
181 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
182 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
183 nilfs_bmap_sub_blocks(bmap, 1);
184 }
185 return ret;
228} 186}
229 187
230static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 188static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
310 return 0; 268 return 0;
311} 269}
312 270
313static int nilfs_direct_propagate_v(struct nilfs_direct *direct, 271static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
314 struct buffer_head *bh) 272 struct buffer_head *bh)
315{ 273{
316 union nilfs_bmap_ptr_req oldreq, newreq; 274 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
275 struct nilfs_palloc_req oldreq, newreq;
276 struct inode *dat;
317 __u64 key; 277 __u64 key;
318 __u64 ptr; 278 __u64 ptr;
319 int ret; 279 int ret;
320 280
321 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); 281 if (!NILFS_BMAP_USE_VBN(bmap))
282 return 0;
283
284 dat = nilfs_bmap_get_dat(bmap);
285 key = nilfs_bmap_data_get_key(bmap, bh);
322 ptr = nilfs_direct_get_ptr(direct, key); 286 ptr = nilfs_direct_get_ptr(direct, key);
323 if (!buffer_nilfs_volatile(bh)) { 287 if (!buffer_nilfs_volatile(bh)) {
324 oldreq.bpr_ptr = ptr; 288 oldreq.pr_entry_nr = ptr;
325 newreq.bpr_ptr = ptr; 289 newreq.pr_entry_nr = ptr;
326 ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, 290 ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
327 &newreq);
328 if (ret < 0) 291 if (ret < 0)
329 return ret; 292 return ret;
330 nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); 293 nilfs_dat_commit_update(dat, &oldreq, &newreq,
294 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
331 set_buffer_nilfs_volatile(bh); 295 set_buffer_nilfs_volatile(bh);
332 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); 296 nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
333 } else 297 } else
334 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); 298 ret = nilfs_dat_mark_dirty(dat, ptr);
335 299
336 return ret; 300 return ret;
337} 301}
338 302
339static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
340 struct buffer_head *bh)
341{
342 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
343
344 return NILFS_BMAP_USE_VBN(bmap) ?
345 nilfs_direct_propagate_v(direct, bh) : 0;
346}
347
348static int nilfs_direct_assign_v(struct nilfs_direct *direct, 303static int nilfs_direct_assign_v(struct nilfs_direct *direct,
349 __u64 key, __u64 ptr, 304 __u64 key, __u64 ptr,
350 struct buffer_head **bh, 305 struct buffer_head **bh,
351 sector_t blocknr, 306 sector_t blocknr,
352 union nilfs_binfo *binfo) 307 union nilfs_binfo *binfo)
353{ 308{
309 struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
354 union nilfs_bmap_ptr_req req; 310 union nilfs_bmap_ptr_req req;
355 int ret; 311 int ret;
356 312
357 req.bpr_ptr = ptr; 313 req.bpr_ptr = ptr;
358 ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); 314 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
359 if (unlikely(ret < 0)) 315 if (!ret) {
360 return ret; 316 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
361 317 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
362 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 318 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
363 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 319 }
364 320 return ret;
365 return 0;
366} 321}
367 322
368static int nilfs_direct_assign_p(struct nilfs_direct *direct, 323static int nilfs_direct_assign_p(struct nilfs_direct *direct,
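In direct.c the separate prepare_insert/commit_insert and prepare_delete/commit_delete helper pairs are folded into their only callers, leaving one function per operation with a single success branch. A compact sketch of the resulting shape, with placeholder helpers:

#include <errno.h>

#define KEY_MAX 7

static int  prepare_ptr(int key) { return key <= KEY_MAX ? 0 : -ENOENT; }
static void commit_ptr(int key)  { (void)key; }

static int direct_insert(int key)
{
	int ret = prepare_ptr(key);

	if (!ret) {
		commit_ptr(key);    /* commit and block accounting in one place */
	}
	return ret;
}

int main(void) { return direct_insert(3); }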
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 6bd84a0d8238..30292df443ce 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -117,7 +117,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
117 return 0; 117 return 0;
118} 118}
119 119
120struct vm_operations_struct nilfs_file_vm_ops = { 120static const struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault, 121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite, 122 .page_mkwrite = nilfs_page_mkwrite,
123}; 123};
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
134 * We have mostly NULL's here: the current defaults are ok for 134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem. 135 * the nilfs filesystem.
136 */ 136 */
137struct file_operations nilfs_file_operations = { 137const struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek, 138 .llseek = generic_file_llseek,
139 .read = do_sync_read, 139 .read = do_sync_read,
140 .write = do_sync_write, 140 .write = do_sync_write,
@@ -151,7 +151,7 @@ struct file_operations nilfs_file_operations = {
151 .splice_read = generic_file_splice_read, 151 .splice_read = generic_file_splice_read,
152}; 152};
153 153
154struct inode_operations nilfs_file_inode_operations = { 154const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1b3c2bb20da9..e6de0a27ab5d 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,7 +52,7 @@
52#include "dat.h" 52#include "dat.h"
53#include "ifile.h" 53#include "ifile.h"
54 54
55static struct address_space_operations def_gcinode_aops = { 55static const struct address_space_operations def_gcinode_aops = {
56 .sync_page = block_sync_page, 56 .sync_page = block_sync_page,
57}; 57};
58 58
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 5d30a35679b5..ecc3ba76db47 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -31,7 +31,6 @@
31#include "mdt.h" 31#include "mdt.h"
32#include "alloc.h" 32#include "alloc.h"
33 33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35 34
36static inline struct nilfs_inode * 35static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) 36nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index fe9d8f2a13f8..5040220c3732 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -238,7 +238,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
238 return size; 238 return size;
239} 239}
240 240
241struct address_space_operations nilfs_aops = { 241const struct address_space_operations nilfs_aops = {
242 .writepage = nilfs_writepage, 242 .writepage = nilfs_writepage,
243 .readpage = nilfs_readpage, 243 .readpage = nilfs_readpage,
244 .sync_page = block_sync_page, 244 .sync_page = block_sync_page,
@@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode,
400 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 400 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
401 0 : le32_to_cpu(raw_inode->i_dir_acl); 401 0 : le32_to_cpu(raw_inode->i_dir_acl);
402#endif 402#endif
403 ii->i_dir_start_lookup = 0;
403 ii->i_cno = 0; 404 ii->i_cno = 0;
404 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 405 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
405 406
@@ -430,7 +431,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
430 431
431 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 432 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
432 433
433 if (nilfs_read_inode_common(inode, raw_inode)) 434 err = nilfs_read_inode_common(inode, raw_inode);
435 if (err)
434 goto failed_unmap; 436 goto failed_unmap;
435 437
436 if (S_ISREG(inode->i_mode)) { 438 if (S_ISREG(inode->i_mode)) {
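The __nilfs_read_inode() change captures the helper's return value before branching, so the failure path propagates the error nilfs_read_inode_common() actually reported rather than whatever err happened to hold. A tiny sketch of the idiom; the error values are illustrative:

#include <errno.h>

static int read_inode_common(int valid) { return valid ? 0 : -EIO; }

static int read_inode(int valid)
{
	int err = read_inode_common(valid);

	if (err)
		return err;         /* the helper's own code, not a stale one */
	return 0;
}

int main(void) { return read_inode(1); }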
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6ea5f872e2de..6572ea4bc4df 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
442 const char *msg; 442 const char *msg;
443 int ret; 443 int ret;
444 444
445 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
446 if (ret < 0) {
447 msg = "cannot read source blocks";
448 goto failed;
449 }
450
451 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); 445 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
452 if (ret < 0) { 446 if (ret < 0) {
453 /* 447 /*
@@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
548 } 542 }
549 } 543 }
550 544
551 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 545 /*
546 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
547 * which operates on an inode list without blocking.
548 * To protect the list from concurrent operations,
549 * nilfs_ioctl_move_blocks should be an atomic operation.
550 */
551 if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
552 ret = -EBUSY;
553 goto out_free;
554 }
555
556 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
557 if (ret < 0)
558 printk(KERN_ERR "NILFS: GC failed during preparation: "
559 "cannot read source blocks: err=%d\n", ret);
560 else
561 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
562
563 clear_nilfs_gc_running(nilfs);
552 564
553 out_free: 565 out_free:
554 while (--n >= 0) 566 while (--n >= 0)
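The ioctl path serializes garbage collection with test_and_set_bit(): the first caller atomically claims the GC-running flag, later callers fail fast with -EBUSY, and the flag is cleared once the pass finishes. A user-space sketch of the same guard using C11 atomics in place of the kernel bitops:

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag gc_running = ATOMIC_FLAG_INIT;

static int run_gc(void)
{
	if (atomic_flag_test_and_set(&gc_running))
		return -EBUSY;              /* another GC pass is active */

	printf("moving blocks, cleaning segments...\n");

	atomic_flag_clear(&gc_running);     /* clear_nilfs_gc_running() analogue */
	return 0;
}

int main(void) { return run_gc(); }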
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2dfd47714ae5..f6326112d647 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
103 goto failed_unlock; 103 goto failed_unlock;
104 104
105 err = -EEXIST; 105 err = -EEXIST;
106 if (buffer_uptodate(bh) || buffer_mapped(bh)) 106 if (buffer_uptodate(bh))
107 goto failed_bh; 107 goto failed_bh;
108#if 0 108
109 /* The uptodate flag is not protected by the page lock, but
110 the mapped flag is. Thus, we don't have to wait the buffer. */
111 wait_on_buffer(bh); 109 wait_on_buffer(bh);
112 if (buffer_uptodate(bh)) 110 if (buffer_uptodate(bh))
113 goto failed_bh; 111 goto failed_bh;
114#endif
115 112
116 bh->b_bdev = nilfs->ns_bdev; 113 bh->b_bdev = nilfs->ns_bdev;
117 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 114 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
@@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
139 int mode, struct buffer_head **out_bh) 136 int mode, struct buffer_head **out_bh)
140{ 137{
141 struct buffer_head *bh; 138 struct buffer_head *bh;
142 unsigned long blknum = 0; 139 __u64 blknum = 0;
143 int ret = -ENOMEM; 140 int ret = -ENOMEM;
144 141
145 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); 142 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
162 unlock_buffer(bh); 159 unlock_buffer(bh);
163 goto out; 160 goto out;
164 } 161 }
165 if (!buffer_mapped(bh)) { /* unused buffer */ 162
166 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, 163 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
167 &blknum); 164 if (unlikely(ret)) {
168 if (unlikely(ret)) { 165 unlock_buffer(bh);
169 unlock_buffer(bh); 166 goto failed_bh;
170 goto failed_bh;
171 }
172 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
173 bh->b_blocknr = blknum;
174 set_buffer_mapped(bh);
175 } 167 }
168 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
169 bh->b_blocknr = (sector_t)blknum;
170 set_buffer_mapped(bh);
176 171
177 bh->b_end_io = end_buffer_read_sync; 172 bh->b_end_io = end_buffer_read_sync;
178 get_bh(bh); 173 get_bh(bh);
@@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
402 struct inode *inode = container_of(page->mapping, 397 struct inode *inode = container_of(page->mapping,
403 struct inode, i_data); 398 struct inode, i_data);
404 struct super_block *sb = inode->i_sb; 399 struct super_block *sb = inode->i_sb;
400 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
405 struct nilfs_sb_info *writer = NULL; 401 struct nilfs_sb_info *writer = NULL;
406 int err = 0; 402 int err = 0;
407 403
@@ -411,9 +407,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
411 if (page->mapping->assoc_mapping) 407 if (page->mapping->assoc_mapping)
412 return 0; /* Do not request flush for shadow page cache */ 408 return 0; /* Do not request flush for shadow page cache */
413 if (!sb) { 409 if (!sb) {
414 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); 410 down_read(&nilfs->ns_writer_sem);
411 writer = nilfs->ns_writer;
415 if (!writer) { 412 if (!writer) {
416 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); 413 up_read(&nilfs->ns_writer_sem);
417 return -EROFS; 414 return -EROFS;
418 } 415 }
419 sb = writer->s_super; 416 sb = writer->s_super;
@@ -425,18 +422,18 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
425 nilfs_flush_segment(sb, inode->i_ino); 422 nilfs_flush_segment(sb, inode->i_ino);
426 423
427 if (writer) 424 if (writer)
428 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); 425 up_read(&nilfs->ns_writer_sem);
429 return err; 426 return err;
430} 427}
431 428
432 429
433static struct address_space_operations def_mdt_aops = { 430static const struct address_space_operations def_mdt_aops = {
434 .writepage = nilfs_mdt_write_page, 431 .writepage = nilfs_mdt_write_page,
435 .sync_page = block_sync_page, 432 .sync_page = block_sync_page,
436}; 433};
437 434
438static struct inode_operations def_mdt_iops; 435static const struct inode_operations def_mdt_iops;
439static struct file_operations def_mdt_fops; 436static const struct file_operations def_mdt_fops;
440 437
441/* 438/*
442 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, 439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
@@ -516,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
516} 513}
517 514
518struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 515struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
519 ino_t ino, gfp_t gfp_mask) 516 ino_t ino)
520{ 517{
521 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); 518 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
519 NILFS_MDT_GFP);
522 520
523 if (!inode) 521 if (!inode)
524 return NULL; 522 return NULL;
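nilfs_mdt_write_page() now reads the writer pointer under the ns_writer_sem read lock instead of the old get/put accessor pair, holding the lock across every use of the pointer. A sketch of that shared-lock access pattern, with pthread rwlocks standing in for the kernel rw_semaphore:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t writer_sem = PTHREAD_RWLOCK_INITIALIZER;
static const char *writer;              /* protected by writer_sem */

static int flush_page(void)
{
	pthread_rwlock_rdlock(&writer_sem);
	if (!writer) {
		pthread_rwlock_unlock(&writer_sem);
		return -EROFS;          /* no one can write us back */
	}
	printf("flush segment via %s\n", writer);
	pthread_rwlock_unlock(&writer_sem); /* held across every use of writer */
	return 0;
}

int main(void)
{
	writer = "sbi";                 /* set before any reader runs */
	return flush_page();
}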
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index df683e0bca6a..431599733c9b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); 74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *); 75int nilfs_mdt_fetch_dirty(struct inode *);
76 76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, 77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, 78struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t); 79 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *); 80void nilfs_mdt_destroy(struct inode *);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index df70dadb336f..ed02e886fa79 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -448,7 +448,7 @@ out:
448 return err; 448 return err;
449} 449}
450 450
451struct inode_operations nilfs_dir_inode_operations = { 451const struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create, 452 .create = nilfs_create,
453 .lookup = nilfs_lookup, 453 .lookup = nilfs_lookup,
454 .link = nilfs_link, 454 .link = nilfs_link,
@@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_operations = {
462 .permission = nilfs_permission, 462 .permission = nilfs_permission,
463}; 463};
464 464
465struct inode_operations nilfs_special_inode_operations = { 465const struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr, 466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission, 467 .permission = nilfs_permission,
468}; 468};
469 469
470struct inode_operations nilfs_symlink_inode_operations = { 470const struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink, 471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light, 472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link, 473 .put_link = page_put_link,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 724c63766e82..4da6f67e9a91 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,13 +294,13 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
294/* 294/*
295 * Inodes and files operations 295 * Inodes and files operations
296 */ 296 */
297extern struct file_operations nilfs_dir_operations; 297extern const struct file_operations nilfs_dir_operations;
298extern struct inode_operations nilfs_file_inode_operations; 298extern const struct inode_operations nilfs_file_inode_operations;
299extern struct file_operations nilfs_file_operations; 299extern const struct file_operations nilfs_file_operations;
300extern struct address_space_operations nilfs_aops; 300extern const struct address_space_operations nilfs_aops;
301extern struct inode_operations nilfs_dir_inode_operations; 301extern const struct inode_operations nilfs_dir_inode_operations;
302extern struct inode_operations nilfs_special_inode_operations; 302extern const struct inode_operations nilfs_special_inode_operations;
303extern struct inode_operations nilfs_symlink_inode_operations; 303extern const struct inode_operations nilfs_symlink_inode_operations;
304 304
305/* 305/*
306 * filesystem type 306 * filesystem type
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d80cc71be749..6dc83591d118 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
552 printk(KERN_WARNING 552 printk(KERN_WARNING
553 "NILFS warning: error recovering data block " 553 "NILFS warning: error recovering data block "
554 "(err=%d, ino=%lu, block-offset=%llu)\n", 554 "(err=%d, ino=%lu, block-offset=%llu)\n",
555 err, rb->ino, (unsigned long long)rb->blkoff); 555 err, (unsigned long)rb->ino,
556 (unsigned long long)rb->blkoff);
556 if (!err2) 557 if (!err2)
557 err2 = err; 558 err2 = err;
558 next: 559 next:
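The recovery.c fix is a printk portability detail: ino_t is not the same width on every architecture, so the value is cast to match the %lu conversion. A minimal illustration:

#include <stdio.h>
#include <sys/types.h>

static void report(ino_t ino, unsigned long long blkoff)
{
	/* ino_t's width varies; the cast keeps it matched to %lu everywhere */
	printf("error recovering data block (ino=%lu, block-offset=%llu)\n",
	       (unsigned long)ino, blkoff);
}

int main(void) { report(42, 7); return 0; }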
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 9e3fe17bb96b..e6d9e37fa241 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316{ 316{
317 struct bio *bio; 317 struct bio *bio;
318 318
319 bio = bio_alloc(GFP_NOWAIT, nr_vecs); 319 bio = bio_alloc(GFP_NOIO, nr_vecs);
320 if (bio == NULL) { 320 if (bio == NULL) {
321 while (!bio && (nr_vecs >>= 1)) 321 while (!bio && (nr_vecs >>= 1))
322 bio = bio_alloc(GFP_NOWAIT, nr_vecs); 322 bio = bio_alloc(GFP_NOIO, nr_vecs);
323 } 323 }
324 if (likely(bio)) { 324 if (likely(bio)) {
325 bio->bi_bdev = sb->s_bdev; 325 bio->bi_bdev = sb->s_bdev;
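The segbuf.c change swaps GFP_NOWAIT for GFP_NOIO, so the first attempt may now sleep but cannot recurse into the filesystem, and keeps the fallback loop that halves the vector count until an allocation succeeds or the count reaches zero. A user-space sketch of that size-degrading retry, with malloc() standing in for bio_alloc():

#include <stdlib.h>

static void *alloc_seg_buf(size_t nr_vecs)
{
	void *p = malloc(nr_vecs * 64);

	while (!p && (nr_vecs >>= 1))   /* halve and retry under memory pressure */
		p = malloc(nr_vecs * 64);
	return p;                       /* NULL only once nr_vecs reached zero */
}

int main(void)
{
	void *p = alloc_seg_buf(128);

	free(p);
	return p ? 0 : 1;
}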
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 51ff3d0a4ee2..683df89dbae5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2501 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2501 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2502 nilfs_discontinued(nilfs)) { 2502 nilfs_discontinued(nilfs)) {
2503 down_write(&nilfs->ns_sem); 2503 down_write(&nilfs->ns_sem);
2504 req->sb_err = nilfs_commit_super(sbi, 0); 2504 req->sb_err = nilfs_commit_super(sbi,
2505 nilfs_altsb_need_update(nilfs));
2505 up_write(&nilfs->ns_sem); 2506 up_write(&nilfs->ns_sem);
2506 } 2507 }
2507 } 2508 }
@@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg)
2689 } else { 2690 } else {
2690 DEFINE_WAIT(wait); 2691 DEFINE_WAIT(wait);
2691 int should_sleep = 1; 2692 int should_sleep = 1;
2693 struct the_nilfs *nilfs;
2692 2694
2693 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2695 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2694 TASK_INTERRUPTIBLE); 2696 TASK_INTERRUPTIBLE);
@@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg)
2709 finish_wait(&sci->sc_wait_daemon, &wait); 2711 finish_wait(&sci->sc_wait_daemon, &wait);
2710 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2712 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2711 time_after_eq(jiffies, sci->sc_timer->expires)); 2713 time_after_eq(jiffies, sci->sc_timer->expires));
2714 nilfs = sci->sc_sbi->s_nilfs;
2715 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
2716 set_nilfs_discontinued(nilfs);
2712 } 2717 }
2713 goto loop; 2718 goto loop;
2714 2719
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2c4d76c3366..0e99e5c0bd0f 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -28,7 +28,6 @@
28#include <linux/nilfs2_fs.h> 28#include <linux/nilfs2_fs.h>
29#include "mdt.h" 29#include "mdt.h"
30 30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32 31
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{ 33{
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 151964f0de4c..644e66727dd0 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -50,6 +50,8 @@
50#include <linux/writeback.h> 50#include <linux/writeback.h>
51#include <linux/kobject.h> 51#include <linux/kobject.h>
52#include <linux/exportfs.h> 52#include <linux/exportfs.h>
53#include <linux/seq_file.h>
54#include <linux/mount.h>
53#include "nilfs.h" 55#include "nilfs.h"
54#include "mdt.h" 56#include "mdt.h"
55#include "alloc.h" 57#include "alloc.h"
@@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)"); 67 "(NILFS)");
66MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
67 69
68static void nilfs_write_super(struct super_block *sb);
69static int nilfs_remount(struct super_block *sb, int *flags, char *data); 70static int nilfs_remount(struct super_block *sb, int *flags, char *data);
70 71
71/** 72/**
@@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb)
311 312
312 lock_kernel(); 313 lock_kernel();
313 314
314 if (sb->s_dirt)
315 nilfs_write_super(sb);
316
317 nilfs_detach_segment_constructor(sbi); 315 nilfs_detach_segment_constructor(sbi);
318 316
319 if (!(sb->s_flags & MS_RDONLY)) { 317 if (!(sb->s_flags & MS_RDONLY)) {
@@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb)
336 unlock_kernel(); 334 unlock_kernel();
337} 335}
338 336
339/** 337static int nilfs_sync_fs(struct super_block *sb, int wait)
340 * nilfs_write_super - write super block(s) of NILFS
341 * @sb: super_block
342 *
343 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
344 * clears s_dirt. This function is called in the section protected by
345 * lock_super().
346 *
347 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
348 * of the struct the_nilfs. Lock order must be as follows:
349 *
350 * 1. lock_super()
351 * 2. down_write(&nilfs->ns_sem)
352 *
353 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
354 * of the super block (nilfs->ns_sbp[]).
355 *
356 * In most cases, VFS functions call lock_super() before calling these
357 * methods. So we must be careful not to bring on deadlocks when using
358 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
359 *
360 * Note that order of lock_kernel() and lock_super() depends on contexts
361 * of VFS. We should also note that lock_kernel() can be used in its
362 * protective section and only the outermost one has an effect.
363 */
364static void nilfs_write_super(struct super_block *sb)
365{ 338{
366 struct nilfs_sb_info *sbi = NILFS_SB(sb); 339 struct nilfs_sb_info *sbi = NILFS_SB(sb);
367 struct the_nilfs *nilfs = sbi->s_nilfs; 340 struct the_nilfs *nilfs = sbi->s_nilfs;
368
369 down_write(&nilfs->ns_sem);
370 if (!(sb->s_flags & MS_RDONLY)) {
371 struct nilfs_super_block **sbp = nilfs->ns_sbp;
372 u64 t = get_seconds();
373 int dupsb;
374
375 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
376 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
377 up_write(&nilfs->ns_sem);
378 return;
379 }
380 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
381 nilfs_commit_super(sbi, dupsb);
382 }
383 sb->s_dirt = 0;
384 up_write(&nilfs->ns_sem);
385}
386
387static int nilfs_sync_fs(struct super_block *sb, int wait)
388{
389 int err = 0; 341 int err = 0;
390 342
391 nilfs_write_super(sb);
392
393 /* This function is called when super block should be written back */ 343 /* This function is called when super block should be written back */
394 if (wait) 344 if (wait)
395 err = nilfs_construct_segment(sb); 345 err = nilfs_construct_segment(sb);
346
347 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt)
349 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem);
351
396 return err; 352 return err;
397} 353}
398 354
@@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
407 list_add(&sbi->s_list, &nilfs->ns_supers); 363 list_add(&sbi->s_list, &nilfs->ns_supers);
408 up_write(&nilfs->ns_super_sem); 364 up_write(&nilfs->ns_super_sem);
409 365
410 sbi->s_ifile = nilfs_mdt_new( 366 sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
411 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
412 if (!sbi->s_ifile) 367 if (!sbi->s_ifile)
413 return -ENOMEM; 368 return -ENOMEM;
414 369
@@ -529,7 +484,27 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
529 return 0; 484 return 0;
530} 485}
531 486
532static struct super_operations nilfs_sops = { 487static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
488{
489 struct super_block *sb = vfs->mnt_sb;
490 struct nilfs_sb_info *sbi = NILFS_SB(sb);
491
492 if (!nilfs_test_opt(sbi, BARRIER))
493 seq_printf(seq, ",barrier=off");
494 if (nilfs_test_opt(sbi, SNAPSHOT))
495 seq_printf(seq, ",cp=%llu",
496 (unsigned long long int)sbi->s_snapshot_cno);
497 if (nilfs_test_opt(sbi, ERRORS_RO))
498 seq_printf(seq, ",errors=remount-ro");
499 if (nilfs_test_opt(sbi, ERRORS_PANIC))
500 seq_printf(seq, ",errors=panic");
501 if (nilfs_test_opt(sbi, STRICT_ORDER))
502 seq_printf(seq, ",order=strict");
503
504 return 0;
505}
506
507static const struct super_operations nilfs_sops = {
533 .alloc_inode = nilfs_alloc_inode, 508 .alloc_inode = nilfs_alloc_inode,
534 .destroy_inode = nilfs_destroy_inode, 509 .destroy_inode = nilfs_destroy_inode,
535 .dirty_inode = nilfs_dirty_inode, 510 .dirty_inode = nilfs_dirty_inode,
@@ -538,7 +513,7 @@ static struct super_operations nilfs_sops = {
538 /* .drop_inode = nilfs_drop_inode, */ 513 /* .drop_inode = nilfs_drop_inode, */
539 .delete_inode = nilfs_delete_inode, 514 .delete_inode = nilfs_delete_inode,
540 .put_super = nilfs_put_super, 515 .put_super = nilfs_put_super,
541 .write_super = nilfs_write_super, 516 /* .write_super = nilfs_write_super, */
542 .sync_fs = nilfs_sync_fs, 517 .sync_fs = nilfs_sync_fs,
543 /* .write_super_lockfs */ 518 /* .write_super_lockfs */
544 /* .unlockfs */ 519 /* .unlockfs */
@@ -546,7 +521,7 @@ static struct super_operations nilfs_sops = {
546 .remount_fs = nilfs_remount, 521 .remount_fs = nilfs_remount,
547 .clear_inode = nilfs_clear_inode, 522 .clear_inode = nilfs_clear_inode,
548 /* .umount_begin */ 523 /* .umount_begin */
549 /* .show_options */ 524 .show_options = nilfs_show_options
550}; 525};
551 526
552static struct inode * 527static struct inode *
@@ -585,7 +560,7 @@ nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
585 nilfs_nfs_get_inode); 560 nilfs_nfs_get_inode);
586} 561}
587 562
588static struct export_operations nilfs_export_ops = { 563static const struct export_operations nilfs_export_ops = {
589 .fh_to_dentry = nilfs_fh_to_dentry, 564 .fh_to_dentry = nilfs_fh_to_dentry,
590 .fh_to_parent = nilfs_fh_to_parent, 565 .fh_to_parent = nilfs_fh_to_parent,
591 .get_parent = nilfs_get_parent, 566 .get_parent = nilfs_get_parent,
@@ -816,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
816 791
817 if (sb->s_flags & MS_RDONLY) { 792 if (sb->s_flags & MS_RDONLY) {
818 if (nilfs_test_opt(sbi, SNAPSHOT)) { 793 if (nilfs_test_opt(sbi, SNAPSHOT)) {
794 down_read(&nilfs->ns_segctor_sem);
819 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, 795 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
820 sbi->s_snapshot_cno); 796 sbi->s_snapshot_cno);
821 if (err < 0) 797 up_read(&nilfs->ns_segctor_sem);
798 if (err < 0) {
799 if (err == -ENOENT)
800 err = -EINVAL;
822 goto failed_sbi; 801 goto failed_sbi;
802 }
823 if (!err) { 803 if (!err) {
824 printk(KERN_ERR 804 printk(KERN_ERR
825 "NILFS: The specified checkpoint is " 805 "NILFS: The specified checkpoint is "
@@ -1127,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1127 */ 1107 */
1128 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); 1108 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1129 1109
1130 if (!sd.cno)
1131 /* trying to get the latest checkpoint. */
1132 sd.cno = nilfs_last_cno(nilfs);
1133
1134 /* 1110 /*
1135 * Get super block instance holding the nilfs_sb_info struct. 1111 * Get super block instance holding the nilfs_sb_info struct.
1136 * A new instance is allocated if no existing mount is present or 1112 * A new instance is allocated if no existing mount is present or
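The nilfs_show_options() added above prints only the mount options that differ from the defaults, in the ",name=value" form that reaches /proc/mounts. A userspace sketch of the same pattern; the opts struct is an illustrative stand-in for nilfs_sb_info:

#include <stdio.h>
#include <stdbool.h>

struct opts {
        bool barrier;
        bool snapshot;
        unsigned long long cno;
        bool errors_panic;
};

static void show_options(FILE *seq, const struct opts *o)
{
        if (!o->barrier)
                fprintf(seq, ",barrier=off");
        if (o->snapshot)
                fprintf(seq, ",cp=%llu", o->cno);
        if (o->errors_panic)
                fprintf(seq, ",errors=panic");
}

int main(void)
{
        struct opts o = { .barrier = false, .snapshot = true, .cno = 7 };

        printf("/dev/sda1 /mnt nilfs2 rw");
        show_options(stdout, &o);
        printf(" 0 0\n");
        return 0;
}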
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b8889825716..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
68 68
69 nilfs->ns_bdev = bdev; 69 nilfs->ns_bdev = bdev;
70 atomic_set(&nilfs->ns_count, 1); 70 atomic_set(&nilfs->ns_count, 1);
71 atomic_set(&nilfs->ns_writer_refcount, -1);
72 atomic_set(&nilfs->ns_ndirtyblks, 0); 71 atomic_set(&nilfs->ns_ndirtyblks, 0);
73 init_rwsem(&nilfs->ns_sem); 72 init_rwsem(&nilfs->ns_sem);
74 init_rwsem(&nilfs->ns_super_sem); 73 init_rwsem(&nilfs->ns_super_sem);
75 mutex_init(&nilfs->ns_mount_mutex); 74 mutex_init(&nilfs->ns_mount_mutex);
76 mutex_init(&nilfs->ns_writer_mutex); 75 init_rwsem(&nilfs->ns_writer_sem);
77 INIT_LIST_HEAD(&nilfs->ns_list); 76 INIT_LIST_HEAD(&nilfs->ns_list);
78 INIT_LIST_HEAD(&nilfs->ns_supers); 77 INIT_LIST_HEAD(&nilfs->ns_supers);
79 spin_lock_init(&nilfs->ns_last_segment_lock); 78 spin_lock_init(&nilfs->ns_last_segment_lock);
@@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
188 inode_size = nilfs->ns_inode_size; 187 inode_size = nilfs->ns_inode_size;
189 188
190 err = -ENOMEM; 189 err = -ENOMEM;
191 nilfs->ns_dat = nilfs_mdt_new( 190 nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
192 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
193 if (unlikely(!nilfs->ns_dat)) 191 if (unlikely(!nilfs->ns_dat))
194 goto failed; 192 goto failed;
195 193
196 nilfs->ns_gc_dat = nilfs_mdt_new( 194 nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
197 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
198 if (unlikely(!nilfs->ns_gc_dat)) 195 if (unlikely(!nilfs->ns_gc_dat))
199 goto failed_dat; 196 goto failed_dat;
200 197
201 nilfs->ns_cpfile = nilfs_mdt_new( 198 nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
202 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
203 if (unlikely(!nilfs->ns_cpfile)) 199 if (unlikely(!nilfs->ns_cpfile))
204 goto failed_gc_dat; 200 goto failed_gc_dat;
205 201
206 nilfs->ns_sufile = nilfs_mdt_new( 202 nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
207 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
208 if (unlikely(!nilfs->ns_sufile)) 203 if (unlikely(!nilfs->ns_sufile))
209 goto failed_cpfile; 204 goto failed_cpfile;
210 205
@@ -596,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
596 591
597 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 592 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
598 593
599 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; 594 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
600 if (!bdi)
601 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
602 nilfs->ns_bdi = bdi ? : &default_backing_dev_info; 595 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
603 596
604 /* Finding last segment */ 597 /* Finding last segment */
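The surviving line `nilfs->ns_bdi = bdi ? : &default_backing_dev_info;` relies on the GNU "?:" extension: `a ?: b` evaluates a once and yields it when non-zero, otherwise b. A tiny sketch (builds with GCC's default GNU C dialect):

#include <stdio.h>

int main(void)
{
        const char *bdi = NULL;
        const char *dflt = "default_backing_dev_info";
        const char *chosen = bdi ?: dflt;       /* GNU C extension */

        printf("using %s\n", chosen);
        return 0;
}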
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1b9caafb8662..20abd55881e0 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -37,6 +37,7 @@ enum {
37 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and 37 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
38 the latest checkpoint was loaded */ 38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */
40}; 41};
41 42
42/** 43/**
@@ -50,8 +51,7 @@ enum {
50 * @ns_sem: semaphore for shared states 51 * @ns_sem: semaphore for shared states
51 * @ns_super_sem: semaphore for global operations across super block instances 52 * @ns_super_sem: semaphore for global operations across super block instances
52 * @ns_mount_mutex: mutex protecting mount process of nilfs 53 * @ns_mount_mutex: mutex protecting mount process of nilfs
53 * @ns_writer_mutex: mutex protecting ns_writer attach/detach 54 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
54 * @ns_writer_refcount: number of referrers on ns_writer
55 * @ns_current: back pointer to current mount 55 * @ns_current: back pointer to current mount
56 * @ns_sbh: buffer heads of on-disk super blocks 56 * @ns_sbh: buffer heads of on-disk super blocks
57 * @ns_sbp: pointers to super block data 57 * @ns_sbp: pointers to super block data
@@ -100,8 +100,7 @@ struct the_nilfs {
100 struct rw_semaphore ns_sem; 100 struct rw_semaphore ns_sem;
101 struct rw_semaphore ns_super_sem; 101 struct rw_semaphore ns_super_sem;
102 struct mutex ns_mount_mutex; 102 struct mutex ns_mount_mutex;
103 struct mutex ns_writer_mutex; 103 struct rw_semaphore ns_writer_sem;
104 atomic_t ns_writer_refcount;
105 104
106 /* 105 /*
107 * components protected by ns_super_sem 106 * components protected by ns_super_sem
@@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
197THE_NILFS_FNS(INIT, init) 196THE_NILFS_FNS(INIT, init)
198THE_NILFS_FNS(LOADED, loaded) 197THE_NILFS_FNS(LOADED, loaded)
199THE_NILFS_FNS(DISCONTINUED, discontinued) 198THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running)
200 200
201/* Minimum interval of periodical update of superblocks (in seconds) */ 201/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 202#define NILFS_SB_FREQ 10
203#define NILFS_ALTSB_FREQ 60 /* spare superblock */ 203#define NILFS_ALTSB_FREQ 60 /* spare superblock */
204 204
205static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
206{
207 u64 t = get_seconds();
208 return t < nilfs->ns_sbwtime[0] ||
209 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
210}
211
212static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
213{
214 u64 t = get_seconds();
215 struct nilfs_super_block **sbp = nilfs->ns_sbp;
216 return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
217}
218
205void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 219void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
206struct the_nilfs *find_or_create_nilfs(struct block_device *); 220struct the_nilfs *find_or_create_nilfs(struct block_device *);
207void put_nilfs(struct the_nilfs *); 221void put_nilfs(struct the_nilfs *);
@@ -221,34 +235,21 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
221 atomic_inc(&nilfs->ns_count); 235 atomic_inc(&nilfs->ns_count);
222} 236}
223 237
224static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
225{
226 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
227 mutex_lock(&nilfs->ns_writer_mutex);
228 return nilfs->ns_writer;
229}
230
231static inline void nilfs_put_writer(struct the_nilfs *nilfs)
232{
233 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
234 mutex_unlock(&nilfs->ns_writer_mutex);
235}
236
237static inline void 238static inline void
238nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 239nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
239{ 240{
240 mutex_lock(&nilfs->ns_writer_mutex); 241 down_write(&nilfs->ns_writer_sem);
241 nilfs->ns_writer = sbi; 242 nilfs->ns_writer = sbi;
242 mutex_unlock(&nilfs->ns_writer_mutex); 243 up_write(&nilfs->ns_writer_sem);
243} 244}
244 245
245static inline void 246static inline void
246nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 247nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
247{ 248{
248 mutex_lock(&nilfs->ns_writer_mutex); 249 down_write(&nilfs->ns_writer_sem);
249 if (sbi == nilfs->ns_writer) 250 if (sbi == nilfs->ns_writer)
250 nilfs->ns_writer = NULL; 251 nilfs->ns_writer = NULL;
251 mutex_unlock(&nilfs->ns_writer_mutex); 252 up_write(&nilfs->ns_writer_sem);
252} 253}
253 254
254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) 255static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
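The two inline predicates added to the_nilfs.h encode the superblock write policy: the primary copy is refreshed at most every NILFS_SB_FREQ seconds (or whenever the clock has stepped backwards past the last write time), the spare copy at most every NILFS_ALTSB_FREQ seconds. A userspace model of the same checks, with plain time_t stamps standing in for ns_sbwtime[]:

#include <stdio.h>
#include <stdbool.h>
#include <time.h>

#define SB_FREQ    10   /* seconds between primary superblock writes */
#define ALTSB_FREQ 60   /* seconds between spare superblock writes */

static bool sb_need_update(time_t sbwtime)
{
        time_t t = time(NULL);

        /* also update when the clock moved backwards */
        return t < sbwtime || t > sbwtime + SB_FREQ;
}

static bool altsb_need_update(time_t altsbwtime, bool have_altsb)
{
        time_t t = time(NULL);

        return have_altsb && t > altsbwtime + ALTSB_FREQ;
}

int main(void)
{
        time_t last = time(NULL) - 30;

        printf("sb: %d, altsb: %d\n",
               sb_need_update(last), altsb_need_update(last, true));
        return 0;
}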
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 477d37d83b31..44a88a9fa2c8 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
124 while (*s && len > 0) { 124 while (*s && len > 0) {
125 if (*s & 0x80) { 125 if (*s & 0x80) {
126 size = utf8_to_utf32(s, len, &u); 126 size = utf8_to_utf32(s, len, &u);
127 if (size < 0) { 127 if (size < 0)
128 /* Ignore character and move on */ 128 return -EINVAL;
129 size = 1; 129
130 } else if (u >= PLANE_SIZE) { 130 if (u >= PLANE_SIZE) {
131 u -= PLANE_SIZE; 131 u -= PLANE_SIZE;
132 *op++ = (wchar_t) (SURROGATE_PAIR | 132 *op++ = (wchar_t) (SURROGATE_PAIR |
133 ((u >> 10) & SURROGATE_BITS)); 133 ((u >> 10) & SURROGATE_BITS));
@@ -270,7 +270,8 @@ struct nls_table *load_nls(char *charset)
270 270
271void unload_nls(struct nls_table *nls) 271void unload_nls(struct nls_table *nls)
272{ 272{
273 module_put(nls->owner); 273 if (nls)
274 module_put(nls->owner);
274} 275}
275 276
276static const wchar_t charset2uni[256] = { 277static const wchar_t charset2uni[256] = {
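The unload_nls() hunk makes the release path NULL-tolerant, the convention kfree() already follows, so callers (including the ntfs ones later in this diff) can drop their "if (x)" guards. A sketch of the pattern with an illustrative stand-in type:

#include <stdio.h>
#include <stdlib.h>

struct table { const char *name; };

static void unload_table(struct table *t)
{
        if (t)                  /* tolerate NULL, like kfree() */
                free(t);
}

int main(void)
{
        struct table *t = NULL;

        unload_table(t);        /* safe: no caller-side NULL check needed */
        puts("no crash on NULL");
        return 0;
}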
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 5dcbafe72d71..c9ee67b442e1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -105,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
105 return send; 105 return send;
106} 106}
107 107
108/*
109 * This is NEVER supposed to be called. Inotify marks should either have been
110 * removed from the idr when the watch was removed or in the
111 * fsnotify_destroy_mark_by_group() call when the inotify instance was being
112 * torn down. This is only called if the idr is about to be freed but there
113 * are still marks in it.
114 */
108static int idr_callback(int id, void *p, void *data) 115static int idr_callback(int id, void *p, void *data)
109{ 116{
110 BUG(); 117 struct fsnotify_mark_entry *entry;
118 struct inotify_inode_mark_entry *ientry;
119 static bool warned = false;
120
121 if (warned)
122 return 0;
123
 124 warned = true;
125 entry = p;
126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
127
128 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
129 "idr. Probably leaking memory\n", id, p, data);
130
131 /*
132 * I'm taking the liberty of assuming that the mark in question is a
133 * valid address and I'm dereferencing it. This might help to figure
134 * out why we got here and the panic is no worse than the original
135 * BUG() that was here.
136 */
137 if (entry)
138 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
139 entry->group, entry->inode, ientry->wd);
111 return 0; 140 return 0;
112} 141}
113 142
114static void inotify_free_group_priv(struct fsnotify_group *group) 143static void inotify_free_group_priv(struct fsnotify_group *group)
115{ 144{
 116 /* ideally the idr is empty and we won't hit the BUG in the callback */ 145 /* ideally the idr is empty and we won't hit the BUG in the callback */
117 idr_for_each(&group->inotify_data.idr, idr_callback, NULL); 146 idr_for_each(&group->inotify_data.idr, idr_callback, group);
118 idr_remove_all(&group->inotify_data.idr); 147 idr_remove_all(&group->inotify_data.idr);
119 idr_destroy(&group->inotify_data.idr); 148 idr_destroy(&group->inotify_data.idr);
120} 149}
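idr_callback() above is a warn-once diagnostic: a static flag suppresses every report after the first leaked entry, so an idr full of leaks does not flood the log. For that guard to work the flag must flip to true on the first hit, as in this minimal standalone sketch:

#include <stdio.h>
#include <stdbool.h>

static int report_leak(int id, void *p)
{
        static bool warned = false;

        if (warned)
                return 0;
        warned = true;          /* warn only once */

        fprintf(stderr, "leak: id=%d entry=%p still registered\n", id, p);
        return 0;
}

int main(void)
{
        int dummy;

        report_leak(1, &dummy);
        report_leak(2, &dummy); /* silent: already warned once */
        return 0;
}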
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dc32ed8323ba..dcd2040d330c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,9 +47,6 @@
47 47
48static struct vfsmount *inotify_mnt __read_mostly; 48static struct vfsmount *inotify_mnt __read_mostly;
49 49
50/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */
51static struct inotify_event nul_inotify_event;
52
53/* these are configurable via /proc/sys/fs/inotify/ */ 50/* these are configurable via /proc/sys/fs/inotify/ */
54static int inotify_max_user_instances __read_mostly; 51static int inotify_max_user_instances __read_mostly;
55static int inotify_max_queued_events __read_mostly; 52static int inotify_max_queued_events __read_mostly;
@@ -157,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
157 154
158 event = fsnotify_peek_notify_event(group); 155 event = fsnotify_peek_notify_event(group);
159 156
160 event_size += roundup(event->name_len, event_size); 157 if (event->name_len)
158 event_size += roundup(event->name_len + 1, event_size);
161 159
162 if (event_size > count) 160 if (event_size > count)
163 return ERR_PTR(-EINVAL); 161 return ERR_PTR(-EINVAL);
@@ -183,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
183 struct fsnotify_event_private_data *fsn_priv; 181 struct fsnotify_event_private_data *fsn_priv;
184 struct inotify_event_private_data *priv; 182 struct inotify_event_private_data *priv;
185 size_t event_size = sizeof(struct inotify_event); 183 size_t event_size = sizeof(struct inotify_event);
186 size_t name_len; 184 size_t name_len = 0;
187 185
188 /* we get the inotify watch descriptor from the event private data */ 186 /* we get the inotify watch descriptor from the event private data */
189 spin_lock(&event->lock); 187 spin_lock(&event->lock);
@@ -199,8 +197,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
199 inotify_free_event_priv(fsn_priv); 197 inotify_free_event_priv(fsn_priv);
200 } 198 }
201 199
202 /* round up event->name_len so it is a multiple of event_size */ 200 /*
203 name_len = roundup(event->name_len, event_size); 201 * round up event->name_len so it is a multiple of event_size
202 * plus an extra byte for the terminating '\0'.
203 */
204 if (event->name_len)
205 name_len = roundup(event->name_len + 1, event_size);
204 inotify_event.len = name_len; 206 inotify_event.len = name_len;
205 207
206 inotify_event.mask = inotify_mask_to_arg(event->mask); 208 inotify_event.mask = inotify_mask_to_arg(event->mask);
@@ -224,8 +226,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
224 return -EFAULT; 226 return -EFAULT;
225 buf += event->name_len; 227 buf += event->name_len;
226 228
227 /* fill userspace with 0's from nul_inotify_event */ 229 /* fill userspace with 0's */
228 if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) 230 if (clear_user(buf, len_to_zero))
229 return -EFAULT; 231 return -EFAULT;
230 buf += len_to_zero; 232 buf += len_to_zero;
231 event_size += name_len; 233 event_size += name_len;
@@ -326,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
326 list_for_each_entry(holder, &group->notification_list, event_list) { 328 list_for_each_entry(holder, &group->notification_list, event_list) {
327 event = holder->event; 329 event = holder->event;
328 send_len += sizeof(struct inotify_event); 330 send_len += sizeof(struct inotify_event);
329 send_len += roundup(event->name_len, 331 if (event->name_len)
330 sizeof(struct inotify_event)); 332 send_len += roundup(event->name_len + 1,
333 sizeof(struct inotify_event));
331 } 334 }
332 mutex_unlock(&group->notification_mutex); 335 mutex_unlock(&group->notification_mutex);
333 ret = put_user(send_len, (int __user *) p); 336 ret = put_user(send_len, (int __user *) p);
@@ -364,20 +367,53 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
364 return error; 367 return error;
365} 368}
366 369
370/*
371 * Remove the mark from the idr (if present) and drop the reference
372 * on the mark because it was in the idr.
373 */
367static void inotify_remove_from_idr(struct fsnotify_group *group, 374static void inotify_remove_from_idr(struct fsnotify_group *group,
368 struct inotify_inode_mark_entry *ientry) 375 struct inotify_inode_mark_entry *ientry)
369{ 376{
370 struct idr *idr; 377 struct idr *idr;
378 struct fsnotify_mark_entry *entry;
379 struct inotify_inode_mark_entry *found_ientry;
380 int wd;
371 381
372 spin_lock(&group->inotify_data.idr_lock); 382 spin_lock(&group->inotify_data.idr_lock);
373 idr = &group->inotify_data.idr; 383 idr = &group->inotify_data.idr;
374 idr_remove(idr, ientry->wd); 384 wd = ientry->wd;
375 spin_unlock(&group->inotify_data.idr_lock); 385
386 if (wd == -1)
387 goto out;
388
389 entry = idr_find(&group->inotify_data.idr, wd);
390 if (unlikely(!entry))
391 goto out;
392
393 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
394 if (unlikely(found_ientry != ientry)) {
395 /* We found an entry in the idr with the right wd, but it's
396 * not the entry we were told to remove. eparis seriously
397 * fucked up somewhere. */
398 WARN_ON(1);
399 ientry->wd = -1;
400 goto out;
401 }
402
403 /* One ref for being in the idr, one ref held by the caller */
404 BUG_ON(atomic_read(&entry->refcnt) < 2);
405
406 idr_remove(idr, wd);
376 ientry->wd = -1; 407 ientry->wd = -1;
408
409 /* removed from the idr, drop that ref */
410 fsnotify_put_mark(entry);
411out:
412 spin_unlock(&group->inotify_data.idr_lock);
377} 413}
414
378/* 415/*
379 * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the 416 * Send IN_IGNORED for this wd, remove this wd from the idr.
380 * internal reference help on the mark because it is in the idr.
381 */ 417 */
382void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 418void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
383 struct fsnotify_group *group) 419 struct fsnotify_group *group)
@@ -417,9 +453,6 @@ skip_send_ignore:
417 /* remove this entry from the idr */ 453 /* remove this entry from the idr */
418 inotify_remove_from_idr(group, ientry); 454 inotify_remove_from_idr(group, ientry);
419 455
420 /* removed from idr, drop that reference */
421 fsnotify_put_mark(entry);
422
423 atomic_dec(&group->inotify_data.user->inotify_watches); 456 atomic_dec(&group->inotify_data.user->inotify_watches);
424} 457}
425 458
@@ -431,80 +464,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry)
431 kmem_cache_free(inotify_inode_mark_cachep, ientry); 464 kmem_cache_free(inotify_inode_mark_cachep, ientry);
432} 465}
433 466
434static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) 467static int inotify_update_existing_watch(struct fsnotify_group *group,
468 struct inode *inode,
469 u32 arg)
435{ 470{
436 struct fsnotify_mark_entry *entry = NULL; 471 struct fsnotify_mark_entry *entry;
437 struct inotify_inode_mark_entry *ientry; 472 struct inotify_inode_mark_entry *ientry;
438 struct inotify_inode_mark_entry *tmp_ientry;
439 int ret = 0;
440 int add = (arg & IN_MASK_ADD);
441 __u32 mask;
442 __u32 old_mask, new_mask; 473 __u32 old_mask, new_mask;
474 __u32 mask;
475 int add = (arg & IN_MASK_ADD);
476 int ret;
443 477
444 /* don't allow invalid bits: we don't want flags set */ 478 /* don't allow invalid bits: we don't want flags set */
445 mask = inotify_arg_to_mask(arg); 479 mask = inotify_arg_to_mask(arg);
446 if (unlikely(!mask)) 480 if (unlikely(!mask))
447 return -EINVAL; 481 return -EINVAL;
448 482
449 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
450 if (unlikely(!tmp_ientry))
451 return -ENOMEM;
452 /* we set the mask at the end after attaching it */
453 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
454 tmp_ientry->wd = -1;
455
456find_entry:
457 spin_lock(&inode->i_lock); 483 spin_lock(&inode->i_lock);
458 entry = fsnotify_find_mark_entry(group, inode); 484 entry = fsnotify_find_mark_entry(group, inode);
459 spin_unlock(&inode->i_lock); 485 spin_unlock(&inode->i_lock);
460 if (entry) { 486 if (!entry)
461 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 487 return -ENOENT;
462 } else {
463 ret = -ENOSPC;
464 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
465 goto out_err;
466retry:
467 ret = -ENOMEM;
468 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
469 goto out_err;
470
471 spin_lock(&group->inotify_data.idr_lock);
472 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
473 group->inotify_data.last_wd,
474 &tmp_ientry->wd);
475 spin_unlock(&group->inotify_data.idr_lock);
476 if (ret) {
477 if (ret == -EAGAIN)
478 goto retry;
479 goto out_err;
480 }
481
482 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
483 if (ret) {
484 inotify_remove_from_idr(group, tmp_ientry);
485 if (ret == -EEXIST)
486 goto find_entry;
487 goto out_err;
488 }
489
490 /* tmp_ientry has been added to the inode, so we are all set up.
491 * now we just need to make sure tmp_ientry doesn't get freed and
492 * we need to set up entry and ientry so the generic code can
493 * do its thing. */
494 ientry = tmp_ientry;
495 entry = &ientry->fsn_entry;
496 tmp_ientry = NULL;
497
498 atomic_inc(&group->inotify_data.user->inotify_watches);
499
500 /* update the idr hint */
501 group->inotify_data.last_wd = ientry->wd;
502
503 /* we put the mark on the idr, take a reference */
504 fsnotify_get_mark(entry);
505 }
506 488
507 ret = ientry->wd; 489 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
508 490
509 spin_lock(&entry->lock); 491 spin_lock(&entry->lock);
510 492
@@ -536,18 +518,107 @@ retry:
536 fsnotify_recalc_group_mask(group); 518 fsnotify_recalc_group_mask(group);
537 } 519 }
538 520
539 /* this either matches fsnotify_find_mark_entry, or init_mark_entry 521 /* return the wd */
540 * depending on which path we took... */ 522 ret = ientry->wd;
523
524 /* match the get from fsnotify_find_mark_entry() */
541 fsnotify_put_mark(entry); 525 fsnotify_put_mark(entry);
542 526
527 return ret;
528}
529
530static int inotify_new_watch(struct fsnotify_group *group,
531 struct inode *inode,
532 u32 arg)
533{
534 struct inotify_inode_mark_entry *tmp_ientry;
535 __u32 mask;
536 int ret;
537
538 /* don't allow invalid bits: we don't want flags set */
539 mask = inotify_arg_to_mask(arg);
540 if (unlikely(!mask))
541 return -EINVAL;
542
543 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
544 if (unlikely(!tmp_ientry))
545 return -ENOMEM;
546
547 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
548 tmp_ientry->fsn_entry.mask = mask;
549 tmp_ientry->wd = -1;
550
551 ret = -ENOSPC;
552 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
553 goto out_err;
554retry:
555 ret = -ENOMEM;
556 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
557 goto out_err;
558
559 spin_lock(&group->inotify_data.idr_lock);
560 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
561 group->inotify_data.last_wd,
562 &tmp_ientry->wd);
563 spin_unlock(&group->inotify_data.idr_lock);
564 if (ret) {
 565 /* idr was out of memory; allocate more and try again */
566 if (ret == -EAGAIN)
567 goto retry;
568 goto out_err;
569 }
570
571 /* we put the mark on the idr, take a reference */
572 fsnotify_get_mark(&tmp_ientry->fsn_entry);
573
574 /* we are on the idr, now get on the inode */
575 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
576 if (ret) {
577 /* we failed to get on the inode, get off the idr */
578 inotify_remove_from_idr(group, tmp_ientry);
579 goto out_err;
580 }
581
582 /* update the idr hint, who cares about races, it's just a hint */
583 group->inotify_data.last_wd = tmp_ientry->wd;
584
585 /* increment the number of watches the user has */
586 atomic_inc(&group->inotify_data.user->inotify_watches);
587
588 /* return the watch descriptor for this new entry */
589 ret = tmp_ientry->wd;
590
 591 /* match the ref from fsnotify_init_mark() */
592 fsnotify_put_mark(&tmp_ientry->fsn_entry);
593
594 /* if this mark added a new event update the group mask */
595 if (mask & ~group->mask)
596 fsnotify_recalc_group_mask(group);
597
543out_err: 598out_err:
544 /* could be an error, could be that we found an existing mark */ 599 if (ret < 0)
545 if (tmp_ientry) {
546 /* on the idr but didn't make it on the inode */
547 if (tmp_ientry->wd != -1)
548 inotify_remove_from_idr(group, tmp_ientry);
549 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); 600 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
550 } 601
602 return ret;
603}
604
605static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
606{
607 int ret = 0;
608
609retry:
 610 /* try to update an existing watch with the new arg */
611 ret = inotify_update_existing_watch(group, inode, arg);
612 /* no mark present, try to add a new one */
613 if (ret == -ENOENT)
614 ret = inotify_new_watch(group, inode, arg);
615 /*
 616 * inotify_new_watch() could race with another thread that added a
 617 * watch between the update_existing check and the add here; if so,
 618 * go back and try to update the now-existing mark again.
619 */
620 if (ret == -EEXIST)
621 goto retry;
551 622
552 return ret; 623 return ret;
553} 624}
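The inotify_user.c hunks change the length math in three places so that a non-empty name is padded to a multiple of sizeof(struct inotify_event) with one byte reserved for the terminating '\0', while an empty name adds nothing. A runnable userspace sketch of that computation; the ROUNDUP macro mirrors the kernel's roundup():

#include <stdio.h>
#include <sys/inotify.h>

#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

static size_t event_len(size_t name_len)
{
        size_t len = sizeof(struct inotify_event);

        if (name_len)           /* +1 keeps room for the trailing NUL */
                len += ROUNDUP(name_len + 1,
                               sizeof(struct inotify_event));
        return len;
}

int main(void)
{
        printf("no name: %zu bytes\n", event_len(0));
        printf("\"a.txt\": %zu bytes\n", event_len(5));
        return 0;
}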
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b38f944f0667..cfce53cb65d7 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
1550 .migratepage = buffer_migrate_page, /* Move a page cache page from 1550 .migratepage = buffer_migrate_page, /* Move a page cache page from
1551 one physical page to an 1551 one physical page to an
1552 other. */ 1552 other. */
1553 .error_remove_page = generic_error_remove_page,
1553}; 1554};
1554 1555
1555/** 1556/**
@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
1569 .migratepage = buffer_migrate_page, /* Move a page cache page from 1570 .migratepage = buffer_migrate_page, /* Move a page cache page from
1570 one physical page to an 1571 one physical page to an
1571 other. */ 1572 other. */
1573 .error_remove_page = generic_error_remove_page,
1572}; 1574};
1573 1575
1574#ifdef NTFS_RW 1576#ifdef NTFS_RW
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3140a4429af1..663c0e341f8b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2076,14 +2076,6 @@ err_out:
2076 *ppos = pos; 2076 *ppos = pos;
2077 if (cached_page) 2077 if (cached_page)
2078 page_cache_release(cached_page); 2078 page_cache_release(cached_page);
2079 /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
2080 if (likely(!status)) {
2081 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
2082 if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
2083 status = generic_osync_inode(vi, mapping,
2084 OSYNC_METADATA|OSYNC_DATA);
2085 }
2086 }
2087 pagevec_lru_add_file(&lru_pvec); 2079 pagevec_lru_add_file(&lru_pvec);
2088 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2080 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2089 written ? "written" : "status", (unsigned long)written, 2081 written ? "written" : "status", (unsigned long)written,
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2145 mutex_lock(&inode->i_mutex); 2137 mutex_lock(&inode->i_mutex);
2146 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); 2138 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
2147 mutex_unlock(&inode->i_mutex); 2139 mutex_unlock(&inode->i_mutex);
2148 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2140 if (ret > 0) {
2149 int err = sync_page_range(inode, mapping, pos, ret); 2141 int err = generic_write_sync(file, pos, ret);
2150 if (err < 0) 2142 if (err < 0)
2151 ret = err; 2143 ret = err;
2152 } 2144 }
@@ -2154,46 +2146,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2154} 2146}
2155 2147
2156/** 2148/**
2157 * ntfs_file_writev -
2158 *
2159 * Basically the same as generic_file_writev() except that it ends up calling
2160 * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
2161 */
2162static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2163 unsigned long nr_segs, loff_t *ppos)
2164{
2165 struct address_space *mapping = file->f_mapping;
2166 struct inode *inode = mapping->host;
2167 struct kiocb kiocb;
2168 ssize_t ret;
2169
2170 mutex_lock(&inode->i_mutex);
2171 init_sync_kiocb(&kiocb, file);
2172 ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2173 if (ret == -EIOCBQUEUED)
2174 ret = wait_on_sync_kiocb(&kiocb);
2175 mutex_unlock(&inode->i_mutex);
2176 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2177 int err = sync_page_range(inode, mapping, *ppos - ret, ret);
2178 if (err < 0)
2179 ret = err;
2180 }
2181 return ret;
2182}
2183
2184/**
2185 * ntfs_file_write - simple wrapper for ntfs_file_writev()
2186 */
2187static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
2188 size_t count, loff_t *ppos)
2189{
2190 struct iovec local_iov = { .iov_base = (void __user *)buf,
2191 .iov_len = count };
2192
2193 return ntfs_file_writev(file, &local_iov, 1, ppos);
2194}
2195
2196/**
2197 * ntfs_file_fsync - sync a file to disk 2149 * ntfs_file_fsync - sync a file to disk
2198 * @filp: file to be synced 2150 * @filp: file to be synced
2199 * @dentry: dentry describing the file to sync 2151 * @dentry: dentry describing the file to sync
@@ -2255,7 +2207,7 @@ const struct file_operations ntfs_file_ops = {
2255 .read = do_sync_read, /* Read from file. */ 2207 .read = do_sync_read, /* Read from file. */
2256 .aio_read = generic_file_aio_read, /* Async read from file. */ 2208 .aio_read = generic_file_aio_read, /* Async read from file. */
2257#ifdef NTFS_RW 2209#ifdef NTFS_RW
2258 .write = ntfs_file_write, /* Write to file. */ 2210 .write = do_sync_write, /* Write to file. */
2259 .aio_write = ntfs_file_aio_write, /* Async write to file. */ 2211 .aio_write = ntfs_file_aio_write, /* Async write to file. */
2260 /*.release = ,*/ /* Last file is closed. See 2212 /*.release = ,*/ /* Last file is closed. See
2261 fs/ext2/file.c:: 2213 fs/ext2/file.c::
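The ntfs_file_aio_write() hunk drops the open-coded O_SYNC handling in favor of one post-write call to a generic sync helper, and the bespoke write path becomes the stock do_sync_write() wrapper. A userspace analogue of the resulting shape; fsync() stands in for generic_write_sync() and the sync_needed flag is a hypothetical substitute for the O_SYNC/IS_SYNC() test:

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

static ssize_t write_and_sync(int fd, const void *buf, size_t count,
                              int sync_needed)
{
        ssize_t ret = write(fd, buf, count);

        if (ret > 0 && sync_needed) {
                int err = fsync(fd);    /* generic post-write sync step */

                if (err < 0)
                        ret = err;
        }
        return ret;
}

int main(void)
{
        int fd = open("/tmp/demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (fd < 0)
                return 1;
        write_and_sync(fd, "hello\n", 6, 1);
        close(fd);
        return 0;
}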
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 50931b1ce4b9..8b2549f672bf 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -829,7 +829,7 @@ enum {
829 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the 829 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
830 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, 830 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
831 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask 831 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
832 is used to to obtain all flags that are valid for setting. */ 832 is used to obtain all flags that are valid for setting. */
833 /* 833 /*
834 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all 834 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
835 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION 835 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index cd0be3f5c3cd..a44b14cbceeb 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
47 return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); 47 return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
48 /* return (void *)__get_free_page(gfp_mask); */ 48 /* return (void *)__get_free_page(gfp_mask); */
49 } 49 }
50 if (likely(size >> PAGE_SHIFT < num_physpages)) 50 if (likely((size >> PAGE_SHIFT) < totalram_pages))
51 return __vmalloc(size, gfp_mask, PAGE_KERNEL); 51 return __vmalloc(size, gfp_mask, PAGE_KERNEL);
52 return NULL; 52 return NULL;
53} 53}
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 23bf68453d7d..1caa0ef0b2bb 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -384,13 +384,12 @@ unm_err_out:
384 * it is dirty in the inode meta data rather than the data page cache of the 384 * it is dirty in the inode meta data rather than the data page cache of the
385 * inode, and thus there are no data pages that need writing out. Therefore, a 385 * inode, and thus there are no data pages that need writing out. Therefore, a
386 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the 386 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
387 * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to 387 * other hand, is not sufficient, because ->write_inode needs to be called even
388 * ensure ->write_inode is called from generic_osync_inode() and this needs to 388 * in case of fdatasync. This needs to happen or the file data would not
389 * happen or the file data would not necessarily hit the device synchronously, 389 * necessarily hit the device synchronously, even though the vfs inode has the
390 * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC 390 * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
391 * simply "feels" better than just I_DIRTY_SYNC, since the file data has not 391 * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
392 * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own 392 * which is not what I_DIRTY_SYNC on its own would suggest.
393 * would suggest.
394 */ 393 */
395void __mark_mft_record_dirty(ntfs_inode *ni) 394void __mark_mft_record_dirty(ntfs_inode *ni)
396{ 395{
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index abaaa1cbf8de..80b04770e8e9 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -201,8 +201,7 @@ use_utf8:
201 v, old_nls->charset); 201 v, old_nls->charset);
202 nls_map = old_nls; 202 nls_map = old_nls;
203 } else /* nls_map */ { 203 } else /* nls_map */ {
204 if (old_nls) 204 unload_nls(old_nls);
205 unload_nls(old_nls);
206 } 205 }
207 } else if (!strcmp(p, "utf8")) { 206 } else if (!strcmp(p, "utf8")) {
208 bool val = false; 207 bool val = false;
@@ -2427,10 +2426,9 @@ static void ntfs_put_super(struct super_block *sb)
2427 ntfs_free(vol->upcase); 2426 ntfs_free(vol->upcase);
2428 vol->upcase = NULL; 2427 vol->upcase = NULL;
2429 } 2428 }
2430 if (vol->nls_map) { 2429
2431 unload_nls(vol->nls_map); 2430 unload_nls(vol->nls_map);
2432 vol->nls_map = NULL; 2431
2433 }
2434 sb->s_fs_info = NULL; 2432 sb->s_fs_info = NULL;
2435 kfree(vol); 2433 kfree(vol);
2436 2434
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 01596079dd63..31f25ce32c97 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -28,6 +28,7 @@ ocfs2-objs := \
28 locks.o \ 28 locks.o \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \
31 resize.o \ 32 resize.o \
32 slot_map.o \ 33 slot_map.o \
33 suballoc.o \ 34 suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ab513ddaeff2..38a42f5d59ff 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,10 +49,21 @@
49#include "super.h" 49#include "super.h"
50#include "uptodate.h" 50#include "uptodate.h"
51#include "xattr.h" 51#include "xattr.h"
52#include "refcounttree.h"
52 53
53#include "buffer_head_io.h" 54#include "buffer_head_io.h"
54 55
56enum ocfs2_contig_type {
57 CONTIG_NONE = 0,
58 CONTIG_LEFT,
59 CONTIG_RIGHT,
60 CONTIG_LEFTRIGHT,
61};
55 62
63static enum ocfs2_contig_type
64 ocfs2_extent_rec_contig(struct super_block *sb,
65 struct ocfs2_extent_rec *ext,
66 struct ocfs2_extent_rec *insert_rec);
56/* 67/*
57 * Operations for a specific extent tree type. 68 * Operations for a specific extent tree type.
58 * 69 *
@@ -79,18 +90,30 @@ struct ocfs2_extent_tree_operations {
79 * that value. new_clusters is the delta, and must be 90 * that value. new_clusters is the delta, and must be
80 * added to the total. Required. 91 * added to the total. Required.
81 */ 92 */
82 void (*eo_update_clusters)(struct inode *inode, 93 void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
83 struct ocfs2_extent_tree *et,
84 u32 new_clusters); 94 u32 new_clusters);
85 95
86 /* 96 /*
97 * If this extent tree is supported by an extent map, insert
98 * a record into the map.
99 */
100 void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
101 struct ocfs2_extent_rec *rec);
102
103 /*
104 * If this extent tree is supported by an extent map, truncate the
 105 * map to clusters.
106 */
107 void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
108 u32 clusters);
109
110 /*
87 * If ->eo_insert_check() exists, it is called before rec is 111 * If ->eo_insert_check() exists, it is called before rec is
88 * inserted into the extent tree. It is optional. 112 * inserted into the extent tree. It is optional.
89 */ 113 */
90 int (*eo_insert_check)(struct inode *inode, 114 int (*eo_insert_check)(struct ocfs2_extent_tree *et,
91 struct ocfs2_extent_tree *et,
92 struct ocfs2_extent_rec *rec); 115 struct ocfs2_extent_rec *rec);
93 int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); 116 int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
94 117
95 /* 118 /*
96 * -------------------------------------------------------------- 119 * --------------------------------------------------------------
@@ -109,8 +132,17 @@ struct ocfs2_extent_tree_operations {
109 * it exists. If it does not, et->et_max_leaf_clusters is set 132 * it exists. If it does not, et->et_max_leaf_clusters is set
110 * to 0 (unlimited). Optional. 133 * to 0 (unlimited). Optional.
111 */ 134 */
112 void (*eo_fill_max_leaf_clusters)(struct inode *inode, 135 void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
113 struct ocfs2_extent_tree *et); 136
137 /*
 138 * ->eo_extent_contig tests whether two ocfs2_extent_recs
 139 * are contiguous. Optional. There is no need to set it if
 140 * ocfs2_extent_rec is used as the tree leaf.
141 */
142 enum ocfs2_contig_type
143 (*eo_extent_contig)(struct ocfs2_extent_tree *et,
144 struct ocfs2_extent_rec *ext,
145 struct ocfs2_extent_rec *insert_rec);
114}; 146};
115 147
116 148
@@ -121,19 +153,22 @@ struct ocfs2_extent_tree_operations {
121static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); 153static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
122static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 154static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
123 u64 blkno); 155 u64 blkno);
124static void ocfs2_dinode_update_clusters(struct inode *inode, 156static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
125 struct ocfs2_extent_tree *et,
126 u32 clusters); 157 u32 clusters);
127static int ocfs2_dinode_insert_check(struct inode *inode, 158static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
128 struct ocfs2_extent_tree *et, 159 struct ocfs2_extent_rec *rec);
160static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
161 u32 clusters);
162static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
129 struct ocfs2_extent_rec *rec); 163 struct ocfs2_extent_rec *rec);
130static int ocfs2_dinode_sanity_check(struct inode *inode, 164static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
131 struct ocfs2_extent_tree *et);
132static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); 165static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
133static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { 166static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
134 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, 167 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
135 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, 168 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
136 .eo_update_clusters = ocfs2_dinode_update_clusters, 169 .eo_update_clusters = ocfs2_dinode_update_clusters,
170 .eo_extent_map_insert = ocfs2_dinode_extent_map_insert,
171 .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
137 .eo_insert_check = ocfs2_dinode_insert_check, 172 .eo_insert_check = ocfs2_dinode_insert_check,
138 .eo_sanity_check = ocfs2_dinode_sanity_check, 173 .eo_sanity_check = ocfs2_dinode_sanity_check,
139 .eo_fill_root_el = ocfs2_dinode_fill_root_el, 174 .eo_fill_root_el = ocfs2_dinode_fill_root_el,
@@ -156,40 +191,53 @@ static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
156 return le64_to_cpu(di->i_last_eb_blk); 191 return le64_to_cpu(di->i_last_eb_blk);
157} 192}
158 193
159static void ocfs2_dinode_update_clusters(struct inode *inode, 194static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
160 struct ocfs2_extent_tree *et,
161 u32 clusters) 195 u32 clusters)
162{ 196{
197 struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
163 struct ocfs2_dinode *di = et->et_object; 198 struct ocfs2_dinode *di = et->et_object;
164 199
165 le32_add_cpu(&di->i_clusters, clusters); 200 le32_add_cpu(&di->i_clusters, clusters);
166 spin_lock(&OCFS2_I(inode)->ip_lock); 201 spin_lock(&oi->ip_lock);
167 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 202 oi->ip_clusters = le32_to_cpu(di->i_clusters);
168 spin_unlock(&OCFS2_I(inode)->ip_lock); 203 spin_unlock(&oi->ip_lock);
169} 204}
170 205
171static int ocfs2_dinode_insert_check(struct inode *inode, 206static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
172 struct ocfs2_extent_tree *et, 207 struct ocfs2_extent_rec *rec)
208{
209 struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
210
211 ocfs2_extent_map_insert_rec(inode, rec);
212}
213
214static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
215 u32 clusters)
216{
217 struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
218
219 ocfs2_extent_map_trunc(inode, clusters);
220}
221
222static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
173 struct ocfs2_extent_rec *rec) 223 struct ocfs2_extent_rec *rec)
174{ 224{
175 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 225 struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
226 struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
176 227
177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 228 BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 229 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179 (OCFS2_I(inode)->ip_clusters != 230 (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
180 le32_to_cpu(rec->e_cpos)),
181 "Device %s, asking for sparse allocation: inode %llu, " 231 "Device %s, asking for sparse allocation: inode %llu, "
182 "cpos %u, clusters %u\n", 232 "cpos %u, clusters %u\n",
183 osb->dev_str, 233 osb->dev_str,
184 (unsigned long long)OCFS2_I(inode)->ip_blkno, 234 (unsigned long long)oi->ip_blkno,
185 rec->e_cpos, 235 rec->e_cpos, oi->ip_clusters);
186 OCFS2_I(inode)->ip_clusters);
187 236
188 return 0; 237 return 0;
189} 238}
190 239
191static int ocfs2_dinode_sanity_check(struct inode *inode, 240static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
192 struct ocfs2_extent_tree *et)
193{ 241{
194 struct ocfs2_dinode *di = et->et_object; 242 struct ocfs2_dinode *di = et->et_object;
195 243
@@ -229,8 +277,7 @@ static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
229 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); 277 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
230} 278}
231 279
232static void ocfs2_xattr_value_update_clusters(struct inode *inode, 280static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
233 struct ocfs2_extent_tree *et,
234 u32 clusters) 281 u32 clusters)
235{ 282{
236 struct ocfs2_xattr_value_buf *vb = et->et_object; 283 struct ocfs2_xattr_value_buf *vb = et->et_object;
@@ -252,12 +299,11 @@ static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
252 et->et_root_el = &xb->xb_attrs.xb_root.xt_list; 299 et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
253} 300}
254 301
255static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, 302static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
256 struct ocfs2_extent_tree *et)
257{ 303{
304 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
258 et->et_max_leaf_clusters = 305 et->et_max_leaf_clusters =
259 ocfs2_clusters_for_bytes(inode->i_sb, 306 ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
260 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
261} 307}
262 308
263static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, 309static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -277,8 +323,7 @@ static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
277 return le64_to_cpu(xt->xt_last_eb_blk); 323 return le64_to_cpu(xt->xt_last_eb_blk);
278} 324}
279 325
280static void ocfs2_xattr_tree_update_clusters(struct inode *inode, 326static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
281 struct ocfs2_extent_tree *et,
282 u32 clusters) 327 u32 clusters)
283{ 328{
284 struct ocfs2_xattr_block *xb = et->et_object; 329 struct ocfs2_xattr_block *xb = et->et_object;
@@ -309,8 +354,7 @@ static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
309 return le64_to_cpu(dx_root->dr_last_eb_blk); 354 return le64_to_cpu(dx_root->dr_last_eb_blk);
310} 355}
311 356
312static void ocfs2_dx_root_update_clusters(struct inode *inode, 357static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
313 struct ocfs2_extent_tree *et,
314 u32 clusters) 358 u32 clusters)
315{ 359{
316 struct ocfs2_dx_root_block *dx_root = et->et_object; 360 struct ocfs2_dx_root_block *dx_root = et->et_object;
@@ -318,8 +362,7 @@ static void ocfs2_dx_root_update_clusters(struct inode *inode,
318 le32_add_cpu(&dx_root->dr_clusters, clusters); 362 le32_add_cpu(&dx_root->dr_clusters, clusters);
319} 363}
320 364
321static int ocfs2_dx_root_sanity_check(struct inode *inode, 365static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
322 struct ocfs2_extent_tree *et)
323{ 366{
324 struct ocfs2_dx_root_block *dx_root = et->et_object; 367 struct ocfs2_dx_root_block *dx_root = et->et_object;
325 368
@@ -343,8 +386,54 @@ static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el, 386 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344}; 387};
345 388
389static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
390{
391 struct ocfs2_refcount_block *rb = et->et_object;
392
393 et->et_root_el = &rb->rf_list;
394}
395
396static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
397 u64 blkno)
398{
399 struct ocfs2_refcount_block *rb = et->et_object;
400
401 rb->rf_last_eb_blk = cpu_to_le64(blkno);
402}
403
404static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
405{
406 struct ocfs2_refcount_block *rb = et->et_object;
407
408 return le64_to_cpu(rb->rf_last_eb_blk);
409}
410
411static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
412 u32 clusters)
413{
414 struct ocfs2_refcount_block *rb = et->et_object;
415
416 le32_add_cpu(&rb->rf_clusters, clusters);
417}
418
419static enum ocfs2_contig_type
420ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
421 struct ocfs2_extent_rec *ext,
422 struct ocfs2_extent_rec *insert_rec)
423{
424 return CONTIG_NONE;
425}
426
427static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
428 .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
429 .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
430 .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
431 .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el,
432 .eo_extent_contig = ocfs2_refcount_tree_extent_contig,
433};
434
346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 435static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
347 struct inode *inode, 436 struct ocfs2_caching_info *ci,
348 struct buffer_head *bh, 437 struct buffer_head *bh,
349 ocfs2_journal_access_func access, 438 ocfs2_journal_access_func access,
350 void *obj, 439 void *obj,
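
The refcount-tree ops table above wires up only the four mandatory callbacks plus eo_extent_contig; hooks it leaves unset (eo_insert_check, eo_sanity_check, and so on) are simply skipped by the ocfs2_et_*() dispatchers further down. A minimal userspace sketch of this vtable-with-optional-hooks pattern — all names here are illustrative, not kernel API:

#include <stdio.h>

struct et;                                   /* forward declaration */

struct et_ops {
	void (*update_clusters)(struct et *et, unsigned clusters); /* mandatory */
	int  (*sanity_check)(struct et *et);                       /* optional  */
};

struct et {
	const struct et_ops *ops;
	unsigned clusters;
};

static void demo_update(struct et *et, unsigned clusters)
{
	et->clusters += clusters;
}

/* Dispatcher for an optional hook: NULL means "nothing to do". */
static int et_sanity_check(struct et *et)
{
	if (et->ops->sanity_check)
		return et->ops->sanity_check(et);
	return 0;
}

static const struct et_ops demo_ops = {
	.update_clusters = demo_update,
	/* .sanity_check left NULL, like the refcount tree's unset hooks */
};

int main(void)
{
	struct et et = { .ops = &demo_ops, .clusters = 0 };

	et.ops->update_clusters(&et, 8);
	printf("clusters=%u sanity=%d\n", et.clusters, et_sanity_check(&et));
	return 0;
}
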
@@ -352,6 +441,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
352{ 441{
353 et->et_ops = ops; 442 et->et_ops = ops;
354 et->et_root_bh = bh; 443 et->et_root_bh = bh;
444 et->et_ci = ci;
355 et->et_root_journal_access = access; 445 et->et_root_journal_access = access;
356 if (!obj) 446 if (!obj)
357 obj = (void *)bh->b_data; 447 obj = (void *)bh->b_data;
@@ -361,41 +451,49 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
361 if (!et->et_ops->eo_fill_max_leaf_clusters) 451 if (!et->et_ops->eo_fill_max_leaf_clusters)
362 et->et_max_leaf_clusters = 0; 452 et->et_max_leaf_clusters = 0;
363 else 453 else
364 et->et_ops->eo_fill_max_leaf_clusters(inode, et); 454 et->et_ops->eo_fill_max_leaf_clusters(et);
365} 455}
366 456
367void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 457void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
368 struct inode *inode, 458 struct ocfs2_caching_info *ci,
369 struct buffer_head *bh) 459 struct buffer_head *bh)
370{ 460{
371 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di, 461 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
372 NULL, &ocfs2_dinode_et_ops); 462 NULL, &ocfs2_dinode_et_ops);
373} 463}
374 464
375void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 465void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
376 struct inode *inode, 466 struct ocfs2_caching_info *ci,
377 struct buffer_head *bh) 467 struct buffer_head *bh)
378{ 468{
379 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb, 469 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
380 NULL, &ocfs2_xattr_tree_et_ops); 470 NULL, &ocfs2_xattr_tree_et_ops);
381} 471}
382 472
383void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 473void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
384 struct inode *inode, 474 struct ocfs2_caching_info *ci,
385 struct ocfs2_xattr_value_buf *vb) 475 struct ocfs2_xattr_value_buf *vb)
386{ 476{
387 __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb, 477 __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
388 &ocfs2_xattr_value_et_ops); 478 &ocfs2_xattr_value_et_ops);
389} 479}
390 480
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, 481void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode, 482 struct ocfs2_caching_info *ci,
393 struct buffer_head *bh) 483 struct buffer_head *bh)
394{ 484{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr, 485 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops); 486 NULL, &ocfs2_dx_root_et_ops);
397} 487}
398 488
489void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
490 struct ocfs2_caching_info *ci,
491 struct buffer_head *bh)
492{
493 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
494 NULL, &ocfs2_refcount_tree_et_ops);
495}
496
399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 497static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
400 u64 new_last_eb_blk) 498 u64 new_last_eb_blk)
401{ 499{
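
Each init wrapper above differs only in the journal-access callback and ops table it forwards to __ocfs2_init_extent_tree(). A hedged sketch of that thin-constructor pattern (illustrative names, not the kernel's):

#include <stdio.h>

typedef int (*access_fn)(const char *what);

struct tree {
	access_fn access;          /* stored, like et->et_root_journal_access */
	const char *kind;
};

static int access_di(const char *what) { printf("di: %s\n", what); return 0; }
static int access_rb(const char *what) { printf("rb: %s\n", what); return 0; }

/* One real initializer; the wrappers only choose its arguments. */
static void init_tree(struct tree *t, access_fn access, const char *kind)
{
	t->access = access;
	t->kind = kind;
}

static void init_dinode_tree(struct tree *t)   { init_tree(t, access_di, "dinode"); }
static void init_refcount_tree(struct tree *t) { init_tree(t, access_rb, "refcount"); }

int main(void)
{
	struct tree t;

	init_refcount_tree(&t);
	return t.access(t.kind);
}
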
@@ -407,78 +505,71 @@ static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
407 return et->et_ops->eo_get_last_eb_blk(et); 505 return et->et_ops->eo_get_last_eb_blk(et);
408} 506}
409 507
410static inline void ocfs2_et_update_clusters(struct inode *inode, 508static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
411 struct ocfs2_extent_tree *et,
412 u32 clusters) 509 u32 clusters)
413{ 510{
414 et->et_ops->eo_update_clusters(inode, et, clusters); 511 et->et_ops->eo_update_clusters(et, clusters);
512}
513
514static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
515 struct ocfs2_extent_rec *rec)
516{
517 if (et->et_ops->eo_extent_map_insert)
518 et->et_ops->eo_extent_map_insert(et, rec);
519}
520
521static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
522 u32 clusters)
523{
524 if (et->et_ops->eo_extent_map_truncate)
525 et->et_ops->eo_extent_map_truncate(et, clusters);
415} 526}
416 527
417static inline int ocfs2_et_root_journal_access(handle_t *handle, 528static inline int ocfs2_et_root_journal_access(handle_t *handle,
418 struct inode *inode,
419 struct ocfs2_extent_tree *et, 529 struct ocfs2_extent_tree *et,
420 int type) 530 int type)
421{ 531{
422 return et->et_root_journal_access(handle, inode, et->et_root_bh, 532 return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
423 type); 533 type);
424} 534}
425 535
426static inline int ocfs2_et_insert_check(struct inode *inode, 536static inline enum ocfs2_contig_type
427 struct ocfs2_extent_tree *et, 537 ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
538 struct ocfs2_extent_rec *rec,
539 struct ocfs2_extent_rec *insert_rec)
540{
541 if (et->et_ops->eo_extent_contig)
542 return et->et_ops->eo_extent_contig(et, rec, insert_rec);
543
544 return ocfs2_extent_rec_contig(
545 ocfs2_metadata_cache_get_super(et->et_ci),
546 rec, insert_rec);
547}
548
549static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
428 struct ocfs2_extent_rec *rec) 550 struct ocfs2_extent_rec *rec)
429{ 551{
430 int ret = 0; 552 int ret = 0;
431 553
432 if (et->et_ops->eo_insert_check) 554 if (et->et_ops->eo_insert_check)
433 ret = et->et_ops->eo_insert_check(inode, et, rec); 555 ret = et->et_ops->eo_insert_check(et, rec);
434 return ret; 556 return ret;
435} 557}
436 558
437static inline int ocfs2_et_sanity_check(struct inode *inode, 559static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
438 struct ocfs2_extent_tree *et)
439{ 560{
440 int ret = 0; 561 int ret = 0;
441 562
442 if (et->et_ops->eo_sanity_check) 563 if (et->et_ops->eo_sanity_check)
443 ret = et->et_ops->eo_sanity_check(inode, et); 564 ret = et->et_ops->eo_sanity_check(et);
444 return ret; 565 return ret;
445} 566}
446 567
447static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
448static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
449 struct ocfs2_extent_block *eb); 570 struct ocfs2_extent_block *eb);
450 571static void ocfs2_adjust_rightmost_records(handle_t *handle,
451/* 572 struct ocfs2_extent_tree *et,
452 * Structures which describe a path through a btree, and functions to
453 * manipulate them.
454 *
455 * The idea here is to be as generic as possible with the tree
456 * manipulation code.
457 */
458struct ocfs2_path_item {
459 struct buffer_head *bh;
460 struct ocfs2_extent_list *el;
461};
462
463#define OCFS2_MAX_PATH_DEPTH 5
464
465struct ocfs2_path {
466 int p_tree_depth;
467 ocfs2_journal_access_func p_root_access;
468 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
469};
470
471#define path_root_bh(_path) ((_path)->p_node[0].bh)
472#define path_root_el(_path) ((_path)->p_node[0].el)
473#define path_root_access(_path)((_path)->p_root_access)
474#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
475#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
476#define path_num_items(_path) ((_path)->p_tree_depth + 1)
477
478static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
479 u32 cpos);
480static void ocfs2_adjust_rightmost_records(struct inode *inode,
481 handle_t *handle,
482 struct ocfs2_path *path, 573 struct ocfs2_path *path,
483 struct ocfs2_extent_rec *insert_rec); 574 struct ocfs2_extent_rec *insert_rec);
484/* 575/*
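
This hunk carries the core of the refactor: the dispatchers now take only the extent tree, and anything previously derived from the inode — the super block, the owner's block number — is recovered through et->et_ci. A small standalone sketch of threading shared context through a back-pointer (the struct names are stand-ins, not ocfs2 types):

#include <stdio.h>
#include <stdint.h>

struct super { const char *name; };

struct cache_info {
	uint64_t owner_blkno;
	struct super *sb;
};

struct extent_tree {
	struct cache_info *ci;   /* like et->et_ci after the refactor */
};

/* Analogues of ocfs2_metadata_cache_owner()/_get_super(). */
static uint64_t cache_owner(struct cache_info *ci)    { return ci->owner_blkno; }
static struct super *cache_super(struct cache_info *ci) { return ci->sb; }

/* Callee no longer needs the inode: everything flows from the tree. */
static void report(struct extent_tree *et)
{
	printf("owner %llu on %s\n",
	       (unsigned long long)cache_owner(et->ci),
	       cache_super(et->ci)->name);
}

int main(void)
{
	struct super sb = { "ocfs2-demo" };
	struct cache_info ci = { 8193, &sb };
	struct extent_tree et = { &ci };

	report(&et);
	return 0;
}
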
@@ -486,7 +577,7 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
486 * to build another path. Generally, this involves freeing the buffer 577 * to build another path. Generally, this involves freeing the buffer
487 * heads. 578 * heads.
488 */ 579 */
489static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 580void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
490{ 581{
491 int i, start = 0, depth = 0; 582 int i, start = 0, depth = 0;
492 struct ocfs2_path_item *node; 583 struct ocfs2_path_item *node;
@@ -515,7 +606,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
515 path->p_tree_depth = depth; 606 path->p_tree_depth = depth;
516} 607}
517 608
518static void ocfs2_free_path(struct ocfs2_path *path) 609void ocfs2_free_path(struct ocfs2_path *path)
519{ 610{
520 if (path) { 611 if (path) {
521 ocfs2_reinit_path(path, 0); 612 ocfs2_reinit_path(path, 0);
@@ -613,13 +704,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
613 return path; 704 return path;
614} 705}
615 706
616static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) 707struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
617{ 708{
618 return ocfs2_new_path(path_root_bh(path), path_root_el(path), 709 return ocfs2_new_path(path_root_bh(path), path_root_el(path),
619 path_root_access(path)); 710 path_root_access(path));
620} 711}
621 712
622static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) 713struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
623{ 714{
624 return ocfs2_new_path(et->et_root_bh, et->et_root_el, 715 return ocfs2_new_path(et->et_root_bh, et->et_root_el,
625 et->et_root_journal_access); 716 et->et_root_journal_access);
@@ -632,10 +723,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
632 * I don't like the way this function's name looks next to 723 * I don't like the way this function's name looks next to
633 * ocfs2_journal_access_path(), but I don't have a better one. 724 * ocfs2_journal_access_path(), but I don't have a better one.
634 */ 725 */
635static int ocfs2_path_bh_journal_access(handle_t *handle, 726int ocfs2_path_bh_journal_access(handle_t *handle,
636 struct inode *inode, 727 struct ocfs2_caching_info *ci,
637 struct ocfs2_path *path, 728 struct ocfs2_path *path,
638 int idx) 729 int idx)
639{ 730{
640 ocfs2_journal_access_func access = path_root_access(path); 731 ocfs2_journal_access_func access = path_root_access(path);
641 732
@@ -645,15 +736,16 @@ static int ocfs2_path_bh_journal_access(handle_t *handle,
645 if (idx) 736 if (idx)
646 access = ocfs2_journal_access_eb; 737 access = ocfs2_journal_access_eb;
647 738
648 return access(handle, inode, path->p_node[idx].bh, 739 return access(handle, ci, path->p_node[idx].bh,
649 OCFS2_JOURNAL_ACCESS_WRITE); 740 OCFS2_JOURNAL_ACCESS_WRITE);
650} 741}
651 742
652/* 743/*
653 * Convenience function to journal all components in a path. 744 * Convenience function to journal all components in a path.
654 */ 745 */
655static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 746int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
656 struct ocfs2_path *path) 747 handle_t *handle,
748 struct ocfs2_path *path)
657{ 749{
658 int i, ret = 0; 750 int i, ret = 0;
659 751
@@ -661,7 +753,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
661 goto out; 753 goto out;
662 754
663 for(i = 0; i < path_num_items(path); i++) { 755 for(i = 0; i < path_num_items(path); i++) {
664 ret = ocfs2_path_bh_journal_access(handle, inode, path, i); 756 ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
665 if (ret < 0) { 757 if (ret < 0) {
666 mlog_errno(ret); 758 mlog_errno(ret);
667 goto out; 759 goto out;
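
ocfs2_journal_access_path() shows the path convention: a fixed array of (buffer, list) pairs indexed from root (0) to leaf (p_tree_depth), so journaling a whole path is a bounded loop that stops at the first failure. A compilable sketch under those assumptions:

#include <stdio.h>

#define MAX_PATH_DEPTH 5

struct path_item { int blkno; };             /* stand-in for a bh/el pair */

struct path {
	int tree_depth;
	struct path_item node[MAX_PATH_DEPTH];
};

#define path_num_items(p) ((p)->tree_depth + 1)  /* root..leaf inclusive */

static int journal_access(int blkno)          /* illustrative stub */
{
	printf("journal block %d\n", blkno);
	return 0;
}

/* Mirrors ocfs2_journal_access_path(): stop on the first failure. */
static int journal_access_path(struct path *p)
{
	int i, ret;

	for (i = 0; i < path_num_items(p); i++) {
		ret = journal_access(p->node[i].blkno);
		if (ret < 0)
			return ret;
	}
	return 0;
}

int main(void)
{
	struct path p = { 2, { {100}, {200}, {300} } };
	return journal_access_path(&p);
}
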
@@ -702,17 +794,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
702 return ret; 794 return ret;
703} 795}
704 796
705enum ocfs2_contig_type {
706 CONTIG_NONE = 0,
707 CONTIG_LEFT,
708 CONTIG_RIGHT,
709 CONTIG_LEFTRIGHT,
710};
711
712
713/* 797/*
714 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 798 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
715 * ocfs2_extent_contig only work properly against leaf nodes! 799 * ocfs2_extent_rec_contig only work properly against leaf nodes!
716 */ 800 */
717static int ocfs2_block_extent_contig(struct super_block *sb, 801static int ocfs2_block_extent_contig(struct super_block *sb,
718 struct ocfs2_extent_rec *ext, 802 struct ocfs2_extent_rec *ext,
@@ -738,9 +822,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
738} 822}
739 823
740static enum ocfs2_contig_type 824static enum ocfs2_contig_type
741 ocfs2_extent_contig(struct inode *inode, 825 ocfs2_extent_rec_contig(struct super_block *sb,
742 struct ocfs2_extent_rec *ext, 826 struct ocfs2_extent_rec *ext,
743 struct ocfs2_extent_rec *insert_rec) 827 struct ocfs2_extent_rec *insert_rec)
744{ 828{
745 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 829 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
746 830
@@ -753,12 +837,12 @@ static enum ocfs2_contig_type
753 return CONTIG_NONE; 837 return CONTIG_NONE;
754 838
755 if (ocfs2_extents_adjacent(ext, insert_rec) && 839 if (ocfs2_extents_adjacent(ext, insert_rec) &&
756 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 840 ocfs2_block_extent_contig(sb, ext, blkno))
757 return CONTIG_RIGHT; 841 return CONTIG_RIGHT;
758 842
759 blkno = le64_to_cpu(ext->e_blkno); 843 blkno = le64_to_cpu(ext->e_blkno);
760 if (ocfs2_extents_adjacent(insert_rec, ext) && 844 if (ocfs2_extents_adjacent(insert_rec, ext) &&
761 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 845 ocfs2_block_extent_contig(sb, insert_rec, blkno))
762 return CONTIG_LEFT; 846 return CONTIG_LEFT;
763 847
764 return CONTIG_NONE; 848 return CONTIG_NONE;
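
The contiguity test combines two conditions: the records must be logically adjacent in cluster space (ocfs2_extents_adjacent) and the physical block run must continue (ocfs2_block_extent_contig). A simplified userspace model — it omits the empty-extent early-outs and hard-codes the clusters-to-blocks factor, which the kernel takes from the super block:

#include <stdio.h>
#include <stdint.h>

struct rec { uint32_t cpos, clusters; uint64_t blkno; };

/* Logically adjacent: one record ends where the other begins. */
static int adjacent(const struct rec *left, const struct rec *right)
{
	return left->cpos + left->clusters == right->cpos;
}

/* Physically contiguous: the block run continues. One cluster is
 * CLUSTER_BLOCKS blocks here for simplicity. */
#define CLUSTER_BLOCKS 8

static int block_contig(const struct rec *left, uint64_t right_blkno)
{
	return left->blkno + (uint64_t)left->clusters * CLUSTER_BLOCKS
		== right_blkno;
}

enum contig { CONTIG_NONE, CONTIG_LEFT, CONTIG_RIGHT };

static enum contig rec_contig(const struct rec *ext, const struct rec *ins)
{
	if (adjacent(ext, ins) && block_contig(ext, ins->blkno))
		return CONTIG_RIGHT;   /* ins extends ext on the right */
	if (adjacent(ins, ext) && block_contig(ins, ext->blkno))
		return CONTIG_LEFT;    /* ins extends ext on the left */
	return CONTIG_NONE;
}

int main(void)
{
	struct rec a = { 0, 4, 1000 }, b = { 4, 4, 1032 };

	printf("%d\n", rec_contig(&a, &b)); /* CONTIG_RIGHT: 1000+4*8 == 1032 */
	return 0;
}
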
@@ -853,13 +937,13 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
853 return 0; 937 return 0;
854} 938}
855 939
856int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, 940int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
857 struct buffer_head **bh) 941 struct buffer_head **bh)
858{ 942{
859 int rc; 943 int rc;
860 struct buffer_head *tmp = *bh; 944 struct buffer_head *tmp = *bh;
861 945
862 rc = ocfs2_read_block(inode, eb_blkno, &tmp, 946 rc = ocfs2_read_block(ci, eb_blkno, &tmp,
863 ocfs2_validate_extent_block); 947 ocfs2_validate_extent_block);
864 948
865 /* If ocfs2_read_block() got us a new bh, pass it up. */ 949 /* If ocfs2_read_block() got us a new bh, pass it up. */
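
ocfs2_read_extent_block() follows a recurring ocfs2 calling convention: *bh may arrive NULL, in which case the reader allocates a buffer, runs the validator, and passes the new buffer back up through the double pointer. A rough userspace analogue (malloc standing in for the buffer-cache lookup):

#include <stdio.h>
#include <stdlib.h>

struct buf { unsigned long long blkno; };

/* Validator runs once per read, before the buffer is handed back. */
static int validate(struct buf *b)
{
	return b->blkno ? 0 : -1;      /* stand-in for signature checks */
}

static int read_block(unsigned long long blkno, struct buf **bh,
		      int (*check)(struct buf *))
{
	struct buf *tmp = *bh;         /* may be NULL: allocate below */
	int rc;

	if (!tmp) {
		tmp = malloc(sizeof(*tmp));
		if (!tmp)
			return -1;
	}
	tmp->blkno = blkno;
	rc = check(tmp);
	if (rc) {
		if (tmp != *bh)
			free(tmp);     /* don't leak a buffer we created */
		return rc;
	}
	if (!*bh)
		*bh = tmp;             /* pass the new buffer up */
	return 0;
}

int main(void)
{
	struct buf *bh = NULL;

	if (!read_block(42, &bh, validate))
		printf("read block %llu\n", bh->blkno);
	free(bh);
	return 0;
}
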
@@ -874,7 +958,6 @@ int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
874 * How many free extents have we got before we need more meta data? 958 * How many free extents have we got before we need more meta data?
875 */ 959 */
876int ocfs2_num_free_extents(struct ocfs2_super *osb, 960int ocfs2_num_free_extents(struct ocfs2_super *osb,
877 struct inode *inode,
878 struct ocfs2_extent_tree *et) 961 struct ocfs2_extent_tree *et)
879{ 962{
880 int retval; 963 int retval;
@@ -889,7 +972,8 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
889 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 972 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
890 973
891 if (last_eb_blk) { 974 if (last_eb_blk) {
892 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); 975 retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
976 &eb_bh);
893 if (retval < 0) { 977 if (retval < 0) {
894 mlog_errno(retval); 978 mlog_errno(retval);
895 goto bail; 979 goto bail;
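
The body elided here ultimately reports how many record slots remain in the rightmost leaf before new metadata must be allocated — conceptually l_count minus l_next_free_rec. A tiny sketch of that arithmetic, assuming just those two fields:

#include <stdio.h>
#include <stdint.h>

struct elist { uint16_t count, next_free_rec; };

/* Free slots = capacity minus used, taken from the rightmost leaf. */
static int num_free_extents(const struct elist *rightmost)
{
	return rightmost->count - rightmost->next_free_rec;
}

int main(void)
{
	struct elist leaf = { .count = 252, .next_free_rec = 250 };

	printf("%d free extents\n", num_free_extents(&leaf));
	return 0;
}
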
@@ -913,9 +997,8 @@ bail:
913 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 997 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
914 * l_count for you 998 * l_count for you
915 */ 999 */
916static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 1000static int ocfs2_create_new_meta_bhs(handle_t *handle,
917 handle_t *handle, 1001 struct ocfs2_extent_tree *et,
918 struct inode *inode,
919 int wanted, 1002 int wanted,
920 struct ocfs2_alloc_context *meta_ac, 1003 struct ocfs2_alloc_context *meta_ac,
921 struct buffer_head *bhs[]) 1004 struct buffer_head *bhs[])
@@ -924,6 +1007,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
924 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
925 u32 num_got; 1008 u32 num_got;
926 u64 first_blkno; 1009 u64 first_blkno;
1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
927 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
928 1013
929 mlog_entry_void(); 1014 mlog_entry_void();
@@ -949,9 +1034,10 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
949 mlog_errno(status); 1034 mlog_errno(status);
950 goto bail; 1035 goto bail;
951 } 1036 }
952 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 1037 ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
953 1038
954 status = ocfs2_journal_access_eb(handle, inode, bhs[i], 1039 status = ocfs2_journal_access_eb(handle, et->et_ci,
1040 bhs[i],
955 OCFS2_JOURNAL_ACCESS_CREATE); 1041 OCFS2_JOURNAL_ACCESS_CREATE);
956 if (status < 0) { 1042 if (status < 0) {
957 mlog_errno(status); 1043 mlog_errno(status);
@@ -1023,7 +1109,6 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
1023 * extent block's rightmost record. 1109 * extent block's rightmost record.
1024 */ 1110 */
1025static int ocfs2_adjust_rightmost_branch(handle_t *handle, 1111static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1026 struct inode *inode,
1027 struct ocfs2_extent_tree *et) 1112 struct ocfs2_extent_tree *et)
1028{ 1113{
1029 int status; 1114 int status;
@@ -1037,7 +1122,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1037 return status; 1122 return status;
1038 } 1123 }
1039 1124
1040 status = ocfs2_find_path(inode, path, UINT_MAX); 1125 status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1041 if (status < 0) { 1126 if (status < 0) {
1042 mlog_errno(status); 1127 mlog_errno(status);
1043 goto out; 1128 goto out;
@@ -1050,7 +1135,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1050 goto out; 1135 goto out;
1051 } 1136 }
1052 1137
1053 status = ocfs2_journal_access_path(inode, handle, path); 1138 status = ocfs2_journal_access_path(et->et_ci, handle, path);
1054 if (status < 0) { 1139 if (status < 0) {
1055 mlog_errno(status); 1140 mlog_errno(status);
1056 goto out; 1141 goto out;
@@ -1059,7 +1144,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1059 el = path_leaf_el(path); 1144 el = path_leaf_el(path);
1060 rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; 1145 rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1061 1146
1062 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 1147 ocfs2_adjust_rightmost_records(handle, et, path, rec);
1063 1148
1064out: 1149out:
1065 ocfs2_free_path(path); 1150 ocfs2_free_path(path);
@@ -1068,7 +1153,7 @@ out:
1068 1153
1069/* 1154/*
1070 * Add an entire tree branch to our inode. eb_bh is the extent block 1155 * Add an entire tree branch to our inode. eb_bh is the extent block
1071 * to start at, if we don't want to start the branch at the dinode 1156 * to start at, if we don't want to start the branch at the root
1072 * structure. 1157 * structure.
1073 * 1158 *
1074 * last_eb_bh is required as we have to update its next_leaf pointer 1159
@@ -1077,9 +1162,7 @@ out:
1077 * the new branch will be 'empty' in the sense that every block will 1162 * the new branch will be 'empty' in the sense that every block will
1078 * contain a single record with cluster count == 0. 1163 * contain a single record with cluster count == 0.
1079 */ 1164 */
1080static int ocfs2_add_branch(struct ocfs2_super *osb, 1165static int ocfs2_add_branch(handle_t *handle,
1081 handle_t *handle,
1082 struct inode *inode,
1083 struct ocfs2_extent_tree *et, 1166 struct ocfs2_extent_tree *et,
1084 struct buffer_head *eb_bh, 1167 struct buffer_head *eb_bh,
1085 struct buffer_head **last_eb_bh, 1168 struct buffer_head **last_eb_bh,
@@ -1123,7 +1206,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1123 if (root_end > new_cpos) { 1206 if (root_end > new_cpos) {
1124 mlog(0, "adjust the cluster end from %u to %u\n", 1207 mlog(0, "adjust the cluster end from %u to %u\n",
1125 root_end, new_cpos); 1208 root_end, new_cpos);
1126 status = ocfs2_adjust_rightmost_branch(handle, inode, et); 1209 status = ocfs2_adjust_rightmost_branch(handle, et);
1127 if (status) { 1210 if (status) {
1128 mlog_errno(status); 1211 mlog_errno(status);
1129 goto bail; 1212 goto bail;
@@ -1139,7 +1222,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1139 goto bail; 1222 goto bail;
1140 } 1223 }
1141 1224
1142 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 1225 status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
1143 meta_ac, new_eb_bhs); 1226 meta_ac, new_eb_bhs);
1144 if (status < 0) { 1227 if (status < 0) {
1145 mlog_errno(status); 1228 mlog_errno(status);
@@ -1161,7 +1244,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1161 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); 1244 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1162 eb_el = &eb->h_list; 1245 eb_el = &eb->h_list;
1163 1246
1164 status = ocfs2_journal_access_eb(handle, inode, bh, 1247 status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
1165 OCFS2_JOURNAL_ACCESS_CREATE); 1248 OCFS2_JOURNAL_ACCESS_CREATE);
1166 if (status < 0) { 1249 if (status < 0) {
1167 mlog_errno(status); 1250 mlog_errno(status);
@@ -1201,20 +1284,20 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1201 * journal_dirty erroring as it won't unless we've aborted the 1284 * journal_dirty erroring as it won't unless we've aborted the
1202 * handle (in which case we would never be here) so reserving 1285 * handle (in which case we would never be here) so reserving
1203 * the write with journal_access is all we need to do. */ 1286 * the write with journal_access is all we need to do. */
1204 status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh, 1287 status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
1205 OCFS2_JOURNAL_ACCESS_WRITE); 1288 OCFS2_JOURNAL_ACCESS_WRITE);
1206 if (status < 0) { 1289 if (status < 0) {
1207 mlog_errno(status); 1290 mlog_errno(status);
1208 goto bail; 1291 goto bail;
1209 } 1292 }
1210 status = ocfs2_et_root_journal_access(handle, inode, et, 1293 status = ocfs2_et_root_journal_access(handle, et,
1211 OCFS2_JOURNAL_ACCESS_WRITE); 1294 OCFS2_JOURNAL_ACCESS_WRITE);
1212 if (status < 0) { 1295 if (status < 0) {
1213 mlog_errno(status); 1296 mlog_errno(status);
1214 goto bail; 1297 goto bail;
1215 } 1298 }
1216 if (eb_bh) { 1299 if (eb_bh) {
1217 status = ocfs2_journal_access_eb(handle, inode, eb_bh, 1300 status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
1218 OCFS2_JOURNAL_ACCESS_WRITE); 1301 OCFS2_JOURNAL_ACCESS_WRITE);
1219 if (status < 0) { 1302 if (status < 0) {
1220 mlog_errno(status); 1303 mlog_errno(status);
@@ -1274,9 +1357,7 @@ bail:
1274 * returns back the new extent block so you can add a branch to it 1357 * returns back the new extent block so you can add a branch to it
1275 * after this call. 1358 * after this call.
1276 */ 1359 */
1277static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 1360static int ocfs2_shift_tree_depth(handle_t *handle,
1278 handle_t *handle,
1279 struct inode *inode,
1280 struct ocfs2_extent_tree *et, 1361 struct ocfs2_extent_tree *et,
1281 struct ocfs2_alloc_context *meta_ac, 1362 struct ocfs2_alloc_context *meta_ac,
1282 struct buffer_head **ret_new_eb_bh) 1363 struct buffer_head **ret_new_eb_bh)
@@ -1290,7 +1371,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1290 1371
1291 mlog_entry_void(); 1372 mlog_entry_void();
1292 1373
1293 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 1374 status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1294 &new_eb_bh); 1375 &new_eb_bh);
1295 if (status < 0) { 1376 if (status < 0) {
1296 mlog_errno(status); 1377 mlog_errno(status);
@@ -1304,7 +1385,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1304 eb_el = &eb->h_list; 1385 eb_el = &eb->h_list;
1305 root_el = et->et_root_el; 1386 root_el = et->et_root_el;
1306 1387
1307 status = ocfs2_journal_access_eb(handle, inode, new_eb_bh, 1388 status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1308 OCFS2_JOURNAL_ACCESS_CREATE); 1389 OCFS2_JOURNAL_ACCESS_CREATE);
1309 if (status < 0) { 1390 if (status < 0) {
1310 mlog_errno(status); 1391 mlog_errno(status);
@@ -1323,7 +1404,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1323 goto bail; 1404 goto bail;
1324 } 1405 }
1325 1406
1326 status = ocfs2_et_root_journal_access(handle, inode, et, 1407 status = ocfs2_et_root_journal_access(handle, et,
1327 OCFS2_JOURNAL_ACCESS_WRITE); 1408 OCFS2_JOURNAL_ACCESS_WRITE);
1328 if (status < 0) { 1409 if (status < 0) {
1329 mlog_errno(status); 1410 mlog_errno(status);
@@ -1379,9 +1460,7 @@ bail:
1379 * 1460 *
1380 * return status < 0 indicates an error. 1461 * return status < 0 indicates an error.
1381 */ 1462 */
1382static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1463static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1383 struct inode *inode,
1384 struct ocfs2_extent_tree *et,
1385 struct buffer_head **target_bh) 1464 struct buffer_head **target_bh)
1386{ 1465{
1387 int status = 0, i; 1466 int status = 0, i;
@@ -1399,19 +1478,21 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1399 1478
1400 while(le16_to_cpu(el->l_tree_depth) > 1) { 1479 while(le16_to_cpu(el->l_tree_depth) > 1) {
1401 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1480 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1402 ocfs2_error(inode->i_sb, "Dinode %llu has empty " 1481 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1482 "Owner %llu has empty "
1403 "extent list (next_free_rec == 0)", 1483 "extent list (next_free_rec == 0)",
1404 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1484 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1405 status = -EIO; 1485 status = -EIO;
1406 goto bail; 1486 goto bail;
1407 } 1487 }
1408 i = le16_to_cpu(el->l_next_free_rec) - 1; 1488 i = le16_to_cpu(el->l_next_free_rec) - 1;
1409 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1489 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1410 if (!blkno) { 1490 if (!blkno) {
1411 ocfs2_error(inode->i_sb, "Dinode %llu has extent " 1491 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1492 "Owner %llu has extent "
1412 "list where extent # %d has no physical " 1493 "list where extent # %d has no physical "
1413 "block start", 1494 "block start",
1414 (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 1495 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1415 status = -EIO; 1496 status = -EIO;
1416 goto bail; 1497 goto bail;
1417 } 1498 }
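
The loop above descends the rightmost edge of the tree while depth > 1, treating an empty extent list or a zero block pointer as corruption. A self-contained model of that walk, with an array standing in for disk blocks:

#include <stdio.h>
#include <stdint.h>

#define MAX_RECS 4

struct elist {
	uint16_t tree_depth;
	uint16_t next_free_rec;
	uint64_t child_blkno[MAX_RECS];  /* e_blkno of each record */
};

/* Toy "disk": a block number indexes straight into this array. */
static struct elist blocks[8];

/* Descend the rightmost edge until depth 1; the kernel returns -EIO
 * on the corruption cases flagged below. */
static int find_rightmost_low_branch(struct elist *el, uint64_t *blkno_out)
{
	while (el->tree_depth > 1) {
		uint16_t i;

		if (el->next_free_rec == 0)
			return -1;               /* empty extent list */
		i = el->next_free_rec - 1;       /* rightmost record */
		if (!el->child_blkno[i])
			return -1;               /* no physical block */
		*blkno_out = el->child_blkno[i];
		el = &blocks[*blkno_out];
	}
	return 0;
}

int main(void)
{
	uint64_t blkno = 0;

	blocks[0] = (struct elist){ 2, 1, { 3 } };
	blocks[3] = (struct elist){ 1, 1, { 5 } };
	if (!find_rightmost_low_branch(&blocks[0], &blkno))
		printf("lowest non-leaf branch at block %llu\n",
		       (unsigned long long)blkno);
	return 0;
}
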
@@ -1419,7 +1500,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1419 brelse(bh); 1500 brelse(bh);
1420 bh = NULL; 1501 bh = NULL;
1421 1502
1422 status = ocfs2_read_extent_block(inode, blkno, &bh); 1503 status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1423 if (status < 0) { 1504 if (status < 0) {
1424 mlog_errno(status); 1505 mlog_errno(status);
1425 goto bail; 1506 goto bail;
@@ -1460,20 +1541,18 @@ bail:
1460 * 1541 *
1461 * *last_eb_bh will be updated by ocfs2_add_branch(). 1542 * *last_eb_bh will be updated by ocfs2_add_branch().
1462 */ 1543 */
1463static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1544static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1464 struct ocfs2_extent_tree *et, int *final_depth, 1545 int *final_depth, struct buffer_head **last_eb_bh,
1465 struct buffer_head **last_eb_bh,
1466 struct ocfs2_alloc_context *meta_ac) 1546 struct ocfs2_alloc_context *meta_ac)
1467{ 1547{
1468 int ret, shift; 1548 int ret, shift;
1469 struct ocfs2_extent_list *el = et->et_root_el; 1549 struct ocfs2_extent_list *el = et->et_root_el;
1470 int depth = le16_to_cpu(el->l_tree_depth); 1550 int depth = le16_to_cpu(el->l_tree_depth);
1471 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1472 struct buffer_head *bh = NULL; 1551 struct buffer_head *bh = NULL;
1473 1552
1474 BUG_ON(meta_ac == NULL); 1553 BUG_ON(meta_ac == NULL);
1475 1554
1476 shift = ocfs2_find_branch_target(osb, inode, et, &bh); 1555 shift = ocfs2_find_branch_target(et, &bh);
1477 if (shift < 0) { 1556 if (shift < 0) {
1478 ret = shift; 1557 ret = shift;
1479 mlog_errno(ret); 1558 mlog_errno(ret);
@@ -1490,8 +1569,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1490 /* ocfs2_shift_tree_depth will return us a buffer with 1569 /* ocfs2_shift_tree_depth will return us a buffer with
1491 * the new extent block (so we can pass that to 1570 * the new extent block (so we can pass that to
1492 * ocfs2_add_branch). */ 1571 * ocfs2_add_branch). */
1493 ret = ocfs2_shift_tree_depth(osb, handle, inode, et, 1572 ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1494 meta_ac, &bh);
1495 if (ret < 0) { 1573 if (ret < 0) {
1496 mlog_errno(ret); 1574 mlog_errno(ret);
1497 goto out; 1575 goto out;
@@ -1517,7 +1595,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1517 /* call ocfs2_add_branch to add the final part of the tree with 1595 /* call ocfs2_add_branch to add the final part of the tree with
1518 * the new data. */ 1596 * the new data. */
1519 mlog(0, "add branch. bh = %p\n", bh); 1597 mlog(0, "add branch. bh = %p\n", bh);
1520 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, 1598 ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1521 meta_ac); 1599 meta_ac);
1522 if (ret < 0) { 1600 if (ret < 0) {
1523 mlog_errno(ret); 1601 mlog_errno(ret);
@@ -1687,7 +1765,7 @@ set_and_inc:
1687 * 1765 *
1688 * The array index of the subtree root is passed back. 1766 * The array index of the subtree root is passed back.
1689 */ 1767 */
1690static int ocfs2_find_subtree_root(struct inode *inode, 1768static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1691 struct ocfs2_path *left, 1769 struct ocfs2_path *left,
1692 struct ocfs2_path *right) 1770 struct ocfs2_path *right)
1693{ 1771{
@@ -1705,10 +1783,10 @@ static int ocfs2_find_subtree_root(struct inode *inode,
1705 * The caller didn't pass two adjacent paths. 1783 * The caller didn't pass two adjacent paths.
1706 */ 1784 */
1707 mlog_bug_on_msg(i > left->p_tree_depth, 1785 mlog_bug_on_msg(i > left->p_tree_depth,
1708 "Inode %lu, left depth %u, right depth %u\n" 1786 "Owner %llu, left depth %u, right depth %u\n"
1709 "left leaf blk %llu, right leaf blk %llu\n", 1787 "left leaf blk %llu, right leaf blk %llu\n",
1710 inode->i_ino, left->p_tree_depth, 1788 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1711 right->p_tree_depth, 1789 left->p_tree_depth, right->p_tree_depth,
1712 (unsigned long long)path_leaf_bh(left)->b_blocknr, 1790 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1713 (unsigned long long)path_leaf_bh(right)->b_blocknr); 1791 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1714 } while (left->p_node[i].bh->b_blocknr == 1792 } while (left->p_node[i].bh->b_blocknr ==
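
The do/while above advances down two adjacent paths until their block numbers diverge; the subtree root is the last index at which they still agree. An equivalent sketch (the kernel version additionally BUGs if the paths never diverge, i.e. were not adjacent):

#include <stdio.h>

#define MAX_DEPTH 5

struct path {
	int depth;
	unsigned long long blknos[MAX_DEPTH]; /* block at each level, 0 = root */
};

/* Index of the lowest node the two paths share; both start at the
 * same root, so index 0 always matches. */
static int find_subtree_root(const struct path *l, const struct path *r)
{
	int i = 0;

	while (i < l->depth && l->blknos[i + 1] == r->blknos[i + 1])
		i++;
	return i;
}

int main(void)
{
	struct path left  = { 2, { 7, 10, 21 } };
	struct path right = { 2, { 7, 10, 22 } };

	printf("subtree root index %d\n", find_subtree_root(&left, &right));
	return 0;
}
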
@@ -1725,7 +1803,7 @@ typedef void (path_insert_t)(void *, struct buffer_head *);
1725 * This code can be called with a cpos larger than the tree, in which 1803 * This code can be called with a cpos larger than the tree, in which
1726 * case it will return the rightmost path. 1804 * case it will return the rightmost path.
1727 */ 1805 */
1728static int __ocfs2_find_path(struct inode *inode, 1806static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1729 struct ocfs2_extent_list *root_el, u32 cpos, 1807 struct ocfs2_extent_list *root_el, u32 cpos,
1730 path_insert_t *func, void *data) 1808 path_insert_t *func, void *data)
1731{ 1809{
@@ -1736,15 +1814,14 @@ static int __ocfs2_find_path(struct inode *inode,
1736 struct ocfs2_extent_block *eb; 1814 struct ocfs2_extent_block *eb;
1737 struct ocfs2_extent_list *el; 1815 struct ocfs2_extent_list *el;
1738 struct ocfs2_extent_rec *rec; 1816 struct ocfs2_extent_rec *rec;
1739 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1740 1817
1741 el = root_el; 1818 el = root_el;
1742 while (el->l_tree_depth) { 1819 while (el->l_tree_depth) {
1743 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1820 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1744 ocfs2_error(inode->i_sb, 1821 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1745 "Inode %llu has empty extent list at " 1822 "Owner %llu has empty extent list at "
1746 "depth %u\n", 1823 "depth %u\n",
1747 (unsigned long long)oi->ip_blkno, 1824 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1748 le16_to_cpu(el->l_tree_depth)); 1825 le16_to_cpu(el->l_tree_depth));
1749 ret = -EROFS; 1826 ret = -EROFS;
1750 goto out; 1827 goto out;
@@ -1767,10 +1844,10 @@ static int __ocfs2_find_path(struct inode *inode,
1767 1844
1768 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1845 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1769 if (blkno == 0) { 1846 if (blkno == 0) {
1770 ocfs2_error(inode->i_sb, 1847 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1771 "Inode %llu has bad blkno in extent list " 1848 "Owner %llu has bad blkno in extent list "
1772 "at depth %u (index %d)\n", 1849 "at depth %u (index %d)\n",
1773 (unsigned long long)oi->ip_blkno, 1850 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1774 le16_to_cpu(el->l_tree_depth), i); 1851 le16_to_cpu(el->l_tree_depth), i);
1775 ret = -EROFS; 1852 ret = -EROFS;
1776 goto out; 1853 goto out;
@@ -1778,7 +1855,7 @@ static int __ocfs2_find_path(struct inode *inode,
1778 1855
1779 brelse(bh); 1856 brelse(bh);
1780 bh = NULL; 1857 bh = NULL;
1781 ret = ocfs2_read_extent_block(inode, blkno, &bh); 1858 ret = ocfs2_read_extent_block(ci, blkno, &bh);
1782 if (ret) { 1859 if (ret) {
1783 mlog_errno(ret); 1860 mlog_errno(ret);
1784 goto out; 1861 goto out;
@@ -1789,10 +1866,10 @@ static int __ocfs2_find_path(struct inode *inode,
1789 1866
1790 if (le16_to_cpu(el->l_next_free_rec) > 1867 if (le16_to_cpu(el->l_next_free_rec) >
1791 le16_to_cpu(el->l_count)) { 1868 le16_to_cpu(el->l_count)) {
1792 ocfs2_error(inode->i_sb, 1869 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1793 "Inode %llu has bad count in extent list " 1870 "Owner %llu has bad count in extent list "
1794 "at block %llu (next free=%u, count=%u)\n", 1871 "at block %llu (next free=%u, count=%u)\n",
1795 (unsigned long long)oi->ip_blkno, 1872 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1796 (unsigned long long)bh->b_blocknr, 1873 (unsigned long long)bh->b_blocknr,
1797 le16_to_cpu(el->l_next_free_rec), 1874 le16_to_cpu(el->l_next_free_rec),
1798 le16_to_cpu(el->l_count)); 1875 le16_to_cpu(el->l_count));
@@ -1836,14 +1913,14 @@ static void find_path_ins(void *data, struct buffer_head *bh)
1836 ocfs2_path_insert_eb(fp->path, fp->index, bh); 1913 ocfs2_path_insert_eb(fp->path, fp->index, bh);
1837 fp->index++; 1914 fp->index++;
1838} 1915}
1839static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1916int ocfs2_find_path(struct ocfs2_caching_info *ci,
1840 u32 cpos) 1917 struct ocfs2_path *path, u32 cpos)
1841{ 1918{
1842 struct find_path_data data; 1919 struct find_path_data data;
1843 1920
1844 data.index = 1; 1921 data.index = 1;
1845 data.path = path; 1922 data.path = path;
1846 return __ocfs2_find_path(inode, path_root_el(path), cpos, 1923 return __ocfs2_find_path(ci, path_root_el(path), cpos,
1847 find_path_ins, &data); 1924 find_path_ins, &data);
1848} 1925}
1849 1926
@@ -1868,13 +1945,14 @@ static void find_leaf_ins(void *data, struct buffer_head *bh)
1868 * 1945 *
1869 * This function doesn't handle non btree extent lists. 1946 * This function doesn't handle non btree extent lists.
1870 */ 1947 */
1871int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1948int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1872 u32 cpos, struct buffer_head **leaf_bh) 1949 struct ocfs2_extent_list *root_el, u32 cpos,
1950 struct buffer_head **leaf_bh)
1873{ 1951{
1874 int ret; 1952 int ret;
1875 struct buffer_head *bh = NULL; 1953 struct buffer_head *bh = NULL;
1876 1954
1877 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1955 ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1878 if (ret) { 1956 if (ret) {
1879 mlog_errno(ret); 1957 mlog_errno(ret);
1880 goto out; 1958 goto out;
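
ocfs2_find_path() and ocfs2_find_leaf() reuse one walker, __ocfs2_find_path(), differing only in the visitor callback and its data pointer — find_path_ins records every level, find_leaf_ins keeps just the final buffer. A minimal sketch of that callback-plus-closure-struct idiom (the walker and block numbers are fake):

#include <stdio.h>

typedef void (*visit_t)(void *data, int blkno);

/* Generic walker: visits one block per level below the root. */
static void walk(int depth, visit_t func, void *data)
{
	int level;

	for (level = 1; level <= depth; level++)
		if (func)
			func(data, 100 * level);   /* fake block numbers */
}

struct find_path_data {
	int index;
	int path[8];
};

static void find_path_ins(void *data, int blkno)
{
	struct find_path_data *fp = data;

	fp->path[fp->index++] = blkno;         /* record the whole path */
}

static void find_leaf_ins(void *data, int blkno)
{
	*(int *)data = blkno;                  /* keep only the last block */
}

int main(void)
{
	struct find_path_data fp = { 1, { 0 } };
	int leaf = 0;

	walk(3, find_path_ins, &fp);
	walk(3, find_leaf_ins, &leaf);
	printf("path depth %d, leaf %d\n", fp.index - 1, leaf);
	return 0;
}
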
@@ -1980,7 +2058,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1980 * - When we've adjusted the last extent record in the left path leaf and the 2058 * - When we've adjusted the last extent record in the left path leaf and the
1981 * 1st extent record in the right path leaf during cross extent block merge. 2059 * 1st extent record in the right path leaf during cross extent block merge.
1982 */ 2060 */
1983static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 2061static void ocfs2_complete_edge_insert(handle_t *handle,
1984 struct ocfs2_path *left_path, 2062 struct ocfs2_path *left_path,
1985 struct ocfs2_path *right_path, 2063 struct ocfs2_path *right_path,
1986 int subtree_index) 2064 int subtree_index)
@@ -2058,8 +2136,8 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
2058 mlog_errno(ret); 2136 mlog_errno(ret);
2059} 2137}
2060 2138
2061static int ocfs2_rotate_subtree_right(struct inode *inode, 2139static int ocfs2_rotate_subtree_right(handle_t *handle,
2062 handle_t *handle, 2140 struct ocfs2_extent_tree *et,
2063 struct ocfs2_path *left_path, 2141 struct ocfs2_path *left_path,
2064 struct ocfs2_path *right_path, 2142 struct ocfs2_path *right_path,
2065 int subtree_index) 2143 int subtree_index)
@@ -2075,10 +2153,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2075 left_el = path_leaf_el(left_path); 2153 left_el = path_leaf_el(left_path);
2076 2154
2077 if (left_el->l_next_free_rec != left_el->l_count) { 2155 if (left_el->l_next_free_rec != left_el->l_count) {
2078 ocfs2_error(inode->i_sb, 2156 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2079 "Inode %llu has non-full interior leaf node %llu" 2157 "Inode %llu has non-full interior leaf node %llu"
2080 "(next free = %u)", 2158 "(next free = %u)",
2081 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2159 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2082 (unsigned long long)left_leaf_bh->b_blocknr, 2160 (unsigned long long)left_leaf_bh->b_blocknr,
2083 le16_to_cpu(left_el->l_next_free_rec)); 2161 le16_to_cpu(left_el->l_next_free_rec));
2084 return -EROFS; 2162 return -EROFS;
@@ -2094,7 +2172,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2094 root_bh = left_path->p_node[subtree_index].bh; 2172 root_bh = left_path->p_node[subtree_index].bh;
2095 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 2173 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2096 2174
2097 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 2175 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2098 subtree_index); 2176 subtree_index);
2099 if (ret) { 2177 if (ret) {
2100 mlog_errno(ret); 2178 mlog_errno(ret);
@@ -2102,14 +2180,14 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2102 } 2180 }
2103 2181
2104 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2182 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2105 ret = ocfs2_path_bh_journal_access(handle, inode, 2183 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2106 right_path, i); 2184 right_path, i);
2107 if (ret) { 2185 if (ret) {
2108 mlog_errno(ret); 2186 mlog_errno(ret);
2109 goto out; 2187 goto out;
2110 } 2188 }
2111 2189
2112 ret = ocfs2_path_bh_journal_access(handle, inode, 2190 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2113 left_path, i); 2191 left_path, i);
2114 if (ret) { 2192 if (ret) {
2115 mlog_errno(ret); 2193 mlog_errno(ret);
@@ -2123,7 +2201,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2123 /* This is a code error, not a disk corruption. */ 2201 /* This is a code error, not a disk corruption. */
2124 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 2202 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2125 "because rightmost leaf block %llu is empty\n", 2203 "because rightmost leaf block %llu is empty\n",
2126 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2204 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2127 (unsigned long long)right_leaf_bh->b_blocknr); 2205 (unsigned long long)right_leaf_bh->b_blocknr);
2128 2206
2129 ocfs2_create_empty_extent(right_el); 2207 ocfs2_create_empty_extent(right_el);
@@ -2157,8 +2235,8 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2157 goto out; 2235 goto out;
2158 } 2236 }
2159 2237
2160 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 2238 ocfs2_complete_edge_insert(handle, left_path, right_path,
2161 subtree_index); 2239 subtree_index);
2162 2240
2163out: 2241out:
2164 return ret; 2242 return ret;
@@ -2248,10 +2326,18 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2248 int op_credits, 2326 int op_credits,
2249 struct ocfs2_path *path) 2327 struct ocfs2_path *path)
2250{ 2328{
2329 int ret;
2251 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2330 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2252 2331
2253 if (handle->h_buffer_credits < credits) 2332 if (handle->h_buffer_credits < credits) {
2254 return ocfs2_extend_trans(handle, credits); 2333 ret = ocfs2_extend_trans(handle,
2334 credits - handle->h_buffer_credits);
2335 if (ret)
2336 return ret;
2337
2338 if (unlikely(handle->h_buffer_credits < credits))
2339 return ocfs2_extend_trans(handle, credits);
2340 }
2255 2341
2256 return 0; 2342 return 0;
2257} 2343}
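
The rewritten check matters because ocfs2_extend_trans() adds credits rather than setting them: extending by the full target over-reserves, so the new code asks only for the shortfall, then re-checks in case the handle was restarted with just that delta. A toy model of the arithmetic (the restart behaviour is an assumption stated in the comments, not reproduced here):

#include <stdio.h>

/* Illustrative stand-ins: a handle tracking reserved journal credits,
 * and an extend that adds (not sets) credits but, in the kernel, may
 * restart the handle and leave it with exactly the amount requested. */
struct handle { int buffer_credits; };

static int extend_trans(struct handle *h, int nblocks)
{
	h->buffer_credits += nblocks;   /* happy path: pure extension */
	return 0;
}

static int extend_rotate_transaction(struct handle *h, int subtree_depth,
				     int op_credits, int tree_depth)
{
	int ret;
	int credits = (tree_depth - subtree_depth) * 2 + 1 + op_credits;

	if (h->buffer_credits < credits) {
		/* Ask only for what is missing... */
		ret = extend_trans(h, credits - h->buffer_credits);
		if (ret)
			return ret;
		/* ...and re-check: a restarted handle may hold only the
		 * delta, so top it up to the full requirement. */
		if (h->buffer_credits < credits)
			return extend_trans(h, credits);
	}
	return 0;
}

int main(void)
{
	struct handle h = { 3 };

	extend_rotate_transaction(&h, 0, 2, 3);  /* needs 3*2+1+2 = 9 */
	printf("credits now %d\n", h.buffer_credits);
	return 0;
}
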
@@ -2321,8 +2407,8 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2321 * *ret_left_path will contain a valid path which can be passed to 2407 * *ret_left_path will contain a valid path which can be passed to
2322 * ocfs2_insert_path(). 2408 * ocfs2_insert_path().
2323 */ 2409 */
2324static int ocfs2_rotate_tree_right(struct inode *inode, 2410static int ocfs2_rotate_tree_right(handle_t *handle,
2325 handle_t *handle, 2411 struct ocfs2_extent_tree *et,
2326 enum ocfs2_split_type split, 2412 enum ocfs2_split_type split,
2327 u32 insert_cpos, 2413 u32 insert_cpos,
2328 struct ocfs2_path *right_path, 2414 struct ocfs2_path *right_path,
@@ -2331,6 +2417,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2331 int ret, start, orig_credits = handle->h_buffer_credits; 2417 int ret, start, orig_credits = handle->h_buffer_credits;
2332 u32 cpos; 2418 u32 cpos;
2333 struct ocfs2_path *left_path = NULL; 2419 struct ocfs2_path *left_path = NULL;
2420 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2334 2421
2335 *ret_left_path = NULL; 2422 *ret_left_path = NULL;
2336 2423
@@ -2341,7 +2428,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2341 goto out; 2428 goto out;
2342 } 2429 }
2343 2430
2344 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 2431 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2345 if (ret) { 2432 if (ret) {
2346 mlog_errno(ret); 2433 mlog_errno(ret);
2347 goto out; 2434 goto out;
@@ -2379,7 +2466,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2379 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 2466 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2380 insert_cpos, cpos); 2467 insert_cpos, cpos);
2381 2468
2382 ret = ocfs2_find_path(inode, left_path, cpos); 2469 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2383 if (ret) { 2470 if (ret) {
2384 mlog_errno(ret); 2471 mlog_errno(ret);
2385 goto out; 2472 goto out;
@@ -2387,10 +2474,11 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2387 2474
2388 mlog_bug_on_msg(path_leaf_bh(left_path) == 2475 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2389 path_leaf_bh(right_path), 2476 path_leaf_bh(right_path),
2390 "Inode %lu: error during insert of %u " 2477 "Owner %llu: error during insert of %u "
2391 "(left path cpos %u) results in two identical " 2478 "(left path cpos %u) results in two identical "
2392 "paths ending at %llu\n", 2479 "paths ending at %llu\n",
2393 inode->i_ino, insert_cpos, cpos, 2480 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2481 insert_cpos, cpos,
2394 (unsigned long long) 2482 (unsigned long long)
2395 path_leaf_bh(left_path)->b_blocknr); 2483 path_leaf_bh(left_path)->b_blocknr);
2396 2484
@@ -2416,7 +2504,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2416 goto out_ret_path; 2504 goto out_ret_path;
2417 } 2505 }
2418 2506
2419 start = ocfs2_find_subtree_root(inode, left_path, right_path); 2507 start = ocfs2_find_subtree_root(et, left_path, right_path);
2420 2508
2421 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2509 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2422 start, 2510 start,
@@ -2430,7 +2518,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2430 goto out; 2518 goto out;
2431 } 2519 }
2432 2520
2433 ret = ocfs2_rotate_subtree_right(inode, handle, left_path, 2521 ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2434 right_path, start); 2522 right_path, start);
2435 if (ret) { 2523 if (ret) {
2436 mlog_errno(ret); 2524 mlog_errno(ret);
@@ -2462,8 +2550,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2462 */ 2550 */
2463 ocfs2_mv_path(right_path, left_path); 2551 ocfs2_mv_path(right_path, left_path);
2464 2552
2465 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 2553 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2466 &cpos);
2467 if (ret) { 2554 if (ret) {
2468 mlog_errno(ret); 2555 mlog_errno(ret);
2469 goto out; 2556 goto out;
@@ -2477,7 +2564,8 @@ out_ret_path:
2477 return ret; 2564 return ret;
2478} 2565}
2479 2566
2480static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, 2567static int ocfs2_update_edge_lengths(handle_t *handle,
2568 struct ocfs2_extent_tree *et,
2481 int subtree_index, struct ocfs2_path *path) 2569 int subtree_index, struct ocfs2_path *path)
2482{ 2570{
2483 int i, idx, ret; 2571 int i, idx, ret;
@@ -2502,7 +2590,7 @@ static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2502 goto out; 2590 goto out;
2503 } 2591 }
2504 2592
2505 ret = ocfs2_journal_access_path(inode, handle, path); 2593 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2506 if (ret) { 2594 if (ret) {
2507 mlog_errno(ret); 2595 mlog_errno(ret);
2508 goto out; 2596 goto out;
@@ -2532,7 +2620,8 @@ out:
2532 return ret; 2620 return ret;
2533} 2621}
2534 2622
2535static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, 2623static void ocfs2_unlink_path(handle_t *handle,
2624 struct ocfs2_extent_tree *et,
2536 struct ocfs2_cached_dealloc_ctxt *dealloc, 2625 struct ocfs2_cached_dealloc_ctxt *dealloc,
2537 struct ocfs2_path *path, int unlink_start) 2626 struct ocfs2_path *path, int unlink_start)
2538{ 2627{
@@ -2554,12 +2643,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2554 mlog(ML_ERROR, 2643 mlog(ML_ERROR,
2555 "Inode %llu, attempted to remove extent block " 2644 "Inode %llu, attempted to remove extent block "
2556 "%llu with %u records\n", 2645 "%llu with %u records\n",
2557 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2646 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2558 (unsigned long long)le64_to_cpu(eb->h_blkno), 2647 (unsigned long long)le64_to_cpu(eb->h_blkno),
2559 le16_to_cpu(el->l_next_free_rec)); 2648 le16_to_cpu(el->l_next_free_rec));
2560 2649
2561 ocfs2_journal_dirty(handle, bh); 2650 ocfs2_journal_dirty(handle, bh);
2562 ocfs2_remove_from_cache(inode, bh); 2651 ocfs2_remove_from_cache(et->et_ci, bh);
2563 continue; 2652 continue;
2564 } 2653 }
2565 2654
@@ -2572,11 +2661,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2572 if (ret) 2661 if (ret)
2573 mlog_errno(ret); 2662 mlog_errno(ret);
2574 2663
2575 ocfs2_remove_from_cache(inode, bh); 2664 ocfs2_remove_from_cache(et->et_ci, bh);
2576 } 2665 }
2577} 2666}
2578 2667
2579static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, 2668static void ocfs2_unlink_subtree(handle_t *handle,
2669 struct ocfs2_extent_tree *et,
2580 struct ocfs2_path *left_path, 2670 struct ocfs2_path *left_path,
2581 struct ocfs2_path *right_path, 2671 struct ocfs2_path *right_path,
2582 int subtree_index, 2672 int subtree_index,
@@ -2607,17 +2697,17 @@ static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2607 ocfs2_journal_dirty(handle, root_bh); 2697 ocfs2_journal_dirty(handle, root_bh);
2608 ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2698 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2609 2699
2610 ocfs2_unlink_path(inode, handle, dealloc, right_path, 2700 ocfs2_unlink_path(handle, et, dealloc, right_path,
2611 subtree_index + 1); 2701 subtree_index + 1);
2612} 2702}
2613 2703
2614static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, 2704static int ocfs2_rotate_subtree_left(handle_t *handle,
2705 struct ocfs2_extent_tree *et,
2615 struct ocfs2_path *left_path, 2706 struct ocfs2_path *left_path,
2616 struct ocfs2_path *right_path, 2707 struct ocfs2_path *right_path,
2617 int subtree_index, 2708 int subtree_index,
2618 struct ocfs2_cached_dealloc_ctxt *dealloc, 2709 struct ocfs2_cached_dealloc_ctxt *dealloc,
2619 int *deleted, 2710 int *deleted)
2620 struct ocfs2_extent_tree *et)
2621{ 2711{
2622 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2712 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2623 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); 2713 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
@@ -2653,7 +2743,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2653 return -EAGAIN; 2743 return -EAGAIN;
2654 2744
2655 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2745 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2656 ret = ocfs2_journal_access_eb(handle, inode, 2746 ret = ocfs2_journal_access_eb(handle, et->et_ci,
2657 path_leaf_bh(right_path), 2747 path_leaf_bh(right_path),
2658 OCFS2_JOURNAL_ACCESS_WRITE); 2748 OCFS2_JOURNAL_ACCESS_WRITE);
2659 if (ret) { 2749 if (ret) {
@@ -2672,7 +2762,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2672 * We have to update i_last_eb_blk during the meta 2762 * We have to update i_last_eb_blk during the meta
2673 * data delete. 2763 * data delete.
2674 */ 2764 */
2675 ret = ocfs2_et_root_journal_access(handle, inode, et, 2765 ret = ocfs2_et_root_journal_access(handle, et,
2676 OCFS2_JOURNAL_ACCESS_WRITE); 2766 OCFS2_JOURNAL_ACCESS_WRITE);
2677 if (ret) { 2767 if (ret) {
2678 mlog_errno(ret); 2768 mlog_errno(ret);
@@ -2688,7 +2778,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2688 */ 2778 */
2689 BUG_ON(right_has_empty && !del_right_subtree); 2779 BUG_ON(right_has_empty && !del_right_subtree);
2690 2780
2691 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 2781 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2692 subtree_index); 2782 subtree_index);
2693 if (ret) { 2783 if (ret) {
2694 mlog_errno(ret); 2784 mlog_errno(ret);
@@ -2696,14 +2786,14 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2696 } 2786 }
2697 2787
2698 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2788 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2699 ret = ocfs2_path_bh_journal_access(handle, inode, 2789 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2700 right_path, i); 2790 right_path, i);
2701 if (ret) { 2791 if (ret) {
2702 mlog_errno(ret); 2792 mlog_errno(ret);
2703 goto out; 2793 goto out;
2704 } 2794 }
2705 2795
2706 ret = ocfs2_path_bh_journal_access(handle, inode, 2796 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2707 left_path, i); 2797 left_path, i);
2708 if (ret) { 2798 if (ret) {
2709 mlog_errno(ret); 2799 mlog_errno(ret);
@@ -2740,9 +2830,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2740 mlog_errno(ret); 2830 mlog_errno(ret);
2741 2831
2742 if (del_right_subtree) { 2832 if (del_right_subtree) {
2743 ocfs2_unlink_subtree(inode, handle, left_path, right_path, 2833 ocfs2_unlink_subtree(handle, et, left_path, right_path,
2744 subtree_index, dealloc); 2834 subtree_index, dealloc);
2745 ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, 2835 ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
2746 left_path); 2836 left_path);
2747 if (ret) { 2837 if (ret) {
2748 mlog_errno(ret); 2838 mlog_errno(ret);
@@ -2766,7 +2856,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2766 2856
2767 *deleted = 1; 2857 *deleted = 1;
2768 } else 2858 } else
2769 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 2859 ocfs2_complete_edge_insert(handle, left_path, right_path,
2770 subtree_index); 2860 subtree_index);
2771 2861
2772out: 2862out:
@@ -2852,8 +2942,8 @@ out:
2852 return ret; 2942 return ret;
2853} 2943}
2854 2944
2855static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2945static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2856 handle_t *handle, 2946 struct ocfs2_extent_tree *et,
2857 struct ocfs2_path *path) 2947 struct ocfs2_path *path)
2858{ 2948{
2859 int ret; 2949 int ret;
@@ -2863,7 +2953,7 @@ static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2863 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2953 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2864 return 0; 2954 return 0;
2865 2955
2866 ret = ocfs2_path_bh_journal_access(handle, inode, path, 2956 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2867 path_num_items(path) - 1); 2957 path_num_items(path) - 1);
2868 if (ret) { 2958 if (ret) {
2869 mlog_errno(ret); 2959 mlog_errno(ret);
@@ -2880,24 +2970,24 @@ out:
2880 return ret; 2970 return ret;
2881} 2971}
2882 2972
2883static int __ocfs2_rotate_tree_left(struct inode *inode, 2973static int __ocfs2_rotate_tree_left(handle_t *handle,
2884 handle_t *handle, int orig_credits, 2974 struct ocfs2_extent_tree *et,
2975 int orig_credits,
2885 struct ocfs2_path *path, 2976 struct ocfs2_path *path,
2886 struct ocfs2_cached_dealloc_ctxt *dealloc, 2977 struct ocfs2_cached_dealloc_ctxt *dealloc,
2887 struct ocfs2_path **empty_extent_path, 2978 struct ocfs2_path **empty_extent_path)
2888 struct ocfs2_extent_tree *et)
2889{ 2979{
2890 int ret, subtree_root, deleted; 2980 int ret, subtree_root, deleted;
2891 u32 right_cpos; 2981 u32 right_cpos;
2892 struct ocfs2_path *left_path = NULL; 2982 struct ocfs2_path *left_path = NULL;
2893 struct ocfs2_path *right_path = NULL; 2983 struct ocfs2_path *right_path = NULL;
2984 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2894 2985
2895 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); 2986 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2896 2987
2897 *empty_extent_path = NULL; 2988 *empty_extent_path = NULL;
2898 2989
2899 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, 2990 ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2900 &right_cpos);
2901 if (ret) { 2991 if (ret) {
2902 mlog_errno(ret); 2992 mlog_errno(ret);
2903 goto out; 2993 goto out;
@@ -2920,13 +3010,13 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2920 } 3010 }
2921 3011
2922 while (right_cpos) { 3012 while (right_cpos) {
2923 ret = ocfs2_find_path(inode, right_path, right_cpos); 3013 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2924 if (ret) { 3014 if (ret) {
2925 mlog_errno(ret); 3015 mlog_errno(ret);
2926 goto out; 3016 goto out;
2927 } 3017 }
2928 3018
2929 subtree_root = ocfs2_find_subtree_root(inode, left_path, 3019 subtree_root = ocfs2_find_subtree_root(et, left_path,
2930 right_path); 3020 right_path);
2931 3021
2932 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 3022 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
@@ -2946,16 +3036,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2946 * Caller might still want to make changes to the 3036 * Caller might still want to make changes to the
2947 * tree root, so re-add it to the journal here. 3037 * tree root, so re-add it to the journal here.
2948 */ 3038 */
2949 ret = ocfs2_path_bh_journal_access(handle, inode, 3039 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2950 left_path, 0); 3040 left_path, 0);
2951 if (ret) { 3041 if (ret) {
2952 mlog_errno(ret); 3042 mlog_errno(ret);
2953 goto out; 3043 goto out;
2954 } 3044 }
2955 3045
2956 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 3046 ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2957 right_path, subtree_root, 3047 right_path, subtree_root,
2958 dealloc, &deleted, et); 3048 dealloc, &deleted);
2959 if (ret == -EAGAIN) { 3049 if (ret == -EAGAIN) {
2960 /* 3050 /*
2961 * The rotation has to temporarily stop due to 3051 * The rotation has to temporarily stop due to
@@ -2982,7 +3072,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2982 3072
2983 ocfs2_mv_path(left_path, right_path); 3073 ocfs2_mv_path(left_path, right_path);
2984 3074
2985 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 3075 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
2986 &right_cpos); 3076 &right_cpos);
2987 if (ret) { 3077 if (ret) {
2988 mlog_errno(ret); 3078 mlog_errno(ret);
@@ -2997,10 +3087,10 @@ out:
2997 return ret; 3087 return ret;
2998} 3088}
2999 3089
3000static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, 3090static int ocfs2_remove_rightmost_path(handle_t *handle,
3091 struct ocfs2_extent_tree *et,
3001 struct ocfs2_path *path, 3092 struct ocfs2_path *path,
3002 struct ocfs2_cached_dealloc_ctxt *dealloc, 3093 struct ocfs2_cached_dealloc_ctxt *dealloc)
3003 struct ocfs2_extent_tree *et)
3004{ 3094{
3005 int ret, subtree_index; 3095 int ret, subtree_index;
3006 u32 cpos; 3096 u32 cpos;
@@ -3009,7 +3099,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3009 struct ocfs2_extent_list *el; 3099 struct ocfs2_extent_list *el;
3010 3100
3011 3101
3012 ret = ocfs2_et_sanity_check(inode, et); 3102 ret = ocfs2_et_sanity_check(et);
3013 if (ret) 3103 if (ret)
3014 goto out; 3104 goto out;
3015 /* 3105 /*
@@ -3024,13 +3114,14 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3024 goto out; 3114 goto out;
3025 } 3115 }
3026 3116
3027 ret = ocfs2_journal_access_path(inode, handle, path); 3117 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3028 if (ret) { 3118 if (ret) {
3029 mlog_errno(ret); 3119 mlog_errno(ret);
3030 goto out; 3120 goto out;
3031 } 3121 }
3032 3122
3033 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 3123 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3124 path, &cpos);
3034 if (ret) { 3125 if (ret) {
3035 mlog_errno(ret); 3126 mlog_errno(ret);
3036 goto out; 3127 goto out;
@@ -3048,23 +3139,23 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3048 goto out; 3139 goto out;
3049 } 3140 }
3050 3141
3051 ret = ocfs2_find_path(inode, left_path, cpos); 3142 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3052 if (ret) { 3143 if (ret) {
3053 mlog_errno(ret); 3144 mlog_errno(ret);
3054 goto out; 3145 goto out;
3055 } 3146 }
3056 3147
3057 ret = ocfs2_journal_access_path(inode, handle, left_path); 3148 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3058 if (ret) { 3149 if (ret) {
3059 mlog_errno(ret); 3150 mlog_errno(ret);
3060 goto out; 3151 goto out;
3061 } 3152 }
3062 3153
3063 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 3154 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3064 3155
3065 ocfs2_unlink_subtree(inode, handle, left_path, path, 3156 ocfs2_unlink_subtree(handle, et, left_path, path,
3066 subtree_index, dealloc); 3157 subtree_index, dealloc);
3067 ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, 3158 ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
3068 left_path); 3159 left_path);
3069 if (ret) { 3160 if (ret) {
3070 mlog_errno(ret); 3161 mlog_errno(ret);
@@ -3078,10 +3169,10 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3078 * 'path' is also the leftmost path which 3169 * 'path' is also the leftmost path which
3079 * means it must be the only one. This gets 3170 * means it must be the only one. This gets
3080 * handled differently because we want to 3171 * handled differently because we want to
3081 * revert the inode back to having extents 3172 * revert the root back to having extents
3082 * in-line. 3173 * in-line.
3083 */ 3174 */
3084 ocfs2_unlink_path(inode, handle, dealloc, path, 1); 3175 ocfs2_unlink_path(handle, et, dealloc, path, 1);
3085 3176
3086 el = et->et_root_el; 3177 el = et->et_root_el;
3087 el->l_tree_depth = 0; 3178 el->l_tree_depth = 0;
@@ -3114,10 +3205,10 @@ out:
3114 * the rightmost tree leaf record is removed so the caller is 3205 * the rightmost tree leaf record is removed so the caller is
3115 * responsible for detecting and correcting that. 3206 * responsible for detecting and correcting that.
3116 */ 3207 */
3117static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, 3208static int ocfs2_rotate_tree_left(handle_t *handle,
3209 struct ocfs2_extent_tree *et,
3118 struct ocfs2_path *path, 3210 struct ocfs2_path *path,
3119 struct ocfs2_cached_dealloc_ctxt *dealloc, 3211 struct ocfs2_cached_dealloc_ctxt *dealloc)
3120 struct ocfs2_extent_tree *et)
3121{ 3212{
3122 int ret, orig_credits = handle->h_buffer_credits; 3213 int ret, orig_credits = handle->h_buffer_credits;
3123 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; 3214 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -3134,8 +3225,7 @@ rightmost_no_delete:
3134 * Inline extents. This is trivially handled, so do 3225 * Inline extents. This is trivially handled, so do
3135 * it up front. 3226 * it up front.
3136 */ 3227 */
3137 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 3228 ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3138 path);
3139 if (ret) 3229 if (ret)
3140 mlog_errno(ret); 3230 mlog_errno(ret);
3141 goto out; 3231 goto out;
@@ -3151,7 +3241,7 @@ rightmost_no_delete:
3151 * 3241 *
3152 * 1) is handled via ocfs2_rotate_rightmost_leaf_left() 3242 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
3153 * 2a) we need the left branch so that we can update it with the unlink 3243 * 2a) we need the left branch so that we can update it with the unlink
3154 * 2b) we need to bring the inode back to inline extents. 3244 * 2b) we need to bring the root back to inline extents.
3155 */ 3245 */
3156 3246
3157 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 3247 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
@@ -3167,9 +3257,9 @@ rightmost_no_delete:
3167 3257
3168 if (le16_to_cpu(el->l_next_free_rec) == 0) { 3258 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3169 ret = -EIO; 3259 ret = -EIO;
3170 ocfs2_error(inode->i_sb, 3260 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3171 "Inode %llu has empty extent block at %llu", 3261 "Owner %llu has empty extent block at %llu",
3172 (unsigned long long)OCFS2_I(inode)->ip_blkno, 3262 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3173 (unsigned long long)le64_to_cpu(eb->h_blkno)); 3263 (unsigned long long)le64_to_cpu(eb->h_blkno));
3174 goto out; 3264 goto out;
3175 } 3265 }
@@ -3183,8 +3273,8 @@ rightmost_no_delete:
3183 * nonempty list. 3273 * nonempty list.
3184 */ 3274 */
3185 3275
3186 ret = ocfs2_remove_rightmost_path(inode, handle, path, 3276 ret = ocfs2_remove_rightmost_path(handle, et, path,
3187 dealloc, et); 3277 dealloc);
3188 if (ret) 3278 if (ret)
3189 mlog_errno(ret); 3279 mlog_errno(ret);
3190 goto out; 3280 goto out;
@@ -3195,8 +3285,8 @@ rightmost_no_delete:
3195 * and restarting from there. 3285 * and restarting from there.
3196 */ 3286 */
3197try_rotate: 3287try_rotate:
3198 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, 3288 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3199 dealloc, &restart_path, et); 3289 dealloc, &restart_path);
3200 if (ret && ret != -EAGAIN) { 3290 if (ret && ret != -EAGAIN) {
3201 mlog_errno(ret); 3291 mlog_errno(ret);
3202 goto out; 3292 goto out;
@@ -3206,9 +3296,9 @@ try_rotate:
3206 tmp_path = restart_path; 3296 tmp_path = restart_path;
3207 restart_path = NULL; 3297 restart_path = NULL;
3208 3298
3209 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, 3299 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3210 tmp_path, dealloc, 3300 tmp_path, dealloc,
3211 &restart_path, et); 3301 &restart_path);
3212 if (ret && ret != -EAGAIN) { 3302 if (ret && ret != -EAGAIN) {
3213 mlog_errno(ret); 3303 mlog_errno(ret);
3214 goto out; 3304 goto out;
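
The restart protocol above is easy to miss: __ocfs2_rotate_tree_left() returns -EAGAIN when the transaction runs out of journal credits mid-rotation and hands back the leaf it stopped at via restart_path. A condensed sketch of the consuming loop (error handling elided; ocfs2_free_path() as used elsewhere in this file):

	ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
				       dealloc, &restart_path);
	while (restart_path) {
		tmp_path = restart_path;
		restart_path = NULL;

		/* Resume where the credit-starved pass stopped. */
		ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
					       tmp_path, dealloc,
					       &restart_path);
		ocfs2_free_path(tmp_path);
	}
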
@@ -3259,7 +3349,7 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3259 } 3349 }
3260} 3350}
3261 3351
3262static int ocfs2_get_right_path(struct inode *inode, 3352static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3263 struct ocfs2_path *left_path, 3353 struct ocfs2_path *left_path,
3264 struct ocfs2_path **ret_right_path) 3354 struct ocfs2_path **ret_right_path)
3265{ 3355{
@@ -3276,8 +3366,8 @@ static int ocfs2_get_right_path(struct inode *inode,
3276 left_el = path_leaf_el(left_path); 3366 left_el = path_leaf_el(left_path);
3277 BUG_ON(left_el->l_next_free_rec != left_el->l_count); 3367 BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3278 3368
3279 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 3369 ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3280 &right_cpos); 3370 left_path, &right_cpos);
3281 if (ret) { 3371 if (ret) {
3282 mlog_errno(ret); 3372 mlog_errno(ret);
3283 goto out; 3373 goto out;
@@ -3293,7 +3383,7 @@ static int ocfs2_get_right_path(struct inode *inode,
3293 goto out; 3383 goto out;
3294 } 3384 }
3295 3385
3296 ret = ocfs2_find_path(inode, right_path, right_cpos); 3386 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3297 if (ret) { 3387 if (ret) {
3298 mlog_errno(ret); 3388 mlog_errno(ret);
3299 goto out; 3389 goto out;
@@ -3313,9 +3403,9 @@ out:
3313 * For index == l_count - 1, the "next" means the 1st extent rec of the 3403 * For index == l_count - 1, the "next" means the 1st extent rec of the
3314 * next extent block. 3404 * next extent block.
3315 */ 3405 */
3316static int ocfs2_merge_rec_right(struct inode *inode, 3406static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3317 struct ocfs2_path *left_path,
3318 handle_t *handle, 3407 handle_t *handle,
3408 struct ocfs2_extent_tree *et,
3319 struct ocfs2_extent_rec *split_rec, 3409 struct ocfs2_extent_rec *split_rec,
3320 int index) 3410 int index)
3321{ 3411{
@@ -3336,7 +3426,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3336 if (index == le16_to_cpu(el->l_next_free_rec) - 1 && 3426 if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3337 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { 3427 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3338 /* we meet with a cross extent block merge. */ 3428 /* we meet with a cross extent block merge. */
3339 ret = ocfs2_get_right_path(inode, left_path, &right_path); 3429 ret = ocfs2_get_right_path(et, left_path, &right_path);
3340 if (ret) { 3430 if (ret) {
3341 mlog_errno(ret); 3431 mlog_errno(ret);
3342 goto out; 3432 goto out;
@@ -3355,8 +3445,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3355 le16_to_cpu(left_rec->e_leaf_clusters) != 3445 le16_to_cpu(left_rec->e_leaf_clusters) !=
3356 le32_to_cpu(right_rec->e_cpos)); 3446 le32_to_cpu(right_rec->e_cpos));
3357 3447
3358 subtree_index = ocfs2_find_subtree_root(inode, 3448 subtree_index = ocfs2_find_subtree_root(et, left_path,
3359 left_path, right_path); 3449 right_path);
3360 3450
3361 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3451 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3362 handle->h_buffer_credits, 3452 handle->h_buffer_credits,
@@ -3369,7 +3459,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3369 root_bh = left_path->p_node[subtree_index].bh; 3459 root_bh = left_path->p_node[subtree_index].bh;
3370 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3460 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3371 3461
3372 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 3462 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3373 subtree_index); 3463 subtree_index);
3374 if (ret) { 3464 if (ret) {
3375 mlog_errno(ret); 3465 mlog_errno(ret);
@@ -3378,14 +3468,14 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3378 3468
3379 for (i = subtree_index + 1; 3469 for (i = subtree_index + 1;
3380 i < path_num_items(right_path); i++) { 3470 i < path_num_items(right_path); i++) {
3381 ret = ocfs2_path_bh_journal_access(handle, inode, 3471 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3382 right_path, i); 3472 right_path, i);
3383 if (ret) { 3473 if (ret) {
3384 mlog_errno(ret); 3474 mlog_errno(ret);
3385 goto out; 3475 goto out;
3386 } 3476 }
3387 3477
3388 ret = ocfs2_path_bh_journal_access(handle, inode, 3478 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3389 left_path, i); 3479 left_path, i);
3390 if (ret) { 3480 if (ret) {
3391 mlog_errno(ret); 3481 mlog_errno(ret);
@@ -3398,7 +3488,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3398 right_rec = &el->l_recs[index + 1]; 3488 right_rec = &el->l_recs[index + 1];
3399 } 3489 }
3400 3490
3401 ret = ocfs2_path_bh_journal_access(handle, inode, left_path, 3491 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3402 path_num_items(left_path) - 1); 3492 path_num_items(left_path) - 1);
3403 if (ret) { 3493 if (ret) {
3404 mlog_errno(ret); 3494 mlog_errno(ret);
@@ -3409,7 +3499,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3409 3499
3410 le32_add_cpu(&right_rec->e_cpos, -split_clusters); 3500 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3411 le64_add_cpu(&right_rec->e_blkno, 3501 le64_add_cpu(&right_rec->e_blkno,
3412 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3502 -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3503 split_clusters));
3413 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); 3504 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3414 3505
3415 ocfs2_cleanup_merge(el, index); 3506 ocfs2_cleanup_merge(el, index);
@@ -3423,8 +3514,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3423 if (ret) 3514 if (ret)
3424 mlog_errno(ret); 3515 mlog_errno(ret);
3425 3516
3426 ocfs2_complete_edge_insert(inode, handle, left_path, 3517 ocfs2_complete_edge_insert(handle, left_path, right_path,
3427 right_path, subtree_index); 3518 subtree_index);
3428 } 3519 }
3429out: 3520out:
3430 if (right_path) 3521 if (right_path)
@@ -3432,7 +3523,7 @@ out:
3432 return ret; 3523 return ret;
3433} 3524}
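
The record arithmetic at the core of ocfs2_merge_rec_right() is worth a gloss: split_clusters worth of space migrates from the left record into its right neighbour by moving the right record's start leftward. Condensed from the hunk above (the matching left-record shrink sits just outside the excerpt, and sb stands for the superblock obtained via ocfs2_metadata_cache_get_super(et->et_ci)); all on-disk fields are little-endian, hence the le*_add_cpu() helpers:

	/* Shrink the left record by the split... */
	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);

	/* ...and grow the right record leftward by the same amount:
	 * its logical start (e_cpos) and physical start (e_blkno)
	 * move down while its length (e_leaf_clusters) goes up. */
	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     -ocfs2_clusters_to_blocks(sb, split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
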
3434 3525
3435static int ocfs2_get_left_path(struct inode *inode, 3526static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3436 struct ocfs2_path *right_path, 3527 struct ocfs2_path *right_path,
3437 struct ocfs2_path **ret_left_path) 3528 struct ocfs2_path **ret_left_path)
3438{ 3529{
@@ -3445,7 +3536,7 @@ static int ocfs2_get_left_path(struct inode *inode,
3445 /* This function shouldn't be called for non-trees. */ 3536 /* This function shouldn't be called for non-trees. */
3446 BUG_ON(right_path->p_tree_depth == 0); 3537 BUG_ON(right_path->p_tree_depth == 0);
3447 3538
3448 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 3539 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3449 right_path, &left_cpos); 3540 right_path, &left_cpos);
3450 if (ret) { 3541 if (ret) {
3451 mlog_errno(ret); 3542 mlog_errno(ret);
@@ -3462,7 +3553,7 @@ static int ocfs2_get_left_path(struct inode *inode,
3462 goto out; 3553 goto out;
3463 } 3554 }
3464 3555
3465 ret = ocfs2_find_path(inode, left_path, left_cpos); 3556 ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3466 if (ret) { 3557 if (ret) {
3467 mlog_errno(ret); 3558 mlog_errno(ret);
3468 goto out; 3559 goto out;
@@ -3485,12 +3576,11 @@ out:
3485 * remove the rightmost leaf extent block in the right_path and change 3576 * remove the rightmost leaf extent block in the right_path and change
3486 * the right path to indicate the new rightmost path. 3577 * the right path to indicate the new rightmost path.
3487 */ 3578 */
3488static int ocfs2_merge_rec_left(struct inode *inode, 3579static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3489 struct ocfs2_path *right_path,
3490 handle_t *handle, 3580 handle_t *handle,
3581 struct ocfs2_extent_tree *et,
3491 struct ocfs2_extent_rec *split_rec, 3582 struct ocfs2_extent_rec *split_rec,
3492 struct ocfs2_cached_dealloc_ctxt *dealloc, 3583 struct ocfs2_cached_dealloc_ctxt *dealloc,
3493 struct ocfs2_extent_tree *et,
3494 int index) 3584 int index)
3495{ 3585{
3496 int ret, i, subtree_index = 0, has_empty_extent = 0; 3586 int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3508,7 +3598,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3508 right_rec = &el->l_recs[index]; 3598 right_rec = &el->l_recs[index];
3509 if (index == 0) { 3599 if (index == 0) {
3510 /* we meet with a cross extent block merge. */ 3600 /* we meet with a cross extent block merge. */
3511 ret = ocfs2_get_left_path(inode, right_path, &left_path); 3601 ret = ocfs2_get_left_path(et, right_path, &left_path);
3512 if (ret) { 3602 if (ret) {
3513 mlog_errno(ret); 3603 mlog_errno(ret);
3514 goto out; 3604 goto out;
@@ -3524,8 +3614,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3524 le16_to_cpu(left_rec->e_leaf_clusters) != 3614 le16_to_cpu(left_rec->e_leaf_clusters) !=
3525 le32_to_cpu(split_rec->e_cpos)); 3615 le32_to_cpu(split_rec->e_cpos));
3526 3616
3527 subtree_index = ocfs2_find_subtree_root(inode, 3617 subtree_index = ocfs2_find_subtree_root(et, left_path,
3528 left_path, right_path); 3618 right_path);
3529 3619
3530 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3620 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3531 handle->h_buffer_credits, 3621 handle->h_buffer_credits,
@@ -3538,7 +3628,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3538 root_bh = left_path->p_node[subtree_index].bh; 3628 root_bh = left_path->p_node[subtree_index].bh;
3539 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3629 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3540 3630
3541 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 3631 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3542 subtree_index); 3632 subtree_index);
3543 if (ret) { 3633 if (ret) {
3544 mlog_errno(ret); 3634 mlog_errno(ret);
@@ -3547,14 +3637,14 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3547 3637
3548 for (i = subtree_index + 1; 3638 for (i = subtree_index + 1;
3549 i < path_num_items(right_path); i++) { 3639 i < path_num_items(right_path); i++) {
3550 ret = ocfs2_path_bh_journal_access(handle, inode, 3640 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3551 right_path, i); 3641 right_path, i);
3552 if (ret) { 3642 if (ret) {
3553 mlog_errno(ret); 3643 mlog_errno(ret);
3554 goto out; 3644 goto out;
3555 } 3645 }
3556 3646
3557 ret = ocfs2_path_bh_journal_access(handle, inode, 3647 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3558 left_path, i); 3648 left_path, i);
3559 if (ret) { 3649 if (ret) {
3560 mlog_errno(ret); 3650 mlog_errno(ret);
@@ -3567,7 +3657,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3567 has_empty_extent = 1; 3657 has_empty_extent = 1;
3568 } 3658 }
3569 3659
3570 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 3660 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3571 path_num_items(right_path) - 1); 3661 path_num_items(right_path) - 1);
3572 if (ret) { 3662 if (ret) {
3573 mlog_errno(ret); 3663 mlog_errno(ret);
@@ -3586,7 +3676,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3586 3676
3587 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3677 le32_add_cpu(&right_rec->e_cpos, split_clusters);
3588 le64_add_cpu(&right_rec->e_blkno, 3678 le64_add_cpu(&right_rec->e_blkno,
3589 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3679 ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3680 split_clusters));
3590 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); 3681 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3591 3682
3592 ocfs2_cleanup_merge(el, index); 3683 ocfs2_cleanup_merge(el, index);
@@ -3608,9 +3699,9 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3608 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3699 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3609 le16_to_cpu(el->l_next_free_rec) == 1) { 3700 le16_to_cpu(el->l_next_free_rec) == 1) {
3610 3701
3611 ret = ocfs2_remove_rightmost_path(inode, handle, 3702 ret = ocfs2_remove_rightmost_path(handle, et,
3612 right_path, 3703 right_path,
3613 dealloc, et); 3704 dealloc);
3614 if (ret) { 3705 if (ret) {
3615 mlog_errno(ret); 3706 mlog_errno(ret);
3616 goto out; 3707 goto out;
@@ -3622,7 +3713,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3622 ocfs2_mv_path(right_path, left_path); 3713 ocfs2_mv_path(right_path, left_path);
3623 left_path = NULL; 3714 left_path = NULL;
3624 } else 3715 } else
3625 ocfs2_complete_edge_insert(inode, handle, left_path, 3716 ocfs2_complete_edge_insert(handle, left_path,
3626 right_path, subtree_index); 3717 right_path, subtree_index);
3627 } 3718 }
3628out: 3719out:
@@ -3631,15 +3722,13 @@ out:
3631 return ret; 3722 return ret;
3632} 3723}
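
One edge case handled above deserves a callout: a left merge can drain the rightmost leaf completely, and an empty rightmost leaf is unlinked rather than left holding a lone zero-length record. The trigger, restated from the hunk:

	/* Right leaf now holds a single empty record: drop the whole
	 * rightmost path instead of rotating the empty extent away. */
	if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
	    le16_to_cpu(el->l_next_free_rec) == 1)
		ret = ocfs2_remove_rightmost_path(handle, et, right_path,
						  dealloc);
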
3633 3724
3634static int ocfs2_try_to_merge_extent(struct inode *inode, 3725static int ocfs2_try_to_merge_extent(handle_t *handle,
3635 handle_t *handle, 3726 struct ocfs2_extent_tree *et,
3636 struct ocfs2_path *path, 3727 struct ocfs2_path *path,
3637 int split_index, 3728 int split_index,
3638 struct ocfs2_extent_rec *split_rec, 3729 struct ocfs2_extent_rec *split_rec,
3639 struct ocfs2_cached_dealloc_ctxt *dealloc, 3730 struct ocfs2_cached_dealloc_ctxt *dealloc,
3640 struct ocfs2_merge_ctxt *ctxt, 3731 struct ocfs2_merge_ctxt *ctxt)
3641 struct ocfs2_extent_tree *et)
3642
3643{ 3732{
3644 int ret = 0; 3733 int ret = 0;
3645 struct ocfs2_extent_list *el = path_leaf_el(path); 3734 struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -3655,8 +3744,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3655 * extents - having more than one in a leaf is 3744 * extents - having more than one in a leaf is
3656 * illegal. 3745 * illegal.
3657 */ 3746 */
3658 ret = ocfs2_rotate_tree_left(inode, handle, path, 3747 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3659 dealloc, et);
3660 if (ret) { 3748 if (ret) {
3661 mlog_errno(ret); 3749 mlog_errno(ret);
3662 goto out; 3750 goto out;
@@ -3685,8 +3773,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3685 * previous extent block. It is more efficient and easier 3773

3686 * if we do merge_right first and merge_left later. 3774 * if we do merge_right first and merge_left later.
3687 */ 3775 */
3688 ret = ocfs2_merge_rec_right(inode, path, 3776 ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3689 handle, split_rec,
3690 split_index); 3777 split_index);
3691 if (ret) { 3778 if (ret) {
3692 mlog_errno(ret); 3779 mlog_errno(ret);
@@ -3699,8 +3786,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3699 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3786 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3700 3787
3701 /* The merge left us with an empty extent, remove it. */ 3788 /* The merge left us with an empty extent, remove it. */
3702 ret = ocfs2_rotate_tree_left(inode, handle, path, 3789 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3703 dealloc, et);
3704 if (ret) { 3790 if (ret) {
3705 mlog_errno(ret); 3791 mlog_errno(ret);
3706 goto out; 3792 goto out;
@@ -3712,18 +3798,15 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3712 * Note that we don't pass split_rec here on purpose - 3798 * Note that we don't pass split_rec here on purpose -
3713 * we've merged it into the rec already. 3799 * we've merged it into the rec already.
3714 */ 3800 */
3715 ret = ocfs2_merge_rec_left(inode, path, 3801 ret = ocfs2_merge_rec_left(path, handle, et, rec,
3716 handle, rec, 3802 dealloc, split_index);
3717 dealloc, et,
3718 split_index);
3719 3803
3720 if (ret) { 3804 if (ret) {
3721 mlog_errno(ret); 3805 mlog_errno(ret);
3722 goto out; 3806 goto out;
3723 } 3807 }
3724 3808
3725 ret = ocfs2_rotate_tree_left(inode, handle, path, 3809 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3726 dealloc, et);
3727 /* 3810 /*
3728 * Error from this last rotate is not critical, so 3811 * Error from this last rotate is not critical, so
3729 * print but don't bubble it up. 3812 * print but don't bubble it up.
@@ -3740,19 +3823,16 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3740 * the record on the left (hence the left merge). 3823 * the record on the left (hence the left merge).
3741 */ 3824 */
3742 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3825 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3743 ret = ocfs2_merge_rec_left(inode, 3826 ret = ocfs2_merge_rec_left(path, handle, et,
3744 path, 3827 split_rec, dealloc,
3745 handle, split_rec,
3746 dealloc, et,
3747 split_index); 3828 split_index);
3748 if (ret) { 3829 if (ret) {
3749 mlog_errno(ret); 3830 mlog_errno(ret);
3750 goto out; 3831 goto out;
3751 } 3832 }
3752 } else { 3833 } else {
3753 ret = ocfs2_merge_rec_right(inode, 3834 ret = ocfs2_merge_rec_right(path, handle,
3754 path, 3835 et, split_rec,
3755 handle, split_rec,
3756 split_index); 3836 split_index);
3757 if (ret) { 3837 if (ret) {
3758 mlog_errno(ret); 3838 mlog_errno(ret);
@@ -3765,8 +3845,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3765 * The merge may have left an empty extent in 3845 * The merge may have left an empty extent in
3766 * our leaf. Try to rotate it away. 3846 * our leaf. Try to rotate it away.
3767 */ 3847 */
3768 ret = ocfs2_rotate_tree_left(inode, handle, path, 3848 ret = ocfs2_rotate_tree_left(handle, et, path,
3769 dealloc, et); 3849 dealloc);
3770 if (ret) 3850 if (ret)
3771 mlog_errno(ret); 3851 mlog_errno(ret);
3772 ret = 0; 3852 ret = 0;
@@ -3812,10 +3892,10 @@ static void ocfs2_subtract_from_rec(struct super_block *sb,
3812 * list. If this leaf is part of an allocation tree, it is assumed 3892 * list. If this leaf is part of an allocation tree, it is assumed
3813 * that the tree above has been prepared. 3893 * that the tree above has been prepared.
3814 */ 3894 */
3815static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, 3895static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3896 struct ocfs2_extent_rec *insert_rec,
3816 struct ocfs2_extent_list *el, 3897 struct ocfs2_extent_list *el,
3817 struct ocfs2_insert_type *insert, 3898 struct ocfs2_insert_type *insert)
3818 struct inode *inode)
3819{ 3899{
3820 int i = insert->ins_contig_index; 3900 int i = insert->ins_contig_index;
3821 unsigned int range; 3901 unsigned int range;
@@ -3827,7 +3907,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3827 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); 3907 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3828 BUG_ON(i == -1); 3908 BUG_ON(i == -1);
3829 rec = &el->l_recs[i]; 3909 rec = &el->l_recs[i];
3830 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, 3910 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3911 insert->ins_split, rec,
3831 insert_rec); 3912 insert_rec);
3832 goto rotate; 3913 goto rotate;
3833 } 3914 }
@@ -3869,10 +3950,10 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3869 3950
3870 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= 3951 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3871 le16_to_cpu(el->l_count), 3952 le16_to_cpu(el->l_count),
3872 "inode %lu, depth %u, count %u, next free %u, " 3953 "owner %llu, depth %u, count %u, next free %u, "
3873 "rec.cpos %u, rec.clusters %u, " 3954 "rec.cpos %u, rec.clusters %u, "
3874 "insert.cpos %u, insert.clusters %u\n", 3955 "insert.cpos %u, insert.clusters %u\n",
3875 inode->i_ino, 3956 ocfs2_metadata_cache_owner(et->et_ci),
3876 le16_to_cpu(el->l_tree_depth), 3957 le16_to_cpu(el->l_tree_depth),
3877 le16_to_cpu(el->l_count), 3958 le16_to_cpu(el->l_count),
3878 le16_to_cpu(el->l_next_free_rec), 3959 le16_to_cpu(el->l_next_free_rec),
@@ -3900,8 +3981,8 @@ rotate:
3900 ocfs2_rotate_leaf(el, insert_rec); 3981 ocfs2_rotate_leaf(el, insert_rec);
3901} 3982}
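
For orientation, the common fast path through ocfs2_insert_at_leaf() is a tail append: the new record simply lands in the next free slot of the leaf. A simplified sketch of that branch (assumed shape; the bounds check shown in the hunk above and the rotate path are elided):

	i = le16_to_cpu(el->l_next_free_rec);
	el->l_recs[i] = *insert_rec;		/* struct copy into the slot */
	le16_add_cpu(&el->l_next_free_rec, 1);
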
3902 3983
3903static void ocfs2_adjust_rightmost_records(struct inode *inode, 3984static void ocfs2_adjust_rightmost_records(handle_t *handle,
3904 handle_t *handle, 3985 struct ocfs2_extent_tree *et,
3905 struct ocfs2_path *path, 3986 struct ocfs2_path *path,
3906 struct ocfs2_extent_rec *insert_rec) 3987 struct ocfs2_extent_rec *insert_rec)
3907{ 3988{
@@ -3919,9 +4000,9 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
3919 4000
3920 next_free = le16_to_cpu(el->l_next_free_rec); 4001 next_free = le16_to_cpu(el->l_next_free_rec);
3921 if (next_free == 0) { 4002 if (next_free == 0) {
3922 ocfs2_error(inode->i_sb, 4003 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3923 "Dinode %llu has a bad extent list", 4004 "Owner %llu has a bad extent list",
3924 (unsigned long long)OCFS2_I(inode)->ip_blkno); 4005 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3925 ret = -EIO; 4006 ret = -EIO;
3926 return; 4007 return;
3927 } 4008 }
@@ -3941,7 +4022,8 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
3941 } 4022 }
3942} 4023}
3943 4024
3944static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 4025static int ocfs2_append_rec_to_path(handle_t *handle,
4026 struct ocfs2_extent_tree *et,
3945 struct ocfs2_extent_rec *insert_rec, 4027 struct ocfs2_extent_rec *insert_rec,
3946 struct ocfs2_path *right_path, 4028 struct ocfs2_path *right_path,
3947 struct ocfs2_path **ret_left_path) 4029 struct ocfs2_path **ret_left_path)
@@ -3969,8 +4051,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3969 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 4051 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3970 u32 left_cpos; 4052 u32 left_cpos;
3971 4053
3972 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 4054 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3973 &left_cpos); 4055 right_path, &left_cpos);
3974 if (ret) { 4056 if (ret) {
3975 mlog_errno(ret); 4057 mlog_errno(ret);
3976 goto out; 4058 goto out;
@@ -3992,7 +4074,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3992 goto out; 4074 goto out;
3993 } 4075 }
3994 4076
3995 ret = ocfs2_find_path(inode, left_path, left_cpos); 4077 ret = ocfs2_find_path(et->et_ci, left_path,
4078 left_cpos);
3996 if (ret) { 4079 if (ret) {
3997 mlog_errno(ret); 4080 mlog_errno(ret);
3998 goto out; 4081 goto out;
@@ -4005,13 +4088,13 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
4005 } 4088 }
4006 } 4089 }
4007 4090
4008 ret = ocfs2_journal_access_path(inode, handle, right_path); 4091 ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4009 if (ret) { 4092 if (ret) {
4010 mlog_errno(ret); 4093 mlog_errno(ret);
4011 goto out; 4094 goto out;
4012 } 4095 }
4013 4096
4014 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); 4097 ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4015 4098
4016 *ret_left_path = left_path; 4099 *ret_left_path = left_path;
4017 ret = 0; 4100 ret = 0;
@@ -4022,7 +4105,7 @@ out:
4022 return ret; 4105 return ret;
4023} 4106}
4024 4107
4025static void ocfs2_split_record(struct inode *inode, 4108static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4026 struct ocfs2_path *left_path, 4109 struct ocfs2_path *left_path,
4027 struct ocfs2_path *right_path, 4110 struct ocfs2_path *right_path,
4028 struct ocfs2_extent_rec *split_rec, 4111 struct ocfs2_extent_rec *split_rec,
@@ -4095,7 +4178,8 @@ static void ocfs2_split_record(struct inode *inode,
4095 } 4178 }
4096 4179
4097 rec = &el->l_recs[index]; 4180 rec = &el->l_recs[index];
4098 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); 4181 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4182 split, rec, split_rec);
4099 ocfs2_rotate_leaf(insert_el, split_rec); 4183 ocfs2_rotate_leaf(insert_el, split_rec);
4100} 4184}
4101 4185
@@ -4107,8 +4191,8 @@ static void ocfs2_split_record(struct inode *inode,
4107 * in. left_path should only be passed in if we need to update that 4191 * in. left_path should only be passed in if we need to update that
4108 * portion of the tree after an edge insert. 4192 * portion of the tree after an edge insert.
4109 */ 4193 */
4110static int ocfs2_insert_path(struct inode *inode, 4194static int ocfs2_insert_path(handle_t *handle,
4111 handle_t *handle, 4195 struct ocfs2_extent_tree *et,
4112 struct ocfs2_path *left_path, 4196 struct ocfs2_path *left_path,
4113 struct ocfs2_path *right_path, 4197 struct ocfs2_path *right_path,
4114 struct ocfs2_extent_rec *insert_rec, 4198 struct ocfs2_extent_rec *insert_rec,
@@ -4134,7 +4218,7 @@ static int ocfs2_insert_path(struct inode *inode,
4134 goto out; 4218 goto out;
4135 } 4219 }
4136 4220
4137 ret = ocfs2_journal_access_path(inode, handle, left_path); 4221 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4138 if (ret < 0) { 4222 if (ret < 0) {
4139 mlog_errno(ret); 4223 mlog_errno(ret);
4140 goto out; 4224 goto out;
@@ -4145,7 +4229,7 @@ static int ocfs2_insert_path(struct inode *inode,
4145 * Pass both paths to the journal. The majority of inserts 4229 * Pass both paths to the journal. The majority of inserts
4146 * will be touching all components anyway. 4230 * will be touching all components anyway.
4147 */ 4231 */
4148 ret = ocfs2_journal_access_path(inode, handle, right_path); 4232 ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4149 if (ret < 0) { 4233 if (ret < 0) {
4150 mlog_errno(ret); 4234 mlog_errno(ret);
4151 goto out; 4235 goto out;
@@ -4157,7 +4241,7 @@ static int ocfs2_insert_path(struct inode *inode,
4157 * of splits, but it's easier to just let one separate 4241 * of splits, but it's easier to just let one separate
4158 * function sort it all out. 4242 * function sort it all out.
4159 */ 4243 */
4160 ocfs2_split_record(inode, left_path, right_path, 4244 ocfs2_split_record(et, left_path, right_path,
4161 insert_rec, insert->ins_split); 4245 insert_rec, insert->ins_split);
4162 4246
4163 /* 4247 /*
@@ -4171,8 +4255,8 @@ static int ocfs2_insert_path(struct inode *inode,
4171 if (ret) 4255 if (ret)
4172 mlog_errno(ret); 4256 mlog_errno(ret);
4173 } else 4257 } else
4174 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), 4258 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4175 insert, inode); 4259 insert);
4176 4260
4177 ret = ocfs2_journal_dirty(handle, leaf_bh); 4261 ret = ocfs2_journal_dirty(handle, leaf_bh);
4178 if (ret) 4262 if (ret)
@@ -4185,10 +4269,10 @@ static int ocfs2_insert_path(struct inode *inode,
4185 * 4269 *
4186 * XXX: Should we extend the transaction here? 4270 * XXX: Should we extend the transaction here?
4187 */ 4271 */
4188 subtree_index = ocfs2_find_subtree_root(inode, left_path, 4272 subtree_index = ocfs2_find_subtree_root(et, left_path,
4189 right_path); 4273 right_path);
4190 ocfs2_complete_edge_insert(inode, handle, left_path, 4274 ocfs2_complete_edge_insert(handle, left_path, right_path,
4191 right_path, subtree_index); 4275 subtree_index);
4192 } 4276 }
4193 4277
4194 ret = 0; 4278 ret = 0;
@@ -4196,8 +4280,7 @@ out:
4196 return ret; 4280 return ret;
4197} 4281}
4198 4282
4199static int ocfs2_do_insert_extent(struct inode *inode, 4283static int ocfs2_do_insert_extent(handle_t *handle,
4200 handle_t *handle,
4201 struct ocfs2_extent_tree *et, 4284 struct ocfs2_extent_tree *et,
4202 struct ocfs2_extent_rec *insert_rec, 4285 struct ocfs2_extent_rec *insert_rec,
4203 struct ocfs2_insert_type *type) 4286 struct ocfs2_insert_type *type)
@@ -4210,7 +4293,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4210 4293
4211 el = et->et_root_el; 4294 el = et->et_root_el;
4212 4295
4213 ret = ocfs2_et_root_journal_access(handle, inode, et, 4296 ret = ocfs2_et_root_journal_access(handle, et,
4214 OCFS2_JOURNAL_ACCESS_WRITE); 4297 OCFS2_JOURNAL_ACCESS_WRITE);
4215 if (ret) { 4298 if (ret) {
4216 mlog_errno(ret); 4299 mlog_errno(ret);
@@ -4218,7 +4301,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4218 } 4301 }
4219 4302
4220 if (le16_to_cpu(el->l_tree_depth) == 0) { 4303 if (le16_to_cpu(el->l_tree_depth) == 0) {
4221 ocfs2_insert_at_leaf(insert_rec, el, type, inode); 4304 ocfs2_insert_at_leaf(et, insert_rec, el, type);
4222 goto out_update_clusters; 4305 goto out_update_clusters;
4223 } 4306 }
4224 4307
@@ -4241,7 +4324,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4241 cpos = UINT_MAX; 4324 cpos = UINT_MAX;
4242 } 4325 }
4243 4326
4244 ret = ocfs2_find_path(inode, right_path, cpos); 4327 ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4245 if (ret) { 4328 if (ret) {
4246 mlog_errno(ret); 4329 mlog_errno(ret);
4247 goto out; 4330 goto out;
@@ -4260,7 +4343,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4260 * can wind up skipping both of these two special cases... 4343 * can wind up skipping both of these two special cases...
4261 */ 4344 */
4262 if (rotate) { 4345 if (rotate) {
4263 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, 4346 ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4264 le32_to_cpu(insert_rec->e_cpos), 4347 le32_to_cpu(insert_rec->e_cpos),
4265 right_path, &left_path); 4348 right_path, &left_path);
4266 if (ret) { 4349 if (ret) {
@@ -4272,7 +4355,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4272 * ocfs2_rotate_tree_right() might have extended the 4355 * ocfs2_rotate_tree_right() might have extended the
4273 * transaction without re-journaling our tree root. 4356 * transaction without re-journaling our tree root.
4274 */ 4357 */
4275 ret = ocfs2_et_root_journal_access(handle, inode, et, 4358 ret = ocfs2_et_root_journal_access(handle, et,
4276 OCFS2_JOURNAL_ACCESS_WRITE); 4359 OCFS2_JOURNAL_ACCESS_WRITE);
4277 if (ret) { 4360 if (ret) {
4278 mlog_errno(ret); 4361 mlog_errno(ret);
@@ -4280,7 +4363,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4280 } 4363 }
4281 } else if (type->ins_appending == APPEND_TAIL 4364 } else if (type->ins_appending == APPEND_TAIL
4282 && type->ins_contig != CONTIG_LEFT) { 4365 && type->ins_contig != CONTIG_LEFT) {
4283 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 4366 ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4284 right_path, &left_path); 4367 right_path, &left_path);
4285 if (ret) { 4368 if (ret) {
4286 mlog_errno(ret); 4369 mlog_errno(ret);
@@ -4288,7 +4371,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4288 } 4371 }
4289 } 4372 }
4290 4373
4291 ret = ocfs2_insert_path(inode, handle, left_path, right_path, 4374 ret = ocfs2_insert_path(handle, et, left_path, right_path,
4292 insert_rec, type); 4375 insert_rec, type);
4293 if (ret) { 4376 if (ret) {
4294 mlog_errno(ret); 4377 mlog_errno(ret);
@@ -4297,7 +4380,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4297 4380
4298out_update_clusters: 4381out_update_clusters:
4299 if (type->ins_split == SPLIT_NONE) 4382 if (type->ins_split == SPLIT_NONE)
4300 ocfs2_et_update_clusters(inode, et, 4383 ocfs2_et_update_clusters(et,
4301 le16_to_cpu(insert_rec->e_leaf_clusters)); 4384 le16_to_cpu(insert_rec->e_leaf_clusters));
4302 4385
4303 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4386 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4312,7 +4395,8 @@ out:
4312} 4395}
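
Condensing the dispatch in ocfs2_do_insert_extent() under its new signature (journalling and error paths elided; every call as it appears in the hunks above):

	if (le16_to_cpu(el->l_tree_depth) == 0) {
		/* Root is still a leaf: insert in place. */
		ocfs2_insert_at_leaf(et, insert_rec, el, type);
		goto out_update_clusters;
	}

	if (rotate)
		/* Non-append insert: rotate records over first. */
		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
					      le32_to_cpu(insert_rec->e_cpos),
					      right_path, &left_path);
	else if (type->ins_appending == APPEND_TAIL &&
		 type->ins_contig != CONTIG_LEFT)
		/* Pure tail append: prepare the rightmost edge. */
		ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
					       right_path, &left_path);

	ret = ocfs2_insert_path(handle, et, left_path, right_path,
				insert_rec, type);
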
4313 4396
4314static enum ocfs2_contig_type 4397static enum ocfs2_contig_type
4315ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, 4398ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4399 struct ocfs2_path *path,
4316 struct ocfs2_extent_list *el, int index, 4400 struct ocfs2_extent_list *el, int index,
4317 struct ocfs2_extent_rec *split_rec) 4401 struct ocfs2_extent_rec *split_rec)
4318{ 4402{
@@ -4324,12 +4408,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4324 struct ocfs2_path *left_path = NULL, *right_path = NULL; 4408 struct ocfs2_path *left_path = NULL, *right_path = NULL;
4325 struct buffer_head *bh; 4409 struct buffer_head *bh;
4326 struct ocfs2_extent_block *eb; 4410 struct ocfs2_extent_block *eb;
4411 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4327 4412
4328 if (index > 0) { 4413 if (index > 0) {
4329 rec = &el->l_recs[index - 1]; 4414 rec = &el->l_recs[index - 1];
4330 } else if (path->p_tree_depth > 0) { 4415 } else if (path->p_tree_depth > 0) {
4331 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 4416 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4332 path, &left_cpos);
4333 if (status) 4417 if (status)
4334 goto out; 4418 goto out;
4335 4419
@@ -4338,7 +4422,8 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4338 if (!left_path) 4422 if (!left_path)
4339 goto out; 4423 goto out;
4340 4424
4341 status = ocfs2_find_path(inode, left_path, left_cpos); 4425 status = ocfs2_find_path(et->et_ci, left_path,
4426 left_cpos);
4342 if (status) 4427 if (status)
4343 goto out; 4428 goto out;
4344 4429
@@ -4348,7 +4433,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4348 le16_to_cpu(new_el->l_count)) { 4433 le16_to_cpu(new_el->l_count)) {
4349 bh = path_leaf_bh(left_path); 4434 bh = path_leaf_bh(left_path);
4350 eb = (struct ocfs2_extent_block *)bh->b_data; 4435 eb = (struct ocfs2_extent_block *)bh->b_data;
4351 ocfs2_error(inode->i_sb, 4436 ocfs2_error(sb,
4352 "Extent block #%llu has an " 4437 "Extent block #%llu has an "
4353 "invalid l_next_free_rec of " 4438 "invalid l_next_free_rec of "
4354 "%d. It should have " 4439 "%d. It should have "
@@ -4373,7 +4458,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4373 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 4458 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4374 ret = CONTIG_RIGHT; 4459 ret = CONTIG_RIGHT;
4375 } else { 4460 } else {
4376 ret = ocfs2_extent_contig(inode, rec, split_rec); 4461 ret = ocfs2_et_extent_contig(et, rec, split_rec);
4377 } 4462 }
4378 } 4463 }
4379 4464
@@ -4382,8 +4467,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4382 rec = &el->l_recs[index + 1]; 4467 rec = &el->l_recs[index + 1];
4383 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && 4468 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4384 path->p_tree_depth > 0) { 4469 path->p_tree_depth > 0) {
4385 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, 4470 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4386 path, &right_cpos);
4387 if (status) 4471 if (status)
4388 goto out; 4472 goto out;
4389 4473
@@ -4394,7 +4478,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4394 if (!right_path) 4478 if (!right_path)
4395 goto out; 4479 goto out;
4396 4480
4397 status = ocfs2_find_path(inode, right_path, right_cpos); 4481 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4398 if (status) 4482 if (status)
4399 goto out; 4483 goto out;
4400 4484
@@ -4404,7 +4488,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4404 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4488 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4405 bh = path_leaf_bh(right_path); 4489 bh = path_leaf_bh(right_path);
4406 eb = (struct ocfs2_extent_block *)bh->b_data; 4490 eb = (struct ocfs2_extent_block *)bh->b_data;
4407 ocfs2_error(inode->i_sb, 4491 ocfs2_error(sb,
4408 "Extent block #%llu has an " 4492 "Extent block #%llu has an "
4409 "invalid l_next_free_rec of %d", 4493 "invalid l_next_free_rec of %d",
4410 (unsigned long long)le64_to_cpu(eb->h_blkno), 4494 (unsigned long long)le64_to_cpu(eb->h_blkno),
@@ -4419,7 +4503,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4419 if (rec) { 4503 if (rec) {
4420 enum ocfs2_contig_type contig_type; 4504 enum ocfs2_contig_type contig_type;
4421 4505
4422 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 4506 contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4423 4507
4424 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 4508 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4425 ret = CONTIG_LEFTRIGHT; 4509 ret = CONTIG_LEFTRIGHT;
@@ -4436,11 +4520,10 @@ out:
4436 return ret; 4520 return ret;
4437} 4521}
4438 4522
4439static void ocfs2_figure_contig_type(struct inode *inode, 4523static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4440 struct ocfs2_insert_type *insert, 4524 struct ocfs2_insert_type *insert,
4441 struct ocfs2_extent_list *el, 4525 struct ocfs2_extent_list *el,
4442 struct ocfs2_extent_rec *insert_rec, 4526 struct ocfs2_extent_rec *insert_rec)
4443 struct ocfs2_extent_tree *et)
4444{ 4527{
4445 int i; 4528 int i;
4446 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4529 enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -4448,8 +4531,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
4448 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4531 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4449 4532
4450 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 4533 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4451 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 4534 contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4452 insert_rec); 4535 insert_rec);
4453 if (contig_type != CONTIG_NONE) { 4536 if (contig_type != CONTIG_NONE) {
4454 insert->ins_contig_index = i; 4537 insert->ins_contig_index = i;
4455 break; 4538 break;
@@ -4530,8 +4613,7 @@ set_tail_append:
4530 * All of the information is stored on the ocfs2_insert_type 4613 * All of the information is stored on the ocfs2_insert_type
4531 * structure. 4614 * structure.
4532 */ 4615 */
4533static int ocfs2_figure_insert_type(struct inode *inode, 4616static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4534 struct ocfs2_extent_tree *et,
4535 struct buffer_head **last_eb_bh, 4617 struct buffer_head **last_eb_bh,
4536 struct ocfs2_extent_rec *insert_rec, 4618 struct ocfs2_extent_rec *insert_rec,
4537 int *free_records, 4619 int *free_records,
@@ -4555,7 +4637,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4555 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4637 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4556 * may want it later. 4638 * may want it later.
4557 */ 4639 */
4558 ret = ocfs2_read_extent_block(inode, 4640 ret = ocfs2_read_extent_block(et->et_ci,
4559 ocfs2_et_get_last_eb_blk(et), 4641 ocfs2_et_get_last_eb_blk(et),
4560 &bh); 4642 &bh);
4561 if (ret) { 4643 if (ret) {
@@ -4578,7 +4660,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4578 le16_to_cpu(el->l_next_free_rec); 4660 le16_to_cpu(el->l_next_free_rec);
4579 4661
4580 if (!insert->ins_tree_depth) { 4662 if (!insert->ins_tree_depth) {
4581 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4663 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4582 ocfs2_figure_appending_type(insert, el, insert_rec); 4664 ocfs2_figure_appending_type(insert, el, insert_rec);
4583 return 0; 4665 return 0;
4584 } 4666 }
@@ -4596,7 +4678,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4596 * us the rightmost tree path. This is accounted for below in 4678 * us the rightmost tree path. This is accounted for below in
4597 * the appending code. 4679 * the appending code.
4598 */ 4680 */
4599 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 4681 ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4600 if (ret) { 4682 if (ret) {
4601 mlog_errno(ret); 4683 mlog_errno(ret);
4602 goto out; 4684 goto out;
@@ -4612,7 +4694,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4612 * into two types of appends: simple record append, or a 4694 * into two types of appends: simple record append, or a
4613 * rotate inside the tail leaf. 4695 * rotate inside the tail leaf.
4614 */ 4696 */
4615 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4697 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4616 4698
4617 /* 4699 /*
4618 * The insert code isn't quite ready to deal with all cases of 4700 * The insert code isn't quite ready to deal with all cases of
@@ -4657,13 +4739,11 @@ out:
4657} 4739}
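
ocfs2_figure_insert_type() fills in the decision record that drives the whole insert path. For reference, the structure it populates, defined earlier in alloc.c (the field comments are a summary added here, not the original ones):

	struct ocfs2_insert_type {
		enum ocfs2_split_type	ins_split;	/* SPLIT_NONE for plain inserts */
		enum ocfs2_append_type	ins_appending;	/* APPEND_TAIL = past-the-end */
		enum ocfs2_contig_type	ins_contig;	/* NONE/LEFT/RIGHT/LEFTRIGHT */
		int			ins_contig_index; /* leaf rec to coalesce with */
		int			ins_tree_depth;	/* cached l_tree_depth */
	};
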
4658 4740
4659/* 4741/*
4660 * Insert an extent into an inode btree. 4742 * Insert an extent into a btree.
4661 * 4743 *
4662 * The caller needs to update fe->i_clusters 4744 * The caller needs to update the owning btree's cluster count.
4663 */ 4745 */
4664int ocfs2_insert_extent(struct ocfs2_super *osb, 4746int ocfs2_insert_extent(handle_t *handle,
4665 handle_t *handle,
4666 struct inode *inode,
4667 struct ocfs2_extent_tree *et, 4747 struct ocfs2_extent_tree *et,
4668 u32 cpos, 4748 u32 cpos,
4669 u64 start_blk, 4749 u64 start_blk,
@@ -4677,21 +4757,22 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4677 struct ocfs2_insert_type insert = {0, }; 4757 struct ocfs2_insert_type insert = {0, };
4678 struct ocfs2_extent_rec rec; 4758 struct ocfs2_extent_rec rec;
4679 4759
4680 mlog(0, "add %u clusters at position %u to inode %llu\n", 4760 mlog(0, "add %u clusters at position %u to owner %llu\n",
4681 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4761 new_clusters, cpos,
4762 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4682 4763
4683 memset(&rec, 0, sizeof(rec)); 4764 memset(&rec, 0, sizeof(rec));
4684 rec.e_cpos = cpu_to_le32(cpos); 4765 rec.e_cpos = cpu_to_le32(cpos);
4685 rec.e_blkno = cpu_to_le64(start_blk); 4766 rec.e_blkno = cpu_to_le64(start_blk);
4686 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4767 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4687 rec.e_flags = flags; 4768 rec.e_flags = flags;
4688 status = ocfs2_et_insert_check(inode, et, &rec); 4769 status = ocfs2_et_insert_check(et, &rec);
4689 if (status) { 4770 if (status) {
4690 mlog_errno(status); 4771 mlog_errno(status);
4691 goto bail; 4772 goto bail;
4692 } 4773 }
4693 4774
4694 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, 4775 status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4695 &free_records, &insert); 4776 &free_records, &insert);
4696 if (status < 0) { 4777 if (status < 0) {
4697 mlog_errno(status); 4778 mlog_errno(status);
@@ -4705,7 +4786,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4705 free_records, insert.ins_tree_depth); 4786 free_records, insert.ins_tree_depth);
4706 4787
4707 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4788 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4708 status = ocfs2_grow_tree(inode, handle, et, 4789 status = ocfs2_grow_tree(handle, et,
4709 &insert.ins_tree_depth, &last_eb_bh, 4790 &insert.ins_tree_depth, &last_eb_bh,
4710 meta_ac); 4791 meta_ac);
4711 if (status) { 4792 if (status) {
@@ -4715,11 +4796,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4715 } 4796 }
4716 4797
4717 /* Finally, we can add clusters. This might rotate the tree for us. */ 4798 /* Finally, we can add clusters. This might rotate the tree for us. */
4718 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); 4799 status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4719 if (status < 0) 4800 if (status < 0)
4720 mlog_errno(status); 4801 mlog_errno(status);
4721 else if (et->et_ops == &ocfs2_dinode_et_ops) 4802 else
4722 ocfs2_extent_map_insert_rec(inode, &rec); 4803 ocfs2_et_extent_map_insert(et, &rec);
4723 4804
4724bail: 4805bail:
4725 brelse(last_eb_bh); 4806 brelse(last_eb_bh);
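
With osb and the inode gone from the prototype, a caller needs only the handle and an initialized tree. A hypothetical call site under the new signature (the ocfs2_init_dinode_extent_tree()/INODE_CACHE() initialization is assumed from the rest of this series; any other tree type would look the same):

	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);

	status = ocfs2_insert_extent(handle, &et, cpos, start_blk,
				     new_clusters, flags, meta_ac);
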
@@ -4735,13 +4816,11 @@ bail:
4735 * it is not limited to the file storage. Any extent tree can use this 4816 * it is not limited to the file storage. Any extent tree can use this
4736 * function if it implements the proper ocfs2_extent_tree. 4817 * function if it implements the proper ocfs2_extent_tree.
4737 */ 4818 */
4738int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 4819int ocfs2_add_clusters_in_btree(handle_t *handle,
4739 struct inode *inode, 4820 struct ocfs2_extent_tree *et,
4740 u32 *logical_offset, 4821 u32 *logical_offset,
4741 u32 clusters_to_add, 4822 u32 clusters_to_add,
4742 int mark_unwritten, 4823 int mark_unwritten,
4743 struct ocfs2_extent_tree *et,
4744 handle_t *handle,
4745 struct ocfs2_alloc_context *data_ac, 4824 struct ocfs2_alloc_context *data_ac,
4746 struct ocfs2_alloc_context *meta_ac, 4825 struct ocfs2_alloc_context *meta_ac,
4747 enum ocfs2_alloc_restarted *reason_ret) 4826 enum ocfs2_alloc_restarted *reason_ret)
@@ -4752,13 +4831,15 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4752 u32 bit_off, num_bits; 4831 u32 bit_off, num_bits;
4753 u64 block; 4832 u64 block;
4754 u8 flags = 0; 4833 u8 flags = 0;
4834 struct ocfs2_super *osb =
4835 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4755 4836
4756 BUG_ON(!clusters_to_add); 4837 BUG_ON(!clusters_to_add);
4757 4838
4758 if (mark_unwritten) 4839 if (mark_unwritten)
4759 flags = OCFS2_EXT_UNWRITTEN; 4840 flags = OCFS2_EXT_UNWRITTEN;
4760 4841
4761 free_extents = ocfs2_num_free_extents(osb, inode, et); 4842 free_extents = ocfs2_num_free_extents(osb, et);
4762 if (free_extents < 0) { 4843 if (free_extents < 0) {
4763 status = free_extents; 4844 status = free_extents;
4764 mlog_errno(status); 4845 mlog_errno(status);
@@ -4795,7 +4876,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4795 BUG_ON(num_bits > clusters_to_add); 4876 BUG_ON(num_bits > clusters_to_add);
4796 4877
4797 /* reserve our write early -- insert_extent may update the tree root */ 4878 /* reserve our write early -- insert_extent may update the tree root */
4798 status = ocfs2_et_root_journal_access(handle, inode, et, 4879 status = ocfs2_et_root_journal_access(handle, et,
4799 OCFS2_JOURNAL_ACCESS_WRITE); 4880 OCFS2_JOURNAL_ACCESS_WRITE);
4800 if (status < 0) { 4881 if (status < 0) {
4801 mlog_errno(status); 4882 mlog_errno(status);
@@ -4803,10 +4884,10 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4803 } 4884 }
4804 4885
4805 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4886 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4806 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 4887 mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
4807 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4888 num_bits, bit_off,
4808 status = ocfs2_insert_extent(osb, handle, inode, et, 4889 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4809 *logical_offset, block, 4890 status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4810 num_bits, flags, meta_ac); 4891 num_bits, flags, meta_ac);
4811 if (status < 0) { 4892 if (status < 0) {
4812 mlog_errno(status); 4893 mlog_errno(status);
@@ -4856,10 +4937,9 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
4856 split_rec->e_flags = rec->e_flags; 4937 split_rec->e_flags = rec->e_flags;
4857} 4938}
4858 4939
4859static int ocfs2_split_and_insert(struct inode *inode, 4940static int ocfs2_split_and_insert(handle_t *handle,
4860 handle_t *handle,
4861 struct ocfs2_path *path,
4862 struct ocfs2_extent_tree *et, 4941 struct ocfs2_extent_tree *et,
4942 struct ocfs2_path *path,
4863 struct buffer_head **last_eb_bh, 4943 struct buffer_head **last_eb_bh,
4864 int split_index, 4944 int split_index,
4865 struct ocfs2_extent_rec *orig_split_rec, 4945 struct ocfs2_extent_rec *orig_split_rec,
@@ -4892,7 +4972,7 @@ leftright:
4892 4972
4893 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4973 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4894 le16_to_cpu(rightmost_el->l_count)) { 4974 le16_to_cpu(rightmost_el->l_count)) {
4895 ret = ocfs2_grow_tree(inode, handle, et, 4975 ret = ocfs2_grow_tree(handle, et,
4896 &depth, last_eb_bh, meta_ac); 4976 &depth, last_eb_bh, meta_ac);
4897 if (ret) { 4977 if (ret) {
4898 mlog_errno(ret); 4978 mlog_errno(ret);
@@ -4921,8 +5001,8 @@ leftright:
4921 */ 5001 */
4922 insert.ins_split = SPLIT_RIGHT; 5002 insert.ins_split = SPLIT_RIGHT;
4923 5003
4924 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, 5004 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4925 &rec); 5005 &tmprec, insert_range, &rec);
4926 5006
4927 split_rec = tmprec; 5007 split_rec = tmprec;
4928 5008
@@ -4930,7 +5010,7 @@ leftright:
4930 do_leftright = 1; 5010 do_leftright = 1;
4931 } 5011 }
4932 5012
4933 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 5013 ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4934 if (ret) { 5014 if (ret) {
4935 mlog_errno(ret); 5015 mlog_errno(ret);
4936 goto out; 5016 goto out;
@@ -4946,7 +5026,7 @@ leftright:
4946 ocfs2_reinit_path(path, 1); 5026 ocfs2_reinit_path(path, 1);
4947 5027
4948 cpos = le32_to_cpu(split_rec.e_cpos); 5028 cpos = le32_to_cpu(split_rec.e_cpos);
4949 ret = ocfs2_find_path(inode, path, cpos); 5029 ret = ocfs2_find_path(et->et_ci, path, cpos);
4950 if (ret) { 5030 if (ret) {
4951 mlog_errno(ret); 5031 mlog_errno(ret);
4952 goto out; 5032 goto out;
@@ -4961,8 +5041,8 @@ out:
4961 return ret; 5041 return ret;
4962} 5042}
4963 5043
4964static int ocfs2_replace_extent_rec(struct inode *inode, 5044static int ocfs2_replace_extent_rec(handle_t *handle,
4965 handle_t *handle, 5045 struct ocfs2_extent_tree *et,
4966 struct ocfs2_path *path, 5046 struct ocfs2_path *path,
4967 struct ocfs2_extent_list *el, 5047 struct ocfs2_extent_list *el,
4968 int split_index, 5048 int split_index,
@@ -4970,7 +5050,7 @@ static int ocfs2_replace_extent_rec(struct inode *inode,
4970{ 5050{
4971 int ret; 5051 int ret;
4972 5052
4973 ret = ocfs2_path_bh_journal_access(handle, inode, path, 5053 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
4974 path_num_items(path) - 1); 5054 path_num_items(path) - 1);
4975 if (ret) { 5055 if (ret) {
4976 mlog_errno(ret); 5056 mlog_errno(ret);
@@ -4985,9 +5065,8 @@ out:
4985} 5065}
4986 5066
4987/* 5067/*
4988 * Mark part or all of the extent record at split_index in the leaf 5068 * Split part or all of the extent record at split_index in the leaf
4989 * pointed to by path as written. This removes the unwritten 5069 * pointed to by path. Merge with the contiguous extent record if needed.
4990 * extent flag.
4991 * 5070 *
4992 * Care is taken to handle contiguousness so as to not grow the tree. 5071 * Care is taken to handle contiguousness so as to not grow the tree.
4993 * 5072 *
@@ -5004,14 +5083,13 @@ out:
5004 * have been brought into cache (and pinned via the journal), so the 5083 * have been brought into cache (and pinned via the journal), so the
5005 * extra overhead is not expressed in terms of disk reads. 5084 * extra overhead is not expressed in terms of disk reads.
5006 */ 5085 */
5007static int __ocfs2_mark_extent_written(struct inode *inode, 5086int ocfs2_split_extent(handle_t *handle,
5008 struct ocfs2_extent_tree *et, 5087 struct ocfs2_extent_tree *et,
5009 handle_t *handle, 5088 struct ocfs2_path *path,
5010 struct ocfs2_path *path, 5089 int split_index,
5011 int split_index, 5090 struct ocfs2_extent_rec *split_rec,
5012 struct ocfs2_extent_rec *split_rec, 5091 struct ocfs2_alloc_context *meta_ac,
5013 struct ocfs2_alloc_context *meta_ac, 5092 struct ocfs2_cached_dealloc_ctxt *dealloc)
5014 struct ocfs2_cached_dealloc_ctxt *dealloc)
5015{ 5093{
5016 int ret = 0; 5094 int ret = 0;
5017 struct ocfs2_extent_list *el = path_leaf_el(path); 5095 struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5020,12 +5098,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5020 struct ocfs2_merge_ctxt ctxt; 5098 struct ocfs2_merge_ctxt ctxt;
5021 struct ocfs2_extent_list *rightmost_el; 5099 struct ocfs2_extent_list *rightmost_el;
5022 5100
5023 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
5024 ret = -EIO;
5025 mlog_errno(ret);
5026 goto out;
5027 }
5028
5029 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || 5101 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5030 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < 5102 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5031 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { 5103 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
@@ -5034,19 +5106,19 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5034 goto out; 5106 goto out;
5035 } 5107 }
5036 5108
5037 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, 5109 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5038 split_index, 5110 split_index,
5039 split_rec); 5111 split_rec);
5040 5112
5041 /* 5113 /*
5042 * The core merge / split code wants to know how much room is 5114 * The core merge / split code wants to know how much room is
5043 * left in this inodes allocation tree, so we pass the 5115 * left in this allocation tree, so we pass the
5044 * rightmost extent list. 5116 * rightmost extent list.
5045 */ 5117 */
5046 if (path->p_tree_depth) { 5118 if (path->p_tree_depth) {
5047 struct ocfs2_extent_block *eb; 5119 struct ocfs2_extent_block *eb;
5048 5120
5049 ret = ocfs2_read_extent_block(inode, 5121 ret = ocfs2_read_extent_block(et->et_ci,
5050 ocfs2_et_get_last_eb_blk(et), 5122 ocfs2_et_get_last_eb_blk(et),
5051 &last_eb_bh); 5123 &last_eb_bh);
5052 if (ret) { 5124 if (ret) {
@@ -5073,19 +5145,18 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5073 5145
5074 if (ctxt.c_contig_type == CONTIG_NONE) { 5146 if (ctxt.c_contig_type == CONTIG_NONE) {
5075 if (ctxt.c_split_covers_rec) 5147 if (ctxt.c_split_covers_rec)
5076 ret = ocfs2_replace_extent_rec(inode, handle, 5148 ret = ocfs2_replace_extent_rec(handle, et, path, el,
5077 path, el,
5078 split_index, split_rec); 5149 split_index, split_rec);
5079 else 5150 else
5080 ret = ocfs2_split_and_insert(inode, handle, path, et, 5151 ret = ocfs2_split_and_insert(handle, et, path,
5081 &last_eb_bh, split_index, 5152 &last_eb_bh, split_index,
5082 split_rec, meta_ac); 5153 split_rec, meta_ac);
5083 if (ret) 5154 if (ret)
5084 mlog_errno(ret); 5155 mlog_errno(ret);
5085 } else { 5156 } else {
5086 ret = ocfs2_try_to_merge_extent(inode, handle, path, 5157 ret = ocfs2_try_to_merge_extent(handle, et, path,
5087 split_index, split_rec, 5158 split_index, split_rec,
5088 dealloc, &ctxt, et); 5159 dealloc, &ctxt);
5089 if (ret) 5160 if (ret)
5090 mlog_errno(ret); 5161 mlog_errno(ret);
5091 } 5162 }
@@ -5096,46 +5167,31 @@ out:
5096} 5167}
5097 5168
5098/* 5169/*
5099 * Mark the already-existing extent at cpos as written for len clusters. 5170 * Change the flags of the already-existing extent at cpos for len clusters.
5171 *
5172 * new_flags: the flags we want to set.
5173 * clear_flags: the flags we want to clear.
 5174 * phys: the new physical offset we want this new extent to start from.
5100 * 5175 *
5101 * If the existing extent is larger than the request, initiate a 5176 * If the existing extent is larger than the request, initiate a
5102 * split. An attempt will be made at merging with adjacent extents. 5177 * split. An attempt will be made at merging with adjacent extents.
5103 * 5178 *
5104 * The caller is responsible for passing down meta_ac if we'll need it. 5179 * The caller is responsible for passing down meta_ac if we'll need it.
5105 */ 5180 */
5106int ocfs2_mark_extent_written(struct inode *inode, 5181int ocfs2_change_extent_flag(handle_t *handle,
5107 struct ocfs2_extent_tree *et, 5182 struct ocfs2_extent_tree *et,
5108 handle_t *handle, u32 cpos, u32 len, u32 phys, 5183 u32 cpos, u32 len, u32 phys,
5109 struct ocfs2_alloc_context *meta_ac, 5184 struct ocfs2_alloc_context *meta_ac,
5110 struct ocfs2_cached_dealloc_ctxt *dealloc) 5185 struct ocfs2_cached_dealloc_ctxt *dealloc,
5186 int new_flags, int clear_flags)
5111{ 5187{
5112 int ret, index; 5188 int ret, index;
5113 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); 5189 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5190 u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5114 struct ocfs2_extent_rec split_rec; 5191 struct ocfs2_extent_rec split_rec;
5115 struct ocfs2_path *left_path = NULL; 5192 struct ocfs2_path *left_path = NULL;
5116 struct ocfs2_extent_list *el; 5193 struct ocfs2_extent_list *el;
5117 5194 struct ocfs2_extent_rec *rec;
5118 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
5119 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
5120
5121 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5122 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5123 "that are being written to, but the feature bit "
5124 "is not set in the super block.",
5125 (unsigned long long)OCFS2_I(inode)->ip_blkno);
5126 ret = -EROFS;
5127 goto out;
5128 }
5129
5130 /*
5131 * XXX: This should be fixed up so that we just re-insert the
5132 * next extent records.
5133 *
5134 * XXX: This is a hack on the extent tree, maybe it should be
5135 * an op?
5136 */
5137 if (et->et_ops == &ocfs2_dinode_et_ops)
5138 ocfs2_extent_map_trunc(inode, 0);
5139 5195
5140 left_path = ocfs2_new_path_from_et(et); 5196 left_path = ocfs2_new_path_from_et(et);
5141 if (!left_path) { 5197 if (!left_path) {
@@ -5144,7 +5200,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
5144 goto out; 5200 goto out;
5145 } 5201 }
5146 5202
5147 ret = ocfs2_find_path(inode, left_path, cpos); 5203 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5148 if (ret) { 5204 if (ret) {
5149 mlog_errno(ret); 5205 mlog_errno(ret);
5150 goto out; 5206 goto out;
@@ -5153,34 +5209,102 @@ int ocfs2_mark_extent_written(struct inode *inode,
5153 5209
5154 index = ocfs2_search_extent_list(el, cpos); 5210 index = ocfs2_search_extent_list(el, cpos);
5155 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5211 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5156 ocfs2_error(inode->i_sb, 5212 ocfs2_error(sb,
5157 "Inode %llu has an extent at cpos %u which can no " 5213 "Owner %llu has an extent at cpos %u which can no "
5158 "longer be found.\n", 5214 "longer be found.\n",
5159 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5215 (unsigned long long)
5216 ocfs2_metadata_cache_owner(et->et_ci), cpos);
5160 ret = -EROFS; 5217 ret = -EROFS;
5161 goto out; 5218 goto out;
5162 } 5219 }
5163 5220
5221 ret = -EIO;
5222 rec = &el->l_recs[index];
5223 if (new_flags && (rec->e_flags & new_flags)) {
5224 mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5225 "extent that already had them",
5226 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5227 new_flags);
5228 goto out;
5229 }
5230
5231 if (clear_flags && !(rec->e_flags & clear_flags)) {
5232 mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5233 "extent that didn't have them",
5234 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5235 clear_flags);
5236 goto out;
5237 }
5238
5164 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); 5239 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5165 split_rec.e_cpos = cpu_to_le32(cpos); 5240 split_rec.e_cpos = cpu_to_le32(cpos);
5166 split_rec.e_leaf_clusters = cpu_to_le16(len); 5241 split_rec.e_leaf_clusters = cpu_to_le16(len);
5167 split_rec.e_blkno = cpu_to_le64(start_blkno); 5242 split_rec.e_blkno = cpu_to_le64(start_blkno);
5168 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 5243 split_rec.e_flags = rec->e_flags;
5169 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 5244 if (new_flags)
5170 5245 split_rec.e_flags |= new_flags;
5171 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, 5246 if (clear_flags)
5172 index, &split_rec, meta_ac, 5247 split_rec.e_flags &= ~clear_flags;
5173 dealloc); 5248
5249 ret = ocfs2_split_extent(handle, et, left_path,
5250 index, &split_rec, meta_ac,
5251 dealloc);
5174 if (ret) 5252 if (ret)
5175 mlog_errno(ret); 5253 mlog_errno(ret);
5176 5254
5177out: 5255out:
5178 ocfs2_free_path(left_path); 5256 ocfs2_free_path(left_path);
5179 return ret; 5257 return ret;
5258
5180} 5259}
5181 5260
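The two checks added above make ocfs2_change_extent_flag() fail fast on
no-op requests: setting a flag the extent already carries, or clearing
one it does not. A minimal userspace model of that check, compilable on
its own; the names and the flag value are illustrative stand-ins, not
taken from the patch:

#include <stdio.h>

#define EXT_UNWRITTEN 0x01 /* stand-in for OCFS2_EXT_UNWRITTEN */

/* Model of the sanity check in ocfs2_change_extent_flag(): reject
 * setting a present flag or clearing an absent one. */
static int check_flag_change(unsigned char e_flags,
                             int new_flags, int clear_flags)
{
        if (new_flags && (e_flags & new_flags))
                return -1;      /* flag already set */
        if (clear_flags && !(e_flags & clear_flags))
                return -1;      /* flag not set, nothing to clear */
        return 0;
}

int main(void)
{
        /* Clearing UNWRITTEN on an unwritten extent is valid... */
        printf("%d\n", check_flag_change(EXT_UNWRITTEN, 0, EXT_UNWRITTEN));
        /* ...setting it again is rejected, as in the code above. */
        printf("%d\n", check_flag_change(EXT_UNWRITTEN, EXT_UNWRITTEN, 0));
        return 0;
}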
5182static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, 5261/*
5183 handle_t *handle, struct ocfs2_path *path, 5262 * Mark the already-existing extent at cpos as written for len clusters.
5263 * This removes the unwritten extent flag.
5264 *
5265 * If the existing extent is larger than the request, initiate a
5266 * split. An attempt will be made at merging with adjacent extents.
5267 *
5268 * The caller is responsible for passing down meta_ac if we'll need it.
5269 */
5270int ocfs2_mark_extent_written(struct inode *inode,
5271 struct ocfs2_extent_tree *et,
5272 handle_t *handle, u32 cpos, u32 len, u32 phys,
5273 struct ocfs2_alloc_context *meta_ac,
5274 struct ocfs2_cached_dealloc_ctxt *dealloc)
5275{
5276 int ret;
5277
5278 mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
5279 inode->i_ino, cpos, len, phys);
5280
5281 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5282 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5283 "that are being written to, but the feature bit "
5284 "is not set in the super block.",
5285 (unsigned long long)OCFS2_I(inode)->ip_blkno);
5286 ret = -EROFS;
5287 goto out;
5288 }
5289
5290 /*
5291 * XXX: This should be fixed up so that we just re-insert the
5292 * next extent records.
5293 */
5294 ocfs2_et_extent_map_truncate(et, 0);
5295
5296 ret = ocfs2_change_extent_flag(handle, et, cpos,
5297 len, phys, meta_ac, dealloc,
5298 0, OCFS2_EXT_UNWRITTEN);
5299 if (ret)
5300 mlog_errno(ret);
5301
5302out:
5303 return ret;
5304}
5305
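With the split into a generic helper, "mark written" is just one flag
transition (new_flags = 0, clear_flags = OCFS2_EXT_UNWRITTEN), as the
wrapper above shows. Any other single-extent flag change can reuse the
same entry point; a hypothetical wrapper (not part of this patch) that
sets the OCFS2_EXT_REFCOUNTED bit would follow the prototype of
ocfs2_change_extent_flag() introduced here:

/* Hypothetical sketch only: set the refcounted bit on an existing
 * extent via the new generic helper. */
static int example_mark_extent_refcounted(handle_t *handle,
                        struct ocfs2_extent_tree *et,
                        u32 cpos, u32 len, u32 phys,
                        struct ocfs2_alloc_context *meta_ac,
                        struct ocfs2_cached_dealloc_ctxt *dealloc)
{
        return ocfs2_change_extent_flag(handle, et, cpos, len, phys,
                                        meta_ac, dealloc,
                                        OCFS2_EXT_REFCOUNTED, 0);
}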
5306static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5307 struct ocfs2_path *path,
5184 int index, u32 new_range, 5308 int index, u32 new_range,
5185 struct ocfs2_alloc_context *meta_ac) 5309 struct ocfs2_alloc_context *meta_ac)
5186{ 5310{
@@ -5197,11 +5321,12 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5197 */ 5321 */
5198 el = path_leaf_el(path); 5322 el = path_leaf_el(path);
5199 rec = &el->l_recs[index]; 5323 rec = &el->l_recs[index];
5200 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); 5324 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5325 &split_rec, new_range, rec);
5201 5326
5202 depth = path->p_tree_depth; 5327 depth = path->p_tree_depth;
5203 if (depth > 0) { 5328 if (depth > 0) {
5204 ret = ocfs2_read_extent_block(inode, 5329 ret = ocfs2_read_extent_block(et->et_ci,
5205 ocfs2_et_get_last_eb_blk(et), 5330 ocfs2_et_get_last_eb_blk(et),
5206 &last_eb_bh); 5331 &last_eb_bh);
5207 if (ret < 0) { 5332 if (ret < 0) {
@@ -5224,7 +5349,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5224 5349
5225 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 5350 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5226 le16_to_cpu(rightmost_el->l_count)) { 5351 le16_to_cpu(rightmost_el->l_count)) {
5227 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, 5352 ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5228 meta_ac); 5353 meta_ac);
5229 if (ret) { 5354 if (ret) {
5230 mlog_errno(ret); 5355 mlog_errno(ret);
@@ -5238,7 +5363,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5238 insert.ins_split = SPLIT_RIGHT; 5363 insert.ins_split = SPLIT_RIGHT;
5239 insert.ins_tree_depth = depth; 5364 insert.ins_tree_depth = depth;
5240 5365
5241 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 5366 ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5242 if (ret) 5367 if (ret)
5243 mlog_errno(ret); 5368 mlog_errno(ret);
5244 5369
@@ -5247,23 +5372,23 @@ out:
5247 return ret; 5372 return ret;
5248} 5373}
5249 5374
5250static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 5375static int ocfs2_truncate_rec(handle_t *handle,
5376 struct ocfs2_extent_tree *et,
5251 struct ocfs2_path *path, int index, 5377 struct ocfs2_path *path, int index,
5252 struct ocfs2_cached_dealloc_ctxt *dealloc, 5378 struct ocfs2_cached_dealloc_ctxt *dealloc,
5253 u32 cpos, u32 len, 5379 u32 cpos, u32 len)
5254 struct ocfs2_extent_tree *et)
5255{ 5380{
5256 int ret; 5381 int ret;
5257 u32 left_cpos, rec_range, trunc_range; 5382 u32 left_cpos, rec_range, trunc_range;
5258 int wants_rotate = 0, is_rightmost_tree_rec = 0; 5383 int wants_rotate = 0, is_rightmost_tree_rec = 0;
5259 struct super_block *sb = inode->i_sb; 5384 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5260 struct ocfs2_path *left_path = NULL; 5385 struct ocfs2_path *left_path = NULL;
5261 struct ocfs2_extent_list *el = path_leaf_el(path); 5386 struct ocfs2_extent_list *el = path_leaf_el(path);
5262 struct ocfs2_extent_rec *rec; 5387 struct ocfs2_extent_rec *rec;
5263 struct ocfs2_extent_block *eb; 5388 struct ocfs2_extent_block *eb;
5264 5389
5265 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5390 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5266 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5391 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5267 if (ret) { 5392 if (ret) {
5268 mlog_errno(ret); 5393 mlog_errno(ret);
5269 goto out; 5394 goto out;
@@ -5295,14 +5420,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 5295 * by this leaf and the one to its left. 5420 * by this leaf and the one to its left.
5296 * 5421 *
5297 * There are two cases we can skip: 5422 * There are two cases we can skip:
5298 * 1) Path is the leftmost one in our inode tree. 5423 * 1) Path is the leftmost one in our btree.
5299 * 2) The leaf is rightmost and will be empty after 5424 * 2) The leaf is rightmost and will be empty after
5300 * we remove the extent record - the rotate code 5425 * we remove the extent record - the rotate code
5301 * knows how to update the newly formed edge. 5426 * knows how to update the newly formed edge.
5302 */ 5427 */
5303 5428
5304 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, 5429 ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5305 &left_cpos);
5306 if (ret) { 5430 if (ret) {
5307 mlog_errno(ret); 5431 mlog_errno(ret);
5308 goto out; 5432 goto out;
@@ -5316,7 +5440,8 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5316 goto out; 5440 goto out;
5317 } 5441 }
5318 5442
5319 ret = ocfs2_find_path(inode, left_path, left_cpos); 5443 ret = ocfs2_find_path(et->et_ci, left_path,
5444 left_cpos);
5320 if (ret) { 5445 if (ret) {
5321 mlog_errno(ret); 5446 mlog_errno(ret);
5322 goto out; 5447 goto out;
@@ -5332,13 +5457,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5332 goto out; 5457 goto out;
5333 } 5458 }
5334 5459
5335 ret = ocfs2_journal_access_path(inode, handle, path); 5460 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5336 if (ret) { 5461 if (ret) {
5337 mlog_errno(ret); 5462 mlog_errno(ret);
5338 goto out; 5463 goto out;
5339 } 5464 }
5340 5465
5341 ret = ocfs2_journal_access_path(inode, handle, left_path); 5466 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5342 if (ret) { 5467 if (ret) {
5343 mlog_errno(ret); 5468 mlog_errno(ret);
5344 goto out; 5469 goto out;
@@ -5361,7 +5486,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5361 * be deleted by the rotate code. 5486 * be deleted by the rotate code.
5362 */ 5487 */
5363 rec = &el->l_recs[next_free - 1]; 5488 rec = &el->l_recs[next_free - 1];
5364 ocfs2_adjust_rightmost_records(inode, handle, path, 5489 ocfs2_adjust_rightmost_records(handle, et, path,
5365 rec); 5490 rec);
5366 } 5491 }
5367 } else if (le32_to_cpu(rec->e_cpos) == cpos) { 5492 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
@@ -5373,11 +5498,12 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5373 /* Remove rightmost portion of the record */ 5498 /* Remove rightmost portion of the record */
5374 le16_add_cpu(&rec->e_leaf_clusters, -len); 5499 le16_add_cpu(&rec->e_leaf_clusters, -len);
5375 if (is_rightmost_tree_rec) 5500 if (is_rightmost_tree_rec)
5376 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 5501 ocfs2_adjust_rightmost_records(handle, et, path, rec);
5377 } else { 5502 } else {
5378 /* Caller should have trapped this. */ 5503 /* Caller should have trapped this. */
5379 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " 5504 mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5380 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, 5505 "(%u, %u)\n",
5506 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5381 le32_to_cpu(rec->e_cpos), 5507 le32_to_cpu(rec->e_cpos),
5382 le16_to_cpu(rec->e_leaf_clusters), cpos, len); 5508 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5383 BUG(); 5509 BUG();
@@ -5386,14 +5512,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5386 if (left_path) { 5512 if (left_path) {
5387 int subtree_index; 5513 int subtree_index;
5388 5514
5389 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 5515 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5390 ocfs2_complete_edge_insert(inode, handle, left_path, path, 5516 ocfs2_complete_edge_insert(handle, left_path, path,
5391 subtree_index); 5517 subtree_index);
5392 } 5518 }
5393 5519
5394 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5520 ocfs2_journal_dirty(handle, path_leaf_bh(path));
5395 5521
5396 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5522 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5397 if (ret) { 5523 if (ret) {
5398 mlog_errno(ret); 5524 mlog_errno(ret);
5399 goto out; 5525 goto out;
@@ -5404,9 +5530,9 @@ out:
5404 return ret; 5530 return ret;
5405} 5531}
5406 5532
5407int ocfs2_remove_extent(struct inode *inode, 5533int ocfs2_remove_extent(handle_t *handle,
5408 struct ocfs2_extent_tree *et, 5534 struct ocfs2_extent_tree *et,
5409 u32 cpos, u32 len, handle_t *handle, 5535 u32 cpos, u32 len,
5410 struct ocfs2_alloc_context *meta_ac, 5536 struct ocfs2_alloc_context *meta_ac,
5411 struct ocfs2_cached_dealloc_ctxt *dealloc) 5537 struct ocfs2_cached_dealloc_ctxt *dealloc)
5412{ 5538{
@@ -5416,7 +5542,11 @@ int ocfs2_remove_extent(struct inode *inode,
5416 struct ocfs2_extent_list *el; 5542 struct ocfs2_extent_list *el;
5417 struct ocfs2_path *path = NULL; 5543 struct ocfs2_path *path = NULL;
5418 5544
5419 ocfs2_extent_map_trunc(inode, 0); 5545 /*
 5546 * XXX: Why are we truncating to 0 instead of from wherever this
 5547 * change affects us?
5548 */
5549 ocfs2_et_extent_map_truncate(et, 0);
5420 5550
5421 path = ocfs2_new_path_from_et(et); 5551 path = ocfs2_new_path_from_et(et);
5422 if (!path) { 5552 if (!path) {
@@ -5425,7 +5555,7 @@ int ocfs2_remove_extent(struct inode *inode,
5425 goto out; 5555 goto out;
5426 } 5556 }
5427 5557
5428 ret = ocfs2_find_path(inode, path, cpos); 5558 ret = ocfs2_find_path(et->et_ci, path, cpos);
5429 if (ret) { 5559 if (ret) {
5430 mlog_errno(ret); 5560 mlog_errno(ret);
5431 goto out; 5561 goto out;
@@ -5434,10 +5564,11 @@ int ocfs2_remove_extent(struct inode *inode,
5434 el = path_leaf_el(path); 5564 el = path_leaf_el(path);
5435 index = ocfs2_search_extent_list(el, cpos); 5565 index = ocfs2_search_extent_list(el, cpos);
5436 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5566 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5437 ocfs2_error(inode->i_sb, 5567 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5438 "Inode %llu has an extent at cpos %u which can no " 5568 "Owner %llu has an extent at cpos %u which can no "
5439 "longer be found.\n", 5569 "longer be found.\n",
5440 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5570 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5571 cpos);
5441 ret = -EROFS; 5572 ret = -EROFS;
5442 goto out; 5573 goto out;
5443 } 5574 }
@@ -5464,20 +5595,21 @@ int ocfs2_remove_extent(struct inode *inode,
5464 5595
5465 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5596 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5466 5597
5467 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " 5598 mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
5468 "(cpos %u, len %u)\n", 5599 "(cpos %u, len %u)\n",
5469 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, 5600 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5601 cpos, len, index,
5470 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); 5602 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5471 5603
5472 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5604 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5473 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5605 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5474 cpos, len, et); 5606 cpos, len);
5475 if (ret) { 5607 if (ret) {
5476 mlog_errno(ret); 5608 mlog_errno(ret);
5477 goto out; 5609 goto out;
5478 } 5610 }
5479 } else { 5611 } else {
5480 ret = ocfs2_split_tree(inode, et, handle, path, index, 5612 ret = ocfs2_split_tree(handle, et, path, index,
5481 trunc_range, meta_ac); 5613 trunc_range, meta_ac);
5482 if (ret) { 5614 if (ret) {
5483 mlog_errno(ret); 5615 mlog_errno(ret);
@@ -5490,7 +5622,7 @@ int ocfs2_remove_extent(struct inode *inode,
5490 */ 5622 */
5491 ocfs2_reinit_path(path, 1); 5623 ocfs2_reinit_path(path, 1);
5492 5624
5493 ret = ocfs2_find_path(inode, path, cpos); 5625 ret = ocfs2_find_path(et->et_ci, path, cpos);
5494 if (ret) { 5626 if (ret) {
5495 mlog_errno(ret); 5627 mlog_errno(ret);
5496 goto out; 5628 goto out;
@@ -5499,9 +5631,9 @@ int ocfs2_remove_extent(struct inode *inode,
5499 el = path_leaf_el(path); 5631 el = path_leaf_el(path);
5500 index = ocfs2_search_extent_list(el, cpos); 5632 index = ocfs2_search_extent_list(el, cpos);
5501 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5633 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5502 ocfs2_error(inode->i_sb, 5634 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5503 "Inode %llu: split at cpos %u lost record.", 5635 "Owner %llu: split at cpos %u lost record.",
5504 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5636 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5505 cpos); 5637 cpos);
5506 ret = -EROFS; 5638 ret = -EROFS;
5507 goto out; 5639 goto out;
@@ -5515,18 +5647,18 @@ int ocfs2_remove_extent(struct inode *inode,
5515 rec_range = le32_to_cpu(rec->e_cpos) + 5647 rec_range = le32_to_cpu(rec->e_cpos) +
5516 ocfs2_rec_clusters(el, rec); 5648 ocfs2_rec_clusters(el, rec);
5517 if (rec_range != trunc_range) { 5649 if (rec_range != trunc_range) {
5518 ocfs2_error(inode->i_sb, 5650 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5519 "Inode %llu: error after split at cpos %u" 5651 "Owner %llu: error after split at cpos %u"
5520 "trunc len %u, existing record is (%u,%u)", 5652 "trunc len %u, existing record is (%u,%u)",
5521 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5653 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5522 cpos, len, le32_to_cpu(rec->e_cpos), 5654 cpos, len, le32_to_cpu(rec->e_cpos),
5523 ocfs2_rec_clusters(el, rec)); 5655 ocfs2_rec_clusters(el, rec));
5524 ret = -EROFS; 5656 ret = -EROFS;
5525 goto out; 5657 goto out;
5526 } 5658 }
5527 5659
5528 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5660 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5529 cpos, len, et); 5661 cpos, len);
5530 if (ret) { 5662 if (ret) {
5531 mlog_errno(ret); 5663 mlog_errno(ret);
5532 goto out; 5664 goto out;
@@ -5573,7 +5705,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5573 goto out; 5705 goto out;
5574 } 5706 }
5575 5707
5576 ret = ocfs2_et_root_journal_access(handle, inode, et, 5708 ret = ocfs2_et_root_journal_access(handle, et,
5577 OCFS2_JOURNAL_ACCESS_WRITE); 5709 OCFS2_JOURNAL_ACCESS_WRITE);
5578 if (ret) { 5710 if (ret) {
5579 mlog_errno(ret); 5711 mlog_errno(ret);
@@ -5583,14 +5715,13 @@ int ocfs2_remove_btree_range(struct inode *inode,
5583 vfs_dq_free_space_nodirty(inode, 5715 vfs_dq_free_space_nodirty(inode,
5584 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5716 ocfs2_clusters_to_bytes(inode->i_sb, len));
5585 5717
5586 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, 5718 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5587 dealloc);
5588 if (ret) { 5719 if (ret) {
5589 mlog_errno(ret); 5720 mlog_errno(ret);
5590 goto out_commit; 5721 goto out_commit;
5591 } 5722 }
5592 5723
5593 ocfs2_et_update_clusters(inode, et, -len); 5724 ocfs2_et_update_clusters(et, -len);
5594 5725
5595 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5726 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5596 if (ret) { 5727 if (ret) {
@@ -5690,7 +5821,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5690 goto bail; 5821 goto bail;
5691 } 5822 }
5692 5823
5693 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, 5824 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5694 OCFS2_JOURNAL_ACCESS_WRITE); 5825 OCFS2_JOURNAL_ACCESS_WRITE);
5695 if (status < 0) { 5826 if (status < 0) {
5696 mlog_errno(status); 5827 mlog_errno(status);
@@ -5752,7 +5883,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5752 while (i >= 0) { 5883 while (i >= 0) {
5753 /* Caller has given us at least enough credits to 5884 /* Caller has given us at least enough credits to
5754 * update the truncate log dinode */ 5885 * update the truncate log dinode */
5755 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, 5886 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5756 OCFS2_JOURNAL_ACCESS_WRITE); 5887 OCFS2_JOURNAL_ACCESS_WRITE);
5757 if (status < 0) { 5888 if (status < 0) {
5758 mlog_errno(status); 5889 mlog_errno(status);
@@ -6010,7 +6141,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6010 tl->tl_used = 0; 6141 tl->tl_used = 0;
6011 6142
6012 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); 6143 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6013 status = ocfs2_write_block(osb, tl_bh, tl_inode); 6144 status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6014 if (status < 0) { 6145 if (status < 0) {
6015 mlog_errno(status); 6146 mlog_errno(status);
6016 goto bail; 6147 goto bail;
@@ -6400,9 +6531,9 @@ ocfs2_find_per_slot_free_list(int type,
6400 return fl; 6531 return fl;
6401} 6532}
6402 6533
6403static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6534int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6404 int type, int slot, u64 blkno, 6535 int type, int slot, u64 blkno,
6405 unsigned int bit) 6536 unsigned int bit)
6406{ 6537{
6407 int ret; 6538 int ret;
6408 struct ocfs2_per_slot_free_list *fl; 6539 struct ocfs2_per_slot_free_list *fl;
@@ -6518,7 +6649,7 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6518 goto out; 6649 goto out;
6519 } 6650 }
6520 6651
6521 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 6652 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6522 if (ret) { 6653 if (ret) {
6523 mlog_errno(ret); 6654 mlog_errno(ret);
6524 goto out; 6655 goto out;
@@ -6551,7 +6682,7 @@ out:
6551 */ 6682 */
6552static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 6683static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6553 handle_t *handle, struct ocfs2_truncate_context *tc, 6684 handle_t *handle, struct ocfs2_truncate_context *tc,
6554 u32 clusters_to_del, u64 *delete_start) 6685 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6555{ 6686{
6556 int ret, i, index = path->p_tree_depth; 6687 int ret, i, index = path->p_tree_depth;
6557 u32 new_edge = 0; 6688 u32 new_edge = 0;
@@ -6561,6 +6692,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6561 struct ocfs2_extent_rec *rec; 6692 struct ocfs2_extent_rec *rec;
6562 6693
6563 *delete_start = 0; 6694 *delete_start = 0;
6695 *flags = 0;
6564 6696
6565 while (index >= 0) { 6697 while (index >= 0) {
6566 bh = path->p_node[index].bh; 6698 bh = path->p_node[index].bh;
@@ -6648,6 +6780,7 @@ find_tail_record:
6648 *delete_start = le64_to_cpu(rec->e_blkno) 6780 *delete_start = le64_to_cpu(rec->e_blkno)
6649 + ocfs2_clusters_to_blocks(inode->i_sb, 6781 + ocfs2_clusters_to_blocks(inode->i_sb,
6650 le16_to_cpu(rec->e_leaf_clusters)); 6782 le16_to_cpu(rec->e_leaf_clusters));
6783 *flags = rec->e_flags;
6651 6784
6652 /* 6785 /*
6653 * If it's now empty, remove this record. 6786 * If it's now empty, remove this record.
@@ -6719,7 +6852,7 @@ delete:
6719 6852
6720 mlog(0, "deleting this extent block.\n"); 6853 mlog(0, "deleting this extent block.\n");
6721 6854
6722 ocfs2_remove_from_cache(inode, bh); 6855 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6723 6856
6724 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 6857 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6725 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 6858 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
@@ -6747,7 +6880,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6747 struct buffer_head *fe_bh, 6880 struct buffer_head *fe_bh,
6748 handle_t *handle, 6881 handle_t *handle,
6749 struct ocfs2_truncate_context *tc, 6882 struct ocfs2_truncate_context *tc,
6750 struct ocfs2_path *path) 6883 struct ocfs2_path *path,
6884 struct ocfs2_alloc_context *meta_ac)
6751{ 6885{
6752 int status; 6886 int status;
6753 struct ocfs2_dinode *fe; 6887 struct ocfs2_dinode *fe;
@@ -6755,6 +6889,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6755 struct ocfs2_extent_list *el; 6889 struct ocfs2_extent_list *el;
6756 struct buffer_head *last_eb_bh = NULL; 6890 struct buffer_head *last_eb_bh = NULL;
6757 u64 delete_blk = 0; 6891 u64 delete_blk = 0;
6892 u8 rec_flags;
6758 6893
6759 fe = (struct ocfs2_dinode *) fe_bh->b_data; 6894 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6760 6895
@@ -6769,14 +6904,14 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6769 * Each component will be touched, so we might as well journal 6904 * Each component will be touched, so we might as well journal
6770 * here to avoid having to handle errors later. 6905 * here to avoid having to handle errors later.
6771 */ 6906 */
6772 status = ocfs2_journal_access_path(inode, handle, path); 6907 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6773 if (status < 0) { 6908 if (status < 0) {
6774 mlog_errno(status); 6909 mlog_errno(status);
6775 goto bail; 6910 goto bail;
6776 } 6911 }
6777 6912
6778 if (last_eb_bh) { 6913 if (last_eb_bh) {
6779 status = ocfs2_journal_access_eb(handle, inode, last_eb_bh, 6914 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6780 OCFS2_JOURNAL_ACCESS_WRITE); 6915 OCFS2_JOURNAL_ACCESS_WRITE);
6781 if (status < 0) { 6916 if (status < 0) {
6782 mlog_errno(status); 6917 mlog_errno(status);
@@ -6810,7 +6945,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6810 inode->i_blocks = ocfs2_inode_sector_count(inode); 6945 inode->i_blocks = ocfs2_inode_sector_count(inode);
6811 6946
6812 status = ocfs2_trim_tree(inode, path, handle, tc, 6947 status = ocfs2_trim_tree(inode, path, handle, tc,
6813 clusters_to_del, &delete_blk); 6948 clusters_to_del, &delete_blk, &rec_flags);
6814 if (status) { 6949 if (status) {
6815 mlog_errno(status); 6950 mlog_errno(status);
6816 goto bail; 6951 goto bail;
@@ -6842,8 +6977,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6842 } 6977 }
6843 6978
6844 if (delete_blk) { 6979 if (delete_blk) {
6845 status = ocfs2_truncate_log_append(osb, handle, delete_blk, 6980 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6846 clusters_to_del); 6981 status = ocfs2_decrease_refcount(inode, handle,
6982 ocfs2_blocks_to_clusters(osb->sb,
6983 delete_blk),
6984 clusters_to_del, meta_ac,
6985 &tc->tc_dealloc, 1);
6986 else
6987 status = ocfs2_truncate_log_append(osb, handle,
6988 delete_blk,
6989 clusters_to_del);
6847 if (status < 0) { 6990 if (status < 0) {
6848 mlog_errno(status); 6991 mlog_errno(status);
6849 goto bail; 6992 goto bail;
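Reassembled from the hunk above for readability, the new dispatch in
ocfs2_do_truncate() has this shape (a sketch; error handling and the
surrounding transaction logic are elided):

        if (delete_blk) {
                if (rec_flags & OCFS2_EXT_REFCOUNTED)
                        /* Shared clusters: drop one reference and let
                         * the refcount machinery decide whether the
                         * clusters are actually freed. */
                        status = ocfs2_decrease_refcount(inode, handle,
                                        ocfs2_blocks_to_clusters(osb->sb,
                                                                 delete_blk),
                                        clusters_to_del, meta_ac,
                                        &tc->tc_dealloc, 1);
                else
                        /* Unshared clusters: queue on the truncate log,
                         * exactly as before this patch. */
                        status = ocfs2_truncate_log_append(osb, handle,
                                                           delete_blk,
                                                           clusters_to_del);
        }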
@@ -6863,9 +7006,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6863 return 0; 7006 return 0;
6864} 7007}
6865 7008
6866static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 7009void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6867 unsigned int from, unsigned int to, 7010 unsigned int from, unsigned int to,
6868 struct page *page, int zero, u64 *phys) 7011 struct page *page, int zero, u64 *phys)
6869{ 7012{
6870 int ret, partial = 0; 7013 int ret, partial = 0;
6871 7014
@@ -6933,20 +7076,16 @@ out:
6933 ocfs2_unlock_and_free_pages(pages, numpages); 7076 ocfs2_unlock_and_free_pages(pages, numpages);
6934} 7077}
6935 7078
6936static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, 7079int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6937 struct page **pages, int *num) 7080 struct page **pages, int *num)
6938{ 7081{
6939 int numpages, ret = 0; 7082 int numpages, ret = 0;
6940 struct super_block *sb = inode->i_sb;
6941 struct address_space *mapping = inode->i_mapping; 7083 struct address_space *mapping = inode->i_mapping;
6942 unsigned long index; 7084 unsigned long index;
6943 loff_t last_page_bytes; 7085 loff_t last_page_bytes;
6944 7086
6945 BUG_ON(start > end); 7087 BUG_ON(start > end);
6946 7088
6947 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6948 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6949
6950 numpages = 0; 7089 numpages = 0;
6951 last_page_bytes = PAGE_ALIGN(end); 7090 last_page_bytes = PAGE_ALIGN(end);
6952 index = start >> PAGE_CACHE_SHIFT; 7091 index = start >> PAGE_CACHE_SHIFT;
@@ -6974,6 +7113,17 @@ out:
6974 return ret; 7113 return ret;
6975} 7114}
6976 7115
7116static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
7117 struct page **pages, int *num)
7118{
7119 struct super_block *sb = inode->i_sb;
7120
7121 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
7122 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
7123
7124 return ocfs2_grab_pages(inode, start, end, pages, num);
7125}
7126
6977/* 7127/*
6978 * Zero the area past i_size but still within an allocated 7128 * Zero the area past i_size but still within an allocated
6979 * cluster. This avoids exposing nonzero data on subsequent file 7129 * cluster. This avoids exposing nonzero data on subsequent file
@@ -7138,7 +7288,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7138 goto out_unlock; 7288 goto out_unlock;
7139 } 7289 }
7140 7290
7141 ret = ocfs2_journal_access_di(handle, inode, di_bh, 7291 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7142 OCFS2_JOURNAL_ACCESS_WRITE); 7292 OCFS2_JOURNAL_ACCESS_WRITE);
7143 if (ret) { 7293 if (ret) {
7144 mlog_errno(ret); 7294 mlog_errno(ret);
@@ -7218,9 +7368,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7218 * this proves to be false, we could always re-build 7368 * this proves to be false, we could always re-build
7219 * the in-inode data from our pages. 7369 * the in-inode data from our pages.
7220 */ 7370 */
7221 ocfs2_init_dinode_extent_tree(&et, inode, di_bh); 7371 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7222 ret = ocfs2_insert_extent(osb, handle, inode, &et, 7372 ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
7223 0, block, 1, 0, NULL);
7224 if (ret) { 7373 if (ret) {
7225 mlog_errno(ret); 7374 mlog_errno(ret);
7226 goto out_commit; 7375 goto out_commit;
@@ -7262,11 +7411,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
7262{ 7411{
7263 int status, i, credits, tl_sem = 0; 7412 int status, i, credits, tl_sem = 0;
7264 u32 clusters_to_del, new_highest_cpos, range; 7413 u32 clusters_to_del, new_highest_cpos, range;
7414 u64 blkno = 0;
7265 struct ocfs2_extent_list *el; 7415 struct ocfs2_extent_list *el;
7266 handle_t *handle = NULL; 7416 handle_t *handle = NULL;
7267 struct inode *tl_inode = osb->osb_tl_inode; 7417 struct inode *tl_inode = osb->osb_tl_inode;
7268 struct ocfs2_path *path = NULL; 7418 struct ocfs2_path *path = NULL;
7269 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL;
7421 struct ocfs2_refcount_tree *ref_tree = NULL;
7270 7422
7271 mlog_entry_void(); 7423 mlog_entry_void();
7272 7424
@@ -7292,10 +7444,12 @@ start:
7292 goto bail; 7444 goto bail;
7293 } 7445 }
7294 7446
7447 credits = 0;
7448
7295 /* 7449 /*
7296 * Truncate always works against the rightmost tree branch. 7450 * Truncate always works against the rightmost tree branch.
7297 */ 7451 */
7298 status = ocfs2_find_path(inode, path, UINT_MAX); 7452 status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7299 if (status) { 7453 if (status) {
7300 mlog_errno(status); 7454 mlog_errno(status);
7301 goto bail; 7455 goto bail;
@@ -7332,10 +7486,15 @@ start:
7332 clusters_to_del = 0; 7486 clusters_to_del = 0;
7333 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7334 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
7335 } else if (range > new_highest_cpos) { 7490 } else if (range > new_highest_cpos) {
7336 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7337 le32_to_cpu(el->l_recs[i].e_cpos)) - 7492 le32_to_cpu(el->l_recs[i].e_cpos)) -
7338 new_highest_cpos; 7493 new_highest_cpos;
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
7495 ocfs2_clusters_to_blocks(inode->i_sb,
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) -
7497 clusters_to_del);
7339 } else { 7498 } else {
7340 status = 0; 7499 status = 0;
7341 goto bail; 7500 goto bail;
@@ -7344,6 +7503,29 @@ start:
7344 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7345 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7346 7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7347 mutex_lock(&tl_inode->i_mutex); 7529 mutex_lock(&tl_inode->i_mutex);
7348 tl_sem = 1; 7530 tl_sem = 1;
7349 /* ocfs2_truncate_log_needs_flush guarantees us at least one 7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
@@ -7357,7 +7539,7 @@ start:
7357 } 7539 }
7358 } 7540 }
7359 7541
7360 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7361 (struct ocfs2_dinode *)fe_bh->b_data, 7543 (struct ocfs2_dinode *)fe_bh->b_data,
7362 el); 7544 el);
7363 handle = ocfs2_start_trans(osb, credits); 7545 handle = ocfs2_start_trans(osb, credits);
@@ -7369,7 +7551,7 @@ start:
7369 } 7551 }
7370 7552
7371 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7372 tc, path); 7554 tc, path, meta_ac);
7373 if (status < 0) { 7555 if (status < 0) {
7374 mlog_errno(status); 7556 mlog_errno(status);
7375 goto bail; 7557 goto bail;
@@ -7383,6 +7565,16 @@ start:
7383 7565
7384 ocfs2_reinit_path(path, 1); 7566 ocfs2_reinit_path(path, 1);
7385 7567
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7386 /* 7578 /*
7387 * The check above will catch the case where we've truncated 7579 * The check above will catch the case where we've truncated
7388 * away all allocation. 7580 * away all allocation.
@@ -7399,6 +7591,12 @@ bail:
7399 if (handle) 7591 if (handle)
7400 ocfs2_commit_trans(osb, handle); 7592 ocfs2_commit_trans(osb, handle);
7401 7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7402 ocfs2_run_deallocs(osb, &tc->tc_dealloc); 7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7403 7601
7404 ocfs2_free_path(path); 7602 ocfs2_free_path(path);
@@ -7445,7 +7643,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7445 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 7643 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7446 7644
7447 if (fe->id2.i_list.l_tree_depth) { 7645 if (fe->id2.i_list.l_tree_depth) {
7448 status = ocfs2_read_extent_block(inode, 7646 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7449 le64_to_cpu(fe->i_last_eb_blk), 7647 le64_to_cpu(fe->i_last_eb_blk),
7450 &last_eb_bh); 7648 &last_eb_bh);
7451 if (status < 0) { 7649 if (status < 0) {
@@ -7507,7 +7705,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7507 goto out; 7705 goto out;
7508 } 7706 }
7509 7707
7510 ret = ocfs2_journal_access_di(handle, inode, di_bh, 7708 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7511 OCFS2_JOURNAL_ACCESS_WRITE); 7709 OCFS2_JOURNAL_ACCESS_WRITE);
7512 if (ret) { 7710 if (ret) {
7513 mlog_errno(ret); 7711 mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 353254ba29e1..9c122d574464 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,8 @@
45 * 45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a 46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree 47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
48 * functions. With metadata ecc, we now call different journal_access 48 * functions. It needs the ocfs2_caching_info structure associated with
49 * I/O on the tree. With metadata ecc, we now call different journal_access
49 * functions for each type of metadata, so it must have the 50 * functions for each type of metadata, so it must have the
50 * root_journal_access function. 51 * root_journal_access function.
51 * ocfs2_extent_tree_operations abstract the normal operations we do for 52 * ocfs2_extent_tree_operations abstract the normal operations we do for
@@ -56,6 +57,7 @@ struct ocfs2_extent_tree {
56 struct ocfs2_extent_tree_operations *et_ops; 57 struct ocfs2_extent_tree_operations *et_ops;
57 struct buffer_head *et_root_bh; 58 struct buffer_head *et_root_bh;
58 struct ocfs2_extent_list *et_root_el; 59 struct ocfs2_extent_list *et_root_el;
60 struct ocfs2_caching_info *et_ci;
59 ocfs2_journal_access_func et_root_journal_access; 61 ocfs2_journal_access_func et_root_journal_access;
60 void *et_object; 62 void *et_object;
61 unsigned int et_max_leaf_clusters; 63 unsigned int et_max_leaf_clusters;
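Storing et_ci at init time is what lets the prototypes below drop their
struct inode argument: the b-tree code derives the super block and the
owner from the caching info instead. The usage pattern, as seen in the
alloc.c and aops.c hunks of this patch (handle, inode, di_bh, block and
ret are assumed to be in scope):

        struct ocfs2_extent_tree et;

        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);

        /* e.g. insert one cluster at cpos 0, as the inline-data
         * conversion in this patch does: */
        ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);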
@@ -66,31 +68,32 @@ struct ocfs2_extent_tree {
66 * specified object buffer. 68 * specified object buffer.
67 */ 69 */
68void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 70void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode, 71 struct ocfs2_caching_info *ci,
70 struct buffer_head *bh); 72 struct buffer_head *bh);
71void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 73void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode, 74 struct ocfs2_caching_info *ci,
73 struct buffer_head *bh); 75 struct buffer_head *bh);
74struct ocfs2_xattr_value_buf; 76struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 77void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 78 struct ocfs2_caching_info *ci,
77 struct ocfs2_xattr_value_buf *vb); 79 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, 80void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode, 81 struct ocfs2_caching_info *ci,
80 struct buffer_head *bh); 82 struct buffer_head *bh);
83void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
84 struct ocfs2_caching_info *ci,
85 struct buffer_head *bh);
81 86
82/* 87/*
83 * Read an extent block into *bh. If *bh is NULL, a bh will be 88 * Read an extent block into *bh. If *bh is NULL, a bh will be
84 * allocated. This is a cached read. The extent block will be validated 89 * allocated. This is a cached read. The extent block will be validated
85 * with ocfs2_validate_extent_block(). 90 * with ocfs2_validate_extent_block().
86 */ 91 */
87int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, 92int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
88 struct buffer_head **bh); 93 struct buffer_head **bh);
89 94
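A usage sketch matching the calls in this patch; passing a NULL *bh
asks the helper to allocate the buffer_head ("et" is assumed to be an
initialized ocfs2_extent_tree):

        struct buffer_head *eb_bh = NULL;
        int ret;

        ret = ocfs2_read_extent_block(et->et_ci,
                                      ocfs2_et_get_last_eb_blk(et),
                                      &eb_bh);
        if (ret)
                mlog_errno(ret);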
90struct ocfs2_alloc_context; 95struct ocfs2_alloc_context;
91int ocfs2_insert_extent(struct ocfs2_super *osb, 96int ocfs2_insert_extent(handle_t *handle,
92 handle_t *handle,
93 struct inode *inode,
94 struct ocfs2_extent_tree *et, 97 struct ocfs2_extent_tree *et,
95 u32 cpos, 98 u32 cpos,
96 u64 start_blk, 99 u64 start_blk,
@@ -103,25 +106,36 @@ enum ocfs2_alloc_restarted {
103 RESTART_TRANS, 106 RESTART_TRANS,
104 RESTART_META 107 RESTART_META
105}; 108};
106int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 109int ocfs2_add_clusters_in_btree(handle_t *handle,
107 struct inode *inode, 110 struct ocfs2_extent_tree *et,
108 u32 *logical_offset, 111 u32 *logical_offset,
109 u32 clusters_to_add, 112 u32 clusters_to_add,
110 int mark_unwritten, 113 int mark_unwritten,
111 struct ocfs2_extent_tree *et,
112 handle_t *handle,
113 struct ocfs2_alloc_context *data_ac, 114 struct ocfs2_alloc_context *data_ac,
114 struct ocfs2_alloc_context *meta_ac, 115 struct ocfs2_alloc_context *meta_ac,
115 enum ocfs2_alloc_restarted *reason_ret); 116 enum ocfs2_alloc_restarted *reason_ret);
116struct ocfs2_cached_dealloc_ctxt; 117struct ocfs2_cached_dealloc_ctxt;
118struct ocfs2_path;
119int ocfs2_split_extent(handle_t *handle,
120 struct ocfs2_extent_tree *et,
121 struct ocfs2_path *path,
122 int split_index,
123 struct ocfs2_extent_rec *split_rec,
124 struct ocfs2_alloc_context *meta_ac,
125 struct ocfs2_cached_dealloc_ctxt *dealloc);
117int ocfs2_mark_extent_written(struct inode *inode, 126int ocfs2_mark_extent_written(struct inode *inode,
118 struct ocfs2_extent_tree *et, 127 struct ocfs2_extent_tree *et,
119 handle_t *handle, u32 cpos, u32 len, u32 phys, 128 handle_t *handle, u32 cpos, u32 len, u32 phys,
120 struct ocfs2_alloc_context *meta_ac, 129 struct ocfs2_alloc_context *meta_ac,
121 struct ocfs2_cached_dealloc_ctxt *dealloc); 130 struct ocfs2_cached_dealloc_ctxt *dealloc);
122int ocfs2_remove_extent(struct inode *inode, 131int ocfs2_change_extent_flag(handle_t *handle,
123 struct ocfs2_extent_tree *et, 132 struct ocfs2_extent_tree *et,
124 u32 cpos, u32 len, handle_t *handle, 133 u32 cpos, u32 len, u32 phys,
134 struct ocfs2_alloc_context *meta_ac,
135 struct ocfs2_cached_dealloc_ctxt *dealloc,
136 int new_flags, int clear_flags);
137int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
138 u32 cpos, u32 len,
125 struct ocfs2_alloc_context *meta_ac, 139 struct ocfs2_alloc_context *meta_ac,
126 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
127int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
@@ -130,7 +144,6 @@ int ocfs2_remove_btree_range(struct inode *inode,
130 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc);
131 145
132int ocfs2_num_free_extents(struct ocfs2_super *osb, 146int ocfs2_num_free_extents(struct ocfs2_super *osb,
133 struct inode *inode,
134 struct ocfs2_extent_tree *et); 147 struct ocfs2_extent_tree *et);
135 148
136/* 149/*
@@ -195,6 +208,9 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
195} 208}
196int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
197 u64 blkno, unsigned int bit); 210 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno,
213 unsigned int bit);
198static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
199{ 215{
200 return c->c_global_allocator != NULL; 216 return c->c_global_allocator != NULL;
@@ -222,8 +238,9 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
222int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
223 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
224 240
225int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 241int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
226 u32 cpos, struct buffer_head **leaf_bh); 242 struct ocfs2_extent_list *root_el, u32 cpos,
243 struct buffer_head **leaf_bh);
227int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); 244int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
228 245
229/* 246/*
@@ -254,4 +271,50 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
254 return !rec->e_leaf_clusters; 271 return !rec->e_leaf_clusters;
255} 272}
256 273
274int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
275 struct page **pages, int *num);
276void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
277 unsigned int from, unsigned int to,
278 struct page *page, int zero, u64 *phys);
279/*
280 * Structures which describe a path through a btree, and functions to
281 * manipulate them.
282 *
283 * The idea here is to be as generic as possible with the tree
284 * manipulation code.
285 */
286struct ocfs2_path_item {
287 struct buffer_head *bh;
288 struct ocfs2_extent_list *el;
289};
290
291#define OCFS2_MAX_PATH_DEPTH 5
292
293struct ocfs2_path {
294 int p_tree_depth;
295 ocfs2_journal_access_func p_root_access;
296 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
297};
298
299#define path_root_bh(_path) ((_path)->p_node[0].bh)
300#define path_root_el(_path) ((_path)->p_node[0].el)
301#define path_root_access(_path) ((_path)->p_root_access)
302#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
303#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
304#define path_num_items(_path) ((_path)->p_tree_depth + 1)
305
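The macros are thin wrappers over p_node[]: index 0 is the tree root,
index p_tree_depth is the leaf, so path_num_items() is depth + 1. A
sketch walking a populated path from root to leaf:

        /* Sketch only: iterate a fully populated ocfs2_path. */
        static void example_walk_path(struct ocfs2_path *path)
        {
                int i;

                for (i = 0; i < path_num_items(path); i++) {
                        struct buffer_head *bh = path->p_node[i].bh;
                        struct ocfs2_extent_list *el = path->p_node[i].el;

                        /* i == 0 matches path_root_bh()/path_root_el();
                         * i == p_tree_depth matches path_leaf_bh() and
                         * path_leaf_el(). */
                        (void)bh;
                        (void)el;
                }
        }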
306void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
307void ocfs2_free_path(struct ocfs2_path *path);
308int ocfs2_find_path(struct ocfs2_caching_info *ci,
309 struct ocfs2_path *path,
310 u32 cpos);
311struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
312struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
313int ocfs2_path_bh_journal_access(handle_t *handle,
314 struct ocfs2_caching_info *ci,
315 struct ocfs2_path *path,
316 int idx);
317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
318 handle_t *handle,
319 struct ocfs2_path *path);
257#endif /* OCFS2_ALLOC_H */ 320#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b401654011a2..deb2b132ae5e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -44,6 +44,7 @@
44#include "suballoc.h" 44#include "suballoc.h"
45#include "super.h" 45#include "super.h"
46#include "symlink.h" 46#include "symlink.h"
47#include "refcounttree.h"
47 48
48#include "buffer_head_io.h" 49#include "buffer_head_io.h"
49 50
@@ -126,8 +127,8 @@ bail:
126 return err; 127 return err;
127} 128}
128 129
129static int ocfs2_get_block(struct inode *inode, sector_t iblock, 130int ocfs2_get_block(struct inode *inode, sector_t iblock,
130 struct buffer_head *bh_result, int create) 131 struct buffer_head *bh_result, int create)
131{ 132{
132 int err = 0; 133 int err = 0;
133 unsigned int ext_flags; 134 unsigned int ext_flags;
@@ -590,6 +591,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
590 goto bail; 591 goto bail;
591 } 592 }
592 593
594 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
593 /* 596 /*
594 * get_more_blocks() expects us to describe a hole by clearing 597 * get_more_blocks() expects us to describe a hole by clearing
595 * the mapped bit on bh_result(). 598 * the mapped bit on bh_result().
@@ -687,6 +690,10 @@ static ssize_t ocfs2_direct_IO(int rw,
687 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 690 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
688 return 0; 691 return 0;
689 692
 693 /* Fall back to buffered I/O if we are appending. */
694 if (i_size_read(inode) <= offset)
695 return 0;
696
690 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 697 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
691 inode->i_sb->s_bdev, iov, offset, 698 inode->i_sb->s_bdev, iov, offset,
692 nr_segs, 699 nr_segs,
@@ -1259,7 +1266,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1259 goto out; 1266 goto out;
1260 } 1267 }
1261 } else if (unwritten) { 1268 } else if (unwritten) {
1262 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); 1269 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1270 wc->w_di_bh);
1263 ret = ocfs2_mark_extent_written(inode, &et, 1271 ret = ocfs2_mark_extent_written(inode, &et,
1264 wc->w_handle, cpos, 1, phys, 1272 wc->w_handle, cpos, 1, phys,
1265 meta_ac, &wc->w_dealloc); 1273 meta_ac, &wc->w_dealloc);
@@ -1448,6 +1456,9 @@ static int ocfs2_populate_write_desc(struct inode *inode,
1448 goto out; 1456 goto out;
1449 } 1457 }
1450 1458
 1459 /* We should already CoW the refcounted extent. */
1460 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
1461
1451 /* 1462 /*
1452 * Assume worst case - that we're writing in 1463 * Assume worst case - that we're writing in
1453 * the middle of the extent. 1464 * the middle of the extent.
@@ -1528,7 +1539,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1528 goto out; 1539 goto out;
1529 } 1540 }
1530 1541
1531 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, 1542 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1532 OCFS2_JOURNAL_ACCESS_WRITE); 1543 OCFS2_JOURNAL_ACCESS_WRITE);
1533 if (ret) { 1544 if (ret) {
1534 ocfs2_commit_trans(osb, handle); 1545 ocfs2_commit_trans(osb, handle);
@@ -1699,6 +1710,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1699 goto out; 1710 goto out;
1700 } 1711 }
1701 1712
1713 ret = ocfs2_check_range_for_refcount(inode, pos, len);
1714 if (ret < 0) {
1715 mlog_errno(ret);
1716 goto out;
1717 } else if (ret == 1) {
1718 ret = ocfs2_refcount_cow(inode, di_bh,
1719 wc->w_cpos, wc->w_clen, UINT_MAX);
1720 if (ret) {
1721 mlog_errno(ret);
1722 goto out;
1723 }
1724 }
1725
1702 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, 1726 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1703 &extents_to_split); 1727 &extents_to_split);
1704 if (ret) { 1728 if (ret) {
@@ -1726,7 +1750,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1726 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), 1750 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1727 clusters_to_alloc, extents_to_split); 1751 clusters_to_alloc, extents_to_split);
1728 1752
1729 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); 1753 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1754 wc->w_di_bh);
1730 ret = ocfs2_lock_allocators(inode, &et, 1755 ret = ocfs2_lock_allocators(inode, &et,
1731 clusters_to_alloc, extents_to_split, 1756 clusters_to_alloc, extents_to_split,
1732 &data_ac, &meta_ac); 1757 &data_ac, &meta_ac);
@@ -1747,8 +1772,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1747 * we know zeros will only be needed in the first and/or last cluster. 1772 * we know zeros will only be needed in the first and/or last cluster.
1748 */ 1773 */
1749 if (clusters_to_alloc || extents_to_split || 1774 if (clusters_to_alloc || extents_to_split ||
1750 wc->w_desc[0].c_needs_zero || 1775 (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
1751 wc->w_desc[wc->w_clen - 1].c_needs_zero) 1776 wc->w_desc[wc->w_clen - 1].c_needs_zero)))
1752 cluster_of_pages = 1; 1777 cluster_of_pages = 1;
1753 else 1778 else
1754 cluster_of_pages = 0; 1779 cluster_of_pages = 0;
@@ -1773,7 +1798,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1773 * We don't want this to fail in ocfs2_write_end(), so do it 1798 * We don't want this to fail in ocfs2_write_end(), so do it
1774 * here. 1799 * here.
1775 */ 1800 */
1776 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, 1801 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1777 OCFS2_JOURNAL_ACCESS_WRITE); 1802 OCFS2_JOURNAL_ACCESS_WRITE);
1778 if (ret) { 1803 if (ret) {
1779 mlog_errno(ret); 1804 mlog_errno(ret);
@@ -1997,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
1997 .releasepage = ocfs2_releasepage, 2022 .releasepage = ocfs2_releasepage,
1998 .migratepage = buffer_migrate_page, 2023 .migratepage = buffer_migrate_page,
1999 .is_partially_uptodate = block_is_partially_uptodate, 2024 .is_partially_uptodate = block_is_partially_uptodate,
2025 .error_remove_page = generic_error_remove_page,
2000}; 2026};
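
Two details in the aops.c hunks above are worth noting. First, direct I/O now falls back to buffered I/O for appending writes (i_size_read(inode) <= offset), keeping allocation and zeroing in the better-tested buffered path. Second, the reworked cluster_of_pages test guards the descriptor lookups with w_clen: the old code evaluated wc->w_desc[wc->w_clen - 1] unconditionally, which indexes far out of bounds when w_clen is zero because the subtraction wraps for an unsigned length. In miniature (illustration only, not from the patch):

	unsigned int clen = 0;
	struct { int c_needs_zero; } desc[4];

	/* desc[clen - 1] with clen == 0 computes desc[UINT_MAX]: out of bounds. */
	if (clen && (desc[0].c_needs_zero || desc[clen - 1].c_needs_zero))
		; /* safe: the risky index is evaluated only when clen > 0 */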
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 503e49232e11..c48e93ffc513 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
57 struct buffer_head *di_bh); 57 struct buffer_head *di_bh);
58int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); 58int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
59 59
60int ocfs2_get_block(struct inode *inode, sector_t iblock,
61 struct buffer_head *bh_result, int create);
60/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
61#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
62 test_bit(0, (unsigned long *)&iocb->private) 64 test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 15c8e6deee2e..d43d34a1dd31 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -52,12 +52,12 @@ enum ocfs2_state_bits {
52BUFFER_FNS(NeedsValidate, needs_validate); 52BUFFER_FNS(NeedsValidate, needs_validate);
53 53
54int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, 54int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
55 struct inode *inode) 55 struct ocfs2_caching_info *ci)
56{ 56{
57 int ret = 0; 57 int ret = 0;
58 58
59 mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", 59 mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
60 (unsigned long long)bh->b_blocknr, inode); 60 (unsigned long long)bh->b_blocknr, ci);
61 61
62 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); 62 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
63 BUG_ON(buffer_jbd(bh)); 63 BUG_ON(buffer_jbd(bh));
@@ -70,7 +70,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
70 goto out; 70 goto out;
71 } 71 }
72 72
73 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 73 ocfs2_metadata_cache_io_lock(ci);
74 74
75 lock_buffer(bh); 75 lock_buffer(bh);
76 set_buffer_uptodate(bh); 76 set_buffer_uptodate(bh);
@@ -85,7 +85,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
85 wait_on_buffer(bh); 85 wait_on_buffer(bh);
86 86
87 if (buffer_uptodate(bh)) { 87 if (buffer_uptodate(bh)) {
88 ocfs2_set_buffer_uptodate(inode, bh); 88 ocfs2_set_buffer_uptodate(ci, bh);
89 } else { 89 } else {
90 /* We don't need to remove the clustered uptodate 90 /* We don't need to remove the clustered uptodate
91 * information for this bh as it's not marked locally 91 * information for this bh as it's not marked locally
@@ -94,7 +94,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
94 put_bh(bh); 94 put_bh(bh);
95 } 95 }
96 96
97 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 97 ocfs2_metadata_cache_io_unlock(ci);
98out: 98out:
99 mlog_exit(ret); 99 mlog_exit(ret);
100 return ret; 100 return ret;
@@ -177,7 +177,7 @@ bail:
177 return status; 177 return status;
178} 178}
179 179
180int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, 180int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
181 struct buffer_head *bhs[], int flags, 181 struct buffer_head *bhs[], int flags,
182 int (*validate)(struct super_block *sb, 182 int (*validate)(struct super_block *sb,
183 struct buffer_head *bh)) 183 struct buffer_head *bh))
@@ -185,11 +185,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
185 int status = 0; 185 int status = 0;
186 int i, ignore_cache = 0; 186 int i, ignore_cache = 0;
187 struct buffer_head *bh; 187 struct buffer_head *bh;
188 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
188 189
189 mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", 190 mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
190 inode, (unsigned long long)block, nr, flags); 191 ci, (unsigned long long)block, nr, flags);
191 192
192 BUG_ON(!inode); 193 BUG_ON(!ci);
193 BUG_ON((flags & OCFS2_BH_READAHEAD) && 194 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
194 (flags & OCFS2_BH_IGNORE_CACHE)); 195 (flags & OCFS2_BH_IGNORE_CACHE));
195 196
@@ -212,12 +213,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
212 goto bail; 213 goto bail;
213 } 214 }
214 215
215 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 216 ocfs2_metadata_cache_io_lock(ci);
216 for (i = 0 ; i < nr ; i++) { 217 for (i = 0 ; i < nr ; i++) {
217 if (bhs[i] == NULL) { 218 if (bhs[i] == NULL) {
218 bhs[i] = sb_getblk(inode->i_sb, block++); 219 bhs[i] = sb_getblk(sb, block++);
219 if (bhs[i] == NULL) { 220 if (bhs[i] == NULL) {
220 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 221 ocfs2_metadata_cache_io_unlock(ci);
221 status = -EIO; 222 status = -EIO;
222 mlog_errno(status); 223 mlog_errno(status);
223 goto bail; 224 goto bail;
@@ -250,11 +251,11 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
250 * before our is-it-in-flight check. 251 * before our is-it-in-flight check.
251 */ 252 */
252 253
253 if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { 254 if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
254 mlog(ML_UPTODATE, 255 mlog(ML_UPTODATE,
255 "bh (%llu), inode %llu not uptodate\n", 256 "bh (%llu), owner %llu not uptodate\n",
256 (unsigned long long)bh->b_blocknr, 257 (unsigned long long)bh->b_blocknr,
257 (unsigned long long)OCFS2_I(inode)->ip_blkno); 258 (unsigned long long)ocfs2_metadata_cache_owner(ci));
258 /* We're using ignore_cache here to say 259 /* We're using ignore_cache here to say
259 * "go to disk" */ 260 * "go to disk" */
260 ignore_cache = 1; 261 ignore_cache = 1;
@@ -283,7 +284,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 283 * previously submitted request then we are 284 * previously submitted request then we are
284 * done here. */ 285 * done here. */
285 if ((flags & OCFS2_BH_READAHEAD) 286 if ((flags & OCFS2_BH_READAHEAD)
286 && ocfs2_buffer_read_ahead(inode, bh)) 287 && ocfs2_buffer_read_ahead(ci, bh))
287 continue; 288 continue;
288 289
289 lock_buffer(bh); 290 lock_buffer(bh);
@@ -305,7 +306,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
305 * buffer lock. */ 306 * buffer lock. */
306 if (!(flags & OCFS2_BH_IGNORE_CACHE) 307 if (!(flags & OCFS2_BH_IGNORE_CACHE)
307 && !(flags & OCFS2_BH_READAHEAD) 308 && !(flags & OCFS2_BH_READAHEAD)
308 && ocfs2_buffer_uptodate(inode, bh)) { 309 && ocfs2_buffer_uptodate(ci, bh)) {
309 unlock_buffer(bh); 310 unlock_buffer(bh);
310 continue; 311 continue;
311 } 312 }
@@ -327,7 +328,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
327 328
328 if (!(flags & OCFS2_BH_READAHEAD)) { 329 if (!(flags & OCFS2_BH_READAHEAD)) {
329 /* We know this can't have changed as we hold the 330 /* We know this can't have changed as we hold the
330 * inode sem. Avoid doing any work on the bh if the 331 * owner sem. Avoid doing any work on the bh if the
331 * journal has it. */ 332 * journal has it. */
332 if (!buffer_jbd(bh)) 333 if (!buffer_jbd(bh))
333 wait_on_buffer(bh); 334 wait_on_buffer(bh);
@@ -351,7 +352,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
351 * that better not have changed */ 352 * that better not have changed */
352 BUG_ON(buffer_jbd(bh)); 353 BUG_ON(buffer_jbd(bh));
353 clear_buffer_needs_validate(bh); 354 clear_buffer_needs_validate(bh);
354 status = validate(inode->i_sb, bh); 355 status = validate(sb, bh);
355 if (status) { 356 if (status) {
356 put_bh(bh); 357 put_bh(bh);
357 bhs[i] = NULL; 358 bhs[i] = NULL;
@@ -363,9 +364,9 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
363 /* Always set the buffer in the cache, even if it was 364 /* Always set the buffer in the cache, even if it was
364 * a forced read, or read-ahead which hasn't yet 365 * a forced read, or read-ahead which hasn't yet
365 * completed. */ 366 * completed. */
366 ocfs2_set_buffer_uptodate(inode, bh); 367 ocfs2_set_buffer_uptodate(ci, bh);
367 } 368 }
368 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 369 ocfs2_metadata_cache_io_unlock(ci);
369 370
370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 371 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
371 (unsigned long long)block, nr, 372 (unsigned long long)block, nr,
@@ -399,7 +400,7 @@ static void ocfs2_check_super_or_backup(struct super_block *sb,
399 400
400/* 401/*
401 * Write super block and backups doesn't need to collaborate with journal, 402 * Write super block and backups doesn't need to collaborate with journal,
 402 * so we don't need to lock ip_io_mutex and inode doesn't need to be passed 403 * so we don't need to lock ip_io_mutex and ci doesn't need to be passed
403 * into this function. 404 * into this function.
404 */ 405 */
405int ocfs2_write_super_or_backup(struct ocfs2_super *osb, 406int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
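
This buffer_head_io.c conversion is the heart of the refactor running through this merge: the read/write helpers stop dereferencing an inode and operate on a struct ocfs2_caching_info, with ocfs2_metadata_cache_io_lock()/_unlock() standing in for the old ip_io_mutex and ocfs2_metadata_cache_get_super()/_owner() supplying the superblock and a log-friendly owner id. A sketch of the invariant the conversion preserves (for an inode owner the io lock presumably still resolves to ip_io_mutex; that mapping is an assumption here):

	ocfs2_metadata_cache_io_lock(ci);	/* serialize this owner's buffer I/O */
	/* ... sb_getblk()/submit_bh()/wait_on_buffer() on the owner's blocks ... */
	ocfs2_metadata_cache_io_unlock(ci);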
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c75d682dadd8..b97bcc6dde7c 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -33,7 +33,7 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
33 33
34int ocfs2_write_block(struct ocfs2_super *osb, 34int ocfs2_write_block(struct ocfs2_super *osb,
35 struct buffer_head *bh, 35 struct buffer_head *bh,
36 struct inode *inode); 36 struct ocfs2_caching_info *ci);
37int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, 37int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
38 unsigned int nr, struct buffer_head *bhs[]); 38 unsigned int nr, struct buffer_head *bhs[]);
39 39
@@ -44,7 +44,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
44 * be set even for a READAHEAD call, as it marks the buffer for later 44 * be set even for a READAHEAD call, as it marks the buffer for later
45 * validation. 45 * validation.
46 */ 46 */
47int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, 47int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
48 struct buffer_head *bhs[], int flags, 48 struct buffer_head *bhs[], int flags,
49 int (*validate)(struct super_block *sb, 49 int (*validate)(struct super_block *sb,
50 struct buffer_head *bh)); 50 struct buffer_head *bh));
@@ -55,7 +55,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
55#define OCFS2_BH_IGNORE_CACHE 1 55#define OCFS2_BH_IGNORE_CACHE 1
56#define OCFS2_BH_READAHEAD 8 56#define OCFS2_BH_READAHEAD 8
57 57
58static inline int ocfs2_read_block(struct inode *inode, u64 off, 58static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off,
59 struct buffer_head **bh, 59 struct buffer_head **bh,
60 int (*validate)(struct super_block *sb, 60 int (*validate)(struct super_block *sb,
61 struct buffer_head *bh)) 61 struct buffer_head *bh))
@@ -68,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
68 goto bail; 68 goto bail;
69 } 69 }
70 70
71 status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate); 71 status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate);
72 72
73bail: 73bail:
74 return status; 74 return status;
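
Every caller converts mechanically to the new prototypes; for inode-backed metadata the INODE_CACHE() wrapper names the cache, as the dir.c hunks further down repeat many times. The pattern, sketched with the directory-block validator:

	/* Before: the cache owner was implicitly the inode. */
	ret = ocfs2_read_block(dir, blkno, &bh, ocfs2_validate_dir_block);

	/* After: the owner is explicit, so a refcount tree can pass &tree->rf_ci
	 * through the very same I/O path. */
	ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &bh,
			       ocfs2_validate_dir_block);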
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d04611..c452d116b892 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
966} 966}
967#endif /* CONFIG_DEBUG_FS */ 967#endif /* CONFIG_DEBUG_FS */
968 968
969static struct file_operations o2hb_debug_fops = { 969static const struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open, 970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release, 971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read, 972 .read = o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 96df5416993e..1cd2934de615 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
111 define_mask(EXPORT), 111 define_mask(EXPORT),
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT),
114 define_mask(ERROR), 115 define_mask(ERROR),
115 define_mask(NOTICE), 116 define_mask(NOTICE),
116 define_mask(KTHREAD), 117 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 696c32e50716..9b4d11726cf2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
113#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ 113#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
116/* bits that are infrequently given and frequently matched in the high word */ 117/* bits that are infrequently given and frequently matched in the high word */
117#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
 118#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 119#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
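
With the bit defined here and registered in mlog_attrs[] above, refcount-tree code can tag its debug output and have it toggled at runtime through the usual masklog knobs. A hypothetical call site (the message text is invented for illustration):

	mlog(ML_REFCOUNT, "CoW clusters %u-%u of inode %llu\n",
	     cpos, cpos + clen - 1,
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);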
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index f8424874fa07..da794bc07a6c 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -163,7 +163,7 @@ static void nst_seq_stop(struct seq_file *seq, void *v)
163{ 163{
164} 164}
165 165
166static struct seq_operations nst_seq_ops = { 166static const struct seq_operations nst_seq_ops = {
167 .start = nst_seq_start, 167 .start = nst_seq_start,
168 .next = nst_seq_next, 168 .next = nst_seq_next,
169 .stop = nst_seq_stop, 169 .stop = nst_seq_stop,
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
207 return seq_release_private(inode, file); 207 return seq_release_private(inode, file);
208} 208}
209 209
210static struct file_operations nst_seq_fops = { 210static const struct file_operations nst_seq_fops = {
211 .open = nst_fop_open, 211 .open = nst_fop_open,
212 .read = seq_read, 212 .read = seq_read,
213 .llseek = seq_lseek, 213 .llseek = seq_lseek,
@@ -344,7 +344,7 @@ static void sc_seq_stop(struct seq_file *seq, void *v)
344{ 344{
345} 345}
346 346
347static struct seq_operations sc_seq_ops = { 347static const struct seq_operations sc_seq_ops = {
348 .start = sc_seq_start, 348 .start = sc_seq_start,
349 .next = sc_seq_next, 349 .next = sc_seq_next,
350 .stop = sc_seq_stop, 350 .stop = sc_seq_stop,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
388 return seq_release_private(inode, file); 388 return seq_release_private(inode, file);
389} 389}
390 390
391static struct file_operations sc_seq_fops = { 391static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 392 .open = sc_fop_open,
393 .read = seq_read, 393 .read = seq_read,
394 .llseek = seq_lseek, 394 .llseek = seq_lseek,
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 2f28b7de2c8d..b4957c7d9fe2 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
85 goto bail; 85 goto bail;
86 } 86 }
87 87
88 /*
89 * If the last lookup failed to create dentry lock, let us
90 * redo it.
91 */
92 if (!dentry->d_fsdata) {
93 mlog(0, "Inode %llu doesn't have dentry lock, "
94 "returning false\n",
95 (unsigned long long)OCFS2_I(inode)->ip_blkno);
96 goto bail;
97 }
98
88 ret = 1; 99 ret = 1;
89 100
90bail: 101bail:
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b358f3bf896d..28c3ec238796 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -176,7 +176,7 @@ static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
176 struct ocfs2_dx_root_block *dx_root; 176 struct ocfs2_dx_root_block *dx_root;
177 struct ocfs2_dir_block_trailer *trailer; 177 struct ocfs2_dir_block_trailer *trailer;
178 178
179 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 179 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
180 OCFS2_JOURNAL_ACCESS_WRITE); 180 OCFS2_JOURNAL_ACCESS_WRITE);
181 if (ret) { 181 if (ret) {
182 mlog_errno(ret); 182 mlog_errno(ret);
@@ -564,7 +564,8 @@ static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
564 int ret; 564 int ret;
565 struct buffer_head *tmp = *bh; 565 struct buffer_head *tmp = *bh;
566 566
567 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block); 567 ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
568 ocfs2_validate_dir_block);
568 if (ret) { 569 if (ret) {
569 mlog_errno(ret); 570 mlog_errno(ret);
570 goto out; 571 goto out;
@@ -622,7 +623,8 @@ static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
622 u64 blkno = le64_to_cpu(di->i_dx_root); 623 u64 blkno = le64_to_cpu(di->i_dx_root);
623 struct buffer_head *tmp = *dx_root_bh; 624 struct buffer_head *tmp = *dx_root_bh;
624 625
625 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root); 626 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
627 ocfs2_validate_dx_root);
626 628
627 /* If ocfs2_read_block() got us a new bh, pass it up. */ 629 /* If ocfs2_read_block() got us a new bh, pass it up. */
628 if (!ret && !*dx_root_bh) 630 if (!ret && !*dx_root_bh)
@@ -662,7 +664,8 @@ static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
662 int ret; 664 int ret;
663 struct buffer_head *tmp = *dx_leaf_bh; 665 struct buffer_head *tmp = *dx_leaf_bh;
664 666
665 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf); 667 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
668 ocfs2_validate_dx_leaf);
666 669
667 /* If ocfs2_read_block() got us a new bh, pass it up. */ 670 /* If ocfs2_read_block() got us a new bh, pass it up. */
668 if (!ret && !*dx_leaf_bh) 671 if (!ret && !*dx_leaf_bh)
@@ -680,7 +683,7 @@ static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
680{ 683{
681 int ret; 684 int ret;
682 685
683 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0, 686 ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
684 ocfs2_validate_dx_leaf); 687 ocfs2_validate_dx_leaf);
685 if (ret) 688 if (ret)
686 mlog_errno(ret); 689 mlog_errno(ret);
@@ -802,7 +805,8 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
802 struct ocfs2_extent_rec *rec = NULL; 805 struct ocfs2_extent_rec *rec = NULL;
803 806
804 if (el->l_tree_depth) { 807 if (el->l_tree_depth) {
805 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh); 808 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
809 &eb_bh);
806 if (ret) { 810 if (ret) {
807 mlog_errno(ret); 811 mlog_errno(ret);
808 goto out; 812 goto out;
@@ -1133,7 +1137,8 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
1133 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1137 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1134 access = ocfs2_journal_access_di; 1138 access = ocfs2_journal_access_di;
1135 1139
1136 ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1140 ret = access(handle, INODE_CACHE(dir), de_bh,
1141 OCFS2_JOURNAL_ACCESS_WRITE);
1137 if (ret) { 1142 if (ret) {
1138 mlog_errno(ret); 1143 mlog_errno(ret);
1139 goto out; 1144 goto out;
@@ -1176,7 +1181,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1176 goto bail; 1181 goto bail;
1177 } 1182 }
1178 if (de == de_del) { 1183 if (de == de_del) {
1179 status = access(handle, dir, bh, 1184 status = access(handle, INODE_CACHE(dir), bh,
1180 OCFS2_JOURNAL_ACCESS_WRITE); 1185 OCFS2_JOURNAL_ACCESS_WRITE);
1181 if (status < 0) { 1186 if (status < 0) {
1182 status = -EIO; 1187 status = -EIO;
@@ -1326,7 +1331,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1326 * the entry count needs to be updated. Also, we might be 1331 * the entry count needs to be updated. Also, we might be
1327 * adding to the start of the free list. 1332 * adding to the start of the free list.
1328 */ 1333 */
1329 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1334 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1330 OCFS2_JOURNAL_ACCESS_WRITE); 1335 OCFS2_JOURNAL_ACCESS_WRITE);
1331 if (ret) { 1336 if (ret) {
1332 mlog_errno(ret); 1337 mlog_errno(ret);
@@ -1334,7 +1339,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1334 } 1339 }
1335 1340
1336 if (!ocfs2_dx_root_inline(dx_root)) { 1341 if (!ocfs2_dx_root_inline(dx_root)) {
1337 ret = ocfs2_journal_access_dl(handle, dir, 1342 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
1338 lookup->dl_dx_leaf_bh, 1343 lookup->dl_dx_leaf_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE); 1344 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) { 1345 if (ret) {
@@ -1493,7 +1498,7 @@ static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1493 int ret; 1498 int ret;
1494 struct ocfs2_dx_leaf *dx_leaf; 1499 struct ocfs2_dx_leaf *dx_leaf;
1495 1500
1496 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, 1501 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
1497 OCFS2_JOURNAL_ACCESS_WRITE); 1502 OCFS2_JOURNAL_ACCESS_WRITE);
1498 if (ret) { 1503 if (ret) {
1499 mlog_errno(ret); 1504 mlog_errno(ret);
@@ -1523,7 +1528,7 @@ static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
1523 struct ocfs2_dx_root_block *dx_root; 1528 struct ocfs2_dx_root_block *dx_root;
1524 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 1529 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1525 1530
1526 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1531 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1527 OCFS2_JOURNAL_ACCESS_WRITE); 1532 OCFS2_JOURNAL_ACCESS_WRITE);
1528 if (ret) { 1533 if (ret) {
1529 mlog_errno(ret); 1534 mlog_errno(ret);
@@ -1645,11 +1650,13 @@ int __ocfs2_add_entry(handle_t *handle,
1645 */ 1650 */
1646 if (ocfs2_free_list_at_root(lookup)) { 1651 if (ocfs2_free_list_at_root(lookup)) {
1647 bh = lookup->dl_dx_root_bh; 1652 bh = lookup->dl_dx_root_bh;
1648 retval = ocfs2_journal_access_dr(handle, dir, bh, 1653 retval = ocfs2_journal_access_dr(handle,
1654 INODE_CACHE(dir), bh,
1649 OCFS2_JOURNAL_ACCESS_WRITE); 1655 OCFS2_JOURNAL_ACCESS_WRITE);
1650 } else { 1656 } else {
1651 bh = lookup->dl_prev_leaf_bh; 1657 bh = lookup->dl_prev_leaf_bh;
1652 retval = ocfs2_journal_access_db(handle, dir, bh, 1658 retval = ocfs2_journal_access_db(handle,
1659 INODE_CACHE(dir), bh,
1653 OCFS2_JOURNAL_ACCESS_WRITE); 1660 OCFS2_JOURNAL_ACCESS_WRITE);
1654 } 1661 }
1655 if (retval) { 1662 if (retval) {
@@ -1700,11 +1707,13 @@ int __ocfs2_add_entry(handle_t *handle,
1700 } 1707 }
1701 1708
1702 if (insert_bh == parent_fe_bh) 1709 if (insert_bh == parent_fe_bh)
1703 status = ocfs2_journal_access_di(handle, dir, 1710 status = ocfs2_journal_access_di(handle,
1711 INODE_CACHE(dir),
1704 insert_bh, 1712 insert_bh,
1705 OCFS2_JOURNAL_ACCESS_WRITE); 1713 OCFS2_JOURNAL_ACCESS_WRITE);
1706 else { 1714 else {
1707 status = ocfs2_journal_access_db(handle, dir, 1715 status = ocfs2_journal_access_db(handle,
1716 INODE_CACHE(dir),
1708 insert_bh, 1717 insert_bh,
1709 OCFS2_JOURNAL_ACCESS_WRITE); 1718 OCFS2_JOURNAL_ACCESS_WRITE);
1710 1719
@@ -2280,7 +2289,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2280 struct ocfs2_inline_data *data = &di->id2.i_data; 2289 struct ocfs2_inline_data *data = &di->id2.i_data;
2281 unsigned int size = le16_to_cpu(data->id_count); 2290 unsigned int size = le16_to_cpu(data->id_count);
2282 2291
2283 ret = ocfs2_journal_access_di(handle, inode, di_bh, 2292 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2284 OCFS2_JOURNAL_ACCESS_WRITE); 2293 OCFS2_JOURNAL_ACCESS_WRITE);
2285 if (ret) { 2294 if (ret) {
2286 mlog_errno(ret); 2295 mlog_errno(ret);
@@ -2332,9 +2341,9 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2332 goto bail; 2341 goto bail;
2333 } 2342 }
2334 2343
2335 ocfs2_set_new_buffer_uptodate(inode, new_bh); 2344 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2336 2345
2337 status = ocfs2_journal_access_db(handle, inode, new_bh, 2346 status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
2338 OCFS2_JOURNAL_ACCESS_CREATE); 2347 OCFS2_JOURNAL_ACCESS_CREATE);
2339 if (status < 0) { 2348 if (status < 0) {
2340 mlog_errno(status); 2349 mlog_errno(status);
@@ -2418,9 +2427,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2418 ret = -EIO; 2427 ret = -EIO;
2419 goto out; 2428 goto out;
2420 } 2429 }
2421 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh); 2430 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
2422 2431
2423 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 2432 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
2424 OCFS2_JOURNAL_ACCESS_CREATE); 2433 OCFS2_JOURNAL_ACCESS_CREATE);
2425 if (ret < 0) { 2434 if (ret < 0) {
2426 mlog_errno(ret); 2435 mlog_errno(ret);
@@ -2454,7 +2463,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2454 if (ret) 2463 if (ret)
2455 mlog_errno(ret); 2464 mlog_errno(ret);
2456 2465
2457 ret = ocfs2_journal_access_di(handle, dir, di_bh, 2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2458 OCFS2_JOURNAL_ACCESS_CREATE); 2467 OCFS2_JOURNAL_ACCESS_CREATE);
2459 if (ret) { 2468 if (ret) {
2460 mlog_errno(ret); 2469 mlog_errno(ret);
@@ -2495,9 +2504,9 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2495 } 2504 }
2496 dx_leaves[i] = bh; 2505 dx_leaves[i] = bh;
2497 2506
2498 ocfs2_set_new_buffer_uptodate(dir, bh); 2507 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
2499 2508
2500 ret = ocfs2_journal_access_dl(handle, dir, bh, 2509 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
2501 OCFS2_JOURNAL_ACCESS_CREATE); 2510 OCFS2_JOURNAL_ACCESS_CREATE);
2502 if (ret < 0) { 2511 if (ret < 0) {
2503 mlog_errno(ret); 2512 mlog_errno(ret);
@@ -2582,7 +2591,6 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2582{ 2591{
2583 int ret; 2592 int ret;
2584 u64 phys_blkno; 2593 u64 phys_blkno;
2585 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2586 2594
2587 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, 2595 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2588 num_dx_leaves, &phys_blkno); 2596 num_dx_leaves, &phys_blkno);
@@ -2591,7 +2599,7 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2591 goto out; 2599 goto out;
2592 } 2600 }
2593 2601
2594 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0, 2602 ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
2595 meta_ac); 2603 meta_ac);
2596 if (ret) 2604 if (ret)
2597 mlog_errno(ret); 2605 mlog_errno(ret);
@@ -2895,7 +2903,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2895 struct ocfs2_extent_tree dx_et; 2903 struct ocfs2_extent_tree dx_et;
2896 int did_quota = 0, bytes_allocated = 0; 2904 int did_quota = 0, bytes_allocated = 0;
2897 2905
2898 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2906 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
2899 2907
2900 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2908 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2901 dx_alloc = 0; 2909 dx_alloc = 0;
@@ -3005,9 +3013,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3005 goto out_commit; 3013 goto out_commit;
3006 } 3014 }
3007 3015
3008 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); 3016 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
3009 3017
3010 ret = ocfs2_journal_access_db(handle, dir, dirdata_bh, 3018 ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
3011 OCFS2_JOURNAL_ACCESS_CREATE); 3019 OCFS2_JOURNAL_ACCESS_CREATE);
3012 if (ret) { 3020 if (ret) {
3013 mlog_errno(ret); 3021 mlog_errno(ret);
@@ -3060,7 +3068,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3060 * We let the later dirent insert modify c/mtime - to the user 3068 * We let the later dirent insert modify c/mtime - to the user
3061 * the data hasn't changed. 3069 * the data hasn't changed.
3062 */ 3070 */
3063 ret = ocfs2_journal_access_di(handle, dir, di_bh, 3071 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
3064 OCFS2_JOURNAL_ACCESS_CREATE); 3072 OCFS2_JOURNAL_ACCESS_CREATE);
3065 if (ret) { 3073 if (ret) {
3066 mlog_errno(ret); 3074 mlog_errno(ret);
@@ -3085,7 +3093,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3085 * This should never fail as our extent list is empty and all 3093 * This should never fail as our extent list is empty and all
3086 * related blocks have been journaled already. 3094 * related blocks have been journaled already.
3087 */ 3095 */
3088 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, 3096 ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
3089 0, NULL); 3097 0, NULL);
3090 if (ret) { 3098 if (ret) {
3091 mlog_errno(ret); 3099 mlog_errno(ret);
@@ -3117,8 +3125,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3117 ocfs2_dx_dir_index_root_block(dir, dx_root_bh, 3125 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3118 dirdata_bh); 3126 dirdata_bh);
3119 } else { 3127 } else {
3120 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); 3128 ocfs2_init_dx_root_extent_tree(&dx_et,
3121 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, 3129 INODE_CACHE(dir),
3130 dx_root_bh);
3131 ret = ocfs2_insert_extent(handle, &dx_et, 0,
3122 dx_insert_blkno, 1, 0, NULL); 3132 dx_insert_blkno, 1, 0, NULL);
3123 if (ret) 3133 if (ret)
3124 mlog_errno(ret); 3134 mlog_errno(ret);
@@ -3138,7 +3148,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 } 3148 }
3139 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); 3149 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
3140 3150
3141 ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, 3151 ret = ocfs2_insert_extent(handle, &et, 1,
3142 blkno, len, 0, NULL); 3152 blkno, len, 0, NULL);
3143 if (ret) { 3153 if (ret) {
3144 mlog_errno(ret); 3154 mlog_errno(ret);
@@ -3337,8 +3347,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3337 spin_lock(&OCFS2_I(dir)->ip_lock); 3347 spin_lock(&OCFS2_I(dir)->ip_lock);
3338 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { 3348 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
3339 spin_unlock(&OCFS2_I(dir)->ip_lock); 3349 spin_unlock(&OCFS2_I(dir)->ip_lock);
3340 ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); 3350 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
3341 num_free_extents = ocfs2_num_free_extents(osb, dir, &et); 3351 parent_fe_bh);
3352 num_free_extents = ocfs2_num_free_extents(osb, &et);
3342 if (num_free_extents < 0) { 3353 if (num_free_extents < 0) {
3343 status = num_free_extents; 3354 status = num_free_extents;
3344 mlog_errno(status); 3355 mlog_errno(status);
@@ -3387,9 +3398,9 @@ do_extend:
3387 goto bail; 3398 goto bail;
3388 } 3399 }
3389 3400
3390 ocfs2_set_new_buffer_uptodate(dir, new_bh); 3401 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
3391 3402
3392 status = ocfs2_journal_access_db(handle, dir, new_bh, 3403 status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
3393 OCFS2_JOURNAL_ACCESS_CREATE); 3404 OCFS2_JOURNAL_ACCESS_CREATE);
3394 if (status < 0) { 3405 if (status < 0) {
3395 mlog_errno(status); 3406 mlog_errno(status);
@@ -3829,7 +3840,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3829 (unsigned long long)OCFS2_I(dir)->ip_blkno, 3840 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3830 (unsigned long long)leaf_blkno, insert_hash); 3841 (unsigned long long)leaf_blkno, insert_hash);
3831 3842
3832 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 3843 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
3833 3844
3834 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3845 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3835 /* 3846 /*
@@ -3885,7 +3896,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3885 } 3896 }
3886 did_quota = 1; 3897 did_quota = 1;
3887 3898
3888 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, 3899 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
3889 OCFS2_JOURNAL_ACCESS_WRITE); 3900 OCFS2_JOURNAL_ACCESS_WRITE);
3890 if (ret) { 3901 if (ret) {
3891 mlog_errno(ret); 3902 mlog_errno(ret);
@@ -3949,7 +3960,8 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3949 } 3960 }
3950 3961
3951 for (i = 0; i < num_dx_leaves; i++) { 3962 for (i = 0; i < num_dx_leaves; i++) {
3952 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i], 3963 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3964 orig_dx_leaves[i],
3953 OCFS2_JOURNAL_ACCESS_WRITE); 3965 OCFS2_JOURNAL_ACCESS_WRITE);
3954 if (ret) { 3966 if (ret) {
3955 mlog_errno(ret); 3967 mlog_errno(ret);
@@ -4165,7 +4177,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4165 * failure to add the dx_root_bh to the journal won't result 4177 * failure to add the dx_root_bh to the journal won't result
 4166 * in us losing clusters. 4178 * in us losing clusters.
4167 */ 4179 */
4168 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 4180 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
4169 OCFS2_JOURNAL_ACCESS_WRITE); 4181 OCFS2_JOURNAL_ACCESS_WRITE);
4170 if (ret) { 4182 if (ret) {
4171 mlog_errno(ret); 4183 mlog_errno(ret);
@@ -4207,9 +4219,8 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4207 4219
4208 /* This should never fail considering we start with an empty 4220 /* This should never fail considering we start with an empty
4209 * dx_root. */ 4221 * dx_root. */
4210 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 4222 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4211 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, 4223 ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
4212 insert_blkno, 1, 0, NULL);
4213 if (ret) 4224 if (ret)
4214 mlog_errno(ret); 4225 mlog_errno(ret);
4215 did_quota = 0; 4226 did_quota = 0;
@@ -4469,7 +4480,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4469 goto out_unlock; 4480 goto out_unlock;
4470 } 4481 }
4471 4482
4472 ret = ocfs2_journal_access_di(handle, dir, di_bh, 4483 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
4473 OCFS2_JOURNAL_ACCESS_WRITE); 4484 OCFS2_JOURNAL_ACCESS_WRITE);
4474 if (ret) { 4485 if (ret) {
4475 mlog_errno(ret); 4486 mlog_errno(ret);
@@ -4532,7 +4543,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4532 if (ocfs2_dx_root_inline(dx_root)) 4543 if (ocfs2_dx_root_inline(dx_root))
4533 goto remove_index; 4544 goto remove_index;
4534 4545
4535 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 4546 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4536 4547
4537 /* XXX: What if dr_clusters is too large? */ 4548 /* XXX: What if dr_clusters is too large? */
4538 while (le32_to_cpu(dx_root->dr_clusters)) { 4549 while (le32_to_cpu(dx_root->dr_clusters)) {
@@ -4565,7 +4576,7 @@ remove_index:
4565 goto out; 4576 goto out;
4566 } 4577 }
4567 4578
4568 ocfs2_remove_from_cache(dir, dx_root_bh); 4579 ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
4569out: 4580out:
4570 ocfs2_schedule_truncate_log_flush(osb, 1); 4581 ocfs2_schedule_truncate_log_flush(osb, 1);
4571 ocfs2_run_deallocs(osb, &dealloc); 4582 ocfs2_run_deallocs(osb, &dealloc);
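
Beyond the INODE_CACHE() substitutions, these dir.c hunks show ocfs2_insert_extent() shedding its osb and inode parameters: the extent tree is initialized with its cache owner and can recover that context itself. The shape of the conversion, assembled from the hunks above:

	/* Old: every insert repeated context the extent tree already knew. */
	ret = ocfs2_insert_extent(osb, handle, dir, &et, cpos, blkno, len,
				  0, meta_ac);

	/* New: the owner travels inside the tree from initialization onward. */
	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
	ret = ocfs2_insert_extent(handle, &et, cpos, blkno, len, 0, meta_ac);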
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 81eff8e58322..01cf8cc3d286 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 75997b4deaf3..ca96bce50e18 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index df52f706f669..42b0bad7a612 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -27,7 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/utsname.h>
31#include <linux/sysctl.h> 30#include <linux/sysctl.h>
32#include <linux/spinlock.h> 31#include <linux/spinlock.h>
33#include <linux/debugfs.h> 32#include <linux/debugfs.h>
@@ -479,7 +478,7 @@ bail:
479 return -ENOMEM; 478 return -ENOMEM;
480} 479}
481 480
482static struct file_operations debug_purgelist_fops = { 481static const struct file_operations debug_purgelist_fops = {
483 .open = debug_purgelist_open, 482 .open = debug_purgelist_open,
484 .release = debug_buffer_release, 483 .release = debug_buffer_release,
485 .read = debug_buffer_read, 484 .read = debug_buffer_read,
@@ -539,7 +538,7 @@ bail:
539 return -ENOMEM; 538 return -ENOMEM;
540} 539}
541 540
542static struct file_operations debug_mle_fops = { 541static const struct file_operations debug_mle_fops = {
543 .open = debug_mle_open, 542 .open = debug_mle_open,
544 .release = debug_buffer_release, 543 .release = debug_buffer_release,
545 .read = debug_buffer_read, 544 .read = debug_buffer_read,
@@ -683,7 +682,7 @@ static int lockres_seq_show(struct seq_file *s, void *v)
683 return 0; 682 return 0;
684} 683}
685 684
686static struct seq_operations debug_lockres_ops = { 685static const struct seq_operations debug_lockres_ops = {
687 .start = lockres_seq_start, 686 .start = lockres_seq_start,
688 .stop = lockres_seq_stop, 687 .stop = lockres_seq_stop,
689 .next = lockres_seq_next, 688 .next = lockres_seq_next,
@@ -742,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
742 return seq_release_private(inode, file); 741 return seq_release_private(inode, file);
743} 742}
744 743
745static struct file_operations debug_lockres_fops = { 744static const struct file_operations debug_lockres_fops = {
746 .open = debug_lockres_open, 745 .open = debug_lockres_open,
747 .release = debug_lockres_release, 746 .release = debug_lockres_release,
748 .read = seq_read, 747 .read = seq_read,
@@ -926,7 +925,7 @@ bail:
926 return -ENOMEM; 925 return -ENOMEM;
927} 926}
928 927
929static struct file_operations debug_state_fops = { 928static const struct file_operations debug_state_fops = {
930 .open = debug_state_open, 929 .open = debug_state_open,
931 .release = debug_buffer_release, 930 .release = debug_buffer_release,
932 .read = debug_buffer_read, 931 .read = debug_buffer_read,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4d9e6b288dd8..0334000676d3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h> 31#include <linux/init.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/delay.h> 33#include <linux/delay.h>
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
325} 325}
326 326
327static struct backing_dev_info dlmfs_backing_dev_info = { 327static struct backing_dev_info dlmfs_backing_dev_info = {
328 .name = "ocfs2-dlmfs",
328 .ra_pages = 0, /* No readahead */ 329 .ra_pages = 0, /* No readahead */
329 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 330 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
330}; 331};
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83a9f2972ac8..437698e9465f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f8b653fcd4dd..83bcaf266b35 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 43e6e3280569..d9fa3d22e17c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d490b66ad9d7..52ec020ea78b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
@@ -212,14 +211,18 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
212 spin_lock(&dlm->spinlock); 211 spin_lock(&dlm->spinlock);
213 } 212 }
214 213
214 spin_lock(&res->spinlock);
215 if (!list_empty(&res->purge)) { 215 if (!list_empty(&res->purge)) {
216 mlog(0, "removing lockres %.*s:%p from purgelist, " 216 mlog(0, "removing lockres %.*s:%p from purgelist, "
217 "master = %d\n", res->lockname.len, res->lockname.name, 217 "master = %d\n", res->lockname.len, res->lockname.name,
218 res, master); 218 res, master);
219 list_del_init(&res->purge); 219 list_del_init(&res->purge);
220 spin_unlock(&res->spinlock);
220 dlm_lockres_put(res); 221 dlm_lockres_put(res);
221 dlm->purge_count--; 222 dlm->purge_count--;
222 } 223 } else
224 spin_unlock(&res->spinlock);
225
223 __dlm_unhash_lockres(res); 226 __dlm_unhash_lockres(res);
224 227
225 /* lockres is not in the hash now. drop the flag and wake up 228 /* lockres is not in the hash now. drop the flag and wake up
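
The dlmthread.c hunk closes a race on the purge list: the list_empty()/list_del_init() pair now runs under res->spinlock, and the lock is dropped before dlm_lockres_put(), presumably because the put may drop the final reference and free res together with its spinlock. Reduced to its shape (the purge_count bookkeeping is omitted):

	spin_lock(&res->spinlock);
	if (!list_empty(&res->purge)) {
		list_del_init(&res->purge);
		spin_unlock(&res->spinlock);
		dlm_lockres_put(res);	/* may free res, so the lock is gone first */
	} else
		spin_unlock(&res->spinlock);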
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 756f5b0998e0..00f53b2aea76 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 110bb57c46ab..0d38d67194cb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -53,6 +53,7 @@
53#include "super.h" 53#include "super.h"
54#include "uptodate.h" 54#include "uptodate.h"
55#include "quota.h" 55#include "quota.h"
56#include "refcounttree.h"
56 57
57#include "buffer_head_io.h" 58#include "buffer_head_io.h"
58 59
@@ -110,6 +111,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
110 111
111static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); 112static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
112 113
114static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
115 int new_level);
116static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
117 int blocking);
118
113#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) 119#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
114 120
115/* This aids in debugging situations where a bad LVB might be involved. */ 121/* This aids in debugging situations where a bad LVB might be involved. */
@@ -278,6 +284,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
278 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, 284 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
279}; 285};
280 286
287static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
288 .check_downconvert = ocfs2_check_refcount_downconvert,
289 .downconvert_worker = ocfs2_refcount_convert_worker,
290 .flags = 0,
291};
292
281static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 293static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
282{ 294{
283 return lockres->l_type == OCFS2_LOCK_TYPE_META || 295 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -306,6 +318,12 @@ static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_re
306 return (struct ocfs2_mem_dqinfo *)lockres->l_priv; 318 return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
307} 319}
308 320
321static inline struct ocfs2_refcount_tree *
322ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
323{
324 return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
325}
326
309static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) 327static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
310{ 328{
311 if (lockres->l_ops->get_osb) 329 if (lockres->l_ops->get_osb)
@@ -693,6 +711,17 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
693 info); 711 info);
694} 712}
695 713
714void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
715 struct ocfs2_super *osb, u64 ref_blkno,
716 unsigned int generation)
717{
718 ocfs2_lock_res_init_once(lockres);
719 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
720 generation, lockres->l_name);
721 ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
722 &ocfs2_refcount_block_lops, osb);
723}
724
696void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 725void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
697{ 726{
698 mlog_entry_void(); 727 mlog_entry_void();
@@ -1548,8 +1577,10 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1548 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1577 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1549 write ? "EXMODE" : "PRMODE"); 1578 write ? "EXMODE" : "PRMODE");
1550 1579
1551 if (ocfs2_mount_local(osb)) 1580 if (ocfs2_mount_local(osb)) {
1581 mlog_exit(0);
1552 return 0; 1582 return 0;
1583 }
1553 1584
1554 lockres = &OCFS2_I(inode)->ip_rw_lockres; 1585 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1555 1586
@@ -2127,7 +2158,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2127 2158
2128 /* This will discard any caching information we might have had 2159 /* This will discard any caching information we might have had
2129 * for the inode metadata. */ 2160 * for the inode metadata. */
2130 ocfs2_metadata_cache_purge(inode); 2161 ocfs2_metadata_cache_purge(INODE_CACHE(inode));
2131 2162
2132 ocfs2_extent_map_trunc(inode, 0); 2163 ocfs2_extent_map_trunc(inode, 0);
2133 2164
@@ -3009,6 +3040,7 @@ static void ocfs2_unlock_ast(void *opaque, int error)
3009 "unlock_action %d\n", error, lockres->l_name, 3040 "unlock_action %d\n", error, lockres->l_name,
3010 lockres->l_unlock_action); 3041 lockres->l_unlock_action);
3011 spin_unlock_irqrestore(&lockres->l_lock, flags); 3042 spin_unlock_irqrestore(&lockres->l_lock, flags);
3043 mlog_exit_void();
3012 return; 3044 return;
3013 } 3045 }
3014 3046
@@ -3495,11 +3527,11 @@ out:
3495 return UNBLOCK_CONTINUE; 3527 return UNBLOCK_CONTINUE;
3496} 3528}
3497 3529
3498static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, 3530static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
3499 int new_level) 3531 struct ocfs2_lock_res *lockres,
3532 int new_level)
3500{ 3533{
3501 struct inode *inode = ocfs2_lock_res_inode(lockres); 3534 int checkpointed = ocfs2_ci_fully_checkpointed(ci);
3502 int checkpointed = ocfs2_inode_fully_checkpointed(inode);
3503 3535
3504 BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); 3536 BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
3505 BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); 3537 BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
@@ -3507,10 +3539,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
3507 if (checkpointed) 3539 if (checkpointed)
3508 return 1; 3540 return 1;
3509 3541
3510 ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb)); 3542 ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
3511 return 0; 3543 return 0;
3512} 3544}
3513 3545
3546static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
3547 int new_level)
3548{
3549 struct inode *inode = ocfs2_lock_res_inode(lockres);
3550
3551 return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
3552}
3553
3514static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) 3554static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
3515{ 3555{
3516 struct inode *inode = ocfs2_lock_res_inode(lockres); 3556 struct inode *inode = ocfs2_lock_res_inode(lockres);
@@ -3640,6 +3680,26 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3640 return UNBLOCK_CONTINUE_POST; 3680 return UNBLOCK_CONTINUE_POST;
3641} 3681}
3642 3682
3683static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
3684 int new_level)
3685{
3686 struct ocfs2_refcount_tree *tree =
3687 ocfs2_lock_res_refcount_tree(lockres);
3688
3689 return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
3690}
3691
3692static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
3693 int blocking)
3694{
3695 struct ocfs2_refcount_tree *tree =
3696 ocfs2_lock_res_refcount_tree(lockres);
3697
3698 ocfs2_metadata_cache_purge(&tree->rf_ci);
3699
3700 return UNBLOCK_CONTINUE;
3701}
3702
3643static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) 3703static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3644{ 3704{
3645 struct ocfs2_qinfo_lvb *lvb; 3705 struct ocfs2_qinfo_lvb *lvb;
@@ -3752,6 +3812,37 @@ bail:
3752 return status; 3812 return status;
3753} 3813}
3754 3814
3815int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
3816{
3817 int status;
3818 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3819 struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
3820 struct ocfs2_super *osb = lockres->l_priv;
3821
3822
3823 if (ocfs2_is_hard_readonly(osb))
3824 return -EROFS;
3825
3826 if (ocfs2_mount_local(osb))
3827 return 0;
3828
3829 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
3830 if (status < 0)
3831 mlog_errno(status);
3832
3833 return status;
3834}
3835
3836void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3837{
3838 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3839 struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
3840 struct ocfs2_super *osb = lockres->l_priv;
3841
3842 if (!ocfs2_mount_local(osb))
3843 ocfs2_cluster_unlock(osb, lockres, level);
3844}
3845
3755/* 3846/*
3756 * This is the filesystem locking protocol. It provides the lock handling 3847 * This is the filesystem locking protocol. It provides the lock handling
3757 * hooks for the underlying DLM. It has a maximum version number. 3848 * hooks for the underlying DLM. It has a maximum version number.
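The two helpers above follow the same cluster-lock pattern as the existing inode and quota variants: fail fast with -EROFS on a hard-readonly mount, short-circuit to success on a local (non-clustered) mount, and otherwise take the DLM lock at PR or EX according to the ex flag. A minimal caller sketch (the function name and body are illustrative, not part of the patch):

/* Illustrative only: take the refcount tree lock EX, do work, drop it.
 * The 'ex' value passed to unlock must match the one used to lock. */
static int example_refcount_update(struct ocfs2_refcount_tree *tree)
{
	int ret;

	ret = ocfs2_refcount_lock(tree, 1);	/* 1 => DLM_LOCK_EX */
	if (ret < 0)
		return ret;			/* e.g. -EROFS if hard readonly */

	/* ... modify the refcount tree under the EX lock ... */

	ocfs2_refcount_unlock(tree, 1);
	return 0;
}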
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 7553836931de..d1ce48e1b3d6 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -101,6 +101,9 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
101struct ocfs2_mem_dqinfo; 101struct ocfs2_mem_dqinfo;
102void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, 102void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
103 struct ocfs2_mem_dqinfo *info); 103 struct ocfs2_mem_dqinfo *info);
104void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
105 struct ocfs2_super *osb, u64 ref_blkno,
106 unsigned int generation);
104void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 107void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
105int ocfs2_create_new_inode_locks(struct inode *inode); 108int ocfs2_create_new_inode_locks(struct inode *inode);
106int ocfs2_drop_inode_locks(struct inode *inode); 109int ocfs2_drop_inode_locks(struct inode *inode);
@@ -148,6 +151,9 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock);
148void ocfs2_file_unlock(struct file *file); 151void ocfs2_file_unlock(struct file *file);
149int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); 152int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
150void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); 153void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
154struct ocfs2_refcount_tree;
155int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
156void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
151 157
152 158
153void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 159void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2bb1a04d253..843db64e9d4a 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
293 struct ocfs2_extent_block *eb; 293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el; 294 struct ocfs2_extent_list *el;
295 295
296 ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); 296 ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
297 if (ret) { 297 if (ret) {
298 mlog_errno(ret); 298 mlog_errno(ret);
299 goto out; 299 goto out;
@@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
353 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block 353 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
354 * containing el. 354 * containing el.
355 */ 355 */
356static int ocfs2_figure_hole_clusters(struct inode *inode, 356int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
357 struct ocfs2_extent_list *el, 357 struct ocfs2_extent_list *el,
358 struct buffer_head *eb_bh, 358 struct buffer_head *eb_bh,
359 u32 v_cluster, 359 u32 v_cluster,
360 u32 *num_clusters) 360 u32 *num_clusters)
361{ 361{
362 int ret, i; 362 int ret, i;
363 struct buffer_head *next_eb_bh = NULL; 363 struct buffer_head *next_eb_bh = NULL;
@@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
375 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 375 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
376 goto no_more_extents; 376 goto no_more_extents;
377 377
378 ret = ocfs2_read_extent_block(inode, 378 ret = ocfs2_read_extent_block(ci,
379 le64_to_cpu(eb->h_next_leaf_blk), 379 le64_to_cpu(eb->h_next_leaf_blk),
380 &next_eb_bh); 380 &next_eb_bh);
381 if (ret) { 381 if (ret) {
@@ -428,7 +428,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
428 tree_height = le16_to_cpu(el->l_tree_depth); 428 tree_height = le16_to_cpu(el->l_tree_depth);
429 429
430 if (tree_height > 0) { 430 if (tree_height > 0) {
431 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 431 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
432 &eb_bh);
432 if (ret) { 433 if (ret) {
433 mlog_errno(ret); 434 mlog_errno(ret);
434 goto out; 435 goto out;
@@ -455,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
455 * field. 456 * field.
456 */ 457 */
457 if (hole_len) { 458 if (hole_len) {
458 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, 459 ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
460 el, eb_bh,
459 v_cluster, &len); 461 v_cluster, &len);
460 if (ret) { 462 if (ret) {
461 mlog_errno(ret); 463 mlog_errno(ret);
@@ -539,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
539 541
540int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 542int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
541 u32 *p_cluster, u32 *num_clusters, 543 u32 *p_cluster, u32 *num_clusters,
542 struct ocfs2_extent_list *el) 544 struct ocfs2_extent_list *el,
545 unsigned int *extent_flags)
543{ 546{
544 int ret = 0, i; 547 int ret = 0, i;
545 struct buffer_head *eb_bh = NULL; 548 struct buffer_head *eb_bh = NULL;
@@ -548,7 +551,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
548 u32 coff; 551 u32 coff;
549 552
550 if (el->l_tree_depth) { 553 if (el->l_tree_depth) {
551 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 554 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
555 &eb_bh);
552 if (ret) { 556 if (ret) {
553 mlog_errno(ret); 557 mlog_errno(ret);
554 goto out; 558 goto out;
@@ -590,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
590 *p_cluster = *p_cluster + coff; 594 *p_cluster = *p_cluster + coff;
591 if (num_clusters) 595 if (num_clusters)
592 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 596 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
597
598 if (extent_flags)
599 *extent_flags = rec->e_flags;
593 } 600 }
594out: 601out:
595 if (eb_bh) 602 if (eb_bh)
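ocfs2_xattr_get_clusters() now takes an optional extent_flags out-parameter so reflink-aware callers can inspect rec->e_flags; passing NULL preserves the old behaviour. A sketch of the new calling convention (variable names are illustrative):

unsigned int ext_flags = 0;
int ret;

ret = ocfs2_xattr_get_clusters(inode, v_cluster, &p_cluster,
			       &num_clusters, el, &ext_flags);
if (!ret && (ext_flags & OCFS2_EXT_REFCOUNTED)) {
	/* the cluster is shared; copy-on-write before modifying it */
}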
@@ -862,8 +869,8 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
862 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); 869 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
863 } 870 }
864 871
865 rc = ocfs2_read_blocks(inode, p_block, count, bhs + done, 872 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
866 flags, validate); 873 bhs + done, flags, validate);
867 if (rc) { 874 if (rc) {
868 mlog_errno(rc); 875 mlog_errno(rc);
869 break; 876 break;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index b7dd9731b462..e79d41c2c909 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -55,12 +55,18 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
55 55
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el); 58 struct ocfs2_extent_list *el,
59 unsigned int *extent_flags);
59 60
60int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 61int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
61 struct buffer_head *bhs[], int flags, 62 struct buffer_head *bhs[], int flags,
62 int (*validate)(struct super_block *sb, 63 int (*validate)(struct super_block *sb,
63 struct buffer_head *bh)); 64 struct buffer_head *bh));
65int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
66 struct ocfs2_extent_list *el,
67 struct buffer_head *eb_bh,
68 u32 v_cluster,
69 u32 *num_clusters);
64static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, 70static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
65 struct buffer_head **bh, 71 struct buffer_head **bh,
66 int (*validate)(struct super_block *sb, 72 int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index aa501d3f93f1..89fc8ee1f5a5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -59,6 +59,7 @@
59#include "xattr.h" 59#include "xattr.h"
60#include "acl.h" 60#include "acl.h"
61#include "quota.h" 61#include "quota.h"
62#include "refcounttree.h"
62 63
63#include "buffer_head_io.h" 64#include "buffer_head_io.h"
64 65
@@ -259,7 +260,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
259 goto out; 260 goto out;
260 } 261 }
261 262
262 ret = ocfs2_journal_access_di(handle, inode, bh, 263 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
263 OCFS2_JOURNAL_ACCESS_WRITE); 264 OCFS2_JOURNAL_ACCESS_WRITE);
264 if (ret) { 265 if (ret) {
265 mlog_errno(ret); 266 mlog_errno(ret);
@@ -334,6 +335,39 @@ out:
334 return ret; 335 return ret;
335} 336}
336 337
338static int ocfs2_cow_file_pos(struct inode *inode,
339 struct buffer_head *fe_bh,
340 u64 offset)
341{
342 int status;
343 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
344 unsigned int num_clusters = 0;
345 unsigned int ext_flags = 0;
346
347 /*
 348 * If the new offset is aligned to a cluster boundary, there is
349 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
350 * CoW either.
351 */
352 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
353 return 0;
354
355 status = ocfs2_get_clusters(inode, cpos, &phys,
356 &num_clusters, &ext_flags);
357 if (status) {
358 mlog_errno(status);
359 goto out;
360 }
361
362 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
363 goto out;
364
365 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
366
367out:
368 return status;
369}
370
337static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 371static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
338 struct inode *inode, 372 struct inode *inode,
339 struct buffer_head *fe_bh, 373 struct buffer_head *fe_bh,
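The bitmask test in ocfs2_cow_file_pos() above decides whether truncate will have to zero a partial cluster. A stand-alone illustration of that check, assuming a hypothetical 64KB cluster size (the numbers are examples, not from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long clustersize = 65536;	/* example: 64KB clusters */
	unsigned long long offsets[] = { 0x30000, 0x30800 };

	for (int i = 0; i < 2; i++) {
		unsigned long long off = offsets[i];

		if ((off & (clustersize - 1)) == 0)
			printf("%#llx: cluster-aligned, nothing to zero, no CoW\n", off);
		else
			printf("%#llx: partial cluster, CoW before zeroing\n", off);
	}
	return 0;
}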
@@ -346,6 +380,17 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
346 380
347 mlog_entry_void(); 381 mlog_entry_void();
348 382
383 /*
 384 * We need to CoW the cluster that contains the offset if it is reflinked
385 * since we will call ocfs2_zero_range_for_truncate later which will
386 * write "0" from offset to the end of the cluster.
387 */
388 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
389 if (status) {
390 mlog_errno(status);
391 return status;
392 }
393
349 /* TODO: This needs to actually orphan the inode in this 394 /* TODO: This needs to actually orphan the inode in this
350 * transaction. */ 395 * transaction. */
351 396
@@ -356,7 +401,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
356 goto out; 401 goto out;
357 } 402 }
358 403
359 status = ocfs2_journal_access_di(handle, inode, fe_bh, 404 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
360 OCFS2_JOURNAL_ACCESS_WRITE); 405 OCFS2_JOURNAL_ACCESS_WRITE);
361 if (status < 0) { 406 if (status < 0) {
362 mlog_errno(status); 407 mlog_errno(status);
@@ -486,6 +531,8 @@ bail_unlock_sem:
486 up_write(&OCFS2_I(inode)->ip_alloc_sem); 531 up_write(&OCFS2_I(inode)->ip_alloc_sem);
487 532
488bail: 533bail:
534 if (!status && OCFS2_I(inode)->ip_clusters == 0)
535 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
489 536
490 mlog_exit(status); 537 mlog_exit(status);
491 return status; 538 return status;
@@ -515,11 +562,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
515 int ret; 562 int ret;
516 struct ocfs2_extent_tree et; 563 struct ocfs2_extent_tree et;
517 564
518 ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); 565 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
519 ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, 566 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
520 clusters_to_add, mark_unwritten, 567 clusters_to_add, mark_unwritten,
521 &et, handle, 568 data_ac, meta_ac, reason_ret);
522 data_ac, meta_ac, reason_ret);
523 569
524 return ret; 570 return ret;
525} 571}
@@ -564,7 +610,7 @@ restart_all:
564 (unsigned long long)OCFS2_I(inode)->ip_blkno, 610 (unsigned long long)OCFS2_I(inode)->ip_blkno,
565 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), 611 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
566 clusters_to_add); 612 clusters_to_add);
567 ocfs2_init_dinode_extent_tree(&et, inode, bh); 613 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
568 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 614 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
569 &data_ac, &meta_ac); 615 &data_ac, &meta_ac);
570 if (status) { 616 if (status) {
@@ -593,7 +639,7 @@ restarted_transaction:
 593 /* reserve a write to the file entry early on - so that if we 639 /* reserve a write to the file entry early on - so that if we
594 * run out of credits in the allocation path, we can still 640 * run out of credits in the allocation path, we can still
595 * update i_size. */ 641 * update i_size. */
596 status = ocfs2_journal_access_di(handle, inode, bh, 642 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
597 OCFS2_JOURNAL_ACCESS_WRITE); 643 OCFS2_JOURNAL_ACCESS_WRITE);
598 if (status < 0) { 644 if (status < 0) {
599 mlog_errno(status); 645 mlog_errno(status);
@@ -1131,7 +1177,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1131 goto out; 1177 goto out;
1132 } 1178 }
1133 1179
1134 ret = ocfs2_journal_access_di(handle, inode, bh, 1180 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1135 OCFS2_JOURNAL_ACCESS_WRITE); 1181 OCFS2_JOURNAL_ACCESS_WRITE);
1136 if (ret < 0) { 1182 if (ret < 0) {
1137 mlog_errno(ret); 1183 mlog_errno(ret);
@@ -1395,7 +1441,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1395 struct address_space *mapping = inode->i_mapping; 1441 struct address_space *mapping = inode->i_mapping;
1396 struct ocfs2_extent_tree et; 1442 struct ocfs2_extent_tree et;
1397 1443
1398 ocfs2_init_dinode_extent_tree(&et, inode, di_bh); 1444 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1399 ocfs2_init_dealloc_ctxt(&dealloc); 1445 ocfs2_init_dealloc_ctxt(&dealloc);
1400 1446
1401 if (byte_len == 0) 1447 if (byte_len == 0)
@@ -1657,6 +1703,70 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1657 OCFS2_IOC_RESVSP64, &sr, change_size); 1703 OCFS2_IOC_RESVSP64, &sr, change_size);
1658} 1704}
1659 1705
1706int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
1707 size_t count)
1708{
1709 int ret = 0;
1710 unsigned int extent_flags;
1711 u32 cpos, clusters, extent_len, phys_cpos;
1712 struct super_block *sb = inode->i_sb;
1713
1714 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
1715 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
1716 return 0;
1717
1718 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1719 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1720
1721 while (clusters) {
1722 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1723 &extent_flags);
1724 if (ret < 0) {
1725 mlog_errno(ret);
1726 goto out;
1727 }
1728
1729 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
1730 ret = 1;
1731 break;
1732 }
1733
1734 if (extent_len > clusters)
1735 extent_len = clusters;
1736
1737 clusters -= extent_len;
1738 cpos += extent_len;
1739 }
1740out:
1741 return ret;
1742}
1743
1744static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
1745 loff_t pos, size_t count,
1746 int *meta_level)
1747{
1748 int ret;
1749 struct buffer_head *di_bh = NULL;
1750 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1751 u32 clusters =
1752 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
1753
1754 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1755 if (ret) {
1756 mlog_errno(ret);
1757 goto out;
1758 }
1759
1760 *meta_level = 1;
1761
1762 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
1763 if (ret)
1764 mlog_errno(ret);
1765out:
1766 brelse(di_bh);
1767 return ret;
1768}
1769
1660static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1770static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1661 loff_t *ppos, 1771 loff_t *ppos,
1662 size_t count, 1772 size_t count,
@@ -1713,6 +1823,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1713 1823
1714 end = saved_pos + count; 1824 end = saved_pos + count;
1715 1825
1826 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
1827 if (ret == 1) {
1828 ocfs2_inode_unlock(inode, meta_level);
1829 meta_level = -1;
1830
1831 ret = ocfs2_prepare_inode_for_refcount(inode,
1832 saved_pos,
1833 count,
1834 &meta_level);
1835 }
1836
1837 if (ret < 0) {
1838 mlog_errno(ret);
1839 goto out_unlock;
1840 }
1841
1716 /* 1842 /*
1717 * Skip the O_DIRECT checks if we don't need 1843 * Skip the O_DIRECT checks if we don't need
1718 * them. 1844 * them.
@@ -1759,7 +1885,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1759 *ppos = saved_pos; 1885 *ppos = saved_pos;
1760 1886
1761out_unlock: 1887out_unlock:
1762 ocfs2_inode_unlock(inode, meta_level); 1888 if (meta_level >= 0)
1889 ocfs2_inode_unlock(inode, meta_level);
1763 1890
1764out: 1891out:
1765 return ret; 1892 return ret;
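The meta_level guard at out_unlock is needed because ocfs2_prepare_inode_for_refcount() re-acquires the inode lock itself: the caller drops its lock and marks meta_level = -1 first, and only a successful relock makes it non-negative again. A condensed summary of the convention (comments illustrative):

/* meta_level as the write-prepare path proceeds:
 *   >= 0  inode cluster lock held at that level; out_unlock drops it
 *   -1    lock already released; out_unlock must not unlock again */
if (meta_level >= 0)
	ocfs2_inode_unlock(inode, meta_level);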
@@ -1871,8 +1998,7 @@ relock:
1871 goto out_dio; 1998 goto out_dio;
1872 } 1999 }
1873 } else { 2000 } else {
1874 written = generic_file_aio_write_nolock(iocb, iov, nr_segs, 2001 written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
1875 *ppos);
1876 } 2002 }
1877 2003
1878out_dio: 2004out_dio:
@@ -1880,18 +2006,21 @@ out_dio:
1880 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2006 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1881 2007
1882 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { 2008 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
1883 /* 2009 ret = filemap_fdatawrite_range(file->f_mapping, pos,
1884 * The generic write paths have handled getting data 2010 pos + count - 1);
1885 * to disk, but since we don't make use of the dirty 2011 if (ret < 0)
1886 * inode list, a manual journal commit is necessary 2012 written = ret;
1887 * here. 2013
1888 */ 2014 if (!ret && (old_size != i_size_read(inode) ||
1889 if (old_size != i_size_read(inode) || 2015 old_clusters != OCFS2_I(inode)->ip_clusters)) {
1890 old_clusters != OCFS2_I(inode)->ip_clusters) {
1891 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2016 ret = jbd2_journal_force_commit(osb->journal->j_journal);
1892 if (ret < 0) 2017 if (ret < 0)
1893 written = ret; 2018 written = ret;
1894 } 2019 }
2020
2021 if (!ret)
2022 ret = filemap_fdatawait_range(file->f_mapping, pos,
2023 pos + count - 1);
1895 } 2024 }
1896 2025
1897 /* 2026 /*
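The rewritten O_SYNC/IS_SYNC branch replaces the old manual journal-commit path with the standard three-step flush: start writeback on the byte range, force a journal commit if the inode's size or cluster count changed, then wait for the data. The same pattern in isolation (a sketch; metadata_changed stands in for the size/cluster comparison):

ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1);
if (!ret && metadata_changed)
	ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (!ret)
	ret = filemap_fdatawait_range(file->f_mapping, pos,
				      pos + count - 1);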
@@ -1991,31 +2120,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1991 2120
1992 if (ret > 0) { 2121 if (ret > 0) {
1993 unsigned long nr_pages; 2122 unsigned long nr_pages;
2123 int err;
1994 2124
1995 *ppos += ret;
1996 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2125 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1997 2126
1998 /* 2127 err = generic_write_sync(out, *ppos, ret);
1999 * If file or inode is SYNC and we actually wrote some data, 2128 if (err)
2000 * sync it. 2129 ret = err;
2001 */ 2130 else
2002 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 2131 *ppos += ret;
2003 int err;
2004
2005 mutex_lock(&inode->i_mutex);
2006 err = ocfs2_rw_lock(inode, 1);
2007 if (err < 0) {
2008 mlog_errno(err);
2009 } else {
2010 err = generic_osync_inode(inode, mapping,
2011 OSYNC_METADATA|OSYNC_DATA);
2012 ocfs2_rw_unlock(inode, 1);
2013 }
2014 mutex_unlock(&inode->i_mutex);
2015 2132
2016 if (err)
2017 ret = err;
2018 }
2019 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2133 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2020 } 2134 }
2021 2135
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 172f9fbc9fc7..d66cf4f7c70e 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -69,4 +69,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
69int ocfs2_change_file_space(struct file *file, unsigned int cmd, 69int ocfs2_change_file_space(struct file *file, unsigned int cmd,
70 struct ocfs2_space_resv *sr); 70 struct ocfs2_space_resv *sr);
71 71
72int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
73 size_t count);
72#endif /* OCFS2_FILE_H */ 74#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4dc8890ba316..0297fb8982b8 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
53#include "sysfile.h" 53#include "sysfile.h"
54#include "uptodate.h" 54#include "uptodate.h"
55#include "xattr.h" 55#include "xattr.h"
56#include "refcounttree.h"
56 57
57#include "buffer_head_io.h" 58#include "buffer_head_io.h"
58 59
@@ -562,7 +563,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
562 goto out; 563 goto out;
563 } 564 }
564 565
565 status = ocfs2_journal_access_di(handle, inode, fe_bh, 566 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
567 fe_bh,
566 OCFS2_JOURNAL_ACCESS_WRITE); 568 OCFS2_JOURNAL_ACCESS_WRITE);
567 if (status < 0) { 569 if (status < 0) {
568 mlog_errno(status); 570 mlog_errno(status);
@@ -646,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
646 } 648 }
647 649
648 /* set the inodes dtime */ 650 /* set the inodes dtime */
649 status = ocfs2_journal_access_di(handle, inode, di_bh, 651 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
650 OCFS2_JOURNAL_ACCESS_WRITE); 652 OCFS2_JOURNAL_ACCESS_WRITE);
651 if (status < 0) { 653 if (status < 0) {
652 mlog_errno(status); 654 mlog_errno(status);
@@ -662,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode,
662 goto bail_commit; 664 goto bail_commit;
663 } 665 }
664 666
665 ocfs2_remove_from_cache(inode, di_bh); 667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
666 vfs_dq_free_inode(inode); 668 vfs_dq_free_inode(inode);
667 669
668 status = ocfs2_free_dinode(handle, inode_alloc_inode, 670 status = ocfs2_free_dinode(handle, inode_alloc_inode,
@@ -781,6 +783,12 @@ static int ocfs2_wipe_inode(struct inode *inode,
781 goto bail_unlock_dir; 783 goto bail_unlock_dir;
782 } 784 }
783 785
786 status = ocfs2_remove_refcount_tree(inode, di_bh);
787 if (status < 0) {
788 mlog_errno(status);
789 goto bail_unlock_dir;
790 }
791
784 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, 792 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
785 orphan_dir_bh); 793 orphan_dir_bh);
786 if (status < 0) 794 if (status < 0)
@@ -1112,13 +1120,14 @@ void ocfs2_clear_inode(struct inode *inode)
1112 ocfs2_lock_res_free(&oi->ip_inode_lockres); 1120 ocfs2_lock_res_free(&oi->ip_inode_lockres);
1113 ocfs2_lock_res_free(&oi->ip_open_lockres); 1121 ocfs2_lock_res_free(&oi->ip_open_lockres);
1114 1122
1115 ocfs2_metadata_cache_purge(inode); 1123 ocfs2_metadata_cache_exit(INODE_CACHE(inode));
1116 1124
1117 mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, 1125 mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
1118 "Clear inode of %llu, inode has %u cache items\n", 1126 "Clear inode of %llu, inode has %u cache items\n",
1119 (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); 1127 (unsigned long long)oi->ip_blkno,
1128 INODE_CACHE(inode)->ci_num_cached);
1120 1129
1121 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), 1130 mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
1122 "Clear inode of %llu, inode has a bad flag\n", 1131 "Clear inode of %llu, inode has a bad flag\n",
1123 (unsigned long long)oi->ip_blkno); 1132 (unsigned long long)oi->ip_blkno);
1124 1133
@@ -1145,9 +1154,7 @@ void ocfs2_clear_inode(struct inode *inode)
1145 (unsigned long long)oi->ip_blkno, oi->ip_open_count); 1154 (unsigned long long)oi->ip_blkno, oi->ip_open_count);
1146 1155
1147 /* Clear all other flags. */ 1156 /* Clear all other flags. */
1148 oi->ip_flags = OCFS2_INODE_CACHE_INLINE; 1157 oi->ip_flags = 0;
1149 oi->ip_created_trans = 0;
1150 oi->ip_last_trans = 0;
1151 oi->ip_dir_start_lookup = 0; 1158 oi->ip_dir_start_lookup = 0;
1152 oi->ip_blkno = 0ULL; 1159 oi->ip_blkno = 0ULL;
1153 1160
@@ -1239,7 +1246,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1239 mlog_entry("(inode %llu)\n", 1246 mlog_entry("(inode %llu)\n",
1240 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1247 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1241 1248
1242 status = ocfs2_journal_access_di(handle, inode, bh, 1249 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1243 OCFS2_JOURNAL_ACCESS_WRITE); 1250 OCFS2_JOURNAL_ACCESS_WRITE);
1244 if (status < 0) { 1251 if (status < 0) {
1245 mlog_errno(status); 1252 mlog_errno(status);
@@ -1380,8 +1387,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
1380 int rc; 1387 int rc;
1381 struct buffer_head *tmp = *bh; 1388 struct buffer_head *tmp = *bh;
1382 1389
1383 rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp, 1390 rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
1384 flags, ocfs2_validate_inode_block); 1391 1, &tmp, flags, ocfs2_validate_inode_block);
1385 1392
1386 /* If ocfs2_read_blocks() got us a new bh, pass it up. */ 1393 /* If ocfs2_read_blocks() got us a new bh, pass it up. */
1387 if (!rc && !*bh) 1394 if (!rc && !*bh)
@@ -1394,3 +1401,56 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
1394{ 1401{
1395 return ocfs2_read_inode_block_full(inode, bh, 0); 1402 return ocfs2_read_inode_block_full(inode, bh, 0);
1396} 1403}
1404
1405
1406static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
1407{
1408 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1409
1410 return oi->ip_blkno;
1411}
1412
1413static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
1414{
1415 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1416
1417 return oi->vfs_inode.i_sb;
1418}
1419
1420static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
1421{
1422 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1423
1424 spin_lock(&oi->ip_lock);
1425}
1426
1427static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
1428{
1429 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1430
1431 spin_unlock(&oi->ip_lock);
1432}
1433
1434static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
1435{
1436 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1437
1438 mutex_lock(&oi->ip_io_mutex);
1439}
1440
1441static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
1442{
1443 struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
1444
1445 mutex_unlock(&oi->ip_io_mutex);
1446}
1447
1448const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
1449 .co_owner = ocfs2_inode_cache_owner,
1450 .co_get_super = ocfs2_inode_cache_get_super,
1451 .co_cache_lock = ocfs2_inode_cache_lock,
1452 .co_cache_unlock = ocfs2_inode_cache_unlock,
1453 .co_io_lock = ocfs2_inode_cache_io_lock,
1454 .co_io_unlock = ocfs2_inode_cache_io_unlock,
1455};
1456
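ocfs2_inode_caching_ops fills the new ocfs2_caching_operations vtable by forwarding to the inode's ip_lock spinlock and ip_io_mutex. The point of the indirection is that a non-inode object can plug in its own callbacks; a hedged sketch of a second implementor (the rf_* names follow the refcount-tree code elsewhere in this series, but the body is illustrative):

static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf =
		container_of(ci, struct ocfs2_refcount_tree, rf_ci);

	return rf->rf_blkno;
}

static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
	.co_owner	= ocfs2_refcount_cache_owner,
	/* .co_get_super, .co_cache_lock, ... forward to rf-local state */
};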
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ea71525aad41..ba4fe07b293c 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -60,12 +60,6 @@ struct ocfs2_inode_info
60 60
61 u32 ip_dir_start_lookup; 61 u32 ip_dir_start_lookup;
62 62
63 /* next two are protected by trans_inc_lock */
64 /* which transaction were we created on? Zero if none. */
65 unsigned long ip_created_trans;
66 /* last transaction we were a part of. */
67 unsigned long ip_last_trans;
68
69 struct ocfs2_caching_info ip_metadata_cache; 63 struct ocfs2_caching_info ip_metadata_cache;
70 64
71 struct ocfs2_extent_map ip_extent_map; 65 struct ocfs2_extent_map ip_extent_map;
@@ -106,8 +100,6 @@ struct ocfs2_inode_info
106#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 100#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
107/* Does someone have the file open O_DIRECT */ 101/* Does someone have the file open O_DIRECT */
108#define OCFS2_INODE_OPEN_DIRECT 0x00000040 102#define OCFS2_INODE_OPEN_DIRECT 0x00000040
109/* Indicates that the metadata cache should be used as an array. */
110#define OCFS2_INODE_CACHE_INLINE 0x00000080
111 103
112static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) 104static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
113{ 105{
@@ -120,6 +112,12 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
120extern struct kmem_cache *ocfs2_inode_cache; 112extern struct kmem_cache *ocfs2_inode_cache;
121 113
122extern const struct address_space_operations ocfs2_aops; 114extern const struct address_space_operations ocfs2_aops;
115extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
116
117static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
118{
119 return &OCFS2_I(inode)->ip_metadata_cache;
120}
123 121
124void ocfs2_clear_inode(struct inode *inode); 122void ocfs2_clear_inode(struct inode *inode);
125void ocfs2_delete_inode(struct inode *inode); 123void ocfs2_delete_inode(struct inode *inode);
@@ -172,4 +170,10 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
172/* The same, but can be passed OCFS2_BH_* flags */ 170/* The same, but can be passed OCFS2_BH_* flags */
173int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, 171int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
174 int flags); 172 int flags);
173
174static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
175{
176 return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
177}
178
175#endif /* OCFS2_INODE_H */ 179#endif /* OCFS2_INODE_H */
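INODE_CACHE() and cache_info_to_inode() are inverse accessors: one walks from the inode to its embedded caching info, the other walks back via container_of. A stand-alone demonstration of the round-trip with simplified stand-in types (not the kernel structs):

#include <stddef.h>
#include <assert.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct caching_info { int ci_num_cached; };
struct inode_info   { int ip_blkno; struct caching_info ip_metadata_cache; };

int main(void)
{
	struct inode_info oi = { .ip_blkno = 42 };
	struct caching_info *ci = &oi.ip_metadata_cache;	/* INODE_CACHE() */

	/* cache_info_to_inode() recovers the containing structure */
	assert(container_of(ci, struct inode_info, ip_metadata_cache) == &oi);
	return 0;
}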
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 467b413bec21..31fbb0619510 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -21,6 +21,7 @@
21#include "ocfs2_fs.h" 21#include "ocfs2_fs.h"
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h" 23#include "resize.h"
24#include "refcounttree.h"
24 25
25#include <linux/ext2_fs.h> 26#include <linux/ext2_fs.h>
26 27
@@ -115,6 +116,9 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
115 int status; 116 int status;
116 struct ocfs2_space_resv sr; 117 struct ocfs2_space_resv sr;
117 struct ocfs2_new_group_input input; 118 struct ocfs2_new_group_input input;
119 struct reflink_arguments args;
120 const char *old_path, *new_path;
121 bool preserve;
118 122
119 switch (cmd) { 123 switch (cmd) {
120 case OCFS2_IOC_GETFLAGS: 124 case OCFS2_IOC_GETFLAGS:
@@ -160,6 +164,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
160 return -EFAULT; 164 return -EFAULT;
161 165
162 return ocfs2_group_add(inode, &input); 166 return ocfs2_group_add(inode, &input);
167 case OCFS2_IOC_REFLINK:
168 if (copy_from_user(&args, (struct reflink_arguments *)arg,
169 sizeof(args)))
170 return -EFAULT;
171 old_path = (const char *)(unsigned long)args.old_path;
172 new_path = (const char *)(unsigned long)args.new_path;
173 preserve = (args.preserve != 0);
174
175 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
163 default: 176 default:
164 return -ENOTTY; 177 return -ENOTTY;
165 } 178 }
@@ -182,6 +195,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
182 case OCFS2_IOC_GROUP_EXTEND: 195 case OCFS2_IOC_GROUP_EXTEND:
183 case OCFS2_IOC_GROUP_ADD: 196 case OCFS2_IOC_GROUP_ADD:
184 case OCFS2_IOC_GROUP_ADD64: 197 case OCFS2_IOC_GROUP_ADD64:
198 case OCFS2_IOC_REFLINK:
185 break; 199 break;
186 default: 200 default:
187 return -ENOIOCTLCMD; 201 return -ENOIOCTLCMD;
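OCFS2_IOC_REFLINK copies a struct reflink_arguments from userspace and unpacks two u64-encoded path pointers plus a preserve flag. A sketch of the corresponding userspace call, assuming the struct layout matches the fields the handler reads here (the authoritative definition lives in ocfs2_fs.h; treat this as illustrative):

#include <sys/ioctl.h>
#include <linux/types.h>

struct reflink_arguments {		/* illustrative mirror of ocfs2_fs.h */
	__u64 old_path;
	__u64 new_path;
	__u64 preserve;
};

static int do_reflink(int fd, const char *src, const char *dst, int preserve)
{
	struct reflink_arguments args = {
		.old_path = (unsigned long)src,
		.new_path = (unsigned long)dst,
		.preserve = preserve ? 1 : 0,
	};

	return ioctl(fd, OCFS2_IOC_REFLINK, &args);	/* from ocfs2_fs.h */
}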
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c48b93ac6b65..54c16b66327e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -48,6 +48,7 @@
48#include "slot_map.h" 48#include "slot_map.h"
49#include "super.h" 49#include "super.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h"
51#include "quota.h" 52#include "quota.h"
52 53
53#include "buffer_head_io.h" 54#include "buffer_head_io.h"
@@ -554,6 +555,14 @@ static struct ocfs2_triggers eb_triggers = {
554 .ot_offset = offsetof(struct ocfs2_extent_block, h_check), 555 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
555}; 556};
556 557
558static struct ocfs2_triggers rb_triggers = {
559 .ot_triggers = {
560 .t_commit = ocfs2_commit_trigger,
561 .t_abort = ocfs2_abort_trigger,
562 },
563 .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
564};
565
557static struct ocfs2_triggers gd_triggers = { 566static struct ocfs2_triggers gd_triggers = {
558 .ot_triggers = { 567 .ot_triggers = {
559 .t_commit = ocfs2_commit_trigger, 568 .t_commit = ocfs2_commit_trigger,
@@ -601,14 +610,16 @@ static struct ocfs2_triggers dl_triggers = {
601}; 610};
602 611
603static int __ocfs2_journal_access(handle_t *handle, 612static int __ocfs2_journal_access(handle_t *handle,
604 struct inode *inode, 613 struct ocfs2_caching_info *ci,
605 struct buffer_head *bh, 614 struct buffer_head *bh,
606 struct ocfs2_triggers *triggers, 615 struct ocfs2_triggers *triggers,
607 int type) 616 int type)
608{ 617{
609 int status; 618 int status;
619 struct ocfs2_super *osb =
620 OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
610 621
611 BUG_ON(!inode); 622 BUG_ON(!ci || !ci->ci_ops);
612 BUG_ON(!handle); 623 BUG_ON(!handle);
613 BUG_ON(!bh); 624 BUG_ON(!bh);
614 625
@@ -627,15 +638,15 @@ static int __ocfs2_journal_access(handle_t *handle,
627 BUG(); 638 BUG();
628 } 639 }
629 640
630 /* Set the current transaction information on the inode so 641 /* Set the current transaction information on the ci so
 631 * that the locking code knows whether it can drop its locks 642 * that the locking code knows whether it can drop its locks
632 * on this inode or not. We're protected from the commit 643 * on this ci or not. We're protected from the commit
633 * thread updating the current transaction id until 644 * thread updating the current transaction id until
634 * ocfs2_commit_trans() because ocfs2_start_trans() took 645 * ocfs2_commit_trans() because ocfs2_start_trans() took
635 * j_trans_barrier for us. */ 646 * j_trans_barrier for us. */
636 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); 647 ocfs2_set_ci_lock_trans(osb->journal, ci);
637 648
638 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 649 ocfs2_metadata_cache_io_lock(ci);
639 switch (type) { 650 switch (type) {
640 case OCFS2_JOURNAL_ACCESS_CREATE: 651 case OCFS2_JOURNAL_ACCESS_CREATE:
641 case OCFS2_JOURNAL_ACCESS_WRITE: 652 case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -650,9 +661,9 @@ static int __ocfs2_journal_access(handle_t *handle,
650 status = -EINVAL; 661 status = -EINVAL;
 651 mlog(ML_ERROR, "Unknown access type!\n"); 662 mlog(ML_ERROR, "Unknown access type!\n");
652 } 663 }
653 if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers) 664 if (!status && ocfs2_meta_ecc(osb) && triggers)
654 jbd2_journal_set_triggers(bh, &triggers->ot_triggers); 665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
655 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 666 ocfs2_metadata_cache_io_unlock(ci);
656 667
657 if (status < 0) 668 if (status < 0)
658 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", 669 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
@@ -662,66 +673,65 @@ static int __ocfs2_journal_access(handle_t *handle,
662 return status; 673 return status;
663} 674}
664 675
665int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, 676int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
666 struct buffer_head *bh, int type) 677 struct buffer_head *bh, int type)
667{ 678{
668 return __ocfs2_journal_access(handle, inode, bh, &di_triggers, 679 return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
669 type);
670} 680}
671 681
672int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, 682int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
673 struct buffer_head *bh, int type) 683 struct buffer_head *bh, int type)
674{ 684{
675 return __ocfs2_journal_access(handle, inode, bh, &eb_triggers, 685 return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
676 type);
677} 686}
678 687
679int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, 688int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
680 struct buffer_head *bh, int type) 689 struct buffer_head *bh, int type)
681{ 690{
682 return __ocfs2_journal_access(handle, inode, bh, &gd_triggers, 691 return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
683 type); 692 type);
684} 693}
685 694
686int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 695int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
687 struct buffer_head *bh, int type) 696 struct buffer_head *bh, int type)
688{ 697{
689 return __ocfs2_journal_access(handle, inode, bh, &db_triggers, 698 return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
690 type);
691} 699}
692 700
693int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, 701int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
694 struct buffer_head *bh, int type) 702 struct buffer_head *bh, int type)
695{ 703{
696 return __ocfs2_journal_access(handle, inode, bh, &xb_triggers, 704 return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
697 type);
698} 705}
699 706
700int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, 707int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
701 struct buffer_head *bh, int type) 708 struct buffer_head *bh, int type)
702{ 709{
703 return __ocfs2_journal_access(handle, inode, bh, &dq_triggers, 710 return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
704 type);
705} 711}
706 712
707int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, 713int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
708 struct buffer_head *bh, int type) 714 struct buffer_head *bh, int type)
709{ 715{
710 return __ocfs2_journal_access(handle, inode, bh, &dr_triggers, 716 return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
711 type);
712} 717}
713 718
714int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, 719int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
715 struct buffer_head *bh, int type) 720 struct buffer_head *bh, int type)
716{ 721{
717 return __ocfs2_journal_access(handle, inode, bh, &dl_triggers, 722 return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
718 type); 723}
724
725int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
726 struct buffer_head *bh, int type)
727{
728 return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
719} 729}
720 730
721int ocfs2_journal_access(handle_t *handle, struct inode *inode, 731int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
722 struct buffer_head *bh, int type) 732 struct buffer_head *bh, int type)
723{ 733{
724 return __ocfs2_journal_access(handle, inode, bh, NULL, type); 734 return __ocfs2_journal_access(handle, ci, bh, NULL, type);
725} 735}
726 736
727int ocfs2_journal_dirty(handle_t *handle, 737int ocfs2_journal_dirty(handle_t *handle,
@@ -898,7 +908,7 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
898 ocfs2_bump_recovery_generation(fe); 908 ocfs2_bump_recovery_generation(fe);
899 909
900 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); 910 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
901 status = ocfs2_write_block(osb, bh, journal->j_inode); 911 status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode));
902 if (status < 0) 912 if (status < 0)
903 mlog_errno(status); 913 mlog_errno(status);
904 914
@@ -1642,7 +1652,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1642 ocfs2_get_recovery_generation(fe); 1652 ocfs2_get_recovery_generation(fe);
1643 1653
1644 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); 1654 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
1645 status = ocfs2_write_block(osb, bh, inode); 1655 status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
1646 if (status < 0) 1656 if (status < 0)
1647 mlog_errno(status); 1657 mlog_errno(status);
1648 1658
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2c3222aec622..3f74e09b0d80 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -90,56 +90,66 @@ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
90 return old_id; 90 return old_id;
91} 91}
92 92
93static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal, 93static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal,
94 struct inode *inode) 94 struct ocfs2_caching_info *ci)
95{ 95{
96 spin_lock(&trans_inc_lock); 96 spin_lock(&trans_inc_lock);
97 OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; 97 ci->ci_last_trans = journal->j_trans_id;
98 spin_unlock(&trans_inc_lock); 98 spin_unlock(&trans_inc_lock);
99} 99}
100 100
101/* Used to figure out whether it's safe to drop a metadata lock on an 101/* Used to figure out whether it's safe to drop a metadata lock on an
102 * inode. Returns true if all the inodes changes have been 102 * cached object. Returns true if all the object's changes have been
103 * checkpointed to disk. You should be holding the spinlock on the 103 * checkpointed to disk. You should be holding the spinlock on the
104 * metadata lock while calling this to be sure that nobody can take 104 * metadata lock while calling this to be sure that nobody can take
105 * the lock and put it on another transaction. */ 105 * the lock and put it on another transaction. */
106static inline int ocfs2_inode_fully_checkpointed(struct inode *inode) 106static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci)
107{ 107{
108 int ret; 108 int ret;
109 struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal; 109 struct ocfs2_journal *journal =
110 OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
110 111
111 spin_lock(&trans_inc_lock); 112 spin_lock(&trans_inc_lock);
112 ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans); 113 ret = time_after(journal->j_trans_id, ci->ci_last_trans);
113 spin_unlock(&trans_inc_lock); 114 spin_unlock(&trans_inc_lock);
114 return ret; 115 return ret;
115} 116}
116 117
117/* convenience function to check if an inode is still new (has never 118/* convenience function to check if an object backed by struct
 118 * hit disk). Will do you a favor and set created_trans = 0 when you've 119 * ocfs2_caching_info is still new (has never hit disk). Will do you a
119 * been checkpointed. returns '1' if the inode is still new. */ 120 * favor and set created_trans = 0 when you've
120static inline int ocfs2_inode_is_new(struct inode *inode) 121 * been checkpointed. returns '1' if the ci is still new. */
122static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci)
121{ 123{
122 int ret; 124 int ret;
125 struct ocfs2_journal *journal =
126 OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
123 127
128 spin_lock(&trans_inc_lock);
129 ret = !(time_after(journal->j_trans_id, ci->ci_created_trans));
130 if (!ret)
131 ci->ci_created_trans = 0;
132 spin_unlock(&trans_inc_lock);
133 return ret;
134}
135
136/* Wrapper for inodes so we can check system files */
137static inline int ocfs2_inode_is_new(struct inode *inode)
138{
124 /* System files are never "new" as they're written out by 139 /* System files are never "new" as they're written out by
125 * mkfs. This helps us early during mount, before we have the 140 * mkfs. This helps us early during mount, before we have the
126 * journal open and j_trans_id could be junk. */ 141 * journal open and j_trans_id could be junk. */
127 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) 142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
128 return 0; 143 return 0;
129 spin_lock(&trans_inc_lock); 144
130 ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id, 145 return ocfs2_ci_is_new(INODE_CACHE(inode));
131 OCFS2_I(inode)->ip_created_trans));
132 if (!ret)
133 OCFS2_I(inode)->ip_created_trans = 0;
134 spin_unlock(&trans_inc_lock);
135 return ret;
136} 146}
137 147
138static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, 148static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
139 struct inode *inode) 149 struct ocfs2_caching_info *ci)
140{ 150{
141 spin_lock(&trans_inc_lock); 151 spin_lock(&trans_inc_lock);
142 OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id; 152 ci->ci_created_trans = osb->journal->j_trans_id;
143 spin_unlock(&trans_inc_lock); 153 spin_unlock(&trans_inc_lock);
144} 154}
145 155
@@ -200,7 +210,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
200 if (ocfs2_mount_local(osb)) 210 if (ocfs2_mount_local(osb))
201 return; 211 return;
202 212
203 if (!ocfs2_inode_fully_checkpointed(inode)) { 213 if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) {
204 /* WARNING: This only kicks off a single 214 /* WARNING: This only kicks off a single
205 * checkpoint. If someone races you and adds more 215 * checkpoint. If someone races you and adds more
206 * metadata to the journal, you won't know, and will 216 * metadata to the journal, you won't know, and will
@@ -210,7 +220,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
210 ocfs2_start_checkpoint(osb); 220 ocfs2_start_checkpoint(osb);
211 221
212 wait_event(osb->journal->j_checkpointed, 222 wait_event(osb->journal->j_checkpointed,
213 ocfs2_inode_fully_checkpointed(inode)); 223 ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)));
214 } 224 }
215} 225}
216 226
@@ -266,31 +276,34 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
266 276
267 277
268/* ocfs2_inode */ 278/* ocfs2_inode */
269int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, 279int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
270 struct buffer_head *bh, int type); 280 struct buffer_head *bh, int type);
271/* ocfs2_extent_block */ 281/* ocfs2_extent_block */
272int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, 282int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
283 struct buffer_head *bh, int type);
284/* ocfs2_refcount_block */
285int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
273 struct buffer_head *bh, int type); 286 struct buffer_head *bh, int type);
274/* ocfs2_group_desc */ 287/* ocfs2_group_desc */
275int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, 288int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
276 struct buffer_head *bh, int type); 289 struct buffer_head *bh, int type);
277/* ocfs2_xattr_block */ 290/* ocfs2_xattr_block */
278int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, 291int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
279 struct buffer_head *bh, int type); 292 struct buffer_head *bh, int type);
280/* quota blocks */ 293/* quota blocks */
281int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, 294int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
282 struct buffer_head *bh, int type); 295 struct buffer_head *bh, int type);
283/* dirblock */ 296/* dirblock */
284int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 297int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
285 struct buffer_head *bh, int type); 298 struct buffer_head *bh, int type);
286/* ocfs2_dx_root_block */ 299/* ocfs2_dx_root_block */
287int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, 300int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
288 struct buffer_head *bh, int type); 301 struct buffer_head *bh, int type);
289/* ocfs2_dx_leaf */ 302/* ocfs2_dx_leaf */
290int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, 303int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
291 struct buffer_head *bh, int type); 304 struct buffer_head *bh, int type);
292/* Anything that has no ecc */ 305/* Anything that has no ecc */
293int ocfs2_journal_access(handle_t *handle, struct inode *inode, 306int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
294 struct buffer_head *bh, int type); 307 struct buffer_head *bh, int type);
295 308
296/* 309/*
@@ -477,6 +490,23 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
477 return credits; 490 return credits;
478} 491}
479 492
493/* inode update, new refcount block and its allocation credits. */
494#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
495 + OCFS2_SUBALLOC_ALLOC)
496
497/* inode and the refcount block update. */
498#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
499
500/*
501 * inode and the refcount block update.
502 * It doesn't include the credits for sub alloc change.
503 * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
504 */
505#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
506
507/* 2 metadata alloc, 2 new blocks and root refcount block */
508#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
509
480/* 510/*
481 * Please note that the caller must make sure that root_el is the root 511 * Please note that the caller must make sure that root_el is the root
482 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise 512 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
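The new credit macros budget how many buffers a refcount operation may dirty in a single transaction; tree creation, for instance, covers the inode update, one new refcount block, and the suballocator blocks touched while allocating it. A sketch of how a caller sizes its handle (ocfs2_start_trans/ocfs2_commit_trans are the existing helpers):

handle_t *handle;

handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
if (IS_ERR(handle))
	return PTR_ERR(handle);

/* ... create the root refcount block; at most the budgeted number of
 * buffers may be dirtied under this handle ... */

ocfs2_commit_trans(osb, handle);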
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index bac7e6abaf47..ac10f83edb95 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -297,8 +297,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
297 } 297 }
298 memcpy(alloc_copy, alloc, bh->b_size); 298 memcpy(alloc_copy, alloc, bh->b_size);
299 299
300 status = ocfs2_journal_access_di(handle, local_alloc_inode, bh, 300 status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
301 OCFS2_JOURNAL_ACCESS_WRITE); 301 bh, OCFS2_JOURNAL_ACCESS_WRITE);
302 if (status < 0) { 302 if (status < 0) {
303 mlog_errno(status); 303 mlog_errno(status);
304 goto out_commit; 304 goto out_commit;
@@ -392,7 +392,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
392 ocfs2_clear_local_alloc(alloc); 392 ocfs2_clear_local_alloc(alloc);
393 393
394 ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); 394 ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
395 status = ocfs2_write_block(osb, alloc_bh, inode); 395 status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode));
396 if (status < 0) 396 if (status < 0)
397 mlog_errno(status); 397 mlog_errno(status);
398 398
@@ -678,7 +678,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
678 * delete bits from it! */ 678 * delete bits from it! */
679 *num_bits = bits_wanted; 679 *num_bits = bits_wanted;
680 680
681 status = ocfs2_journal_access_di(handle, local_alloc_inode, 681 status = ocfs2_journal_access_di(handle,
682 INODE_CACHE(local_alloc_inode),
682 osb->local_alloc_bh, 683 osb->local_alloc_bh,
683 OCFS2_JOURNAL_ACCESS_WRITE); 684 OCFS2_JOURNAL_ACCESS_WRITE);
684 if (status < 0) { 685 if (status < 0) {
@@ -1156,7 +1157,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1156 } 1157 }
1157 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); 1158 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
1158 1159
1159 status = ocfs2_journal_access_di(handle, local_alloc_inode, 1160 status = ocfs2_journal_access_di(handle,
1161 INODE_CACHE(local_alloc_inode),
1160 osb->local_alloc_bh, 1162 osb->local_alloc_bh,
1161 OCFS2_JOURNAL_ACCESS_WRITE); 1163 OCFS2_JOURNAL_ACCESS_WRITE);
1162 if (status < 0) { 1164 if (status < 0) {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index b606496b72ec..39737613424a 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -202,7 +202,7 @@ out:
202 return ret; 202 return ret;
203} 203}
204 204
205static struct vm_operations_struct ocfs2_file_vm_ops = { 205static const struct vm_operations_struct ocfs2_file_vm_ops = {
206 .fault = ocfs2_fault, 206 .fault = ocfs2_fault,
207 .page_mkwrite = ocfs2_page_mkwrite, 207 .page_mkwrite = ocfs2_page_mkwrite,
208}; 208};
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8601f934010b..f010b22b1c44 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -69,7 +69,6 @@
69static int ocfs2_mknod_locked(struct ocfs2_super *osb, 69static int ocfs2_mknod_locked(struct ocfs2_super *osb,
70 struct inode *dir, 70 struct inode *dir,
71 struct inode *inode, 71 struct inode *inode,
72 struct dentry *dentry,
73 dev_t dev, 72 dev_t dev,
74 struct buffer_head **new_fe_bh, 73 struct buffer_head **new_fe_bh,
75 struct buffer_head *parent_fe_bh, 74 struct buffer_head *parent_fe_bh,
@@ -78,7 +77,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
78 77
79static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 78static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
80 struct inode **ret_orphan_dir, 79 struct inode **ret_orphan_dir,
81 struct inode *inode, 80 u64 blkno,
82 char *name, 81 char *name,
83 struct ocfs2_dir_lookup_result *lookup); 82 struct ocfs2_dir_lookup_result *lookup);
84 83
@@ -358,8 +357,12 @@ static int ocfs2_mknod(struct inode *dir,
358 } 357 }
359 did_quota_inode = 1; 358 did_quota_inode = 1;
360 359
360 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
361 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
362 dentry->d_name.name);
363
361 /* do the real work now. */ 364 /* do the real work now. */
362 status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev, 365 status = ocfs2_mknod_locked(osb, dir, inode, dev,
363 &new_fe_bh, parent_fe_bh, handle, 366 &new_fe_bh, parent_fe_bh, handle,
364 inode_ac); 367 inode_ac);
365 if (status < 0) { 368 if (status < 0) {
@@ -375,7 +378,8 @@ static int ocfs2_mknod(struct inode *dir,
375 goto leave; 378 goto leave;
376 } 379 }
377 380
378 status = ocfs2_journal_access_di(handle, dir, parent_fe_bh, 381 status = ocfs2_journal_access_di(handle, INODE_CACHE(dir),
382 parent_fe_bh,
379 OCFS2_JOURNAL_ACCESS_WRITE); 383 OCFS2_JOURNAL_ACCESS_WRITE);
380 if (status < 0) { 384 if (status < 0) {
381 mlog_errno(status); 385 mlog_errno(status);
@@ -465,7 +469,6 @@ leave:
465static int ocfs2_mknod_locked(struct ocfs2_super *osb, 469static int ocfs2_mknod_locked(struct ocfs2_super *osb,
466 struct inode *dir, 470 struct inode *dir,
467 struct inode *inode, 471 struct inode *inode,
468 struct dentry *dentry,
469 dev_t dev, 472 dev_t dev,
470 struct buffer_head **new_fe_bh, 473 struct buffer_head **new_fe_bh,
471 struct buffer_head *parent_fe_bh, 474 struct buffer_head *parent_fe_bh,
@@ -479,10 +482,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
479 u16 suballoc_bit; 482 u16 suballoc_bit;
480 u16 feat; 483 u16 feat;
481 484
482 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
483 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
484 dentry->d_name.name);
485
486 *new_fe_bh = NULL; 485 *new_fe_bh = NULL;
487 486
488 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 487 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
@@ -507,9 +506,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
507 mlog_errno(status); 506 mlog_errno(status);
508 goto leave; 507 goto leave;
509 } 508 }
510 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); 509 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh);
511 510
512 status = ocfs2_journal_access_di(handle, inode, *new_fe_bh, 511 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
512 *new_fe_bh,
513 OCFS2_JOURNAL_ACCESS_CREATE); 513 OCFS2_JOURNAL_ACCESS_CREATE);
514 if (status < 0) { 514 if (status < 0) {
515 mlog_errno(status); 515 mlog_errno(status);
@@ -565,7 +565,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
565 } 565 }
566 566
567 ocfs2_populate_inode(inode, fe, 1); 567 ocfs2_populate_inode(inode, fe, 1);
568 ocfs2_inode_set_new(osb, inode); 568 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
569 if (!ocfs2_mount_local(osb)) { 569 if (!ocfs2_mount_local(osb)) {
570 status = ocfs2_create_new_inode_locks(inode); 570 status = ocfs2_create_new_inode_locks(inode);
571 if (status < 0) 571 if (status < 0)
@@ -682,7 +682,7 @@ static int ocfs2_link(struct dentry *old_dentry,
682 goto out_unlock_inode; 682 goto out_unlock_inode;
683 } 683 }
684 684
685 err = ocfs2_journal_access_di(handle, inode, fe_bh, 685 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
686 OCFS2_JOURNAL_ACCESS_WRITE); 686 OCFS2_JOURNAL_ACCESS_WRITE);
687 if (err < 0) { 687 if (err < 0) {
688 mlog_errno(err); 688 mlog_errno(err);
@@ -850,7 +850,8 @@ static int ocfs2_unlink(struct inode *dir,
850 } 850 }
851 851
852 if (inode_is_unlinkable(inode)) { 852 if (inode_is_unlinkable(inode)) {
853 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, 853 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
854 OCFS2_I(inode)->ip_blkno,
854 orphan_name, &orphan_insert); 855 orphan_name, &orphan_insert);
855 if (status < 0) { 856 if (status < 0) {
856 mlog_errno(status); 857 mlog_errno(status);
@@ -866,7 +867,7 @@ static int ocfs2_unlink(struct inode *dir,
866 goto leave; 867 goto leave;
867 } 868 }
868 869
869 status = ocfs2_journal_access_di(handle, inode, fe_bh, 870 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
870 OCFS2_JOURNAL_ACCESS_WRITE); 871 OCFS2_JOURNAL_ACCESS_WRITE);
871 if (status < 0) { 872 if (status < 0) {
872 mlog_errno(status); 873 mlog_errno(status);
@@ -1241,9 +1242,8 @@ static int ocfs2_rename(struct inode *old_dir,
1241 1242
1242 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { 1243 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1243 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1244 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1244 new_inode, 1245 OCFS2_I(new_inode)->ip_blkno,
1245 orphan_name, 1246 orphan_name, &orphan_insert);
1246 &orphan_insert);
1247 if (status < 0) { 1247 if (status < 0) {
1248 mlog_errno(status); 1248 mlog_errno(status);
1249 goto bail; 1249 goto bail;
@@ -1284,7 +1284,8 @@ static int ocfs2_rename(struct inode *old_dir,
1284 goto bail; 1284 goto bail;
1285 } 1285 }
1286 } 1286 }
1287 status = ocfs2_journal_access_di(handle, new_inode, newfe_bh, 1287 status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode),
1288 newfe_bh,
1288 OCFS2_JOURNAL_ACCESS_WRITE); 1289 OCFS2_JOURNAL_ACCESS_WRITE);
1289 if (status < 0) { 1290 if (status < 0) {
1290 mlog_errno(status); 1291 mlog_errno(status);
@@ -1331,7 +1332,8 @@ static int ocfs2_rename(struct inode *old_dir,
1331 old_inode->i_ctime = CURRENT_TIME; 1332 old_inode->i_ctime = CURRENT_TIME;
1332 mark_inode_dirty(old_inode); 1333 mark_inode_dirty(old_inode);
1333 1334
1334 status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh, 1335 status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
1336 old_inode_bh,
1335 OCFS2_JOURNAL_ACCESS_WRITE); 1337 OCFS2_JOURNAL_ACCESS_WRITE);
1336 if (status >= 0) { 1338 if (status >= 0) {
1337 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; 1339 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
@@ -1407,9 +1409,10 @@ static int ocfs2_rename(struct inode *old_dir,
1407 (int)old_dir_nlink, old_dir->i_nlink); 1409 (int)old_dir_nlink, old_dir->i_nlink);
1408 } else { 1410 } else {
1409 struct ocfs2_dinode *fe; 1411 struct ocfs2_dinode *fe;
1410 status = ocfs2_journal_access_di(handle, old_dir, 1412 status = ocfs2_journal_access_di(handle,
1411 old_dir_bh, 1413 INODE_CACHE(old_dir),
1412 OCFS2_JOURNAL_ACCESS_WRITE); 1414 old_dir_bh,
1415 OCFS2_JOURNAL_ACCESS_WRITE);
1413 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1416 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1414 ocfs2_set_links_count(fe, old_dir->i_nlink); 1417 ocfs2_set_links_count(fe, old_dir->i_nlink);
1415 status = ocfs2_journal_dirty(handle, old_dir_bh); 1418 status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1527,9 +1530,11 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1527 mlog_errno(status); 1530 mlog_errno(status);
1528 goto bail; 1531 goto bail;
1529 } 1532 }
1530 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); 1533 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode),
1534 bhs[virtual]);
1531 1535
1532 status = ocfs2_journal_access(handle, inode, bhs[virtual], 1536 status = ocfs2_journal_access(handle, INODE_CACHE(inode),
1537 bhs[virtual],
1533 OCFS2_JOURNAL_ACCESS_CREATE); 1538 OCFS2_JOURNAL_ACCESS_CREATE);
1534 if (status < 0) { 1539 if (status < 0) {
1535 mlog_errno(status); 1540 mlog_errno(status);
@@ -1692,7 +1697,11 @@ static int ocfs2_symlink(struct inode *dir,
1692 } 1697 }
1693 did_quota_inode = 1; 1698 did_quota_inode = 1;
1694 1699
1695 status = ocfs2_mknod_locked(osb, dir, inode, dentry, 1700 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
1701 inode->i_mode, dentry->d_name.len,
1702 dentry->d_name.name);
1703
1704 status = ocfs2_mknod_locked(osb, dir, inode,
1696 0, &new_fe_bh, parent_fe_bh, handle, 1705 0, &new_fe_bh, parent_fe_bh, handle,
1697 inode_ac); 1706 inode_ac);
1698 if (status < 0) { 1707 if (status < 0) {
@@ -1842,7 +1851,7 @@ bail:
1842 1851
1843static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 1852static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1844 struct inode **ret_orphan_dir, 1853 struct inode **ret_orphan_dir,
1845 struct inode *inode, 1854 u64 blkno,
1846 char *name, 1855 char *name,
1847 struct ocfs2_dir_lookup_result *lookup) 1856 struct ocfs2_dir_lookup_result *lookup)
1848{ 1857{
@@ -1850,7 +1859,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1850 struct buffer_head *orphan_dir_bh = NULL; 1859 struct buffer_head *orphan_dir_bh = NULL;
1851 int status = 0; 1860 int status = 0;
1852 1861
1853 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 1862 status = ocfs2_blkno_stringify(blkno, name);
1854 if (status < 0) { 1863 if (status < 0) {
1855 mlog_errno(status); 1864 mlog_errno(status);
1856 return status; 1865 return status;
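The orphan entry's name is derived from a block number alone, which is
why ocfs2_prepare_orphan_dir() now takes a bare u64 instead of the
inode. A sketch of what ocfs2_blkno_stringify() is assumed to do (the
zero-padded hex format and the 16-character OCFS2_ORPHAN_NAMELEN are
assumptions, not shown in this patch):

	static int ocfs2_blkno_stringify(u64 blkno, char *name)
	{
		int namelen;

		/* Render the block number as a fixed-width hex name. */
		namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1,
				   "%016llx", (long long)blkno);
		if (namelen != OCFS2_ORPHAN_NAMELEN)
			return -EINVAL;
		return 0;
	}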
@@ -1917,7 +1926,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1917 goto leave; 1926 goto leave;
1918 } 1927 }
1919 1928
1920 status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, 1929 status = ocfs2_journal_access_di(handle,
1930 INODE_CACHE(orphan_dir_inode),
1931 orphan_dir_bh,
1921 OCFS2_JOURNAL_ACCESS_WRITE); 1932 OCFS2_JOURNAL_ACCESS_WRITE);
1922 if (status < 0) { 1933 if (status < 0) {
1923 mlog_errno(status); 1934 mlog_errno(status);
@@ -2002,7 +2013,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2002 goto leave; 2013 goto leave;
2003 } 2014 }
2004 2015
2005 status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh, 2016 status = ocfs2_journal_access_di(handle,
2017 INODE_CACHE(orphan_dir_inode),
2018 orphan_dir_bh,
2006 OCFS2_JOURNAL_ACCESS_WRITE); 2019 OCFS2_JOURNAL_ACCESS_WRITE);
2007 if (status < 0) { 2020 if (status < 0) {
2008 mlog_errno(status); 2021 mlog_errno(status);
@@ -2028,6 +2041,274 @@ leave:
2028 return status; 2041 return status;
2029} 2042}
2030 2043
2044int ocfs2_create_inode_in_orphan(struct inode *dir,
2045 int mode,
2046 struct inode **new_inode)
2047{
2048 int status, did_quota_inode = 0;
2049 struct inode *inode = NULL;
2050 struct inode *orphan_dir = NULL;
2051 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2052 struct ocfs2_dinode *di = NULL;
2053 handle_t *handle = NULL;
2054 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
2055 struct buffer_head *parent_di_bh = NULL;
2056 struct buffer_head *new_di_bh = NULL;
2057 struct ocfs2_alloc_context *inode_ac = NULL;
2058 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2059
2060 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2061 if (status < 0) {
2062 if (status != -ENOENT)
2063 mlog_errno(status);
2064 return status;
2065 }
2066
2067 /*
 2068	 * We fake an orphan name from the root blkno just to reserve enough
 2069	 * space for our insertion; the real name is set further down.
2070 */
2071 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
2072 osb->root_blkno,
2073 orphan_name, &orphan_insert);
2074 if (status < 0) {
2075 mlog_errno(status);
2076 goto leave;
2077 }
2078
2079 /* reserve an inode spot */
2080 status = ocfs2_reserve_new_inode(osb, &inode_ac);
2081 if (status < 0) {
2082 if (status != -ENOSPC)
2083 mlog_errno(status);
2084 goto leave;
2085 }
2086
2087 inode = ocfs2_get_init_inode(dir, mode);
2088 if (!inode) {
2089 status = -ENOMEM;
2090 mlog_errno(status);
2091 goto leave;
2092 }
2093
2094 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
2095 if (IS_ERR(handle)) {
2096 status = PTR_ERR(handle);
2097 handle = NULL;
2098 mlog_errno(status);
2099 goto leave;
2100 }
2101
 2102	/* We don't use the standard VFS wrapper because we don't want
 2103	 * vfs_dq_init() to be called. */
2104 if (sb_any_quota_active(osb->sb) &&
2105 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
2106 status = -EDQUOT;
2107 goto leave;
2108 }
2109 did_quota_inode = 1;
2110
2111 /* do the real work now. */
2112 status = ocfs2_mknod_locked(osb, dir, inode,
2113 0, &new_di_bh, parent_di_bh, handle,
2114 inode_ac);
2115 if (status < 0) {
2116 mlog_errno(status);
2117 goto leave;
2118 }
2119
2120 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
2121 if (status < 0) {
2122 mlog_errno(status);
2123 goto leave;
2124 }
2125
2126 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2127 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
2128 &orphan_insert, orphan_dir);
2129 if (status < 0) {
2130 mlog_errno(status);
2131 goto leave;
2132 }
2133
 2134	/* Take the open lock so other nodes can't remove it from the orphan dir. */
2135 status = ocfs2_open_lock(inode);
2136 if (status < 0)
2137 mlog_errno(status);
2138
2139leave:
2140 if (status < 0 && did_quota_inode)
2141 vfs_dq_free_inode(inode);
2142 if (handle)
2143 ocfs2_commit_trans(osb, handle);
2144
2145 if (orphan_dir) {
2146 /* This was locked for us in ocfs2_prepare_orphan_dir() */
2147 ocfs2_inode_unlock(orphan_dir, 1);
2148 mutex_unlock(&orphan_dir->i_mutex);
2149 iput(orphan_dir);
2150 }
2151
2152 if (status == -ENOSPC)
2153 mlog(0, "Disk is full\n");
2154
2155 if ((status < 0) && inode) {
2156 clear_nlink(inode);
2157 iput(inode);
2158 }
2159
2160 if (inode_ac)
2161 ocfs2_free_alloc_context(inode_ac);
2162
2163 brelse(new_di_bh);
2164
2165 if (!status)
2166 *new_inode = inode;
2167
2168 ocfs2_free_dir_lookup_result(&orphan_insert);
2169
2170 ocfs2_inode_unlock(dir, 1);
2171 brelse(parent_di_bh);
2172 return status;
2173}
2174
2175int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2176 struct inode *inode,
2177 struct dentry *dentry)
2178{
2179 int status = 0;
2180 struct buffer_head *parent_di_bh = NULL;
2181 handle_t *handle = NULL;
2182 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2183 struct ocfs2_dinode *dir_di, *di;
2184 struct inode *orphan_dir_inode = NULL;
2185 struct buffer_head *orphan_dir_bh = NULL;
2186 struct buffer_head *di_bh = NULL;
2187 struct ocfs2_dir_lookup_result lookup = { NULL, };
2188
 2189	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
2190 dentry->d_name.len, dentry->d_name.name);
2191
2192 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2193 if (status < 0) {
2194 if (status != -ENOENT)
2195 mlog_errno(status);
2196 return status;
2197 }
2198
2199 dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
2200 if (!dir_di->i_links_count) {
2201 /* can't make a file in a deleted directory. */
2202 status = -ENOENT;
2203 goto leave;
2204 }
2205
2206 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
2207 dentry->d_name.len);
2208 if (status)
2209 goto leave;
2210
2211 /* get a spot inside the dir. */
2212 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
2213 dentry->d_name.name,
2214 dentry->d_name.len, &lookup);
2215 if (status < 0) {
2216 mlog_errno(status);
2217 goto leave;
2218 }
2219
2220 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2221 ORPHAN_DIR_SYSTEM_INODE,
2222 osb->slot_num);
2223 if (!orphan_dir_inode) {
2224 status = -EEXIST;
2225 mlog_errno(status);
2226 goto leave;
2227 }
2228
2229 mutex_lock(&orphan_dir_inode->i_mutex);
2230
2231 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
2232 if (status < 0) {
2233 mlog_errno(status);
2234 mutex_unlock(&orphan_dir_inode->i_mutex);
2235 iput(orphan_dir_inode);
2236 goto leave;
2237 }
2238
2239 status = ocfs2_read_inode_block(inode, &di_bh);
2240 if (status < 0) {
2241 mlog_errno(status);
2242 goto orphan_unlock;
2243 }
2244
2245 handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
2246 if (IS_ERR(handle)) {
2247 status = PTR_ERR(handle);
2248 handle = NULL;
2249 mlog_errno(status);
2250 goto orphan_unlock;
2251 }
2252
2253 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
2254 di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2255 if (status < 0) {
2256 mlog_errno(status);
2257 goto out_commit;
2258 }
2259
2260 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
2261 orphan_dir_bh);
2262 if (status < 0) {
2263 mlog_errno(status);
2264 goto out_commit;
2265 }
2266
2267 di = (struct ocfs2_dinode *)di_bh->b_data;
2268 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
2269 di->i_orphaned_slot = 0;
2270 ocfs2_journal_dirty(handle, di_bh);
2271
2272 status = ocfs2_add_entry(handle, dentry, inode,
2273 OCFS2_I(inode)->ip_blkno, parent_di_bh,
2274 &lookup);
2275 if (status < 0) {
2276 mlog_errno(status);
2277 goto out_commit;
2278 }
2279
2280 status = ocfs2_dentry_attach_lock(dentry, inode,
2281 OCFS2_I(dir)->ip_blkno);
2282 if (status) {
2283 mlog_errno(status);
2284 goto out_commit;
2285 }
2286
2287 insert_inode_hash(inode);
2288 dentry->d_op = &ocfs2_dentry_ops;
2289 d_instantiate(dentry, inode);
2290 status = 0;
2291out_commit:
2292 ocfs2_commit_trans(osb, handle);
2293orphan_unlock:
2294 ocfs2_inode_unlock(orphan_dir_inode, 1);
2295 mutex_unlock(&orphan_dir_inode->i_mutex);
2296 iput(orphan_dir_inode);
2297leave:
2298
2299 ocfs2_inode_unlock(dir, 1);
2300
2301 brelse(di_bh);
2302 brelse(parent_di_bh);
2303 brelse(orphan_dir_bh);
2304
2305 ocfs2_free_dir_lookup_result(&lookup);
2306
2307 mlog_exit(status);
2308
2309 return status;
2310}
2311
2031const struct inode_operations ocfs2_dir_iops = { 2312const struct inode_operations ocfs2_dir_iops = {
2032 .create = ocfs2_create, 2313 .create = ocfs2_create,
2033 .lookup = ocfs2_lookup, 2314 .lookup = ocfs2_lookup,
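The two functions exported from namei.c above are designed to be called
as a pair by the reflink code: ocfs2_create_inode_in_orphan() allocates
the new inode directly under the orphan directory (named after a block
number, initially faked from the root blkno), and
ocfs2_mv_orphaned_inode_to_new() later gives it its real name in the
target directory and clears OCFS2_ORPHANED_FL. A sketch of the intended
calling pattern, with a hypothetical caller and the data-copying step
elided:

	static int ocfs2_reflink_sketch(struct inode *dir,
					struct dentry *dentry, int mode)
	{
		struct inode *new_inode = NULL;
		int ret;

		/* Create the inode under a stringified-blkno orphan name. */
		ret = ocfs2_create_inode_in_orphan(dir, mode, &new_inode);
		if (ret)
			return ret;

		/* ... copy extents/xattrs into new_inode here ... */

		/* Move it out of the orphan dir and attach the dentry. */
		ret = ocfs2_mv_orphaned_inode_to_new(dir, new_inode, dentry);
		if (ret)
			iput(new_inode);
		return ret;
	}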
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 688aef64c879..e5d059d4f115 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -35,5 +35,11 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
35 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
36 struct inode *inode, 36 struct inode *inode,
37 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh);
38int ocfs2_create_inode_in_orphan(struct inode *dir,
39 int mode,
40 struct inode **new_inode);
41int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
42 struct inode *new_inode,
43 struct dentry *new_dentry);
38 44
39#endif /* OCFS2_NAMEI_H */ 45#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 39e1d5a39505..eae404602424 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -51,20 +51,51 @@
51/* For struct ocfs2_blockcheck_stats */ 51/* For struct ocfs2_blockcheck_stats */
52#include "blockcheck.h" 52#include "blockcheck.h"
53 53
54
55/* Caching of metadata buffers */
56
54/* Most user visible OCFS2 inodes will have very few pieces of 57/* Most user visible OCFS2 inodes will have very few pieces of
55 * metadata, but larger files (including bitmaps, etc) must be taken 58 * metadata, but larger files (including bitmaps, etc) must be taken
56 * into account when designing an access scheme. We allow a small 59 * into account when designing an access scheme. We allow a small
57 * amount of inlined blocks to be stored on an array and grow the 60 * amount of inlined blocks to be stored on an array and grow the
58 * structure into a rb tree when necessary. */ 61 * structure into a rb tree when necessary. */
59#define OCFS2_INODE_MAX_CACHE_ARRAY 2 62#define OCFS2_CACHE_INFO_MAX_ARRAY 2
63
64/* Flags for ocfs2_caching_info */
65
66enum ocfs2_caching_info_flags {
67 /* Indicates that the metadata cache is using the inline array */
68 OCFS2_CACHE_FL_INLINE = 1<<1,
69};
60 70
71struct ocfs2_caching_operations;
61struct ocfs2_caching_info { 72struct ocfs2_caching_info {
73 /*
74 * The parent structure provides the locks, but because the
75 * parent structure can differ, it provides locking operations
76 * to struct ocfs2_caching_info.
77 */
78 const struct ocfs2_caching_operations *ci_ops;
79
80 /* next two are protected by trans_inc_lock */
81 /* which transaction were we created on? Zero if none. */
82 unsigned long ci_created_trans;
83 /* last transaction we were a part of. */
84 unsigned long ci_last_trans;
85
86 /* Cache structures */
87 unsigned int ci_flags;
62 unsigned int ci_num_cached; 88 unsigned int ci_num_cached;
63 union { 89 union {
64 sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; 90 sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
65 struct rb_root ci_tree; 91 struct rb_root ci_tree;
66 } ci_cache; 92 } ci_cache;
67}; 93};
94/*
95 * Need this prototype here instead of in uptodate.h because journal.h
96 * uses it.
97 */
98struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci);
68 99
69/* this limits us to 256 nodes 100/* this limits us to 256 nodes
70 * if we need more, we can do a kmalloc for the map */ 101 * if we need more, we can do a kmalloc for the map */
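struct ocfs2_caching_info keeps its first OCFS2_CACHE_INFO_MAX_ARRAY
block numbers in the inline ci_array and only switches to the ci_tree
rb-tree once that overflows; OCFS2_CACHE_FL_INLINE records which
representation is live. A minimal sketch of the inline fast path only
(the real lookup lives in uptodate.c and its rb-tree side needs a
per-item wrapper struct, so this is illustrative, not the shipped
helper):

	static int ocfs2_cache_array_contains(struct ocfs2_caching_info *ci,
					      sector_t block)
	{
		unsigned int i;

		BUG_ON(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));

		for (i = 0; i < ci->ci_num_cached; i++)
			if (ci->ci_cache.ci_array[i] == block)
				return 1;
		return 0;
	}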
@@ -377,12 +408,17 @@ struct ocfs2_super
377 408
378 /* the group we used to allocate inodes. */ 409 /* the group we used to allocate inodes. */
379 u64 osb_inode_alloc_group; 410 u64 osb_inode_alloc_group;
411
412 /* rb tree root for refcount lock. */
413 struct rb_root osb_rf_lock_tree;
414 struct ocfs2_refcount_tree *osb_ref_tree_lru;
380}; 415};
381 416
382#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 417#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
383 418
384/* Useful typedef for passing around journal access functions */ 419/* Useful typedef for passing around journal access functions */
385typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode, 420typedef int (*ocfs2_journal_access_func)(handle_t *handle,
421 struct ocfs2_caching_info *ci,
386 struct buffer_head *bh, int type); 422 struct buffer_head *bh, int type);
387 423
388static inline int ocfs2_should_order_data(struct inode *inode) 424static inline int ocfs2_should_order_data(struct inode *inode)
@@ -480,6 +516,13 @@ static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
480 ocfs2_set_links_count(di, links); 516 ocfs2_set_links_count(di, links);
481} 517}
482 518
519static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
520{
521 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
522 return 1;
523 return 0;
524}
525
483/* set / clear functions because cluster events can make these happen 526/* set / clear functions because cluster events can make these happen
484 * in parallel so we want the transitions to be atomic. this also 527 * in parallel so we want the transitions to be atomic. this also
485 * means that any future flags osb_flags must be protected by spinlock 528 * means that any future flags osb_flags must be protected by spinlock
@@ -578,6 +621,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
578#define OCFS2_IS_VALID_DX_LEAF(ptr) \ 621#define OCFS2_IS_VALID_DX_LEAF(ptr) \
579 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) 622 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
580 623
624#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \
625 (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
626
581static inline unsigned long ino_from_blkno(struct super_block *sb, 627static inline unsigned long ino_from_blkno(struct super_block *sb,
582 u64 blkno) 628 u64 blkno)
583{ 629{
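The ocfs2_journal_access_func change above is the thread running through
the call-site churn in this merge: the journal and uptodate helpers now
key on a struct ocfs2_caching_info instead of a struct inode, so
non-inode metadata (the refcount trees introduced below) can reuse the
same caching and journaling machinery. Assuming INODE_CACHE() simply
returns the inode's embedded cache (its definition is not part of these
hunks), each conversion has this shape:

	/* Assumed accessor, defined elsewhere in this series: */
	#define INODE_CACHE(i)	(&(OCFS2_I(i)->ip_metadata_cache))

	/* Before: */
	status = ocfs2_journal_access_di(handle, inode, bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	/* After: */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);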
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7ab6e9e5e77c..e9431e4a5e7c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -68,6 +68,7 @@
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" 69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" 70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
71#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1"
71 72
72/* Compatibility flags */ 73/* Compatibility flags */
73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 74#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -98,7 +99,8 @@
98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 99 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
99 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
100 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -160,6 +162,9 @@
160/* Metadata checksum and error correction */ 162/* Metadata checksum and error correction */
161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 163#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
162 164
165/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167
163/* 168/*
164 * backup superblock flag is used to indicate that this volume 169 * backup superblock flag is used to indicate that this volume
165 * has backup superblocks. 170 * has backup superblocks.
@@ -223,6 +228,7 @@
223#define OCFS2_HAS_XATTR_FL (0x0002) 228#define OCFS2_HAS_XATTR_FL (0x0002)
224#define OCFS2_INLINE_XATTR_FL (0x0004) 229#define OCFS2_INLINE_XATTR_FL (0x0004)
225#define OCFS2_INDEXED_DIR_FL (0x0008) 230#define OCFS2_INDEXED_DIR_FL (0x0008)
231#define OCFS2_HAS_REFCOUNT_FL (0x0010)
226 232
227/* Inode attributes, keep in sync with EXT2 */ 233/* Inode attributes, keep in sync with EXT2 */
228#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 234#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
@@ -241,8 +247,11 @@
241/* 247/*
242 * Extent record flags (e_node.leaf.flags) 248 * Extent record flags (e_node.leaf.flags)
243 */ 249 */
244#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but 250#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
245 * unwritten */ 251 * unwritten */
252#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference
253 * counted in an associated
254 * refcount tree */
246 255
247/* 256/*
248 * ioctl commands 257 * ioctl commands
@@ -292,6 +301,15 @@ struct ocfs2_new_group_input {
292#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) 301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
293#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) 302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
294 303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
295/* 313/*
296 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
297 */ 315 */
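OCFS2_IOC_REFLINK above passes its two path names as user pointers
smuggled through __u64 fields, which keeps the struct layout identical
for 32-bit and 64-bit userspace. A hedged userspace sketch (the fd only
needs to refer to a file on the ocfs2 mount, and "preserve attributes"
semantics for the preserve flag are an assumption):

	#include <stdint.h>
	#include <sys/ioctl.h>

	struct reflink_arguments {
		uint64_t old_path;
		uint64_t new_path;
		uint64_t preserve;
	};
	#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)

	static int ocfs2_reflink(int fs_fd, const char *oldpath,
				 const char *newpath, int preserve)
	{
		struct reflink_arguments args;

		args.old_path = (uint64_t)(uintptr_t)oldpath;
		args.new_path = (uint64_t)(uintptr_t)newpath;
		args.preserve = preserve;

		return ioctl(fs_fd, OCFS2_IOC_REFLINK, &args);
	}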
@@ -717,7 +735,8 @@ struct ocfs2_dinode {
717 __le64 i_xattr_loc; 735 __le64 i_xattr_loc;
718/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 736/*80*/ struct ocfs2_block_check i_check; /* Error checking */
719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 737/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5]; 738/*90*/ __le64 i_refcount_loc;
739 __le64 i_reserved2[4];
721/*B8*/ union { 740/*B8*/ union {
722 __le64 i_pad1; /* Generic way to refer to this 741 __le64 i_pad1; /* Generic way to refer to this
723 64bit union */ 742 64bit union */
@@ -901,6 +920,60 @@ struct ocfs2_group_desc
901/*40*/ __u8 bg_bitmap[0]; 920/*40*/ __u8 bg_bitmap[0];
902}; 921};
903 922
923struct ocfs2_refcount_rec {
924/*00*/ __le64 r_cpos; /* Physical offset, in clusters */
925 __le32 r_clusters; /* Clusters covered by this extent */
926 __le32 r_refcount; /* Reference count of this extent */
927/*10*/
928};
929#define OCFS2_32BIT_POS_MASK (0xffffffffULL)
930
931#define OCFS2_REFCOUNT_LEAF_FL (0x00000001)
932#define OCFS2_REFCOUNT_TREE_FL (0x00000002)
933
934struct ocfs2_refcount_list {
935/*00*/ __le16 rl_count; /* Maximum number of entries possible
936 in rl_records */
937 __le16 rl_used; /* Current number of used records */
938 __le32 rl_reserved2;
939 __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */
940/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */
941};
942
943
944struct ocfs2_refcount_block {
945/*00*/ __u8 rf_signature[8]; /* Signature for verification */
946 __le16 rf_suballoc_slot; /* Slot suballocator this block
947 belongs to */
948 __le16 rf_suballoc_bit; /* Bit offset in suballocator
949 block group */
950 __le32 rf_fs_generation; /* Must match superblock */
951/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */
952 __le64 rf_parent; /* Parent block, only valid if
953 OCFS2_REFCOUNT_LEAF_FL is set in
954 rf_flags */
955/*20*/ struct ocfs2_block_check rf_check; /* Error checking */
956 __le64 rf_last_eb_blk; /* Pointer to last extent block */
957/*30*/ __le32 rf_count; /* Number of inodes sharing this
958 refcount tree */
959 __le32 rf_flags; /* See the flags above */
960 __le32 rf_clusters; /* clusters covered by refcount tree. */
 961	__le32 rf_cpos;			/* cluster offset in refcount tree. */
 962/*40*/	__le32 rf_generation;	/* generation number. The same for
 963					 * all blocks in a refcount tree. */
964 __le32 rf_reserved0;
965 __le64 rf_reserved1[7];
966/*80*/ union {
967 struct ocfs2_refcount_list rf_records; /* List of refcount
968 records */
969 struct ocfs2_extent_list rf_list; /* Extent record list,
970 only valid if
971 OCFS2_REFCOUNT_TREE_FL
972 is set in rf_flags */
973 };
974/* Actual on-disk size is one block */
975};
976
904/* 977/*
905 * On disk extended attribute structure for OCFS2. 978 * On disk extended attribute structure for OCFS2.
906 */ 979 */
@@ -1312,6 +1385,32 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
1312 1385
1313 return size / sizeof(struct ocfs2_extent_rec); 1386 return size / sizeof(struct ocfs2_extent_rec);
1314} 1387}
1388
1389static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
1390{
1391 int size;
1392
1393 size = sb->s_blocksize -
1394 offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
1395
1396 return size / sizeof(struct ocfs2_extent_rec);
1397}
1398
1399static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
1400{
1401 int size;
1402
1403 size = sb->s_blocksize -
1404 offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
1405
1406 return size / sizeof(struct ocfs2_refcount_rec);
1407}
1408
1409static inline u32
1410ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
1411{
1412 return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1413}
1315#else 1414#else
1316static inline int ocfs2_fast_symlink_chars(int blocksize) 1415static inline int ocfs2_fast_symlink_chars(int blocksize)
1317{ 1416{
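The record-count helpers above are pure layout arithmetic. As a worked
example, assume a 4096-byte block: rf_records.rl_recs starts at byte
offset 0x90 (the union begins at /*80*/ and the ocfs2_refcount_list
header is 0x10 bytes), and each ocfs2_refcount_rec is 16 bytes
(8 + 4 + 4), so:

	ocfs2_refcount_recs_per_rb(sb) = (4096 - 144) / 16 = 247

i.e. one refcount block holds up to 247 refcount records before the
tree must grow leaf blocks through rf_list.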
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index c212cf5a2bdf..d277aabf5dfb 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -49,6 +49,7 @@ enum ocfs2_lock_type {
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC, 50 OCFS2_LOCK_TYPE_NFS_SYNC,
51 OCFS2_LOCK_TYPE_ORPHAN_SCAN, 51 OCFS2_LOCK_TYPE_ORPHAN_SCAN,
52 OCFS2_LOCK_TYPE_REFCOUNT,
52 OCFS2_NUM_LOCK_TYPES 53 OCFS2_NUM_LOCK_TYPES
53}; 54};
54 55
@@ -89,6 +90,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
89 case OCFS2_LOCK_TYPE_ORPHAN_SCAN: 90 case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
90 c = 'P'; 91 c = 'P';
91 break; 92 break;
93 case OCFS2_LOCK_TYPE_REFCOUNT:
94 c = 'T';
95 break;
92 default: 96 default:
93 c = '\0'; 97 c = '\0';
94 } 98 }
@@ -110,6 +114,7 @@ static char *ocfs2_lock_type_strings[] = {
110 [OCFS2_LOCK_TYPE_QINFO] = "Quota", 114 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
111 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", 115 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
112 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", 116 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
117 [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
113}; 118};
114 119
115static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 120static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 3fb96fcd4c81..e5df9d170b0c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
109int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 109int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
110 struct buffer_head **bh); 110 struct buffer_head **bh);
111 111
112extern struct dquot_operations ocfs2_quota_operations; 112extern const struct dquot_operations ocfs2_quota_operations;
113extern struct quota_format_type ocfs2_quota_format; 113extern struct quota_format_type ocfs2_quota_format;
114 114
115int ocfs2_quota_setup(void); 115int ocfs2_quota_setup(void);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 44f2a5e1d042..b437dc0c4cad 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -154,7 +154,7 @@ static int ocfs2_get_quota_block(struct inode *inode, int block,
154 err = -EIO; 154 err = -EIO;
155 mlog_errno(err); 155 mlog_errno(err);
156 } 156 }
157 return err;; 157 return err;
158} 158}
159 159
160/* Read data from global quotafile - avoid pagecache and such because we cannot 160/* Read data from global quotafile - avoid pagecache and such because we cannot
@@ -253,8 +253,9 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
253 flush_dcache_page(bh->b_page); 253 flush_dcache_page(bh->b_page);
254 set_buffer_uptodate(bh); 254 set_buffer_uptodate(bh);
255 unlock_buffer(bh); 255 unlock_buffer(bh);
256 ocfs2_set_buffer_uptodate(gqinode, bh); 256 ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
257 err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type); 257 err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh,
258 ja_type);
258 if (err < 0) { 259 if (err < 0) {
259 brelse(bh); 260 brelse(bh);
260 goto out; 261 goto out;
@@ -849,7 +850,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
849 kmem_cache_free(ocfs2_dquot_cachep, dquot); 850 kmem_cache_free(ocfs2_dquot_cachep, dquot);
850} 851}
851 852
852struct dquot_operations ocfs2_quota_operations = { 853const struct dquot_operations ocfs2_quota_operations = {
853 .initialize = dquot_initialize, 854 .initialize = dquot_initialize,
854 .drop = dquot_drop, 855 .drop = dquot_drop,
855 .alloc_space = dquot_alloc_space, 856 .alloc_space = dquot_alloc_space,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bdb09cb6e1fe..1a2c50a759fa 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -108,7 +108,7 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
108 mlog_errno(status); 108 mlog_errno(status);
109 return status; 109 return status;
110 } 110 }
111 status = ocfs2_journal_access_dq(handle, inode, bh, 111 status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh,
112 OCFS2_JOURNAL_ACCESS_WRITE); 112 OCFS2_JOURNAL_ACCESS_WRITE);
113 if (status < 0) { 113 if (status < 0) {
114 mlog_errno(status); 114 mlog_errno(status);
@@ -510,7 +510,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
510 goto out_commit; 510 goto out_commit;
511 } 511 }
512 /* Release local quota file entry */ 512 /* Release local quota file entry */
513 status = ocfs2_journal_access_dq(handle, lqinode, 513 status = ocfs2_journal_access_dq(handle,
514 INODE_CACHE(lqinode),
514 qbh, OCFS2_JOURNAL_ACCESS_WRITE); 515 qbh, OCFS2_JOURNAL_ACCESS_WRITE);
515 if (status < 0) { 516 if (status < 0) {
516 mlog_errno(status); 517 mlog_errno(status);
@@ -619,7 +620,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
619 mlog_errno(status); 620 mlog_errno(status);
620 goto out_bh; 621 goto out_bh;
621 } 622 }
622 status = ocfs2_journal_access_dq(handle, lqinode, bh, 623 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
624 bh,
623 OCFS2_JOURNAL_ACCESS_WRITE); 625 OCFS2_JOURNAL_ACCESS_WRITE);
624 if (status < 0) { 626 if (status < 0) {
625 mlog_errno(status); 627 mlog_errno(status);
@@ -993,8 +995,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
993 goto out_trans; 995 goto out_trans;
994 } 996 }
995 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 997 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
996 ocfs2_set_new_buffer_uptodate(lqinode, bh); 998 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
997 status = ocfs2_journal_access_dq(handle, lqinode, bh, 999 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
998 OCFS2_JOURNAL_ACCESS_CREATE); 1000 OCFS2_JOURNAL_ACCESS_CREATE);
999 if (status < 0) { 1001 if (status < 0) {
1000 mlog_errno(status); 1002 mlog_errno(status);
@@ -1027,8 +1029,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1027 mlog_errno(status); 1029 mlog_errno(status);
1028 goto out_trans; 1030 goto out_trans;
1029 } 1031 }
1030 ocfs2_set_new_buffer_uptodate(lqinode, dbh); 1032 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh);
1031 status = ocfs2_journal_access_dq(handle, lqinode, dbh, 1033 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh,
1032 OCFS2_JOURNAL_ACCESS_CREATE); 1034 OCFS2_JOURNAL_ACCESS_CREATE);
1033 if (status < 0) { 1035 if (status < 0) {
1034 mlog_errno(status); 1036 mlog_errno(status);
@@ -1131,7 +1133,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1131 mlog_errno(status); 1133 mlog_errno(status);
1132 goto out; 1134 goto out;
1133 } 1135 }
1134 ocfs2_set_new_buffer_uptodate(lqinode, bh); 1136 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
1135 1137
1136 /* Local quota info, chunk header and the new block we initialize */ 1138 /* Local quota info, chunk header and the new block we initialize */
1137 handle = ocfs2_start_trans(OCFS2_SB(sb), 1139 handle = ocfs2_start_trans(OCFS2_SB(sb),
@@ -1143,7 +1145,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1143 goto out; 1145 goto out;
1144 } 1146 }
1145 /* Zero created block */ 1147 /* Zero created block */
1146 status = ocfs2_journal_access_dq(handle, lqinode, bh, 1148 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
1147 OCFS2_JOURNAL_ACCESS_CREATE); 1149 OCFS2_JOURNAL_ACCESS_CREATE);
1148 if (status < 0) { 1150 if (status < 0) {
1149 mlog_errno(status); 1151 mlog_errno(status);
@@ -1158,7 +1160,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1158 goto out_trans; 1160 goto out_trans;
1159 } 1161 }
1160 /* Update chunk header */ 1162 /* Update chunk header */
1161 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, 1163 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1164 chunk->qc_headerbh,
1162 OCFS2_JOURNAL_ACCESS_WRITE); 1165 OCFS2_JOURNAL_ACCESS_WRITE);
1163 if (status < 0) { 1166 if (status < 0) {
1164 mlog_errno(status); 1167 mlog_errno(status);
@@ -1292,7 +1295,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1292 goto out; 1295 goto out;
1293 } 1296 }
1294 1297
1295 status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type], 1298 status = ocfs2_journal_access_dq(handle,
1299 INODE_CACHE(sb_dqopt(sb)->files[type]),
1296 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); 1300 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
1297 if (status < 0) { 1301 if (status < 0) {
1298 mlog_errno(status); 1302 mlog_errno(status);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
new file mode 100644
index 000000000000..60287fc56bcb
--- /dev/null
+++ b/fs/ocfs2/refcounttree.c
@@ -0,0 +1,4313 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * refcounttree.c
5 *
6 * Copyright (C) 2009 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17
18#include <linux/sort.h>
19#define MLOG_MASK_PREFIX ML_REFCOUNT
20#include <cluster/masklog.h>
21#include "ocfs2.h"
22#include "inode.h"
23#include "alloc.h"
24#include "suballoc.h"
25#include "journal.h"
26#include "uptodate.h"
27#include "super.h"
28#include "buffer_head_io.h"
29#include "blockcheck.h"
30#include "refcounttree.h"
31#include "sysfile.h"
32#include "dlmglue.h"
33#include "extent_map.h"
34#include "aops.h"
35#include "xattr.h"
36#include "namei.h"
37
38#include <linux/bio.h>
39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h>
42#include <linux/writeback.h>
43#include <linux/pagevec.h>
44#include <linux/swap.h>
45#include <linux/security.h>
46#include <linux/fsnotify.h>
47#include <linux/quotaops.h>
48#include <linux/namei.h>
49#include <linux/mount.h>
50
51struct ocfs2_cow_context {
52 struct inode *inode;
53 u32 cow_start;
54 u32 cow_len;
55 struct ocfs2_extent_tree data_et;
56 struct ocfs2_refcount_tree *ref_tree;
57 struct buffer_head *ref_root_bh;
58 struct ocfs2_alloc_context *meta_ac;
59 struct ocfs2_alloc_context *data_ac;
60 struct ocfs2_cached_dealloc_ctxt dealloc;
61 void *cow_object;
62 struct ocfs2_post_refcount *post_refcount;
63 int extra_credits;
64 int (*get_clusters)(struct ocfs2_cow_context *context,
65 u32 v_cluster, u32 *p_cluster,
66 u32 *num_clusters,
67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context,
70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len);
72};
73
74static inline struct ocfs2_refcount_tree *
75cache_info_to_refcount(struct ocfs2_caching_info *ci)
76{
77 return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
78}
79
80static int ocfs2_validate_refcount_block(struct super_block *sb,
81 struct buffer_head *bh)
82{
83 int rc;
84 struct ocfs2_refcount_block *rb =
85 (struct ocfs2_refcount_block *)bh->b_data;
86
87 mlog(0, "Validating refcount block %llu\n",
88 (unsigned long long)bh->b_blocknr);
89
90 BUG_ON(!buffer_uptodate(bh));
91
92 /*
93 * If the ecc fails, we return the error but otherwise
94 * leave the filesystem running. We know any error is
95 * local to this block.
96 */
97 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
98 if (rc) {
99 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
100 (unsigned long long)bh->b_blocknr);
101 return rc;
102 }
103
104
105 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
106 ocfs2_error(sb,
107 "Refcount block #%llu has bad signature %.*s",
108 (unsigned long long)bh->b_blocknr, 7,
109 rb->rf_signature);
110 return -EINVAL;
111 }
112
113 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
114 ocfs2_error(sb,
115 "Refcount block #%llu has an invalid rf_blkno "
116 "of %llu",
117 (unsigned long long)bh->b_blocknr,
118 (unsigned long long)le64_to_cpu(rb->rf_blkno));
119 return -EINVAL;
120 }
121
122 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
123 ocfs2_error(sb,
124 "Refcount block #%llu has an invalid "
125 "rf_fs_generation of #%u",
126 (unsigned long long)bh->b_blocknr,
127 le32_to_cpu(rb->rf_fs_generation));
128 return -EINVAL;
129 }
130
131 return 0;
132}
133
134static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
135 u64 rb_blkno,
136 struct buffer_head **bh)
137{
138 int rc;
139 struct buffer_head *tmp = *bh;
140
141 rc = ocfs2_read_block(ci, rb_blkno, &tmp,
142 ocfs2_validate_refcount_block);
143
144 /* If ocfs2_read_block() got us a new bh, pass it up. */
145 if (!rc && !*bh)
146 *bh = tmp;
147
148 return rc;
149}
150
151static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
152{
153 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
154
155 return rf->rf_blkno;
156}
157
158static struct super_block *
159ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
160{
161 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
162
163 return rf->rf_sb;
164}
165
166static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
167{
168 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
169
170 spin_lock(&rf->rf_lock);
171}
172
173static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
174{
175 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
176
177 spin_unlock(&rf->rf_lock);
178}
179
180static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
181{
182 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
183
184 mutex_lock(&rf->rf_io_mutex);
185}
186
187static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
188{
189 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
190
191 mutex_unlock(&rf->rf_io_mutex);
192}
193
194static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
195 .co_owner = ocfs2_refcount_cache_owner,
196 .co_get_super = ocfs2_refcount_cache_get_super,
197 .co_cache_lock = ocfs2_refcount_cache_lock,
198 .co_cache_unlock = ocfs2_refcount_cache_unlock,
199 .co_io_lock = ocfs2_refcount_cache_io_lock,
200 .co_io_unlock = ocfs2_refcount_cache_io_unlock,
201};
202
203static struct ocfs2_refcount_tree *
204ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
205{
206 struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
207 struct ocfs2_refcount_tree *tree = NULL;
208
209 while (n) {
210 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
211
212 if (blkno < tree->rf_blkno)
213 n = n->rb_left;
214 else if (blkno > tree->rf_blkno)
215 n = n->rb_right;
216 else
217 return tree;
218 }
219
220 return NULL;
221}
222
223/* osb_lock is already locked. */
224static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
225 struct ocfs2_refcount_tree *new)
226{
227 u64 rf_blkno = new->rf_blkno;
228 struct rb_node *parent = NULL;
229 struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
230 struct ocfs2_refcount_tree *tmp;
231
232 while (*p) {
233 parent = *p;
234
235 tmp = rb_entry(parent, struct ocfs2_refcount_tree,
236 rf_node);
237
238 if (rf_blkno < tmp->rf_blkno)
239 p = &(*p)->rb_left;
240 else if (rf_blkno > tmp->rf_blkno)
241 p = &(*p)->rb_right;
242 else {
243 /* This should never happen! */
244 mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
245 (unsigned long long)rf_blkno);
246 BUG();
247 }
248 }
249
250 rb_link_node(&new->rf_node, parent, p);
251 rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
252}
253
254static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
255{
256 ocfs2_metadata_cache_exit(&tree->rf_ci);
257 ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
258 ocfs2_lock_res_free(&tree->rf_lockres);
259 kfree(tree);
260}
261
262static inline void
263ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
264 struct ocfs2_refcount_tree *tree)
265{
266 rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
267 if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
268 osb->osb_ref_tree_lru = NULL;
269}
270
271static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
272 struct ocfs2_refcount_tree *tree)
273{
274 spin_lock(&osb->osb_lock);
275 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
276 spin_unlock(&osb->osb_lock);
277}
278
279void ocfs2_kref_remove_refcount_tree(struct kref *kref)
280{
281 struct ocfs2_refcount_tree *tree =
282 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
283
284 ocfs2_free_refcount_tree(tree);
285}
286
287static inline void
288ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
289{
290 kref_get(&tree->rf_getcnt);
291}
292
293static inline void
294ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
295{
296 kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
297}
298
299static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
300 struct super_block *sb)
301{
302 ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
303 mutex_init(&new->rf_io_mutex);
304 new->rf_sb = sb;
305 spin_lock_init(&new->rf_lock);
306}
307
308static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
309 struct ocfs2_refcount_tree *new,
310 u64 rf_blkno, u32 generation)
311{
312 init_rwsem(&new->rf_sem);
313 ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
314 rf_blkno, generation);
315}
316
317static struct ocfs2_refcount_tree*
318ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
319{
320 struct ocfs2_refcount_tree *new;
321
322 new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
323 if (!new)
324 return NULL;
325
326 new->rf_blkno = rf_blkno;
327 kref_init(&new->rf_getcnt);
328 ocfs2_init_refcount_tree_ci(new, osb->sb);
329
330 return new;
331}
332
333static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
334 struct ocfs2_refcount_tree **ret_tree)
335{
336 int ret = 0;
337 struct ocfs2_refcount_tree *tree, *new = NULL;
338 struct buffer_head *ref_root_bh = NULL;
339 struct ocfs2_refcount_block *ref_rb;
340
341 spin_lock(&osb->osb_lock);
342 if (osb->osb_ref_tree_lru &&
343 osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
344 tree = osb->osb_ref_tree_lru;
345 else
346 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
347 if (tree)
348 goto out;
349
350 spin_unlock(&osb->osb_lock);
351
352 new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
353 if (!new) {
354 ret = -ENOMEM;
355 mlog_errno(ret);
356 return ret;
357 }
358 /*
 359	 * We need the generation to create the refcount tree lock; since it
 360	 * doesn't change once the tree is created, we can safely read it
 361	 * here without protection.
 362	 * We also have to purge the cache after we create the lock, since
 363	 * the refcount block may contain stale data. It can only be trusted
 364	 * when we hold the refcount lock.
365 */
366 ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
367 if (ret) {
368 mlog_errno(ret);
369 ocfs2_metadata_cache_exit(&new->rf_ci);
370 kfree(new);
371 return ret;
372 }
373
374 ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
375 new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
376 ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
377 new->rf_generation);
378 ocfs2_metadata_cache_purge(&new->rf_ci);
379
380 spin_lock(&osb->osb_lock);
381 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
382 if (tree)
383 goto out;
384
385 ocfs2_insert_refcount_tree(osb, new);
386
387 tree = new;
388 new = NULL;
389
390out:
391 *ret_tree = tree;
392
393 osb->osb_ref_tree_lru = tree;
394
395 spin_unlock(&osb->osb_lock);
396
397 if (new)
398 ocfs2_free_refcount_tree(new);
399
400 brelse(ref_root_bh);
401 return ret;
402}
403
404static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
405{
406 int ret;
407 struct buffer_head *di_bh = NULL;
408 struct ocfs2_dinode *di;
409
410 ret = ocfs2_read_inode_block(inode, &di_bh);
411 if (ret) {
412 mlog_errno(ret);
413 goto out;
414 }
415
416 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
417
418 di = (struct ocfs2_dinode *)di_bh->b_data;
419 *ref_blkno = le64_to_cpu(di->i_refcount_loc);
420 brelse(di_bh);
421out:
422 return ret;
423}
424
425static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
426 struct ocfs2_refcount_tree *tree, int rw)
427{
428 int ret;
429
430 ret = ocfs2_refcount_lock(tree, rw);
431 if (ret) {
432 mlog_errno(ret);
433 goto out;
434 }
435
436 if (rw)
437 down_write(&tree->rf_sem);
438 else
439 down_read(&tree->rf_sem);
440
441out:
442 return ret;
443}
444
445/*
 446 * Lock the refcount tree pointed to by ref_blkno and return the tree.
 447 * In most cases, locking the tree also means reading the refcount
 448 * block, so read it here if the caller really needs it.
 449 *
 450 * If the tree has been re-created by another node, free the old one
 451 * and re-create it.
452 */
453int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
454 u64 ref_blkno, int rw,
455 struct ocfs2_refcount_tree **ret_tree,
456 struct buffer_head **ref_bh)
457{
458 int ret, delete_tree = 0;
459 struct ocfs2_refcount_tree *tree = NULL;
460 struct buffer_head *ref_root_bh = NULL;
461 struct ocfs2_refcount_block *rb;
462
463again:
464 ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
465 if (ret) {
466 mlog_errno(ret);
467 return ret;
468 }
469
470 ocfs2_refcount_tree_get(tree);
471
472 ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
473 if (ret) {
474 mlog_errno(ret);
475 ocfs2_refcount_tree_put(tree);
476 goto out;
477 }
478
479 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
480 &ref_root_bh);
481 if (ret) {
482 mlog_errno(ret);
483 ocfs2_unlock_refcount_tree(osb, tree, rw);
484 ocfs2_refcount_tree_put(tree);
485 goto out;
486 }
487
488 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
489 /*
490 * If the refcount block has been freed and re-created, we may need
491 * to recreate the refcount tree also.
492 *
493 * Here we just remove the tree from the rb-tree, and the last
494 * kref holder will unlock and delete this refcount_tree.
495 * Then we goto "again" and ocfs2_get_refcount_tree will create
496 * the new refcount tree for us.
497 */
498 if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
499 if (!tree->rf_removed) {
500 ocfs2_erase_refcount_tree_from_list(osb, tree);
501 tree->rf_removed = 1;
502 delete_tree = 1;
503 }
504
505 ocfs2_unlock_refcount_tree(osb, tree, rw);
506 /*
507 * We get an extra reference when we create the refcount
508 * tree, so another put will destroy it.
509 */
510 if (delete_tree)
511 ocfs2_refcount_tree_put(tree);
512 brelse(ref_root_bh);
513 ref_root_bh = NULL;
514 goto again;
515 }
516
517 *ret_tree = tree;
518 if (ref_bh) {
519 *ref_bh = ref_root_bh;
520 ref_root_bh = NULL;
521 }
522out:
523 brelse(ref_root_bh);
524 return ret;
525}
526
527int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
528 struct ocfs2_refcount_tree **ret_tree,
529 struct buffer_head **ref_bh)
530{
531 int ret;
532 u64 ref_blkno;
533
534 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
535 if (ret) {
536 mlog_errno(ret);
537 return ret;
538 }
539
540 return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
541 rw, ret_tree, ref_bh);
542}
543
544void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
545 struct ocfs2_refcount_tree *tree, int rw)
546{
547 if (rw)
548 up_write(&tree->rf_sem);
549 else
550 up_read(&tree->rf_sem);
551
552 ocfs2_refcount_unlock(tree, rw);
553 ocfs2_refcount_tree_put(tree);
554}
555
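/*
 * A sketch of the expected locking pattern for callers (hypothetical
 * caller, error paths trimmed):
 *
 *	struct ocfs2_refcount_tree *tree;
 *	struct buffer_head *ref_root_bh = NULL;
 *	int ret;
 *
 *	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1,
 *				       &tree, &ref_root_bh);
 *	if (ret)
 *		return ret;
 *	... modify refcount records under the write lock ...
 *	brelse(ref_root_bh);
 *	ocfs2_unlock_refcount_tree(osb, tree, 1);
 *
 * Note that ocfs2_unlock_refcount_tree() drops the kref taken in
 * ocfs2_lock_refcount_tree(), so the tree must not be touched after
 * the unlock.
 */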
556void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
557{
558 struct rb_node *node;
559 struct ocfs2_refcount_tree *tree;
560 struct rb_root *root = &osb->osb_rf_lock_tree;
561
562 while ((node = rb_last(root)) != NULL) {
563 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
564
565 mlog(0, "Purge tree %llu\n",
566 (unsigned long long) tree->rf_blkno);
567
568 rb_erase(&tree->rf_node, root);
569 ocfs2_free_refcount_tree(tree);
570 }
571}
572
573/*
574 * Create a refcount tree for an inode.
575 * We take for granted that the inode is already locked.
 576 * We assume that the inode is already locked.
577static int ocfs2_create_refcount_tree(struct inode *inode,
578 struct buffer_head *di_bh)
579{
580 int ret;
581 handle_t *handle = NULL;
582 struct ocfs2_alloc_context *meta_ac = NULL;
583 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
584 struct ocfs2_inode_info *oi = OCFS2_I(inode);
585 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
586 struct buffer_head *new_bh = NULL;
587 struct ocfs2_refcount_block *rb;
588 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
589 u16 suballoc_bit_start;
590 u32 num_got;
591 u64 first_blkno;
592
593 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
594
595 mlog(0, "create tree for inode %lu\n", inode->i_ino);
596
597 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
598 if (ret) {
599 mlog_errno(ret);
600 goto out;
601 }
602
603 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
604 if (IS_ERR(handle)) {
605 ret = PTR_ERR(handle);
606 mlog_errno(ret);
607 goto out;
608 }
609
610 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
611 OCFS2_JOURNAL_ACCESS_WRITE);
612 if (ret) {
613 mlog_errno(ret);
614 goto out_commit;
615 }
616
617 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
618 &suballoc_bit_start, &num_got,
619 &first_blkno);
620 if (ret) {
621 mlog_errno(ret);
622 goto out_commit;
623 }
624
625 new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
626 if (!new_tree) {
627 ret = -ENOMEM;
628 mlog_errno(ret);
629 goto out_commit;
630 }
631
632 new_bh = sb_getblk(inode->i_sb, first_blkno);
633 ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
634
635 ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
636 OCFS2_JOURNAL_ACCESS_CREATE);
637 if (ret) {
638 mlog_errno(ret);
639 goto out_commit;
640 }
641
642 /* Initialize ocfs2_refcount_block. */
643 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
644 memset(rb, 0, inode->i_sb->s_blocksize);
645 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
646 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
647 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
648 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
649 rb->rf_blkno = cpu_to_le64(first_blkno);
650 rb->rf_count = cpu_to_le32(1);
651 rb->rf_records.rl_count =
652 cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
653 spin_lock(&osb->osb_lock);
654 rb->rf_generation = osb->s_next_generation++;
655 spin_unlock(&osb->osb_lock);
656
657 ocfs2_journal_dirty(handle, new_bh);
658
659 spin_lock(&oi->ip_lock);
660 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
661 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
662 di->i_refcount_loc = cpu_to_le64(first_blkno);
663 spin_unlock(&oi->ip_lock);
664
665 mlog(0, "created tree for inode %lu, refblock %llu\n",
666 inode->i_ino, (unsigned long long)first_blkno);
667
668 ocfs2_journal_dirty(handle, di_bh);
669
670 /*
 671 * We have to init the tree lock here, since the lock name
 672 * incorporates the generation number.
673 */
674 new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
675 ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
676 new_tree->rf_generation);
677
678 spin_lock(&osb->osb_lock);
679 tree = ocfs2_find_refcount_tree(osb, first_blkno);
680
681 /*
682 * We've just created a new refcount tree in this block. If
683 * we found a refcount tree on the ocfs2_super, it must be
684 * one we just deleted. We free the old tree before
685 * inserting the new tree.
686 */
687 BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
688 if (tree)
689 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
690 ocfs2_insert_refcount_tree(osb, new_tree);
691 spin_unlock(&osb->osb_lock);
692 new_tree = NULL;
693 if (tree)
694 ocfs2_refcount_tree_put(tree);
695
696out_commit:
697 ocfs2_commit_trans(osb, handle);
698
699out:
700 if (new_tree) {
701 ocfs2_metadata_cache_exit(&new_tree->rf_ci);
702 kfree(new_tree);
703 }
704
705 brelse(new_bh);
706 if (meta_ac)
707 ocfs2_free_alloc_context(meta_ac);
708
709 return ret;
710}
711
712static int ocfs2_set_refcount_tree(struct inode *inode,
713 struct buffer_head *di_bh,
714 u64 refcount_loc)
715{
716 int ret;
717 handle_t *handle = NULL;
718 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
719 struct ocfs2_inode_info *oi = OCFS2_I(inode);
720 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
721 struct buffer_head *ref_root_bh = NULL;
722 struct ocfs2_refcount_block *rb;
723 struct ocfs2_refcount_tree *ref_tree;
724
725 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
726
727 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
728 &ref_tree, &ref_root_bh);
729 if (ret) {
730 mlog_errno(ret);
731 return ret;
732 }
733
734 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
735 if (IS_ERR(handle)) {
736 ret = PTR_ERR(handle);
737 mlog_errno(ret);
738 goto out;
739 }
740
741 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
742 OCFS2_JOURNAL_ACCESS_WRITE);
743 if (ret) {
744 mlog_errno(ret);
745 goto out_commit;
746 }
747
748 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
749 OCFS2_JOURNAL_ACCESS_WRITE);
750 if (ret) {
751 mlog_errno(ret);
752 goto out_commit;
753 }
754
755 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
756 le32_add_cpu(&rb->rf_count, 1);
757
758 ocfs2_journal_dirty(handle, ref_root_bh);
759
760 spin_lock(&oi->ip_lock);
761 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
762 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
763 di->i_refcount_loc = cpu_to_le64(refcount_loc);
764 spin_unlock(&oi->ip_lock);
765 ocfs2_journal_dirty(handle, di_bh);
766
767out_commit:
768 ocfs2_commit_trans(osb, handle);
769out:
770 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
771 brelse(ref_root_bh);
772
773 return ret;
774}
775
776int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
777{
778 int ret, delete_tree = 0;
779 handle_t *handle = NULL;
780 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
781 struct ocfs2_inode_info *oi = OCFS2_I(inode);
782 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
783 struct ocfs2_refcount_block *rb;
784 struct inode *alloc_inode = NULL;
785 struct buffer_head *alloc_bh = NULL;
786 struct buffer_head *blk_bh = NULL;
787 struct ocfs2_refcount_tree *ref_tree;
788 int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
789 u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
790 u16 bit = 0;
791
792 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
793 return 0;
794
795 BUG_ON(!ref_blkno);
796 ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
797 if (ret) {
798 mlog_errno(ret);
799 return ret;
800 }
801
802 rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
803
804 /*
805 * If we are the last user, we need to free the block.
806 * So lock the allocator ahead of time.
807 */
808 if (le32_to_cpu(rb->rf_count) == 1) {
809 blk = le64_to_cpu(rb->rf_blkno);
810 bit = le16_to_cpu(rb->rf_suballoc_bit);
811 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
812
813 alloc_inode = ocfs2_get_system_file_inode(osb,
814 EXTENT_ALLOC_SYSTEM_INODE,
815 le16_to_cpu(rb->rf_suballoc_slot));
816 if (!alloc_inode) {
817 ret = -ENOMEM;
818 mlog_errno(ret);
819 goto out;
820 }
821 mutex_lock(&alloc_inode->i_mutex);
822
823 ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
824 if (ret) {
825 mlog_errno(ret);
826 goto out_mutex;
827 }
828
829 credits += OCFS2_SUBALLOC_FREE;
830 }
831
832 handle = ocfs2_start_trans(osb, credits);
833 if (IS_ERR(handle)) {
834 ret = PTR_ERR(handle);
835 mlog_errno(ret);
836 goto out_unlock;
837 }
838
839 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
840 OCFS2_JOURNAL_ACCESS_WRITE);
841 if (ret) {
842 mlog_errno(ret);
843 goto out_commit;
844 }
845
846 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
847 OCFS2_JOURNAL_ACCESS_WRITE);
848 if (ret) {
849 mlog_errno(ret);
850 goto out_commit;
851 }
852
853 spin_lock(&oi->ip_lock);
854 oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
855 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
856 di->i_refcount_loc = 0;
857 spin_unlock(&oi->ip_lock);
858 ocfs2_journal_dirty(handle, di_bh);
859
860 le32_add_cpu(&rb->rf_count, -1);
861 ocfs2_journal_dirty(handle, blk_bh);
862
863 if (!rb->rf_count) {
864 delete_tree = 1;
865 ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
866 ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
867 alloc_bh, bit, bg_blkno, 1);
868 if (ret)
869 mlog_errno(ret);
870 }
871
872out_commit:
873 ocfs2_commit_trans(osb, handle);
874out_unlock:
875 if (alloc_inode) {
876 ocfs2_inode_unlock(alloc_inode, 1);
877 brelse(alloc_bh);
878 }
879out_mutex:
880 if (alloc_inode) {
881 mutex_unlock(&alloc_inode->i_mutex);
882 iput(alloc_inode);
883 }
884out:
885 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
886 if (delete_tree)
887 ocfs2_refcount_tree_put(ref_tree);
888 brelse(blk_bh);
889
890 return ret;
891}
892
893static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
894 struct buffer_head *ref_leaf_bh,
895 u64 cpos, unsigned int len,
896 struct ocfs2_refcount_rec *ret_rec,
897 int *index)
898{
899 int i = 0;
900 struct ocfs2_refcount_block *rb =
901 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
902 struct ocfs2_refcount_rec *rec = NULL;
903
904 for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
905 rec = &rb->rf_records.rl_recs[i];
906
907 if (le64_to_cpu(rec->r_cpos) +
908 le32_to_cpu(rec->r_clusters) <= cpos)
909 continue;
910 else if (le64_to_cpu(rec->r_cpos) > cpos)
911 break;
912
913 /* ok, cpos falls in this rec. Just return. */
914 if (ret_rec)
915 *ret_rec = *rec;
916 goto out;
917 }
918
919 if (ret_rec) {
920 /* We hit a hole here, so fake the rec. */
921 ret_rec->r_cpos = cpu_to_le64(cpos);
922 ret_rec->r_refcount = 0;
923 if (i < le16_to_cpu(rb->rf_records.rl_used) &&
924 le64_to_cpu(rec->r_cpos) < cpos + len)
925 ret_rec->r_clusters =
926 cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
927 else
928 ret_rec->r_clusters = cpu_to_le32(len);
929 }
930
931out:
932 *index = i;
933}
934
935/*
936 * Try to remove the refcount tree. The mechanism is:
937 * 1) Check whether i_clusters == 0; if not, exit.
938 * 2) Check whether we have i_xattr_loc in the dinode; if so, exit.
939 * 3) Check whether we have an inline xattr value stored outside; if so, exit.
940 * 4) Remove the tree.
941 */
942int ocfs2_try_remove_refcount_tree(struct inode *inode,
943 struct buffer_head *di_bh)
944{
945 int ret;
946 struct ocfs2_inode_info *oi = OCFS2_I(inode);
947 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
948
949 down_write(&oi->ip_xattr_sem);
950 down_write(&oi->ip_alloc_sem);
951
952 if (oi->ip_clusters)
953 goto out;
954
955 if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
956 goto out;
957
958 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
959 ocfs2_has_inline_xattr_value_outside(inode, di))
960 goto out;
961
962 ret = ocfs2_remove_refcount_tree(inode, di_bh);
963 if (ret)
964 mlog_errno(ret);
965out:
966 up_write(&oi->ip_alloc_sem);
967 up_write(&oi->ip_xattr_sem);
968 return 0;
969}
970
971/*
972 * Given a cpos and len, try to find the refcount record which contains cpos.
973 * 1. If cpos can be found in one refcount record, return the record.
974 * 2. If cpos can't be found, return a fake record which starts at cpos
975 * and ends at the smaller of cpos+len and the start of the next record.
976 * This fake record has r_refcount = 0.
977 */
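/*
 * Illustrative lookup (editorial sketch, not part of the original
 * source): suppose a leaf holds recs (r_cpos, r_clusters, r_refcount)
 * = (0, 10, 2) and (20, 5, 1).  A query with cpos = 12, len = 10 skips
 * the first rec (0 + 10 <= 12) and stops at the second (20 > 12), so a
 * fake rec (12, 8, 0) is returned: it starts at cpos and is clipped to
 * 8 clusters so that it ends exactly where the next rec begins, with
 * *index pointing at the insertion slot.
 */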
978static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
979 struct buffer_head *ref_root_bh,
980 u64 cpos, unsigned int len,
981 struct ocfs2_refcount_rec *ret_rec,
982 int *index,
983 struct buffer_head **ret_bh)
984{
985 int ret = 0, i, found;
986 u32 low_cpos;
987 struct ocfs2_extent_list *el;
988 struct ocfs2_extent_rec *tmp, *rec = NULL;
989 struct ocfs2_extent_block *eb;
990 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
991 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
992 struct ocfs2_refcount_block *rb =
993 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
994
995 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
996 ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
997 ret_rec, index);
998 *ret_bh = ref_root_bh;
999 get_bh(ref_root_bh);
1000 return 0;
1001 }
1002
1003 el = &rb->rf_list;
1004 low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1005
1006 if (el->l_tree_depth) {
1007 ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1008 if (ret) {
1009 mlog_errno(ret);
1010 goto out;
1011 }
1012
1013 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1014 el = &eb->h_list;
1015
1016 if (el->l_tree_depth) {
1017 ocfs2_error(sb,
1018 "refcount tree %llu has non-zero tree "
1019 "depth in leaf btree block %llu\n",
1020 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1021 (unsigned long long)eb_bh->b_blocknr);
1022 ret = -EROFS;
1023 goto out;
1024 }
1025 }
1026
1027 found = 0;
1028 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1029 rec = &el->l_recs[i];
1030
1031 if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1032 found = 1;
1033 break;
1034 }
1035 }
1036
1037 /* Adjust len when there is an ocfs2_extent_rec after it. */
1038 if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
1039 tmp = &el->l_recs[i+1];
1040
1041 if (le32_to_cpu(tmp->e_cpos) < cpos + len)
1042 len = le32_to_cpu(tmp->e_cpos) - cpos;
1043 }
1044
1045 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1046 &ref_leaf_bh);
1047 if (ret) {
1048 mlog_errno(ret);
1049 goto out;
1050 }
1051
1052 ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1053 ret_rec, index);
1054 *ret_bh = ref_leaf_bh;
1055out:
1056 brelse(eb_bh);
1057 return ret;
1058}
1059
1060enum ocfs2_ref_rec_contig {
1061 REF_CONTIG_NONE = 0,
1062 REF_CONTIG_LEFT,
1063 REF_CONTIG_RIGHT,
1064 REF_CONTIG_LEFTRIGHT,
1065};
1066
1067static enum ocfs2_ref_rec_contig
1068 ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1069 int index)
1070{
1071 if ((rb->rf_records.rl_recs[index].r_refcount ==
1072 rb->rf_records.rl_recs[index + 1].r_refcount) &&
1073 (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1074 le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1075 le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1076 return REF_CONTIG_RIGHT;
1077
1078 return REF_CONTIG_NONE;
1079}
1080
1081static enum ocfs2_ref_rec_contig
1082 ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1083 int index)
1084{
1085 enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1086
1087 if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1088 ret = ocfs2_refcount_rec_adjacent(rb, index);
1089
1090 if (index > 0) {
1091 enum ocfs2_ref_rec_contig tmp;
1092
1093 tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1094
1095 if (tmp == REF_CONTIG_RIGHT) {
1096 if (ret == REF_CONTIG_RIGHT)
1097 ret = REF_CONTIG_LEFTRIGHT;
1098 else
1099 ret = REF_CONTIG_LEFT;
1100 }
1101 }
1102
1103 return ret;
1104}
1105
1106static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1107 int index)
1108{
1109 BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1110 rb->rf_records.rl_recs[index+1].r_refcount);
1111
1112 le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1113 le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1114
1115 if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1116 memmove(&rb->rf_records.rl_recs[index + 1],
1117 &rb->rf_records.rl_recs[index + 2],
1118 sizeof(struct ocfs2_refcount_rec) *
1119 (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1120
1121 memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1122 0, sizeof(struct ocfs2_refcount_rec));
1123 le16_add_cpu(&rb->rf_records.rl_used, -1);
1124}
1125
1126/*
1127 * Merge the refcount rec if we are contiguous with the adjacent recs.
1128 */
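/*
 * Worked example (editorial sketch, not part of the original source):
 * with recs (0, 4, 2), (4, 4, 2), (8, 4, 2) and index = 1, the rec is
 * adjacent to equal-refcount neighbours on both sides, so
 * ocfs2_refcount_rec_contig() returns REF_CONTIG_LEFTRIGHT.  The merge
 * then rotates left twice, collapsing the list to a single rec
 * (0, 12, 2) with rl_used = 1.
 */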
1129static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1130 int index)
1131{
1132 enum ocfs2_ref_rec_contig contig =
1133 ocfs2_refcount_rec_contig(rb, index);
1134
1135 if (contig == REF_CONTIG_NONE)
1136 return;
1137
1138 if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1139 BUG_ON(index == 0);
1140 index--;
1141 }
1142
1143 ocfs2_rotate_refcount_rec_left(rb, index);
1144
1145 if (contig == REF_CONTIG_LEFTRIGHT)
1146 ocfs2_rotate_refcount_rec_left(rb, index);
1147}
1148
1149/*
1150 * Change the refcount indexed by "index" in ref_leaf_bh.
1151 * If refcount reaches 0, remove it.
1152 */
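/*
 * Worked example (editorial sketch, not part of the original source):
 * with recs (0, 4, 2), (4, 4, 1), (8, 4, 3) and rl_used = 3, a change
 * of -1 at index 1 drops that refcount to 0.  The rec is removed by
 * memmove()ing (8, 4, 3) down one slot and zeroing the tail, leaving
 * (0, 4, 2), (8, 4, 3) with rl_used = 2.
 */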
1153static int ocfs2_change_refcount_rec(handle_t *handle,
1154 struct ocfs2_caching_info *ci,
1155 struct buffer_head *ref_leaf_bh,
1156 int index, int merge, int change)
1157{
1158 int ret;
1159 struct ocfs2_refcount_block *rb =
1160 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1161 struct ocfs2_refcount_list *rl = &rb->rf_records;
1162 struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1163
1164 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1165 OCFS2_JOURNAL_ACCESS_WRITE);
1166 if (ret) {
1167 mlog_errno(ret);
1168 goto out;
1169 }
1170
1171 mlog(0, "change index %d, old count %u, change %d\n", index,
1172 le32_to_cpu(rec->r_refcount), change);
1173 le32_add_cpu(&rec->r_refcount, change);
1174
1175 if (!rec->r_refcount) {
1176 if (index != le16_to_cpu(rl->rl_used) - 1) {
1177 memmove(rec, rec + 1,
1178 (le16_to_cpu(rl->rl_used) - index - 1) *
1179 sizeof(struct ocfs2_refcount_rec));
1180 memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1181 0, sizeof(struct ocfs2_refcount_rec));
1182 }
1183
1184 le16_add_cpu(&rl->rl_used, -1);
1185 } else if (merge)
1186 ocfs2_refcount_rec_merge(rb, index);
1187
1188 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1189 if (ret)
1190 mlog_errno(ret);
1191out:
1192 return ret;
1193}
1194
1195static int ocfs2_expand_inline_ref_root(handle_t *handle,
1196 struct ocfs2_caching_info *ci,
1197 struct buffer_head *ref_root_bh,
1198 struct buffer_head **ref_leaf_bh,
1199 struct ocfs2_alloc_context *meta_ac)
1200{
1201 int ret;
1202 u16 suballoc_bit_start;
1203 u32 num_got;
1204 u64 blkno;
1205 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1206 struct buffer_head *new_bh = NULL;
1207 struct ocfs2_refcount_block *new_rb;
1208 struct ocfs2_refcount_block *root_rb =
1209 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1210
1211 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1212 OCFS2_JOURNAL_ACCESS_WRITE);
1213 if (ret) {
1214 mlog_errno(ret);
1215 goto out;
1216 }
1217
1218 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1219 &suballoc_bit_start, &num_got,
1220 &blkno);
1221 if (ret) {
1222 mlog_errno(ret);
1223 goto out;
1224 }
1225
1226 new_bh = sb_getblk(sb, blkno);
1227 if (new_bh == NULL) {
1228 ret = -EIO;
1229 mlog_errno(ret);
1230 goto out;
1231 }
1232 ocfs2_set_new_buffer_uptodate(ci, new_bh);
1233
1234 ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1235 OCFS2_JOURNAL_ACCESS_CREATE);
1236 if (ret) {
1237 mlog_errno(ret);
1238 goto out;
1239 }
1240
1241 /*
1242 * Initialize ocfs2_refcount_block.
1243 * It should contain the same information as the old root,
1244 * so just memcpy it and change the corresponding fields.
1245 */
1246 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1247
1248 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1249 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
1250 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1251 new_rb->rf_blkno = cpu_to_le64(blkno);
1252 new_rb->rf_cpos = cpu_to_le32(0);
1253 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1254 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1255 ocfs2_journal_dirty(handle, new_bh);
1256
1257 /* Now change the root. */
1258 memset(&root_rb->rf_list, 0, sb->s_blocksize -
1259 offsetof(struct ocfs2_refcount_block, rf_list));
1260 root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1261 root_rb->rf_clusters = cpu_to_le32(1);
1262 root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1263 root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1264 root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1265 root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1266
1267 ocfs2_journal_dirty(handle, ref_root_bh);
1268
1269 mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
1270 le16_to_cpu(new_rb->rf_records.rl_used));
1271
1272 *ref_leaf_bh = new_bh;
1273 new_bh = NULL;
1274out:
1275 brelse(new_bh);
1276 return ret;
1277}
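/*
 * Illustrative before/after layout of the expansion above (editorial
 * sketch, not part of the original source; B0/B1 are arbitrary block
 * numbers):
 *
 *   before:  root @ B0: rf_flags = 0, rf_records inline (rl_used = N)
 *
 *   after:   root @ B0: rf_flags = OCFS2_REFCOUNT_TREE_FL,
 *                       rf_list = { (cpos 0, 1 cluster) -> B1 }
 *            leaf @ B1: rf_flags = OCFS2_REFCOUNT_LEAF_FL,
 *                       rf_parent = B0, rl_used = N (byte copy of root)
 */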
1278
1279static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1280 struct ocfs2_refcount_rec *next)
1281{
1282 if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1283 ocfs2_get_ref_rec_low_cpos(next))
1284 return 1;
1285
1286 return 0;
1287}
1288
1289static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1290{
1291 const struct ocfs2_refcount_rec *l = a, *r = b;
1292 u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1293 u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1294
1295 if (l_cpos > r_cpos)
1296 return 1;
1297 if (l_cpos < r_cpos)
1298 return -1;
1299 return 0;
1300}
1301
1302static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1303{
1304 const struct ocfs2_refcount_rec *l = a, *r = b;
1305 u64 l_cpos = le64_to_cpu(l->r_cpos);
1306 u64 r_cpos = le64_to_cpu(r->r_cpos);
1307
1308 if (l_cpos > r_cpos)
1309 return 1;
1310 if (l_cpos < r_cpos)
1311 return -1;
1312 return 0;
1313}
1314
1315static void swap_refcount_rec(void *a, void *b, int size)
1316{
1317 struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1318
1319 tmp = *(struct ocfs2_refcount_rec *)l;
1320 *(struct ocfs2_refcount_rec *)l =
1321 *(struct ocfs2_refcount_rec *)r;
1322 *(struct ocfs2_refcount_rec *)r = tmp;
1323}
1324
1325/*
1326 * The refcount recs are ordered by their 64-bit cpos,
1327 * but we will use the low 32 bits as the e_cpos in the b-tree,
1328 * so we need to make sure that this position doesn't intersect with others.
1329 *
1330 * Note: the refcount recs are already sorted by their low 32-bit cpos,
1331 * so just try the middle position first; we will exit as soon as we find
1332 * a good position.
1333 */
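/*
 * Worked example (editorial sketch, not part of the original source):
 * take recs with (low cpos, clusters) = (0, 5), (5, 5), (8, 4), (12, 4),
 * so num_used = 4 and middle = 2.  At delta = 0 the pair before the
 * middle, (5, 5) and (8, 4), intersects (5 + 5 > 8), but the pair past
 * the middle, (8, 4) and (12, 4), does not (8 + 4 <= 12), so
 * *split_index = 3 and *split_pos = 12.
 */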
1334static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1335 u32 *split_pos, int *split_index)
1336{
1337 int num_used = le16_to_cpu(rl->rl_used);
1338 int delta, middle = num_used / 2;
1339
1340 for (delta = 0; delta < middle; delta++) {
1341 /* Let's check delta earlier than middle */
1342 if (ocfs2_refcount_rec_no_intersect(
1343 &rl->rl_recs[middle - delta - 1],
1344 &rl->rl_recs[middle - delta])) {
1345 *split_index = middle - delta;
1346 break;
1347 }
1348
1349 /* For even counts, don't walk off the end */
1350 if ((middle + delta + 1) == num_used)
1351 continue;
1352
1353 /* Now try delta past middle */
1354 if (ocfs2_refcount_rec_no_intersect(
1355 &rl->rl_recs[middle + delta],
1356 &rl->rl_recs[middle + delta + 1])) {
1357 *split_index = middle + delta + 1;
1358 break;
1359 }
1360 }
1361
1362 if (delta >= middle)
1363 return -ENOSPC;
1364
1365 *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1366 return 0;
1367}
1368
1369static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1370 struct buffer_head *new_bh,
1371 u32 *split_cpos)
1372{
1373 int split_index = 0, num_moved, ret;
1374 u32 cpos = 0;
1375 struct ocfs2_refcount_block *rb =
1376 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1377 struct ocfs2_refcount_list *rl = &rb->rf_records;
1378 struct ocfs2_refcount_block *new_rb =
1379 (struct ocfs2_refcount_block *)new_bh->b_data;
1380 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1381
1382 mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
1383 (unsigned long long)ref_leaf_bh->b_blocknr,
1384 le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
1385
1386 /*
1387 * XXX: Improvement later.
1388 * If we know all the high 32-bit cpos are the same, there is no need to sort.
1389 *
1390 * In order to make the whole process safe, we do:
1391 * 1. sort the entries by their low 32 bit cpos first so that we can
1392 * find the split cpos easily.
1393 * 2. call ocfs2_insert_extent to insert the new refcount block.
1394 * 3. move the refcount rec to the new block.
1395 * 4. sort the entries by their 64 bit cpos.
1396 * 5. dirty the new_rb and rb.
1397 */
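/*
 * Worked example of steps 1 and 3 (editorial sketch, not part of the
 * original source): continuing the case above, rl_used = 4 and
 * split_index = 3, so num_moved = 1; the single rec at low cpos 12 is
 * copied to the new block (whose rf_cpos becomes 12) and zeroed here,
 * leaving rl_used = 3 in the old block and rl_used = 1 in the new one.
 */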
1398 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1399 sizeof(struct ocfs2_refcount_rec),
1400 cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1401
1402 ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1403 if (ret) {
1404 mlog_errno(ret);
1405 return ret;
1406 }
1407
1408 new_rb->rf_cpos = cpu_to_le32(cpos);
1409
1410 /* move refcount records starting from split_index to the new block. */
1411 num_moved = le16_to_cpu(rl->rl_used) - split_index;
1412 memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1413 num_moved * sizeof(struct ocfs2_refcount_rec));
1414
1415 /* OK, remove the entries we just moved over to the other block. */
1416 memset(&rl->rl_recs[split_index], 0,
1417 num_moved * sizeof(struct ocfs2_refcount_rec));
1418
1419 /* change old and new rl_used accordingly. */
1420 le16_add_cpu(&rl->rl_used, -num_moved);
1421 new_rl->rl_used = cpu_to_le16(num_moved);
1422
1423 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1424 sizeof(struct ocfs2_refcount_rec),
1425 cmp_refcount_rec_by_cpos, swap_refcount_rec);
1426
1427 sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1428 sizeof(struct ocfs2_refcount_rec),
1429 cmp_refcount_rec_by_cpos, swap_refcount_rec);
1430
1431 *split_cpos = cpos;
1432 return 0;
1433}
1434
1435static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1436 struct ocfs2_caching_info *ci,
1437 struct buffer_head *ref_root_bh,
1438 struct buffer_head *ref_leaf_bh,
1439 struct ocfs2_alloc_context *meta_ac)
1440{
1441 int ret;
1442 u16 suballoc_bit_start;
1443 u32 num_got, new_cpos;
1444 u64 blkno;
1445 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1446 struct ocfs2_refcount_block *root_rb =
1447 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1448 struct buffer_head *new_bh = NULL;
1449 struct ocfs2_refcount_block *new_rb;
1450 struct ocfs2_extent_tree ref_et;
1451
1452 BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1453
1454 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1455 OCFS2_JOURNAL_ACCESS_WRITE);
1456 if (ret) {
1457 mlog_errno(ret);
1458 goto out;
1459 }
1460
1461 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1462 OCFS2_JOURNAL_ACCESS_WRITE);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467
1468 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1469 &suballoc_bit_start, &num_got,
1470 &blkno);
1471 if (ret) {
1472 mlog_errno(ret);
1473 goto out;
1474 }
1475
1476 new_bh = sb_getblk(sb, blkno);
1477 if (new_bh == NULL) {
1478 ret = -EIO;
1479 mlog_errno(ret);
1480 goto out;
1481 }
1482 ocfs2_set_new_buffer_uptodate(ci, new_bh);
1483
1484 ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1485 OCFS2_JOURNAL_ACCESS_CREATE);
1486 if (ret) {
1487 mlog_errno(ret);
1488 goto out;
1489 }
1490
1491 /* Initialize ocfs2_refcount_block. */
1492 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1493 memset(new_rb, 0, sb->s_blocksize);
1494 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1495 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
1496 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1497 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1498 new_rb->rf_blkno = cpu_to_le64(blkno);
1499 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1500 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1501 new_rb->rf_records.rl_count =
1502 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1503 new_rb->rf_generation = root_rb->rf_generation;
1504
1505 ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1506 if (ret) {
1507 mlog_errno(ret);
1508 goto out;
1509 }
1510
1511 ocfs2_journal_dirty(handle, ref_leaf_bh);
1512 ocfs2_journal_dirty(handle, new_bh);
1513
1514 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1515
1516 mlog(0, "insert new leaf block %llu at %u\n",
1517 (unsigned long long)new_bh->b_blocknr, new_cpos);
1518
1519 /* Insert the new leaf block with the specific offset cpos. */
1520 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1521 1, 0, meta_ac);
1522 if (ret)
1523 mlog_errno(ret);
1524
1525out:
1526 brelse(new_bh);
1527 return ret;
1528}
1529
1530static int ocfs2_expand_refcount_tree(handle_t *handle,
1531 struct ocfs2_caching_info *ci,
1532 struct buffer_head *ref_root_bh,
1533 struct buffer_head *ref_leaf_bh,
1534 struct ocfs2_alloc_context *meta_ac)
1535{
1536 int ret;
1537 struct buffer_head *expand_bh = NULL;
1538
1539 if (ref_root_bh == ref_leaf_bh) {
1540 /*
1541 * the old root bh hasn't been expanded to a b-tree,
1542 * so expand it first.
1543 */
1544 ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1545 &expand_bh, meta_ac);
1546 if (ret) {
1547 mlog_errno(ret);
1548 goto out;
1549 }
1550 } else {
1551 expand_bh = ref_leaf_bh;
1552 get_bh(expand_bh);
1553 }
1554
1555
1556 /* Now add a new refcount block into the tree. */
1557 ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1558 expand_bh, meta_ac);
1559 if (ret)
1560 mlog_errno(ret);
1561out:
1562 brelse(expand_bh);
1563 return ret;
1564}
1565
1566/*
1567 * Adjust the extent rec in the b-tree representing ref_leaf_bh.
1568 *
1569 * Only called when we have inserted a new refcount rec at index 0
1570 * which means ocfs2_extent_rec.e_cpos may need some change.
1571 */
1572static int ocfs2_adjust_refcount_rec(handle_t *handle,
1573 struct ocfs2_caching_info *ci,
1574 struct buffer_head *ref_root_bh,
1575 struct buffer_head *ref_leaf_bh,
1576 struct ocfs2_refcount_rec *rec)
1577{
1578 int ret = 0, i;
1579 u32 new_cpos, old_cpos;
1580 struct ocfs2_path *path = NULL;
1581 struct ocfs2_extent_tree et;
1582 struct ocfs2_refcount_block *rb =
1583 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1584 struct ocfs2_extent_list *el;
1585
1586 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1587 goto out;
1588
1589 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1590 old_cpos = le32_to_cpu(rb->rf_cpos);
1591 new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1592 if (old_cpos <= new_cpos)
1593 goto out;
1594
1595 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1596
1597 path = ocfs2_new_path_from_et(&et);
1598 if (!path) {
1599 ret = -ENOMEM;
1600 mlog_errno(ret);
1601 goto out;
1602 }
1603
1604 ret = ocfs2_find_path(ci, path, old_cpos);
1605 if (ret) {
1606 mlog_errno(ret);
1607 goto out;
1608 }
1609
1610 /*
1611 * 2 more credits, one for the leaf refcount block, one for
1612 * the extent block that contains the extent rec.
1613 */
1614 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
1615 if (ret < 0) {
1616 mlog_errno(ret);
1617 goto out;
1618 }
1619
1620 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1621 OCFS2_JOURNAL_ACCESS_WRITE);
1622 if (ret < 0) {
1623 mlog_errno(ret);
1624 goto out;
1625 }
1626
1627 ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1628 OCFS2_JOURNAL_ACCESS_WRITE);
1629 if (ret < 0) {
1630 mlog_errno(ret);
1631 goto out;
1632 }
1633
1634 /* change the leaf extent block first. */
1635 el = path_leaf_el(path);
1636
1637 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1638 if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1639 break;
1640
1641 BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1642
1643 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1644
1645 /* change the r_cpos in the leaf block. */
1646 rb->rf_cpos = cpu_to_le32(new_cpos);
1647
1648 ocfs2_journal_dirty(handle, path_leaf_bh(path));
1649 ocfs2_journal_dirty(handle, ref_leaf_bh);
1650
1651out:
1652 ocfs2_free_path(path);
1653 return ret;
1654}
1655
1656static int ocfs2_insert_refcount_rec(handle_t *handle,
1657 struct ocfs2_caching_info *ci,
1658 struct buffer_head *ref_root_bh,
1659 struct buffer_head *ref_leaf_bh,
1660 struct ocfs2_refcount_rec *rec,
1661 int index, int merge,
1662 struct ocfs2_alloc_context *meta_ac)
1663{
1664 int ret;
1665 struct ocfs2_refcount_block *rb =
1666 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1667 struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1668 struct buffer_head *new_bh = NULL;
1669
1670 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1671
1672 if (rf_list->rl_used == rf_list->rl_count) {
1673 u64 cpos = le64_to_cpu(rec->r_cpos);
1674 u32 len = le32_to_cpu(rec->r_clusters);
1675
1676 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1677 ref_leaf_bh, meta_ac);
1678 if (ret) {
1679 mlog_errno(ret);
1680 goto out;
1681 }
1682
1683 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1684 cpos, len, NULL, &index,
1685 &new_bh);
1686 if (ret) {
1687 mlog_errno(ret);
1688 goto out;
1689 }
1690
1691 ref_leaf_bh = new_bh;
1692 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1693 rf_list = &rb->rf_records;
1694 }
1695
1696 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1697 OCFS2_JOURNAL_ACCESS_WRITE);
1698 if (ret) {
1699 mlog_errno(ret);
1700 goto out;
1701 }
1702
1703 if (index < le16_to_cpu(rf_list->rl_used))
1704 memmove(&rf_list->rl_recs[index + 1],
1705 &rf_list->rl_recs[index],
1706 (le16_to_cpu(rf_list->rl_used) - index) *
1707 sizeof(struct ocfs2_refcount_rec));
1708
1709 mlog(0, "insert refcount record start %llu, len %u, count %u "
1710 "to leaf block %llu at index %d\n",
1711 (unsigned long long)le64_to_cpu(rec->r_cpos),
1712 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
1713 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1714
1715 rf_list->rl_recs[index] = *rec;
1716
1717 le16_add_cpu(&rf_list->rl_used, 1);
1718
1719 if (merge)
1720 ocfs2_refcount_rec_merge(rb, index);
1721
1722 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1723 if (ret) {
1724 mlog_errno(ret);
1725 goto out;
1726 }
1727
1728 if (index == 0) {
1729 ret = ocfs2_adjust_refcount_rec(handle, ci,
1730 ref_root_bh,
1731 ref_leaf_bh, rec);
1732 if (ret)
1733 mlog_errno(ret);
1734 }
1735out:
1736 brelse(new_bh);
1737 return ret;
1738}
1739
1740/*
1741 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
1742 * This is much simpler than our b-tree code.
1743 * split_rec is the new refcount rec we want to insert.
1744 * If split_rec->r_refcount > 0, we are changing the refcount (in case we
1745 * increase the refcount or decrease it to a non-zero value).
1746 * If split_rec->r_refcount == 0, we are punching a hole in the current
1747 * refcount rec (in case we decrease a refcount to zero).
1748 */
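/*
 * Worked example (editorial sketch, not part of the original source):
 * orig_rec = (0, 10, 2) and split_rec = (3, 4, 3).  Both a head and a
 * tail survive the split, so recs_need = 2.  The tail (7, 3, 2) is
 * copied to the end of the cleared space, the head is truncated to
 * (0, 3, 2), and split_rec is inserted in between, giving
 * (0, 3, 2), (3, 4, 3), (7, 3, 2).
 */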
1749static int ocfs2_split_refcount_rec(handle_t *handle,
1750 struct ocfs2_caching_info *ci,
1751 struct buffer_head *ref_root_bh,
1752 struct buffer_head *ref_leaf_bh,
1753 struct ocfs2_refcount_rec *split_rec,
1754 int index, int merge,
1755 struct ocfs2_alloc_context *meta_ac,
1756 struct ocfs2_cached_dealloc_ctxt *dealloc)
1757{
1758 int ret, recs_need;
1759 u32 len;
1760 struct ocfs2_refcount_block *rb =
1761 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1762 struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1763 struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
1764 struct ocfs2_refcount_rec *tail_rec = NULL;
1765 struct buffer_head *new_bh = NULL;
1766
1767 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1768
1769 mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
1770 le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
1771 le64_to_cpu(split_rec->r_cpos),
1772 le32_to_cpu(split_rec->r_clusters));
1773
1774 /*
1775 * If we just need to split the header or tail clusters,
1776 * no more recs are needed; a simple split is OK.
1777 * Otherwise we need at least one new rec.
1778 */
1779 if (!split_rec->r_refcount &&
1780 (split_rec->r_cpos == orig_rec->r_cpos ||
1781 le64_to_cpu(split_rec->r_cpos) +
1782 le32_to_cpu(split_rec->r_clusters) ==
1783 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1784 recs_need = 0;
1785 else
1786 recs_need = 1;
1787
1788 /*
1789 * We need one more rec if we split in the middle and the new rec has
1790 * some refcount in it.
1791 */
1792 if (split_rec->r_refcount &&
1793 (split_rec->r_cpos != orig_rec->r_cpos &&
1794 le64_to_cpu(split_rec->r_cpos) +
1795 le32_to_cpu(split_rec->r_clusters) !=
1796 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1797 recs_need++;
1798
1799 /* If the leaf block doesn't have enough records, expand it. */
1800 if (le16_to_cpu(rf_list->rl_used) + recs_need > le16_to_cpu(rf_list->rl_count)) {
1801 struct ocfs2_refcount_rec tmp_rec;
1802 u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1803 len = le32_to_cpu(orig_rec->r_clusters);
1804 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1805 ref_leaf_bh, meta_ac);
1806 if (ret) {
1807 mlog_errno(ret);
1808 goto out;
1809 }
1810
1811 /*
1812 * We have to re-get it, since cpos may now have moved to
1813 * another leaf block.
1814 */
1815 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1816 cpos, len, &tmp_rec, &index,
1817 &new_bh);
1818 if (ret) {
1819 mlog_errno(ret);
1820 goto out;
1821 }
1822
1823 ref_leaf_bh = new_bh;
1824 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1825 rf_list = &rb->rf_records;
1826 orig_rec = &rf_list->rl_recs[index];
1827 }
1828
1829 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1830 OCFS2_JOURNAL_ACCESS_WRITE);
1831 if (ret) {
1832 mlog_errno(ret);
1833 goto out;
1834 }
1835
1836 /*
1837 * We have calculated how many new records we need and stored the
1838 * count in recs_need, so make enough room first by moving the records
1839 * after "index" to the end.
1840 */
1841 if (index != le16_to_cpu(rf_list->rl_used) - 1)
1842 memmove(&rf_list->rl_recs[index + 1 + recs_need],
1843 &rf_list->rl_recs[index + 1],
1844 (le16_to_cpu(rf_list->rl_used) - index - 1) *
1845 sizeof(struct ocfs2_refcount_rec));
1846
1847 len = (le64_to_cpu(orig_rec->r_cpos) +
1848 le32_to_cpu(orig_rec->r_clusters)) -
1849 (le64_to_cpu(split_rec->r_cpos) +
1850 le32_to_cpu(split_rec->r_clusters));
1851
1852 /*
1853 * If we have "len", then we will split the tail off and move it
1854 * to the end of the space we have just cleared.
1855 */
1856 if (len) {
1857 tail_rec = &rf_list->rl_recs[index + recs_need];
1858
1859 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1860 le64_add_cpu(&tail_rec->r_cpos,
1861 le32_to_cpu(tail_rec->r_clusters) - len);
1862 tail_rec->r_clusters = cpu_to_le32(len);
1863 }
1864
1865 /*
1866 * If the split pos isn't the same as the original one, we need to
1867 * split in the head.
1868 *
1869 * Note: it is possible that split_rec.r_refcount == 0,
1870 * recs_need == 0 and len > 0, which means we just cut the head from
1871 * the orig_rec. In that case orig_rec has already been modified
1872 * above, so the r_cpos check alone is not reliable.
1873 */
1874 if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
1875 len = le64_to_cpu(split_rec->r_cpos) -
1876 le64_to_cpu(orig_rec->r_cpos);
1877 orig_rec->r_clusters = cpu_to_le32(len);
1878 index++;
1879 }
1880
1881 le16_add_cpu(&rf_list->rl_used, recs_need);
1882
1883 if (split_rec->r_refcount) {
1884 rf_list->rl_recs[index] = *split_rec;
1885 mlog(0, "insert refcount record start %llu, len %u, count %u "
1886 "to leaf block %llu at index %d\n",
1887 (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1888 le32_to_cpu(split_rec->r_clusters),
1889 le32_to_cpu(split_rec->r_refcount),
1890 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1891
1892 if (merge)
1893 ocfs2_refcount_rec_merge(rb, index);
1894 }
1895
1896 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1897 if (ret)
1898 mlog_errno(ret);
1899
1900out:
1901 brelse(new_bh);
1902 return ret;
1903}
1904
1905static int __ocfs2_increase_refcount(handle_t *handle,
1906 struct ocfs2_caching_info *ci,
1907 struct buffer_head *ref_root_bh,
1908 u64 cpos, u32 len, int merge,
1909 struct ocfs2_alloc_context *meta_ac,
1910 struct ocfs2_cached_dealloc_ctxt *dealloc)
1911{
1912 int ret = 0, index;
1913 struct buffer_head *ref_leaf_bh = NULL;
1914 struct ocfs2_refcount_rec rec;
1915 unsigned int set_len = 0;
1916
1917 mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
1918 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1919 (unsigned long long)cpos, len);
1920
1921 while (len) {
1922 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1923 cpos, len, &rec, &index,
1924 &ref_leaf_bh);
1925 if (ret) {
1926 mlog_errno(ret);
1927 goto out;
1928 }
1929
1930 set_len = le32_to_cpu(rec.r_clusters);
1931
1932 /*
1933 * Here we may encounter 3 situations:
1934 *
1935 * 1. If we find an existing record that starts at cpos and
1936 * fits within len, we just need to increase its r_refcount
1937 * and we are done.
1938 * 2. If we find a hole, just insert a rec with r_refcount = 1.
1939 * 3. If we land in the middle of an existing record, split
1940 * it.
1941 */
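/*
 * One-line examples of the three cases (editorial sketch, not
 * part of the original source):
 * 1. rec (0, 10, 1), request (0, 10): bump to (0, 10, 2).
 * 2. empty range at cpos 0, request (0, 4): insert (0, 4, 1).
 * 3. rec (0, 10, 1), request (3, 4): split into
 * (0, 3, 1), (3, 4, 2), (7, 3, 1).
 */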
1942 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
1943 set_len <= len) {
1944 mlog(0, "increase refcount rec, start %llu, len %u, "
1945 "count %u\n", (unsigned long long)cpos, set_len,
1946 le32_to_cpu(rec.r_refcount));
1947 ret = ocfs2_change_refcount_rec(handle, ci,
1948 ref_leaf_bh, index,
1949 merge, 1);
1950 if (ret) {
1951 mlog_errno(ret);
1952 goto out;
1953 }
1954 } else if (!rec.r_refcount) {
1955 rec.r_refcount = cpu_to_le32(1);
1956
1957 mlog(0, "insert refcount rec, start %llu, len %u\n",
1958 (unsigned long long)le64_to_cpu(rec.r_cpos),
1959 set_len);
1960 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
1961 ref_leaf_bh,
1962 &rec, index,
1963 merge, meta_ac);
1964 if (ret) {
1965 mlog_errno(ret);
1966 goto out;
1967 }
1968 } else {
1969 set_len = min((u64)(cpos + len),
1970 le64_to_cpu(rec.r_cpos) + set_len) - cpos;
1971 rec.r_cpos = cpu_to_le64(cpos);
1972 rec.r_clusters = cpu_to_le32(set_len);
1973 le32_add_cpu(&rec.r_refcount, 1);
1974
1975 mlog(0, "split refcount rec, start %llu, "
1976 "len %u, count %u\n",
1977 (unsigned long long)le64_to_cpu(rec.r_cpos),
1978 set_len, le32_to_cpu(rec.r_refcount));
1979 ret = ocfs2_split_refcount_rec(handle, ci,
1980 ref_root_bh, ref_leaf_bh,
1981 &rec, index, merge,
1982 meta_ac, dealloc);
1983 if (ret) {
1984 mlog_errno(ret);
1985 goto out;
1986 }
1987 }
1988
1989 cpos += set_len;
1990 len -= set_len;
1991 brelse(ref_leaf_bh);
1992 ref_leaf_bh = NULL;
1993 }
1994
1995out:
1996 brelse(ref_leaf_bh);
1997 return ret;
1998}
1999
2000static int ocfs2_remove_refcount_extent(handle_t *handle,
2001 struct ocfs2_caching_info *ci,
2002 struct buffer_head *ref_root_bh,
2003 struct buffer_head *ref_leaf_bh,
2004 struct ocfs2_alloc_context *meta_ac,
2005 struct ocfs2_cached_dealloc_ctxt *dealloc)
2006{
2007 int ret;
2008 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2009 struct ocfs2_refcount_block *rb =
2010 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2011 struct ocfs2_extent_tree et;
2012
2013 BUG_ON(rb->rf_records.rl_used);
2014
2015 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2016 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2017 1, meta_ac, dealloc);
2018 if (ret) {
2019 mlog_errno(ret);
2020 goto out;
2021 }
2022
2023 ocfs2_remove_from_cache(ci, ref_leaf_bh);
2024
2025 /*
2026 * add the freed block to the dealloc so that it will be freed
2027 * when we run dealloc.
2028 */
2029 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2030 le16_to_cpu(rb->rf_suballoc_slot),
2031 le64_to_cpu(rb->rf_blkno),
2032 le16_to_cpu(rb->rf_suballoc_bit));
2033 if (ret) {
2034 mlog_errno(ret);
2035 goto out;
2036 }
2037
2038 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2039 OCFS2_JOURNAL_ACCESS_WRITE);
2040 if (ret) {
2041 mlog_errno(ret);
2042 goto out;
2043 }
2044
2045 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2046
2047 le32_add_cpu(&rb->rf_clusters, -1);
2048
2049 /*
2050 * Check whether we need to restore the root refcount block if
2051 * there is no leaf extent block at all.
2052 */
2053 if (!rb->rf_list.l_next_free_rec) {
2054 BUG_ON(rb->rf_clusters);
2055
2056 mlog(0, "reset refcount tree root %llu to be a record block.\n",
2057 (unsigned long long)ref_root_bh->b_blocknr);
2058
2059 rb->rf_flags = 0;
2060 rb->rf_parent = 0;
2061 rb->rf_cpos = 0;
2062 memset(&rb->rf_records, 0, sb->s_blocksize -
2063 offsetof(struct ocfs2_refcount_block, rf_records));
2064 rb->rf_records.rl_count =
2065 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2066 }
2067
2068 ocfs2_journal_dirty(handle, ref_root_bh);
2069
2070out:
2071 return ret;
2072}
2073
2074int ocfs2_increase_refcount(handle_t *handle,
2075 struct ocfs2_caching_info *ci,
2076 struct buffer_head *ref_root_bh,
2077 u64 cpos, u32 len,
2078 struct ocfs2_alloc_context *meta_ac,
2079 struct ocfs2_cached_dealloc_ctxt *dealloc)
2080{
2081 return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2082 cpos, len, 1,
2083 meta_ac, dealloc);
2084}
2085
2086static int ocfs2_decrease_refcount_rec(handle_t *handle,
2087 struct ocfs2_caching_info *ci,
2088 struct buffer_head *ref_root_bh,
2089 struct buffer_head *ref_leaf_bh,
2090 int index, u64 cpos, unsigned int len,
2091 struct ocfs2_alloc_context *meta_ac,
2092 struct ocfs2_cached_dealloc_ctxt *dealloc)
2093{
2094 int ret;
2095 struct ocfs2_refcount_block *rb =
2096 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2097 struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2098
2099 BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2100 BUG_ON(cpos + len >
2101 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2102
2103 if (cpos == le64_to_cpu(rec->r_cpos) &&
2104 len == le32_to_cpu(rec->r_clusters))
2105 ret = ocfs2_change_refcount_rec(handle, ci,
2106 ref_leaf_bh, index, 1, -1);
2107 else {
2108 struct ocfs2_refcount_rec split = *rec;
2109 split.r_cpos = cpu_to_le64(cpos);
2110 split.r_clusters = cpu_to_le32(len);
2111
2112 le32_add_cpu(&split.r_refcount, -1);
2113
2114 mlog(0, "split refcount rec, start %llu, "
2115 "len %u, count %u, original start %llu, len %u\n",
2116 (unsigned long long)le64_to_cpu(split.r_cpos),
2117 len, le32_to_cpu(split.r_refcount),
2118 (unsigned long long)le64_to_cpu(rec->r_cpos),
2119 le32_to_cpu(rec->r_clusters));
2120 ret = ocfs2_split_refcount_rec(handle, ci,
2121 ref_root_bh, ref_leaf_bh,
2122 &split, index, 1,
2123 meta_ac, dealloc);
2124 }
2125
2126 if (ret) {
2127 mlog_errno(ret);
2128 goto out;
2129 }
2130
2131 /* Remove the leaf refcount block if it contains no refcount record. */
2132 if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2133 ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2134 ref_leaf_bh, meta_ac,
2135 dealloc);
2136 if (ret)
2137 mlog_errno(ret);
2138 }
2139
2140out:
2141 return ret;
2142}
2143
2144static int __ocfs2_decrease_refcount(handle_t *handle,
2145 struct ocfs2_caching_info *ci,
2146 struct buffer_head *ref_root_bh,
2147 u64 cpos, u32 len,
2148 struct ocfs2_alloc_context *meta_ac,
2149 struct ocfs2_cached_dealloc_ctxt *dealloc,
2150 int delete)
2151{
2152 int ret = 0, index = 0;
2153 struct ocfs2_refcount_rec rec;
2154 unsigned int r_count = 0, r_len;
2155 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2156 struct buffer_head *ref_leaf_bh = NULL;
2157
2158 mlog(0, "Tree owner %llu, decrease refcount start %llu, "
2159 "len %u, delete %u\n",
2160 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2161 (unsigned long long)cpos, len, delete);
2162
2163 while (len) {
2164 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2165 cpos, len, &rec, &index,
2166 &ref_leaf_bh);
2167 if (ret) {
2168 mlog_errno(ret);
2169 goto out;
2170 }
2171
2172 r_count = le32_to_cpu(rec.r_refcount);
2173 BUG_ON(r_count == 0);
2174 if (!delete)
2175 BUG_ON(r_count > 1);
2176
2177 r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2178 le32_to_cpu(rec.r_clusters)) - cpos;
2179
2180 ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2181 ref_leaf_bh, index,
2182 cpos, r_len,
2183 meta_ac, dealloc);
2184 if (ret) {
2185 mlog_errno(ret);
2186 goto out;
2187 }
2188
2189 if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2190 ret = ocfs2_cache_cluster_dealloc(dealloc,
2191 ocfs2_clusters_to_blocks(sb, cpos),
2192 r_len);
2193 if (ret) {
2194 mlog_errno(ret);
2195 goto out;
2196 }
2197 }
2198
2199 cpos += r_len;
2200 len -= r_len;
2201 brelse(ref_leaf_bh);
2202 ref_leaf_bh = NULL;
2203 }
2204
2205out:
2206 brelse(ref_leaf_bh);
2207 return ret;
2208}
2209
2210/* Caller must hold refcount tree lock. */
2211int ocfs2_decrease_refcount(struct inode *inode,
2212 handle_t *handle, u32 cpos, u32 len,
2213 struct ocfs2_alloc_context *meta_ac,
2214 struct ocfs2_cached_dealloc_ctxt *dealloc,
2215 int delete)
2216{
2217 int ret;
2218 u64 ref_blkno;
2219 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2220 struct buffer_head *ref_root_bh = NULL;
2221 struct ocfs2_refcount_tree *tree;
2222
2223 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2224
2225 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2226 if (ret) {
2227 mlog_errno(ret);
2228 goto out;
2229 }
2230
2231 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2232 if (ret) {
2233 mlog_errno(ret);
2234 goto out;
2235 }
2236
2237 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2238 &ref_root_bh);
2239 if (ret) {
2240 mlog_errno(ret);
2241 goto out;
2242 }
2243
2244 ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2245 cpos, len, meta_ac, dealloc, delete);
2246 if (ret)
2247 mlog_errno(ret);
2248out:
2249 brelse(ref_root_bh);
2250 return ret;
2251}
2252
2253/*
2254 * Mark the already-existing extent at cpos as refcounted for len clusters.
2255 * This adds the refcount extent flag.
2256 *
2257 * If the existing extent is larger than the request, initiate a
2258 * split. An attempt will be made at merging with adjacent extents.
2259 *
2260 * The caller is responsible for passing down meta_ac if we'll need it.
2261 */
2262static int ocfs2_mark_extent_refcounted(struct inode *inode,
2263 struct ocfs2_extent_tree *et,
2264 handle_t *handle, u32 cpos,
2265 u32 len, u32 phys,
2266 struct ocfs2_alloc_context *meta_ac,
2267 struct ocfs2_cached_dealloc_ctxt *dealloc)
2268{
2269 int ret;
2270
2271 mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
2272 inode->i_ino, cpos, len, phys);
2273
2274 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2275 ocfs2_error(inode->i_sb, "Inode %lu wants to use the refcount "
2276 "tree, but the feature bit is not set in the "
2277 "super block.", inode->i_ino);
2278 ret = -EROFS;
2279 goto out;
2280 }
2281
2282 ret = ocfs2_change_extent_flag(handle, et, cpos,
2283 len, phys, meta_ac, dealloc,
2284 OCFS2_EXT_REFCOUNTED, 0);
2285 if (ret)
2286 mlog_errno(ret);
2287
2288out:
2289 return ret;
2290}
2291
2292/*
2293 * Given some contiguous physical clusters, calculate what we need
2294 * for modifying their refcount.
2295 */
2296static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2297 struct ocfs2_caching_info *ci,
2298 struct buffer_head *ref_root_bh,
2299 u64 start_cpos,
2300 u32 clusters,
2301 int *meta_add,
2302 int *credits)
2303{
2304 int ret = 0, index, ref_blocks = 0, recs_add = 0;
2305 u64 cpos = start_cpos;
2306 struct ocfs2_refcount_block *rb;
2307 struct ocfs2_refcount_rec rec;
2308 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2309 u32 len;
2310
2311 mlog(0, "start_cpos %llu, clusters %u\n",
2312 (unsigned long long)start_cpos, clusters);
2313 while (clusters) {
2314 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2315 cpos, clusters, &rec,
2316 &index, &ref_leaf_bh);
2317 if (ret) {
2318 mlog_errno(ret);
2319 goto out;
2320 }
2321
2322 if (ref_leaf_bh != prev_bh) {
2323 /*
2324 * Now we encounter a new leaf block, so calculate
2325 * whether we need to extend the old leaf.
2326 */
2327 if (prev_bh) {
2328 rb = (struct ocfs2_refcount_block *)
2329 prev_bh->b_data;
2330
2331 if (le16_to_cpu(rb->rf_records.rl_used) +
2332 recs_add >
2333 le16_to_cpu(rb->rf_records.rl_count))
2334 ref_blocks++;
2335 }
2336
2337 recs_add = 0;
2338 *credits += 1;
2339 brelse(prev_bh);
2340 prev_bh = ref_leaf_bh;
2341 get_bh(prev_bh);
2342 }
2343
2344 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2345
2346 mlog(0, "recs_add %d, cpos %llu, clusters %u, rec->r_cpos %llu, "
2347 "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
2348 recs_add, (unsigned long long)cpos, clusters,
2349 (unsigned long long)le64_to_cpu(rec.r_cpos),
2350 le32_to_cpu(rec.r_clusters),
2351 le32_to_cpu(rec.r_refcount), index);
2352
2353 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2354 le32_to_cpu(rec.r_clusters)) - cpos;
2355 /*
2356 * If the refcount rec already exists, cool. We just need
2357 * to check whether there is a split. Otherwise we just need
2358 * to increase the refcount.
2359 * If we will insert one, increase recs_add.
2360 *
2361 * We record all the records which will be inserted to the
2362 * same refcount block, so that we can tell exactly whether
2363 * we need a new refcount block or not.
2364 */
2365 if (rec.r_refcount) {
2366 /* Check whether we need a split at the beginning. */
2367 if (cpos == start_cpos &&
2368 cpos != le64_to_cpu(rec.r_cpos))
2369 recs_add++;
2370
2371 /* Check whether we need a split in the end. */
2372 if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2373 le32_to_cpu(rec.r_clusters))
2374 recs_add++;
2375 } else
2376 recs_add++;
2377
2378 brelse(ref_leaf_bh);
2379 ref_leaf_bh = NULL;
2380 clusters -= len;
2381 cpos += len;
2382 }
2383
2384 if (prev_bh) {
2385 rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2386
2387 if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
2388 le16_to_cpu(rb->rf_records.rl_count))
2389 ref_blocks++;
2390
2391 *credits += 1;
2392 }
2393
2394 if (!ref_blocks)
2395 goto out;
2396
2397 mlog(0, "we need ref_blocks %d\n", ref_blocks);
2398 *meta_add += ref_blocks;
2399 *credits += ref_blocks;
2400
2401 /*
2402 * So we may need to insert ref_blocks new blocks into the tree.
2403 * That also means we need to change the b-tree and add that number
2404 * of records, since we never merge them.
2405 * We need one more block for the expansion, since the newly created
2406 * leaf block may also be full and need a split.
2407 */
2408 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2409 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2410 struct ocfs2_extent_tree et;
2411
2412 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2413 *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2414 *credits += ocfs2_calc_extend_credits(sb,
2415 et.et_root_el,
2416 ref_blocks);
2417 } else {
2418 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2419 *meta_add += 1;
2420 }
2421
2422out:
2423 brelse(ref_leaf_bh);
2424 brelse(prev_bh);
2425 return ret;
2426}
2427
2428/*
2429 * For a refcount tree, we will decrease the refcount of some
2430 * contiguous clusters, so just walk through the records to see how
2431 * many blocks we are going to touch and whether we need new blocks.
2432 *
2433 * Normally the refcount blocks storing these refcounts should be
2434 * contiguous as well, so we can get the number easily.
2435 * As for meta_ac, we will at most split 2 refcount records and add
2436 * 2 more refcount blocks, so just estimate it in a rough way.
2437 *
2438 * Caller must hold refcount tree lock.
2439 */
2440int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2441 struct buffer_head *di_bh,
2442 u64 phys_blkno,
2443 u32 clusters,
2444 int *credits,
2445 struct ocfs2_alloc_context **meta_ac)
2446{
2447 int ret, ref_blocks = 0;
2448 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2449 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2450 struct buffer_head *ref_root_bh = NULL;
2451 struct ocfs2_refcount_tree *tree;
2452 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2453
2454 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2455 ocfs2_error(inode->i_sb, "Inode %lu wants to use the refcount "
2456 "tree, but the feature bit is not set in the "
2457 "super block.", inode->i_ino);
2458 ret = -EROFS;
2459 goto out;
2460 }
2461
2462 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2463
2464 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2465 le64_to_cpu(di->i_refcount_loc), &tree);
2466 if (ret) {
2467 mlog_errno(ret);
2468 goto out;
2469 }
2470
2471 ret = ocfs2_read_refcount_block(&tree->rf_ci,
2472 le64_to_cpu(di->i_refcount_loc),
2473 &ref_root_bh);
2474 if (ret) {
2475 mlog_errno(ret);
2476 goto out;
2477 }
2478
2479 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2480 &tree->rf_ci,
2481 ref_root_bh,
2482 start_cpos, clusters,
2483 &ref_blocks, credits);
2484 if (ret) {
2485 mlog_errno(ret);
2486 goto out;
2487 }
2488
2489 mlog(0, "reserve new metadata %d, credits = %d\n",
2490 ref_blocks, *credits);
2491
2492 if (ref_blocks) {
2493 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2494 ref_blocks, meta_ac);
2495 if (ret)
2496 mlog_errno(ret);
2497 }
2498
2499out:
2500 brelse(ref_root_bh);
2501 return ret;
2502}
2503
2504#define MAX_CONTIG_BYTES 1048576
2505
2506static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
2507{
2508 return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
2509}
2510
2511static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
2512{
2513 return ~(ocfs2_cow_contig_clusters(sb) - 1);
2514}
2515
2516/*
2517 * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
2518 * find an offset (start + (n * contig_clusters)) that is closest to cpos
2519 * while still being less than or equal to it.
2520 *
2521 * The goal is to break the extent at a multiple of contig_clusters.
2522 */
2523static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
2524 unsigned int start,
2525 unsigned int cpos)
2526{
2527 BUG_ON(start > cpos);
2528
2529 return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
2530}
2531
2532/*
2533 * Given a cluster count of len, pad it out so that it is a multiple
2534 * of contig_clusters.
2535 */
2536static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
2537 unsigned int len)
2538{
2539 unsigned int padded =
2540 (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
2541 ocfs2_cow_contig_mask(sb);
2542
2543 /* Did we wrap? */
2544 if (padded < len)
2545 padded = UINT_MAX;
2546
2547 return padded;
2548}
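/*
 * Worked arithmetic (editorial sketch, not part of the original
 * source), assuming 4KB clusters so that MAX_CONTIG_BYTES gives
 * contig_clusters = 256 and contig_mask = ~255:
 *
 *   ocfs2_cow_align_start(sb, 100, 900)
 *       = 100 + ((900 - 100) & ~255) = 100 + 768 = 868
 *   ocfs2_cow_align_length(sb, 300)
 *       = (300 + 255) & ~255 = 512
 */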
2549
2550/*
2551 * Calculate the start and number of virtual clusters we need to CoW.
2552 *
2553 * cpos is the virtual start cluster position where we want to do CoW
2554 * in a file, and write_len is the cluster length.
2555 * max_cpos is the place where we want to stop CoW intentionally.
2556 *
2557 * Normally we will start CoW from the beginning of the extent record containing cpos.
2558 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
2559 * get good I/O from the resulting extent tree.
2560 */
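/*
 * Worked example (editorial sketch, not part of the original source):
 * one refcounted extent covering clusters [0, 1024), a write with
 * cpos = 300, write_len = 10, max_cpos = 1024 and contig_clusters =
 * 256.  The write sits in the middle of the extent, so the final
 * branch below aligns *cow_start to 256 (= 300 & ~255) and pads
 * want_clusters to 256, yielding the CoW region [256, 512), which
 * covers the whole write.
 */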
2561static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2562 struct ocfs2_extent_list *el,
2563 u32 cpos,
2564 u32 write_len,
2565 u32 max_cpos,
2566 u32 *cow_start,
2567 u32 *cow_len)
2568{
2569 int ret = 0;
2570 int tree_height = le16_to_cpu(el->l_tree_depth), i;
2571 struct buffer_head *eb_bh = NULL;
2572 struct ocfs2_extent_block *eb = NULL;
2573 struct ocfs2_extent_rec *rec;
2574 unsigned int want_clusters, rec_end = 0;
2575 int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2576 int leaf_clusters;
2577
2578 BUG_ON(cpos + write_len > max_cpos);
2579
2580 if (tree_height > 0) {
2581 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2582 if (ret) {
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2588 el = &eb->h_list;
2589
2590 if (el->l_tree_depth) {
2591 ocfs2_error(inode->i_sb,
2592 "Inode %lu has non zero tree depth in "
2593 "leaf block %llu\n", inode->i_ino,
2594 (unsigned long long)eb_bh->b_blocknr);
2595 ret = -EROFS;
2596 goto out;
2597 }
2598 }
2599
2600 *cow_len = 0;
2601 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2602 rec = &el->l_recs[i];
2603
2604 if (ocfs2_is_empty_extent(rec)) {
2605 mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2606 "index %d\n", inode->i_ino, i);
2607 continue;
2608 }
2609
2610 if (le32_to_cpu(rec->e_cpos) +
2611 le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2612 continue;
2613
2614 if (*cow_len == 0) {
2615 /*
2616 * We should find a refcounted record in the
2617 * first pass.
2618 */
2619 BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2620 *cow_start = le32_to_cpu(rec->e_cpos);
2621 }
2622
2623 /*
2624 * If we encounter a hole, a non-refcounted record or
2625 * pass the max_cpos, stop the search.
2626 */
2627 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2628 (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2629 (max_cpos <= le32_to_cpu(rec->e_cpos)))
2630 break;
2631
2632 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2633 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2634 if (rec_end > max_cpos) {
2635 rec_end = max_cpos;
2636 leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2637 }
2638
2639 /*
2640 * How many clusters do we actually need from
2641 * this extent? First we see how many we actually
2642 * need to complete the write. If that's smaller
2643 * than contig_clusters, we try for contig_clusters.
2644 */
2645 if (!*cow_len)
2646 want_clusters = write_len;
2647 else
2648 want_clusters = (cpos + write_len) -
2649 (*cow_start + *cow_len);
2650 if (want_clusters < contig_clusters)
2651 want_clusters = contig_clusters;
2652
2653 /*
2654 * If the write does not cover the whole extent, we
2655 * need to calculate how we're going to split the extent.
2656 * We try to do it on contig_clusters boundaries.
2657 *
2658 * Any extent smaller than contig_clusters will be
2659 * CoWed in its entirety.
2660 */
2661 if (leaf_clusters <= contig_clusters)
2662 *cow_len += leaf_clusters;
2663 else if (*cow_len || (*cow_start == cpos)) {
2664 /*
2665 * This extent needs to be CoW'd from its
2666 * beginning, so all we have to do is compute
2667 * how many clusters to grab. We align
2668 * want_clusters to the edge of contig_clusters
2669 * to get better I/O.
2670 */
2671 want_clusters = ocfs2_cow_align_length(inode->i_sb,
2672 want_clusters);
2673
2674 if (leaf_clusters < want_clusters)
2675 *cow_len += leaf_clusters;
2676 else
2677 *cow_len += want_clusters;
2678 } else if ((*cow_start + contig_clusters) >=
2679 (cpos + write_len)) {
2680 /*
2681 * Breaking off contig_clusters at the front
2682 * of the extent will cover our write. That's
2683 * easy.
2684 */
2685 *cow_len = contig_clusters;
2686 } else if ((rec_end - cpos) <= contig_clusters) {
2687 /*
2688 * Breaking off contig_clusters at the tail of
2689 * this extent will cover cpos.
2690 */
2691 *cow_start = rec_end - contig_clusters;
2692 *cow_len = contig_clusters;
2693 } else if ((rec_end - cpos) <= want_clusters) {
2694 /*
2695 * While we can't fit the entire write in this
2696 * extent, we know that the write goes from cpos
2697 * to the end of the extent. Break that off.
2698 * We try to break it at some multiple of
2699 * contig_clusters from the front of the extent.
2700 * Failing that (i.e., cpos is within
2701 * contig_clusters of the front), we'll CoW the
2702 * entire extent.
2703 */
2704 *cow_start = ocfs2_cow_align_start(inode->i_sb,
2705 *cow_start, cpos);
2706 *cow_len = rec_end - *cow_start;
2707 } else {
2708 /*
2709 * Ok, the entire write lives in the middle of
2710 * this extent. Let's try to slice the extent up
2711 * nicely. Optimally, our CoW region starts at
2712 * m*contig_clusters from the beginning of the
2713 * extent and goes for n*contig_clusters,
2714 * covering the entire write.
2715 */
2716 *cow_start = ocfs2_cow_align_start(inode->i_sb,
2717 *cow_start, cpos);
2718
2719 want_clusters = (cpos + write_len) - *cow_start;
2720 want_clusters = ocfs2_cow_align_length(inode->i_sb,
2721 want_clusters);
2722 if (*cow_start + want_clusters <= rec_end)
2723 *cow_len = want_clusters;
2724 else
2725 *cow_len = rec_end - *cow_start;
2726 }
2727
2728 /* Have we covered our entire write yet? */
2729 if ((*cow_start + *cow_len) >= (cpos + write_len))
2730 break;
2731
2732 /*
2733 * If we reach the end of the extent block and don't get enough
2734 * clusters, continue with the next extent block if possible.
2735 */
2736 if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2737 eb && eb->h_next_leaf_blk) {
2738 brelse(eb_bh);
2739 eb_bh = NULL;
2740
2741 ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2742 le64_to_cpu(eb->h_next_leaf_blk),
2743 &eb_bh);
2744 if (ret) {
2745 mlog_errno(ret);
2746 goto out;
2747 }
2748
2749 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2750 el = &eb->h_list;
2751 i = -1;
2752 }
2753 }
2754
2755out:
2756 brelse(eb_bh);
2757 return ret;
2758}
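/*
 * A minimal userspace sketch of the alignment math used above, assuming
 * ocfs2_cow_align_start() rounds cpos down to a contig_clusters boundary
 * (never before the start of the extent) and ocfs2_cow_align_length()
 * rounds the length up to a contig_clusters multiple.  The helper names
 * and the CONTIG_CLUSTERS value below are illustrative stand-ins, not
 * the kernel implementations.
 */
#include <stdio.h>
#include <stdint.h>

#define CONTIG_CLUSTERS 32	/* example value; on disk it derives from
				 * MAX_CONTIG_BYTES / clustersize */

/* Round cpos down to a CONTIG_CLUSTERS multiple, clamped to the start
 * of the candidate extent. */
static uint32_t cow_align_start(uint32_t ext_start, uint32_t cpos)
{
	uint32_t aligned = (cpos / CONTIG_CLUSTERS) * CONTIG_CLUSTERS;

	return aligned > ext_start ? aligned : ext_start;
}

/* Round len up to the next CONTIG_CLUSTERS multiple. */
static uint32_t cow_align_length(uint32_t len)
{
	return ((len + CONTIG_CLUSTERS - 1) / CONTIG_CLUSTERS) *
	       CONTIG_CLUSTERS;
}

int main(void)
{
	/* A 4-cluster write at cpos 70 inside an extent starting at 64. */
	uint32_t ext_start = 64, cpos = 70, write_len = 4;
	uint32_t cow_start = cow_align_start(ext_start, cpos);
	uint32_t cow_len = cow_align_length(cpos + write_len - cow_start);

	printf("CoW range [%u, %u)\n", cow_start, cow_start + cow_len);
	return 0;
}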
2759
2760/*
2761 * Prepare meta_ac and data_ac, and calculate the credits needed, when we
2762 * want to add num_clusters clusters in the data tree "et" and change the
2763 * refcount for the old clusters (starting from p_cluster) in the refcount tree.
2764 *
2765 * Note:
2766 * 1. Since we may split the old tree, we will need at most num_clusters + 2
2767 * new leaf records.
2768 * 2. In some cases we may not need to reserve new clusters (e.g. reflink);
2769 * in that case just pass data_ac = NULL.
2770 */
2771static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2772 u32 p_cluster, u32 num_clusters,
2773 struct ocfs2_extent_tree *et,
2774 struct ocfs2_caching_info *ref_ci,
2775 struct buffer_head *ref_root_bh,
2776 struct ocfs2_alloc_context **meta_ac,
2777 struct ocfs2_alloc_context **data_ac,
2778 int *credits)
2779{
2780 int ret = 0, meta_add = 0;
2781 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2782
2783 if (num_free_extents < 0) {
2784 ret = num_free_extents;
2785 mlog_errno(ret);
2786 goto out;
2787 }
2788
2789 if (num_free_extents < num_clusters + 2)
2790 meta_add =
2791 ocfs2_extend_meta_needed(et->et_root_el);
2792
2793 *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2794 num_clusters + 2);
2795
2796 ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2797 p_cluster, num_clusters,
2798 &meta_add, credits);
2799 if (ret) {
2800 mlog_errno(ret);
2801 goto out;
2802 }
2803
2804 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
2805 meta_add, num_clusters, *credits);
2806 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2807 meta_ac);
2808 if (ret) {
2809 mlog_errno(ret);
2810 goto out;
2811 }
2812
2813 if (data_ac) {
2814 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2815 data_ac);
2816 if (ret)
2817 mlog_errno(ret);
2818 }
2819
2820out:
2821 if (ret) {
2822 if (*meta_ac) {
2823 ocfs2_free_alloc_context(*meta_ac);
2824 *meta_ac = NULL;
2825 }
2826 }
2827
2828 return ret;
2829}
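/*
 * A worked example of the reservation above, with illustrative numbers:
 * for num_clusters = 10 we ask for extend credits covering 10 + 2 = 12
 * new leaf records, since splitting the old extent at both the head and
 * the tail of the CoW range can add up to two records beyond the new
 * clusters themselves.
 */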
2830
2831static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2832{
2833 BUG_ON(buffer_dirty(bh));
2834
2835 clear_buffer_mapped(bh);
2836
2837 return 0;
2838}
2839
2840static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2841 struct ocfs2_cow_context *context,
2842 u32 cpos, u32 old_cluster,
2843 u32 new_cluster, u32 new_len)
2844{
2845 int ret = 0, partial;
2846 struct ocfs2_caching_info *ci = context->data_et.et_ci;
2847 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2848 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2849 struct page *page;
2850 pgoff_t page_index;
2851 unsigned int from, to;
2852 loff_t offset, end, map_end;
2853 struct address_space *mapping = context->inode->i_mapping;
2854
2855 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2856 new_cluster, new_len, cpos);
2857
2858 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2859 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2860
2861 while (offset < end) {
2862 page_index = offset >> PAGE_CACHE_SHIFT;
2863 map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
2864 if (map_end > end)
2865 map_end = end;
2866
2867 /* from and to are offsets within the page. */
2868 from = offset & (PAGE_CACHE_SIZE - 1);
2869 to = PAGE_CACHE_SIZE;
2870 if (map_end & (PAGE_CACHE_SIZE - 1))
2871 to = map_end & (PAGE_CACHE_SIZE - 1);
2872
2873 page = grab_cache_page(mapping, page_index);
2874
2875 /* This page can't be dirtied before we CoW it out. */
2876 BUG_ON(PageDirty(page));
2877
2878 if (!PageUptodate(page)) {
2879 ret = block_read_full_page(page, ocfs2_get_block);
2880 if (ret) {
2881 mlog_errno(ret);
2882 goto unlock;
2883 }
2884 lock_page(page);
2885 }
2886
2887 if (page_has_buffers(page)) {
2888 ret = walk_page_buffers(handle, page_buffers(page),
2889 from, to, &partial,
2890 ocfs2_clear_cow_buffer);
2891 if (ret) {
2892 mlog_errno(ret);
2893 goto unlock;
2894 }
2895 }
2896
2897 ocfs2_map_and_dirty_page(context->inode,
2898 handle, from, to,
2899 page, 0, &new_block);
2900 mark_page_accessed(page);
2901unlock:
2902 unlock_page(page);
2903 page_cache_release(page);
2904 page = NULL;
2905 offset = map_end;
2906 if (ret)
2907 break;
2908 }
2909
2910 return ret;
2911}
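/*
 * The loop above splits the byte range [offset, end) into page-sized
 * pieces, clamping the last one.  A standalone sketch of that split,
 * assuming 4KB pages (PAGE_CACHE_SHIFT is configuration-dependent; the
 * shift below is illustrative):
 */
#include <stdio.h>
#include <stdint.h>

#define PG_SHIFT 12
#define PG_SIZE  (1UL << PG_SHIFT)

static void walk_pages(uint64_t offset, uint64_t end)
{
	while (offset < end) {
		uint64_t page_index = offset >> PG_SHIFT;
		uint64_t map_end = (page_index + 1) << PG_SHIFT;
		unsigned int from, to;

		if (map_end > end)
			map_end = end;

		/* from and to are offsets within this page. */
		from = offset & (PG_SIZE - 1);
		to = (map_end & (PG_SIZE - 1)) ? (map_end & (PG_SIZE - 1))
					       : PG_SIZE;

		printf("page %llu: bytes [%u, %u)\n",
		       (unsigned long long)page_index, from, to);
		offset = map_end;
	}
}

int main(void)
{
	walk_pages(6000, 14000);	/* spans pages 1, 2 and 3 */
	return 0;
}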
2912
2913static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
2914 struct ocfs2_cow_context *context,
2915 u32 cpos, u32 old_cluster,
2916 u32 new_cluster, u32 new_len)
2917{
2918 int ret = 0;
2919 struct super_block *sb = context->inode->i_sb;
2920 struct ocfs2_caching_info *ci = context->data_et.et_ci;
2921 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
2922 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
2923 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2924 struct ocfs2_super *osb = OCFS2_SB(sb);
2925 struct buffer_head *old_bh = NULL;
2926 struct buffer_head *new_bh = NULL;
2927
2928 mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
2929 new_cluster, new_len);
2930
2931 for (i = 0; i < blocks; i++, old_block++, new_block++) {
2932 new_bh = sb_getblk(osb->sb, new_block);
2933 if (new_bh == NULL) {
2934 ret = -EIO;
2935 mlog_errno(ret);
2936 break;
2937 }
2938
2939 ocfs2_set_new_buffer_uptodate(ci, new_bh);
2940
2941 ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
2942 if (ret) {
2943 mlog_errno(ret);
2944 break;
2945 }
2946
2947 ret = ocfs2_journal_access(handle, ci, new_bh,
2948 OCFS2_JOURNAL_ACCESS_CREATE);
2949 if (ret) {
2950 mlog_errno(ret);
2951 break;
2952 }
2953
2954 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
2955 ret = ocfs2_journal_dirty(handle, new_bh);
2956 if (ret) {
2957 mlog_errno(ret);
2958 break;
2959 }
2960
2961 brelse(new_bh);
2962 brelse(old_bh);
2963 new_bh = NULL;
2964 old_bh = NULL;
2965 }
2966
2967 brelse(new_bh);
2968 brelse(old_bh);
2969 return ret;
2970}
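/*
 * Design note (editorial reading of the two duplicate paths): file data
 * is duplicated through the page cache (ocfs2_duplicate_clusters_by_page)
 * so the regular write paths see the copies, while the block-by-block
 * jbd variant above copies via buffer heads, which suits xattr values
 * that are not mapped in the inode's page cache.  See where
 * cow_duplicate_clusters is assigned further down.
 */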
2971
2972static int ocfs2_clear_ext_refcount(handle_t *handle,
2973 struct ocfs2_extent_tree *et,
2974 u32 cpos, u32 p_cluster, u32 len,
2975 unsigned int ext_flags,
2976 struct ocfs2_alloc_context *meta_ac,
2977 struct ocfs2_cached_dealloc_ctxt *dealloc)
2978{
2979 int ret, index;
2980 struct ocfs2_extent_rec replace_rec;
2981 struct ocfs2_path *path = NULL;
2982 struct ocfs2_extent_list *el;
2983 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2984 u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
2985
2986 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
2987 (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
2988
2989 memset(&replace_rec, 0, sizeof(replace_rec));
2990 replace_rec.e_cpos = cpu_to_le32(cpos);
2991 replace_rec.e_leaf_clusters = cpu_to_le16(len);
2992 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
2993 p_cluster));
2994 replace_rec.e_flags = ext_flags;
2995 replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
2996
2997 path = ocfs2_new_path_from_et(et);
2998 if (!path) {
2999 ret = -ENOMEM;
3000 mlog_errno(ret);
3001 goto out;
3002 }
3003
3004 ret = ocfs2_find_path(et->et_ci, path, cpos);
3005 if (ret) {
3006 mlog_errno(ret);
3007 goto out;
3008 }
3009
3010 el = path_leaf_el(path);
3011
3012 index = ocfs2_search_extent_list(el, cpos);
3013 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
3014 ocfs2_error(sb,
3015 "Inode %llu has an extent at cpos %u which can no "
3016 "longer be found.\n",
3017 (unsigned long long)ino, cpos);
3018 ret = -EROFS;
3019 goto out;
3020 }
3021
3022 ret = ocfs2_split_extent(handle, et, path, index,
3023 &replace_rec, meta_ac, dealloc);
3024 if (ret)
3025 mlog_errno(ret);
3026
3027out:
3028 ocfs2_free_path(path);
3029 return ret;
3030}
3031
3032static int ocfs2_replace_clusters(handle_t *handle,
3033 struct ocfs2_cow_context *context,
3034 u32 cpos, u32 old,
3035 u32 new, u32 len,
3036 unsigned int ext_flags)
3037{
3038 int ret;
3039 struct ocfs2_caching_info *ci = context->data_et.et_ci;
3040 u64 ino = ocfs2_metadata_cache_owner(ci);
3041
3042 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
3043 (unsigned long long)ino, cpos, old, new, len, ext_flags);
3044
3045 /* If the old clusters are unwritten, no need to duplicate. */
3046 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3047 ret = context->cow_duplicate_clusters(handle, context, cpos,
3048 old, new, len);
3049 if (ret) {
3050 mlog_errno(ret);
3051 goto out;
3052 }
3053 }
3054
3055 ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3056 cpos, new, len, ext_flags,
3057 context->meta_ac, &context->dealloc);
3058 if (ret)
3059 mlog_errno(ret);
3060out:
3061 return ret;
3062}
3063
3064static int ocfs2_cow_sync_writeback(struct super_block *sb,
3065 struct ocfs2_cow_context *context,
3066 u32 cpos, u32 num_clusters)
3067{
3068 int ret = 0;
3069 loff_t offset, end, map_end;
3070 pgoff_t page_index;
3071 struct page *page;
3072
3073 if (ocfs2_should_order_data(context->inode))
3074 return 0;
3075
3076 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3077 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3078
3079 ret = filemap_fdatawrite_range(context->inode->i_mapping,
3080 offset, end - 1);
3081 if (ret < 0) {
3082 mlog_errno(ret);
3083 return ret;
3084 }
3085
3086 while (offset < end) {
3087 page_index = offset >> PAGE_CACHE_SHIFT;
3088 map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
3089 if (map_end > end)
3090 map_end = end;
3091
3092 page = grab_cache_page(context->inode->i_mapping, page_index);
3093 BUG_ON(!page);
3094
3095 wait_on_page_writeback(page);
3096 if (PageError(page)) {
3097 ret = -EIO;
3098 mlog_errno(ret);
3099 } else
3100 mark_page_accessed(page);
3101
3102 unlock_page(page);
3103 page_cache_release(page);
3104 page = NULL;
3105 offset = map_end;
3106 if (ret)
3107 break;
3108 }
3109
3110 return ret;
3111}
3112
3113static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3114 u32 v_cluster, u32 *p_cluster,
3115 u32 *num_clusters,
3116 unsigned int *extent_flags)
3117{
3118 return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3119 num_clusters, extent_flags);
3120}
3121
3122static int ocfs2_make_clusters_writable(struct super_block *sb,
3123 struct ocfs2_cow_context *context,
3124 u32 cpos, u32 p_cluster,
3125 u32 num_clusters, unsigned int e_flags)
3126{
3127 int ret, delete, index, credits = 0;
3128 u32 new_bit, new_len;
3129 unsigned int set_len;
3130 struct ocfs2_super *osb = OCFS2_SB(sb);
3131 handle_t *handle;
3132 struct buffer_head *ref_leaf_bh = NULL;
3133 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3134 struct ocfs2_refcount_rec rec;
3135
3136 mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
3137 cpos, p_cluster, num_clusters, e_flags);
3138
3139 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3140 &context->data_et,
3141 ref_ci,
3142 context->ref_root_bh,
3143 &context->meta_ac,
3144 &context->data_ac, &credits);
3145 if (ret) {
3146 mlog_errno(ret);
3147 return ret;
3148 }
3149
3150 if (context->post_refcount)
3151 credits += context->post_refcount->credits;
3152
3153 credits += context->extra_credits;
3154 handle = ocfs2_start_trans(osb, credits);
3155 if (IS_ERR(handle)) {
3156 ret = PTR_ERR(handle);
3157 mlog_errno(ret);
3158 goto out;
3159 }
3160
3161 while (num_clusters) {
3162 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3163 p_cluster, num_clusters,
3164 &rec, &index, &ref_leaf_bh);
3165 if (ret) {
3166 mlog_errno(ret);
3167 goto out_commit;
3168 }
3169
3170 BUG_ON(!rec.r_refcount);
3171 set_len = min((u64)p_cluster + num_clusters,
3172 le64_to_cpu(rec.r_cpos) +
3173 le32_to_cpu(rec.r_clusters)) - p_cluster;
3174
3175 /*
3176 * There are two different situations here.
3177 * 1. If refcount == 1, remove the flag and don't CoW.
3178 * 2. If refcount > 1, allocate clusters.
3179 * We may not be able to cover the whole range in one pass, so continue
3180 * until we have handled num_clusters clusters.
3181 */
3182 if (le32_to_cpu(rec.r_refcount) == 1) {
3183 delete = 0;
3184 ret = ocfs2_clear_ext_refcount(handle,
3185 &context->data_et,
3186 cpos, p_cluster,
3187 set_len, e_flags,
3188 context->meta_ac,
3189 &context->dealloc);
3190 if (ret) {
3191 mlog_errno(ret);
3192 goto out_commit;
3193 }
3194 } else {
3195 delete = 1;
3196
3197 ret = __ocfs2_claim_clusters(osb, handle,
3198 context->data_ac,
3199 1, set_len,
3200 &new_bit, &new_len);
3201 if (ret) {
3202 mlog_errno(ret);
3203 goto out_commit;
3204 }
3205
3206 ret = ocfs2_replace_clusters(handle, context,
3207 cpos, p_cluster, new_bit,
3208 new_len, e_flags);
3209 if (ret) {
3210 mlog_errno(ret);
3211 goto out_commit;
3212 }
3213 set_len = new_len;
3214 }
3215
3216 ret = __ocfs2_decrease_refcount(handle, ref_ci,
3217 context->ref_root_bh,
3218 p_cluster, set_len,
3219 context->meta_ac,
3220 &context->dealloc, delete);
3221 if (ret) {
3222 mlog_errno(ret);
3223 goto out_commit;
3224 }
3225
3226 cpos += set_len;
3227 p_cluster += set_len;
3228 num_clusters -= set_len;
3229 brelse(ref_leaf_bh);
3230 ref_leaf_bh = NULL;
3231 }
3232
3233 /* handle any post_cow action. */
3234 if (context->post_refcount && context->post_refcount->func) {
3235 ret = context->post_refcount->func(context->inode, handle,
3236 context->post_refcount->para);
3237 if (ret) {
3238 mlog_errno(ret);
3239 goto out_commit;
3240 }
3241 }
3242
3243 /*
3244 * Here we should write the new page out first if we are
3245 * in write-back mode.
3246 */
3247 if (context->get_clusters == ocfs2_di_get_clusters) {
3248 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
3249 if (ret)
3250 mlog_errno(ret);
3251 }
3252
3253out_commit:
3254 ocfs2_commit_trans(osb, handle);
3255
3256out:
3257 if (context->data_ac) {
3258 ocfs2_free_alloc_context(context->data_ac);
3259 context->data_ac = NULL;
3260 }
3261 if (context->meta_ac) {
3262 ocfs2_free_alloc_context(context->meta_ac);
3263 context->meta_ac = NULL;
3264 }
3265 brelse(ref_leaf_bh);
3266
3267 return ret;
3268}
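/*
 * A standalone sketch of the record-chunking logic above: a physical
 * range [p_cluster, p_cluster + num_clusters) may span several refcount
 * records, so each pass clamps its work (set_len) to the end of the
 * current record.  The record layout is a simplified stand-in for
 * struct ocfs2_refcount_rec, and the records are indexed directly
 * instead of being looked up via ocfs2_get_refcount_rec().
 */
#include <stdio.h>
#include <stdint.h>

struct rec {
	uint64_t r_cpos;	/* first physical cluster covered */
	uint32_t r_clusters;	/* number of clusters covered */
	uint32_t r_refcount;	/* reference count for the range */
};

static uint32_t chunk_len(const struct rec *rec, uint64_t p_cluster,
			  uint32_t num_clusters)
{
	uint64_t range_end = p_cluster + num_clusters;
	uint64_t rec_end = rec->r_cpos + rec->r_clusters;

	return (uint32_t)((range_end < rec_end ? range_end : rec_end) -
			  p_cluster);
}

int main(void)
{
	/* Two adjacent records; process 150 clusters starting at 120. */
	struct rec recs[] = { { 100, 80, 2 }, { 180, 200, 1 } };
	uint64_t p_cluster = 120;
	uint32_t num_clusters = 150;

	for (int i = 0; num_clusters && i < 2; i++) {
		uint32_t set_len = chunk_len(&recs[i], p_cluster,
					     num_clusters);

		/* refcount == 1: just clear the flag; > 1: CoW. */
		printf("rec %d (refcount %u): %u clusters at %llu\n",
		       i, recs[i].r_refcount, set_len,
		       (unsigned long long)p_cluster);
		p_cluster += set_len;
		num_clusters -= set_len;
	}
	return 0;
}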
3269
3270static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3271{
3272 int ret = 0;
3273 struct inode *inode = context->inode;
3274 u32 cow_start = context->cow_start, cow_len = context->cow_len;
3275 u32 p_cluster, num_clusters;
3276 unsigned int ext_flags;
3277 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3278
3279 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3280 ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount "
3281 "tree, but the feature bit is not set in the "
3282 "super block.", inode->i_ino);
3283 return -EROFS;
3284 }
3285
3286 ocfs2_init_dealloc_ctxt(&context->dealloc);
3287
3288 while (cow_len) {
3289 ret = context->get_clusters(context, cow_start, &p_cluster,
3290 &num_clusters, &ext_flags);
3291 if (ret) {
3292 mlog_errno(ret);
3293 break;
3294 }
3295
3296 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3297
3298 if (cow_len < num_clusters)
3299 num_clusters = cow_len;
3300
3301 ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3302 cow_start, p_cluster,
3303 num_clusters, ext_flags);
3304 if (ret) {
3305 mlog_errno(ret);
3306 break;
3307 }
3308
3309 cow_len -= num_clusters;
3310 cow_start += num_clusters;
3311 }
3312
3313 if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3314 ocfs2_schedule_truncate_log_flush(osb, 1);
3315 ocfs2_run_deallocs(osb, &context->dealloc);
3316 }
3317
3318 return ret;
3319}
3320
3321/*
3322 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3323 * past max_cpos. This will stop when it runs into a hole or an
3324 * unrefcounted extent.
3325 */
3326static int ocfs2_refcount_cow_hunk(struct inode *inode,
3327 struct buffer_head *di_bh,
3328 u32 cpos, u32 write_len, u32 max_cpos)
3329{
3330 int ret;
3331 u32 cow_start = 0, cow_len = 0;
3332 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3333 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3334 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3335 struct buffer_head *ref_root_bh = NULL;
3336 struct ocfs2_refcount_tree *ref_tree;
3337 struct ocfs2_cow_context *context = NULL;
3338
3339 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3340
3341 ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3342 cpos, write_len, max_cpos,
3343 &cow_start, &cow_len);
3344 if (ret) {
3345 mlog_errno(ret);
3346 goto out;
3347 }
3348
3349 mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
3350 "cow_len %u\n", inode->i_ino,
3351 cpos, write_len, cow_start, cow_len);
3352
3353 BUG_ON(cow_len == 0);
3354
3355 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3356 if (!context) {
3357 ret = -ENOMEM;
3358 mlog_errno(ret);
3359 goto out;
3360 }
3361
3362 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3363 1, &ref_tree, &ref_root_bh);
3364 if (ret) {
3365 mlog_errno(ret);
3366 goto out;
3367 }
3368
3369 context->inode = inode;
3370 context->cow_start = cow_start;
3371 context->cow_len = cow_len;
3372 context->ref_tree = ref_tree;
3373 context->ref_root_bh = ref_root_bh;
3374 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3375 context->get_clusters = ocfs2_di_get_clusters;
3376
3377 ocfs2_init_dinode_extent_tree(&context->data_et,
3378 INODE_CACHE(inode), di_bh);
3379
3380 ret = ocfs2_replace_cow(context);
3381 if (ret)
3382 mlog_errno(ret);
3383
3384 /*
3385 * Truncate the extent map here: no matter whether we hit an
3386 * error during the operation, we shouldn't trust the cached
3387 * extent map any more.
3388 */
3389 ocfs2_extent_map_trunc(inode, cow_start);
3390
3391 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3392 brelse(ref_root_bh);
3393out:
3394 kfree(context);
3395 return ret;
3396}
3397
3398/*
3399 * CoW any and all clusters between cpos and cpos+write_len.
3400 * Don't CoW past max_cpos. If this returns successfully, all
3401 * clusters between cpos and cpos+write_len are safe to modify.
3402 */
3403int ocfs2_refcount_cow(struct inode *inode,
3404 struct buffer_head *di_bh,
3405 u32 cpos, u32 write_len, u32 max_cpos)
3406{
3407 int ret = 0;
3408 u32 p_cluster, num_clusters;
3409 unsigned int ext_flags;
3410
3411 while (write_len) {
3412 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3413 &num_clusters, &ext_flags);
3414 if (ret) {
3415 mlog_errno(ret);
3416 break;
3417 }
3418
3419 if (write_len < num_clusters)
3420 num_clusters = write_len;
3421
3422 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3423 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3424 num_clusters, max_cpos);
3425 if (ret) {
3426 mlog_errno(ret);
3427 break;
3428 }
3429 }
3430
3431 write_len -= num_clusters;
3432 cpos += num_clusters;
3433 }
3434
3435 return ret;
3436}
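/*
 * A hedged usage sketch (locking and buffer acquisition elided): a
 * writer that may touch shared clusters CoWs them before writing.
 * Passing a large max_cpos such as UINT_MAX lets the CoW hunk extend
 * past the write itself for better contiguity; the wrapper below is
 * illustrative and not a function in this file.
 */
static inline int example_cow_before_write(struct inode *inode,
					   struct buffer_head *di_bh,
					   u32 cpos, u32 write_len)
{
	return ocfs2_refcount_cow(inode, di_bh, cpos, write_len, UINT_MAX);
}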
3437
3438static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3439 u32 v_cluster, u32 *p_cluster,
3440 u32 *num_clusters,
3441 unsigned int *extent_flags)
3442{
3443 struct inode *inode = context->inode;
3444 struct ocfs2_xattr_value_root *xv = context->cow_object;
3445
3446 return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3447 num_clusters, &xv->xr_list,
3448 extent_flags);
3449}
3450
3451/*
3452 * Given an xattr value root, calculate the maximum metadata blocks and
3453 * journal credits we will need for the refcount tree change if we truncate it to 0.
3454 */
3455int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3456 struct ocfs2_caching_info *ref_ci,
3457 struct buffer_head *ref_root_bh,
3458 struct ocfs2_xattr_value_root *xv,
3459 int *meta_add, int *credits)
3460{
3461 int ret = 0, index, ref_blocks = 0;
3462 u32 p_cluster, num_clusters;
3463 u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3464 struct ocfs2_refcount_block *rb;
3465 struct ocfs2_refcount_rec rec;
3466 struct buffer_head *ref_leaf_bh = NULL;
3467
3468 while (cpos < clusters) {
3469 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3470 &num_clusters, &xv->xr_list,
3471 NULL);
3472 if (ret) {
3473 mlog_errno(ret);
3474 goto out;
3475 }
3476
3477 cpos += num_clusters;
3478
3479 while (num_clusters) {
3480 ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3481 p_cluster, num_clusters,
3482 &rec, &index,
3483 &ref_leaf_bh);
3484 if (ret) {
3485 mlog_errno(ret);
3486 goto out;
3487 }
3488
3489 BUG_ON(!rec.r_refcount);
3490
3491 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3492
3493 /*
3494 * We really don't know whether the other clusters are in
3495 * this refcount block or not, so take the worst case:
3496 * all the clusters are in this block and each one will
3497 * split a refcount rec, so in total we need
3498 * clusters * 2 new refcount recs.
3499 */
3500 if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3501 le16_to_cpu(rb->rf_records.rl_count))
3502 ref_blocks++;
3503
3504 *credits += 1;
3505 brelse(ref_leaf_bh);
3506 ref_leaf_bh = NULL;
3507
3508 if (num_clusters <= le32_to_cpu(rec.r_clusters))
3509 break;
3510 else
3511 num_clusters -= le32_to_cpu(rec.r_clusters);
3512 p_cluster += num_clusters;
3513 }
3514 }
3515
3516 *meta_add += ref_blocks;
3517 if (!ref_blocks)
3518 goto out;
3519
3520 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3521 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
3522 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3523 else {
3524 struct ocfs2_extent_tree et;
3525
3526 ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3527 *credits += ocfs2_calc_extend_credits(inode->i_sb,
3528 et.et_root_el,
3529 ref_blocks);
3530 }
3531
3532out:
3533 brelse(ref_leaf_bh);
3534 return ret;
3535}
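/*
 * A worked example of the worst-case estimate above, with illustrative
 * numbers: for a leaf holding rl_used = 100 records out of rl_count =
 * 252 slots, truncating an xattr value of clusters = 80 could split up
 * to 80 * 2 = 160 new records; 100 + 160 = 260 > 252, so one extra
 * refcount block is reserved for that leaf.
 */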
3536
3537/*
3538 * Do CoW for xattr.
3539 */
3540int ocfs2_refcount_cow_xattr(struct inode *inode,
3541 struct ocfs2_dinode *di,
3542 struct ocfs2_xattr_value_buf *vb,
3543 struct ocfs2_refcount_tree *ref_tree,
3544 struct buffer_head *ref_root_bh,
3545 u32 cpos, u32 write_len,
3546 struct ocfs2_post_refcount *post)
3547{
3548 int ret;
3549 struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3550 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3551 struct ocfs2_cow_context *context = NULL;
3552 u32 cow_start, cow_len;
3553
3554 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3555
3556 ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3557 cpos, write_len, UINT_MAX,
3558 &cow_start, &cow_len);
3559 if (ret) {
3560 mlog_errno(ret);
3561 goto out;
3562 }
3563
3564 BUG_ON(cow_len == 0);
3565
3566 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3567 if (!context) {
3568 ret = -ENOMEM;
3569 mlog_errno(ret);
3570 goto out;
3571 }
3572
3573 context->inode = inode;
3574 context->cow_start = cow_start;
3575 context->cow_len = cow_len;
3576 context->ref_tree = ref_tree;
3577 context->ref_root_bh = ref_root_bh;
3578 context->cow_object = xv;
3579
3580 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3581 /* We need the extra credits for duplicate_clusters by jbd. */
3582 context->extra_credits =
3583 ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3584 context->get_clusters = ocfs2_xattr_value_get_clusters;
3585 context->post_refcount = post;
3586
3587 ocfs2_init_xattr_value_extent_tree(&context->data_et,
3588 INODE_CACHE(inode), vb);
3589
3590 ret = ocfs2_replace_cow(context);
3591 if (ret)
3592 mlog_errno(ret);
3593
3594out:
3595 kfree(context);
3596 return ret;
3597}
3598
3599/*
3600 * Insert a new extent into the refcount tree and mark an extent rec
3601 * as refcounted in the dinode tree.
3602 */
3603int ocfs2_add_refcount_flag(struct inode *inode,
3604 struct ocfs2_extent_tree *data_et,
3605 struct ocfs2_caching_info *ref_ci,
3606 struct buffer_head *ref_root_bh,
3607 u32 cpos, u32 p_cluster, u32 num_clusters,
3608 struct ocfs2_cached_dealloc_ctxt *dealloc,
3609 struct ocfs2_post_refcount *post)
3610{
3611 int ret;
3612 handle_t *handle;
3613 int credits = 1, ref_blocks = 0;
3614 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3615 struct ocfs2_alloc_context *meta_ac = NULL;
3616
3617 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3618 ref_ci, ref_root_bh,
3619 p_cluster, num_clusters,
3620 &ref_blocks, &credits);
3621 if (ret) {
3622 mlog_errno(ret);
3623 goto out;
3624 }
3625
3626 mlog(0, "reserve new metadata %d, credits = %d\n",
3627 ref_blocks, credits);
3628
3629 if (ref_blocks) {
3630 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
3631 ref_blocks, &meta_ac);
3632 if (ret) {
3633 mlog_errno(ret);
3634 goto out;
3635 }
3636 }
3637
3638 if (post)
3639 credits += post->credits;
3640
3641 handle = ocfs2_start_trans(osb, credits);
3642 if (IS_ERR(handle)) {
3643 ret = PTR_ERR(handle);
3644 mlog_errno(ret);
3645 goto out;
3646 }
3647
3648 ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3649 cpos, num_clusters, p_cluster,
3650 meta_ac, dealloc);
3651 if (ret) {
3652 mlog_errno(ret);
3653 goto out_commit;
3654 }
3655
3656 ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3657 p_cluster, num_clusters, 0,
3658 meta_ac, dealloc);
3659 if (ret) {
3660 mlog_errno(ret);
3661 goto out_commit;
3662 }
3663
3664 if (post && post->func) {
3665 ret = post->func(inode, handle, post->para);
3666 if (ret)
3667 mlog_errno(ret);
3668 }
3669
3670out_commit:
3671 ocfs2_commit_trans(osb, handle);
3672out:
3673 if (meta_ac)
3674 ocfs2_free_alloc_context(meta_ac);
3675 return ret;
3676}
3677
3678static int ocfs2_change_ctime(struct inode *inode,
3679 struct buffer_head *di_bh)
3680{
3681 int ret;
3682 handle_t *handle;
3683 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3684
3685 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3686 OCFS2_INODE_UPDATE_CREDITS);
3687 if (IS_ERR(handle)) {
3688 ret = PTR_ERR(handle);
3689 mlog_errno(ret);
3690 goto out;
3691 }
3692
3693 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3694 OCFS2_JOURNAL_ACCESS_WRITE);
3695 if (ret) {
3696 mlog_errno(ret);
3697 goto out_commit;
3698 }
3699
3700 inode->i_ctime = CURRENT_TIME;
3701 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3702 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3703
3704 ocfs2_journal_dirty(handle, di_bh);
3705
3706out_commit:
3707 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3708out:
3709 return ret;
3710}
3711
3712static int ocfs2_attach_refcount_tree(struct inode *inode,
3713 struct buffer_head *di_bh)
3714{
3715 int ret, data_changed = 0;
3716 struct buffer_head *ref_root_bh = NULL;
3717 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3718 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3719 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3720 struct ocfs2_refcount_tree *ref_tree;
3721 unsigned int ext_flags;
3722 loff_t size;
3723 u32 cpos, num_clusters, clusters, p_cluster;
3724 struct ocfs2_cached_dealloc_ctxt dealloc;
3725 struct ocfs2_extent_tree di_et;
3726
3727 ocfs2_init_dealloc_ctxt(&dealloc);
3728
3729 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
3730 ret = ocfs2_create_refcount_tree(inode, di_bh);
3731 if (ret) {
3732 mlog_errno(ret);
3733 goto out;
3734 }
3735 }
3736
3737 BUG_ON(!di->i_refcount_loc);
3738 ret = ocfs2_lock_refcount_tree(osb,
3739 le64_to_cpu(di->i_refcount_loc), 1,
3740 &ref_tree, &ref_root_bh);
3741 if (ret) {
3742 mlog_errno(ret);
3743 goto out;
3744 }
3745
3746 ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3747
3748 size = i_size_read(inode);
3749 clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3750
3751 cpos = 0;
3752 while (cpos < clusters) {
3753 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3754 &num_clusters, &ext_flags);
3755
3756 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3757 ret = ocfs2_add_refcount_flag(inode, &di_et,
3758 &ref_tree->rf_ci,
3759 ref_root_bh, cpos,
3760 p_cluster, num_clusters,
3761 &dealloc, NULL);
3762 if (ret) {
3763 mlog_errno(ret);
3764 goto unlock;
3765 }
3766
3767 data_changed = 1;
3768 }
3769 cpos += num_clusters;
3770 }
3771
3772 if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3773 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3774 &ref_tree->rf_ci,
3775 ref_root_bh,
3776 &dealloc);
3777 if (ret) {
3778 mlog_errno(ret);
3779 goto unlock;
3780 }
3781 }
3782
3783 if (data_changed) {
3784 ret = ocfs2_change_ctime(inode, di_bh);
3785 if (ret)
3786 mlog_errno(ret);
3787 }
3788
3789unlock:
3790 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3791 brelse(ref_root_bh);
3792
3793 if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3794 ocfs2_schedule_truncate_log_flush(osb, 1);
3795 ocfs2_run_deallocs(osb, &dealloc);
3796 }
3797out:
3798 /*
3799 * Empty the extent map so that we may get the right extent
3800 * record from the disk.
3801 */
3802 ocfs2_extent_map_trunc(inode, 0);
3803
3804 return ret;
3805}
3806
3807static int ocfs2_add_refcounted_extent(struct inode *inode,
3808 struct ocfs2_extent_tree *et,
3809 struct ocfs2_caching_info *ref_ci,
3810 struct buffer_head *ref_root_bh,
3811 u32 cpos, u32 p_cluster, u32 num_clusters,
3812 unsigned int ext_flags,
3813 struct ocfs2_cached_dealloc_ctxt *dealloc)
3814{
3815 int ret;
3816 handle_t *handle;
3817 int credits = 0;
3818 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3819 struct ocfs2_alloc_context *meta_ac = NULL;
3820
3821 ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3822 p_cluster, num_clusters,
3823 et, ref_ci,
3824 ref_root_bh, &meta_ac,
3825 NULL, &credits);
3826 if (ret) {
3827 mlog_errno(ret);
3828 goto out;
3829 }
3830
3831 handle = ocfs2_start_trans(osb, credits);
3832 if (IS_ERR(handle)) {
3833 ret = PTR_ERR(handle);
3834 mlog_errno(ret);
3835 goto out;
3836 }
3837
3838 ret = ocfs2_insert_extent(handle, et, cpos,
3839 cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
3840 p_cluster)),
3841 num_clusters, ext_flags, meta_ac);
3842 if (ret) {
3843 mlog_errno(ret);
3844 goto out_commit;
3845 }
3846
3847 ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3848 p_cluster, num_clusters,
3849 meta_ac, dealloc);
3850 if (ret)
3851 mlog_errno(ret);
3852
3853out_commit:
3854 ocfs2_commit_trans(osb, handle);
3855out:
3856 if (meta_ac)
3857 ocfs2_free_alloc_context(meta_ac);
3858 return ret;
3859}
3860
3861static int ocfs2_duplicate_extent_list(struct inode *s_inode,
3862 struct inode *t_inode,
3863 struct buffer_head *t_bh,
3864 struct ocfs2_caching_info *ref_ci,
3865 struct buffer_head *ref_root_bh,
3866 struct ocfs2_cached_dealloc_ctxt *dealloc)
3867{
3868 int ret = 0;
3869 u32 p_cluster, num_clusters, clusters, cpos;
3870 loff_t size;
3871 unsigned int ext_flags;
3872 struct ocfs2_extent_tree et;
3873
3874 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
3875
3876 size = i_size_read(s_inode);
3877 clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
3878
3879 cpos = 0;
3880 while (cpos < clusters) {
3881 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
3882 &num_clusters, &ext_flags);
3883
3884 if (p_cluster) {
3885 ret = ocfs2_add_refcounted_extent(t_inode, &et,
3886 ref_ci, ref_root_bh,
3887 cpos, p_cluster,
3888 num_clusters,
3889 ext_flags,
3890 dealloc);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out;
3894 }
3895 }
3896
3897 cpos += num_clusters;
3898 }
3899
3900out:
3901 return ret;
3902}
3903
3904/*
3905 * Change the new file's attributes to match the source.
3906 *
3907 * reflink creates a snapshot of a file; that means the attributes
3908 * must be identical except for three exceptions - nlink, ino, and ctime.
3909 */
3910static int ocfs2_complete_reflink(struct inode *s_inode,
3911 struct buffer_head *s_bh,
3912 struct inode *t_inode,
3913 struct buffer_head *t_bh,
3914 bool preserve)
3915{
3916 int ret;
3917 handle_t *handle;
3918 struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
3919 struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
3920 loff_t size = i_size_read(s_inode);
3921
3922 handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
3923 OCFS2_INODE_UPDATE_CREDITS);
3924 if (IS_ERR(handle)) {
3925 ret = PTR_ERR(handle);
3926 mlog_errno(ret);
3927 return ret;
3928 }
3929
3930 ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
3931 OCFS2_JOURNAL_ACCESS_WRITE);
3932 if (ret) {
3933 mlog_errno(ret);
3934 goto out_commit;
3935 }
3936
3937 spin_lock(&OCFS2_I(t_inode)->ip_lock);
3938 OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
3939 OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
3940 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
3941 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3942 i_size_write(t_inode, size);
3943
3944 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
3945 di->i_clusters = s_di->i_clusters;
3946 di->i_size = s_di->i_size;
3947 di->i_dyn_features = s_di->i_dyn_features;
3948 di->i_attr = s_di->i_attr;
3949
3950 if (preserve) {
3951 di->i_uid = s_di->i_uid;
3952 di->i_gid = s_di->i_gid;
3953 di->i_mode = s_di->i_mode;
3954
3955 /*
3956 * Update the timestamps:
3957 * we want mtime to appear identical to the source, while
3958 * ctime is updated.
3959 */
3960 t_inode->i_ctime = CURRENT_TIME;
3961
3962 di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
3963 di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
3964
3965 t_inode->i_mtime = s_inode->i_mtime;
3966 di->i_mtime = s_di->i_mtime;
3967 di->i_mtime_nsec = s_di->i_mtime_nsec;
3968 }
3969
3970 ocfs2_journal_dirty(handle, t_bh);
3971
3972out_commit:
3973 ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
3974 return ret;
3975}
3976
3977static int ocfs2_create_reflink_node(struct inode *s_inode,
3978 struct buffer_head *s_bh,
3979 struct inode *t_inode,
3980 struct buffer_head *t_bh,
3981 bool preserve)
3982{
3983 int ret;
3984 struct buffer_head *ref_root_bh = NULL;
3985 struct ocfs2_cached_dealloc_ctxt dealloc;
3986 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
3987 struct ocfs2_refcount_block *rb;
3988 struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
3989 struct ocfs2_refcount_tree *ref_tree;
3990
3991 ocfs2_init_dealloc_ctxt(&dealloc);
3992
3993 ret = ocfs2_set_refcount_tree(t_inode, t_bh,
3994 le64_to_cpu(di->i_refcount_loc));
3995 if (ret) {
3996 mlog_errno(ret);
3997 goto out;
3998 }
3999
4000 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4001 1, &ref_tree, &ref_root_bh);
4002 if (ret) {
4003 mlog_errno(ret);
4004 goto out;
4005 }
4006 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
4007
4008 ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4009 &ref_tree->rf_ci, ref_root_bh,
4010 &dealloc);
4011 if (ret) {
4012 mlog_errno(ret);
4013 goto out_unlock_refcount;
4014 }
4015
4016 ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve);
4017 if (ret)
4018 mlog_errno(ret);
4019
4020out_unlock_refcount:
4021 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4022 brelse(ref_root_bh);
4023out:
4024 if (ocfs2_dealloc_has_cluster(&dealloc)) {
4025 ocfs2_schedule_truncate_log_flush(osb, 1);
4026 ocfs2_run_deallocs(osb, &dealloc);
4027 }
4028
4029 return ret;
4030}
4031
4032static int __ocfs2_reflink(struct dentry *old_dentry,
4033 struct buffer_head *old_bh,
4034 struct inode *new_inode,
4035 bool preserve)
4036{
4037 int ret;
4038 struct inode *inode = old_dentry->d_inode;
4039 struct buffer_head *new_bh = NULL;
4040
4041 ret = filemap_fdatawrite(inode->i_mapping);
4042 if (ret) {
4043 mlog_errno(ret);
4044 goto out;
4045 }
4046
4047 ret = ocfs2_attach_refcount_tree(inode, old_bh);
4048 if (ret) {
4049 mlog_errno(ret);
4050 goto out;
4051 }
4052
4053 mutex_lock(&new_inode->i_mutex);
4054 ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
4055 if (ret) {
4056 mlog_errno(ret);
4057 goto out_unlock;
4058 }
4059
4060 ret = ocfs2_create_reflink_node(inode, old_bh,
4061 new_inode, new_bh, preserve);
4062 if (ret) {
4063 mlog_errno(ret);
4064 goto inode_unlock;
4065 }
4066
4067 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4068 ret = ocfs2_reflink_xattrs(inode, old_bh,
4069 new_inode, new_bh,
4070 preserve);
4071 if (ret)
4072 mlog_errno(ret);
4073 }
4074inode_unlock:
4075 ocfs2_inode_unlock(new_inode, 1);
4076 brelse(new_bh);
4077out_unlock:
4078 mutex_unlock(&new_inode->i_mutex);
4079out:
4080 if (!ret) {
4081 ret = filemap_fdatawait(inode->i_mapping);
4082 if (ret)
4083 mlog_errno(ret);
4084 }
4085 return ret;
4086}
4087
4088static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4089 struct dentry *new_dentry, bool preserve)
4090{
4091 int error;
4092 struct inode *inode = old_dentry->d_inode;
4093 struct buffer_head *old_bh = NULL;
4094 struct inode *new_orphan_inode = NULL;
4095
4096 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4097 return -EOPNOTSUPP;
4098
4099 error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4100 &new_orphan_inode);
4101 if (error) {
4102 mlog_errno(error);
4103 goto out;
4104 }
4105
4106 error = ocfs2_inode_lock(inode, &old_bh, 1);
4107 if (error) {
4108 mlog_errno(error);
4109 goto out;
4110 }
4111
4112 down_write(&OCFS2_I(inode)->ip_xattr_sem);
4113 down_write(&OCFS2_I(inode)->ip_alloc_sem);
4114 error = __ocfs2_reflink(old_dentry, old_bh,
4115 new_orphan_inode, preserve);
4116 up_write(&OCFS2_I(inode)->ip_alloc_sem);
4117 up_write(&OCFS2_I(inode)->ip_xattr_sem);
4118
4119 ocfs2_inode_unlock(inode, 1);
4120 brelse(old_bh);
4121
4122 if (error) {
4123 mlog_errno(error);
4124 goto out;
4125 }
4126
4127 /* If the attributes aren't preserved, we need to re-initialize the security and ACLs. */
4128 if (!preserve) {
4129 error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
4130 if (error)
4131 mlog_errno(error);
4132 }
4133out:
4134 if (!error) {
4135 error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4136 new_dentry);
4137 if (error)
4138 mlog_errno(error);
4139 }
4140
4141 if (new_orphan_inode) {
4142 /*
4143 * We need to open_unlock the inode no matter whether we
4144 * succeed or not, so that other nodes can delete it later.
4145 */
4146 ocfs2_open_unlock(new_orphan_inode);
4147 if (error)
4148 iput(new_orphan_inode);
4149 }
4150
4151 return error;
4152}
4153
4154/*
4155 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4156 * sys_reflink(). This will go away when vfs_reflink() exists in
4157 * fs/namei.c.
4158 */
4159
4160/* copied from may_create in VFS. */
4161static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4162{
4163 if (child->d_inode)
4164 return -EEXIST;
4165 if (IS_DEADDIR(dir))
4166 return -ENOENT;
4167 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4168}
4169
4170/* copied from user_path_parent. */
4171static int ocfs2_user_path_parent(const char __user *path,
4172 struct nameidata *nd, char **name)
4173{
4174 char *s = getname(path);
4175 int error;
4176
4177 if (IS_ERR(s))
4178 return PTR_ERR(s);
4179
4180 error = path_lookup(s, LOOKUP_PARENT, nd);
4181 if (error)
4182 putname(s);
4183 else
4184 *name = s;
4185
4186 return error;
4187}
4188
4189/**
4190 * ocfs2_vfs_reflink - Create a reference-counted link
4191 *
4192 * @old_dentry: source dentry + inode
4193 * @dir: directory to create the target
4194 * @new_dentry: target dentry
4195 * @preserve: if true, preserve all file attributes
4196 */
4197int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4198 struct dentry *new_dentry, bool preserve)
4199{
4200 struct inode *inode = old_dentry->d_inode;
4201 int error;
4202
4203 if (!inode)
4204 return -ENOENT;
4205
4206 error = ocfs2_may_create(dir, new_dentry);
4207 if (error)
4208 return error;
4209
4210 if (dir->i_sb != inode->i_sb)
4211 return -EXDEV;
4212
4213 /*
4214 * A reflink to an append-only or immutable file cannot be created.
4215 */
4216 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4217 return -EPERM;
4218
4219 /* Only regular files can be reflinked. */
4220 if (!S_ISREG(inode->i_mode))
4221 return -EPERM;
4222
4223 /*
4224 * If the caller wants to preserve ownership, they must have the
4225 * rights to do so.
4226 */
4227 if (preserve) {
4228 if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
4229 return -EPERM;
4230 if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4231 return -EPERM;
4232 }
4233
4234 /*
4235 * If the caller is modifying any aspect of the attributes, they
4236 * are not creating a snapshot. They need read permission on the
4237 * file.
4238 */
4239 if (!preserve) {
4240 error = inode_permission(inode, MAY_READ);
4241 if (error)
4242 return error;
4243 }
4244
4245 mutex_lock(&inode->i_mutex);
4246 vfs_dq_init(dir);
4247 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4248 mutex_unlock(&inode->i_mutex);
4249 if (!error)
4250 fsnotify_create(dir, new_dentry);
4251 return error;
4252}
4253/*
4254 * Most of this code is copied from sys_linkat.
4255 */
4256int ocfs2_reflink_ioctl(struct inode *inode,
4257 const char __user *oldname,
4258 const char __user *newname,
4259 bool preserve)
4260{
4261 struct dentry *new_dentry;
4262 struct nameidata nd;
4263 struct path old_path;
4264 int error;
4265 char *to = NULL;
4266
4267 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4268 return -EOPNOTSUPP;
4269
4270 error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4271 if (error) {
4272 mlog_errno(error);
4273 return error;
4274 }
4275
4276 error = ocfs2_user_path_parent(newname, &nd, &to);
4277 if (error) {
4278 mlog_errno(error);
4279 goto out;
4280 }
4281
4282 error = -EXDEV;
4283 if (old_path.mnt != nd.path.mnt)
4284 goto out_release;
4285 new_dentry = lookup_create(&nd, 0);
4286 error = PTR_ERR(new_dentry);
4287 if (IS_ERR(new_dentry)) {
4288 mlog_errno(error);
4289 goto out_unlock;
4290 }
4291
4292 error = mnt_want_write(nd.path.mnt);
4293 if (error) {
4294 mlog_errno(error);
4295 goto out_dput;
4296 }
4297
4298 error = ocfs2_vfs_reflink(old_path.dentry,
4299 nd.path.dentry->d_inode,
4300 new_dentry, preserve);
4301 mnt_drop_write(nd.path.mnt);
4302out_dput:
4303 dput(new_dentry);
4304out_unlock:
4305 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
4306out_release:
4307 path_put(&nd.path);
4308 putname(to);
4309out:
4310 path_put(&old_path);
4311
4312 return error;
4313}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
new file mode 100644
index 000000000000..c1d19b1d3ecc
--- /dev/null
+++ b/fs/ocfs2/refcounttree.h
@@ -0,0 +1,106 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * refcounttree.h
5 *
6 * Copyright (C) 2009 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_REFCOUNTTREE_H
18#define OCFS2_REFCOUNTTREE_H
19
20struct ocfs2_refcount_tree {
21 struct rb_node rf_node;
22 u64 rf_blkno;
23 u32 rf_generation;
24 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed;
28
29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock;
32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb;
34};
35
36void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
37int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
38 struct ocfs2_refcount_tree **tree,
39 struct buffer_head **ref_bh);
40void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
41 struct ocfs2_refcount_tree *tree,
42 int rw);
43
44int ocfs2_decrease_refcount(struct inode *inode,
45 handle_t *handle, u32 cpos, u32 len,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh,
51 u64 phys_blkno,
52 u32 clusters,
53 int *credits,
54 struct ocfs2_alloc_context **meta_ac);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos);
57
58typedef int (ocfs2_post_refcount_func)(struct inode *inode,
59 handle_t *handle,
60 void *para);
61/*
62 * Some refcount callers need to do more work after we modify the data b-tree
63 * during a refcount operation (including CoW and adding the refcount flag),
64 * but before the transaction completes. Such a caller must give us this
65 * structure so that we can do the work within our transaction.
66 *
67 */
68struct ocfs2_post_refcount {
69 int credits; /* credits the callback needs for the journal. */
70 ocfs2_post_refcount_func *func; /* the actual callback. */
71 void *para;
72};
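/*
 * A hedged sketch of how a caller might fill this in; the callback and
 * credit count are illustrative, not taken from a real caller.  The
 * xattr CoW path passes such a structure so its own updates happen
 * inside the same refcount transaction.
 */
static int example_post_func(struct inode *inode, handle_t *handle,
			     void *para)
{
	/* caller-specific bookkeeping, done inside the transaction */
	return 0;
}

static struct ocfs2_post_refcount example_post = {
	.credits = 1,			/* journal credits func will use */
	.func	 = example_post_func,
	.para	 = NULL,
};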
73
74int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
75 struct ocfs2_caching_info *ref_ci,
76 struct buffer_head *ref_root_bh,
77 struct ocfs2_xattr_value_root *xv,
78 int *meta_add, int *credits);
79int ocfs2_refcount_cow_xattr(struct inode *inode,
80 struct ocfs2_dinode *di,
81 struct ocfs2_xattr_value_buf *vb,
82 struct ocfs2_refcount_tree *ref_tree,
83 struct buffer_head *ref_root_bh,
84 u32 cpos, u32 write_len,
85 struct ocfs2_post_refcount *post);
86int ocfs2_add_refcount_flag(struct inode *inode,
87 struct ocfs2_extent_tree *data_et,
88 struct ocfs2_caching_info *ref_ci,
89 struct buffer_head *ref_root_bh,
90 u32 cpos, u32 p_cluster, u32 num_clusters,
91 struct ocfs2_cached_dealloc_ctxt *dealloc,
92 struct ocfs2_post_refcount *post);
93int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
94int ocfs2_try_remove_refcount_tree(struct inode *inode,
95 struct buffer_head *di_bh);
96int ocfs2_increase_refcount(handle_t *handle,
97 struct ocfs2_caching_info *ci,
98 struct buffer_head *ref_root_bh,
99 u64 cpos, u32 len,
100 struct ocfs2_alloc_context *meta_ac,
101 struct ocfs2_cached_dealloc_ctxt *dealloc);
102int ocfs2_reflink_ioctl(struct inode *inode,
103 const char __user *oldname,
104 const char __user *newname,
105 bool preserve);
106#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 424adaa5f900..3c3d673a4d20 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
 		   new_clusters, first_new_cluster);
 
-	ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
+				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -141,7 +141,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	}
 
 	/* update the inode accordingly. */
-	ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -514,7 +514,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_unlock;
 	}
 
-	ocfs2_set_new_buffer_uptodate(inode, group_bh);
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh);
 
 	ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
 	if (ret) {
@@ -536,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	cl = &fe->id2.i_chain;
 	cr = &cl->cl_recs[input->chain];
 
-	ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode),
+				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -552,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
+				      main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 40661e7824e9..bfbd7e9e949f 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
-	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-				OCFS2_BH_IGNORE_CACHE, NULL);
+	ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
+				si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -213,7 +213,7 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
 	ocfs2_update_disk_slot_old(si, slot_num, &bh);
 	spin_unlock(&osb->osb_lock);
 
-	status = ocfs2_write_block(osb, bh, si->si_inode);
+	status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
 	if (status < 0)
 		mlog_errno(status);
 
@@ -404,8 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 			     (unsigned long long)blkno);
 
 		bh = NULL;  /* Acquire a fresh bh */
-		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE, NULL);
+		status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
+					   1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 73a16d4666dc..c30b644d9572 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -310,7 +310,7 @@ int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
310 int rc; 310 int rc;
311 struct buffer_head *tmp = *bh; 311 struct buffer_head *tmp = *bh;
312 312
313 rc = ocfs2_read_block(inode, gd_blkno, &tmp, 313 rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
314 ocfs2_validate_group_descriptor); 314 ocfs2_validate_group_descriptor);
315 if (rc) 315 if (rc)
316 goto out; 316 goto out;
@@ -352,7 +352,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
352 } 352 }
353 353
354 status = ocfs2_journal_access_gd(handle, 354 status = ocfs2_journal_access_gd(handle,
355 alloc_inode, 355 INODE_CACHE(alloc_inode),
356 bg_bh, 356 bg_bh,
357 OCFS2_JOURNAL_ACCESS_CREATE); 357 OCFS2_JOURNAL_ACCESS_CREATE);
358 if (status < 0) { 358 if (status < 0) {
@@ -476,7 +476,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
476 mlog_errno(status); 476 mlog_errno(status);
477 goto bail; 477 goto bail;
478 } 478 }
479 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); 479 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
480 480
481 status = ocfs2_block_group_fill(handle, 481 status = ocfs2_block_group_fill(handle,
482 alloc_inode, 482 alloc_inode,
@@ -491,7 +491,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
491 491
492 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 492 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
493 493
494 status = ocfs2_journal_access_di(handle, alloc_inode, 494 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
495 bh, OCFS2_JOURNAL_ACCESS_WRITE); 495 bh, OCFS2_JOURNAL_ACCESS_WRITE);
496 if (status < 0) { 496 if (status < 0) {
497 mlog_errno(status); 497 mlog_errno(status);
@@ -1033,7 +1033,7 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1033 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1033 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1034 1034
1035 status = ocfs2_journal_access_gd(handle, 1035 status = ocfs2_journal_access_gd(handle,
1036 alloc_inode, 1036 INODE_CACHE(alloc_inode),
1037 group_bh, 1037 group_bh,
1038 journal_type); 1038 journal_type);
1039 if (status < 0) { 1039 if (status < 0) {
@@ -1106,7 +1106,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1106 bg_ptr = le64_to_cpu(bg->bg_next_group); 1106 bg_ptr = le64_to_cpu(bg->bg_next_group);
1107 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1107 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1108 1108
1109 status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh, 1109 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1110 prev_bg_bh,
1110 OCFS2_JOURNAL_ACCESS_WRITE); 1111 OCFS2_JOURNAL_ACCESS_WRITE);
1111 if (status < 0) { 1112 if (status < 0) {
1112 mlog_errno(status); 1113 mlog_errno(status);
@@ -1121,8 +1122,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1121 goto out_rollback; 1122 goto out_rollback;
1122 } 1123 }
1123 1124
1124 status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh, 1125 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1125 OCFS2_JOURNAL_ACCESS_WRITE); 1126 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1126 if (status < 0) { 1127 if (status < 0) {
1127 mlog_errno(status); 1128 mlog_errno(status);
1128 goto out_rollback; 1129 goto out_rollback;
@@ -1136,8 +1137,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1136 goto out_rollback; 1137 goto out_rollback;
1137 } 1138 }
1138 1139
1139 status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh, 1140 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1140 OCFS2_JOURNAL_ACCESS_WRITE); 1141 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1141 if (status < 0) { 1142 if (status < 0) {
1142 mlog_errno(status); 1143 mlog_errno(status);
1143 goto out_rollback; 1144 goto out_rollback;
@@ -1288,7 +1289,7 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1288 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1289 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1289 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; 1290 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1290 1291
1291 ret = ocfs2_journal_access_di(handle, inode, di_bh, 1292 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1292 OCFS2_JOURNAL_ACCESS_WRITE); 1293 OCFS2_JOURNAL_ACCESS_WRITE);
1293 if (ret < 0) { 1294 if (ret < 0) {
1294 mlog_errno(ret); 1295 mlog_errno(ret);
@@ -1461,7 +1462,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1461 /* Ok, claim our bits now: set the info on dinode, chainlist 1462 /* Ok, claim our bits now: set the info on dinode, chainlist
1462 * and then the group */ 1463 * and then the group */
1463 status = ocfs2_journal_access_di(handle, 1464 status = ocfs2_journal_access_di(handle,
1464 alloc_inode, 1465 INODE_CACHE(alloc_inode),
1465 ac->ac_bh, 1466 ac->ac_bh,
1466 OCFS2_JOURNAL_ACCESS_WRITE); 1467 OCFS2_JOURNAL_ACCESS_WRITE);
1467 if (status < 0) { 1468 if (status < 0) {
@@ -1907,8 +1908,8 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1907 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1908 if (ocfs2_is_cluster_bitmap(alloc_inode))
1908 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1909 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1909 1910
1910 status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh, 1911 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1911 journal_type); 1912 group_bh, journal_type);
1912 if (status < 0) { 1913 if (status < 0) {
1913 mlog_errno(status); 1914 mlog_errno(status);
1914 goto bail; 1915 goto bail;
@@ -1993,8 +1994,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1993 goto bail; 1994 goto bail;
1994 } 1995 }
1995 1996
1996 status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh, 1997 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1997 OCFS2_JOURNAL_ACCESS_WRITE); 1998 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1998 if (status < 0) { 1999 if (status < 0) {
1999 mlog_errno(status); 2000 mlog_errno(status);
2000 goto bail; 2001 goto bail;
@@ -2151,7 +2152,7 @@ int ocfs2_lock_allocators(struct inode *inode,
2151 2152
2152 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 2153 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2153 2154
2154 num_free_extents = ocfs2_num_free_extents(osb, inode, et); 2155 num_free_extents = ocfs2_num_free_extents(osb, et);
2155 if (num_free_extents < 0) { 2156 if (num_free_extents < 0) {
2156 ret = num_free_extents; 2157 ret = num_free_extents;
2157 mlog_errno(ret); 2158 mlog_errno(ret);
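
The suballoc.c hunks above are all one mechanical change: the journal-access and buffer-uptodate helpers now key off a struct ocfs2_caching_info * rather than a struct inode *, with INODE_CACHE() bridging the two at each call site. A minimal sketch of such a macro, assuming the caching info is embedded in ocfs2_inode_info as ip_metadata_cache (the field the uptodate.c hunks below stop reaching into directly):

/* Sketch only: map an inode to its embedded metadata cache.
 * Assumes ocfs2_inode_info still carries the ocfs2_caching_info
 * as ip_metadata_cache. */
#define INODE_CACHE(i)	(&(OCFS2_I(i)->ip_metadata_cache))
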
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a3f8871d21fd..c0e48aeebb1c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h> 31#include <linux/init.h>
33#include <linux/random.h> 32#include <linux/random.h>
34#include <linux/statfs.h> 33#include <linux/statfs.h>
@@ -69,6 +68,7 @@
69#include "ver.h" 68#include "ver.h"
70#include "xattr.h" 69#include "xattr.h"
71#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h"
72 72
73#include "buffer_head_io.h" 73#include "buffer_head_io.h"
74 74
@@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
373} 373}
374#endif /* CONFIG_DEBUG_FS */ 374#endif /* CONFIG_DEBUG_FS */
375 375
376static struct file_operations ocfs2_osb_debug_fops = { 376static const struct file_operations ocfs2_osb_debug_fops = {
377 .open = ocfs2_osb_debug_open, 377 .open = ocfs2_osb_debug_open,
378 .release = ocfs2_debug_release, 378 .release = ocfs2_debug_release,
379 .read = ocfs2_debug_read, 379 .read = ocfs2_debug_read,
@@ -965,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
965 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); 965 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
966} 966}
967 967
968static struct quotactl_ops ocfs2_quotactl_ops = { 968static const struct quotactl_ops ocfs2_quotactl_ops = {
969 .quota_on = ocfs2_quota_on, 969 .quota_on = ocfs2_quota_on,
970 .quota_off = ocfs2_quota_off, 970 .quota_off = ocfs2_quota_off,
971 .quota_sync = vfs_quota_sync, 971 .quota_sync = vfs_quota_sync,
@@ -1668,8 +1668,6 @@ static void ocfs2_inode_init_once(void *data)
1668 spin_lock_init(&oi->ip_lock); 1668 spin_lock_init(&oi->ip_lock);
1669 ocfs2_extent_map_init(&oi->vfs_inode); 1669 ocfs2_extent_map_init(&oi->vfs_inode);
1670 INIT_LIST_HEAD(&oi->ip_io_markers); 1670 INIT_LIST_HEAD(&oi->ip_io_markers);
1671 oi->ip_created_trans = 0;
1672 oi->ip_last_trans = 0;
1673 oi->ip_dir_start_lookup = 0; 1671 oi->ip_dir_start_lookup = 0;
1674 1672
1675 init_rwsem(&oi->ip_alloc_sem); 1673 init_rwsem(&oi->ip_alloc_sem);
@@ -1683,7 +1681,8 @@ static void ocfs2_inode_init_once(void *data)
1683 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1681 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1684 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1682 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1685 1683
1686 ocfs2_metadata_cache_init(&oi->vfs_inode); 1684 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1685 &ocfs2_inode_caching_ops);
1687 1686
1688 inode_init_once(&oi->vfs_inode); 1687 inode_init_once(&oi->vfs_inode);
1689} 1688}
@@ -1859,6 +1858,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1859 1858
1860 ocfs2_sync_blockdev(sb); 1859 ocfs2_sync_blockdev(sb);
1861 1860
1861 ocfs2_purge_refcount_trees(osb);
1862
1862 /* No cluster connection means we've failed during mount, so skip 1863 /* No cluster connection means we've failed during mount, so skip
1863 * all the steps which depended on that to complete. */ 1864 * all the steps which depended on that to complete. */
1864 if (osb->cconn) { 1865 if (osb->cconn) {
@@ -2065,6 +2066,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2065 goto bail; 2066 goto bail;
2066 } 2067 }
2067 2068
2069 osb->osb_rf_lock_tree = RB_ROOT;
2070
2068 osb->s_feature_compat = 2071 osb->s_feature_compat =
2069 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); 2072 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
2070 osb->s_feature_ro_compat = 2073 osb->s_feature_ro_compat =
@@ -2490,7 +2493,8 @@ void __ocfs2_abort(struct super_block* sb,
2490 /* Force a panic(). This stinks, but it's better than letting 2493 /* Force a panic(). This stinks, but it's better than letting
2491 * things continue without having a proper hard readonly 2494 * things continue without having a proper hard readonly
2492 * here. */ 2495 * here. */
2493 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 2496 if (!ocfs2_mount_local(OCFS2_SB(sb)))
2497 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
2494 ocfs2_handle_error(sb); 2498 ocfs2_handle_error(sb);
2495} 2499}
2496 2500
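
Note that ocfs2_inode_init_once() above now hands an operations table, &ocfs2_inode_caching_ops, to ocfs2_metadata_cache_init(); the table itself is defined elsewhere in the series. A plausible shape, matching the struct ocfs2_caching_operations declared in the uptodate.h hunk further down (every function name here is illustrative, not taken from this diff):

/* Illustrative vtable: each callback forwards to the inode that
 * embeds the ocfs2_caching_info.  Only the field names come from
 * the uptodate.h hunk below; the callback names are assumed. */
const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
	.co_owner		= ocfs2_inode_cache_owner,
	.co_get_super		= ocfs2_inode_cache_get_super,
	.co_cache_lock		= ocfs2_inode_cache_lock,
	.co_cache_unlock	= ocfs2_inode_cache_unlock,
	.co_io_lock		= ocfs2_inode_cache_io_lock,
	.co_io_unlock		= ocfs2_inode_cache_io_unlock,
};
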
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 579dd1b1110f..e3421030a69f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,7 +38,6 @@
38#include <linux/types.h> 38#include <linux/types.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/utsname.h>
42#include <linux/namei.h> 41#include <linux/namei.h>
43 42
44#define MLOG_MASK_PREFIX ML_NAMEI 43#define MLOG_MASK_PREFIX ML_NAMEI
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 187b99ff0368..b6284f235d2f 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -75,15 +75,77 @@ struct ocfs2_meta_cache_item {
75 75
76static struct kmem_cache *ocfs2_uptodate_cachep = NULL; 76static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
77 77
78void ocfs2_metadata_cache_init(struct inode *inode) 78u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
79{ 79{
80 struct ocfs2_inode_info *oi = OCFS2_I(inode); 80 BUG_ON(!ci || !ci->ci_ops);
81 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
82 81
83 oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; 82 return ci->ci_ops->co_owner(ci);
83}
84
85struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
86{
87 BUG_ON(!ci || !ci->ci_ops);
88
89 return ci->ci_ops->co_get_super(ci);
90}
91
92static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
93{
94 BUG_ON(!ci || !ci->ci_ops);
95
96 ci->ci_ops->co_cache_lock(ci);
97}
98
99static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
100{
101 BUG_ON(!ci || !ci->ci_ops);
102
103 ci->ci_ops->co_cache_unlock(ci);
104}
105
106void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
107{
108 BUG_ON(!ci || !ci->ci_ops);
109
110 ci->ci_ops->co_io_lock(ci);
111}
112
113void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
114{
115 BUG_ON(!ci || !ci->ci_ops);
116
117 ci->ci_ops->co_io_unlock(ci);
118}
119
120
121static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
122 int clear)
123{
124 ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
84 ci->ci_num_cached = 0; 125 ci->ci_num_cached = 0;
126
127 if (clear) {
128 ci->ci_created_trans = 0;
129 ci->ci_last_trans = 0;
130 }
131}
132
133void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
134 const struct ocfs2_caching_operations *ops)
135{
136 BUG_ON(!ops);
137
138 ci->ci_ops = ops;
139 ocfs2_metadata_cache_reset(ci, 1);
85} 140}
86 141
142void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
143{
144 ocfs2_metadata_cache_purge(ci);
145 ocfs2_metadata_cache_reset(ci, 1);
146}
147
148
87/* No lock taken here as 'root' is not expected to be visible to other 149/* No lock taken here as 'root' is not expected to be visible to other
88 * processes. */ 150 * processes. */
89static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) 151static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
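
The wrappers added above only dispatch through ci->ci_ops; the actual locking stays with whoever embeds the cache. For the inode case, co_cache_lock/co_cache_unlock would be expected to wrap the ip_lock spinlock and co_io_lock/co_io_unlock the ip_io_mutex, i.e. exactly the spin_lock() and mutex_lock() calls the later hunks in this file remove. A hedged sketch of two such callbacks (the container_of() recovery and the function names are assumptions, not part of this diff):

/* Sketch: inode-backed callbacks.  Per the comments in the
 * uptodate.h hunk below, the cache-lock pair must not sleep
 * (a spinlock) while the io-lock pair may (a mutex). */
static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_inode_info *oi =
		container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);

	spin_lock(&oi->ip_lock);
}

static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_inode_info *oi =
		container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);

	mutex_lock(&oi->ip_io_mutex);
}
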
@@ -112,19 +174,20 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
112 * This function is a few more lines longer than necessary due to some 174 * This function is a few more lines longer than necessary due to some
113 * accounting done here, but I think it's worth tracking down those 175 * accounting done here, but I think it's worth tracking down those
114 * bugs sooner -- Mark */ 176 * bugs sooner -- Mark */
115void ocfs2_metadata_cache_purge(struct inode *inode) 177void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
116{ 178{
117 struct ocfs2_inode_info *oi = OCFS2_I(inode);
118 unsigned int tree, to_purge, purged; 179 unsigned int tree, to_purge, purged;
119 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
120 struct rb_root root = RB_ROOT; 180 struct rb_root root = RB_ROOT;
121 181
122 spin_lock(&oi->ip_lock); 182 BUG_ON(!ci || !ci->ci_ops);
123 tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); 183
184 ocfs2_metadata_cache_lock(ci);
185 tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
124 to_purge = ci->ci_num_cached; 186 to_purge = ci->ci_num_cached;
125 187
126 mlog(0, "Purge %u %s items from Inode %llu\n", to_purge, 188 mlog(0, "Purge %u %s items from Owner %llu\n", to_purge,
127 tree ? "array" : "tree", (unsigned long long)oi->ip_blkno); 189 tree ? "array" : "tree",
190 (unsigned long long)ocfs2_metadata_cache_owner(ci));
128 191
129 /* If we're a tree, save off the root so that we can safely 192 /* If we're a tree, save off the root so that we can safely
130 * initialize the cache. We do the work to free tree members 193 * initialize the cache. We do the work to free tree members
@@ -132,16 +195,17 @@ void ocfs2_metadata_cache_purge(struct inode *inode)
132 if (tree) 195 if (tree)
133 root = ci->ci_cache.ci_tree; 196 root = ci->ci_cache.ci_tree;
134 197
135 ocfs2_metadata_cache_init(inode); 198 ocfs2_metadata_cache_reset(ci, 0);
136 spin_unlock(&oi->ip_lock); 199 ocfs2_metadata_cache_unlock(ci);
137 200
138 purged = ocfs2_purge_copied_metadata_tree(&root); 201 purged = ocfs2_purge_copied_metadata_tree(&root);
139 /* If possible, track the number wiped so that we can more 202 /* If possible, track the number wiped so that we can more
140 * easily detect counting errors. Unfortunately, this is only 203 * easily detect counting errors. Unfortunately, this is only
141 * meaningful for trees. */ 204 * meaningful for trees. */
142 if (tree && purged != to_purge) 205 if (tree && purged != to_purge)
143 mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n", 206 mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
144 (unsigned long long)oi->ip_blkno, to_purge, purged); 207 (unsigned long long)ocfs2_metadata_cache_owner(ci),
208 to_purge, purged);
145} 209}
146 210
147/* Returns the index in the cache array, -1 if not found. 211/* Returns the index in the cache array, -1 if not found.
@@ -182,27 +246,25 @@ ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
182 return NULL; 246 return NULL;
183} 247}
184 248
185static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, 249static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
186 struct buffer_head *bh) 250 struct buffer_head *bh)
187{ 251{
188 int index = -1; 252 int index = -1;
189 struct ocfs2_meta_cache_item *item = NULL; 253 struct ocfs2_meta_cache_item *item = NULL;
190 254
191 spin_lock(&oi->ip_lock); 255 ocfs2_metadata_cache_lock(ci);
192 256
193 mlog(0, "Inode %llu, query block %llu (inline = %u)\n", 257 mlog(0, "Owner %llu, query block %llu (inline = %u)\n",
194 (unsigned long long)oi->ip_blkno, 258 (unsigned long long)ocfs2_metadata_cache_owner(ci),
195 (unsigned long long) bh->b_blocknr, 259 (unsigned long long) bh->b_blocknr,
196 !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); 260 !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
197 261
198 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) 262 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
199 index = ocfs2_search_cache_array(&oi->ip_metadata_cache, 263 index = ocfs2_search_cache_array(ci, bh->b_blocknr);
200 bh->b_blocknr);
201 else 264 else
202 item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, 265 item = ocfs2_search_cache_tree(ci, bh->b_blocknr);
203 bh->b_blocknr);
204 266
205 spin_unlock(&oi->ip_lock); 267 ocfs2_metadata_cache_unlock(ci);
206 268
207 mlog(0, "index = %d, item = %p\n", index, item); 269 mlog(0, "index = %d, item = %p\n", index, item);
208 270
@@ -214,7 +276,7 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
214 * 276 *
215 * This can be called under lock_buffer() 277 * This can be called under lock_buffer()
216 */ 278 */
217int ocfs2_buffer_uptodate(struct inode *inode, 279int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
218 struct buffer_head *bh) 280 struct buffer_head *bh)
219{ 281{
220 /* Doesn't matter if the bh is in our cache or not -- if it's 282 /* Doesn't matter if the bh is in our cache or not -- if it's
@@ -230,24 +292,24 @@ int ocfs2_buffer_uptodate(struct inode *inode,
230 292
231 /* Ok, locally the buffer is marked as up to date, now search 293 /* Ok, locally the buffer is marked as up to date, now search
232 * our cache to see if we can trust that. */ 294 * our cache to see if we can trust that. */
233 return ocfs2_buffer_cached(OCFS2_I(inode), bh); 295 return ocfs2_buffer_cached(ci, bh);
234} 296}
235 297
236/* 298/*
237 * Determine whether a buffer is currently out on a read-ahead request. 299 * Determine whether a buffer is currently out on a read-ahead request.
238 * ip_io_sem should be held to serialize submitters with the logic here. 300 * ci_io_sem should be held to serialize submitters with the logic here.
239 */ 301 */
240int ocfs2_buffer_read_ahead(struct inode *inode, 302int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
241 struct buffer_head *bh) 303 struct buffer_head *bh)
242{ 304{
243 return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh); 305 return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
244} 306}
245 307
246/* Requires ip_lock */ 308/* Requires ip_lock */
247static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, 309static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
248 sector_t block) 310 sector_t block)
249{ 311{
250 BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); 312 BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
251 313
252 mlog(0, "block %llu takes position %u\n", (unsigned long long) block, 314 mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
253 ci->ci_num_cached); 315 ci->ci_num_cached);
@@ -292,66 +354,64 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
292 ci->ci_num_cached++; 354 ci->ci_num_cached++;
293} 355}
294 356
295static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, 357/* co_cache_lock() must be held */
296 struct ocfs2_caching_info *ci) 358static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
297{ 359{
298 assert_spin_locked(&oi->ip_lock); 360 return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
299 361 (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
300 return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
301 (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
302} 362}
303 363
304/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the 364/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
305 * pointers in tree after we use them - this allows caller to detect 365 * pointers in tree after we use them - this allows caller to detect
306 * when to free in case of error. */ 366 * when to free in case of error.
307static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, 367 *
368 * The co_cache_lock() must be held. */
369static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
308 struct ocfs2_meta_cache_item **tree) 370 struct ocfs2_meta_cache_item **tree)
309{ 371{
310 int i; 372 int i;
311 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
312 373
313 mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, 374 mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
314 "Inode %llu, num cached = %u, should be %u\n", 375 "Owner %llu, num cached = %u, should be %u\n",
315 (unsigned long long)oi->ip_blkno, ci->ci_num_cached, 376 (unsigned long long)ocfs2_metadata_cache_owner(ci),
316 OCFS2_INODE_MAX_CACHE_ARRAY); 377 ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
317 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), 378 mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
318 "Inode %llu not marked as inline anymore!\n", 379 "Owner %llu not marked as inline anymore!\n",
319 (unsigned long long)oi->ip_blkno); 380 (unsigned long long)ocfs2_metadata_cache_owner(ci));
320 assert_spin_locked(&oi->ip_lock);
321 381
322 /* Be careful to initialize the tree members *first* because 382 /* Be careful to initialize the tree members *first* because
323 * once the ci_tree is used, the array is junk... */ 383 * once the ci_tree is used, the array is junk... */
324 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) 384 for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
325 tree[i]->c_block = ci->ci_cache.ci_array[i]; 385 tree[i]->c_block = ci->ci_cache.ci_array[i];
326 386
327 oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; 387 ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
328 ci->ci_cache.ci_tree = RB_ROOT; 388 ci->ci_cache.ci_tree = RB_ROOT;
329 /* this will be set again by __ocfs2_insert_cache_tree */ 389 /* this will be set again by __ocfs2_insert_cache_tree */
330 ci->ci_num_cached = 0; 390 ci->ci_num_cached = 0;
331 391
332 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { 392 for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
333 __ocfs2_insert_cache_tree(ci, tree[i]); 393 __ocfs2_insert_cache_tree(ci, tree[i]);
334 tree[i] = NULL; 394 tree[i] = NULL;
335 } 395 }
336 396
337 mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n", 397 mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
338 (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); 398 (unsigned long long)ocfs2_metadata_cache_owner(ci),
399 ci->ci_flags, ci->ci_num_cached);
339} 400}
340 401
341/* Slow path function - memory allocation is necessary. See the 402/* Slow path function - memory allocation is necessary. See the
342 * comment above ocfs2_set_buffer_uptodate for more information. */ 403 * comment above ocfs2_set_buffer_uptodate for more information. */
343static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, 404static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
344 sector_t block, 405 sector_t block,
345 int expand_tree) 406 int expand_tree)
346{ 407{
347 int i; 408 int i;
348 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
349 struct ocfs2_meta_cache_item *new = NULL; 409 struct ocfs2_meta_cache_item *new = NULL;
350 struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = 410 struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
351 { NULL, }; 411 { NULL, };
352 412
353 mlog(0, "Inode %llu, block %llu, expand = %d\n", 413 mlog(0, "Owner %llu, block %llu, expand = %d\n",
354 (unsigned long long)oi->ip_blkno, 414 (unsigned long long)ocfs2_metadata_cache_owner(ci),
355 (unsigned long long)block, expand_tree); 415 (unsigned long long)block, expand_tree);
356 416
357 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); 417 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
@@ -364,7 +424,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
364 if (expand_tree) { 424 if (expand_tree) {
365 /* Do *not* allocate an array here - the removal code 425 /* Do *not* allocate an array here - the removal code
366 * has no way of tracking that. */ 426 * has no way of tracking that. */
367 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { 427 for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
368 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, 428 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
369 GFP_NOFS); 429 GFP_NOFS);
370 if (!tree[i]) { 430 if (!tree[i]) {
@@ -376,21 +436,21 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
376 } 436 }
377 } 437 }
378 438
379 spin_lock(&oi->ip_lock); 439 ocfs2_metadata_cache_lock(ci);
380 if (ocfs2_insert_can_use_array(oi, ci)) { 440 if (ocfs2_insert_can_use_array(ci)) {
381 mlog(0, "Someone cleared the tree underneath us\n"); 441 mlog(0, "Someone cleared the tree underneath us\n");
382 /* Ok, items were removed from the cache in between 442 /* Ok, items were removed from the cache in between
383 * locks. Detect this and revert back to the fast path */ 443 * locks. Detect this and revert back to the fast path */
384 ocfs2_append_cache_array(ci, block); 444 ocfs2_append_cache_array(ci, block);
385 spin_unlock(&oi->ip_lock); 445 ocfs2_metadata_cache_unlock(ci);
386 goto out_free; 446 goto out_free;
387 } 447 }
388 448
389 if (expand_tree) 449 if (expand_tree)
390 ocfs2_expand_cache(oi, tree); 450 ocfs2_expand_cache(ci, tree);
391 451
392 __ocfs2_insert_cache_tree(ci, new); 452 __ocfs2_insert_cache_tree(ci, new);
393 spin_unlock(&oi->ip_lock); 453 ocfs2_metadata_cache_unlock(ci);
394 454
395 new = NULL; 455 new = NULL;
396out_free: 456out_free:
@@ -400,14 +460,14 @@ out_free:
400 /* If these were used, then ocfs2_expand_cache re-set them to 460 /* If these were used, then ocfs2_expand_cache re-set them to
401 * NULL for us. */ 461 * NULL for us. */
402 if (tree[0]) { 462 if (tree[0]) {
403 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) 463 for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
404 if (tree[i]) 464 if (tree[i])
405 kmem_cache_free(ocfs2_uptodate_cachep, 465 kmem_cache_free(ocfs2_uptodate_cachep,
406 tree[i]); 466 tree[i]);
407 } 467 }
408} 468}
409 469
410/* Item insertion is guarded by ip_io_mutex, so the insertion path takes 470/* Item insertion is guarded by co_io_lock(), so the insertion path takes
411 * advantage of this by not rechecking for a duplicate insert during 471 * advantage of this by not rechecking for a duplicate insert during
412 * the slow case. Additionally, if the cache needs to be bumped up to 472 * the slow case. Additionally, if the cache needs to be bumped up to
413 * a tree, the code will not recheck after acquiring the lock -- 473 * a tree, the code will not recheck after acquiring the lock --
@@ -425,59 +485,55 @@ out_free:
425 * Readahead buffers can be passed in here before the I/O request is 485 * Readahead buffers can be passed in here before the I/O request is
426 * completed. 486 * completed.
427 */ 487 */
428void ocfs2_set_buffer_uptodate(struct inode *inode, 488void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
429 struct buffer_head *bh) 489 struct buffer_head *bh)
430{ 490{
431 int expand; 491 int expand;
432 struct ocfs2_inode_info *oi = OCFS2_I(inode);
433 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
434 492
435 /* The block may very well exist in our cache already, so avoid 493 /* The block may very well exist in our cache already, so avoid
436 * doing any more work in that case. */ 494 * doing any more work in that case. */
437 if (ocfs2_buffer_cached(oi, bh)) 495 if (ocfs2_buffer_cached(ci, bh))
438 return; 496 return;
439 497
440 mlog(0, "Inode %llu, inserting block %llu\n", 498 mlog(0, "Owner %llu, inserting block %llu\n",
441 (unsigned long long)oi->ip_blkno, 499 (unsigned long long)ocfs2_metadata_cache_owner(ci),
442 (unsigned long long)bh->b_blocknr); 500 (unsigned long long)bh->b_blocknr);
443 501
444 /* No need to recheck under spinlock - insertion is guarded by 502 /* No need to recheck under spinlock - insertion is guarded by
445 * ip_io_mutex */ 503 * co_io_lock() */
446 spin_lock(&oi->ip_lock); 504 ocfs2_metadata_cache_lock(ci);
447 if (ocfs2_insert_can_use_array(oi, ci)) { 505 if (ocfs2_insert_can_use_array(ci)) {
448 /* Fast case - it's an array and there's a free 506 /* Fast case - it's an array and there's a free
449 * spot. */ 507 * spot. */
450 ocfs2_append_cache_array(ci, bh->b_blocknr); 508 ocfs2_append_cache_array(ci, bh->b_blocknr);
451 spin_unlock(&oi->ip_lock); 509 ocfs2_metadata_cache_unlock(ci);
452 return; 510 return;
453 } 511 }
454 512
455 expand = 0; 513 expand = 0;
456 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { 514 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
457 /* We need to bump things up to a tree. */ 515 /* We need to bump things up to a tree. */
458 expand = 1; 516 expand = 1;
459 } 517 }
460 spin_unlock(&oi->ip_lock); 518 ocfs2_metadata_cache_unlock(ci);
461 519
462 __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); 520 __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
463} 521}
464 522
465/* Called against a newly allocated buffer. Most likely nobody should 523/* Called against a newly allocated buffer. Most likely nobody should
466 * be able to read this sort of metadata while it's still being 524 * be able to read this sort of metadata while it's still being
467 * allocated, but this is careful to take ip_io_mutex anyway. */ 525 * allocated, but this is careful to take co_io_lock() anyway. */
468void ocfs2_set_new_buffer_uptodate(struct inode *inode, 526void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
469 struct buffer_head *bh) 527 struct buffer_head *bh)
470{ 528{
471 struct ocfs2_inode_info *oi = OCFS2_I(inode);
472
473 /* This should definitely *not* exist in our cache */ 529 /* This should definitely *not* exist in our cache */
474 BUG_ON(ocfs2_buffer_cached(oi, bh)); 530 BUG_ON(ocfs2_buffer_cached(ci, bh));
475 531
476 set_buffer_uptodate(bh); 532 set_buffer_uptodate(bh);
477 533
478 mutex_lock(&oi->ip_io_mutex); 534 ocfs2_metadata_cache_io_lock(ci);
479 ocfs2_set_buffer_uptodate(inode, bh); 535 ocfs2_set_buffer_uptodate(ci, bh);
480 mutex_unlock(&oi->ip_io_mutex); 536 ocfs2_metadata_cache_io_unlock(ci);
481} 537}
482 538
483/* Requires ip_lock. */ 539/* Requires ip_lock. */
@@ -487,7 +543,7 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
487 sector_t *array = ci->ci_cache.ci_array; 543 sector_t *array = ci->ci_cache.ci_array;
488 int bytes; 544 int bytes;
489 545
490 BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); 546 BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
491 BUG_ON(index >= ci->ci_num_cached); 547 BUG_ON(index >= ci->ci_num_cached);
492 BUG_ON(!ci->ci_num_cached); 548 BUG_ON(!ci->ci_num_cached);
493 549
@@ -515,21 +571,19 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
515 ci->ci_num_cached--; 571 ci->ci_num_cached--;
516} 572}
517 573
518static void ocfs2_remove_block_from_cache(struct inode *inode, 574static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
519 sector_t block) 575 sector_t block)
520{ 576{
521 int index; 577 int index;
522 struct ocfs2_meta_cache_item *item = NULL; 578 struct ocfs2_meta_cache_item *item = NULL;
523 struct ocfs2_inode_info *oi = OCFS2_I(inode);
524 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
525 579
526 spin_lock(&oi->ip_lock); 580 ocfs2_metadata_cache_lock(ci);
527 mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n", 581 mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n",
528 (unsigned long long)oi->ip_blkno, 582 (unsigned long long)ocfs2_metadata_cache_owner(ci),
529 (unsigned long long) block, ci->ci_num_cached, 583 (unsigned long long) block, ci->ci_num_cached,
530 oi->ip_flags & OCFS2_INODE_CACHE_INLINE); 584 ci->ci_flags & OCFS2_CACHE_FL_INLINE);
531 585
532 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { 586 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
533 index = ocfs2_search_cache_array(ci, block); 587 index = ocfs2_search_cache_array(ci, block);
534 if (index != -1) 588 if (index != -1)
535 ocfs2_remove_metadata_array(ci, index); 589 ocfs2_remove_metadata_array(ci, index);
@@ -538,7 +592,7 @@ static void ocfs2_remove_block_from_cache(struct inode *inode,
538 if (item) 592 if (item)
539 ocfs2_remove_metadata_tree(ci, item); 593 ocfs2_remove_metadata_tree(ci, item);
540 } 594 }
541 spin_unlock(&oi->ip_lock); 595 ocfs2_metadata_cache_unlock(ci);
542 596
543 if (item) 597 if (item)
544 kmem_cache_free(ocfs2_uptodate_cachep, item); 598 kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -549,23 +603,24 @@ static void ocfs2_remove_block_from_cache(struct inode *inode,
549 * bother reverting things to an inlined array in the case of a remove 603 * bother reverting things to an inlined array in the case of a remove
550 * which moves us back under the limit. 604 * which moves us back under the limit.
551 */ 605 */
552void ocfs2_remove_from_cache(struct inode *inode, 606void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
553 struct buffer_head *bh) 607 struct buffer_head *bh)
554{ 608{
555 sector_t block = bh->b_blocknr; 609 sector_t block = bh->b_blocknr;
556 610
557 ocfs2_remove_block_from_cache(inode, block); 611 ocfs2_remove_block_from_cache(ci, block);
558} 612}
559 613
560/* Called when we remove xattr clusters from an inode. */ 614/* Called when we remove xattr clusters from an inode. */
561void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, 615void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
562 sector_t block, 616 sector_t block,
563 u32 c_len) 617 u32 c_len)
564{ 618{
565 unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len; 619 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
620 unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;
566 621
567 for (i = 0; i < b_len; i++, block++) 622 for (i = 0; i < b_len; i++, block++)
568 ocfs2_remove_block_from_cache(inode, block); 623 ocfs2_remove_block_from_cache(ci, block);
569} 624}
570 625
571int __init init_ocfs2_uptodate_cache(void) 626int __init init_ocfs2_uptodate_cache(void)
@@ -577,7 +632,7 @@ int __init init_ocfs2_uptodate_cache(void)
577 return -ENOMEM; 632 return -ENOMEM;
578 633
579 mlog(0, "%u inlined cache items per inode.\n", 634 mlog(0, "%u inlined cache items per inode.\n",
580 OCFS2_INODE_MAX_CACHE_ARRAY); 635 OCFS2_CACHE_INFO_MAX_ARRAY);
581 636
582 return 0; 637 return 0;
583} 638}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 531b4b3a0c47..0d826fe2da0d 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -26,24 +26,59 @@
26#ifndef OCFS2_UPTODATE_H 26#ifndef OCFS2_UPTODATE_H
27#define OCFS2_UPTODATE_H 27#define OCFS2_UPTODATE_H
28 28
29/*
30 * The caching code relies on locking provided by the user of
31 * struct ocfs2_caching_info. These operations connect that up.
32 */
33struct ocfs2_caching_operations {
34 /*
35 * A u64 representing the owning structure. Usually this
36 * is the block number (i_blkno or whatnot). This is used so
37 * that caching log messages can identify the owning structure.
38 */
39 u64 (*co_owner)(struct ocfs2_caching_info *ci);
40
41 /* The superblock is needed during I/O. */
42 struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci);
43 /*
44 * Lock and unlock the caching data. These will not sleep, and
45 * should probably be spinlocks.
46 */
47 void (*co_cache_lock)(struct ocfs2_caching_info *ci);
48 void (*co_cache_unlock)(struct ocfs2_caching_info *ci);
49
50 /*
51 * Lock and unlock for disk I/O. These will sleep, and should
52 * be mutexes.
53 */
54 void (*co_io_lock)(struct ocfs2_caching_info *ci);
55 void (*co_io_unlock)(struct ocfs2_caching_info *ci);
56};
57
29int __init init_ocfs2_uptodate_cache(void); 58int __init init_ocfs2_uptodate_cache(void);
30void exit_ocfs2_uptodate_cache(void); 59void exit_ocfs2_uptodate_cache(void);
31 60
32void ocfs2_metadata_cache_init(struct inode *inode); 61void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
33void ocfs2_metadata_cache_purge(struct inode *inode); 62 const struct ocfs2_caching_operations *ops);
63void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci);
64void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci);
65
66u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci);
67void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci);
68void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci);
34 69
35int ocfs2_buffer_uptodate(struct inode *inode, 70int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
36 struct buffer_head *bh); 71 struct buffer_head *bh);
37void ocfs2_set_buffer_uptodate(struct inode *inode, 72void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
38 struct buffer_head *bh); 73 struct buffer_head *bh);
39void ocfs2_set_new_buffer_uptodate(struct inode *inode, 74void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
40 struct buffer_head *bh); 75 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode, 76void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
42 struct buffer_head *bh); 77 struct buffer_head *bh);
43void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, 78void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
44 sector_t block, 79 sector_t block,
45 u32 c_len); 80 u32 c_len);
46int ocfs2_buffer_read_ahead(struct inode *inode, 81int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
47 struct buffer_head *bh); 82 struct buffer_head *bh);
48 83
49#endif /* OCFS2_UPTODATE_H */ 84#endif /* OCFS2_UPTODATE_H */
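
With these prototypes the caller-side change is uniform across the diff: every helper that used to take the inode now takes the caching info the caller already holds. The before/after call shape, using INODE_CACHE() as sketched earlier:

/* Before this patch: helpers keyed off the inode. */
status = ocfs2_read_block(inode, blkno, &bh, validate);
ocfs2_set_buffer_uptodate(inode, bh);

/* After: the same helpers, keyed off the caching info. */
status = ocfs2_read_block(INODE_CACHE(inode), blkno, &bh, validate);
ocfs2_set_buffer_uptodate(INODE_CACHE(inode), bh);
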
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1a27cda984f..fe3419068df2 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -55,7 +55,8 @@
55#include "buffer_head_io.h" 55#include "buffer_head_io.h"
56#include "super.h" 56#include "super.h"
57#include "xattr.h" 57#include "xattr.h"
58 58#include "refcounttree.h"
59#include "acl.h"
59 60
60struct ocfs2_xattr_def_value_root { 61struct ocfs2_xattr_def_value_root {
61 struct ocfs2_xattr_value_root xv; 62 struct ocfs2_xattr_value_root xv;
@@ -140,7 +141,7 @@ struct ocfs2_xattr_search {
140 int not_found; 141 int not_found;
141}; 142};
142 143
143static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, 144static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
144 struct ocfs2_xattr_header *xh, 145 struct ocfs2_xattr_header *xh,
145 int index, 146 int index,
146 int *block_off, 147 int *block_off,
@@ -157,7 +158,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
157 struct ocfs2_xattr_search *xs); 158 struct ocfs2_xattr_search *xs);
158 159
159static int ocfs2_xattr_tree_list_index_block(struct inode *inode, 160static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
160 struct ocfs2_xattr_tree_root *xt, 161 struct buffer_head *blk_bh,
161 char *buffer, 162 char *buffer,
162 size_t buffer_size); 163 size_t buffer_size);
163 164
@@ -170,12 +171,42 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
170 struct ocfs2_xattr_search *xs, 171 struct ocfs2_xattr_search *xs,
171 struct ocfs2_xattr_set_ctxt *ctxt); 172 struct ocfs2_xattr_set_ctxt *ctxt);
172 173
173static int ocfs2_delete_xattr_index_block(struct inode *inode, 174typedef int (xattr_tree_rec_func)(struct inode *inode,
174 struct buffer_head *xb_bh); 175 struct buffer_head *root_bh,
176 u64 blkno, u32 cpos, u32 len, void *para);
177static int ocfs2_iterate_xattr_index_block(struct inode *inode,
178 struct buffer_head *root_bh,
179 xattr_tree_rec_func *rec_func,
180 void *para);
181static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
182 struct ocfs2_xattr_bucket *bucket,
183 void *para);
184static int ocfs2_rm_xattr_cluster(struct inode *inode,
185 struct buffer_head *root_bh,
186 u64 blkno,
187 u32 cpos,
188 u32 len,
189 void *para);
190
175static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, 191static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
176 u64 src_blk, u64 last_blk, u64 to_blk, 192 u64 src_blk, u64 last_blk, u64 to_blk,
177 unsigned int start_bucket, 193 unsigned int start_bucket,
178 u32 *first_hash); 194 u32 *first_hash);
195static int ocfs2_prepare_refcount_xattr(struct inode *inode,
196 struct ocfs2_dinode *di,
197 struct ocfs2_xattr_info *xi,
198 struct ocfs2_xattr_search *xis,
199 struct ocfs2_xattr_search *xbs,
200 struct ocfs2_refcount_tree **ref_tree,
201 int *meta_need,
202 int *credits);
203static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
204 struct ocfs2_xattr_bucket *bucket,
205 int offset,
206 struct ocfs2_xattr_value_root **xv,
207 struct buffer_head **bh);
208static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
209 const void *value, size_t size, int flags);
179 210
180static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 211static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
181{ 212{
@@ -254,9 +285,9 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
254 break; 285 break;
255 } 286 }
256 287
257 if (!ocfs2_buffer_uptodate(bucket->bu_inode, 288 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
258 bucket->bu_bhs[i])) 289 bucket->bu_bhs[i]))
259 ocfs2_set_new_buffer_uptodate(bucket->bu_inode, 290 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
260 bucket->bu_bhs[i]); 291 bucket->bu_bhs[i]);
261 } 292 }
262 293
@@ -271,7 +302,7 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
271{ 302{
272 int rc; 303 int rc;
273 304
274 rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno, 305 rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno,
275 bucket->bu_blocks, bucket->bu_bhs, 0, 306 bucket->bu_blocks, bucket->bu_bhs, 0,
276 NULL); 307 NULL);
277 if (!rc) { 308 if (!rc) {
@@ -297,7 +328,8 @@ static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
297 int i, rc = 0; 328 int i, rc = 0;
298 329
299 for (i = 0; i < bucket->bu_blocks; i++) { 330 for (i = 0; i < bucket->bu_blocks; i++) {
300 rc = ocfs2_journal_access(handle, bucket->bu_inode, 331 rc = ocfs2_journal_access(handle,
332 INODE_CACHE(bucket->bu_inode),
301 bucket->bu_bhs[i], type); 333 bucket->bu_bhs[i], type);
302 if (rc) { 334 if (rc) {
303 mlog_errno(rc); 335 mlog_errno(rc);
@@ -399,7 +431,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
399 int rc; 431 int rc;
400 struct buffer_head *tmp = *bh; 432 struct buffer_head *tmp = *bh;
401 433
402 rc = ocfs2_read_block(inode, xb_blkno, &tmp, 434 rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp,
403 ocfs2_validate_xattr_block); 435 ocfs2_validate_xattr_block);
404 436
405 /* If ocfs2_read_block() got us a new bh, pass it up. */ 437 /* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -596,15 +628,14 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
596 int status = 0; 628 int status = 0;
597 handle_t *handle = ctxt->handle; 629 handle_t *handle = ctxt->handle;
598 enum ocfs2_alloc_restarted why; 630 enum ocfs2_alloc_restarted why;
599 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
600 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); 631 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
601 struct ocfs2_extent_tree et; 632 struct ocfs2_extent_tree et;
602 633
603 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); 634 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
604 635
605 ocfs2_init_xattr_value_extent_tree(&et, inode, vb); 636 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
606 637
607 status = vb->vb_access(handle, inode, vb->vb_bh, 638 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
608 OCFS2_JOURNAL_ACCESS_WRITE); 639 OCFS2_JOURNAL_ACCESS_WRITE);
609 if (status < 0) { 640 if (status < 0) {
610 mlog_errno(status); 641 mlog_errno(status);
@@ -612,13 +643,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
612 } 643 }
613 644
614 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); 645 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
615 status = ocfs2_add_clusters_in_btree(osb, 646 status = ocfs2_add_clusters_in_btree(handle,
616 inode, 647 &et,
617 &logical_start, 648 &logical_start,
618 clusters_to_add, 649 clusters_to_add,
619 0, 650 0,
620 &et,
621 handle,
622 ctxt->data_ac, 651 ctxt->data_ac,
623 ctxt->meta_ac, 652 ctxt->meta_ac,
624 &why); 653 &why);
@@ -649,6 +678,7 @@ leave:
649static int __ocfs2_remove_xattr_range(struct inode *inode, 678static int __ocfs2_remove_xattr_range(struct inode *inode,
650 struct ocfs2_xattr_value_buf *vb, 679 struct ocfs2_xattr_value_buf *vb,
651 u32 cpos, u32 phys_cpos, u32 len, 680 u32 cpos, u32 phys_cpos, u32 len,
681 unsigned int ext_flags,
652 struct ocfs2_xattr_set_ctxt *ctxt) 682 struct ocfs2_xattr_set_ctxt *ctxt)
653{ 683{
654 int ret; 684 int ret;
@@ -656,16 +686,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
656 handle_t *handle = ctxt->handle; 686 handle_t *handle = ctxt->handle;
657 struct ocfs2_extent_tree et; 687 struct ocfs2_extent_tree et;
658 688
659 ocfs2_init_xattr_value_extent_tree(&et, inode, vb); 689 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
660 690
661 ret = vb->vb_access(handle, inode, vb->vb_bh, 691 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
662 OCFS2_JOURNAL_ACCESS_WRITE); 692 OCFS2_JOURNAL_ACCESS_WRITE);
663 if (ret) { 693 if (ret) {
664 mlog_errno(ret); 694 mlog_errno(ret);
665 goto out; 695 goto out;
666 } 696 }
667 697
668 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac, 698 ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac,
669 &ctxt->dealloc); 699 &ctxt->dealloc);
670 if (ret) { 700 if (ret) {
671 mlog_errno(ret); 701 mlog_errno(ret);
@@ -680,7 +710,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
680 goto out; 710 goto out;
681 } 711 }
682 712
683 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); 713 if (ext_flags & OCFS2_EXT_REFCOUNTED)
714 ret = ocfs2_decrease_refcount(inode, handle,
715 ocfs2_blocks_to_clusters(inode->i_sb,
716 phys_blkno),
717 len, ctxt->meta_ac, &ctxt->dealloc, 1);
718 else
719 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
720 phys_blkno, len);
684 if (ret) 721 if (ret)
685 mlog_errno(ret); 722 mlog_errno(ret);
686 723
@@ -695,6 +732,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
695 struct ocfs2_xattr_set_ctxt *ctxt) 732 struct ocfs2_xattr_set_ctxt *ctxt)
696{ 733{
697 int ret = 0; 734 int ret = 0;
735 unsigned int ext_flags;
698 u32 trunc_len, cpos, phys_cpos, alloc_size; 736 u32 trunc_len, cpos, phys_cpos, alloc_size;
699 u64 block; 737 u64 block;
700 738
@@ -706,7 +744,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
706 while (trunc_len) { 744 while (trunc_len) {
707 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, 745 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
708 &alloc_size, 746 &alloc_size,
709 &vb->vb_xv->xr_list); 747 &vb->vb_xv->xr_list, &ext_flags);
710 if (ret) { 748 if (ret) {
711 mlog_errno(ret); 749 mlog_errno(ret);
712 goto out; 750 goto out;
@@ -717,15 +755,15 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
717 755
718 ret = __ocfs2_remove_xattr_range(inode, vb, cpos, 756 ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
719 phys_cpos, alloc_size, 757 phys_cpos, alloc_size,
720 ctxt); 758 ext_flags, ctxt);
721 if (ret) { 759 if (ret) {
722 mlog_errno(ret); 760 mlog_errno(ret);
723 goto out; 761 goto out;
724 } 762 }
725 763
726 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 764 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
727 ocfs2_remove_xattr_clusters_from_cache(inode, block, 765 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode),
728 alloc_size); 766 block, alloc_size);
729 cpos += alloc_size; 767 cpos += alloc_size;
730 trunc_len -= alloc_size; 768 trunc_len -= alloc_size;
731 } 769 }
@@ -810,6 +848,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
810 return result; 848 return result;
811} 849}
812 850
851int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
852 struct ocfs2_dinode *di)
853{
854 struct ocfs2_xattr_header *xh;
855 int i;
856
857 xh = (struct ocfs2_xattr_header *)
858 ((void *)di + inode->i_sb->s_blocksize -
859 le16_to_cpu(di->i_xattr_inline_size));
860
861 for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
862 if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
863 return 1;
864
865 return 0;
866}
867
813static int ocfs2_xattr_ibody_list(struct inode *inode, 868static int ocfs2_xattr_ibody_list(struct inode *inode,
814 struct ocfs2_dinode *di, 869 struct ocfs2_dinode *di,
815 char *buffer, 870 char *buffer,
@@ -855,11 +910,9 @@ static int ocfs2_xattr_block_list(struct inode *inode,
855 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; 910 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
856 ret = ocfs2_xattr_list_entries(inode, header, 911 ret = ocfs2_xattr_list_entries(inode, header,
857 buffer, buffer_size); 912 buffer, buffer_size);
858 } else { 913 } else
859 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; 914 ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
860 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
861 buffer, buffer_size); 915 buffer, buffer_size);
862 }
863 916
864 brelse(blk_bh); 917 brelse(blk_bh);
865 918
@@ -961,7 +1014,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
961 cpos = 0; 1014 cpos = 0;
962 while (cpos < clusters) { 1015 while (cpos < clusters) {
963 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1016 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
964 &num_clusters, el); 1017 &num_clusters, el, NULL);
965 if (ret) { 1018 if (ret) {
966 mlog_errno(ret); 1019 mlog_errno(ret);
967 goto out; 1020 goto out;
@@ -970,7 +1023,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
970 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1023 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
971 /* Copy ocfs2_xattr_value */ 1024 /* Copy ocfs2_xattr_value */
972 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1025 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
973 ret = ocfs2_read_block(inode, blkno, &bh, NULL); 1026 ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
1027 &bh, NULL);
974 if (ret) { 1028 if (ret) {
975 mlog_errno(ret); 1029 mlog_errno(ret);
976 goto out; 1030 goto out;
@@ -1085,7 +1139,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
1085 i = xs->here - xs->header->xh_entries; 1139 i = xs->here - xs->header->xh_entries;
1086 1140
1087 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 1141 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
1088 ret = ocfs2_xattr_bucket_get_name_value(inode, 1142 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
1089 bucket_xh(xs->bucket), 1143 bucket_xh(xs->bucket),
1090 i, 1144 i,
1091 &block_off, 1145 &block_off,
@@ -1183,7 +1237,7 @@ static int ocfs2_xattr_get(struct inode *inode,
1183 1237
1184static int __ocfs2_xattr_set_value_outside(struct inode *inode, 1238static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1185 handle_t *handle, 1239 handle_t *handle,
1186 struct ocfs2_xattr_value_root *xv, 1240 struct ocfs2_xattr_value_buf *vb,
1187 const void *value, 1241 const void *value,
1188 int value_len) 1242 int value_len)
1189{ 1243{
@@ -1194,28 +1248,34 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1194 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); 1248 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
1195 u64 blkno; 1249 u64 blkno;
1196 struct buffer_head *bh = NULL; 1250 struct buffer_head *bh = NULL;
1251 unsigned int ext_flags;
1252 struct ocfs2_xattr_value_root *xv = vb->vb_xv;
1197 1253
1198 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); 1254 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
1199 1255
1200 while (cpos < clusters) { 1256 while (cpos < clusters) {
1201 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1257 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
1202 &num_clusters, &xv->xr_list); 1258 &num_clusters, &xv->xr_list,
1259 &ext_flags);
1203 if (ret) { 1260 if (ret) {
1204 mlog_errno(ret); 1261 mlog_errno(ret);
1205 goto out; 1262 goto out;
1206 } 1263 }
1207 1264
1265 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
1266
1208 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1267 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
1209 1268
1210 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1269 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
1211 ret = ocfs2_read_block(inode, blkno, &bh, NULL); 1270 ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
1271 &bh, NULL);
1212 if (ret) { 1272 if (ret) {
1213 mlog_errno(ret); 1273 mlog_errno(ret);
1214 goto out; 1274 goto out;
1215 } 1275 }
1216 1276
1217 ret = ocfs2_journal_access(handle, 1277 ret = ocfs2_journal_access(handle,
1218 inode, 1278 INODE_CACHE(inode),
1219 bh, 1279 bh,
1220 OCFS2_JOURNAL_ACCESS_WRITE); 1280 OCFS2_JOURNAL_ACCESS_WRITE);
1221 if (ret < 0) { 1281 if (ret < 0) {
@@ -1266,7 +1326,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
1266 void *val = xs->base + offs; 1326 void *val = xs->base + offs;
1267 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 1327 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1268 1328
1269 ret = vb->vb_access(handle, inode, vb->vb_bh, 1329 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1330 OCFS2_JOURNAL_ACCESS_WRITE);
1271 if (ret) { 1331 if (ret) {
1272 mlog_errno(ret); 1332 mlog_errno(ret);
@@ -1294,7 +1354,7 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
1294{ 1354{
1295 int ret; 1355 int ret;
1296 1356
1297 ret = vb->vb_access(handle, inode, vb->vb_bh, 1357 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
1298 OCFS2_JOURNAL_ACCESS_WRITE); 1358 OCFS2_JOURNAL_ACCESS_WRITE);
1299 if (ret) { 1359 if (ret) {
1300 mlog_errno(ret); 1360 mlog_errno(ret);
@@ -1355,7 +1415,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
1355 mlog_errno(ret); 1415 mlog_errno(ret);
1356 return ret; 1416 return ret;
1357 } 1417 }
1358 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, 1418 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
1359 xi->value, xi->value_len); 1419 xi->value, xi->value_len);
1360 if (ret < 0) 1420 if (ret < 0)
1361 mlog_errno(ret); 1421 mlog_errno(ret);
@@ -1594,7 +1654,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1594 1654
1595 ret = __ocfs2_xattr_set_value_outside(inode, 1655 ret = __ocfs2_xattr_set_value_outside(inode,
1596 handle, 1656 handle,
1597 vb.vb_xv, 1657 &vb,
1598 xi->value, 1658 xi->value,
1599 xi->value_len); 1659 xi->value_len);
1600 if (ret < 0) 1660 if (ret < 0)
@@ -1615,7 +1675,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1615 } 1675 }
1616 } 1676 }
1617 1677
1618 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, 1678 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh,
1619 OCFS2_JOURNAL_ACCESS_WRITE); 1679 OCFS2_JOURNAL_ACCESS_WRITE);
1620 if (ret) { 1680 if (ret) {
1621 mlog_errno(ret); 1681 mlog_errno(ret);
@@ -1623,7 +1683,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1623 } 1683 }
1624 1684
1625 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 1685 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1626 ret = vb.vb_access(handle, inode, vb.vb_bh, 1686 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1627 OCFS2_JOURNAL_ACCESS_WRITE); 1687 OCFS2_JOURNAL_ACCESS_WRITE);
1628 if (ret) { 1688 if (ret) {
1629 mlog_errno(ret); 1689 mlog_errno(ret);
@@ -1700,51 +1760,112 @@ out:
1700 return ret; 1760 return ret;
1701} 1761}
1702 1762
1763/*
1764 * In xattr remove, if it is stored outside and refcounted, we may have
1765 * the chance to split the refcount tree. So need the allocators.
1766 */
1767static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
1768 struct ocfs2_xattr_value_root *xv,
1769 struct ocfs2_caching_info *ref_ci,
1770 struct buffer_head *ref_root_bh,
1771 struct ocfs2_alloc_context **meta_ac,
1772 int *ref_credits)
1773{
1774 int ret, meta_add = 0;
1775 u32 p_cluster, num_clusters;
1776 unsigned int ext_flags;
1777
1778 *ref_credits = 0;
1779 ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
1780 &num_clusters,
1781 &xv->xr_list,
1782 &ext_flags);
1783 if (ret) {
1784 mlog_errno(ret);
1785 goto out;
1786 }
1787
1788 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
1789 goto out;
1790
1791 ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
1792 ref_root_bh, xv,
1793 &meta_add, ref_credits);
1794 if (ret) {
1795 mlog_errno(ret);
1796 goto out;
1797 }
1798
1799 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
1800 meta_add, meta_ac);
1801 if (ret)
1802 mlog_errno(ret);
1803
1804out:
1805 return ret;
1806}
1807
1703static int ocfs2_remove_value_outside(struct inode*inode, 1808static int ocfs2_remove_value_outside(struct inode*inode,
1704 struct ocfs2_xattr_value_buf *vb, 1809 struct ocfs2_xattr_value_buf *vb,
1705 struct ocfs2_xattr_header *header) 1810 struct ocfs2_xattr_header *header,
1811 struct ocfs2_caching_info *ref_ci,
1812 struct buffer_head *ref_root_bh)
1706{ 1813{
1707 int ret = 0, i; 1814 int ret = 0, i, ref_credits;
1708 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1815 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1709 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; 1816 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1817 void *val;
1710 1818
1711 ocfs2_init_dealloc_ctxt(&ctxt.dealloc); 1819 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
1712 1820
1713 ctxt.handle = ocfs2_start_trans(osb,
1714 ocfs2_remove_extent_credits(osb->sb));
1715 if (IS_ERR(ctxt.handle)) {
1716 ret = PTR_ERR(ctxt.handle);
1717 mlog_errno(ret);
1718 goto out;
1719 }
1720
1721 for (i = 0; i < le16_to_cpu(header->xh_count); i++) { 1821 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1722 struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; 1822 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1723 1823
1724 if (!ocfs2_xattr_is_local(entry)) { 1824 if (ocfs2_xattr_is_local(entry))
1725 void *val; 1825 continue;
1726 1826
1727 val = (void *)header + 1827 val = (void *)header +
1728 le16_to_cpu(entry->xe_name_offset); 1828 le16_to_cpu(entry->xe_name_offset);
1729 vb->vb_xv = (struct ocfs2_xattr_value_root *) 1829 vb->vb_xv = (struct ocfs2_xattr_value_root *)
1730 (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); 1830 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1731 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); 1831
1732 if (ret < 0) { 1832 ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
1733 mlog_errno(ret); 1833 ref_ci, ref_root_bh,
1734 break; 1834 &ctxt.meta_ac,
1735 } 1835 &ref_credits);
1836
1837 ctxt.handle = ocfs2_start_trans(osb, ref_credits +
1838 ocfs2_remove_extent_credits(osb->sb));
1839 if (IS_ERR(ctxt.handle)) {
1840 ret = PTR_ERR(ctxt.handle);
1841 mlog_errno(ret);
1842 break;
1843 }
1844
1845 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
1846 if (ret < 0) {
1847 mlog_errno(ret);
1848 break;
1849 }
1850
1851 ocfs2_commit_trans(osb, ctxt.handle);
1852 if (ctxt.meta_ac) {
1853 ocfs2_free_alloc_context(ctxt.meta_ac);
1854 ctxt.meta_ac = NULL;
1736 } 1855 }
1737 } 1856 }
1738 1857
1739 ocfs2_commit_trans(osb, ctxt.handle); 1858 if (ctxt.meta_ac)
1859 ocfs2_free_alloc_context(ctxt.meta_ac);
1740 ocfs2_schedule_truncate_log_flush(osb, 1); 1860 ocfs2_schedule_truncate_log_flush(osb, 1);
1741 ocfs2_run_deallocs(osb, &ctxt.dealloc); 1861 ocfs2_run_deallocs(osb, &ctxt.dealloc);
1742out:
1743 return ret; 1862 return ret;
1744} 1863}
1745 1864
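
The rewritten loop above now opens one transaction per non-local entry, sized with the extra refcount credits that ocfs2_lock_xattr_remove_allocators() reports, and drops the metadata reservation after every commit. Condensed into a single sketch (illustrative only; remove_one_outside_value is a hypothetical name, it assumes the declarations already in fs/ocfs2/xattr.c, and the real loop additionally schedules one truncate log flush after all entries are handled):

static int remove_one_outside_value(struct inode *inode,
				    struct ocfs2_xattr_value_buf *vb,
				    struct ocfs2_caching_info *ref_ci,
				    struct buffer_head *ref_root_bh)
{
	int ret, ref_credits;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };

	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);

	/* Reserves metadata in ctxt.meta_ac only if the value is
	 * refcounted and the delete may split the refcount tree. */
	ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
						 ref_ci, ref_root_bh,
						 &ctxt.meta_ac,
						 &ref_credits);
	if (ret)
		goto out;

	/* Pad the transaction with the refcount credits. */
	ctxt.handle = ocfs2_start_trans(osb, ref_credits +
				ocfs2_remove_extent_credits(osb->sb));
	if (IS_ERR(ctxt.handle)) {
		ret = PTR_ERR(ctxt.handle);
		goto out;
	}

	ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
	ocfs2_commit_trans(osb, ctxt.handle);
out:
	if (ctxt.meta_ac)
		ocfs2_free_alloc_context(ctxt.meta_ac);
	ocfs2_run_deallocs(osb, &ctxt.dealloc);
	return ret;
}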
1746static int ocfs2_xattr_ibody_remove(struct inode *inode, 1865static int ocfs2_xattr_ibody_remove(struct inode *inode,
1747 struct buffer_head *di_bh) 1866 struct buffer_head *di_bh,
1867 struct ocfs2_caching_info *ref_ci,
1868 struct buffer_head *ref_root_bh)
1748{ 1869{
1749 1870
1750 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1871 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1759,13 +1880,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
1759 ((void *)di + inode->i_sb->s_blocksize - 1880 ((void *)di + inode->i_sb->s_blocksize -
1760 le16_to_cpu(di->i_xattr_inline_size)); 1881 le16_to_cpu(di->i_xattr_inline_size));
1761 1882
1762 ret = ocfs2_remove_value_outside(inode, &vb, header); 1883 ret = ocfs2_remove_value_outside(inode, &vb, header,
1884 ref_ci, ref_root_bh);
1763 1885
1764 return ret; 1886 return ret;
1765} 1887}
1766 1888
1889struct ocfs2_rm_xattr_bucket_para {
1890 struct ocfs2_caching_info *ref_ci;
1891 struct buffer_head *ref_root_bh;
1892};
1893
1767static int ocfs2_xattr_block_remove(struct inode *inode, 1894static int ocfs2_xattr_block_remove(struct inode *inode,
1768 struct buffer_head *blk_bh) 1895 struct buffer_head *blk_bh,
1896 struct ocfs2_caching_info *ref_ci,
1897 struct buffer_head *ref_root_bh)
1769{ 1898{
1770 struct ocfs2_xattr_block *xb; 1899 struct ocfs2_xattr_block *xb;
1771 int ret = 0; 1900 int ret = 0;
@@ -1773,19 +1902,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
1773 .vb_bh = blk_bh, 1902 .vb_bh = blk_bh,
1774 .vb_access = ocfs2_journal_access_xb, 1903 .vb_access = ocfs2_journal_access_xb,
1775 }; 1904 };
1905 struct ocfs2_rm_xattr_bucket_para args = {
1906 .ref_ci = ref_ci,
1907 .ref_root_bh = ref_root_bh,
1908 };
1776 1909
1777 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 1910 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1778 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 1911 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1779 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); 1912 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1780 ret = ocfs2_remove_value_outside(inode, &vb, header); 1913 ret = ocfs2_remove_value_outside(inode, &vb, header,
1914 ref_ci, ref_root_bh);
1781 } else 1915 } else
1782 ret = ocfs2_delete_xattr_index_block(inode, blk_bh); 1916 ret = ocfs2_iterate_xattr_index_block(inode,
1917 blk_bh,
1918 ocfs2_rm_xattr_cluster,
1919 &args);
1783 1920
1784 return ret; 1921 return ret;
1785} 1922}
1786 1923
1787static int ocfs2_xattr_free_block(struct inode *inode, 1924static int ocfs2_xattr_free_block(struct inode *inode,
1788 u64 block) 1925 u64 block,
1926 struct ocfs2_caching_info *ref_ci,
1927 struct buffer_head *ref_root_bh)
1789{ 1928{
1790 struct inode *xb_alloc_inode; 1929 struct inode *xb_alloc_inode;
1791 struct buffer_head *xb_alloc_bh = NULL; 1930 struct buffer_head *xb_alloc_bh = NULL;
@@ -1803,7 +1942,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
1803 goto out; 1942 goto out;
1804 } 1943 }
1805 1944
1806 ret = ocfs2_xattr_block_remove(inode, blk_bh); 1945 ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
1807 if (ret < 0) { 1946 if (ret < 0) {
1808 mlog_errno(ret); 1947 mlog_errno(ret);
1809 goto out; 1948 goto out;
@@ -1863,6 +2002,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1863{ 2002{
1864 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2003 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1865 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2004 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2005 struct ocfs2_refcount_tree *ref_tree = NULL;
2006 struct buffer_head *ref_root_bh = NULL;
2007 struct ocfs2_caching_info *ref_ci = NULL;
1866 handle_t *handle; 2008 handle_t *handle;
1867 int ret; 2009 int ret;
1868 2010
@@ -1872,8 +2014,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1872 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) 2014 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
1873 return 0; 2015 return 0;
1874 2016
2017 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
2018 ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
2019 le64_to_cpu(di->i_refcount_loc),
2020 1, &ref_tree, &ref_root_bh);
2021 if (ret) {
2022 mlog_errno(ret);
2023 goto out;
2024 }
2025 ref_ci = &ref_tree->rf_ci;
2026
2027 }
2028
1875 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { 2029 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1876 ret = ocfs2_xattr_ibody_remove(inode, di_bh); 2030 ret = ocfs2_xattr_ibody_remove(inode, di_bh,
2031 ref_ci, ref_root_bh);
1877 if (ret < 0) { 2032 if (ret < 0) {
1878 mlog_errno(ret); 2033 mlog_errno(ret);
1879 goto out; 2034 goto out;
@@ -1882,7 +2037,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1882 2037
1883 if (di->i_xattr_loc) { 2038 if (di->i_xattr_loc) {
1884 ret = ocfs2_xattr_free_block(inode, 2039 ret = ocfs2_xattr_free_block(inode,
1885 le64_to_cpu(di->i_xattr_loc)); 2040 le64_to_cpu(di->i_xattr_loc),
2041 ref_ci, ref_root_bh);
1886 if (ret < 0) { 2042 if (ret < 0) {
1887 mlog_errno(ret); 2043 mlog_errno(ret);
1888 goto out; 2044 goto out;
@@ -1896,7 +2052,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1896 mlog_errno(ret); 2052 mlog_errno(ret);
1897 goto out; 2053 goto out;
1898 } 2054 }
1899 ret = ocfs2_journal_access_di(handle, inode, di_bh, 2055 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1900 OCFS2_JOURNAL_ACCESS_WRITE); 2056 OCFS2_JOURNAL_ACCESS_WRITE);
1901 if (ret) { 2057 if (ret) {
1902 mlog_errno(ret); 2058 mlog_errno(ret);
@@ -1916,6 +2072,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1916out_commit: 2072out_commit:
1917 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2073 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1918out: 2074out:
2075 if (ref_tree)
2076 ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
2077 brelse(ref_root_bh);
1919 return ret; 2078 return ret;
1920} 2079}
1921 2080
@@ -2083,6 +2242,84 @@ cleanup:
2083 return ret; 2242 return ret;
2084} 2243}
2085 2244
2245static int ocfs2_create_xattr_block(handle_t *handle,
2246 struct inode *inode,
2247 struct buffer_head *inode_bh,
2248 struct ocfs2_alloc_context *meta_ac,
2249 struct buffer_head **ret_bh,
2250 int indexed)
2251{
2252 int ret;
2253 u16 suballoc_bit_start;
2254 u32 num_got;
2255 u64 first_blkno;
2256 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2257 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2258 struct buffer_head *new_bh = NULL;
2259 struct ocfs2_xattr_block *xblk;
2260
2261 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh,
2262 OCFS2_JOURNAL_ACCESS_CREATE);
2263 if (ret < 0) {
2264 mlog_errno(ret);
2265 goto end;
2266 }
2267
2268 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
2269 &suballoc_bit_start, &num_got,
2270 &first_blkno);
2271 if (ret < 0) {
2272 mlog_errno(ret);
2273 goto end;
2274 }
2275
2276 new_bh = sb_getblk(inode->i_sb, first_blkno);
2277 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2278
2279 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
2280 new_bh,
2281 OCFS2_JOURNAL_ACCESS_CREATE);
2282 if (ret < 0) {
2283 mlog_errno(ret);
2284 goto end;
2285 }
2286
2287 /* Initialize ocfs2_xattr_block */
2288 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2289 memset(xblk, 0, inode->i_sb->s_blocksize);
2290 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2291 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
2292 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2293 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2294 xblk->xb_blkno = cpu_to_le64(first_blkno);
2295
2296 if (indexed) {
2297 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2298 xr->xt_clusters = cpu_to_le32(1);
2299 xr->xt_last_eb_blk = 0;
2300 xr->xt_list.l_tree_depth = 0;
2301 xr->xt_list.l_count = cpu_to_le16(
2302 ocfs2_xattr_recs_per_xb(inode->i_sb));
2303 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2304 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2305 }
2306
2307 ret = ocfs2_journal_dirty(handle, new_bh);
2308 if (ret < 0) {
2309 mlog_errno(ret);
2310 goto end;
2311 }
2312 di->i_xattr_loc = cpu_to_le64(first_blkno);
2313 ocfs2_journal_dirty(handle, inode_bh);
2314
2315 *ret_bh = new_bh;
2316 new_bh = NULL;
2317
2318end:
2319 brelse(new_bh);
2320 return ret;
2321}
2322
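
ocfs2_create_xattr_block() factors the block allocation and on-disk initialization out of ocfs2_xattr_block_set() (see the next hunk) so the reflink path can reuse it through ocfs2_create_empty_xattr_block() later in this patch. A minimal usage sketch, assuming a running transaction sized with OCFS2_XATTR_BLOCK_CREATE_CREDITS and one metadata block reserved in meta_ac:

	struct buffer_head *xb_bh = NULL;
	int ret;

	/* Allocate and initialize an indexed xattr block; on success
	 * di->i_xattr_loc points at it and xb_bh holds a reference. */
	ret = ocfs2_create_xattr_block(handle, inode, di_bh,
				       meta_ac, &xb_bh, 1 /* indexed */);
	if (ret)
		mlog_errno(ret);
	brelse(xb_bh);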
2086/* 2323/*
2087 * ocfs2_xattr_block_set() 2324 * ocfs2_xattr_block_set()
2088 * 2325 *
@@ -2095,63 +2332,24 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2095 struct ocfs2_xattr_set_ctxt *ctxt) 2332 struct ocfs2_xattr_set_ctxt *ctxt)
2096{ 2333{
2097 struct buffer_head *new_bh = NULL; 2334 struct buffer_head *new_bh = NULL;
2098 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2099 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2100 handle_t *handle = ctxt->handle; 2335 handle_t *handle = ctxt->handle;
2101 struct ocfs2_xattr_block *xblk = NULL; 2336 struct ocfs2_xattr_block *xblk = NULL;
2102 u16 suballoc_bit_start;
2103 u32 num_got;
2104 u64 first_blkno;
2105 int ret; 2337 int ret;
2106 2338
2107 if (!xs->xattr_bh) { 2339 if (!xs->xattr_bh) {
2108 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, 2340 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
2109 OCFS2_JOURNAL_ACCESS_CREATE); 2341 ctxt->meta_ac, &new_bh, 0);
2110 if (ret < 0) { 2342 if (ret) {
2111 mlog_errno(ret);
2112 goto end;
2113 }
2114
2115 ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
2116 &suballoc_bit_start, &num_got,
2117 &first_blkno);
2118 if (ret < 0) {
2119 mlog_errno(ret);
2120 goto end;
2121 }
2122
2123 new_bh = sb_getblk(inode->i_sb, first_blkno);
2124 ocfs2_set_new_buffer_uptodate(inode, new_bh);
2125
2126 ret = ocfs2_journal_access_xb(handle, inode, new_bh,
2127 OCFS2_JOURNAL_ACCESS_CREATE);
2128 if (ret < 0) {
2129 mlog_errno(ret); 2343 mlog_errno(ret);
2130 goto end; 2344 goto end;
2131 } 2345 }
2132 2346
2133 /* Initialize ocfs2_xattr_block */
2134 xs->xattr_bh = new_bh; 2347 xs->xattr_bh = new_bh;
2135 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2348 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2136 memset(xblk, 0, inode->i_sb->s_blocksize);
2137 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2138 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
2139 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2140 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2141 xblk->xb_blkno = cpu_to_le64(first_blkno);
2142
2143 xs->header = &xblk->xb_attrs.xb_header; 2349 xs->header = &xblk->xb_attrs.xb_header;
2144 xs->base = (void *)xs->header; 2350 xs->base = (void *)xs->header;
2145 xs->end = (void *)xblk + inode->i_sb->s_blocksize; 2351 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
2146 xs->here = xs->header->xh_entries; 2352 xs->here = xs->header->xh_entries;
2147
2148 ret = ocfs2_journal_dirty(handle, new_bh);
2149 if (ret < 0) {
2150 mlog_errno(ret);
2151 goto end;
2152 }
2153 di->i_xattr_loc = cpu_to_le64(first_blkno);
2154 ocfs2_journal_dirty(handle, xs->inode_bh);
2155 } else 2353 } else
2156 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2354 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2157 2355
@@ -2273,7 +2471,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2273 old_in_xb = 1; 2471 old_in_xb = 1;
2274 2472
2275 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 2473 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2276 ret = ocfs2_xattr_bucket_get_name_value(inode, 2474 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
2277 bucket_xh(xbs->bucket), 2475 bucket_xh(xbs->bucket),
2278 i, &block_off, 2476 i, &block_off,
2279 &name_offset); 2477 &name_offset);
@@ -2428,6 +2626,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2428 struct ocfs2_xattr_search *xis, 2626 struct ocfs2_xattr_search *xis,
2429 struct ocfs2_xattr_search *xbs, 2627 struct ocfs2_xattr_search *xbs,
2430 struct ocfs2_xattr_set_ctxt *ctxt, 2628 struct ocfs2_xattr_set_ctxt *ctxt,
2629 int extra_meta,
2431 int *credits) 2630 int *credits)
2432{ 2631{
2433 int clusters_add, meta_add, ret; 2632 int clusters_add, meta_add, ret;
@@ -2444,6 +2643,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2444 return ret; 2643 return ret;
2445 } 2644 }
2446 2645
2646 meta_add += extra_meta;
2447 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 2647 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2448 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 2648 "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
2449 2649
@@ -2598,7 +2798,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2598 2798
2599 if (!ret) { 2799 if (!ret) {
2600 /* Update inode ctime. */ 2800 /* Update inode ctime. */
2601 ret = ocfs2_journal_access_di(ctxt->handle, inode, 2801 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2602 xis->inode_bh, 2802 xis->inode_bh,
2603 OCFS2_JOURNAL_ACCESS_WRITE); 2803 OCFS2_JOURNAL_ACCESS_WRITE);
2604 if (ret) { 2804 if (ret) {
@@ -2711,10 +2911,11 @@ int ocfs2_xattr_set(struct inode *inode,
2711{ 2911{
2712 struct buffer_head *di_bh = NULL; 2912 struct buffer_head *di_bh = NULL;
2713 struct ocfs2_dinode *di; 2913 struct ocfs2_dinode *di;
2714 int ret, credits; 2914 int ret, credits, ref_meta = 0, ref_credits = 0;
2715 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2915 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2716 struct inode *tl_inode = osb->osb_tl_inode; 2916 struct inode *tl_inode = osb->osb_tl_inode;
2717 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; 2917 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
2918 struct ocfs2_refcount_tree *ref_tree = NULL;
2718 2919
2719 struct ocfs2_xattr_info xi = { 2920 struct ocfs2_xattr_info xi = {
2720 .name_index = name_index, 2921 .name_index = name_index,
@@ -2779,6 +2980,17 @@ int ocfs2_xattr_set(struct inode *inode,
2779 goto cleanup; 2980 goto cleanup;
2780 } 2981 }
2781 2982
2983 /* Check whether the value is refcounted and do some preparation. */
2984 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
2985 (!xis.not_found || !xbs.not_found)) {
2986 ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
2987 &xis, &xbs, &ref_tree,
2988 &ref_meta, &ref_credits);
2989 if (ret) {
2990 mlog_errno(ret);
2991 goto cleanup;
2992 }
2993 }
2782 2994
2783 mutex_lock(&tl_inode->i_mutex); 2995 mutex_lock(&tl_inode->i_mutex);
2784 2996
@@ -2793,7 +3005,7 @@ int ocfs2_xattr_set(struct inode *inode,
2793 mutex_unlock(&tl_inode->i_mutex); 3005 mutex_unlock(&tl_inode->i_mutex);
2794 3006
2795 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, 3007 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
2796 &xbs, &ctxt, &credits); 3008 &xbs, &ctxt, ref_meta, &credits);
2797 if (ret) { 3009 if (ret) {
2798 mlog_errno(ret); 3010 mlog_errno(ret);
2799 goto cleanup; 3011 goto cleanup;
@@ -2801,7 +3013,7 @@ int ocfs2_xattr_set(struct inode *inode,
2801 3013
2802 /* we need to update inode's ctime field, so add credit for it. */ 3014 /* we need to update inode's ctime field, so add credit for it. */
2803 credits += OCFS2_INODE_UPDATE_CREDITS; 3015 credits += OCFS2_INODE_UPDATE_CREDITS;
2804 ctxt.handle = ocfs2_start_trans(osb, credits); 3016 ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
2805 if (IS_ERR(ctxt.handle)) { 3017 if (IS_ERR(ctxt.handle)) {
2806 ret = PTR_ERR(ctxt.handle); 3018 ret = PTR_ERR(ctxt.handle);
2807 mlog_errno(ret); 3019 mlog_errno(ret);
@@ -2819,8 +3031,16 @@ int ocfs2_xattr_set(struct inode *inode,
2819 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) 3031 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
2820 ocfs2_schedule_truncate_log_flush(osb, 1); 3032 ocfs2_schedule_truncate_log_flush(osb, 1);
2821 ocfs2_run_deallocs(osb, &ctxt.dealloc); 3033 ocfs2_run_deallocs(osb, &ctxt.dealloc);
3034
2822cleanup: 3035cleanup:
3036 if (ref_tree)
3037 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
2823 up_write(&OCFS2_I(inode)->ip_xattr_sem); 3038 up_write(&OCFS2_I(inode)->ip_xattr_sem);
3039 if (!value && !ret) {
3040 ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
3041 if (ret)
3042 mlog_errno(ret);
3043 }
2824 ocfs2_inode_unlock(inode, 1); 3044 ocfs2_inode_unlock(inode, 1);
2825cleanup_nolock: 3045cleanup_nolock:
2826 brelse(di_bh); 3046 brelse(di_bh);
@@ -2849,7 +3069,8 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
2849 u64 e_blkno = 0; 3069 u64 e_blkno = 0;
2850 3070
2851 if (el->l_tree_depth) { 3071 if (el->l_tree_depth) {
2852 ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); 3072 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash,
3073 &eb_bh);
2853 if (ret) { 3074 if (ret) {
2854 mlog_errno(ret); 3075 mlog_errno(ret);
2855 goto out; 3076 goto out;
@@ -2931,7 +3152,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2931 if (cmp) 3152 if (cmp)
2932 continue; 3153 continue;
2933 3154
2934 ret = ocfs2_xattr_bucket_get_name_value(inode, 3155 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
2935 xh, 3156 xh,
2936 i, 3157 i,
2937 &block_off, 3158 &block_off,
@@ -3175,7 +3396,7 @@ struct ocfs2_xattr_tree_list {
3175 size_t result; 3396 size_t result;
3176}; 3397};
3177 3398
3178static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, 3399static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
3179 struct ocfs2_xattr_header *xh, 3400 struct ocfs2_xattr_header *xh,
3180 int index, 3401 int index,
3181 int *block_off, 3402 int *block_off,
@@ -3188,8 +3409,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
3188 3409
3189 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); 3410 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
3190 3411
3191 *block_off = name_offset >> inode->i_sb->s_blocksize_bits; 3412 *block_off = name_offset >> sb->s_blocksize_bits;
3192 *new_offset = name_offset % inode->i_sb->s_blocksize; 3413 *new_offset = name_offset % sb->s_blocksize;
3193 3414
3194 return 0; 3415 return 0;
3195} 3416}
@@ -3209,7 +3430,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
3209 prefix = ocfs2_xattr_prefix(type); 3430 prefix = ocfs2_xattr_prefix(type);
3210 3431
3211 if (prefix) { 3432 if (prefix) {
3212 ret = ocfs2_xattr_bucket_get_name_value(inode, 3433 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
3213 bucket_xh(bucket), 3434 bucket_xh(bucket),
3214 i, 3435 i,
3215 &block_off, 3436 &block_off,
@@ -3232,22 +3453,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
3232 return ret; 3453 return ret;
3233} 3454}
3234 3455
3235static int ocfs2_xattr_tree_list_index_block(struct inode *inode, 3456static int ocfs2_iterate_xattr_index_block(struct inode *inode,
3236 struct ocfs2_xattr_tree_root *xt, 3457 struct buffer_head *blk_bh,
3237 char *buffer, 3458 xattr_tree_rec_func *rec_func,
3238 size_t buffer_size) 3459 void *para)
3239{ 3460{
3240 struct ocfs2_extent_list *el = &xt->xt_list; 3461 struct ocfs2_xattr_block *xb =
3462 (struct ocfs2_xattr_block *)blk_bh->b_data;
3463 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
3241 int ret = 0; 3464 int ret = 0;
3242 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; 3465 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
3243 u64 p_blkno = 0; 3466 u64 p_blkno = 0;
3244 struct ocfs2_xattr_tree_list xl = {
3245 .buffer = buffer,
3246 .buffer_size = buffer_size,
3247 .result = 0,
3248 };
3249 3467
3250 if (le16_to_cpu(el->l_next_free_rec) == 0) 3468 if (!el->l_next_free_rec || !rec_func)
3251 return 0; 3469 return 0;
3252 3470
3253 while (name_hash > 0) { 3471 while (name_hash > 0) {
@@ -3255,16 +3473,15 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3255 &e_cpos, &num_clusters, el); 3473 &e_cpos, &num_clusters, el);
3256 if (ret) { 3474 if (ret) {
3257 mlog_errno(ret); 3475 mlog_errno(ret);
3258 goto out; 3476 break;
3259 } 3477 }
3260 3478
3261 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, 3479 ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
3262 ocfs2_list_xattr_bucket, 3480 num_clusters, para);
3263 &xl);
3264 if (ret) { 3481 if (ret) {
3265 if (ret != -ERANGE) 3482 if (ret != -ERANGE)
3266 mlog_errno(ret); 3483 mlog_errno(ret);
3267 goto out; 3484 break;
3268 } 3485 }
3269 3486
3270 if (e_cpos == 0) 3487 if (e_cpos == 0)
@@ -3273,6 +3490,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3273 name_hash = e_cpos - 1; 3490 name_hash = e_cpos - 1;
3274 } 3491 }
3275 3492
3493 return ret;
3494
3495}
3496
3497static int ocfs2_list_xattr_tree_rec(struct inode *inode,
3498 struct buffer_head *root_bh,
3499 u64 blkno, u32 cpos, u32 len, void *para)
3500{
3501 return ocfs2_iterate_xattr_buckets(inode, blkno, len,
3502 ocfs2_list_xattr_bucket, para);
3503}
3504
3505static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3506 struct buffer_head *blk_bh,
3507 char *buffer,
3508 size_t buffer_size)
3509{
3510 int ret;
3511 struct ocfs2_xattr_tree_list xl = {
3512 .buffer = buffer,
3513 .buffer_size = buffer_size,
3514 .result = 0,
3515 };
3516
3517 ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
3518 ocfs2_list_xattr_tree_rec, &xl);
3519 if (ret) {
3520 mlog_errno(ret);
3521 goto out;
3522 }
3523
3276 ret = xl.result; 3524 ret = xl.result;
3277out: 3525out:
3278 return ret; 3526 return ret;
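
ocfs2_iterate_xattr_index_block() generalizes the old list-only walker: it visits each extent record of the xattr index tree in descending name-hash order and hands it to an xattr_tree_rec_func. Listing plugs in ocfs2_list_xattr_tree_rec(), removal plugs in ocfs2_rm_xattr_cluster(), and refcounting (later in this patch) plugs in ocfs2_refcount_xattr_tree_rec(). A sketch of the callback shape (hypothetical example, not part of the patch):

static int count_xattr_tree_rec(struct inode *inode,
				struct buffer_head *root_bh,
				u64 blkno, u32 cpos, u32 len,
				void *para)
{
	/* para is the opaque cookie handed to
	 * ocfs2_iterate_xattr_index_block(); hypothetical use here. */
	unsigned int *nr_recs = para;

	(*nr_recs)++;
	return 0;	/* a non-zero return stops the iteration */
}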
@@ -3426,7 +3674,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
3426 */ 3674 */
3427 down_write(&oi->ip_alloc_sem); 3675 down_write(&oi->ip_alloc_sem);
3428 3676
3429 ret = ocfs2_journal_access_xb(handle, inode, xb_bh, 3677 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh,
3430 OCFS2_JOURNAL_ACCESS_WRITE); 3678 OCFS2_JOURNAL_ACCESS_WRITE);
3431 if (ret) { 3679 if (ret) {
3432 mlog_errno(ret); 3680 mlog_errno(ret);
@@ -4263,9 +4511,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
4263 (unsigned long long)OCFS2_I(inode)->ip_blkno, 4511 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4264 prev_cpos, (unsigned long long)bucket_blkno(first)); 4512 prev_cpos, (unsigned long long)bucket_blkno(first));
4265 4513
4266 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 4514 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
4267 4515
4268 ret = ocfs2_journal_access_xb(handle, inode, root_bh, 4516 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
4269 OCFS2_JOURNAL_ACCESS_WRITE); 4517 OCFS2_JOURNAL_ACCESS_WRITE);
4270 if (ret < 0) { 4518 if (ret < 0) {
4271 mlog_errno(ret); 4519 mlog_errno(ret);
@@ -4319,7 +4567,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
4319 4567
4320 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 4568 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
4321 num_bits, (unsigned long long)block, v_start); 4569 num_bits, (unsigned long long)block, v_start);
4322 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, 4570 ret = ocfs2_insert_extent(handle, &et, v_start, block,
4323 num_bits, 0, ctxt->meta_ac); 4571 num_bits, 0, ctxt->meta_ac);
4324 if (ret < 0) { 4572 if (ret < 0) {
4325 mlog_errno(ret); 4573 mlog_errno(ret);
@@ -4798,10 +5046,13 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4798 struct ocfs2_xattr_entry *xe = xs->here; 5046 struct ocfs2_xattr_entry *xe = xs->here;
4799 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); 5047 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4800 void *base; 5048 void *base;
5049 struct ocfs2_xattr_value_buf vb = {
5050 .vb_access = ocfs2_journal_access,
5051 };
4801 5052
4802 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); 5053 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4803 5054
4804 ret = ocfs2_xattr_bucket_get_name_value(inode, xh, 5055 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
4805 xe - xh->xh_entries, 5056 xe - xh->xh_entries,
4806 &block_off, 5057 &block_off,
4807 &offset); 5058 &offset);
@@ -4814,8 +5065,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4814 xv = (struct ocfs2_xattr_value_root *)(base + offset + 5065 xv = (struct ocfs2_xattr_value_root *)(base + offset +
4815 OCFS2_XATTR_SIZE(xe->xe_name_len)); 5066 OCFS2_XATTR_SIZE(xe->xe_name_len));
4816 5067
5068 vb.vb_xv = xv;
5069 vb.vb_bh = xs->bucket->bu_bhs[block_off];
4817 ret = __ocfs2_xattr_set_value_outside(inode, handle, 5070 ret = __ocfs2_xattr_set_value_outside(inode, handle,
4818 xv, val, value_len); 5071 &vb, val, value_len);
4819 if (ret) 5072 if (ret)
4820 mlog_errno(ret); 5073 mlog_errno(ret);
4821out: 5074out:
@@ -4826,7 +5079,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4826 struct buffer_head *root_bh, 5079 struct buffer_head *root_bh,
4827 u64 blkno, 5080 u64 blkno,
4828 u32 cpos, 5081 u32 cpos,
4829 u32 len) 5082 u32 len,
5083 void *para)
4830{ 5084{
4831 int ret; 5085 int ret;
4832 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5086 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4838,14 +5092,22 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4838 struct ocfs2_cached_dealloc_ctxt dealloc; 5092 struct ocfs2_cached_dealloc_ctxt dealloc;
4839 struct ocfs2_extent_tree et; 5093 struct ocfs2_extent_tree et;
4840 5094
4841 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 5095 ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
5096 ocfs2_delete_xattr_in_bucket, para);
5097 if (ret) {
5098 mlog_errno(ret);
5099 return ret;
5100 }
5101
5102 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
4842 5103
4843 ocfs2_init_dealloc_ctxt(&dealloc); 5104 ocfs2_init_dealloc_ctxt(&dealloc);
4844 5105
4845 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", 5106 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
4846 cpos, len, (unsigned long long)blkno); 5107 cpos, len, (unsigned long long)blkno);
4847 5108
4848 ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); 5109 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
5110 len);
4849 5111
4850 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); 5112 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
4851 if (ret) { 5113 if (ret) {
@@ -4870,14 +5132,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4870 goto out; 5132 goto out;
4871 } 5133 }
4872 5134
4873 ret = ocfs2_journal_access_xb(handle, inode, root_bh, 5135 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
4874 OCFS2_JOURNAL_ACCESS_WRITE); 5136 OCFS2_JOURNAL_ACCESS_WRITE);
4875 if (ret) { 5137 if (ret) {
4876 mlog_errno(ret); 5138 mlog_errno(ret);
4877 goto out_commit; 5139 goto out_commit;
4878 } 5140 }
4879 5141
4880 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, 5142 ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac,
4881 &dealloc); 5143 &dealloc);
4882 if (ret) { 5144 if (ret) {
4883 mlog_errno(ret); 5145 mlog_errno(ret);
@@ -5220,7 +5482,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5220 struct ocfs2_xattr_bucket *bucket, 5482 struct ocfs2_xattr_bucket *bucket,
5221 void *para) 5483 void *para)
5222{ 5484{
5223 int ret = 0; 5485 int ret = 0, ref_credits;
5224 struct ocfs2_xattr_header *xh = bucket_xh(bucket); 5486 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
5225 u16 i; 5487 u16 i;
5226 struct ocfs2_xattr_entry *xe; 5488 struct ocfs2_xattr_entry *xe;
@@ -5228,7 +5490,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5228 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; 5490 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
5229 int credits = ocfs2_remove_extent_credits(osb->sb) + 5491 int credits = ocfs2_remove_extent_credits(osb->sb) +
5230 ocfs2_blocks_per_xattr_bucket(inode->i_sb); 5492 ocfs2_blocks_per_xattr_bucket(inode->i_sb);
5231 5493 struct ocfs2_xattr_value_root *xv;
5494 struct ocfs2_rm_xattr_bucket_para *args =
5495 (struct ocfs2_rm_xattr_bucket_para *)para;
5232 5496
5233 ocfs2_init_dealloc_ctxt(&ctxt.dealloc); 5497 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
5234 5498
@@ -5237,7 +5501,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5237 if (ocfs2_xattr_is_local(xe)) 5501 if (ocfs2_xattr_is_local(xe))
5238 continue; 5502 continue;
5239 5503
5240 ctxt.handle = ocfs2_start_trans(osb, credits); 5504 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
5505 i, &xv, NULL);
5506
5507 ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
5508 args->ref_ci,
5509 args->ref_root_bh,
5510 &ctxt.meta_ac,
5511 &ref_credits);
5512
5513 ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
5241 if (IS_ERR(ctxt.handle)) { 5514 if (IS_ERR(ctxt.handle)) {
5242 ret = PTR_ERR(ctxt.handle); 5515 ret = PTR_ERR(ctxt.handle);
5243 mlog_errno(ret); 5516 mlog_errno(ret);
@@ -5248,57 +5521,1439 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5248 i, 0, &ctxt); 5521 i, 0, &ctxt);
5249 5522
5250 ocfs2_commit_trans(osb, ctxt.handle); 5523 ocfs2_commit_trans(osb, ctxt.handle);
5524 if (ctxt.meta_ac) {
5525 ocfs2_free_alloc_context(ctxt.meta_ac);
5526 ctxt.meta_ac = NULL;
5527 }
5251 if (ret) { 5528 if (ret) {
5252 mlog_errno(ret); 5529 mlog_errno(ret);
5253 break; 5530 break;
5254 } 5531 }
5255 } 5532 }
5256 5533
5534 if (ctxt.meta_ac)
5535 ocfs2_free_alloc_context(ctxt.meta_ac);
5257 ocfs2_schedule_truncate_log_flush(osb, 1); 5536 ocfs2_schedule_truncate_log_flush(osb, 1);
5258 ocfs2_run_deallocs(osb, &ctxt.dealloc); 5537 ocfs2_run_deallocs(osb, &ctxt.dealloc);
5259 return ret; 5538 return ret;
5260} 5539}
5261 5540
5262static int ocfs2_delete_xattr_index_block(struct inode *inode, 5541/*
5263 struct buffer_head *xb_bh) 5542 * Whenever we modify an xattr value root in the bucket (e.g., CoW
5543 * or change the extent record flag), we need to recalculate
5544 * the metaecc for the whole bucket. So it is done here.
5545 *
5546 * Note:
5547 * The caller must reserve the extra credits for us.
5548 */
5549static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
5550 handle_t *handle,
5551 void *para)
5552{
5553 int ret;
5554 struct ocfs2_xattr_bucket *bucket =
5555 (struct ocfs2_xattr_bucket *)para;
5556
5557 ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
5558 OCFS2_JOURNAL_ACCESS_WRITE);
5559 if (ret) {
5560 mlog_errno(ret);
5561 return ret;
5562 }
5563
5564 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
5565
5566 return 0;
5567}
5568
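
A sketch of how this patch wires the hook up in the two places that need it (assuming the surrounding xattr.c declarations); post_refcount only matters when metaecc is enabled, since that is when the bucket checksums must be recomputed:

	struct ocfs2_post_refcount refcount = {
		.credits = bucket->bu_blocks,	/* one journal access per block */
		.para    = bucket,
		.func    = ocfs2_xattr_bucket_post_refcount,
	};
	struct ocfs2_post_refcount *p = NULL;

	if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
		p = &refcount;	/* p is then passed to the refcount helpers */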
5569/*
5570 * Special handling is needed if the xattr value is refcounted.
5571 *
5572 * 1. If the xattr is refcounted, lock the tree.
5573 * 2. CoW the xattr if we are setting the new value and the value
5574 * will be stored outside.
5575 * 3. Otherwise, decrease_refcount will work for us, so just
5576 * locking the refcount tree and calculating the meta and credits is OK.
5577 *
5578 * We have to do CoW before ocfs2_init_xattr_set_ctxt since
5579 * CoW currently runs as a complete transaction of its own, while that
5580 * function also locks the allocators and could deadlock us. So we will
5581 * CoW the whole xattr value.
5582 */
5583static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5584 struct ocfs2_dinode *di,
5585 struct ocfs2_xattr_info *xi,
5586 struct ocfs2_xattr_search *xis,
5587 struct ocfs2_xattr_search *xbs,
5588 struct ocfs2_refcount_tree **ref_tree,
5589 int *meta_add,
5590 int *credits)
5264{ 5591{
5265 struct ocfs2_xattr_block *xb =
5266 (struct ocfs2_xattr_block *)xb_bh->b_data;
5267 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
5268 int ret = 0; 5592 int ret = 0;
5269 u32 name_hash = UINT_MAX, e_cpos, num_clusters; 5593 struct ocfs2_xattr_block *xb;
5270 u64 p_blkno; 5594 struct ocfs2_xattr_entry *xe;
5595 char *base;
5596 u32 p_cluster, num_clusters;
5597 unsigned int ext_flags;
5598 int name_offset, name_len;
5599 struct ocfs2_xattr_value_buf vb;
5600 struct ocfs2_xattr_bucket *bucket = NULL;
5601 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5602 struct ocfs2_post_refcount refcount;
5603 struct ocfs2_post_refcount *p = NULL;
5604 struct buffer_head *ref_root_bh = NULL;
5271 5605
5272 if (le16_to_cpu(el->l_next_free_rec) == 0) 5606 if (!xis->not_found) {
5273 return 0; 5607 xe = xis->here;
5608 name_offset = le16_to_cpu(xe->xe_name_offset);
5609 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
5610 base = xis->base;
5611 vb.vb_bh = xis->inode_bh;
5612 vb.vb_access = ocfs2_journal_access_di;
5613 } else {
5614 int i, block_off = 0;
5615 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
5616 xe = xbs->here;
5617 name_offset = le16_to_cpu(xe->xe_name_offset);
5618 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
5619 i = xbs->here - xbs->header->xh_entries;
5274 5620
5275 while (name_hash > 0) { 5621 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
5276 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, 5622 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
5277 &e_cpos, &num_clusters, el); 5623 bucket_xh(xbs->bucket),
5624 i, &block_off,
5625 &name_offset);
5626 if (ret) {
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 base = bucket_block(xbs->bucket, block_off);
5631 vb.vb_bh = xbs->bucket->bu_bhs[block_off];
5632 vb.vb_access = ocfs2_journal_access;
5633
5634 if (ocfs2_meta_ecc(osb)) {
5635 /* Create parameters for ocfs2_post_refcount. */
5636 bucket = xbs->bucket;
5637 refcount.credits = bucket->bu_blocks;
5638 refcount.para = bucket;
5639 refcount.func =
5640 ocfs2_xattr_bucket_post_refcount;
5641 p = &refcount;
5642 }
5643 } else {
5644 base = xbs->base;
5645 vb.vb_bh = xbs->xattr_bh;
5646 vb.vb_access = ocfs2_journal_access_xb;
5647 }
5648 }
5649
5650 if (ocfs2_xattr_is_local(xe))
5651 goto out;
5652
5653 vb.vb_xv = (struct ocfs2_xattr_value_root *)
5654 (base + name_offset + name_len);
5655
5656 ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
5657 &num_clusters, &vb.vb_xv->xr_list,
5658 &ext_flags);
5659 if (ret) {
5660 mlog_errno(ret);
5661 goto out;
5662 }
5663
5664 /*
5665 * We just need to check the 1st extent record, since we always
5666 * CoW the whole xattr. So there shouldn't be an xattr with
5667 * some REFCOUNT extent recs after the 1st one.
5668 */
5669 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
5670 goto out;
5671
5672 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
5673 1, ref_tree, &ref_root_bh);
5674 if (ret) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678
5679 /*
5680 * If we are deleting the xattr or the new size will be stored inside,
5681 * cool, leave it there, the xattr truncate process will remove them
5682 * for us (it still needs the refcount tree lock and the meta/credits).
5683 * The worst case is that every cluster truncate splits the
5684 * refcount tree and turns the original extent into 3, so we will need
5685 * at most 2 * cluster more extent recs.
5686 */
5687 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) {
5688
5689 ret = ocfs2_refcounted_xattr_delete_need(inode,
5690 &(*ref_tree)->rf_ci,
5691 ref_root_bh, vb.vb_xv,
5692 meta_add, credits);
5693 if (ret)
5694 mlog_errno(ret);
5695 goto out;
5696 }
5697
5698 ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
5699 *ref_tree, ref_root_bh, 0,
5700 le32_to_cpu(vb.vb_xv->xr_clusters), p);
5701 if (ret)
5702 mlog_errno(ret);
5703
5704out:
5705 brelse(ref_root_bh);
5706 return ret;
5707}
5708
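
Putting the pieces together: the caller side in ocfs2_xattr_set() (from the earlier hunks) threads the two outputs of this function into allocator reservation and transaction sizing. A condensed recap, not new code:

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
	    (!xis.not_found || !xbs.not_found)) {
		ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
						   &xis, &xbs, &ref_tree,
						   &ref_meta, &ref_credits);
	}

	/* ref_meta grows the metadata reservation... */
	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
					&xbs, &ctxt, ref_meta, &credits);

	/* ...and ref_credits pads the transaction. */
	ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);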
5709/*
5710 * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
5711 * The physical clusters will be added to refcount tree.
5712 */
5713static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
5714 struct ocfs2_xattr_value_root *xv,
5715 struct ocfs2_extent_tree *value_et,
5716 struct ocfs2_caching_info *ref_ci,
5717 struct buffer_head *ref_root_bh,
5718 struct ocfs2_cached_dealloc_ctxt *dealloc,
5719 struct ocfs2_post_refcount *refcount)
5720{
5721 int ret = 0;
5722 u32 clusters = le32_to_cpu(xv->xr_clusters);
5723 u32 cpos, p_cluster, num_clusters;
5724 struct ocfs2_extent_list *el = &xv->xr_list;
5725 unsigned int ext_flags;
5726
5727 cpos = 0;
5728 while (cpos < clusters) {
5729 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
5730 &num_clusters, el, &ext_flags);
5731
5732 cpos += num_clusters;
5733 if ((ext_flags & OCFS2_EXT_REFCOUNTED))
5734 continue;
5735
5736 BUG_ON(!p_cluster);
5737
5738 ret = ocfs2_add_refcount_flag(inode, value_et,
5739 ref_ci, ref_root_bh,
5740 cpos - num_clusters,
5741 p_cluster, num_clusters,
5742 dealloc, refcount);
5743 if (ret) {
5744 mlog_errno(ret);
5745 break;
5746 }
5747 }
5748
5749 return ret;
5750}
5751
5752/*
5753 * Given a normal ocfs2_xattr_header, refcount all the entries which
5754 * have their value stored outside.
5755 * Used for xattrs stored in inode and ocfs2_xattr_block.
5756 */
5757static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
5758 struct ocfs2_xattr_value_buf *vb,
5759 struct ocfs2_xattr_header *header,
5760 struct ocfs2_caching_info *ref_ci,
5761 struct buffer_head *ref_root_bh,
5762 struct ocfs2_cached_dealloc_ctxt *dealloc)
5763{
5764
5765 struct ocfs2_xattr_entry *xe;
5766 struct ocfs2_xattr_value_root *xv;
5767 struct ocfs2_extent_tree et;
5768 int i, ret = 0;
5769
5770 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
5771 xe = &header->xh_entries[i];
5772
5773 if (ocfs2_xattr_is_local(xe))
5774 continue;
5775
5776 xv = (struct ocfs2_xattr_value_root *)((void *)header +
5777 le16_to_cpu(xe->xe_name_offset) +
5778 OCFS2_XATTR_SIZE(xe->xe_name_len));
5779
5780 vb->vb_xv = xv;
5781 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
5782
5783 ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
5784 ref_ci, ref_root_bh,
5785 dealloc, NULL);
5786 if (ret) {
5787 mlog_errno(ret);
5788 break;
5789 }
5790 }
5791
5792 return ret;
5793}
5794
5795static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
5796 struct buffer_head *fe_bh,
5797 struct ocfs2_caching_info *ref_ci,
5798 struct buffer_head *ref_root_bh,
5799 struct ocfs2_cached_dealloc_ctxt *dealloc)
5800{
5801 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
5802 struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
5803 (fe_bh->b_data + inode->i_sb->s_blocksize -
5804 le16_to_cpu(di->i_xattr_inline_size));
5805 struct ocfs2_xattr_value_buf vb = {
5806 .vb_bh = fe_bh,
5807 .vb_access = ocfs2_journal_access_di,
5808 };
5809
5810 return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
5811 ref_ci, ref_root_bh, dealloc);
5812}
5813
5814struct ocfs2_xattr_tree_value_refcount_para {
5815 struct ocfs2_caching_info *ref_ci;
5816 struct buffer_head *ref_root_bh;
5817 struct ocfs2_cached_dealloc_ctxt *dealloc;
5818};
5819
5820static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
5821 struct ocfs2_xattr_bucket *bucket,
5822 int offset,
5823 struct ocfs2_xattr_value_root **xv,
5824 struct buffer_head **bh)
5825{
5826 int ret, block_off, name_offset;
5827 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
5828 struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
5829 void *base;
5830
5831 ret = ocfs2_xattr_bucket_get_name_value(sb,
5832 bucket_xh(bucket),
5833 offset,
5834 &block_off,
5835 &name_offset);
5836 if (ret) {
5837 mlog_errno(ret);
5838 goto out;
5839 }
5840
5841 base = bucket_block(bucket, block_off);
5842
5843 *xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
5844 OCFS2_XATTR_SIZE(xe->xe_name_len));
5845
5846 if (bh)
5847 *bh = bucket->bu_bhs[block_off];
5848out:
5849 return ret;
5850}
5851
5852/*
5853 * For a given xattr bucket, refcount all the entries which
5854 * have their value stored outside.
5855 */
5856static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
5857 struct ocfs2_xattr_bucket *bucket,
5858 void *para)
5859{
5860 int i, ret = 0;
5861 struct ocfs2_extent_tree et;
5862 struct ocfs2_xattr_tree_value_refcount_para *ref =
5863 (struct ocfs2_xattr_tree_value_refcount_para *)para;
5864 struct ocfs2_xattr_header *xh =
5865 (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
5866 struct ocfs2_xattr_entry *xe;
5867 struct ocfs2_xattr_value_buf vb = {
5868 .vb_access = ocfs2_journal_access,
5869 };
5870 struct ocfs2_post_refcount refcount = {
5871 .credits = bucket->bu_blocks,
5872 .para = bucket,
5873 .func = ocfs2_xattr_bucket_post_refcount,
5874 };
5875 struct ocfs2_post_refcount *p = NULL;
5876
5877 /* We only need post_refcount if we support metaecc. */
5878 if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
5879 p = &refcount;
5880
5881 mlog(0, "refcount bucket %llu, count = %u\n",
5882 (unsigned long long)bucket_blkno(bucket),
5883 le16_to_cpu(xh->xh_count));
5884 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
5885 xe = &xh->xh_entries[i];
5886
5887 if (ocfs2_xattr_is_local(xe))
5888 continue;
5889
5890 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
5891 &vb.vb_xv, &vb.vb_bh);
5892 if (ret) {
5893 mlog_errno(ret);
5894 break;
5895 }
5896
5897 ocfs2_init_xattr_value_extent_tree(&et,
5898 INODE_CACHE(inode), &vb);
5899
5900 ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
5901 &et, ref->ref_ci,
5902 ref->ref_root_bh,
5903 ref->dealloc, p);
5904 if (ret) {
5905 mlog_errno(ret);
5906 break;
5907 }
5908 }
5909
5910 return ret;
5911
5912}
5913
5914static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
5915 struct buffer_head *root_bh,
5916 u64 blkno, u32 cpos, u32 len, void *para)
5917{
5918 return ocfs2_iterate_xattr_buckets(inode, blkno, len,
5919 ocfs2_xattr_bucket_value_refcount,
5920 para);
5921}
5922
5923static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
5924 struct buffer_head *blk_bh,
5925 struct ocfs2_caching_info *ref_ci,
5926 struct buffer_head *ref_root_bh,
5927 struct ocfs2_cached_dealloc_ctxt *dealloc)
5928{
5929 int ret = 0;
5930 struct ocfs2_xattr_block *xb =
5931 (struct ocfs2_xattr_block *)blk_bh->b_data;
5932
5933 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
5934 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
5935 struct ocfs2_xattr_value_buf vb = {
5936 .vb_bh = blk_bh,
5937 .vb_access = ocfs2_journal_access_xb,
5938 };
5939
5940 ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
5941 ref_ci, ref_root_bh,
5942 dealloc);
5943 } else {
5944 struct ocfs2_xattr_tree_value_refcount_para para = {
5945 .ref_ci = ref_ci,
5946 .ref_root_bh = ref_root_bh,
5947 .dealloc = dealloc,
5948 };
5949
5950 ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
5951 ocfs2_refcount_xattr_tree_rec,
5952 &para);
5953 }
5954
5955 return ret;
5956}
5957
5958int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
5959 struct buffer_head *fe_bh,
5960 struct ocfs2_caching_info *ref_ci,
5961 struct buffer_head *ref_root_bh,
5962 struct ocfs2_cached_dealloc_ctxt *dealloc)
5963{
5964 int ret = 0;
5965 struct ocfs2_inode_info *oi = OCFS2_I(inode);
5966 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
5967 struct buffer_head *blk_bh = NULL;
5968
5969 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
5970 ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
5971 ref_ci, ref_root_bh,
5972 dealloc);
5278 if (ret) { 5973 if (ret) {
5279 mlog_errno(ret); 5974 mlog_errno(ret);
5280 goto out; 5975 goto out;
5281 } 5976 }
5977 }
5978
5979 if (!di->i_xattr_loc)
5980 goto out;
5981
5982 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
5983 &blk_bh);
5984 if (ret < 0) {
5985 mlog_errno(ret);
5986 goto out;
5987 }
5988
5989 ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
5990 ref_root_bh, dealloc);
5991 if (ret)
5992 mlog_errno(ret);
5993
5994 brelse(blk_bh);
5995out:
5996
5997 return ret;
5998}
5999
6000typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe);
6001/*
6002 * Store the information we need in xattr reflink.
6003 * old_bh and new_bh are the inode bhs for the old and new inodes.
6004 */
6005struct ocfs2_xattr_reflink {
6006 struct inode *old_inode;
6007 struct inode *new_inode;
6008 struct buffer_head *old_bh;
6009 struct buffer_head *new_bh;
6010 struct ocfs2_caching_info *ref_ci;
6011 struct buffer_head *ref_root_bh;
6012 struct ocfs2_cached_dealloc_ctxt *dealloc;
6013 should_xattr_reflinked *xattr_reflinked;
6014};
6015
6016/*
6017 * Given an xattr header and xe offset,
6018 * return the proper xv and the corresponding bh.
6019 * xattrs in the inode, block and xattr tree have different implementations.
6020 */
6021typedef int (get_xattr_value_root)(struct super_block *sb,
6022 struct buffer_head *bh,
6023 struct ocfs2_xattr_header *xh,
6024 int offset,
6025 struct ocfs2_xattr_value_root **xv,
6026 struct buffer_head **ret_bh,
6027 void *para);
6028
6029/*
6030 * Calculate all the xattr value root metadata stored in this xattr header and
6031 * the credits we need if we create them from scratch.
6032 * We use get_xattr_value_root so that all types of xattr container can use it.
6033 */
6034static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
6035 struct buffer_head *bh,
6036 struct ocfs2_xattr_header *xh,
6037 int *metas, int *credits,
6038 int *num_recs,
6039 get_xattr_value_root *func,
6040 void *para)
6041{
6042 int i, ret = 0;
6043 struct ocfs2_xattr_value_root *xv;
6044 struct ocfs2_xattr_entry *xe;
6045
6046 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
6047 xe = &xh->xh_entries[i];
6048 if (ocfs2_xattr_is_local(xe))
6049 continue;
6050
6051 ret = func(sb, bh, xh, i, &xv, NULL, para);
6052 if (ret) {
6053 mlog_errno(ret);
6054 break;
6055 }
6056
6057 *metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
6058 le16_to_cpu(xv->xr_list.l_next_free_rec);
6059
6060 *credits += ocfs2_calc_extend_credits(sb,
6061 &def_xv.xv.xr_list,
6062 le32_to_cpu(xv->xr_clusters));
6063
6064 /*
6065 * If the value is a tree with depth > 1, we don't descend
6066 * into the extent blocks, so just calculate a maximum record count.
6067 */
6068 if (!xv->xr_list.l_tree_depth)
6069 *num_recs += xv->xr_list.l_next_free_rec;
6070 else
6071 *num_recs += ocfs2_clusters_for_bytes(sb,
6072 XATTR_SIZE_MAX);
6073 }
6074
6075 return ret;
6076}
6077
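
The get_xattr_value_root indirection lets one metadata-counting routine serve inline xattrs, xattr blocks and buckets alike. For the first two, the accessor above is passed in directly, as ocfs2_reflink_lock_xattr_allocators() does below; a minimal sketch:

	int meta_add = 0, credits = 0, num_recs = 0;
	int ret;

	/* bh may be NULL here because ocfs2_get_xattr_value_root()
	 * only echoes it back through ret_bh when asked. */
	ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
						&meta_add, &credits,
						&num_recs,
						ocfs2_get_xattr_value_root,
						NULL);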
6078/* Used by xattr inode and block to return the right xv and buffer_head. */
6079static int ocfs2_get_xattr_value_root(struct super_block *sb,
6080 struct buffer_head *bh,
6081 struct ocfs2_xattr_header *xh,
6082 int offset,
6083 struct ocfs2_xattr_value_root **xv,
6084 struct buffer_head **ret_bh,
6085 void *para)
6086{
6087 struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
6088
6089 *xv = (struct ocfs2_xattr_value_root *)((void *)xh +
6090 le16_to_cpu(xe->xe_name_offset) +
6091 OCFS2_XATTR_SIZE(xe->xe_name_len));
6092
6093 if (ret_bh)
6094 *ret_bh = bh;
6095
6096 return 0;
6097}
6098
6099/*
6100 * Lock the meta_ac and calculate how many credits we need for reflinking xattrs.
6101 * It is only used for inline xattr and xattr block.
6102 */
6103static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
6104 struct ocfs2_xattr_header *xh,
6105 struct buffer_head *ref_root_bh,
6106 int *credits,
6107 struct ocfs2_alloc_context **meta_ac)
6108{
6109 int ret, meta_add = 0, num_recs = 0;
6110 struct ocfs2_refcount_block *rb =
6111 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
6112
6113 *credits = 0;
6114
6115 ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
6116 &meta_add, credits, &num_recs,
6117 ocfs2_get_xattr_value_root,
6118 NULL);
6119 if (ret) {
6120 mlog_errno(ret);
6121 goto out;
6122 }
6123
6124 /*
6125 * We need to add/modify num_recs in refcount tree, so just calculate
6126 * an approximate number we need for refcount tree change.
6127 * Sometimes we need to split the tree, and after a split half the recs
6128 * move to the new block, so a new block can only provide
6129 * half the number of recs. So we multiply the new blocks by 2.
6130 */
6131 num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2;
6132 meta_add += num_recs;
6133 *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
6134 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
6135 *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
6136 le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
6137 else
6138 *credits += 1;
6139
6140 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
6141 if (ret)
6142 mlog_errno(ret);
6143
6144out:
6145 return ret;
6146}
5282 6147
5283 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, 6148/*
5284 ocfs2_delete_xattr_in_bucket, 6149 * Given an xattr header, reflink all the xattrs in this container.
5285 NULL); 6150 * It can be used for inode, block and bucket.
6151 *
6152 * NOTE:
6153 * Before we call this function, the caller has memcpy the xattr in
6154 * old_xh to the new_xh.
6155 *
6156 * If args.xattr_reflinked is set, call it to decide whether the xe should
6157 * be reflinked or not. If not, remove it from the new xattr header.
6158 */
6159static int ocfs2_reflink_xattr_header(handle_t *handle,
6160 struct ocfs2_xattr_reflink *args,
6161 struct buffer_head *old_bh,
6162 struct ocfs2_xattr_header *xh,
6163 struct buffer_head *new_bh,
6164 struct ocfs2_xattr_header *new_xh,
6165 struct ocfs2_xattr_value_buf *vb,
6166 struct ocfs2_alloc_context *meta_ac,
6167 get_xattr_value_root *func,
6168 void *para)
6169{
6170 int ret = 0, i, j;
6171 struct super_block *sb = args->old_inode->i_sb;
6172 struct buffer_head *value_bh;
6173 struct ocfs2_xattr_entry *xe, *last;
6174 struct ocfs2_xattr_value_root *xv, *new_xv;
6175 struct ocfs2_extent_tree data_et;
6176 u32 clusters, cpos, p_cluster, num_clusters;
6177 unsigned int ext_flags = 0;
6178
6179 mlog(0, "reflink xattr in container %llu, count = %u\n",
6180 (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
6181
6182 last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
6183 for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
6184 xe = &xh->xh_entries[i];
6185
6186 if (args->xattr_reflinked && !args->xattr_reflinked(xe)) {
6187 xe = &new_xh->xh_entries[j];
6188
6189 le16_add_cpu(&new_xh->xh_count, -1);
6190 if (new_xh->xh_count) {
6191 memmove(xe, xe + 1,
6192 (void *)last - (void *)xe);
6193 memset(last, 0,
6194 sizeof(struct ocfs2_xattr_entry));
6195 }
6196
6197 /*
6198 * We don't want j to increase in the next round since
6199 * it has already been moved ahead.
6200 */
6201 j--;
6202 continue;
6203 }
6204
6205 if (ocfs2_xattr_is_local(xe))
6206 continue;
6207
6208 ret = func(sb, old_bh, xh, i, &xv, NULL, para);
6209 if (ret) {
6210 mlog_errno(ret);
6211 break;
6212 }
6213
6214 ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para);
6215 if (ret) {
6216 mlog_errno(ret);
6217 break;
6218 }
6219
6220 /*
6221 * For the xattr which has l_tree_depth = 0, all the extent
6222 * recs have already been copied to the new xh with the
6223 * proper OCFS2_EXT_REFCOUNTED flag; we just need to
6224 * increase the refcount in the refcount tree.
6225 *
6226 * For the xattr which has l_tree_depth > 0, we need
6227 * to initialize it to the empty default value root,
6228 * and then insert the extents one by one.
6229 */
6230 if (xv->xr_list.l_tree_depth) {
6231 memcpy(new_xv, &def_xv, sizeof(def_xv));
6232 vb->vb_xv = new_xv;
6233 vb->vb_bh = value_bh;
6234 ocfs2_init_xattr_value_extent_tree(&data_et,
6235 INODE_CACHE(args->new_inode), vb);
6236 }
6237
6238 clusters = le32_to_cpu(xv->xr_clusters);
6239 cpos = 0;
6240 while (cpos < clusters) {
6241 ret = ocfs2_xattr_get_clusters(args->old_inode,
6242 cpos,
6243 &p_cluster,
6244 &num_clusters,
6245 &xv->xr_list,
6246 &ext_flags);
6247 if (ret) {
6248 mlog_errno(ret);
6249 goto out;
6250 }
6251
6252 BUG_ON(!p_cluster);
6253
6254 if (xv->xr_list.l_tree_depth) {
6255 ret = ocfs2_insert_extent(handle,
6256 &data_et, cpos,
6257 ocfs2_clusters_to_blocks(
6258 args->old_inode->i_sb,
6259 p_cluster),
6260 num_clusters, ext_flags,
6261 meta_ac);
6262 if (ret) {
6263 mlog_errno(ret);
6264 goto out;
6265 }
6266 }
6267
6268 ret = ocfs2_increase_refcount(handle, args->ref_ci,
6269 args->ref_root_bh,
6270 p_cluster, num_clusters,
6271 meta_ac, args->dealloc);
6272 if (ret) {
6273 mlog_errno(ret);
6274 goto out;
6275 }
6276
6277 cpos += num_clusters;
6278 }
6279 }
6280
6281out:
6282 return ret;
6283}
6284
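
The xattr_reflinked predicate gives callers per-entry control over what gets copied. A hypothetical example (not in this patch), assuming the existing ocfs2_xattr_get_type() helper: a filter that drops security xattrs from the reflinked copy:

static int reflink_skip_security(struct ocfs2_xattr_entry *xe)
{
	/* Return non-zero to reflink the entry, zero to make
	 * ocfs2_reflink_xattr_header() drop it from the new header. */
	return ocfs2_xattr_get_type(xe) != OCFS2_XATTR_INDEX_SECURITY;
}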
6285static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
6286{
6287 int ret = 0, credits = 0;
6288 handle_t *handle;
6289 struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
6290 struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
6291 int inline_size = le16_to_cpu(di->i_xattr_inline_size);
6292 int header_off = osb->sb->s_blocksize - inline_size;
6293 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
6294 (args->old_bh->b_data + header_off);
6295 struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
6296 (args->new_bh->b_data + header_off);
6297 struct ocfs2_alloc_context *meta_ac = NULL;
6298 struct ocfs2_inode_info *new_oi;
6299 struct ocfs2_dinode *new_di;
6300 struct ocfs2_xattr_value_buf vb = {
6301 .vb_bh = args->new_bh,
6302 .vb_access = ocfs2_journal_access_di,
6303 };
6304
6305 ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
6306 &credits, &meta_ac);
6307 if (ret) {
6308 mlog_errno(ret);
6309 goto out;
6310 }
6311
6312 handle = ocfs2_start_trans(osb, credits);
6313 if (IS_ERR(handle)) {
6314 ret = PTR_ERR(handle);
6315 mlog_errno(ret);
6316 goto out;
6317 }
6318
6319 ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
6320 args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
6321 if (ret) {
6322 mlog_errno(ret);
6323 goto out_commit;
6324 }
6325
6326 memcpy(args->new_bh->b_data + header_off,
6327 args->old_bh->b_data + header_off, inline_size);
6328
6329 new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
6330 new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
6331
6332 ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
6333 args->new_bh, new_xh, &vb, meta_ac,
6334 ocfs2_get_xattr_value_root, NULL);
6335 if (ret) {
6336 mlog_errno(ret);
6337 goto out_commit;
6338 }
6339
6340 new_oi = OCFS2_I(args->new_inode);
6341 spin_lock(&new_oi->ip_lock);
6342 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
6343 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
6344 spin_unlock(&new_oi->ip_lock);
6345
6346 ocfs2_journal_dirty(handle, args->new_bh);
6347
6348out_commit:
6349 ocfs2_commit_trans(osb, handle);
6350
6351out:
6352 if (meta_ac)
6353 ocfs2_free_alloc_context(meta_ac);
6354 return ret;
6355}
6356
6357static int ocfs2_create_empty_xattr_block(struct inode *inode,
6358 struct buffer_head *fe_bh,
6359 struct buffer_head **ret_bh,
6360 int indexed)
6361{
6362 int ret;
6363 handle_t *handle;
6364 struct ocfs2_alloc_context *meta_ac;
6365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6366
6367 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
6368 if (ret < 0) {
6369 mlog_errno(ret);
6370 return ret;
6371 }
6372
6373 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6374 if (IS_ERR(handle)) {
6375 ret = PTR_ERR(handle);
6376 mlog_errno(ret);
6377 goto out;
6378 }
6379
6380 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6381 (unsigned long long)fe_bh->b_blocknr, indexed);
6382 ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
6383 meta_ac, ret_bh, indexed);
6384 if (ret)
6385 mlog_errno(ret);
6386
6387 ocfs2_commit_trans(osb, handle);
6388out:
6389 ocfs2_free_alloc_context(meta_ac);
6390 return ret;
6391}
6392
6393static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
6394 struct buffer_head *blk_bh,
6395 struct buffer_head *new_blk_bh)
6396{
6397 int ret = 0, credits = 0;
6398 handle_t *handle;
6399 struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
6400 struct ocfs2_dinode *new_di;
6401 struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
6402 int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
6403 struct ocfs2_xattr_block *xb =
6404 (struct ocfs2_xattr_block *)blk_bh->b_data;
6405 struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
6406 struct ocfs2_xattr_block *new_xb =
6407 (struct ocfs2_xattr_block *)new_blk_bh->b_data;
6408 struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
6409 struct ocfs2_alloc_context *meta_ac;
6410 struct ocfs2_xattr_value_buf vb = {
6411 .vb_bh = new_blk_bh,
6412 .vb_access = ocfs2_journal_access_xb,
6413 };
6414
6415 ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
6416 &credits, &meta_ac);
6417 if (ret) {
6418 mlog_errno(ret);
6419 return ret;
6420 }
6421
6422	/* One more credit in case we need to set xattr flags in the new inode. */
6423 handle = ocfs2_start_trans(osb, credits + 1);
6424 if (IS_ERR(handle)) {
6425 ret = PTR_ERR(handle);
6426 mlog_errno(ret);
6427 goto out;
6428 }
6429
6430 if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
6431 ret = ocfs2_journal_access_di(handle,
6432 INODE_CACHE(args->new_inode),
6433 args->new_bh,
6434 OCFS2_JOURNAL_ACCESS_WRITE);
6435 if (ret) {
6436 mlog_errno(ret);
6437 goto out_commit;
6438 }
6439 }
6440
6441 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
6442 new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
6443 if (ret) {
6444 mlog_errno(ret);
6445 goto out_commit;
6446 }
6447
6448 memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
6449 osb->sb->s_blocksize - header_off);
6450
6451 ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
6452 new_blk_bh, new_xh, &vb, meta_ac,
6453 ocfs2_get_xattr_value_root, NULL);
6454 if (ret) {
6455 mlog_errno(ret);
6456 goto out_commit;
6457 }
6458
6459 ocfs2_journal_dirty(handle, new_blk_bh);
6460
6461 if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
6462 new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
6463 spin_lock(&new_oi->ip_lock);
6464 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
6465 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
6466 spin_unlock(&new_oi->ip_lock);
6467
6468 ocfs2_journal_dirty(handle, args->new_bh);
6469 }
6470
6471out_commit:
6472 ocfs2_commit_trans(osb, handle);
6473
6474out:
6475 ocfs2_free_alloc_context(meta_ac);
6476 return ret;
6477}
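Both block-reflink paths above follow the same journaling shape: estimate credits, start a transaction, declare journal access on each buffer before touching it, modify, mark dirty, and commit on every exit path. A stubbed userspace sketch of that control flow; the printf stubs stand in for the real jbd2-backed primitives and carry none of their semantics:

	#include <stdio.h>

	/* Stubs standing in for the journalling primitives used above. */
	static int  start_trans(int credits) { printf("start, %d credits\n", credits); return 0; }
	static int  journal_access(const char *bh) { printf("access %s\n", bh); return 0; }
	static void journal_dirty(const char *bh) { printf("dirty %s\n", bh); }
	static void commit_trans(void) { printf("commit\n"); }

	static int reflink_block(void)
	{
		int ret;

		ret = start_trans(2);               /* credits estimated up front */
		if (ret)
			return ret;
		ret = journal_access("new_blk_bh"); /* declare intent before writing */
		if (ret)
			goto out_commit;
		/* ... memcpy() the xattr header, fix up endian fields ... */
		journal_dirty("new_blk_bh");        /* only after the modification */
	out_commit:
		commit_trans();                     /* commit even on error paths */
		return ret;
	}

	int main(void) { return reflink_block(); }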
6478
6479struct ocfs2_reflink_xattr_tree_args {
6480 struct ocfs2_xattr_reflink *reflink;
6481 struct buffer_head *old_blk_bh;
6482 struct buffer_head *new_blk_bh;
6483 struct ocfs2_xattr_bucket *old_bucket;
6484 struct ocfs2_xattr_bucket *new_bucket;
6485};
6486
6487/*
6488 * NOTE:
6489 * Both the old bucket and the new bucket may call this function to get
6490 * the right ret_bh, so the caller has to pass in the bh that matches the
6491 * bucket.
6492 */
6493static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
6494 struct buffer_head *bh,
6495 struct ocfs2_xattr_header *xh,
6496 int offset,
6497 struct ocfs2_xattr_value_root **xv,
6498 struct buffer_head **ret_bh,
6499 void *para)
6500{
6501 struct ocfs2_reflink_xattr_tree_args *args =
6502 (struct ocfs2_reflink_xattr_tree_args *)para;
6503 struct ocfs2_xattr_bucket *bucket;
6504
6505 if (bh == args->old_bucket->bu_bhs[0])
6506 bucket = args->old_bucket;
6507 else
6508 bucket = args->new_bucket;
6509
6510 return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
6511 xv, ret_bh);
6512}
6513
6514struct ocfs2_value_tree_metas {
6515 int num_metas;
6516 int credits;
6517 int num_recs;
6518};
6519
6520static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
6521 struct buffer_head *bh,
6522 struct ocfs2_xattr_header *xh,
6523 int offset,
6524 struct ocfs2_xattr_value_root **xv,
6525 struct buffer_head **ret_bh,
6526 void *para)
6527{
6528 struct ocfs2_xattr_bucket *bucket =
6529 (struct ocfs2_xattr_bucket *)para;
6530
6531 return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
6532 xv, ret_bh);
6533}
6534
6535static int ocfs2_calc_value_tree_metas(struct inode *inode,
6536 struct ocfs2_xattr_bucket *bucket,
6537 void *para)
6538{
6539 struct ocfs2_value_tree_metas *metas =
6540 (struct ocfs2_value_tree_metas *)para;
6541 struct ocfs2_xattr_header *xh =
6542 (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
6543
6544 /* Add the credits for this bucket first. */
6545 metas->credits += bucket->bu_blocks;
6546 return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0],
6547 xh, &metas->num_metas,
6548 &metas->credits, &metas->num_recs,
6549 ocfs2_value_tree_metas_in_bucket,
6550 bucket);
6551}
6552
6553/*
6554 * Given an xattr extent record starting at blkno and covering len clusters,
6555 * iterate over all the buckets, calculate how much metadata we need for
6556 * reflinking all the ocfs2_xattr_value_roots, and lock the allocators accordingly.
6557 */
6558static int ocfs2_lock_reflink_xattr_rec_allocators(
6559 struct ocfs2_reflink_xattr_tree_args *args,
6560 struct ocfs2_extent_tree *xt_et,
6561 u64 blkno, u32 len, int *credits,
6562 struct ocfs2_alloc_context **meta_ac,
6563 struct ocfs2_alloc_context **data_ac)
6564{
6565 int ret, num_free_extents;
6566 struct ocfs2_value_tree_metas metas;
6567 struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
6568 struct ocfs2_refcount_block *rb;
6569
6570 memset(&metas, 0, sizeof(metas));
6571
6572 ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
6573 ocfs2_calc_value_tree_metas, &metas);
6574 if (ret) {
6575 mlog_errno(ret);
6576 goto out;
6577 }
6578
6579 *credits = metas.credits;
6580
6581	/*
6582	 * Calculate the credits we need for the refcount tree change.
6583	 *
6584	 * We need to add/modify num_recs in the refcount tree, so just
6585	 * calculate an approximate number of credits for that change.
6586	 * Sometimes we need to split the tree, and after the split half of
6587	 * the records move to the new block, so a new block can only hold
6588	 * half the usual number of records. We therefore multiply the new
6589	 * blocks by 2. In the end, we also add credits for modifying the
6590	 * already existing refcount block.
6591	 */
6592 rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data;
6593 metas.num_recs =
6594 (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) /
6595 ocfs2_refcount_recs_per_rb(osb->sb) * 2;
6596 metas.num_metas += metas.num_recs;
6597 *credits += metas.num_recs +
6598 metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
6599 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
6600 *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
6601 le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
6602 else
6603 *credits += 1;
6604
6605 /* count in the xattr tree change. */
6606 num_free_extents = ocfs2_num_free_extents(osb, xt_et);
6607 if (num_free_extents < 0) {
6608 ret = num_free_extents;
6609 mlog_errno(ret);
6610 goto out;
6611 }
6612
6613 if (num_free_extents < len)
6614 metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
6615
6616 *credits += ocfs2_calc_extend_credits(osb->sb,
6617 xt_et->et_root_el, len);
6618
6619 if (metas.num_metas) {
6620 ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
6621 meta_ac);
6622			if (ret) {
6623				mlog_errno(ret);
6624				goto out;
6625			}
6626	}
6627
-5291	ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
-5292				     p_blkno, e_cpos, num_clusters);
6628	if (len) {
6629		ret = ocfs2_reserve_clusters(osb, len, data_ac);
6630		if (ret)
6631			mlog_errno(ret);
6632	}
6633out:
6634 if (ret) {
6635 if (*meta_ac) {
6636 ocfs2_free_alloc_context(*meta_ac);
6637			*meta_ac = NULL;
6638 }
6639 }
6640
6641 return ret;
6642}
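The record estimate above is a round-up division followed by a doubling. Assuming 119 records per refcount block (an illustrative figure; the real value comes from ocfs2_refcount_recs_per_rb()), the arithmetic works out as in this sketch:

	#include <stdio.h>

	#define RECS_PER_RB 119   /* assumed records per refcount block */

	int main(void)
	{
		int num_recs = 300;   /* records we may add/modify */
		int new_blocks;

		/* Round up to whole blocks, then double: after a split each
		 * block is only half full, so it holds half the records. */
		new_blocks = (num_recs + RECS_PER_RB - 1) / RECS_PER_RB * 2;
		printf("%d recs -> reserve %d refcount blocks\n", num_recs, new_blocks);
		return 0;
	}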
6643
6644static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6645 u64 blkno, u64 new_blkno, u32 clusters,
6646 struct ocfs2_alloc_context *meta_ac,
6647 struct ocfs2_alloc_context *data_ac,
6648 struct ocfs2_reflink_xattr_tree_args *args)
6649{
6650 int i, j, ret = 0;
6651 struct super_block *sb = args->reflink->old_inode->i_sb;
6652 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
6653 u32 num_buckets = clusters * bpc;
6654 int bpb = args->old_bucket->bu_blocks;
6655 struct ocfs2_xattr_value_buf vb = {
6656 .vb_access = ocfs2_journal_access,
6657 };
6658
6659 for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
6660 ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
6661		if (ret) {
6662			mlog_errno(ret);
6663			break;
6664		}
6665
-5298		if (e_cpos == 0)
6666		ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
6667		if (ret) {
6668			mlog_errno(ret);
6669			break;
6670		}
6671
-5301		name_hash = e_cpos - 1;
6672		/*
6673		 * The real number of buckets in this series of blocks is
6674		 * stored in the 1st bucket.
6675		 */
6676		if (i == 0)
6677			num_buckets = le16_to_cpu(
6678				bucket_xh(args->old_bucket)->xh_num_buckets);
6679
6680 ret = ocfs2_xattr_bucket_journal_access(handle,
6681 args->new_bucket,
6682 OCFS2_JOURNAL_ACCESS_CREATE);
6683 if (ret) {
6684 mlog_errno(ret);
6685 break;
6686 }
6687
6688 for (j = 0; j < bpb; j++)
6689 memcpy(bucket_block(args->new_bucket, j),
6690 bucket_block(args->old_bucket, j),
6691 sb->s_blocksize);
6692
6693 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
6694
6695 ret = ocfs2_reflink_xattr_header(handle, args->reflink,
6696 args->old_bucket->bu_bhs[0],
6697 bucket_xh(args->old_bucket),
6698 args->new_bucket->bu_bhs[0],
6699 bucket_xh(args->new_bucket),
6700 &vb, meta_ac,
6701 ocfs2_get_reflink_xattr_value_root,
6702 args);
6703 if (ret) {
6704 mlog_errno(ret);
6705 break;
6706 }
6707
6708		/*
6709		 * Re-access and dirty the bucket to calculate metaecc.
6710		 * We may have extended the transaction in reflink_xattr_header,
6711		 * which invalidates the journal access we already had on the block.
6712		 */
6713 ret = ocfs2_xattr_bucket_journal_access(handle,
6714 args->new_bucket,
6715 OCFS2_JOURNAL_ACCESS_WRITE);
6716 if (ret) {
6717 mlog_errno(ret);
6718 break;
6719 }
6720
6721 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
6722 ocfs2_xattr_bucket_relse(args->old_bucket);
6723 ocfs2_xattr_bucket_relse(args->new_bucket);
6724 }
6725
6726 ocfs2_xattr_bucket_relse(args->old_bucket);
6727 ocfs2_xattr_bucket_relse(args->new_bucket);
6728 return ret;
6729}
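The copy loop advances blkno and new_blkno by one bucket's worth of blocks each pass, and trusts the first bucket it reads for the real bucket count. A userspace skeleton of the same iteration, with assumed geometry:

	#include <stdio.h>

	#define BLOCKS_PER_BUCKET 4   /* assumed bucket size in blocks */

	int main(void)
	{
		unsigned long long blkno = 1000, new_blkno = 5000;
		unsigned int num_buckets = 8;   /* upper bound from cluster count */
		unsigned int i;

		for (i = 0; i < num_buckets;
		     i++, blkno += BLOCKS_PER_BUCKET, new_blkno += BLOCKS_PER_BUCKET) {
			if (i == 0)
				num_buckets = 5;  /* real count read from 1st bucket */
			printf("copy bucket %u: %llu -> %llu\n", i, blkno, new_blkno);
		}
		return 0;
	}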
6730/*
6731 * Create the same xattr extent record in the new inode's xattr tree.
6732 */
6733static int ocfs2_reflink_xattr_rec(struct inode *inode,
6734 struct buffer_head *root_bh,
6735 u64 blkno,
6736 u32 cpos,
6737 u32 len,
6738 void *para)
6739{
6740 int ret, credits = 0;
6741 u32 p_cluster, num_clusters;
6742 u64 new_blkno;
6743 handle_t *handle;
6744 struct ocfs2_reflink_xattr_tree_args *args =
6745 (struct ocfs2_reflink_xattr_tree_args *)para;
6746 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6747 struct ocfs2_alloc_context *meta_ac = NULL;
6748 struct ocfs2_alloc_context *data_ac = NULL;
6749 struct ocfs2_extent_tree et;
6750
6751 ocfs2_init_xattr_tree_extent_tree(&et,
6752 INODE_CACHE(args->reflink->new_inode),
6753 args->new_blk_bh);
6754
6755 ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
6756 len, &credits,
6757 &meta_ac, &data_ac);
6758 if (ret) {
6759 mlog_errno(ret);
6760 goto out;
6761 }
6762
6763 handle = ocfs2_start_trans(osb, credits);
6764 if (IS_ERR(handle)) {
6765 ret = PTR_ERR(handle);
6766 mlog_errno(ret);
6767 goto out;
6768 }
6769
6770 ret = ocfs2_claim_clusters(osb, handle, data_ac,
6771 len, &p_cluster, &num_clusters);
6772 if (ret) {
6773 mlog_errno(ret);
6774 goto out_commit;
6775 }
6776
6777 new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
6778
6779 mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
6780 (unsigned long long)blkno, (unsigned long long)new_blkno, len);
6781 ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
6782 meta_ac, data_ac, args);
6783 if (ret) {
6784 mlog_errno(ret);
6785 goto out_commit;
6786 }
6787
6788 mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
6789 (unsigned long long)new_blkno, len, cpos);
6790 ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
6791 len, 0, meta_ac);
6792 if (ret)
6793 mlog_errno(ret);
6794
6795out_commit:
6796 ocfs2_commit_trans(osb, handle);
6797
6798out:
6799 if (meta_ac)
6800 ocfs2_free_alloc_context(meta_ac);
6801 if (data_ac)
6802 ocfs2_free_alloc_context(data_ac);
6803 return ret;
6804}
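ocfs2_clusters_to_blocks() used above is a pure shift by the difference between the cluster and block size bits. With an assumed 4 KB block (12 bits) and 1 MB cluster (20 bits):

	#include <stdio.h>

	int main(void)
	{
		unsigned int cluster_bits = 20, block_bits = 12;  /* assumed sizes */
		unsigned int p_cluster = 37;
		unsigned long long blkno;

		blkno = (unsigned long long)p_cluster << (cluster_bits - block_bits);
		printf("cluster %u starts at block %llu\n", p_cluster, blkno);
		return 0;
	}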
6805
6806/*
6807 * Create reflinked xattr buckets.
6808 * We add the buckets one by one, and take a refcount on every xattr
6809 * value in a bucket that is stored outside the bucket itself.
6810 */
6811static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
6812 struct buffer_head *blk_bh,
6813 struct buffer_head *new_blk_bh)
6814{
6815 int ret;
6816 struct ocfs2_reflink_xattr_tree_args para;
6817
6818 memset(&para, 0, sizeof(para));
6819 para.reflink = args;
6820 para.old_blk_bh = blk_bh;
6821 para.new_blk_bh = new_blk_bh;
6822
6823 para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
6824 if (!para.old_bucket) {
6825 mlog_errno(-ENOMEM);
6826 return -ENOMEM;
6827 }
6828
6829 para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
6830 if (!para.new_bucket) {
6831 ret = -ENOMEM;
6832 mlog_errno(ret);
6833 goto out;
6834 }
6835
6836 ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh,
6837 ocfs2_reflink_xattr_rec,
6838 &para);
6839 if (ret)
6840 mlog_errno(ret);
6841
6842out:
6843 ocfs2_xattr_bucket_free(para.old_bucket);
6844 ocfs2_xattr_bucket_free(para.new_bucket);
6845 return ret;
6846}
6847
6848static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
6849 struct buffer_head *blk_bh)
6850{
6851 int ret, indexed = 0;
6852 struct buffer_head *new_blk_bh = NULL;
6853 struct ocfs2_xattr_block *xb =
6854 (struct ocfs2_xattr_block *)blk_bh->b_data;
6855
6856
6857 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)
6858 indexed = 1;
6859
6860 ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
6861 &new_blk_bh, indexed);
6862 if (ret) {
6863 mlog_errno(ret);
6864 goto out;
6865 }
6866
6867 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
6868 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
6869 else
6870 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
6871 if (ret)
6872 mlog_errno(ret);
6873
6874out:
6875 brelse(new_blk_bh);
6876 return ret;
6877}
6878
6879static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe)
6880{
6881 int type = ocfs2_xattr_get_type(xe);
6882
6883 return type != OCFS2_XATTR_INDEX_SECURITY &&
6884 type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS &&
6885 type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
6886}
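The xattr_reflinked callback is a plain predicate: nonzero means copy the entry, zero means skip it. A userspace analogue that filters attribute names by namespace prefix; the prefixes are the standard Linux xattr namespaces, the rest is illustrative:

	#include <stdio.h>
	#include <string.h>

	/* Skip security- and ACL-related attributes, mirroring
	 * ocfs2_reflink_xattr_no_security() above. */
	static int keep_xattr(const char *name)
	{
		return strncmp(name, "security.", 9) != 0 &&
		       strncmp(name, "system.posix_acl_", 17) != 0;
	}

	int main(void)
	{
		const char *names[] = { "user.comment", "security.selinux",
					"system.posix_acl_access" };
		for (unsigned i = 0; i < 3; i++)
			printf("%-25s %s\n", names[i],
			       keep_xattr(names[i]) ? "reflink" : "skip");
		return 0;
	}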
6887
6888int ocfs2_reflink_xattrs(struct inode *old_inode,
6889 struct buffer_head *old_bh,
6890 struct inode *new_inode,
6891 struct buffer_head *new_bh,
6892 bool preserve_security)
6893{
6894 int ret;
6895 struct ocfs2_xattr_reflink args;
6896 struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
6897 struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
6898 struct buffer_head *blk_bh = NULL;
6899 struct ocfs2_cached_dealloc_ctxt dealloc;
6900 struct ocfs2_refcount_tree *ref_tree;
6901 struct buffer_head *ref_root_bh = NULL;
6902
6903 ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
6904 le64_to_cpu(di->i_refcount_loc),
6905 1, &ref_tree, &ref_root_bh);
6906 if (ret) {
6907 mlog_errno(ret);
6908 goto out;
6909 }
6910
6911 ocfs2_init_dealloc_ctxt(&dealloc);
6912
6913 args.old_inode = old_inode;
6914 args.new_inode = new_inode;
6915 args.old_bh = old_bh;
6916 args.new_bh = new_bh;
6917 args.ref_ci = &ref_tree->rf_ci;
6918 args.ref_root_bh = ref_root_bh;
6919 args.dealloc = &dealloc;
6920 if (preserve_security)
6921 args.xattr_reflinked = NULL;
6922 else
6923 args.xattr_reflinked = ocfs2_reflink_xattr_no_security;
6924
6925 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
6926 ret = ocfs2_reflink_xattr_inline(&args);
6927 if (ret) {
6928 mlog_errno(ret);
6929 goto out_unlock;
6930 }
6931 }
6932
6933 if (!di->i_xattr_loc)
6934 goto out_unlock;
6935
6936 ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
6937 &blk_bh);
6938 if (ret < 0) {
6939 mlog_errno(ret);
6940 goto out_unlock;
6941 }
6942
6943 ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
6944 if (ret)
6945 mlog_errno(ret);
6946
6947 brelse(blk_bh);
6948
6949out_unlock:
6950 ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
6951 ref_tree, 1);
6952 brelse(ref_root_bh);
6953
6954 if (ocfs2_dealloc_has_cluster(&dealloc)) {
6955 ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
6956 ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
6957	}
6958
6959out:
@@ -5306,6 +6961,51 @@ out:
6961}
6962
6963/*
6964 * Initialize the security and ACL attributes for an already created inode.
6965 * Used when reflinking a file without preserving its security attributes.
6966 *
6967 * It uses common APIs like ocfs2_xattr_set, so the caller
6968 * must not hold any locks except i_mutex.
6969 */
6970int ocfs2_init_security_and_acl(struct inode *dir,
6971 struct inode *inode)
6972{
6973 int ret = 0;
6974 struct buffer_head *dir_bh = NULL;
6975 struct ocfs2_security_xattr_info si = {
6976 .enable = 1,
6977 };
6978
6979 ret = ocfs2_init_security_get(inode, dir, &si);
6980 if (!ret) {
6981 ret = ocfs2_xattr_security_set(inode, si.name,
6982 si.value, si.value_len,
6983 XATTR_CREATE);
6984 if (ret) {
6985 mlog_errno(ret);
6986 goto leave;
6987 }
6988 } else if (ret != -EOPNOTSUPP) {
6989 mlog_errno(ret);
6990 goto leave;
6991 }
6992
6993 ret = ocfs2_inode_lock(dir, &dir_bh, 0);
6994 if (ret) {
6995 mlog_errno(ret);
6996 goto leave;
6997 }
6998
6999 ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
7000 if (ret)
7001 mlog_errno(ret);
7002
7003 ocfs2_inode_unlock(dir, 0);
7004 brelse(dir_bh);
7005leave:
7006 return ret;
7007}
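XATTR_CREATE, as used above, makes the set fail with EEXIST rather than replace an existing attribute. The same flag is reachable from userspace via setxattr(2); this sketch assumes a file named testfile on an xattr-capable filesystem:

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(void)
	{
		const char val[] = "demo";

		/* Fails with EEXIST if user.note already exists on the file. */
		if (setxattr("testfile", "user.note", val, sizeof(val) - 1,
			     XATTR_CREATE) != 0)
			perror("setxattr");
		return 0;
	}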
7008/*
7009 * 'security' attributes support
7010 */
7011static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1ca7e9a1b7bc..08e36389f56d 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -55,6 +55,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
55				int, const char *, const void *, size_t, int,
56				struct ocfs2_alloc_context *,
57				struct ocfs2_alloc_context *);
58int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
59					 struct ocfs2_dinode *di);
60int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
61int ocfs2_init_security_get(struct inode *, struct inode *,
62			    struct ocfs2_security_xattr_info *);
@@ -83,5 +85,16 @@ struct ocfs2_xattr_value_buf {
85	struct ocfs2_xattr_value_root	*vb_xv;
86};
87
88int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
89				     struct buffer_head *fe_bh,
90				     struct ocfs2_caching_info *ref_ci,
91				     struct buffer_head *ref_root_bh,
92				     struct ocfs2_cached_dealloc_ctxt *dealloc);
93int ocfs2_reflink_xattrs(struct inode *old_inode,
94			 struct buffer_head *old_bh,
95			 struct inode *new_inode,
96			 struct buffer_head *new_bh,
97			 bool preserve_security);
98int ocfs2_init_security_and_acl(struct inode *dir,
99				struct inode *inode);
100#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c7275cfbdcfb..b42d62419034 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -489,7 +489,7 @@ out:
489	return ret;
490}
491
-492struct inode_operations omfs_dir_inops = {
492const struct inode_operations omfs_dir_inops = {
493	.lookup = omfs_lookup,
494	.mkdir = omfs_mkdir,
495	.rename = omfs_rename,
@@ -498,7 +498,7 @@ struct inode_operations omfs_dir_inops = {
498	.rmdir = omfs_rmdir,
499};
500
-501struct file_operations omfs_dir_operations = {
501const struct file_operations omfs_dir_operations = {
502	.read = generic_read_dir,
503	.readdir = omfs_readdir,
504	.llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d17e774eaf45..399487c09364 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
322	return generic_block_bmap(mapping, block, omfs_get_block);
323}
324
-325struct file_operations omfs_file_operations = {
325const struct file_operations omfs_file_operations = {
326	.llseek = generic_file_llseek,
327	.read = do_sync_read,
328	.write = do_sync_write,
@@ -333,11 +333,11 @@ struct file_operations omfs_file_operations = {
333	.splice_read = generic_file_splice_read,
334};
335
-336struct inode_operations omfs_file_inops = {
336const struct inode_operations omfs_file_inops = {
337	.truncate = omfs_truncate
338};
339
-340struct address_space_operations omfs_aops = {
340const struct address_space_operations omfs_aops = {
341	.readpage = omfs_readpage,
342	.readpages = omfs_readpages,
343	.writepage = omfs_writepage,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 379ae5fb4411..f3b7c1541f3a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
278	return 0;
279}
280
-281static struct super_operations omfs_sops = {
281static const struct super_operations omfs_sops = {
282	.write_inode = omfs_write_inode,
283	.delete_inode = omfs_delete_inode,
284	.put_super = omfs_put_super,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index 2bc0f0670406..ebe2fdbe535e 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,16 +44,16 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
44extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
45
46/* dir.c */
-47extern struct file_operations omfs_dir_operations;
47extern const struct file_operations omfs_dir_operations;
-48extern struct inode_operations omfs_dir_inops;
48extern const struct inode_operations omfs_dir_inops;
49extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
51		u64 fsblock);
52
53/* file.c */
-54extern struct file_operations omfs_file_operations;
54extern const struct file_operations omfs_file_operations;
-55extern struct inode_operations omfs_file_inops;
55extern const struct inode_operations omfs_file_inops;
-56extern struct address_space_operations omfs_aops;
56extern const struct address_space_operations omfs_aops;
57extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
58extern int omfs_shrink_inode(struct inode *inode);
59
diff --git a/fs/open.c b/fs/open.c
index dd98e8076024..4f01e06227c6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -199,7 +199,7 @@ out:
199int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
200	struct file *filp)
201{
-202	int err;
202	int ret;
203	struct iattr newattrs;
204
205	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
214	}
215
216	/* Remove suid/sgid on truncate too */
-217	newattrs.ia_valid |= should_remove_suid(dentry);
217	ret = should_remove_suid(dentry);
218	if (ret)
219		newattrs.ia_valid |= ret | ATTR_FORCE;
220
221	mutex_lock(&dentry->d_inode->i_mutex);
-220	err = notify_change(dentry, &newattrs);
222	ret = notify_change(dentry, &newattrs);
223	mutex_unlock(&dentry->d_inode->i_mutex);
-222	return err;
224	return ret;
225}
226
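The corrected code only ORs the kill bits into ia_valid when should_remove_suid() actually returned some, and adds ATTR_FORCE so notify_change() applies them regardless of the usual permission checks. The effect on the mode itself is plain mask arithmetic; with an assumed 06755 binary:

	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		mode_t mode = 06755;	/* setuid + setgid + 0755 */

		/* what killing suid/sgid does to the mode bits */
		mode &= ~(S_ISUID | S_ISGID);
		printf("mode after truncate: %04o\n", (unsigned)mode);
		return 0;
	}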
225static long do_sys_truncate(const char __user *pathname, loff_t length) 227static long do_sys_truncate(const char __user *pathname, loff_t length)
@@ -288,10 +290,9 @@ out:
288 return error; 290 return error;
289} 291}
290 292
291SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length) 293SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
292{ 294{
293 /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */ 295 return do_sys_truncate(path, length);
294 return do_sys_truncate(path, (long)length);
295} 296}
296 297
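The old cast comment is gone because taking the parameter as long makes the identical conversion explicit in the syscall ABI. What that conversion does to values at or above 2^31 can be seen with 32-bit types; int is used below to model a 32-bit long portably:

	#include <stdio.h>

	int main(void)
	{
		unsigned int ulen = 0x80000000u;  /* 2^31, a 32-bit "unsigned long" */
		int slen = (int)ulen;             /* 32-bit "long": typically wraps
						     to a negative value */

		printf("unsigned %u becomes signed %d\n", ulen, slen);
		return 0;
	}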
297static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) 298static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
@@ -957,6 +958,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
957 int error; 958 int error;
958 struct file *f; 959 struct file *f;
959 960
961 validate_creds(cred);
962
960 /* 963 /*
961 * We must always pass in a valid mount pointer. Historically 964 * We must always pass in a valid mount pointer. Historically
962 * callers got away with not passing it, but we must enforce this at 965 * callers got away with not passing it, but we must enforce this at
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..f38fee0311a7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -302,7 +302,7 @@ static struct attribute_group part_attr_group = {
302	.attrs = part_attrs,
303};
304
-305static struct attribute_group *part_attr_groups[] = {
305static const struct attribute_group *part_attr_groups[] = {
306	&part_attr_group,
307#ifdef CONFIG_BLK_DEV_IO_TRACE
308	&blk_trace_attr_group,
@@ -571,7 +571,7 @@ try_scan:
571	}
572
573	if (from + size > get_capacity(disk)) {
-574		struct block_device_operations *bdops = disk->fops;
574		const struct block_device_operations *bdops = disk->fops;
575		unsigned long long capacity;
576
577		printk(KERN_WARNING
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650bbbb8..07f77a7945c3 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -82,6 +82,7 @@
82#include <linux/pid_namespace.h>
83#include <linux/ptrace.h>
84#include <linux/tracehook.h>
85#include <linux/swapops.h>
86
87#include <asm/pgtable.h>
88#include <asm/processor.h>
@@ -321,6 +322,94 @@ static inline void task_context_switch_counts(struct seq_file *m,
322			p->nivcsw);
323}
324
325#ifdef CONFIG_MMU
326
327struct stack_stats {
328 struct vm_area_struct *vma;
329 unsigned long startpage;
330 unsigned long usage;
331};
332
333static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
334 unsigned long end, struct mm_walk *walk)
335{
336 struct stack_stats *ss = walk->private;
337 struct vm_area_struct *vma = ss->vma;
338 pte_t *pte, ptent;
339 spinlock_t *ptl;
340 int ret = 0;
341
342 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
343 for (; addr != end; pte++, addr += PAGE_SIZE) {
344 ptent = *pte;
345
346#ifdef CONFIG_STACK_GROWSUP
347 if (pte_present(ptent) || is_swap_pte(ptent))
348 ss->usage = addr - ss->startpage + PAGE_SIZE;
349#else
350 if (pte_present(ptent) || is_swap_pte(ptent)) {
351 ss->usage = ss->startpage - addr + PAGE_SIZE;
352 pte++;
353 ret = 1;
354 break;
355 }
356#endif
357 }
358 pte_unmap_unlock(pte - 1, ptl);
359 cond_resched();
360 return ret;
361}
362
363static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
364 struct task_struct *task)
365{
366 struct stack_stats ss;
367 struct mm_walk stack_walk = {
368 .pmd_entry = stack_usage_pte_range,
369 .mm = vma->vm_mm,
370 .private = &ss,
371 };
372
373 if (!vma->vm_mm || is_vm_hugetlb_page(vma))
374 return 0;
375
376 ss.vma = vma;
377 ss.startpage = task->stack_start & PAGE_MASK;
378 ss.usage = 0;
379
380#ifdef CONFIG_STACK_GROWSUP
381 walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
382 &stack_walk);
383#else
384 walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
385 &stack_walk);
386#endif
387 return ss.usage;
388}
389
390static inline void task_show_stack_usage(struct seq_file *m,
391 struct task_struct *task)
392{
393 struct vm_area_struct *vma;
394 struct mm_struct *mm = get_task_mm(task);
395
396 if (mm) {
397 down_read(&mm->mmap_sem);
398 vma = find_vma(mm, task->stack_start);
399 if (vma)
400 seq_printf(m, "Stack usage:\t%lu kB\n",
401 get_stack_usage_in_bytes(vma, task) >> 10);
402
403 up_read(&mm->mmap_sem);
404 mmput(mm);
405 }
406}
407#else
408static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
409{
410}
411#endif /* CONFIG_MMU */
412
413int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
414			struct pid *pid, struct task_struct *task)
415{
@@ -340,6 +429,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
429	task_show_regs(m, task);
430#endif
431	task_context_switch_counts(m, task);
432	task_show_stack_usage(m, task);
433	return 0;
434}
435
@@ -481,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
571		rsslim,
572		mm ? mm->start_code : 0,
573		mm ? mm->end_code : 0,
-484		(permitted && mm) ? mm->start_stack : 0,
574		(permitted) ? task->stack_start : 0,
575		esp,
576		eip,
577		/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6f742f6658a9..837469a96598 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
447
448	do_posix_clock_monotonic_gettime(&uptime);
449	read_lock(&tasklist_lock);
-450	points = badness(task, uptime.tv_sec);
450	points = badness(task->group_leader, uptime.tv_sec);
451	read_unlock(&tasklist_lock);
452	return sprintf(buffer, "%lu\n", points);
453}
@@ -458,7 +458,7 @@ struct limit_names {
458};
459
460static const struct limit_names lnames[RLIM_NLIMITS] = {
-461	[RLIMIT_CPU] = {"Max cpu time", "ms"},
461	[RLIMIT_CPU] = {"Max cpu time", "seconds"},
462	[RLIMIT_FSIZE] = {"Max file size", "bytes"},
463	[RLIMIT_DATA] = {"Max data size", "bytes"},
464	[RLIMIT_STACK] = {"Max stack size", "bytes"},
@@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
999	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
1000	char buffer[PROC_NUMBUF];
1001	size_t len;
-1002	int oom_adjust;
1002	int oom_adjust = OOM_DISABLE;
1003	unsigned long flags;
1004
1005	if (!task)
1006		return -ESRCH;
-1006	oom_adjust = task->oomkilladj;
1007
1008	if (lock_task_sighand(task, &flags)) {
1009		oom_adjust = task->signal->oom_adj;
1010		unlock_task_sighand(task, &flags);
1011	}
1012
1013	put_task_struct(task);
1014
1015	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1015,32 +1021,44 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1021				size_t count, loff_t *ppos)
1022{
1023	struct task_struct *task;
-1018	char buffer[PROC_NUMBUF], *end;
1024	char buffer[PROC_NUMBUF];
-1019	int oom_adjust;
1025	long oom_adjust;
1026	unsigned long flags;
1027	int err;
1028
1029	memset(buffer, 0, sizeof(buffer));
1030	if (count > sizeof(buffer) - 1)
1031		count = sizeof(buffer) - 1;
1032	if (copy_from_user(buffer, buf, count))
1033		return -EFAULT;
-1026	oom_adjust = simple_strtol(buffer, &end, 0);
1034
1035	err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1036	if (err)
1037		return -EINVAL;
1038	if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1039	    oom_adjust != OOM_DISABLE)
1040		return -EINVAL;
-1030	if (*end == '\n')
-1031		end++;
1041
1042	task = get_proc_task(file->f_path.dentry->d_inode);
1043	if (!task)
1044		return -ESRCH;
-1035	if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
1045	if (!lock_task_sighand(task, &flags)) {
1046		put_task_struct(task);
1047		return -ESRCH;
1048	}
1049
1050	if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1051		unlock_task_sighand(task, &flags);
1052		put_task_struct(task);
1053		return -EACCES;
1054	}
-1039	task->oomkilladj = oom_adjust;
1055
1056	task->signal->oom_adj = oom_adjust;
1057
1058	unlock_task_sighand(task, &flags);
1059	put_task_struct(task);
-1041	if (end - buffer == 0)
-1042		return -EIO;
-1043	return end - buffer;
1060
1061	return count;
1062}
1063
1064static const struct file_operations proc_oom_adjust_operations = {
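strict_strtol(), unlike simple_strtol(), rejects trailing junk; combined with strstrip() it tolerates the newline that echo appends and nothing else. The equivalent userspace idiom with strtol(3):

	#include <stdio.h>
	#include <stdlib.h>

	/* Parse a whole string as a long; trailing whitespace-only is OK. */
	static int parse_long_strict(const char *s, long *out)
	{
		char *end;

		*out = strtol(s, &end, 0);
		if (end == s)
			return -1;                      /* no digits at all */
		while (*end == ' ' || *end == '\n' || *end == '\t')
			end++;                          /* allow echo's newline */
		return *end ? -1 : 0;                   /* reject other junk */
	}

	int main(void)
	{
		long v;

		printf("\"12\\n\" -> %d\n", parse_long_strict("12\n", &v));
		printf("\"12x\"  -> %d\n", parse_long_strict("12x", &v));
		return 0;
	}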
@@ -1169,17 +1187,16 @@ static ssize_t proc_fault_inject_write(struct file * file,
1187		count = sizeof(buffer) - 1;
1188	if (copy_from_user(buffer, buf, count))
1189		return -EFAULT;
-1172	make_it_fail = simple_strtol(buffer, &end, 0);
1190	make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
-1173	if (*end == '\n')
1191	if (*end)
-1174		end++;
1192		return -EINVAL;
1193	task = get_proc_task(file->f_dentry->d_inode);
1194	if (!task)
1195		return -ESRCH;
1196	task->make_it_fail = make_it_fail;
1197	put_task_struct(task);
-1180	if (end - buffer == 0)
-1181		return -EIO;
-1182	return end - buffer;
1198
1199	return count;
1200}
1201
1202static const struct file_operations proc_fault_inject_operations = {
@@ -2586,9 +2603,6 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2603		dput(dentry);
2604	}
2605
-2589	if (tgid == 0)
-2590		goto out;
2606	name.name = buf;
2607	name.len = snprintf(buf, sizeof(buf), "%d", tgid);
2608	leader = d_hash_and_lookup(mnt->mnt_root, &name);
@@ -2645,17 +2659,16 @@ out:
2659void proc_flush_task(struct task_struct *task)
2660{
2661	int i;
-2648	struct pid *pid, *tgid = NULL;
2662	struct pid *pid, *tgid;
2663	struct upid *upid;
2664
2665	pid = task_pid(task);
-2652	if (thread_group_leader(task))
-2653		tgid = task_tgid(task);
2666	tgid = task_tgid(task);
2667
2668	for (i = 0; i <= pid->level; i++) {
2669		upid = &pid->numbers[i];
2670		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
-2658				tgid ? tgid->numbers[i].nr : 0);
2671				tgid->numbers[i].nr);
2672	}
2673
2674	upid = &pid->numbers[pid->level];
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 59b43a068872..56013371f9f3 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -17,9 +17,15 @@
17#include <linux/elfcore.h>
18#include <linux/vmalloc.h>
19#include <linux/highmem.h>
20#include <linux/bootmem.h>
21#include <linux/init.h>
22#include <asm/uaccess.h>
23#include <asm/io.h>
24#include <linux/list.h>
25#include <linux/ioport.h>
26#include <linux/mm.h>
27#include <linux/memory.h>
28#include <asm/sections.h>
29
30#define CORE_STR "CORE"
31
@@ -29,17 +35,6 @@
35
36static struct proc_dir_entry *proc_root_kcore;
37
-32static int open_kcore(struct inode * inode, struct file * filp)
-33{
-34	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-35}
-36
-37static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *);
-38
-39static const struct file_operations proc_kcore_operations = {
-40	.read = read_kcore,
-41	.open = open_kcore,
-42};
38
39#ifndef kc_vaddr_to_offset
40#define	kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
@@ -57,18 +52,19 @@ struct memelfnote
52	void *data;
53};
54
-60static struct kcore_list *kclist;
55static LIST_HEAD(kclist_head);
56static DEFINE_RWLOCK(kclist_lock);
57static int kcore_need_update = 1;
58
59void
-64kclist_add(struct kcore_list *new, void *addr, size_t size)
60kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
61{
62	new->addr = (unsigned long)addr;
63	new->size = size;
64	new->type = type;
65
66	write_lock(&kclist_lock);
-70	new->next = kclist;
-71	kclist = new;
67	list_add_tail(&new->list, &kclist_head);
68	write_unlock(&kclist_lock);
69}
70
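The conversion replaces a hand-rolled singly linked list (new->next = kclist; kclist = new) with the kernel's circular doubly linked list; list_add_tail() keeps entries in insertion order. A freestanding miniature of that primitive:

	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };

	#define LIST_HEAD_INIT(name) { &(name), &(name) }

	static void list_add_tail(struct list_head *new, struct list_head *head)
	{
		new->prev = head->prev;
		new->next = head;
		head->prev->next = new;
		head->prev = new;
	}

	struct kcore_ent { struct list_head list; int id; };

	int main(void)
	{
		struct list_head head = LIST_HEAD_INIT(head);
		struct kcore_ent a = { .id = 1 }, b = { .id = 2 };
		struct list_head *p;

		list_add_tail(&a.list, &head);
		list_add_tail(&b.list, &head);
		/* the cast works because list is the first member */
		for (p = head.next; p != &head; p = p->next)
			printf("id %d\n", ((struct kcore_ent *)p)->id);
		return 0;
	}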
@@ -80,7 +76,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
76	*nphdr = 1; /* PT_NOTE */
77	size = 0;
78
-83	for (m=kclist; m; m=m->next) {
79	list_for_each_entry(m, &kclist_head, list) {
80		try = kc_vaddr_to_offset((size_t)m->addr + m->size);
81		if (try > size)
82			size = try;
@@ -97,6 +93,177 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
93	return size + *elf_buflen;
94}
95
96static void free_kclist_ents(struct list_head *head)
97{
98 struct kcore_list *tmp, *pos;
99
100 list_for_each_entry_safe(pos, tmp, head, list) {
101 list_del(&pos->list);
102 kfree(pos);
103 }
104}
105/*
106 * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
107 */
108static void __kcore_update_ram(struct list_head *list)
109{
110 int nphdr;
111 size_t size;
112 struct kcore_list *tmp, *pos;
113 LIST_HEAD(garbage);
114
115 write_lock(&kclist_lock);
116 if (kcore_need_update) {
117 list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
118 if (pos->type == KCORE_RAM
119 || pos->type == KCORE_VMEMMAP)
120 list_move(&pos->list, &garbage);
121 }
122 list_splice_tail(list, &kclist_head);
123 } else
124 list_splice(list, &garbage);
125 kcore_need_update = 0;
126 proc_root_kcore->size = get_kcore_size(&nphdr, &size);
127 write_unlock(&kclist_lock);
128
129 free_kclist_ents(&garbage);
130}
131
132
133#ifdef CONFIG_HIGHMEM
134/*
135 * With HIGHMEM, we can assume the [0...max_low_pfn) range is continuous
136 * memory, because the memory holes there are not as big as in the !HIGHMEM case.
137 * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
138 */
139static int kcore_update_ram(void)
140{
141 LIST_HEAD(head);
142 struct kcore_list *ent;
143 int ret = 0;
144
145 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
146 if (!ent)
147 return -ENOMEM;
148 ent->addr = (unsigned long)__va(0);
149 ent->size = max_low_pfn << PAGE_SHIFT;
150 ent->type = KCORE_RAM;
151 list_add(&ent->list, &head);
152 __kcore_update_ram(&head);
153 return ret;
154}
155
156#else /* !CONFIG_HIGHMEM */
157
158#ifdef CONFIG_SPARSEMEM_VMEMMAP
159/* calculate vmemmap's address from given system ram pfn and register it */
160int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
161{
162 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
163 unsigned long nr_pages = ent->size >> PAGE_SHIFT;
164 unsigned long start, end;
165 struct kcore_list *vmm, *tmp;
166
167
168 start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
169 end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
170 end = ALIGN(end, PAGE_SIZE);
171	/* overlap check (because we have to align the pages) */
172 list_for_each_entry(tmp, head, list) {
173 if (tmp->type != KCORE_VMEMMAP)
174 continue;
175 if (start < tmp->addr + tmp->size)
176 if (end > tmp->addr)
177 end = tmp->addr;
178 }
179 if (start < end) {
180 vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
181 if (!vmm)
182 return 0;
183 vmm->addr = start;
184 vmm->size = end - start;
185 vmm->type = KCORE_VMEMMAP;
186 list_add_tail(&vmm->list, head);
187 }
188 return 1;
189
190}
191#else
192int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
193{
194 return 1;
195}
196
197#endif
198
199static int
200kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
201{
202 struct list_head *head = (struct list_head *)arg;
203 struct kcore_list *ent;
204
205 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
206 if (!ent)
207 return -ENOMEM;
208 ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT));
209 ent->size = nr_pages << PAGE_SHIFT;
210
211	/* Sanity check: can happen on a 32-bit arch... maybe */
212 if (ent->addr < (unsigned long) __va(0))
213 goto free_out;
214
215	/* cut off the not-mapped area; adapted from ppc-32 code */
216 if (ULONG_MAX - ent->addr < ent->size)
217 ent->size = ULONG_MAX - ent->addr;
218
219 /* cut when vmalloc() area is higher than direct-map area */
220 if (VMALLOC_START > (unsigned long)__va(0)) {
221 if (ent->addr > VMALLOC_START)
222 goto free_out;
223 if (VMALLOC_START - ent->addr < ent->size)
224 ent->size = VMALLOC_START - ent->addr;
225 }
226
227 ent->type = KCORE_RAM;
228 list_add_tail(&ent->list, head);
229
230 if (!get_sparsemem_vmemmap_info(ent, head)) {
231 list_del(&ent->list);
232 goto free_out;
233 }
234
235 return 0;
236free_out:
237 kfree(ent);
238 return 1;
239}
240
241static int kcore_update_ram(void)
242{
243 int nid, ret;
244 unsigned long end_pfn;
245 LIST_HEAD(head);
246
247	/* Not initialized; update now */
248 /* find out "max pfn" */
249 end_pfn = 0;
250 for_each_node_state(nid, N_HIGH_MEMORY) {
251 unsigned long node_end;
252 node_end = NODE_DATA(nid)->node_start_pfn +
253 NODE_DATA(nid)->node_spanned_pages;
254 if (end_pfn < node_end)
255 end_pfn = node_end;
256 }
257 /* scan 0 to max_pfn */
258 ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
259 if (ret) {
260 free_kclist_ents(&head);
261 return -ENOMEM;
262 }
263 __kcore_update_ram(&head);
264 return ret;
265}
266#endif /* CONFIG_HIGHMEM */
267
268/*****************************************************************************/
269/*
@@ -192,7 +359,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
359	nhdr->p_align	= 0;
360
361	/* setup ELF PT_LOAD program header for every area */
-195	for (m=kclist; m; m=m->next) {
362	list_for_each_entry(m, &kclist_head, list) {
363		phdr = (struct elf_phdr *) bufp;
364		bufp += sizeof(struct elf_phdr);
365		offset += sizeof(struct elf_phdr);
@@ -265,7 +432,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
432	unsigned long start;
433
434	read_lock(&kclist_lock);
-268	proc_root_kcore->size = size = get_kcore_size(&nphdr, &elf_buflen);
435	size = get_kcore_size(&nphdr, &elf_buflen);
436
437	if (buflen == 0 || *fpos >= size) {
438		read_unlock(&kclist_lock);
439		return 0;
@@ -317,7 +485,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
485		struct kcore_list *m;
486
487		read_lock(&kclist_lock);
-320		for (m=kclist; m; m=m->next) {
488		list_for_each_entry(m, &kclist_head, list) {
489			if (start >= m->addr && start < (m->addr+m->size))
490				break;
491		}
@@ -326,45 +494,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
494		if (m == NULL) {
495			if (clear_user(buffer, tsz))
496				return -EFAULT;
-329		} else if (is_vmalloc_addr((void *)start)) {
497		} else if (is_vmalloc_or_module_addr((void *)start)) {
498			char * elf_buf;
-331			struct vm_struct *m;
-332			unsigned long curstart = start;
-333			unsigned long cursize = tsz;
499
500			elf_buf = kzalloc(tsz, GFP_KERNEL);
501			if (!elf_buf)
502				return -ENOMEM;
503			vread(elf_buf, (char *)start, tsz);
504			/* we have to zero-fill user buffer even if no read */
-339			read_lock(&vmlist_lock);
-340			for (m=vmlist; m && cursize; m=m->next) {
-341				unsigned long vmstart;
-342				unsigned long vmsize;
-343				unsigned long msize = m->size - PAGE_SIZE;
-344
-345				if (((unsigned long)m->addr + msize) <
-346								curstart)
-347					continue;
-348				if ((unsigned long)m->addr > (curstart +
-349								cursize))
-350					break;
-351				vmstart = (curstart < (unsigned long)m->addr ?
-352					(unsigned long)m->addr : curstart);
-353				if (((unsigned long)m->addr + msize) >
-354							(curstart + cursize))
-355					vmsize = curstart + cursize - vmstart;
-356				else
-357					vmsize = (unsigned long)m->addr +
-358						msize - vmstart;
-359				curstart = vmstart + vmsize;
-360				cursize -= vmsize;
-361				/* don't dump ioremap'd stuff! (TA) */
-362				if (m->flags & VM_IOREMAP)
-363					continue;
-364				memcpy(elf_buf + (vmstart - start),
-365					(char *)vmstart, vmsize);
-366			}
-367			read_unlock(&vmlist_lock);
505			if (copy_to_user(buffer, elf_buf, tsz)) {
506				kfree(elf_buf);
507				return -EFAULT;
@@ -402,12 +539,96 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
539	return acc;
540}
541
542
543static int open_kcore(struct inode *inode, struct file *filp)
544{
545 if (!capable(CAP_SYS_RAWIO))
546 return -EPERM;
547 if (kcore_need_update)
548 kcore_update_ram();
549 if (i_size_read(inode) != proc_root_kcore->size) {
550 mutex_lock(&inode->i_mutex);
551 i_size_write(inode, proc_root_kcore->size);
552 mutex_unlock(&inode->i_mutex);
553 }
554 return 0;
555}
556
557
558static const struct file_operations proc_kcore_operations = {
559 .read = read_kcore,
560 .open = open_kcore,
561};
562
563#ifdef CONFIG_MEMORY_HOTPLUG
564/* just remember that we have to update kcore */
565static int __meminit kcore_callback(struct notifier_block *self,
566 unsigned long action, void *arg)
567{
568 switch (action) {
569 case MEM_ONLINE:
570 case MEM_OFFLINE:
571 write_lock(&kclist_lock);
572 kcore_need_update = 1;
573 write_unlock(&kclist_lock);
574 }
575 return NOTIFY_OK;
576}
577#endif
578
579
580static struct kcore_list kcore_vmalloc;
581
582#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
583static struct kcore_list kcore_text;
584/*
585 * If defined, special segment is used for mapping kernel text instead of
586 * direct-map area. We need to create special TEXT section.
587 */
588static void __init proc_kcore_text_init(void)
589{
590 kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
591}
592#else
593static void __init proc_kcore_text_init(void)
594{
595}
596#endif
597
598#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
599/*
600 * MODULES_VADDR has no intersection with VMALLOC_ADDR.
601 */
602struct kcore_list kcore_modules;
603static void __init add_modules_range(void)
604{
605 kclist_add(&kcore_modules, (void *)MODULES_VADDR,
606 MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
607}
608#else
609static void __init add_modules_range(void)
610{
611}
612#endif
613
614static int __init proc_kcore_init(void)
615{
-407	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
-408	if (proc_root_kcore)
-409		proc_root_kcore->size =
-410			(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
616	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
617				      &proc_kcore_operations);
618	if (!proc_root_kcore) {
619		printk(KERN_ERR "couldn't create /proc/kcore\n");
620		return 0; /* Always returns 0. */
621	}
622	/* Store text area if it's special */
623	proc_kcore_text_init();
624	/* Store vmalloc area */
625	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
626		VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
627	add_modules_range();
628	/* Store direct-map area from physical memory map */
629	kcore_update_ram();
630	hotplug_memory_notifier(kcore_callback, 0);
631
632	return 0;
633}
634module_init(proc_kcore_init);
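/proc/kcore presents live kernel memory as an ELF core file, which is why the code above builds one PT_LOAD program header per kclist entry. With root privileges the ELF header can be inspected from userspace; a 64-bit kernel is assumed here:

	#include <stdio.h>
	#include <elf.h>

	int main(void)
	{
		Elf64_Ehdr eh;
		FILE *f = fopen("/proc/kcore", "rb");   /* needs root */

		if (!f || fread(&eh, sizeof(eh), 1, f) != 1) {
			perror("kcore");
			return 1;
		}
		printf("type %u (ET_CORE=%u), phnum %u\n",
		       (unsigned)eh.e_type, (unsigned)ET_CORE,
		       (unsigned)eh.e_phnum);
		fclose(f);
		return 0;
	}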
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d5c410d47fae..c7bff4f603ff 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -81,9 +81,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
81 "Writeback: %8lu kB\n" 81 "Writeback: %8lu kB\n"
82 "AnonPages: %8lu kB\n" 82 "AnonPages: %8lu kB\n"
83 "Mapped: %8lu kB\n" 83 "Mapped: %8lu kB\n"
84 "Shmem: %8lu kB\n"
84 "Slab: %8lu kB\n" 85 "Slab: %8lu kB\n"
85 "SReclaimable: %8lu kB\n" 86 "SReclaimable: %8lu kB\n"
86 "SUnreclaim: %8lu kB\n" 87 "SUnreclaim: %8lu kB\n"
88 "KernelStack: %8lu kB\n"
87 "PageTables: %8lu kB\n" 89 "PageTables: %8lu kB\n"
88#ifdef CONFIG_QUICKLIST 90#ifdef CONFIG_QUICKLIST
89 "Quicklists: %8lu kB\n" 91 "Quicklists: %8lu kB\n"
@@ -95,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
95 "Committed_AS: %8lu kB\n" 97 "Committed_AS: %8lu kB\n"
96 "VmallocTotal: %8lu kB\n" 98 "VmallocTotal: %8lu kB\n"
97 "VmallocUsed: %8lu kB\n" 99 "VmallocUsed: %8lu kB\n"
98 "VmallocChunk: %8lu kB\n", 100 "VmallocChunk: %8lu kB\n"
101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %8lu kB\n"
103#endif
104 ,
99 K(i.totalram), 105 K(i.totalram),
100 K(i.freeram), 106 K(i.freeram),
101 K(i.bufferram), 107 K(i.bufferram),
@@ -124,10 +130,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
130		K(global_page_state(NR_WRITEBACK)),
131		K(global_page_state(NR_ANON_PAGES)),
132		K(global_page_state(NR_FILE_MAPPED)),
133		K(global_page_state(NR_SHMEM)),
134		K(global_page_state(NR_SLAB_RECLAIMABLE) +
135				global_page_state(NR_SLAB_UNRECLAIMABLE)),
136		K(global_page_state(NR_SLAB_RECLAIMABLE)),
137		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
138		global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
139		K(global_page_state(NR_PAGETABLE)),
140#ifdef CONFIG_QUICKLIST
141		K(quicklist_total_size()),
@@ -140,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
148		(unsigned long)VMALLOC_TOTAL >> 10,
149		vmi.used >> 10,
150		vmi.largest_chunk >> 10
151#ifdef CONFIG_MEMORY_FAILURE
152		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif
154		);
155
156	hugetlb_report_meminfo(m);
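The new Shmem and KernelStack counters appear as ordinary "Name: value kB" rows, so line-oriented parsers pick them up unchanged, for example:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		unsigned long kb;
		FILE *f = fopen("/proc/meminfo", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			if (sscanf(line, "KernelStack: %lu kB", &kb) == 1 ||
			    sscanf(line, "Shmem: %lu kB", &kb) == 1)
				printf("%s", line);
		fclose(f);
		return 0;
	}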
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 7e14d1a04001..9fe7d7ebe115 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -109,7 +109,7 @@ static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
109	return rb_next((struct rb_node *) v);
110}
111
-112static struct seq_operations proc_nommu_region_list_seqop = {
112static const struct seq_operations proc_nommu_region_list_seqop = {
113	.start = nommu_region_list_start,
114	.next  = nommu_region_list_next,
115	.stop  = nommu_region_list_stop,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2707c6c7a20f..2281c2cbfe2b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -2,6 +2,7 @@
2#include <linux/compiler.h>
3#include <linux/fs.h>
4#include <linux/init.h>
5#include <linux/ksm.h>
6#include <linux/mm.h>
7#include <linux/mmzone.h>
8#include <linux/proc_fs.h>
@@ -95,6 +96,8 @@ static const struct file_operations proc_kpagecount_operations = {
96#define KPF_UNEVICTABLE		18
97#define KPF_NOPAGE		20
98
99#define KPF_KSM			21
100
101/* kernel hacking assistances
102 * WARNING: subject to change, never rely on them!
103 */
@@ -137,6 +140,8 @@ static u64 get_uflags(struct page *page)
140		u |= 1 << KPF_MMAP;
141	if (PageAnon(page))
142		u |= 1 << KPF_ANON;
143	if (PageKsm(page))
144		u |= 1 << KPF_KSM;
145
146	/*
147	 * compound pages: export both head/tail info
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9b1e4e9a16bf..f667e8aeabdf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
153
154	/* careful: calling conventions are nasty here */
155	res = count;
-156	error = table->proc_handler(table, write, filp, buf, &res, ppos);
156	error = table->proc_handler(table, write, buf, &res, ppos);
157	if (!error)
158		error = res;
159out:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd8be1d235c..2a1bef9203c6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -243,6 +243,25 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
243		} else if (vma->vm_start <= mm->start_stack &&
244			   vma->vm_end >= mm->start_stack) {
245			name = "[stack]";
246 } else {
247 unsigned long stack_start;
248 struct proc_maps_private *pmp;
249
250 pmp = m->private;
251 stack_start = pmp->task->stack_start;
252
253 if (vma->vm_start <= stack_start &&
254 vma->vm_end >= stack_start) {
255 pad_len_spaces(m, len);
256 seq_printf(m,
257 "[threadstack:%08lx]",
258#ifdef CONFIG_STACK_GROWSUP
259 vma->vm_end - stack_start
260#else
261 stack_start - vma->vm_start
262#endif
263 );
264 }
265		}
266	} else {
267		name = "[vdso]";
@@ -465,23 +484,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
465 return 0; 484 return 0;
466} 485}
467 486
487#define CLEAR_REFS_ALL 1
488#define CLEAR_REFS_ANON 2
489#define CLEAR_REFS_MAPPED 3
490
468static ssize_t clear_refs_write(struct file *file, const char __user *buf, 491static ssize_t clear_refs_write(struct file *file, const char __user *buf,
469 size_t count, loff_t *ppos) 492 size_t count, loff_t *ppos)
470{ 493{
471 struct task_struct *task; 494 struct task_struct *task;
472 char buffer[PROC_NUMBUF], *end; 495 char buffer[PROC_NUMBUF];
473 struct mm_struct *mm; 496 struct mm_struct *mm;
474 struct vm_area_struct *vma; 497 struct vm_area_struct *vma;
498 long type;
475 499
476 memset(buffer, 0, sizeof(buffer)); 500 memset(buffer, 0, sizeof(buffer));
477 if (count > sizeof(buffer) - 1) 501 if (count > sizeof(buffer) - 1)
478 count = sizeof(buffer) - 1; 502 count = sizeof(buffer) - 1;
479 if (copy_from_user(buffer, buf, count)) 503 if (copy_from_user(buffer, buf, count))
480 return -EFAULT; 504 return -EFAULT;
481 if (!simple_strtol(buffer, &end, 0)) 505 if (strict_strtol(strstrip(buffer), 10, &type))
506 return -EINVAL;
507 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
482 return -EINVAL; 508 return -EINVAL;
483 if (*end == '\n')
484 end++;
485 task = get_proc_task(file->f_path.dentry->d_inode); 509 task = get_proc_task(file->f_path.dentry->d_inode);
486 if (!task) 510 if (!task)
487 return -ESRCH; 511 return -ESRCH;
@@ -494,18 +518,31 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
494 down_read(&mm->mmap_sem); 518 down_read(&mm->mmap_sem);
495 for (vma = mm->mmap; vma; vma = vma->vm_next) { 519 for (vma = mm->mmap; vma; vma = vma->vm_next) {
496 clear_refs_walk.private = vma; 520 clear_refs_walk.private = vma;
497 if (!is_vm_hugetlb_page(vma)) 521 if (is_vm_hugetlb_page(vma))
498 walk_page_range(vma->vm_start, vma->vm_end, 522 continue;
499 &clear_refs_walk); 523 /*
524 * Writing 1 to /proc/pid/clear_refs affects all pages.
525 *
526 * Writing 2 to /proc/pid/clear_refs only affects
527 * Anonymous pages.
528 *
529 * Writing 3 to /proc/pid/clear_refs only affects file
530 * mapped pages.
531 */
532 if (type == CLEAR_REFS_ANON && vma->vm_file)
533 continue;
534 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
535 continue;
536 walk_page_range(vma->vm_start, vma->vm_end,
537 &clear_refs_walk);
500 } 538 }
501 flush_tlb_mm(mm); 539 flush_tlb_mm(mm);
502 up_read(&mm->mmap_sem); 540 up_read(&mm->mmap_sem);
503 mmput(mm); 541 mmput(mm);
504 } 542 }
505 put_task_struct(task); 543 put_task_struct(task);
506 if (end - buffer == 0) 544
507 return -EIO; 545 return count;
508 return end - buffer;
509} 546}
510 547
511const struct file_operations proc_clear_refs_operations = { 548const struct file_operations proc_clear_refs_operations = {
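
[annotation] Userspace now selects what to clear by the value written, as the comment in the hunk describes. A sketch of the new interface (1 = all pages, 2 = anonymous only, 3 = file-mapped only); the pid handling and error paths are illustrative:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>

static int clear_refs(pid_t pid, int type)	/* 1=all, 2=anon, 3=mapped */
{
	char path[64], val[4];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/%d/clear_refs", pid);
	snprintf(val, sizeof(val), "%d\n", type);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, val, strlen(val)) < 0 ? -1 : 0;
	close(fd);
	return ret;
}
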
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f146..766b1d456050 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -4,13 +4,18 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/seq_file.h> 5#include <linux/seq_file.h>
6#include <linux/time.h> 6#include <linux/time.h>
7#include <linux/kernel_stat.h>
7#include <asm/cputime.h> 8#include <asm/cputime.h>
8 9
9static int uptime_proc_show(struct seq_file *m, void *v) 10static int uptime_proc_show(struct seq_file *m, void *v)
10{ 11{
11 struct timespec uptime; 12 struct timespec uptime;
12 struct timespec idle; 13 struct timespec idle;
13 cputime_t idletime = cputime_add(init_task.utime, init_task.stime); 14 int i;
15 cputime_t idletime = cputime_zero;
16
17 for_each_possible_cpu(i)
18 idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
14 19
15 do_posix_clock_monotonic_gettime(&uptime); 20 do_posix_clock_monotonic_gettime(&uptime);
16 monotonic_to_bootbased(&uptime); 21 monotonic_to_bootbased(&uptime);
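
[annotation] Idle time is now summed over all possible CPUs instead of taken from init_task, so on SMP the second field of /proc/uptime can exceed the first. A quick reader to illustrate:

#include <stdio.h>

int main(void)
{
	double up, idle;
	FILE *f = fopen("/proc/uptime", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lf %lf", &up, &idle) != 2) {
		fclose(f);
		return 1;
	}
	fclose(f);
	/* idle/up approximates how many CPUs' worth of idle time accrued */
	printf("up %.2fs, idle %.2fs (~%.2f CPUs' worth)\n",
	       up, idle, idle / up);
	return 0;
}
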
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index be8e0e1445b6..5f6089994042 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -6,20 +6,9 @@ config QNX4FS_FS
6 QNX 4 and QNX 6 (the latter is also called QNX RTP). 6 QNX 4 and QNX 6 (the latter is also called QNX RTP).
7 Further information is available at <http://www.qnx.com/>. 7 Further information is available at <http://www.qnx.com/>.
8 Say Y if you intend to mount QNX hard disks or floppies. 8 Say Y if you intend to mount QNX hard disks or floppies.
9 Unless you say Y to "QNX4FS read-write support" below, you will
10 only be able to read these file systems.
11 9
12 To compile this file system support as a module, choose M here: the 10 To compile this file system support as a module, choose M here: the
13 module will be called qnx4. 11 module will be called qnx4.
14 12
15 If you don't know whether you need it, then you don't need it: 13 If you don't know whether you need it, then you don't need it:
16 answer N. 14 answer N.
17
18config QNX4FS_RW
19 bool "QNX4FS write support (DANGEROUS)"
20 depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
21 help
22 Say Y if you want to test write support for QNX4 file systems.
23
24 It's currently broken, so for now:
25 answer N.
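
[annotation] With QNX4FS_RW gone, qnx4 is read-only unconditionally; the fill_super and remount hunks below force MS_RDONLY. A sketch of an explicit read-only mount (device and mount point are examples, not from this patch):

#include <sys/mount.h>

int mount_qnx4(void)
{
	/* the kernel would force read-only anyway; request it up front */
	return mount("/dev/sdb1", "/mnt/qnx4", "qnx4", MS_RDONLY, NULL);
}
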
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index e4d408cc5473..4a283b3f87f8 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_QNX4FS_FS) += qnx4.o 5obj-$(CONFIG_QNX4FS_FS) += qnx4.o
6 6
7qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o 7qnx4-objs := inode.o dir.o namei.o bitmap.o
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index e1cd061a25f7..0afba069d567 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -78,84 +78,3 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
78 78
79 return total_free; 79 return total_free;
80} 80}
81
82#ifdef CONFIG_QNX4FS_RW
83
84int qnx4_is_free(struct super_block *sb, long block)
85{
86 int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
87 int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
88 struct buffer_head *bh;
89 const char *g;
90 int ret = -EIO;
91
92 start += block / (QNX4_BLOCK_SIZE * 8);
93 QNX4DEBUG(("qnx4: is_free requesting block [%lu], bitmap in block [%lu]\n",
94 (unsigned long) block, (unsigned long) start));
95 (void) size; /* CHECKME */
96 bh = sb_bread(sb, start);
97 if (bh == NULL) {
98 return -EIO;
99 }
100 g = bh->b_data + (block % QNX4_BLOCK_SIZE);
101 if (((*g) & (1 << (block % 8))) == 0) {
102 QNX4DEBUG(("qnx4: is_free -> block is free\n"));
103 ret = 1;
104 } else {
105 QNX4DEBUG(("qnx4: is_free -> block is busy\n"));
106 ret = 0;
107 }
108 brelse(bh);
109
110 return ret;
111}
112
113int qnx4_set_bitmap(struct super_block *sb, long block, int busy)
114{
115 int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
116 int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
117 struct buffer_head *bh;
118 char *g;
119
120 start += block / (QNX4_BLOCK_SIZE * 8);
121 QNX4DEBUG(("qnx4: set_bitmap requesting block [%lu], bitmap in block [%lu]\n",
122 (unsigned long) block, (unsigned long) start));
123 (void) size; /* CHECKME */
124 bh = sb_bread(sb, start);
125 if (bh == NULL) {
126 return -EIO;
127 }
128 g = bh->b_data + (block % QNX4_BLOCK_SIZE);
129 if (busy == 0) {
130 (*g) &= ~(1 << (block % 8));
131 } else {
132 (*g) |= (1 << (block % 8));
133 }
134 mark_buffer_dirty(bh);
135 brelse(bh);
136
137 return 0;
138}
139
140static void qnx4_clear_inode(struct inode *inode)
141{
142 struct qnx4_inode_entry *qnx4_ino = qnx4_raw_inode(inode);
143 /* What for? */
144 memset(qnx4_ino->di_fname, 0, sizeof qnx4_ino->di_fname);
145 qnx4_ino->di_size = 0;
146 qnx4_ino->di_num_xtnts = 0;
147 qnx4_ino->di_mode = 0;
148 qnx4_ino->di_status = 0;
149}
150
151void qnx4_free_inode(struct inode *inode)
152{
153 if (inode->i_ino < 1) {
154 printk("free_inode: inode 0 or nonexistent inode\n");
155 return;
156 }
157 qnx4_clear_inode(inode);
158 clear_inode(inode);
159}
160
161#endif
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 003c68f3238b..86cc39cb1398 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -85,9 +85,4 @@ const struct file_operations qnx4_dir_operations =
85const struct inode_operations qnx4_dir_inode_operations = 85const struct inode_operations qnx4_dir_inode_operations =
86{ 86{
87 .lookup = qnx4_lookup, 87 .lookup = qnx4_lookup,
88#ifdef CONFIG_QNX4FS_RW
89 .create = qnx4_create,
90 .unlink = qnx4_unlink,
91 .rmdir = qnx4_rmdir,
92#endif
93}; 88};
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
deleted file mode 100644
index 09b170ac936c..000000000000
--- a/fs/qnx4/file.c
+++ /dev/null
@@ -1,40 +0,0 @@
1/*
2 * QNX4 file system, Linux implementation.
3 *
4 * Version : 0.2.1
5 *
6 * Using parts of the xiafs filesystem.
7 *
8 * History :
9 *
10 * 25-05-1998 by Richard Frowijn : first release.
11 * 21-06-1998 by Frank Denis : wrote qnx4_readpage to use generic_file_read.
12 * 27-06-1998 by Frank Denis : file overwriting.
13 */
14
15#include "qnx4.h"
16
17/*
18 * We have mostly NULL's here: the current defaults are ok for
19 * the qnx4 filesystem.
20 */
21const struct file_operations qnx4_file_operations =
22{
23 .llseek = generic_file_llseek,
24 .read = do_sync_read,
25 .aio_read = generic_file_aio_read,
26 .mmap = generic_file_mmap,
27 .splice_read = generic_file_splice_read,
28#ifdef CONFIG_QNX4FS_RW
29 .write = do_sync_write,
30 .aio_write = generic_file_aio_write,
31 .fsync = simple_fsync,
32#endif
33};
34
35const struct inode_operations qnx4_file_inode_operations =
36{
37#ifdef CONFIG_QNX4FS_RW
38 .truncate = qnx4_truncate,
39#endif
40};
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 681df5fcd161..d2cd1798d8c4 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -28,73 +28,6 @@
28 28
29static const struct super_operations qnx4_sops; 29static const struct super_operations qnx4_sops;
30 30
31#ifdef CONFIG_QNX4FS_RW
32
33static void qnx4_delete_inode(struct inode *inode)
34{
35 QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
36 truncate_inode_pages(&inode->i_data, 0);
37 inode->i_size = 0;
38 qnx4_truncate(inode);
39 lock_kernel();
40 qnx4_free_inode(inode);
41 unlock_kernel();
42}
43
44static int qnx4_write_inode(struct inode *inode, int do_sync)
45{
46 struct qnx4_inode_entry *raw_inode;
47 int block, ino;
48 struct buffer_head *bh;
49 ino = inode->i_ino;
50
51 QNX4DEBUG(("qnx4: write inode 1.\n"));
52 if (inode->i_nlink == 0) {
53 return 0;
54 }
55 if (!ino) {
56 printk("qnx4: bad inode number on dev %s: %d is out of range\n",
57 inode->i_sb->s_id, ino);
58 return -EIO;
59 }
60 QNX4DEBUG(("qnx4: write inode 2.\n"));
61 block = ino / QNX4_INODES_PER_BLOCK;
62 lock_kernel();
63 if (!(bh = sb_bread(inode->i_sb, block))) {
64 printk("qnx4: major problem: unable to read inode from dev "
65 "%s\n", inode->i_sb->s_id);
66 unlock_kernel();
67 return -EIO;
68 }
69 raw_inode = ((struct qnx4_inode_entry *) bh->b_data) +
70 (ino % QNX4_INODES_PER_BLOCK);
71 raw_inode->di_mode = cpu_to_le16(inode->i_mode);
72 raw_inode->di_uid = cpu_to_le16(fs_high2lowuid(inode->i_uid));
73 raw_inode->di_gid = cpu_to_le16(fs_high2lowgid(inode->i_gid));
74 raw_inode->di_nlink = cpu_to_le16(inode->i_nlink);
75 raw_inode->di_size = cpu_to_le32(inode->i_size);
76 raw_inode->di_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
77 raw_inode->di_atime = cpu_to_le32(inode->i_atime.tv_sec);
78 raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
79 raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
80 mark_buffer_dirty(bh);
81 if (do_sync) {
82 sync_dirty_buffer(bh);
83 if (buffer_req(bh) && !buffer_uptodate(bh)) {
84 printk("qnx4: IO error syncing inode [%s:%08x]\n",
85 inode->i_sb->s_id, ino);
86 brelse(bh);
87 unlock_kernel();
88 return -EIO;
89 }
90 }
91 brelse(bh);
92 unlock_kernel();
93 return 0;
94}
95
96#endif
97
98static void qnx4_put_super(struct super_block *sb); 31static void qnx4_put_super(struct super_block *sb);
99static struct inode *qnx4_alloc_inode(struct super_block *sb); 32static struct inode *qnx4_alloc_inode(struct super_block *sb);
100static void qnx4_destroy_inode(struct inode *inode); 33static void qnx4_destroy_inode(struct inode *inode);
@@ -108,10 +41,6 @@ static const struct super_operations qnx4_sops =
108 .put_super = qnx4_put_super, 41 .put_super = qnx4_put_super,
109 .statfs = qnx4_statfs, 42 .statfs = qnx4_statfs,
110 .remount_fs = qnx4_remount, 43 .remount_fs = qnx4_remount,
111#ifdef CONFIG_QNX4FS_RW
112 .write_inode = qnx4_write_inode,
113 .delete_inode = qnx4_delete_inode,
114#endif
115}; 44};
116 45
117static int qnx4_remount(struct super_block *sb, int *flags, char *data) 46static int qnx4_remount(struct super_block *sb, int *flags, char *data)
@@ -120,15 +49,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
120 49
121 qs = qnx4_sb(sb); 50 qs = qnx4_sb(sb);
122 qs->Version = QNX4_VERSION; 51 qs->Version = QNX4_VERSION;
123#ifndef CONFIG_QNX4FS_RW
124 *flags |= MS_RDONLY; 52 *flags |= MS_RDONLY;
125#endif
126 if (*flags & MS_RDONLY) {
127 return 0;
128 }
129
130 mark_buffer_dirty(qs->sb_buf);
131
132 return 0; 53 return 0;
133} 54}
134 55
@@ -354,9 +275,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
354 } 275 }
355 s->s_op = &qnx4_sops; 276 s->s_op = &qnx4_sops;
356 s->s_magic = QNX4_SUPER_MAGIC; 277 s->s_magic = QNX4_SUPER_MAGIC;
357#ifndef CONFIG_QNX4FS_RW
358 s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ 278 s->s_flags |= MS_RDONLY; /* Yup, read-only yet */
359#endif
360 qnx4_sb(s)->sb_buf = bh; 279 qnx4_sb(s)->sb_buf = bh;
361 qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; 280 qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data;
362 281
@@ -489,8 +408,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
489 408
490 memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); 409 memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
491 if (S_ISREG(inode->i_mode)) { 410 if (S_ISREG(inode->i_mode)) {
492 inode->i_op = &qnx4_file_inode_operations; 411 inode->i_fop = &generic_ro_fops;
493 inode->i_fop = &qnx4_file_operations;
494 inode->i_mapping->a_ops = &qnx4_aops; 412 inode->i_mapping->a_ops = &qnx4_aops;
495 qnx4_i(inode)->mmu_private = inode->i_size; 413 qnx4_i(inode)->mmu_private = inode->i_size;
496 } else if (S_ISDIR(inode->i_mode)) { 414 } else if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 5972ed214937..ae1e7edbacd6 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -134,108 +134,3 @@ out:
134 134
135 return NULL; 135 return NULL;
136} 136}
137
138#ifdef CONFIG_QNX4FS_RW
139int qnx4_create(struct inode *dir, struct dentry *dentry, int mode,
140 struct nameidata *nd)
141{
142 QNX4DEBUG(("qnx4: qnx4_create\n"));
143 if (dir == NULL) {
144 return -ENOENT;
145 }
146 return -ENOSPC;
147}
148
149int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
150{
151 struct buffer_head *bh;
152 struct qnx4_inode_entry *de;
153 struct inode *inode;
154 int retval;
155 int ino;
156
157 QNX4DEBUG(("qnx4: qnx4_rmdir [%s]\n", dentry->d_name.name));
158 lock_kernel();
159 bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name,
160 &de, &ino);
161 if (bh == NULL) {
162 unlock_kernel();
163 return -ENOENT;
164 }
165 inode = dentry->d_inode;
166 if (inode->i_ino != ino) {
167 retval = -EIO;
168 goto end_rmdir;
169 }
170#if 0
171 if (!empty_dir(inode)) {
172 retval = -ENOTEMPTY;
173 goto end_rmdir;
174 }
175#endif
176 if (inode->i_nlink != 2) {
177 QNX4DEBUG(("empty directory has nlink!=2 (%d)\n", inode->i_nlink));
178 }
179 QNX4DEBUG(("qnx4: deleting directory\n"));
180 de->di_status = 0;
181 memset(de->di_fname, 0, sizeof de->di_fname);
182 de->di_mode = 0;
183 mark_buffer_dirty_inode(bh, dir);
184 clear_nlink(inode);
185 mark_inode_dirty(inode);
186 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
187 inode_dec_link_count(dir);
188 retval = 0;
189
190 end_rmdir:
191 brelse(bh);
192
193 unlock_kernel();
194 return retval;
195}
196
197int qnx4_unlink(struct inode *dir, struct dentry *dentry)
198{
199 struct buffer_head *bh;
200 struct qnx4_inode_entry *de;
201 struct inode *inode;
202 int retval;
203 int ino;
204
205 QNX4DEBUG(("qnx4: qnx4_unlink [%s]\n", dentry->d_name.name));
206 lock_kernel();
207 bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name,
208 &de, &ino);
209 if (bh == NULL) {
210 unlock_kernel();
211 return -ENOENT;
212 }
213 inode = dentry->d_inode;
214 if (inode->i_ino != ino) {
215 retval = -EIO;
216 goto end_unlink;
217 }
218 retval = -EPERM;
219 if (!inode->i_nlink) {
220 QNX4DEBUG(("Deleting nonexistent file (%s:%lu), %d\n",
221 inode->i_sb->s_id,
222 inode->i_ino, inode->i_nlink));
223 inode->i_nlink = 1;
224 }
225 de->di_status = 0;
226 memset(de->di_fname, 0, sizeof de->di_fname);
227 de->di_mode = 0;
228 mark_buffer_dirty_inode(bh, dir);
229 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
230 mark_inode_dirty(dir);
231 inode->i_ctime = dir->i_ctime;
232 inode_dec_link_count(inode);
233 retval = 0;
234
235end_unlink:
236 unlock_kernel();
237 brelse(bh);
238
239 return retval;
240}
241#endif
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 9efc089454f6..33a60858203b 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -29,17 +29,9 @@ extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
29 29
30extern struct buffer_head *qnx4_bread(struct inode *, int, int); 30extern struct buffer_head *qnx4_bread(struct inode *, int, int);
31 31
32extern const struct inode_operations qnx4_file_inode_operations;
33extern const struct inode_operations qnx4_dir_inode_operations; 32extern const struct inode_operations qnx4_dir_inode_operations;
34extern const struct file_operations qnx4_file_operations;
35extern const struct file_operations qnx4_dir_operations; 33extern const struct file_operations qnx4_dir_operations;
36extern int qnx4_is_free(struct super_block *sb, long block); 34extern int qnx4_is_free(struct super_block *sb, long block);
37extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
38extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
39extern void qnx4_truncate(struct inode *inode);
40extern void qnx4_free_inode(struct inode *inode);
41extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
42extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
43 35
44static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) 36static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
45{ 37{
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
deleted file mode 100644
index d94d9ee241fe..000000000000
--- a/fs/qnx4/truncate.c
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * QNX4 file system, Linux implementation.
3 *
4 * Version : 0.1
5 *
6 * Using parts of the xiafs filesystem.
7 *
8 * History :
9 *
10 * 30-06-1998 by Frank DENIS : ugly filler.
11 */
12
13#include <linux/smp_lock.h>
14#include "qnx4.h"
15
16#ifdef CONFIG_QNX4FS_RW
17
18void qnx4_truncate(struct inode *inode)
19{
20 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
21 S_ISLNK(inode->i_mode))) {
22 return;
23 }
24 lock_kernel();
25 if (!(S_ISDIR(inode->i_mode))) {
26 /* TODO */
27 }
28 QNX4DEBUG(("qnx4: qnx4_truncate called\n"));
29 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
30 mark_inode_dirty(inode);
31 unlock_kernel();
32}
33
34#endif
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 38f7bd559f35..39b49c42a7ed 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1839,7 +1839,7 @@ EXPORT_SYMBOL(dquot_commit_info);
1839/* 1839/*
1840 * Definitions of diskquota operations. 1840 * Definitions of diskquota operations.
1841 */ 1841 */
1842struct dquot_operations dquot_operations = { 1842const struct dquot_operations dquot_operations = {
1843 .initialize = dquot_initialize, 1843 .initialize = dquot_initialize,
1844 .drop = dquot_drop, 1844 .drop = dquot_drop,
1845 .alloc_space = dquot_alloc_space, 1845 .alloc_space = dquot_alloc_space,
@@ -2461,7 +2461,7 @@ out:
2461} 2461}
2462EXPORT_SYMBOL(vfs_set_dqinfo); 2462EXPORT_SYMBOL(vfs_set_dqinfo);
2463 2463
2464struct quotactl_ops vfs_quotactl_ops = { 2464const struct quotactl_ops vfs_quotactl_ops = {
2465 .quota_on = vfs_quota_on, 2465 .quota_on = vfs_quota_on,
2466 .quota_off = vfs_quota_off, 2466 .quota_off = vfs_quota_off,
2467 .quota_sync = vfs_quota_sync, 2467 .quota_sync = vfs_quota_sync,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 11f0c06316de..32fae4040ebf 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -69,14 +69,11 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
69 /* make various checks */ 69 /* make various checks */
70 order = get_order(newsize); 70 order = get_order(newsize);
71 if (unlikely(order >= MAX_ORDER)) 71 if (unlikely(order >= MAX_ORDER))
72 goto too_big; 72 return -EFBIG;
73 73
74 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 74 ret = inode_newsize_ok(inode, newsize);
75 if (limit != RLIM_INFINITY && newsize > limit) 75 if (ret)
76 goto fsize_exceeded; 76 return ret;
77
78 if (newsize > inode->i_sb->s_maxbytes)
79 goto too_big;
80 77
81 i_size_write(inode, newsize); 78 i_size_write(inode, newsize);
82 79
@@ -118,12 +115,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
118 115
119 return 0; 116 return 0;
120 117
121 fsize_exceeded: 118add_error:
122 send_sig(SIGXFSZ, current, 0);
123 too_big:
124 return -EFBIG;
125
126 add_error:
127 while (loop < npages) 119 while (loop < npages)
128 __free_page(pages + loop++); 120 __free_page(pages + loop++);
129 return ret; 121 return ret;
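
[annotation] inode_newsize_ok() centralizes the RLIMIT_FSIZE and s_maxbytes checks (including sending SIGXFSZ) that this function used to open-code. A sketch of the resulting caller pattern; demo_grow is hypothetical and assumes the caller already holds the locking i_size_write() requires:

#include <linux/fs.h>

static int demo_grow(struct inode *inode, loff_t newsize)
{
	int ret = inode_newsize_ok(inode, newsize);

	if (ret)
		return ret;	/* -EFBIG etc.; SIGXFSZ already sent */
	i_size_write(inode, newsize);
	return 0;
}
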
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a6090aa1a7c1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,18 +34,17 @@
34#include <linux/ramfs.h> 34#include <linux/ramfs.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/magic.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38#include "internal.h" 39#include "internal.h"
39 40
40/* some random number */
41#define RAMFS_MAGIC 0x858458f6
42
43#define RAMFS_DEFAULT_MODE 0755 41#define RAMFS_DEFAULT_MODE 0755
44 42
45static const struct super_operations ramfs_ops; 43static const struct super_operations ramfs_ops;
46static const struct inode_operations ramfs_dir_inode_operations; 44static const struct inode_operations ramfs_dir_inode_operations;
47 45
48static struct backing_dev_info ramfs_backing_dev_info = { 46static struct backing_dev_info ramfs_backing_dev_info = {
47 .name = "ramfs",
49 .ra_pages = 0, /* No readahead */ 48 .ra_pages = 0, /* No readahead */
50 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | 49 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
51 BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | 50 BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/read_write.c b/fs/read_write.c
index 6c8c55dec2bc..3ac28987f22a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -839,9 +839,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
839 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 839 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
840 840
841 pos = *ppos; 841 pos = *ppos;
842 retval = -EINVAL;
843 if (unlikely(pos < 0))
844 goto fput_out;
845 if (unlikely(pos + count > max)) { 842 if (unlikely(pos + count > max)) {
846 retval = -EOVERFLOW; 843 retval = -EOVERFLOW;
847 if (pos >= max) 844 if (pos >= max)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7adea74d6a8a..f0ad05f38022 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -612,7 +612,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *);
612static int reiserfs_write_info(struct super_block *, int); 612static int reiserfs_write_info(struct super_block *, int);
613static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 613static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
614 614
615static struct dquot_operations reiserfs_quota_operations = { 615static const struct dquot_operations reiserfs_quota_operations = {
616 .initialize = dquot_initialize, 616 .initialize = dquot_initialize,
617 .drop = dquot_drop, 617 .drop = dquot_drop,
618 .alloc_space = dquot_alloc_space, 618 .alloc_space = dquot_alloc_space,
@@ -629,7 +629,7 @@ static struct dquot_operations reiserfs_quota_operations = {
629 .destroy_dquot = dquot_destroy, 629 .destroy_dquot = dquot_destroy,
630}; 630};
631 631
632static struct quotactl_ops reiserfs_qctl_operations = { 632static const struct quotactl_ops reiserfs_qctl_operations = {
633 .quota_on = reiserfs_quota_on, 633 .quota_on = reiserfs_quota_on,
634 .quota_off = vfs_quota_off, 634 .quota_off = vfs_quota_off,
635 .quota_sync = vfs_quota_sync, 635 .quota_sync = vfs_quota_sync,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 4ab3c03d8f95..c117fa80d1e9 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -284,7 +284,7 @@ static const struct file_operations romfs_dir_operations = {
284 .readdir = romfs_readdir, 284 .readdir = romfs_readdir,
285}; 285};
286 286
287static struct inode_operations romfs_dir_inode_operations = { 287static const struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup, 288 .lookup = romfs_lookup,
289}; 289};
290 290
@@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; 528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
529 529
530 root = romfs_iget(sb, pos); 530 root = romfs_iget(sb, pos);
531 if (!root) 531 if (IS_ERR(root))
532 goto error; 532 goto error;
533 533
534 sb->s_root = d_alloc_root(root); 534 sb->s_root = d_alloc_root(root);
diff --git a/fs/select.c b/fs/select.c
index 8084834e123e..fd38ce2e32e3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/sched.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19#include <linux/module.h> 20#include <linux/module.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
@@ -41,22 +42,28 @@
41 * better solutions.. 42 * better solutions..
42 */ 43 */
43 44
45#define MAX_SLACK (100 * NSEC_PER_MSEC)
46
44static long __estimate_accuracy(struct timespec *tv) 47static long __estimate_accuracy(struct timespec *tv)
45{ 48{
46 long slack; 49 long slack;
47 int divfactor = 1000; 50 int divfactor = 1000;
48 51
52 if (tv->tv_sec < 0)
53 return 0;
54
49 if (task_nice(current) > 0) 55 if (task_nice(current) > 0)
50 divfactor = divfactor / 5; 56 divfactor = divfactor / 5;
51 57
58 if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
59 return MAX_SLACK;
60
52 slack = tv->tv_nsec / divfactor; 61 slack = tv->tv_nsec / divfactor;
53 slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); 62 slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
54 63
55 if (slack > 100 * NSEC_PER_MSEC) 64 if (slack > MAX_SLACK)
56 slack = 100 * NSEC_PER_MSEC; 65 return MAX_SLACK;
57 66
58 if (slack < 0)
59 slack = 0;
60 return slack; 67 return slack;
61} 68}
62 69
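
[annotation] The rewrite clamps before multiplying, so a huge tv_sec can no longer overflow the slack computation, and the old slack < 0 test becomes unnecessary. A userspace transcription to check the arithmetic (divfactor 1000 is the normal-nice case):

#include <stdio.h>

#define NSEC_PER_SEC  1000000000L
#define NSEC_PER_MSEC 1000000L
#define MAX_SLACK     (100 * NSEC_PER_MSEC)

static long estimate_slack(long sec, long nsec, int divfactor)
{
	if (sec < 0)
		return 0;
	/* clamp early: avoids overflowing sec * (NSEC_PER_SEC/divfactor) */
	if (sec > MAX_SLACK / (NSEC_PER_SEC / divfactor))
		return MAX_SLACK;
	return nsec / divfactor + sec * (NSEC_PER_SEC / divfactor);
}

int main(void)
{
	printf("%ld ns\n", estimate_slack(10, 0, 1000)); /* 10000000 */
	return 0;
}
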
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 6c959275f2d0..eae7d9dbf3ff 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path);
429 */ 429 */
430int seq_path(struct seq_file *m, struct path *path, char *esc) 430int seq_path(struct seq_file *m, struct path *path, char *esc)
431{ 431{
432 if (m->count < m->size) { 432 char *buf;
433 char *s = m->buf + m->count; 433 size_t size = seq_get_buf(m, &buf);
434 char *p = d_path(path, s, m->size - m->count); 434 int res = -1;
435
436 if (size) {
437 char *p = d_path(path, buf, size);
435 if (!IS_ERR(p)) { 438 if (!IS_ERR(p)) {
436 s = mangle_path(s, p, esc); 439 char *end = mangle_path(buf, p, esc);
437 if (s) { 440 if (end)
438 p = m->buf + m->count; 441 res = end - buf;
439 m->count = s - m->buf;
440 return s - p;
441 }
442 } 442 }
443 } 443 }
444 m->count = m->size; 444 seq_commit(m, res);
445 return -1; 445
446 return res;
446} 447}
447EXPORT_SYMBOL(seq_path); 448EXPORT_SYMBOL(seq_path);
448 449
@@ -454,26 +455,28 @@ EXPORT_SYMBOL(seq_path);
454int seq_path_root(struct seq_file *m, struct path *path, struct path *root, 455int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
455 char *esc) 456 char *esc)
456{ 457{
457 int err = -ENAMETOOLONG; 458 char *buf;
458 if (m->count < m->size) { 459 size_t size = seq_get_buf(m, &buf);
459 char *s = m->buf + m->count; 460 int res = -ENAMETOOLONG;
461
462 if (size) {
460 char *p; 463 char *p;
461 464
462 spin_lock(&dcache_lock); 465 spin_lock(&dcache_lock);
463 p = __d_path(path, root, s, m->size - m->count); 466 p = __d_path(path, root, buf, size);
464 spin_unlock(&dcache_lock); 467 spin_unlock(&dcache_lock);
465 err = PTR_ERR(p); 468 res = PTR_ERR(p);
466 if (!IS_ERR(p)) { 469 if (!IS_ERR(p)) {
467 s = mangle_path(s, p, esc); 470 char *end = mangle_path(buf, p, esc);
468 if (s) { 471 if (end)
469 p = m->buf + m->count; 472 res = end - buf;
470 m->count = s - m->buf; 473 else
471 return 0; 474 res = -ENAMETOOLONG;
472 }
473 } 475 }
474 } 476 }
475 m->count = m->size; 477 seq_commit(m, res);
476 return err; 478
479 return res < 0 ? res : 0;
477} 480}
478 481
479/* 482/*
@@ -481,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
481 */ 484 */
482int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) 485int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
483{ 486{
484 if (m->count < m->size) { 487 char *buf;
485 char *s = m->buf + m->count; 488 size_t size = seq_get_buf(m, &buf);
486 char *p = dentry_path(dentry, s, m->size - m->count); 489 int res = -1;
490
491 if (size) {
492 char *p = dentry_path(dentry, buf, size);
487 if (!IS_ERR(p)) { 493 if (!IS_ERR(p)) {
488 s = mangle_path(s, p, esc); 494 char *end = mangle_path(buf, p, esc);
489 if (s) { 495 if (end)
490 p = m->buf + m->count; 496 res = end - buf;
491 m->count = s - m->buf;
492 return s - p;
493 }
494 } 497 }
495 } 498 }
496 m->count = m->size; 499 seq_commit(m, res);
497 return -1; 500
501 return res;
498} 502}
499 503
500int seq_bitmap(struct seq_file *m, const unsigned long *bits, 504int seq_bitmap(struct seq_file *m, const unsigned long *bits,
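
[annotation] All three helpers now use the new seq_get_buf()/seq_commit() pair instead of poking m->buf and m->count directly. A sketch of the pattern with a hypothetical emitter:

#include <linux/seq_file.h>
#include <linux/string.h>

static void demo_emit(struct seq_file *m, const char *str)
{
	char *buf;
	size_t size = seq_get_buf(m, &buf);	/* free tail of buffer */
	int res = -1;

	if (size) {
		size_t len = strlcpy(buf, str, size);

		if (len < size)
			res = len;	/* bytes actually written */
	}
	seq_commit(m, res);	/* -1 makes seq_read retry with more room */
}
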
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1402d2d54f52..1c4c8f089970 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -459,14 +459,8 @@ smb_show_options(struct seq_file *s, struct vfsmount *m)
459static void 459static void
460smb_unload_nls(struct smb_sb_info *server) 460smb_unload_nls(struct smb_sb_info *server)
461{ 461{
462 if (server->remote_nls) { 462 unload_nls(server->remote_nls);
463 unload_nls(server->remote_nls); 463 unload_nls(server->local_nls);
464 server->remote_nls = NULL;
465 }
466 if (server->local_nls) {
467 unload_nls(server->local_nls);
468 server->local_nls = NULL;
469 }
470} 464}
471 465
472static void 466static void
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 9468168b9af5..71c29b6670b4 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -509,7 +509,7 @@ date_unix2dos(struct smb_sb_info *server,
509 month = 2; 509 month = 2;
510 } else { 510 } else {
511 nl_day = (year & 3) || day <= 59 ? day : day - 1; 511 nl_day = (year & 3) || day <= 59 ? day : day - 1;
512 for (month = 0; month < 12; month++) 512 for (month = 1; month < 12; month++)
513 if (day_n[month] > nl_day) 513 if (day_n[month] > nl_day)
514 break; 514 break;
515 } 515 }
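
[annotation] For reference, a standalone transcription of the month lookup after this fix: day_n[] holds cumulative days at the start of each month and the loop is 1-based to match the DOS month encoding. This is just the arithmetic, not smbfs code:

#include <stdio.h>

/* Cumulative days at the start of each month, non-leap year. */
static const int day_n[] = { 0, 31, 59, 90, 120, 151, 181, 212,
			     243, 273, 304, 334 };

/* nl_day is the zero-based day of year, already leap-adjusted. */
static void day_to_dos(int nl_day, int *month, int *day)
{
	int m;

	for (m = 1; m < 12; m++)
		if (day_n[m] > nl_day)
			break;
	*month = m;				/* 1..12, DOS encoding */
	*day = nl_day - day_n[m - 1] + 1;
}

int main(void)
{
	int month, day;

	day_to_dos(59, &month, &day);		/* 60th day, non-leap */
	printf("month %d day %d\n", month, day);	/* 3 1: March 1st */
	return 0;
}
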
diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..7394e9e17534 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
502 len = left; 502 len = left;
503 503
504 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 504 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
505 if (ret > 0) 505 if (ret > 0) {
506 *ppos += ret; 506 *ppos += ret;
507 file_accessed(in);
508 }
507 509
508 return ret; 510 return ret;
509} 511}
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
963 965
964 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 966 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
965 ret = file_remove_suid(out); 967 ret = file_remove_suid(out);
966 if (!ret) 968 if (!ret) {
969 file_update_time(out);
967 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); 970 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
971 }
968 mutex_unlock(&inode->i_mutex); 972 mutex_unlock(&inode->i_mutex);
969 } while (ret > 0); 973 } while (ret > 0);
970 splice_from_pipe_end(pipe, &sd); 974 splice_from_pipe_end(pipe, &sd);
@@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
976 980
977 if (ret > 0) { 981 if (ret > 0) {
978 unsigned long nr_pages; 982 unsigned long nr_pages;
983 int err;
979 984
980 *ppos += ret;
981 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 985 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
982 986
983 /* 987 err = generic_write_sync(out, *ppos, ret);
984 * If file or inode is SYNC and we actually wrote some data, 988 if (err)
985 * sync it. 989 ret = err;
986 */ 990 else
987 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 991 *ppos += ret;
988 int err;
989
990 mutex_lock(&inode->i_mutex);
991 err = generic_osync_inode(inode, mapping,
992 OSYNC_METADATA|OSYNC_DATA);
993 mutex_unlock(&inode->i_mutex);
994
995 if (err)
996 ret = err;
997 }
998 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 992 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
999 } 993 }
1000 994
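
[annotation] The open-coded O_SYNC/generic_osync_inode() block is replaced by generic_write_sync() (added in fs/sync.c later in this merge), and the file position now only advances when that sync succeeds. A sketch of the resulting post-write pattern; demo_finish_write is hypothetical:

#include <linux/fs.h>

static ssize_t demo_finish_write(struct file *file, loff_t *ppos,
				 ssize_t written)
{
	if (written > 0) {
		int err = generic_write_sync(file, *ppos, written);

		if (err)
			return err;	/* report the sync failure */
		*ppos += written;	/* only advance on success */
	}
	return written;
}
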
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index cb5fc57e370b..6c197ef53add 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -44,7 +44,7 @@
44#include "squashfs.h" 44#include "squashfs.h"
45 45
46static struct file_system_type squashfs_fs_type; 46static struct file_system_type squashfs_fs_type;
47static struct super_operations squashfs_super_ops; 47static const struct super_operations squashfs_super_ops;
48 48
49static int supported_squashfs_filesystem(short major, short minor, short comp) 49static int supported_squashfs_filesystem(short major, short minor, short comp)
50{ 50{
@@ -444,7 +444,7 @@ static struct file_system_type squashfs_fs_type = {
444 .fs_flags = FS_REQUIRES_DEV 444 .fs_flags = FS_REQUIRES_DEV
445}; 445};
446 446
447static struct super_operations squashfs_super_ops = { 447static const struct super_operations squashfs_super_ops = {
448 .alloc_inode = squashfs_alloc_inode, 448 .alloc_inode = squashfs_alloc_inode,
449 .destroy_inode = squashfs_destroy_inode, 449 .destroy_inode = squashfs_destroy_inode,
450 .statfs = squashfs_statfs, 450 .statfs = squashfs_statfs,
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..19eb70b374bc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ DEFINE_SPINLOCK(sb_lock);
54static struct super_block *alloc_super(struct file_system_type *type) 54static struct super_block *alloc_super(struct file_system_type *type)
55{ 55{
56 struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); 56 struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
57 static struct super_operations default_op; 57 static const struct super_operations default_op;
58 58
59 if (s) { 59 if (s) {
60 if (security_sb_alloc(s)) { 60 if (security_sb_alloc(s)) {
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
62 s = NULL; 62 s = NULL;
63 goto out; 63 goto out;
64 } 64 }
65 INIT_LIST_HEAD(&s->s_dirty);
66 INIT_LIST_HEAD(&s->s_io);
67 INIT_LIST_HEAD(&s->s_more_io);
68 INIT_LIST_HEAD(&s->s_files); 65 INIT_LIST_HEAD(&s->s_files);
69 INIT_LIST_HEAD(&s->s_instances); 66 INIT_LIST_HEAD(&s->s_instances);
70 INIT_HLIST_HEAD(&s->s_anon); 67 INIT_HLIST_HEAD(&s->s_anon);
@@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
171 * Drops a temporary reference, frees superblock if there's no 168 * Drops a temporary reference, frees superblock if there's no
172 * references left. 169 * references left.
173 */ 170 */
174static void put_super(struct super_block *sb) 171void put_super(struct super_block *sb)
175{ 172{
176 spin_lock(&sb_lock); 173 spin_lock(&sb_lock);
177 __put_super(sb); 174 __put_super(sb);
@@ -468,6 +465,48 @@ rescan:
468} 465}
469 466
470EXPORT_SYMBOL(get_super); 467EXPORT_SYMBOL(get_super);
468
469/**
470 * get_active_super - get an active reference to the superblock of a device
471 * @bdev: device to get the superblock for
472 *
473 * Scans the superblock list and finds the superblock of the file system
474 * mounted on the device given. Returns the superblock with an active
475 * reference and s_umount held exclusively or %NULL if none was found.
476 */
477struct super_block *get_active_super(struct block_device *bdev)
478{
479 struct super_block *sb;
480
481 if (!bdev)
482 return NULL;
483
484 spin_lock(&sb_lock);
485 list_for_each_entry(sb, &super_blocks, s_list) {
486 if (sb->s_bdev != bdev)
487 continue;
488
489 sb->s_count++;
490 spin_unlock(&sb_lock);
491 down_write(&sb->s_umount);
492 if (sb->s_root) {
493 spin_lock(&sb_lock);
494 if (sb->s_count > S_BIAS) {
495 atomic_inc(&sb->s_active);
496 sb->s_count--;
497 spin_unlock(&sb_lock);
498 return sb;
499 }
500 spin_unlock(&sb_lock);
501 }
502 up_write(&sb->s_umount);
503 put_super(sb);
504 yield();
505 spin_lock(&sb_lock);
506 }
507 spin_unlock(&sb_lock);
508 return NULL;
509}
471 510
472struct super_block * user_get_super(dev_t dev) 511struct super_block * user_get_super(dev_t dev)
473{ 512{
@@ -530,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
530{ 569{
531 int retval; 570 int retval;
532 int remount_rw; 571 int remount_rw;
533 572
573 if (sb->s_frozen != SB_UNFROZEN)
574 return -EBUSY;
575
534#ifdef CONFIG_BLOCK 576#ifdef CONFIG_BLOCK
535 if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) 577 if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
536 return -EACCES; 578 return -EACCES;
537#endif 579#endif
580
538 if (flags & MS_RDONLY) 581 if (flags & MS_RDONLY)
539 acct_auto_close(sb); 582 acct_auto_close(sb);
540 shrink_dcache_sb(sb); 583 shrink_dcache_sb(sb);
@@ -710,6 +753,12 @@ static int set_bdev_super(struct super_block *s, void *data)
710{ 753{
711 s->s_bdev = data; 754 s->s_bdev = data;
712 s->s_dev = s->s_bdev->bd_dev; 755 s->s_dev = s->s_bdev->bd_dev;
756
757 /*
758 * We set the bdi here to the queue backing, file systems can
759 * overwrite this in ->fill_super()
760 */
761 s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
713 return 0; 762 return 0;
714} 763}
715 764
@@ -740,9 +789,14 @@ int get_sb_bdev(struct file_system_type *fs_type,
740 * will protect the lockfs code from trying to start a snapshot 789 * will protect the lockfs code from trying to start a snapshot
741 * while we are mounting 790 * while we are mounting
742 */ 791 */
743 down(&bdev->bd_mount_sem); 792 mutex_lock(&bdev->bd_fsfreeze_mutex);
793 if (bdev->bd_fsfreeze_count > 0) {
794 mutex_unlock(&bdev->bd_fsfreeze_mutex);
795 error = -EBUSY;
796 goto error_bdev;
797 }
744 s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); 798 s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
745 up(&bdev->bd_mount_sem); 799 mutex_unlock(&bdev->bd_fsfreeze_mutex);
746 if (IS_ERR(s)) 800 if (IS_ERR(s))
747 goto error_s; 801 goto error_s;
748 802
@@ -889,6 +943,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
889 if (error) 943 if (error)
890 goto out_sb; 944 goto out_sb;
891 945
946 /*
947 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
948 * but s_maxbytes was an unsigned long long for many releases. Throw
949 * this warning for a little while to try and catch filesystems that
950 * violate this rule. This warning should be either removed or
951 * converted to a BUG() in 2.6.34.
952 */
953 WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
954 "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
955
892 mnt->mnt_mountpoint = mnt->mnt_root; 956 mnt->mnt_mountpoint = mnt->mnt_root;
893 mnt->mnt_parent = mnt; 957 mnt->mnt_parent = mnt;
894 up_write(&mnt->mnt_sb->s_umount); 958 up_write(&mnt->mnt_sb->s_umount);
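
[annotation] Together with the do_remount_sb() check above, this makes mount and remount fail with -EBUSY while a block device is frozen, instead of serializing on the removed bd_mount_sem. A userspace sketch of the freeze side these checks guard against (mount point is an example):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	int fd = open("/mnt/data", O_RDONLY);	/* example mount point */

	if (fd < 0)
		return 1;
	if (ioctl(fd, FIFREEZE, 0) == 0) {
		/* a second mount of the same bdev now gets -EBUSY here */
		ioctl(fd, FITHAW, 0);
	}
	close(fd);
	return 0;
}
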
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..d104591b066b 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,29 @@
19 SYNC_FILE_RANGE_WAIT_AFTER) 19 SYNC_FILE_RANGE_WAIT_AFTER)
20 20
21/* 21/*
22 * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) 22 * Do the filesystem syncing work. For simple filesystems
23 * just dirties buffers with inodes so we have to submit IO for these buffers 23 * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
24 * via __sync_blockdev(). This also speeds up the wait == 1 case since in that 24 * submit IO for these buffers via __sync_blockdev(). This also speeds up the
25 * case write_inode() functions do sync_dirty_buffer() and thus effectively 25 * wait == 1 case since in that case write_inode() functions do
26 * write one block at a time. 26 * sync_dirty_buffer() and thus effectively write one block at a time.
27 */ 27 */
28static int __sync_filesystem(struct super_block *sb, int wait) 28static int __sync_filesystem(struct super_block *sb, int wait)
29{ 29{
30 /*
31 * This should be safe, as we require bdi backing to actually
32 * write out data in the first place
33 */
34 if (!sb->s_bdi)
35 return 0;
36
30 /* Avoid doing twice syncing and cache pruning for quota sync */ 37 /* Avoid doing twice syncing and cache pruning for quota sync */
31 if (!wait) 38 if (!wait) {
32 writeout_quota_sb(sb, -1); 39 writeout_quota_sb(sb, -1);
33 else 40 writeback_inodes_sb(sb);
41 } else {
34 sync_quota_sb(sb, -1); 42 sync_quota_sb(sb, -1);
35 sync_inodes_sb(sb, wait); 43 sync_inodes_sb(sb);
44 }
36 if (sb->s_op->sync_fs) 45 if (sb->s_op->sync_fs)
37 sb->s_op->sync_fs(sb, wait); 46 sb->s_op->sync_fs(sb, wait);
38 return __sync_blockdev(sb->s_bdev, wait); 47 return __sync_blockdev(sb->s_bdev, wait);
@@ -99,7 +108,7 @@ restart:
99 spin_unlock(&sb_lock); 108 spin_unlock(&sb_lock);
100 109
101 down_read(&sb->s_umount); 110 down_read(&sb->s_umount);
102 if (!(sb->s_flags & MS_RDONLY) && sb->s_root) 111 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
103 __sync_filesystem(sb, wait); 112 __sync_filesystem(sb, wait);
104 up_read(&sb->s_umount); 113 up_read(&sb->s_umount);
105 114
@@ -118,7 +127,7 @@ restart:
118 */ 127 */
119SYSCALL_DEFINE0(sync) 128SYSCALL_DEFINE0(sync)
120{ 129{
121 wakeup_pdflush(0); 130 wakeup_flusher_threads(0);
122 sync_filesystems(0); 131 sync_filesystems(0);
123 sync_filesystems(1); 132 sync_filesystems(1);
124 if (unlikely(laptop_mode)) 133 if (unlikely(laptop_mode))
@@ -174,21 +183,26 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
174 ret = err; 183 ret = err;
175 return ret; 184 return ret;
176} 185}
186EXPORT_SYMBOL(file_fsync);
177 187
178/** 188/**
179 * vfs_fsync - perform a fsync or fdatasync on a file 189 * vfs_fsync_range - helper to sync a range of data & metadata to disk
180 * @file: file to sync 190 * @file: file to sync
181 * @dentry: dentry of @file 191 * @dentry: dentry of @file
182 * @data: only perform a fdatasync operation 192 * @start: offset in bytes of the beginning of data range to sync
193 * @end: offset in bytes of the end of data range (inclusive)
194 * @datasync: perform only datasync
183 * 195 *
184 * Write back data and metadata for @file to disk. If @datasync is 196 * Write back data in range @start..@end and metadata for @file to disk. If
185 * set only metadata needed to access modified file data is written. 197 * @datasync is set only metadata needed to access modified file data is
198 * written.
186 * 199 *
187 * In case this function is called from nfsd @file may be %NULL and 200 * In case this function is called from nfsd @file may be %NULL and
188 * only @dentry is set. This can only happen when the filesystem 201 * only @dentry is set. This can only happen when the filesystem
189 * implements the export_operations API. 202 * implements the export_operations API.
190 */ 203 */
191int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 204int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
205 loff_t end, int datasync)
192{ 206{
193 const struct file_operations *fop; 207 const struct file_operations *fop;
194 struct address_space *mapping; 208 struct address_space *mapping;
@@ -212,7 +226,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
212 goto out; 226 goto out;
213 } 227 }
214 228
215 ret = filemap_fdatawrite(mapping); 229 ret = filemap_write_and_wait_range(mapping, start, end);
216 230
217 /* 231 /*
218 * We need to protect against concurrent writers, which could cause 232 * We need to protect against concurrent writers, which could cause
@@ -223,12 +237,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
223 if (!ret) 237 if (!ret)
224 ret = err; 238 ret = err;
225 mutex_unlock(&mapping->host->i_mutex); 239 mutex_unlock(&mapping->host->i_mutex);
226 err = filemap_fdatawait(mapping); 240
227 if (!ret)
228 ret = err;
229out: 241out:
230 return ret; 242 return ret;
231} 243}
244EXPORT_SYMBOL(vfs_fsync_range);
245
246/**
247 * vfs_fsync - perform a fsync or fdatasync on a file
248 * @file: file to sync
249 * @dentry: dentry of @file
250 * @datasync: only perform a fdatasync operation
251 *
252 * Write back data and metadata for @file to disk. If @datasync is
253 * set only metadata needed to access modified file data is written.
254 *
255 * In case this function is called from nfsd @file may be %NULL and
256 * only @dentry is set. This can only happen when the filesystem
257 * implements the export_operations API.
258 */
259int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
260{
261 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
262}
232EXPORT_SYMBOL(vfs_fsync); 263EXPORT_SYMBOL(vfs_fsync);
233 264
234static int do_fsync(unsigned int fd, int datasync) 265static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +285,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
254 return do_fsync(fd, 1); 285 return do_fsync(fd, 1);
255} 286}
256 287
288/**
289 * generic_write_sync - perform syncing after a write if file / inode is sync
290 * @file: file to which the write happened
291 * @pos: offset where the write started
292 * @count: length of the write
293 *
294 * This is just a simple wrapper about our general syncing function.
295 */
296int generic_write_sync(struct file *file, loff_t pos, loff_t count)
297{
298 if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
299 return 0;
300 return vfs_fsync_range(file, file->f_path.dentry, pos,
301 pos + count - 1, 1);
302}
303EXPORT_SYMBOL(generic_write_sync);
304
257/* 305/*
258 * sys_sync_file_range() permits finely controlled syncing over a segment of 306 * sys_sync_file_range() permits finely controlled syncing over a segment of
259 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is 307 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
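
[annotation] vfs_fsync_range() lets callers sync just the bytes they touched; vfs_fsync() and generic_write_sync() are now thin wrappers over it. A sketch of a ranged caller; demo_sync_written is hypothetical:

#include <linux/fs.h>

static int demo_sync_written(struct file *file, loff_t pos, size_t count)
{
	/* end offset is inclusive; 1 = datasync semantics */
	return vfs_fsync_range(file, file->f_path.dentry,
			       pos, pos + count - 1, 1);
}
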
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 2524714bece1..60c702bc10ae 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -40,7 +40,7 @@ struct bin_buffer {
40 struct mutex mutex; 40 struct mutex mutex;
41 void *buffer; 41 void *buffer;
42 int mmapped; 42 int mmapped;
43 struct vm_operations_struct *vm_ops; 43 const struct vm_operations_struct *vm_ops;
44 struct file *file; 44 struct file *file;
45 struct hlist_node list; 45 struct hlist_node list;
46}; 46};
@@ -331,7 +331,7 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
331} 331}
332#endif 332#endif
333 333
334static struct vm_operations_struct bin_vm_ops = { 334static const struct vm_operations_struct bin_vm_ops = {
335 .open = bin_vma_open, 335 .open = bin_vma_open,
336 .close = bin_vma_close, 336 .close = bin_vma_close,
337 .fault = bin_fault, 337 .fault = bin_fault,
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 14f2d71ea3ce..0050fc40e8c9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
760const struct inode_operations sysfs_dir_inode_operations = { 760const struct inode_operations sysfs_dir_inode_operations = {
761 .lookup = sysfs_lookup, 761 .lookup = sysfs_lookup,
762 .setattr = sysfs_setattr, 762 .setattr = sysfs_setattr,
763 .setxattr = sysfs_setxattr,
763}; 764};
764 765
765static void remove_dir(struct sysfs_dirent *sd) 766static void remove_dir(struct sysfs_dirent *sd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e28cecf179f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,8 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/xattr.h>
22#include <linux/security.h>
21#include "sysfs.h" 23#include "sysfs.h"
22 24
23extern struct super_block * sysfs_sb; 25extern struct super_block * sysfs_sb;
@@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = {
29}; 31};
30 32
31static struct backing_dev_info sysfs_backing_dev_info = { 33static struct backing_dev_info sysfs_backing_dev_info = {
34 .name = "sysfs",
32 .ra_pages = 0, /* No readahead */ 35 .ra_pages = 0, /* No readahead */
33 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
34}; 37};
35 38
36static const struct inode_operations sysfs_inode_operations ={ 39static const struct inode_operations sysfs_inode_operations ={
37 .setattr = sysfs_setattr, 40 .setattr = sysfs_setattr,
41 .setxattr = sysfs_setxattr,
38}; 42};
39 43
40int __init sysfs_inode_init(void) 44int __init sysfs_inode_init(void)
@@ -42,18 +46,37 @@ int __init sysfs_inode_init(void)
42 return bdi_init(&sysfs_backing_dev_info); 46 return bdi_init(&sysfs_backing_dev_info);
43} 47}
44 48
49struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
50{
51 struct sysfs_inode_attrs *attrs;
52 struct iattr *iattrs;
53
54 attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
55 if (!attrs)
56 return NULL;
57 iattrs = &attrs->ia_iattr;
58
59 /* assign default attributes */
60 iattrs->ia_mode = sd->s_mode;
61 iattrs->ia_uid = 0;
62 iattrs->ia_gid = 0;
63 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
64
65 return attrs;
66}
45int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) 67int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
46{ 68{
47 struct inode * inode = dentry->d_inode; 69 struct inode * inode = dentry->d_inode;
48 struct sysfs_dirent * sd = dentry->d_fsdata; 70 struct sysfs_dirent * sd = dentry->d_fsdata;
49 struct iattr * sd_iattr; 71 struct sysfs_inode_attrs *sd_attrs;
72 struct iattr *iattrs;
50 unsigned int ia_valid = iattr->ia_valid; 73 unsigned int ia_valid = iattr->ia_valid;
51 int error; 74 int error;
52 75
53 if (!sd) 76 if (!sd)
54 return -EINVAL; 77 return -EINVAL;
55 78
56 sd_iattr = sd->s_iattr; 79 sd_attrs = sd->s_iattr;
57 80
58 error = inode_change_ok(inode, iattr); 81 error = inode_change_ok(inode, iattr);
59 if (error) 82 if (error)
@@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
65 if (error) 88 if (error)
66 return error; 89 return error;
67 90
68 if (!sd_iattr) { 91 if (!sd_attrs) {
69 /* setting attributes for the first time, allocate now */ 92 /* setting attributes for the first time, allocate now */
70 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 93 sd_attrs = sysfs_init_inode_attrs(sd);
71 if (!sd_iattr) 94 if (!sd_attrs)
72 return -ENOMEM; 95 return -ENOMEM;
73 /* assign default attributes */ 96 sd->s_iattr = sd_attrs;
74 sd_iattr->ia_mode = sd->s_mode; 97 } else {
75 sd_iattr->ia_uid = 0; 98 /* attributes were changed at least once in past */
76 sd_iattr->ia_gid = 0; 99 iattrs = &sd_attrs->ia_iattr;
77 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 100
78 sd->s_iattr = sd_iattr; 101 if (ia_valid & ATTR_UID)
102 iattrs->ia_uid = iattr->ia_uid;
103 if (ia_valid & ATTR_GID)
104 iattrs->ia_gid = iattr->ia_gid;
105 if (ia_valid & ATTR_ATIME)
106 iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
107 inode->i_sb->s_time_gran);
108 if (ia_valid & ATTR_MTIME)
109 iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
110 inode->i_sb->s_time_gran);
111 if (ia_valid & ATTR_CTIME)
112 iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
113 inode->i_sb->s_time_gran);
114 if (ia_valid & ATTR_MODE) {
115 umode_t mode = iattr->ia_mode;
116
117 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
118 mode &= ~S_ISGID;
119 iattrs->ia_mode = sd->s_mode = mode;
120 }
79 } 121 }
122 return error;
123}
80 124
81 /* attributes were changed atleast once in past */ 125int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
82 126 size_t size, int flags)
83 if (ia_valid & ATTR_UID) 127{
84 sd_iattr->ia_uid = iattr->ia_uid; 128 struct sysfs_dirent *sd = dentry->d_fsdata;
85 if (ia_valid & ATTR_GID) 129 struct sysfs_inode_attrs *iattrs;
86 sd_iattr->ia_gid = iattr->ia_gid; 130 void *secdata;
87 if (ia_valid & ATTR_ATIME) 131 int error;
88 sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, 132 u32 secdata_len = 0;
89 inode->i_sb->s_time_gran); 133
90 if (ia_valid & ATTR_MTIME) 134 if (!sd)
91 sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, 135 return -EINVAL;
92 inode->i_sb->s_time_gran); 136 if (!sd->s_iattr)
93 if (ia_valid & ATTR_CTIME) 137 sd->s_iattr = sysfs_init_inode_attrs(sd);
94 sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, 138 if (!sd->s_iattr)
95 inode->i_sb->s_time_gran); 139 return -ENOMEM;
96 if (ia_valid & ATTR_MODE) { 140
97 umode_t mode = iattr->ia_mode; 141 iattrs = sd->s_iattr;
98 142
99 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 143 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
100 mode &= ~S_ISGID; 144 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
101 sd_iattr->ia_mode = sd->s_mode = mode; 145 error = security_inode_setsecurity(dentry->d_inode, suffix,
102 } 146 value, size, flags);
147 if (error)
148 goto out;
149 error = security_inode_getsecctx(dentry->d_inode,
150 &secdata, &secdata_len);
151 if (error)
152 goto out;
153 if (iattrs->ia_secdata)
154 security_release_secctx(iattrs->ia_secdata,
155 iattrs->ia_secdata_len);
156 iattrs->ia_secdata = secdata;
157 iattrs->ia_secdata_len = secdata_len;
103 158
159 } else
160 return -EINVAL;
161out:
104 return error; 162 return error;
105} 163}
106 164
@@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
146static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) 204static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
147{ 205{
148 struct bin_attribute *bin_attr; 206 struct bin_attribute *bin_attr;
207 struct sysfs_inode_attrs *iattrs;
149 208
150 inode->i_private = sysfs_get(sd); 209 inode->i_private = sysfs_get(sd);
151 inode->i_mapping->a_ops = &sysfs_aops; 210 inode->i_mapping->a_ops = &sysfs_aops;
@@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
154 inode->i_ino = sd->s_ino; 213 inode->i_ino = sd->s_ino;
155 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); 214 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
156 215
157 if (sd->s_iattr) { 216 iattrs = sd->s_iattr;
217 if (iattrs) {
158 /* sysfs_dirent has non-default attributes 218 /* sysfs_dirent has non-default attributes
159 * get them for the new inode from persistent copy 219 * get them for the new inode from persistent copy
160 * in sysfs_dirent 220 * in sysfs_dirent
161 */ 221 */
162 set_inode_attr(inode, sd->s_iattr); 222 set_inode_attr(inode, &iattrs->ia_iattr);
223 if (iattrs->ia_secdata)
224 security_inode_notifysecctx(inode,
225 iattrs->ia_secdata,
226 iattrs->ia_secdata_len);
163 } else 227 } else
164 set_default_inode_attr(inode, sd->s_mode); 228 set_default_inode_attr(inode, sd->s_mode);
165 229
166
167 /* initialize inode according to type */ 230 /* initialize inode according to type */
168 switch (sysfs_type(sd)) { 231 switch (sysfs_type(sd)) {
169 case SYSFS_DIR: 232 case SYSFS_DIR:
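
[annotation] With setxattr wired into the sysfs inode, dir, and symlink operations, security.* labels can now be set on sysfs nodes (anything outside that prefix still returns -EINVAL). A userspace sketch; the path and SELinux context are examples:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *ctx = "system_u:object_r:sysfs_t:s0";

	if (setxattr("/sys/kernel/uevent_helper", "security.selinux",
		     ctx, strlen(ctx), 0))
		perror("setxattr");
	return 0;
}
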
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1d897ad808e0..c5081ad77026 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -16,6 +16,7 @@
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/security.h>
19 20
20#include "sysfs.h" 21#include "sysfs.h"
21 22
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
209} 210}
210 211
211const struct inode_operations sysfs_symlink_inode_operations = { 212const struct inode_operations sysfs_symlink_inode_operations = {
213 .setxattr = sysfs_setxattr,
212 .readlink = generic_readlink, 214 .readlink = generic_readlink,
213 .follow_link = sysfs_follow_link, 215 .follow_link = sysfs_follow_link,
214 .put_link = sysfs_put_link, 216 .put_link = sysfs_put_link,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3fa0d98481e2..af4c4e7482ac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,8 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/fs.h>
12
11struct sysfs_open_dirent; 13struct sysfs_open_dirent;
12 14
13/* type-specific structures for sysfs_dirent->s_* union members */ 15/* type-specific structures for sysfs_dirent->s_* union members */
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr {
31 struct hlist_head buffers; 33 struct hlist_head buffers;
32}; 34};
33 35
36struct sysfs_inode_attrs {
37 struct iattr ia_iattr;
38 void *ia_secdata;
39 u32 ia_secdata_len;
40};
41
34/* 42/*
35 * sysfs_dirent - the building block of sysfs hierarchy. Each and 43 * sysfs_dirent - the building block of sysfs hierarchy. Each and
36 * every sysfs node is represented by single sysfs_dirent. 44 * every sysfs node is represented by single sysfs_dirent.
@@ -56,7 +64,7 @@ struct sysfs_dirent {
56 unsigned int s_flags; 64 unsigned int s_flags;
57 ino_t s_ino; 65 ino_t s_ino;
58 umode_t s_mode; 66 umode_t s_mode;
59 struct iattr *s_iattr; 67 struct sysfs_inode_attrs *s_iattr;
60}; 68};
61 69
62#define SD_DEACTIVATED_BIAS INT_MIN 70#define SD_DEACTIVATED_BIAS INT_MIN
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
148struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 156struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
149void sysfs_delete_inode(struct inode *inode); 157void sysfs_delete_inode(struct inode *inode);
150int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 158int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
159int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
160 size_t size, int flags);
151int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 161int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
152int sysfs_inode_init(void); 162int sysfs_inode_init(void);
153 163
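The header change turns s_iattr from a bare struct iattr pointer into a wrapper that also carries the security context, allocated lazily so nodes whose attributes are never changed pay nothing. A minimal sketch of that lazy-allocation pattern, with invented names and seed values:

#include <stdio.h>
#include <stdlib.h>

/* Reduced model of the new wrapper: plain attributes plus an LSM blob. */
struct sysfs_inode_attrs_model {
        unsigned int ia_mode;            /* stands in for struct iattr */
        void *ia_secdata;
        unsigned int ia_secdata_len;
};

struct sysfs_dirent_model {
        unsigned int s_mode;
        struct sysfs_inode_attrs_model *s_iattr; /* NULL until first change */
};

/* Allocate the wrapper on first use, seeded from the dirent's mode. */
static struct sysfs_inode_attrs_model *
init_inode_attrs(struct sysfs_dirent_model *sd)
{
        struct sysfs_inode_attrs_model *attrs = calloc(1, sizeof(*attrs));

        if (attrs)
                attrs->ia_mode = sd->s_mode;
        return attrs;
}

int main(void)
{
        struct sysfs_dirent_model sd = { 0644, NULL };

        if (!sd.s_iattr)                 /* the lazy-init step */
                sd.s_iattr = init_inode_attrs(&sd);
        if (!sd.s_iattr)
                return 1;
        printf("mode %o, secdata %p\n", sd.s_iattr->ia_mode,
               sd.s_iattr->ia_secdata);
        free(sd.s_iattr);
        return 0;
}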
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eaf6d891d46f..076ca50e9933 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,41 +54,15 @@
54 * @nr_to_write: how many dirty pages to write-back 54 * @nr_to_write: how many dirty pages to write-back
55 * 55 *
56 * This function shrinks UBIFS liability by means of writing back some amount 56 * This function shrinks UBIFS liability by means of writing back some amount
57 * of dirty inodes and their pages. Returns the amount of pages which were 57 * of dirty inodes and their pages.
58 * written back. The returned value does not include dirty inodes which were
59 * synchronized.
60 * 58 *
61 * Note, this function synchronizes even VFS inodes which are locked 59 * Note, this function synchronizes even VFS inodes which are locked
62 * (@i_mutex) by the caller of the budgeting function, because write-back does 60 * (@i_mutex) by the caller of the budgeting function, because write-back does
63 * not touch @i_mutex. 61 * not touch @i_mutex.
64 */ 62 */
65static int shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
66{ 64{
67 int nr_written; 65 writeback_inodes_sb(c->vfs_sb);
68 struct writeback_control wbc = {
69 .sync_mode = WB_SYNC_NONE,
70 .range_end = LLONG_MAX,
71 .nr_to_write = nr_to_write,
72 };
73
74 generic_sync_sb_inodes(c->vfs_sb, &wbc);
75 nr_written = nr_to_write - wbc.nr_to_write;
76
77 if (!nr_written) {
78 /*
79 * Re-try again but wait on pages/inodes which are being
80 * written-back concurrently (e.g., by pdflush).
81 */
82 memset(&wbc, 0, sizeof(struct writeback_control));
83 wbc.sync_mode = WB_SYNC_ALL;
84 wbc.range_end = LLONG_MAX;
85 wbc.nr_to_write = nr_to_write;
86 generic_sync_sb_inodes(c->vfs_sb, &wbc);
87 nr_written = nr_to_write - wbc.nr_to_write;
88 }
89
90 dbg_budg("%d pages were written back", nr_written);
91 return nr_written;
92} 66}
93 67
94/** 68/**
@@ -741,7 +715,7 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
741 * ubifs_get_free_space - return amount of free space. 715 * ubifs_get_free_space - return amount of free space.
742 * @c: UBIFS file-system description object 716 * @c: UBIFS file-system description object
743 * 717 *
744 * This function calculates and retuns amount of free space to report to 718 * This function calculates and returns amount of free space to report to
745 * user-space. 719 * user-space.
746 */ 720 */
747long long ubifs_get_free_space(struct ubifs_info *c) 721long long ubifs_get_free_space(struct ubifs_info *c)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index f3a7945527fb..4775af401167 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -510,7 +510,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; 510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
511 int first = 1, iip; 511 int first = 1, iip;
512 struct ubifs_debug_info *d = c->dbg; 512 struct ubifs_debug_info *d = c->dbg;
513 union ubifs_key lower_key, upper_key, l_key, u_key; 513 union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
514 unsigned long long uninitialized_var(last_sqnum); 514 unsigned long long uninitialized_var(last_sqnum);
515 struct ubifs_idx_node *idx; 515 struct ubifs_idx_node *idx;
516 struct list_head list; 516 struct list_head list;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index ce2cd8343618..dbc093afd946 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -210,6 +210,20 @@ const char *dbg_cstate(int cmt_state)
210 } 210 }
211} 211}
212 212
213const char *dbg_jhead(int jhead)
214{
215 switch (jhead) {
216 case GCHD:
217 return "0 (GC)";
218 case BASEHD:
219 return "1 (base)";
220 case DATAHD:
221 return "2 (data)";
222 default:
223 return "unknown journal head";
224 }
225}
226
213static void dump_ch(const struct ubifs_ch *ch) 227static void dump_ch(const struct ubifs_ch *ch)
214{ 228{
215 printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); 229 printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
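dbg_jhead() maps a journal-head index to a human-readable name once, so every call site can print "jhead 1 (base)" instead of a bare integer. A standalone model of the same mapping (the GCHD/BASEHD/DATAHD values mirror the strings above but are assumptions here):

#include <stdio.h>

enum { GCHD = 0, BASEHD = 1, DATAHD = 2 };

/* Same shape as the new dbg_jhead(): one switch, one stable string. */
static const char *jhead_name(int jhead)
{
        switch (jhead) {
        case GCHD:
                return "0 (GC)";
        case BASEHD:
                return "1 (base)";
        case DATAHD:
                return "2 (data)";
        default:
                return "unknown journal head";
        }
}

int main(void)
{
        for (int i = 0; i < 4; i++)
                printf("jhead %s\n", jhead_name(i));
        return 0;
}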
@@ -623,8 +637,9 @@ void dbg_dump_budg(struct ubifs_info *c)
623 /* If we are in R/O mode, journal heads do not exist */ 637 /* If we are in R/O mode, journal heads do not exist */
624 if (c->jheads) 638 if (c->jheads)
625 for (i = 0; i < c->jhead_cnt; i++) 639 for (i = 0; i < c->jhead_cnt; i++)
626 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", 640 printk(KERN_DEBUG "\tjhead %s\t LEB %d\n",
627 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); 641 dbg_jhead(c->jheads[i].wbuf.jhead),
642 c->jheads[i].wbuf.lnum);
628 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 643 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
629 bud = rb_entry(rb, struct ubifs_bud, rb); 644 bud = rb_entry(rb, struct ubifs_bud, rb);
630 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); 645 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -648,9 +663,90 @@ void dbg_dump_budg(struct ubifs_info *c)
648 663
649void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) 664void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
650{ 665{
651 printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), " 666 int i, spc, dark = 0, dead = 0;
652 "flags %#x\n", lp->lnum, lp->free, lp->dirty, 667 struct rb_node *rb;
653 c->leb_size - lp->free - lp->dirty, lp->flags); 668 struct ubifs_bud *bud;
669
670 spc = lp->free + lp->dirty;
671 if (spc < c->dead_wm)
672 dead = spc;
673 else
674 dark = ubifs_calc_dark(c, spc);
675
676 if (lp->flags & LPROPS_INDEX)
677 printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
678 "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
679 lp->dirty, c->leb_size - spc, spc, lp->flags);
680 else
681 printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
682 "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
683 "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
684 c->leb_size - spc, spc, dark, dead,
685 (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
686
687 if (lp->flags & LPROPS_TAKEN) {
688 if (lp->flags & LPROPS_INDEX)
689 printk(KERN_CONT "index, taken");
690 else
691 printk(KERN_CONT "taken");
692 } else {
693 const char *s;
694
695 if (lp->flags & LPROPS_INDEX) {
696 switch (lp->flags & LPROPS_CAT_MASK) {
697 case LPROPS_DIRTY_IDX:
698 s = "dirty index";
699 break;
700 case LPROPS_FRDI_IDX:
701 s = "freeable index";
702 break;
703 default:
704 s = "index";
705 }
706 } else {
707 switch (lp->flags & LPROPS_CAT_MASK) {
708 case LPROPS_UNCAT:
709 s = "not categorized";
710 break;
711 case LPROPS_DIRTY:
712 s = "dirty";
713 break;
714 case LPROPS_FREE:
715 s = "free";
716 break;
717 case LPROPS_EMPTY:
718 s = "empty";
719 break;
720 case LPROPS_FREEABLE:
721 s = "freeable";
722 break;
723 default:
724 s = NULL;
725 break;
726 }
727 }
728 printk(KERN_CONT "%s", s);
729 }
730
731 for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
732 bud = rb_entry(rb, struct ubifs_bud, rb);
733 if (bud->lnum == lp->lnum) {
734 int head = 0;
735 for (i = 0; i < c->jhead_cnt; i++) {
736 if (lp->lnum == c->jheads[i].wbuf.lnum) {
737 printk(KERN_CONT ", jhead %s",
738 dbg_jhead(i));
739 head = 1;
740 }
741 }
742 if (!head)
743 printk(KERN_CONT ", bud of jhead %s",
744 dbg_jhead(bud->jhead));
745 }
746 }
747 if (lp->lnum == c->gc_lnum)
748 printk(KERN_CONT ", GC LEB");
749 printk(KERN_CONT ")\n");
654} 750}
655 751
656void dbg_dump_lprops(struct ubifs_info *c) 752void dbg_dump_lprops(struct ubifs_info *c)
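The expanded dbg_dump_lprop() prints dark and dead space per LEB: leftovers below the dead-space watermark are wholly dead, anything larger contributes dark space via ubifs_calc_dark(). A toy version of that classification, with guessed watermark values and a simplified calc_dark() that is only patterned on the kernel helper:

#include <stdio.h>

#define LEB_SIZE (128 * 1024)
#define DEAD_WM  2048           /* dead-space watermark (assumption) */
#define DARK_WM  8192           /* dark-space watermark (assumption) */

/* Guess at ubifs_calc_dark(): small spans are entirely dark, larger
 * spans contribute at most the watermark. */
static int calc_dark_model(int spc)
{
        return spc < DARK_WM ? spc : DARK_WM;
}

int main(void)
{
        int free_b = 3000, dirty = 5000;
        int spc = free_b + dirty, dark = 0, dead = 0;

        /* The decision dbg_dump_lprop() now prints per LEB. */
        if (spc < DEAD_WM)
                dead = spc;
        else
                dark = calc_dark_model(spc);

        printf("LEB: free %d dirty %d used %d dark %d dead %d\n",
               free_b, dirty, LEB_SIZE - spc, dark, dead);
        return 0;
}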
@@ -724,7 +820,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
724 820
725 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 821 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
726 current->pid, lnum); 822 current->pid, lnum);
727 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); 823 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
728 if (IS_ERR(sleb)) { 824 if (IS_ERR(sleb)) {
729 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 825 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
730 return; 826 return;
@@ -909,8 +1005,10 @@ out:
909 ubifs_msg("saved lprops statistics dump"); 1005 ubifs_msg("saved lprops statistics dump");
910 dbg_dump_lstats(&d->saved_lst); 1006 dbg_dump_lstats(&d->saved_lst);
911 ubifs_get_lp_stats(c, &lst); 1007 ubifs_get_lp_stats(c, &lst);
1008
912 ubifs_msg("current lprops statistics dump"); 1009 ubifs_msg("current lprops statistics dump");
913 dbg_dump_lstats(&d->saved_lst); 1010 dbg_dump_lstats(&lst);
1011
914 spin_lock(&c->space_lock); 1012 spin_lock(&c->space_lock);
915 dbg_dump_budg(c); 1013 dbg_dump_budg(c);
916 spin_unlock(&c->space_lock); 1014 spin_unlock(&c->space_lock);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index c1cd73b2e06e..29d960101ea6 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -271,6 +271,7 @@ void ubifs_debugging_exit(struct ubifs_info *c);
271/* Dump functions */ 271/* Dump functions */
272const char *dbg_ntype(int type); 272const char *dbg_ntype(int type);
273const char *dbg_cstate(int cmt_state); 273const char *dbg_cstate(int cmt_state);
274const char *dbg_jhead(int jhead);
274const char *dbg_get_key_dump(const struct ubifs_info *c, 275const char *dbg_get_key_dump(const struct ubifs_info *c,
275 const union ubifs_key *key); 276 const union ubifs_key *key);
276void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); 277void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
@@ -321,6 +322,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
321int dbg_check_lprops(struct ubifs_info *c); 322int dbg_check_lprops(struct ubifs_info *c);
322int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, 323int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
323 int row, int col); 324 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
326 loff_t size);
324 327
325/* Force the use of in-the-gaps method for testing */ 328/* Force the use of in-the-gaps method for testing */
326 329
@@ -425,6 +428,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
425 428
426#define dbg_ntype(type) "" 429#define dbg_ntype(type) ""
427#define dbg_cstate(cmt_state) "" 430#define dbg_cstate(cmt_state) ""
431#define dbg_jhead(jhead) ""
428#define dbg_get_key_dump(c, key) ({}) 432#define dbg_get_key_dump(c, key) ({})
429#define dbg_dump_inode(c, inode) ({}) 433#define dbg_dump_inode(c, inode) ({})
430#define dbg_dump_node(c, node) ({}) 434#define dbg_dump_node(c, node) ({})
@@ -460,6 +464,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
460#define dbg_check_heap(c, heap, cat, add_pos) ({}) 464#define dbg_check_heap(c, heap, cat, add_pos) ({})
461#define dbg_check_lprops(c) 0 465#define dbg_check_lprops(c) 0
462#define dbg_check_lpt_nodes(c, cnode, row, col) 0 466#define dbg_check_lpt_nodes(c, cnode, row, col) 0
467#define dbg_check_inode_size(c, inode, size) 0
463#define dbg_force_in_the_gaps_enabled 0 468#define dbg_force_in_the_gaps_enabled 0
464#define dbg_force_in_the_gaps() 0 469#define dbg_force_in_the_gaps() 0
465#define dbg_failure_mode 0 470#define dbg_failure_mode 0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6d34dc7e33e1..1009adc8d602 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -21,34 +21,32 @@
21 */ 21 */
22 22
23/* 23/*
24 * This file implements VFS file and inode operations of regular files, device 24 * This file implements VFS file and inode operations for regular files, device
25 * nodes and symlinks as well as address space operations. 25 * nodes and symlinks as well as address space operations.
26 * 26 *
27 * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the 27 * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if
28 * page is dirty and is used for budgeting purposes - dirty pages should not be 28 * the page is dirty and is used for optimization purposes - dirty pages are
29 * budgeted. The PG_checked flag is set if full budgeting is required for the 29 * not budgeted so the flag shows that 'ubifs_write_end()' should not release
30 * page e.g., when it corresponds to a file hole or it is just beyond the file 30 * the budget for this page. The @PG_checked flag is set if full budgeting is
31 * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to 31 * required for the page e.g., when it corresponds to a file hole or it is
32 * fail in this function, and the budget is released in 'ubifs_write_end()'. So 32 * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because
33 * the PG_private and PG_checked flags carry the information about how the page 33 * it is OK to fail in this function, and the budget is released in
34 * was budgeted, to make it possible to release the budget properly. 34 * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry
35 * information about how the page was budgeted, to make it possible to release
36 * the budget properly.
35 * 37 *
36 * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations 38 * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
37 * we implement. However, this is not true for '->writepage()', which might be 39 * implement. However, this is not true for 'ubifs_writepage()', which may be
38 * called with 'i_mutex' unlocked. For example, when pdflush is performing 40 * called with @i_mutex unlocked. For example, when pdflush is doing background
39 * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the 41 * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal"
40 * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is 42 * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the
41 * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim 43 * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()'
42 * path'. So, in '->writepage()' we are only guaranteed that the page is 44 * we are only guaranteed that the page is locked.
43 * locked.
44 * 45 *
45 * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., 46 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
46 * readahead path does not have it locked ("sys_read -> generic_file_aio_read 47 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
47 * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is 48 * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not
48 * not set as well. However, UBIFS disables readahead. 49 * set as well. However, UBIFS disables readahead.
49 *
50 * This, for example means that there might be 2 concurrent '->writepage()'
51 * calls for the same inode, but different inode dirty pages.
52 */ 50 */
53 51
54#include "ubifs.h" 52#include "ubifs.h"
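The rewritten comment block describes the budgeting contract: 'ubifs_write_begin()' budgets speculatively because it is still allowed to fail, and 'ubifs_write_end()' releases the budget again when @PG_private shows the page was already dirty and hence already budgeted. A small model of that flag-driven release; the costs and names are made up:

#include <stdbool.h>
#include <stdio.h>

struct page_model {
        bool pg_private;   /* dirty: budget already held for this page */
        bool pg_checked;   /* hole/appending page: needed full budgeting */
};

static int budget_units;

static void write_begin(struct page_model *p, bool is_hole)
{
        budget_units++;            /* safe to fail here, so budget now */
        if (is_hole)
                p->pg_checked = true;
}

static void write_end(struct page_model *p)
{
        if (p->pg_private)
                budget_units--;    /* already dirty: give the unit back */
        else
                p->pg_private = true;  /* first dirtying keeps the budget */
        p->pg_checked = false;
}

int main(void)
{
        struct page_model p = { false, false };

        write_begin(&p, true);   /* page beyond EOF: full budgeting */
        write_end(&p);
        write_begin(&p, false);  /* re-dirtying the same page */
        write_end(&p);
        printf("budget units held: %d\n", budget_units); /* prints 1 */
        return 0;
}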
@@ -449,9 +447,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
449 /* 447 /*
450 * We change whole page so no need to load it. But we 448 * We change whole page so no need to load it. But we
451 * have to set the @PG_checked flag to make the further 449 * have to set the @PG_checked flag to make the further
452 * code the page is new. This might be not 450 * code know that the page is new. This might not be
453 * is better to budget more that to read the page from 451 * true, but it is better to budget more than to read
454 * the media. 452 * the page from the media.
455 */ 453 */
456 SetPageChecked(page); 454 SetPageChecked(page);
457 skipped_read = 1; 455 skipped_read = 1;
@@ -497,8 +495,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
497 } 495 }
498 496
499 /* 497 /*
500 * Whee, we aquired budgeting quickly - without involving 498 * Whee, we acquired budgeting quickly - without involving
501 * garbage-collection, committing or forceing write-back. We return 499 * garbage-collection, committing or forcing write-back. We return
502 * with @ui->ui_mutex locked if we are appending pages, and unlocked 500 * with @ui->ui_mutex locked if we are appending pages, and unlocked
503 * otherwise. This is an optimization (slightly hacky though). 501 * otherwise. This is an optimization (slightly hacky though).
504 */ 502 */
@@ -562,7 +560,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
562 560
563 /* 561 /*
564 * Return 0 to force VFS to repeat the whole operation, or the 562 * Return 0 to force VFS to repeat the whole operation, or the
565 * error code if 'do_readpage()' failes. 563 * error code if 'do_readpage()' fails.
566 */ 564 */
567 copied = do_readpage(page); 565 copied = do_readpage(page);
568 goto out; 566 goto out;
@@ -1175,11 +1173,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1175 ui->ui_size = inode->i_size; 1173 ui->ui_size = inode->i_size;
1176 /* Truncation changes inode [mc]time */ 1174 /* Truncation changes inode [mc]time */
1177 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1175 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1178 /* The other attributes may be changed at the same time as well */ 1176 /* Other attributes may be changed at the same time as well */
1179 do_attr_changes(inode, attr); 1177 do_attr_changes(inode, attr);
1180
1181 err = ubifs_jnl_truncate(c, inode, old_size, new_size); 1178 err = ubifs_jnl_truncate(c, inode, old_size, new_size);
1182 mutex_unlock(&ui->ui_mutex); 1179 mutex_unlock(&ui->ui_mutex);
1180
1183out_budg: 1181out_budg:
1184 if (budgeted) 1182 if (budgeted)
1185 ubifs_release_budget(c, &req); 1183 ubifs_release_budget(c, &req);
@@ -1536,7 +1534,7 @@ out_unlock:
1536 return err; 1534 return err;
1537} 1535}
1538 1536
1539static struct vm_operations_struct ubifs_file_vm_ops = { 1537static const struct vm_operations_struct ubifs_file_vm_ops = {
1540 .fault = filemap_fault, 1538 .fault = filemap_fault,
1541 .page_mkwrite = ubifs_vm_page_mkwrite, 1539 .page_mkwrite = ubifs_vm_page_mkwrite,
1542}; 1540};
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index f0f5f15d384e..618c2701d3a7 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -529,7 +529,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
529 * We scan the entire LEB even though we only really need to scan up to 529 * We scan the entire LEB even though we only really need to scan up to
530 * (c->leb_size - lp->free). 530 * (c->leb_size - lp->free).
531 */ 531 */
532 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 532 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
533 if (IS_ERR(sleb)) 533 if (IS_ERR(sleb))
534 return PTR_ERR(sleb); 534 return PTR_ERR(sleb);
535 535
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 762a7d6cec73..e589fedaf1ef 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -297,7 +297,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
297{ 297{
298 struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); 298 struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
299 299
300 dbg_io("jhead %d", wbuf->jhead); 300 dbg_io("jhead %s", dbg_jhead(wbuf->jhead));
301 wbuf->need_sync = 1; 301 wbuf->need_sync = 1;
302 wbuf->c->need_wbuf_sync = 1; 302 wbuf->c->need_wbuf_sync = 1;
303 ubifs_wake_up_bgt(wbuf->c); 303 ubifs_wake_up_bgt(wbuf->c);
@@ -314,7 +314,8 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
314 314
315 if (wbuf->no_timer) 315 if (wbuf->no_timer)
316 return; 316 return;
317 dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead, 317 dbg_io("set timer for jhead %s, %llu-%llu millisecs",
318 dbg_jhead(wbuf->jhead),
318 div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), 319 div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
319 div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, 320 div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
320 USEC_PER_SEC)); 321 USEC_PER_SEC));
@@ -351,8 +352,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
351 /* Write-buffer is empty or not seeked */ 352 /* Write-buffer is empty or not seeked */
352 return 0; 353 return 0;
353 354
354 dbg_io("LEB %d:%d, %d bytes, jhead %d", 355 dbg_io("LEB %d:%d, %d bytes, jhead %s",
355 wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead); 356 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
356 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); 357 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
357 ubifs_assert(!(wbuf->avail & 7)); 358 ubifs_assert(!(wbuf->avail & 7));
358 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 359 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -401,7 +402,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
401{ 402{
402 const struct ubifs_info *c = wbuf->c; 403 const struct ubifs_info *c = wbuf->c;
403 404
404 dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead); 405 dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead));
405 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); 406 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
406 ubifs_assert(offs >= 0 && offs <= c->leb_size); 407 ubifs_assert(offs >= 0 && offs <= c->leb_size);
407 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); 408 ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -508,9 +509,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
508 struct ubifs_info *c = wbuf->c; 509 struct ubifs_info *c = wbuf->c;
509 int err, written, n, aligned_len = ALIGN(len, 8), offs; 510 int err, written, n, aligned_len = ALIGN(len, 8), offs;
510 511
511 dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len, 512 dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
512 dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead, 513 dbg_ntype(((struct ubifs_ch *)buf)->node_type),
513 wbuf->lnum, wbuf->offs + wbuf->used); 514 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used);
514 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); 515 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
515 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); 516 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
516 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 517 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -535,8 +536,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
535 memcpy(wbuf->buf + wbuf->used, buf, len); 536 memcpy(wbuf->buf + wbuf->used, buf, len);
536 537
537 if (aligned_len == wbuf->avail) { 538 if (aligned_len == wbuf->avail) {
538 dbg_io("flush jhead %d wbuf to LEB %d:%d", 539 dbg_io("flush jhead %s wbuf to LEB %d:%d",
539 wbuf->jhead, wbuf->lnum, wbuf->offs); 540 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
540 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, 541 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
541 wbuf->offs, c->min_io_size, 542 wbuf->offs, c->min_io_size,
542 wbuf->dtype); 543 wbuf->dtype);
@@ -564,8 +565,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
564 * minimal I/O unit. We have to fill and flush write-buffer and switch 565 * minimal I/O unit. We have to fill and flush write-buffer and switch
565 * to the next min. I/O unit. 566 * to the next min. I/O unit.
566 */ 567 */
567 dbg_io("flush jhead %d wbuf to LEB %d:%d", 568 dbg_io("flush jhead %s wbuf to LEB %d:%d",
568 wbuf->jhead, wbuf->lnum, wbuf->offs); 569 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
569 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); 570 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
570 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, 571 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
571 c->min_io_size, wbuf->dtype); 572 c->min_io_size, wbuf->dtype);
@@ -698,8 +699,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
698 int err, rlen, overlap; 699 int err, rlen, overlap;
699 struct ubifs_ch *ch = buf; 700 struct ubifs_ch *ch = buf;
700 701
701 dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs, 702 dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs,
702 dbg_ntype(type), len, wbuf->jhead); 703 dbg_ntype(type), len, dbg_jhead(wbuf->jhead));
703 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 704 ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
704 ubifs_assert(!(offs & 7) && offs < c->leb_size); 705 ubifs_assert(!(offs & 7) && offs < c->leb_size);
705 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); 706 ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
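All these dbg_io() sites trace the same write-buffer life cycle: bytes accumulate in the buffer, and once a minimal I/O unit is exactly full the buffer is flushed to flash. A self-contained toy of that accumulate-then-flush behaviour, with an invented buffer size:

#include <stdio.h>
#include <string.h>

#define MIN_IO 8        /* stand-in for c->min_io_size */

static char wbuf[MIN_IO];
static int used;

static void flush(void)
{
        printf("flush %d bytes: %.*s\n", used, used, wbuf);
        used = 0;
}

/* Like ubifs_wbuf_write_nolock(): fill, and flush whenever the buffer
 * holds exactly one minimal I/O unit. */
static void wbuf_write(const char *data, int len)
{
        while (len > 0) {
                int avail = MIN_IO - used;
                int n = len < avail ? len : avail;

                memcpy(wbuf + used, data, n);
                used += n;
                data += n;
                len -= n;
                if (used == MIN_IO)
                        flush();
        }
}

int main(void)
{
        wbuf_write("hello, ", 7);
        wbuf_write("write-buffer!", 13);
        if (used)
                flush();   /* sync the tail, like wbuf_sync_nolock() */
        return 0;
}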
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 64b5f3a309f5..d321baeca68d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -158,7 +158,7 @@ again:
158 * some. But the write-buffer mutex has to be unlocked because 158 * some. But the write-buffer mutex has to be unlocked because
159 * GC also takes it. 159 * GC also takes it.
160 */ 160 */
161 dbg_jnl("no free space jhead %d, run GC", jhead); 161 dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead));
162 mutex_unlock(&wbuf->io_mutex); 162 mutex_unlock(&wbuf->io_mutex);
163 163
164 lnum = ubifs_garbage_collect(c, 0); 164 lnum = ubifs_garbage_collect(c, 0);
@@ -173,7 +173,8 @@ again:
173 * because we dropped @wbuf->io_mutex, so try once 173 * because we dropped @wbuf->io_mutex, so try once
174 * again. 174 * again.
175 */ 175 */
176 dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead); 176 dbg_jnl("GC couldn't make a free LEB for jhead %s",
177 dbg_jhead(jhead));
177 if (retries++ < 2) { 178 if (retries++ < 2) {
178 dbg_jnl("retry (%d)", retries); 179 dbg_jnl("retry (%d)", retries);
179 goto again; 180 goto again;
@@ -184,7 +185,7 @@ again:
184 } 185 }
185 186
186 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 187 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
187 dbg_jnl("got LEB %d for jhead %d", lnum, jhead); 188 dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead));
188 avail = c->leb_size - wbuf->offs - wbuf->used; 189 avail = c->leb_size - wbuf->offs - wbuf->used;
189 190
190 if (wbuf->lnum != -1 && avail >= len) { 191 if (wbuf->lnum != -1 && avail >= len) {
@@ -255,7 +256,8 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
255 *lnum = c->jheads[jhead].wbuf.lnum; 256 *lnum = c->jheads[jhead].wbuf.lnum;
256 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; 257 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
257 258
258 dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); 259 dbg_jnl("jhead %s, LEB %d:%d, len %d",
260 dbg_jhead(jhead), *lnum, *offs, len);
259 ubifs_prepare_node(c, node, len, 0); 261 ubifs_prepare_node(c, node, len, 0);
260 262
261 return ubifs_wbuf_write_nolock(wbuf, node, len); 263 return ubifs_wbuf_write_nolock(wbuf, node, len);
@@ -285,7 +287,8 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
285 287
286 *lnum = c->jheads[jhead].wbuf.lnum; 288 *lnum = c->jheads[jhead].wbuf.lnum;
287 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; 289 *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
288 dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); 290 dbg_jnl("jhead %s, LEB %d:%d, len %d",
291 dbg_jhead(jhead), *lnum, *offs, len);
289 292
290 err = ubifs_wbuf_write_nolock(wbuf, buf, len); 293 err = ubifs_wbuf_write_nolock(wbuf, buf, len);
291 if (err) 294 if (err)
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 5fa27ea031ba..0f530c684f0b 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -229,23 +229,6 @@ static inline void xent_key_init(const struct ubifs_info *c,
229} 229}
230 230
231/** 231/**
232 * xent_key_init_hash - initialize extended attribute entry key without
233 * re-calculating hash function.
234 * @c: UBIFS file-system description object
235 * @key: key to initialize
236 * @inum: host inode number
237 * @hash: extended attribute entry name hash
238 */
239static inline void xent_key_init_hash(const struct ubifs_info *c,
240 union ubifs_key *key, ino_t inum,
241 uint32_t hash)
242{
243 ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
244 key->u32[0] = inum;
245 key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
246}
247
248/**
249 * xent_key_init_flash - initialize on-flash extended attribute entry key. 232 * xent_key_init_flash - initialize on-flash extended attribute entry key.
250 * @c: UBIFS file-system description object 233 * @c: UBIFS file-system description object
251 * @k: key to initialize 234 * @k: key to initialize
@@ -295,22 +278,15 @@ static inline void data_key_init(const struct ubifs_info *c,
295} 278}
296 279
297/** 280/**
298 * data_key_init_flash - initialize on-flash data key. 281 * highest_data_key - get the highest possible data key for an inode.
299 * @c: UBIFS file-system description object 282 * @c: UBIFS file-system description object
300 * @k: key to initialize 283 * @key: key to initialize
301 * @inum: inode number 284 * @inum: inode number
302 * @block: block number
303 */ 285 */
304static inline void data_key_init_flash(const struct ubifs_info *c, void *k, 286static inline void highest_data_key(const struct ubifs_info *c,
305 ino_t inum, unsigned int block) 287 union ubifs_key *key, ino_t inum)
306{ 288{
307 union ubifs_key *key = k; 289 data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK);
308
309 ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
310 key->j32[0] = cpu_to_le32(inum);
311 key->j32[1] = cpu_to_le32(block |
312 (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
313 memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
314} 290}
315 291
316/** 292/**
@@ -554,4 +530,5 @@ static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
554 return 0; 530 return 0;
555 } 531 }
556} 532}
533
557#endif /* !__UBIFS_KEY_H__ */ 534#endif /* !__UBIFS_KEY_H__ */
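The new highest_data_key() builds the largest possible data key for an inode by passing the full block mask to data_key_init(), which packs the block number and the key type into the second key word. A userspace model of that packing; the type value and bit split are assumptions patterned on the masks above:

#include <stdint.h>
#include <stdio.h>

#define KEY_BLOCK_BITS 29
#define KEY_BLOCK_MASK ((1u << KEY_BLOCK_BITS) - 1)
#define DATA_KEY_TYPE  2u       /* numeric value is an assumption */

struct key_model { uint32_t u32[2]; };

static void data_key_init(struct key_model *key, uint32_t inum,
                          uint32_t block)
{
        key->u32[0] = inum;
        key->u32[1] = block | (DATA_KEY_TYPE << KEY_BLOCK_BITS);
}

/* The largest data key for an inode: every block bit set. */
static void highest_data_key(struct key_model *key, uint32_t inum)
{
        data_key_init(key, inum, KEY_BLOCK_MASK);
}

int main(void)
{
        struct key_model key;

        highest_data_key(&key, 42);
        printf("inum %u, second word %#x\n", key.u32[0], key.u32[1]);
        return 0;
}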
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 56e33772a1ee..c345e125f42c 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -169,8 +169,8 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
169 */ 169 */
170 c->bud_bytes += c->leb_size - bud->start; 170 c->bud_bytes += c->leb_size - bud->start;
171 171
172 dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum, 172 dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum,
173 bud->start, bud->jhead, c->bud_bytes); 173 bud->start, dbg_jhead(bud->jhead), c->bud_bytes);
174 spin_unlock(&c->buds_lock); 174 spin_unlock(&c->buds_lock);
175} 175}
176 176
@@ -355,16 +355,16 @@ static void remove_buds(struct ubifs_info *c)
355 * heads (non-closed buds). 355 * heads (non-closed buds).
356 */ 356 */
357 c->cmt_bud_bytes += wbuf->offs - bud->start; 357 c->cmt_bud_bytes += wbuf->offs - bud->start;
358 dbg_log("preserve %d:%d, jhead %d, bud bytes %d, " 358 dbg_log("preserve %d:%d, jhead %s, bud bytes %d, "
359 "cmt_bud_bytes %lld", bud->lnum, bud->start, 359 "cmt_bud_bytes %lld", bud->lnum, bud->start,
360 bud->jhead, wbuf->offs - bud->start, 360 dbg_jhead(bud->jhead), wbuf->offs - bud->start,
361 c->cmt_bud_bytes); 361 c->cmt_bud_bytes);
362 bud->start = wbuf->offs; 362 bud->start = wbuf->offs;
363 } else { 363 } else {
364 c->cmt_bud_bytes += c->leb_size - bud->start; 364 c->cmt_bud_bytes += c->leb_size - bud->start;
365 dbg_log("remove %d:%d, jhead %d, bud bytes %d, " 365 dbg_log("remove %d:%d, jhead %s, bud bytes %d, "
366 "cmt_bud_bytes %lld", bud->lnum, bud->start, 366 "cmt_bud_bytes %lld", bud->lnum, bud->start,
367 bud->jhead, c->leb_size - bud->start, 367 dbg_jhead(bud->jhead), c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 /* 370 /*
@@ -429,7 +429,8 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
429 if (lnum == -1 || offs == c->leb_size) 429 if (lnum == -1 || offs == c->leb_size)
430 continue; 430 continue;
431 431
432 dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i); 432 dbg_log("add ref to LEB %d:%d for jhead %s",
433 lnum, offs, dbg_jhead(i));
433 ref = buf + len; 434 ref = buf + len;
434 ref->ch.node_type = UBIFS_REF_NODE; 435 ref->ch.node_type = UBIFS_REF_NODE;
435 ref->lnum = cpu_to_le32(lnum); 436 ref->lnum = cpu_to_le32(lnum);
@@ -695,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
695 lnum = c->ltail_lnum; 696 lnum = c->ltail_lnum;
696 write_lnum = lnum; 697 write_lnum = lnum;
697 while (1) { 698 while (1) {
698 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 699 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
699 if (IS_ERR(sleb)) { 700 if (IS_ERR(sleb)) {
700 err = PTR_ERR(sleb); 701 err = PTR_ERR(sleb);
701 goto out_free; 702 goto out_free;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4cdd284dea56..4d4ca388889b 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -281,7 +281,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
281 case LPROPS_FREE: 281 case LPROPS_FREE:
282 if (add_to_lpt_heap(c, lprops, cat)) 282 if (add_to_lpt_heap(c, lprops, cat))
283 break; 283 break;
284 /* No more room on heap so make it uncategorized */ 284 /* No more room on heap so make it un-categorized */
285 cat = LPROPS_UNCAT; 285 cat = LPROPS_UNCAT;
286 /* Fall through */ 286 /* Fall through */
287 case LPROPS_UNCAT: 287 case LPROPS_UNCAT:
@@ -375,8 +375,8 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
375 * @lprops: LEB properties 375 * @lprops: LEB properties
376 * 376 *
377 * A LEB may have fallen off of the bottom of a heap, and ended up as 377 * A LEB may have fallen off of the bottom of a heap, and ended up as
378 * uncategorized even though it has enough space for us now. If that is the case 378 * un-categorized even though it has enough space for us now. If that is the
379 * this function will put the LEB back onto a heap. 379 * case this function will put the LEB back onto a heap.
380 */ 380 */
381void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) 381void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
382{ 382{
@@ -436,10 +436,10 @@ int ubifs_categorize_lprops(const struct ubifs_info *c,
436/** 436/**
437 * change_category - change LEB properties category. 437 * change_category - change LEB properties category.
438 * @c: UBIFS file-system description object 438 * @c: UBIFS file-system description object
439 * @lprops: LEB properties to recategorize 439 * @lprops: LEB properties to re-categorize
440 * 440 *
441 * LEB properties are categorized to enable fast find operations. When the LEB 441 * LEB properties are categorized to enable fast find operations. When the LEB
442 * properties change they must be recategorized. 442 * properties change they must be re-categorized.
443 */ 443 */
444static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) 444static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
445{ 445{
@@ -461,21 +461,18 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
461} 461}
462 462
463/** 463/**
464 * calc_dark - calculate LEB dark space size. 464 * ubifs_calc_dark - calculate LEB dark space size.
465 * @c: the UBIFS file-system description object 465 * @c: the UBIFS file-system description object
466 * @spc: amount of free and dirty space in the LEB 466 * @spc: amount of free and dirty space in the LEB
467 * 467 *
468 * This function calculates amount of dark space in an LEB which has @spc bytes 468 * This function calculates and returns amount of dark space in an LEB which
469 * of free and dirty space. Returns the calculations result. 469 * has @spc bytes of free and dirty space.
470 * 470 *
471 * Dark space is the space which is not always usable - it depends on which 471 * UBIFS is trying to account for the space which might not be usable, and this
472 * nodes are written in which order. E.g., if an LEB has only 512 free bytes, 472 * space is called "dark space". For example, if an LEB has only %512 free
473 * it is dark space, because it cannot fit a large data node. So UBIFS cannot 473 * bytes, it is dark space, because it cannot fit a large data node.
474 * count on this LEB and treat these 512 bytes as usable because it is not true
475 * if, for example, only big chunks of uncompressible data will be written to
476 * the FS.
477 */ 474 */
478static int calc_dark(struct ubifs_info *c, int spc) 475int ubifs_calc_dark(const struct ubifs_info *c, int spc)
479{ 476{
480 ubifs_assert(!(spc & 7)); 477 ubifs_assert(!(spc & 7));
481 478
@@ -518,7 +515,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
518 * @free: new free space amount 515 * @free: new free space amount
519 * @dirty: new dirty space amount 516 * @dirty: new dirty space amount
520 * @flags: new flags 517 * @flags: new flags
521 * @idx_gc_cnt: change to the count of idx_gc list 518 * @idx_gc_cnt: change to the count of @idx_gc list
522 * 519 *
523 * This function changes LEB properties (@free, @dirty or @flag). However, the 520 * This function changes LEB properties (@free, @dirty or @flag). However, the
524 * property which has the %LPROPS_NC value is not changed. Returns a pointer to 521 * property which has the %LPROPS_NC value is not changed. Returns a pointer to
@@ -535,7 +532,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
535{ 532{
536 /* 533 /*
537 * This is the only function that is allowed to change lprops, so we 534 * This is the only function that is allowed to change lprops, so we
538 * discard the const qualifier. 535 * discard the "const" qualifier.
539 */ 536 */
540 struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; 537 struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
541 538
@@ -575,7 +572,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
575 if (old_spc < c->dead_wm) 572 if (old_spc < c->dead_wm)
576 c->lst.total_dead -= old_spc; 573 c->lst.total_dead -= old_spc;
577 else 574 else
578 c->lst.total_dark -= calc_dark(c, old_spc); 575 c->lst.total_dark -= ubifs_calc_dark(c, old_spc);
579 576
580 c->lst.total_used -= c->leb_size - old_spc; 577 c->lst.total_used -= c->leb_size - old_spc;
581 } 578 }
@@ -616,7 +613,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
616 if (new_spc < c->dead_wm) 613 if (new_spc < c->dead_wm)
617 c->lst.total_dead += new_spc; 614 c->lst.total_dead += new_spc;
618 else 615 else
619 c->lst.total_dark += calc_dark(c, new_spc); 616 c->lst.total_dark += ubifs_calc_dark(c, new_spc);
620 617
621 c->lst.total_used += c->leb_size - new_spc; 618 c->lst.total_used += c->leb_size - new_spc;
622 } 619 }
@@ -1096,7 +1093,7 @@ static int scan_check_cb(struct ubifs_info *c,
1096 } 1093 }
1097 } 1094 }
1098 1095
1099 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); 1096 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
1100 if (IS_ERR(sleb)) { 1097 if (IS_ERR(sleb)) {
1101 /* 1098 /*
1102 * After an unclean unmount, empty and freeable LEBs 1099 * After an unclean unmount, empty and freeable LEBs
@@ -1107,7 +1104,7 @@ static int scan_check_cb(struct ubifs_info *c,
1107 "- continuing checking"); 1104 "- continuing checking");
1108 lst->empty_lebs += 1; 1105 lst->empty_lebs += 1;
1109 lst->total_free += c->leb_size; 1106 lst->total_free += c->leb_size;
1110 lst->total_dark += calc_dark(c, c->leb_size); 1107 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1111 return LPT_SCAN_CONTINUE; 1108 return LPT_SCAN_CONTINUE;
1112 } 1109 }
1113 1110
@@ -1117,7 +1114,7 @@ static int scan_check_cb(struct ubifs_info *c,
1117 "- continuing checking"); 1114 "- continuing checking");
1118 lst->total_free += lp->free; 1115 lst->total_free += lp->free;
1119 lst->total_dirty += lp->dirty; 1116 lst->total_dirty += lp->dirty;
1120 lst->total_dark += calc_dark(c, c->leb_size); 1117 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1121 return LPT_SCAN_CONTINUE; 1118 return LPT_SCAN_CONTINUE;
1122 } 1119 }
1123 data->err = PTR_ERR(sleb); 1120 data->err = PTR_ERR(sleb);
@@ -1235,7 +1232,7 @@ static int scan_check_cb(struct ubifs_info *c,
1235 if (spc < c->dead_wm) 1232 if (spc < c->dead_wm)
1236 lst->total_dead += spc; 1233 lst->total_dead += spc;
1237 else 1234 else
1238 lst->total_dark += calc_dark(c, spc); 1235 lst->total_dark += ubifs_calc_dark(c, spc);
1239 } 1236 }
1240 1237
1241 ubifs_scan_destroy(sleb); 1238 ubifs_scan_destroy(sleb);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index a88f33801b98..28beaeedadc0 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -29,7 +29,8 @@
29 * @c: UBIFS file-system description object 29 * @c: UBIFS file-system description object
30 * 30 *
31 * This function scans the master node LEBs and searches for the latest master 31 * This function scans the master node LEBs and searches for the latest master
32 * node. Returns zero in case of success and a negative error code in case of 32 * node. Returns zero in case of success, %-EUCLEAN if the master area is
33 * corrupted and requires recovery, and a negative error code in case of
33 * failure. 34 * failure.
34 */ 35 */
35static int scan_for_master(struct ubifs_info *c) 36static int scan_for_master(struct ubifs_info *c)
@@ -40,7 +41,7 @@ static int scan_for_master(struct ubifs_info *c)
40 41
41 lnum = UBIFS_MST_LNUM; 42 lnum = UBIFS_MST_LNUM;
42 43
43 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 44 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
44 if (IS_ERR(sleb)) 45 if (IS_ERR(sleb))
45 return PTR_ERR(sleb); 46 return PTR_ERR(sleb);
46 nodes_cnt = sleb->nodes_cnt; 47 nodes_cnt = sleb->nodes_cnt;
@@ -48,7 +49,7 @@ static int scan_for_master(struct ubifs_info *c)
48 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, 49 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
49 list); 50 list);
50 if (snod->type != UBIFS_MST_NODE) 51 if (snod->type != UBIFS_MST_NODE)
51 goto out; 52 goto out_dump;
52 memcpy(c->mst_node, snod->node, snod->len); 53 memcpy(c->mst_node, snod->node, snod->len);
53 offs = snod->offs; 54 offs = snod->offs;
54 } 55 }
@@ -56,7 +57,7 @@ static int scan_for_master(struct ubifs_info *c)
56 57
57 lnum += 1; 58 lnum += 1;
58 59
59 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 60 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
60 if (IS_ERR(sleb)) 61 if (IS_ERR(sleb))
61 return PTR_ERR(sleb); 62 return PTR_ERR(sleb);
62 if (sleb->nodes_cnt != nodes_cnt) 63 if (sleb->nodes_cnt != nodes_cnt)
@@ -65,7 +66,7 @@ static int scan_for_master(struct ubifs_info *c)
65 goto out; 66 goto out;
66 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); 67 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
67 if (snod->type != UBIFS_MST_NODE) 68 if (snod->type != UBIFS_MST_NODE)
68 goto out; 69 goto out_dump;
69 if (snod->offs != offs) 70 if (snod->offs != offs)
70 goto out; 71 goto out;
71 if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, 72 if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
@@ -78,6 +79,12 @@ static int scan_for_master(struct ubifs_info *c)
78 79
79out: 80out:
80 ubifs_scan_destroy(sleb); 81 ubifs_scan_destroy(sleb);
82 return -EUCLEAN;
83
84out_dump:
85 ubifs_err("unexpected node type %d master LEB %d:%d",
86 snod->type, lnum, snod->offs);
87 ubifs_scan_destroy(sleb);
81 return -EINVAL; 88 return -EINVAL;
82} 89}
83 90
@@ -256,7 +263,8 @@ int ubifs_read_master(struct ubifs_info *c)
256 263
257 err = scan_for_master(c); 264 err = scan_for_master(c);
258 if (err) { 265 if (err) {
259 err = ubifs_recover_master_node(c); 266 if (err == -EUCLEAN)
267 err = ubifs_recover_master_node(c);
260 if (err) 268 if (err)
261 /* 269 /*
262 * Note, we do not free 'c->mst_node' here because the 270 * Note, we do not free 'c->mst_node' here because the
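scan_for_master() now distinguishes a corrupted master area (%-EUCLEAN, worth a recovery attempt) from structurally bogus content (%-EINVAL, a hard failure), and ubifs_read_master() only falls back to recovery for the former. The error-code contract in miniature, with stubbed-out results:

#include <errno.h>
#include <stdio.h>

static int scan_for_master(int simulate)
{
        return simulate;   /* 0, -EUCLEAN or -EINVAL in the real code */
}

static int recover_master_node(void)
{
        puts("running master-node recovery");
        return 0;
}

static int read_master(int simulate)
{
        int err = scan_for_master(simulate);

        if (err) {
                if (err == -EUCLEAN)       /* recoverable corruption */
                        err = recover_master_node();
                if (err)
                        return err;        /* unexpected node type etc. */
        }
        puts("master node loaded");
        return 0;
}

int main(void)
{
        read_master(0);
        read_master(-EUCLEAN);
        printf("hard failure -> %d\n", read_master(-EINVAL));
        return 0;
}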
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 152a7b34a141..82009c74b6a3 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -670,9 +670,10 @@ static int kill_orphans(struct ubifs_info *c)
670 struct ubifs_scan_leb *sleb; 670 struct ubifs_scan_leb *sleb;
671 671
672 dbg_rcvry("LEB %d", lnum); 672 dbg_rcvry("LEB %d", lnum);
673 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
674 if (IS_ERR(sleb)) { 674 if (IS_ERR(sleb)) {
675 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); 675 if (PTR_ERR(sleb) == -EUCLEAN)
676 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
676 if (IS_ERR(sleb)) { 677 if (IS_ERR(sleb)) {
677 err = PTR_ERR(sleb); 678 err = PTR_ERR(sleb);
678 break; 679 break;
@@ -899,7 +900,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 900 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
900 struct ubifs_scan_leb *sleb; 901 struct ubifs_scan_leb *sleb;
901 902
902 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); 903 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
903 if (IS_ERR(sleb)) { 904 if (IS_ERR(sleb)) {
904 err = PTR_ERR(sleb); 905 err = PTR_ERR(sleb);
905 break; 906 break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index e5f6cf8a1155..f94ddf7efba0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -286,7 +286,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
286 mst = mst2; 286 mst = mst2;
287 } 287 }
288 288
289 dbg_rcvry("recovered master node from LEB %d", 289 ubifs_msg("recovered master node from LEB %d",
290 (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); 290 (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
291 291
292 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); 292 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
@@ -790,7 +790,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
790 * We can only recover at the end of the log, so check that the 790 * We can only recover at the end of the log, so check that the
791 * next log LEB is empty or out of date. 791 * next log LEB is empty or out of date.
792 */ 792 */
793 sleb = ubifs_scan(c, next_lnum, 0, sbuf); 793 sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0);
794 if (IS_ERR(sleb)) 794 if (IS_ERR(sleb))
795 return sleb; 795 return sleb;
796 if (sleb->nodes_cnt) { 796 if (sleb->nodes_cnt) {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 2970500f32df..5c2d6d759a3e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -506,7 +506,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
506 if (c->need_recovery) 506 if (c->need_recovery)
507 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); 507 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
508 else 508 else
509 sleb = ubifs_scan(c, lnum, offs, c->sbuf); 509 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
510 if (IS_ERR(sleb)) 510 if (IS_ERR(sleb))
511 return PTR_ERR(sleb); 511 return PTR_ERR(sleb);
512 512
@@ -836,8 +836,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
836 const struct ubifs_cs_node *node; 836 const struct ubifs_cs_node *node;
837 837
838 dbg_mnt("replay log LEB %d:%d", lnum, offs); 838 dbg_mnt("replay log LEB %d:%d", lnum, offs);
839 sleb = ubifs_scan(c, lnum, offs, sbuf); 839 sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery);
840 if (IS_ERR(sleb) ) { 840 if (IS_ERR(sleb)) {
841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) 841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 return PTR_ERR(sleb); 842 return PTR_ERR(sleb);
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 892ebfee4fe5..96c525384191 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -108,10 +108,9 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
108 108
109 /* Make sure the node is padded to an 8-byte boundary */ 109 /* Make sure the node is padded to an 8-byte boundary */
110 if ((node_len + pad_len) & 7) { 110 if ((node_len + pad_len) & 7) {
111 if (!quiet) { 111 if (!quiet)
112 dbg_err("bad padding length %d - %d", 112 dbg_err("bad padding length %d - %d",
113 offs, offs + node_len + pad_len); 113 offs, offs + node_len + pad_len);
114 }
115 return SCANNED_A_BAD_PAD_NODE; 114 return SCANNED_A_BAD_PAD_NODE;
116 } 115 }
117 116
@@ -253,15 +252,19 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
253 * @c: UBIFS file-system description object 252 * @c: UBIFS file-system description object
254 * @lnum: logical eraseblock number 253 * @lnum: logical eraseblock number
255 * @offs: offset to start at (usually zero) 254 * @offs: offset to start at (usually zero)
256 * @sbuf: scan buffer (must be c->leb_size) 255 * @sbuf: scan buffer (must be of @c->leb_size bytes in size)
256 * @quiet: print no messages
257 * 257 *
258 * This function scans LEB number @lnum and returns complete information about 258 * This function scans LEB number @lnum and returns complete information about
259 * its contents. Returns the scanned information in case of success, 259 * its contents. Returns the scanned information in case of success,
260 * %-EUCLEAN if the LEB needs recovery, and other negative error codes in case 260 * %-EUCLEAN if the LEB needs recovery, and other negative error codes in case
261 * of failure. 261 * of failure.
262 *
263 * If @quiet is non-zero, this function does not print large and scary
264 * error messages and flash dumps in case of errors.
262 */ 265 */
263struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, 266struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
264 int offs, void *sbuf) 267 int offs, void *sbuf, int quiet)
265{ 268{
266 void *buf = sbuf + offs; 269 void *buf = sbuf + offs;
267 int err, len = c->leb_size - offs; 270 int err, len = c->leb_size - offs;
@@ -280,7 +283,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
280 283
281 cond_resched(); 284 cond_resched();
282 285
283 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); 286 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
284 if (ret > 0) { 287 if (ret > 0) {
285 /* Padding bytes or a valid padding node */ 288 /* Padding bytes or a valid padding node */
286 offs += ret; 289 offs += ret;
@@ -320,7 +323,9 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
320 } 323 }
321 324
322 if (offs % c->min_io_size) { 325 if (offs % c->min_io_size) {
323 ubifs_err("empty space starts at non-aligned offset %d", offs); 326 if (!quiet)
327 ubifs_err("empty space starts at non-aligned offset %d",
328 offs);
324 goto corrupted; 329 goto corrupted;
325 } 330 }
326 331
@@ -331,18 +336,25 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
331 break; 336 break;
332 for (; len; offs++, buf++, len--) 337 for (; len; offs++, buf++, len--)
333 if (*(uint8_t *)buf != 0xff) { 338 if (*(uint8_t *)buf != 0xff) {
334 ubifs_err("corrupt empty space at LEB %d:%d", 339 if (!quiet)
335 lnum, offs); 340 ubifs_err("corrupt empty space at LEB %d:%d",
341 lnum, offs);
336 goto corrupted; 342 goto corrupted;
337 } 343 }
338 344
339 return sleb; 345 return sleb;
340 346
341corrupted: 347corrupted:
342 ubifs_scanned_corruption(c, lnum, offs, buf); 348 if (!quiet) {
349 ubifs_scanned_corruption(c, lnum, offs, buf);
350 ubifs_err("LEB %d scanning failed", lnum);
351 }
343 err = -EUCLEAN; 352 err = -EUCLEAN;
353 ubifs_scan_destroy(sleb);
354 return ERR_PTR(err);
355
344error: 356error:
345 ubifs_err("LEB %d scanning failed", lnum); 357 ubifs_err("LEB %d scanning failed, error %d", lnum, err);
346 ubifs_scan_destroy(sleb); 358 ubifs_scan_destroy(sleb);
347 return ERR_PTR(err); 359 return ERR_PTR(err);
348} 360}
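The new @quiet argument is threaded from ubifs_scan() down through ubifs_scan_a_node() and every error path, so callers that expect corruption (master-node probing, orphan recovery) can suppress the dumps. The plumbing pattern, reduced to a sketch with illustrative messages:

#include <stdarg.h>
#include <stdio.h>

/* One flag gates all diagnostics on the scan path. */
static void scan_err(int quiet, const char *fmt, ...)
{
        va_list ap;

        if (quiet)
                return;           /* caller expects errors, stay silent */
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
        fputc('\n', stderr);
}

static int scan_leb(int lnum, int corrupt, int quiet)
{
        if (corrupt) {
                scan_err(quiet, "corrupt empty space at LEB %d", lnum);
                scan_err(quiet, "LEB %d scanning failed", lnum);
                return -1;        /* -EUCLEAN in the kernel */
        }
        return 0;
}

int main(void)
{
        scan_leb(7, 1, 0);   /* noisy: ordinary scan */
        scan_leb(7, 1, 1);   /* quiet: mount-time probe expecting errors */
        return 0;
}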
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 26d2e0d80465..333e181ee987 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,7 +36,6 @@
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h> 37#include <linux/math64.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/smp_lock.h>
40#include "ubifs.h" 39#include "ubifs.h"
41 40
42/* 41/*
@@ -318,6 +317,8 @@ static int ubifs_write_inode(struct inode *inode, int wait)
318 if (err) 317 if (err)
319 ubifs_err("can't write inode %lu, error %d", 318 ubifs_err("can't write inode %lu, error %d",
320 inode->i_ino, err); 319 inode->i_ino, err);
320 else
321 err = dbg_check_inode_size(c, inode, ui->ui_size);
321 } 322 }
322 323
323 ui->dirty = 0; 324 ui->dirty = 0;
@@ -438,12 +439,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
438{ 439{
439 int i, err; 440 int i, err;
440 struct ubifs_info *c = sb->s_fs_info; 441 struct ubifs_info *c = sb->s_fs_info;
441 struct writeback_control wbc = {
442 .sync_mode = WB_SYNC_ALL,
443 .range_start = 0,
444 .range_end = LLONG_MAX,
445 .nr_to_write = LONG_MAX,
446 };
447 442
448 /* 443 /*
449 * Zero @wait is just an advisory thing to help the file system shove 444 * Zero @wait is just an advisory thing to help the file system shove
@@ -454,17 +449,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
454 return 0; 449 return 0;
455 450
456 /* 451 /*
457 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
458 * pages, so synchronize them first, then commit the journal. Strictly
459 * speaking, it is not necessary to commit the journal here,
460 * synchronizing write-buffers would be enough. But committing makes
461 * UBIFS free space predictions much more accurate, so we want to let
462 * the user be able to get more accurate results of 'statfs()' after
463 * they synchronize the file system.
464 */
465 generic_sync_sb_inodes(sb, &wbc);
466
467 /*
468 * Synchronize write buffers, because 'ubifs_run_commit()' does not 452 * Synchronize write buffers, because 'ubifs_run_commit()' does not
469 * do this if it waits for an already running commit. 453 * do this if it waits for an already running commit.
470 */ 454 */
@@ -474,6 +458,13 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
474 return err; 458 return err;
475 } 459 }
476 460
461 /*
462 * Strictly speaking, it is not necessary to commit the journal here,
463 * synchronizing write-buffers would be enough. But committing makes
464 * UBIFS free space predictions much more accurate, so we want to let
465 * the user be able to get more accurate results of 'statfs()' after
466 * they synchronize the file system.
467 */
477 err = ubifs_run_commit(c); 468 err = ubifs_run_commit(c);
478 if (err) 469 if (err)
479 return err; 470 return err;
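Net effect of the sync_fs hunks: the explicit generic_sync_sb_inodes() call is gone (the VFS sync path now writes dirty inodes before invoking ->sync_fs()), leaving the write-buffer flush followed by the commit. A condensed stand-alone sketch of that ordering with stub helpers; it is not the actual UBIFS code:

struct ubifs_info_sketch { int jhead_cnt; };

static int sync_wbuf(struct ubifs_info_sketch *c, int i) { (void)c; (void)i; return 0; }
static int run_commit(struct ubifs_info_sketch *c) { (void)c; return 0; }

static int sync_fs_sketch(struct ubifs_info_sketch *c, int wait)
{
	int i, err;

	if (!wait)		/* zero 'wait' is advisory; nothing to do */
		return 0;

	/* 1. Flush the journal-head write-buffers first... */
	for (i = 0; i < c->jhead_cnt; i++) {
		err = sync_wbuf(c, i);
		if (err)
			return err;
	}

	/* 2. ...then commit: optional for correctness, but it makes the
	 * free-space predictions behind statfs() more accurate. */
	return run_commit(c);
}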
@@ -1726,8 +1717,6 @@ static void ubifs_put_super(struct super_block *sb)
1726 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, 1717 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
1727 c->vi.vol_id); 1718 c->vi.vol_id);
1728 1719
1729 lock_kernel();
1730
1731 /* 1720 /*
1732 * The following asserts are only valid if there has not been a failure 1721 * The following asserts are only valid if there has not been a failure
1733 * of the media. For example, there will be dirty inodes if we failed 1722 * of the media. For example, there will be dirty inodes if we failed
@@ -1792,8 +1781,6 @@ static void ubifs_put_super(struct super_block *sb)
1792 ubi_close_volume(c->ubi); 1781 ubi_close_volume(c->ubi);
1793 mutex_unlock(&c->umount_mutex); 1782 mutex_unlock(&c->umount_mutex);
1794 kfree(c); 1783 kfree(c);
1795
1796 unlock_kernel();
1797} 1784}
1798 1785
1799static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) 1786static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1809,22 +1796,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1809 return err; 1796 return err;
1810 } 1797 }
1811 1798
1812 lock_kernel();
1813 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1799 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1814 if (c->ro_media) { 1800 if (c->ro_media) {
1815 ubifs_msg("cannot re-mount due to prior errors"); 1801 ubifs_msg("cannot re-mount due to prior errors");
1816 unlock_kernel();
1817 return -EROFS; 1802 return -EROFS;
1818 } 1803 }
1819 err = ubifs_remount_rw(c); 1804 err = ubifs_remount_rw(c);
1820 if (err) { 1805 if (err)
1821 unlock_kernel();
1822 return err; 1806 return err;
1823 }
1824 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1807 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1825 if (c->ro_media) { 1808 if (c->ro_media) {
1826 ubifs_msg("cannot re-mount due to prior errors"); 1809 ubifs_msg("cannot re-mount due to prior errors");
1827 unlock_kernel();
1828 return -EROFS; 1810 return -EROFS;
1829 } 1811 }
1830 ubifs_remount_ro(c); 1812 ubifs_remount_ro(c);
@@ -1839,7 +1821,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1839 } 1821 }
1840 1822
1841 ubifs_assert(c->lst.taken_empty_lebs > 0); 1823 ubifs_assert(c->lst.taken_empty_lebs > 0);
1842 unlock_kernel();
1843 return 0; 1824 return 0;
1844} 1825}
1845 1826
@@ -1971,6 +1952,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1971 * 1952 *
1972 * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 1953 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
1973 */ 1954 */
 1955 c->bdi.name = "ubifs";
1974 c->bdi.capabilities = BDI_CAP_MAP_COPY; 1956 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1975 c->bdi.unplug_io_fn = default_unplug_io_fn; 1957 c->bdi.unplug_io_fn = default_unplug_io_fn;
1976 err = bdi_init(&c->bdi); 1958 err = bdi_init(&c->bdi);
@@ -1985,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1985 if (err) 1967 if (err)
1986 goto out_bdi; 1968 goto out_bdi;
1987 1969
1970 sb->s_bdi = &c->bdi;
1988 sb->s_fs_info = c; 1971 sb->s_fs_info = c;
1989 sb->s_magic = UBIFS_SUPER_MAGIC; 1972 sb->s_magic = UBIFS_SUPER_MAGIC;
1990 sb->s_blocksize = UBIFS_BLOCK_SIZE; 1973 sb->s_blocksize = UBIFS_BLOCK_SIZE;
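The two fill_super hunks give UBIFS a named, registered private backing_dev_info and point sb->s_bdi at it, so the per-bdi writeback code can attribute dirty pages to this mount. A sketch of the complete wiring; the bdi_register() call sits between the hunks shown and is reconstructed here, so its format string is an assumption:

	c->bdi.name = "ubifs";
	c->bdi.capabilities = BDI_CAP_MAP_COPY;	/* read-ahead stays off: ra_pages is 0 */
	c->bdi.unplug_io_fn = default_unplug_io_fn;
	err = bdi_init(&c->bdi);
	if (err)
		goto out_close;
	err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
			   c->vi.ubi_num, c->vi.vol_id);	/* name format assumed */
	if (err)
		goto out_bdi;
	...
	sb->s_bdi = &c->bdi;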
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f249f7b0d656..e5b1a7d00fa0 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1159,8 +1159,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
1159 * o exact match, i.e. the found zero-level znode contains key @key, then %1 1159 * o exact match, i.e. the found zero-level znode contains key @key, then %1
1160 * is returned and slot number of the matched branch is stored in @n; 1160 * is returned and slot number of the matched branch is stored in @n;
1161 * o not exact match, which means that zero-level znode does not contain 1161 * o not exact match, which means that zero-level znode does not contain
1162 * @key, then %0 is returned and slot number of the closed branch is stored 1162 * @key, then %0 is returned and slot number of the closest branch is stored
1163 * in @n; 1163 * in @n;
1164 * o @key is so small that it is even less than the lowest key of the 1164 * o @key is so small that it is even less than the lowest key of the
1165 * leftmost zero-level node, then %0 is returned and %0 is stored in @n. 1165 * leftmost zero-level node, then %0 is returned and %0 is stored in @n.
1166 * 1166 *
@@ -1433,7 +1433,7 @@ static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
1433 * @lnum: LEB number is returned here 1433 * @lnum: LEB number is returned here
1434 * @offs: offset is returned here 1434 * @offs: offset is returned here
1435 * 1435 *
1436 * This function look up and reads node with key @key. The caller has to make 1436 * This function looks up and reads node with key @key. The caller has to make
1437 * sure the @node buffer is large enough to fit the node. Returns zero in case 1437 * sure the @node buffer is large enough to fit the node. Returns zero in case
1438 * of success, %-ENOENT if the node was not found, and a negative error code in 1438 * of success, %-ENOENT if the node was not found, and a negative error code in
1439 * case of failure. The node location can be returned in @lnum and @offs. 1439 * case of failure. The node location can be returned in @lnum and @offs.
@@ -3268,3 +3268,73 @@ out_unlock:
3268 mutex_unlock(&c->tnc_mutex); 3268 mutex_unlock(&c->tnc_mutex);
3269 return err; 3269 return err;
3270} 3270}
3271
3272#ifdef CONFIG_UBIFS_FS_DEBUG
3273
3274/**
3275 * dbg_check_inode_size - check if inode size is correct.
3276 * @c: UBIFS file-system description object
3277 * @inum: inode number
3278 * @size: inode size
3279 *
 3280 * This function makes sure that the inode size (@size) is correct and that the
 3281 * inode has no data pages beyond @size. Returns zero if the inode is OK,
 3282 * %-EINVAL if it has a data page beyond @size, and another negative error code
 3283 * in case of other errors.
3284 */
3285int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
3286 loff_t size)
3287{
3288 int err, n;
3289 union ubifs_key from_key, to_key, *key;
3290 struct ubifs_znode *znode;
3291 unsigned int block;
3292
3293 if (!S_ISREG(inode->i_mode))
3294 return 0;
3295 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
3296 return 0;
3297
3298 block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
3299 data_key_init(c, &from_key, inode->i_ino, block);
3300 highest_data_key(c, &to_key, inode->i_ino);
3301
3302 mutex_lock(&c->tnc_mutex);
3303 err = ubifs_lookup_level0(c, &from_key, &znode, &n);
3304 if (err < 0)
3305 goto out_unlock;
3306
3307 if (err) {
3308 err = -EINVAL;
3309 key = &from_key;
3310 goto out_dump;
3311 }
3312
3313 err = tnc_next(c, &znode, &n);
3314 if (err == -ENOENT) {
3315 err = 0;
3316 goto out_unlock;
3317 }
3318 if (err < 0)
3319 goto out_unlock;
3320
3321 ubifs_assert(err == 0);
3322 key = &znode->zbranch[n].key;
3323 if (!key_in_range(c, key, &from_key, &to_key))
3324 goto out_unlock;
3325
3326out_dump:
3327 block = key_block(c, key);
 3328 ubifs_err("inode %lu has size %lld, but there is data at offset %lld "
3329 "(data key %s)", (unsigned long)inode->i_ino, size,
3330 ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
3331 dbg_dump_inode(c, inode);
3332 dbg_dump_stack();
3333 err = -EINVAL;
3334
3335out_unlock:
3336 mutex_unlock(&c->tnc_mutex);
3337 return err;
3338}
3339
3340#endif /* CONFIG_UBIFS_FS_DEBUG */
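The arithmetic at the heart of dbg_check_inode_size() rounds the byte size up to whole blocks to get the first data-block index that must not exist. A stand-alone check of that rounding; the 4 KiB block size is assumed for illustration:

#include <assert.h>
#include <stdio.h>

#define UBIFS_BLOCK_SHIFT 12			/* illustrative: 4 KiB blocks */
#define UBIFS_BLOCK_SIZE  (1 << UBIFS_BLOCK_SHIFT)

int main(void)
{
	long long size;

	for (size = 0; size <= 2 * UBIFS_BLOCK_SIZE; size += UBIFS_BLOCK_SIZE / 2) {
		unsigned int block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
		printf("size %6lld -> first out-of-range block %u\n", size, block);
	}

	/* size 0: any data page at all lies beyond the inode size */
	assert(((0LL + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT) == 0);
	/* a full 4 KiB file occupies block 0 only, so block 1 is the limit */
	assert(((4096LL + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT) == 1);
	return 0;
}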
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index fde8d127c768..53288e5d604e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -245,7 +245,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
245 * it is more comprehensive and less efficient than is needed for this 245 * it is more comprehensive and less efficient than is needed for this
246 * purpose. 246 * purpose.
247 */ 247 */
248 sleb = ubifs_scan(c, lnum, 0, c->ileb_buf); 248 sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0);
249 c->ileb_len = 0; 249 c->ileb_len = 0;
250 if (IS_ERR(sleb)) 250 if (IS_ERR(sleb))
251 return PTR_ERR(sleb); 251 return PTR_ERR(sleb);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 3eee07e0c495..191ca7863fe7 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -135,6 +135,13 @@
135/* The key is always at the same position in all keyed nodes */ 135/* The key is always at the same position in all keyed nodes */
136#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) 136#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
137 137
138/* Garbage collector journal head number */
139#define UBIFS_GC_HEAD 0
140/* Base journal head number */
141#define UBIFS_BASE_HEAD 1
142/* Data journal head number */
143#define UBIFS_DATA_HEAD 2
144
138/* 145/*
139 * LEB Properties Tree node types. 146 * LEB Properties Tree node types.
140 * 147 *
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a29349094422..b2d976366a46 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -105,12 +105,10 @@
105/* Number of non-data journal heads */ 105/* Number of non-data journal heads */
106#define NONDATA_JHEADS_CNT 2 106#define NONDATA_JHEADS_CNT 2
107 107
108/* Garbage collector head */ 108/* Shorter names for journal head numbers for internal usage */
109#define GCHD 0 109#define GCHD UBIFS_GC_HEAD
110/* Base journal head number */ 110#define BASEHD UBIFS_BASE_HEAD
111#define BASEHD 1 111#define DATAHD UBIFS_DATA_HEAD
112/* First "general purpose" journal head */
113#define DATAHD 2
114 112
115/* 'No change' value for 'ubifs_change_lp()' */ 113/* 'No change' value for 'ubifs_change_lp()' */
116#define LPROPS_NC 0x80000001 114#define LPROPS_NC 0x80000001
@@ -1451,7 +1449,7 @@ int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
1451 1449
1452/* scan.c */ 1450/* scan.c */
1453struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, 1451struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
1454 int offs, void *sbuf); 1452 int offs, void *sbuf, int quiet);
1455void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); 1453void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
1456int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, 1454int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
1457 int offs, int quiet); 1455 int offs, int quiet);
@@ -1676,6 +1674,7 @@ const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
1676const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); 1674const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
1677const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); 1675const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
1678const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); 1676const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1677int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1679 1678
1680/* file.c */ 1679/* file.c */
1681int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1680int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
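Taken together, the ubifs-media.h and ubifs.h hunks promote the journal-head numbering from private shorthands to part of the media format definition, keeping the old names as aliases. Restated compactly:

/* Journal-head numbering, now fixed by ubifs-media.h. Heads 0 and 1
 * carry no user data, which is what NONDATA_JHEADS_CNT == 2 expresses;
 * GCHD, BASEHD and DATAHD remain as shorter internal aliases. */
enum {
	UBIFS_GC_HEAD	= 0,	/* garbage collector head (GCHD) */
	UBIFS_BASE_HEAD	= 1,	/* base head (BASEHD) */
	UBIFS_DATA_HEAD	= 2,	/* data head (DATAHD) */
};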
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index adafcf556531..195830f47569 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -78,9 +78,9 @@ enum {
78 SECURITY_XATTR, 78 SECURITY_XATTR,
79}; 79};
80 80
81static struct inode_operations none_inode_operations; 81static const struct inode_operations none_inode_operations;
82static struct address_space_operations none_address_operations; 82static const struct address_space_operations none_address_operations;
83static struct file_operations none_file_operations; 83static const struct file_operations none_file_operations;
84 84
85/** 85/**
86 * create_xattr - create an extended attribute. 86 * create_xattr - create an extended attribute.
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1d2c570704c8..2ffdb6733af1 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -18,59 +18,6 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20 20
21#if 0
22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
23 uint8_t ad_size, struct kernel_lb_addr fe_loc,
24 int *pos, int *offset, struct buffer_head **bh,
25 int *error)
26{
27 int loffset = *offset;
28 int block;
29 uint8_t *ad;
30 int remainder;
31
32 *error = 0;
33
34 ad = (uint8_t *)(*bh)->b_data + *offset;
35 *offset += ad_size;
36
37 if (!ad) {
38 brelse(*bh);
39 *error = 1;
40 return NULL;
41 }
42
43 if (*offset == dir->i_sb->s_blocksize) {
44 brelse(*bh);
45 block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
46 if (!block)
47 return NULL;
48 *bh = udf_tread(dir->i_sb, block);
49 if (!*bh)
50 return NULL;
51 } else if (*offset > dir->i_sb->s_blocksize) {
52 ad = tmpad;
53
54 remainder = dir->i_sb->s_blocksize - loffset;
55 memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
56
57 brelse(*bh);
58 block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
59 if (!block)
60 return NULL;
61 (*bh) = udf_tread(dir->i_sb, block);
62 if (!*bh)
63 return NULL;
64
65 memcpy((uint8_t *)ad + remainder, (*bh)->b_data,
66 ad_size - remainder);
67 *offset = ad_size - remainder;
68 }
69
70 return ad;
71}
72#endif
73
74struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, 21struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
75 struct udf_fileident_bh *fibh, 22 struct udf_fileident_bh *fibh,
76 struct fileIdentDesc *cfi, 23 struct fileIdentDesc *cfi,
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
248 return fi; 195 return fi;
249} 196}
250 197
251#if 0
252static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
253{
254 struct extent_ad *ext;
255 struct fileEntry *fe;
256 uint8_t *ptr;
257
258 if ((!buffer) || (!offset)) {
259 printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n");
260 return NULL;
261 }
262
263 fe = (struct fileEntry *)buffer;
264
265 if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) {
266 udf_debug("0x%x != TAG_IDENT_FE\n",
267 le16_to_cpu(fe->descTag.tagIdent));
268 return NULL;
269 }
270
271 ptr = (uint8_t *)(fe->extendedAttr) +
272 le32_to_cpu(fe->lengthExtendedAttr);
273
274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
275 ptr += *offset;
276
277 ext = (struct extent_ad *)ptr;
278
279 *offset = *offset + sizeof(struct extent_ad);
280 return ext;
281}
282#endif
283
284struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, 198struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
285 int inc) 199 int inc)
286{ 200{
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7464305382b5..b80cbd78833c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
193static int udf_release_file(struct inode *inode, struct file *filp) 193static int udf_release_file(struct inode *inode, struct file *filp)
194{ 194{
195 if (filp->f_mode & FMODE_WRITE) { 195 if (filp->f_mode & FMODE_WRITE) {
196 mutex_lock(&inode->i_mutex);
196 lock_kernel(); 197 lock_kernel();
197 udf_discard_prealloc(inode); 198 udf_discard_prealloc(inode);
198 unlock_kernel(); 199 unlock_kernel();
200 mutex_unlock(&inode->i_mutex);
199 } 201 }
200 return 0; 202 return 0;
201} 203}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e7533f785636..6d24c2c63f93 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -90,19 +90,16 @@ no_delete:
90} 90}
91 91
92/* 92/*
93 * If we are going to release inode from memory, we discard preallocation and 93 * If we are going to release inode from memory, we truncate last inode extent
94 * truncate last inode extent to proper length. We could use drop_inode() but 94 * to proper length. We could use drop_inode() but it's called under inode_lock
95 * it's called under inode_lock and thus we cannot mark inode dirty there. We 95 * and thus we cannot mark inode dirty there. We use clear_inode() but we have
96 * use clear_inode() but we have to make sure to write inode as it's not written 96 * to make sure to write inode as it's not written automatically.
97 * automatically.
98 */ 97 */
99void udf_clear_inode(struct inode *inode) 98void udf_clear_inode(struct inode *inode)
100{ 99{
101 struct udf_inode_info *iinfo; 100 struct udf_inode_info *iinfo;
102 if (!(inode->i_sb->s_flags & MS_RDONLY)) { 101 if (!(inode->i_sb->s_flags & MS_RDONLY)) {
103 lock_kernel(); 102 lock_kernel();
104 /* Discard preallocation for directories, symlinks, etc. */
105 udf_discard_prealloc(inode);
106 udf_truncate_tail_extent(inode); 103 udf_truncate_tail_extent(inode);
107 unlock_kernel(); 104 unlock_kernel();
108 write_inode_now(inode, 0); 105 write_inode_now(inode, 0);
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
664 udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); 661 udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
665 662
666#ifdef UDF_PREALLOCATE 663#ifdef UDF_PREALLOCATE
 667 /* preallocate blocks */ 664 /* We preallocate blocks only for regular files. It would also make
 668 udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); 665 * sense for directories, but it is unclear when to drop the
 666 * preallocation. We might use some delayed work for that, but that
 667 * feels like overengineering for a filesystem like UDF. */
668 if (S_ISREG(inode->i_mode))
669 udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
669#endif 670#endif
670 671
671 /* merge any continuous blocks in laarr */ 672 /* merge any continuous blocks in laarr */
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 1b88fd5df05d..43e24a3b8e10 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb)
36 ms_info.addr_format = CDROM_LBA; 36 ms_info.addr_format = CDROM_LBA;
37 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); 37 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
38 38
39#define WE_OBEY_THE_WRITTEN_STANDARDS 1
40
41 if (i == 0) { 39 if (i == 0) {
42 udf_debug("XA disk: %s, vol_desc_start=%d\n", 40 udf_debug("XA disk: %s, vol_desc_start=%d\n",
43 (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); 41 (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
44#if WE_OBEY_THE_WRITTEN_STANDARDS
45 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ 42 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
46#endif
47 vol_desc_start = ms_info.addr.lba; 43 vol_desc_start = ms_info.addr.lba;
48 } else { 44 } else {
49 udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); 45 udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6a29fa34c478..21dad8c608f9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
943 pc->componentType = 1; 943 pc->componentType = 1;
944 pc->lengthComponentIdent = 0; 944 pc->lengthComponentIdent = 0;
945 pc->componentFileVersionNum = 0; 945 pc->componentFileVersionNum = 0;
946 pc += sizeof(struct pathComponent);
947 elen += sizeof(struct pathComponent); 946 elen += sizeof(struct pathComponent);
948 } 947 }
949 948
diff --git a/fs/xattr.c b/fs/xattr.c
index 1c3d0af59ddf..6d4f6d3449fb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask)
66 return inode_permission(inode, mask); 66 return inode_permission(inode, mask);
67} 67}
68 68
69int 69/**
70vfs_setxattr(struct dentry *dentry, const char *name, const void *value, 70 * __vfs_setxattr_noperm - perform setxattr operation without performing
71 size_t size, int flags) 71 * permission checks.
72 *
 73 * @dentry: object to perform setxattr on
 74 * @name: xattr name to set
 75 * @value: value to set @name to
 76 * @size: size of @value
 77 * @flags: flags to pass into filesystem operations
 78 *
 79 * Returns the result of the internal setxattr or setsecurity operations.
80 *
81 * This function requires the caller to lock the inode's i_mutex before it
82 * is executed. It also assumes that the caller will make the appropriate
83 * permission checks.
84 */
85int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
86 const void *value, size_t size, int flags)
72{ 87{
73 struct inode *inode = dentry->d_inode; 88 struct inode *inode = dentry->d_inode;
74 int error; 89 int error = -EOPNOTSUPP;
75
76 error = xattr_permission(inode, name, MAY_WRITE);
77 if (error)
78 return error;
79 90
80 mutex_lock(&inode->i_mutex);
81 error = security_inode_setxattr(dentry, name, value, size, flags);
82 if (error)
83 goto out;
84 error = -EOPNOTSUPP;
85 if (inode->i_op->setxattr) { 91 if (inode->i_op->setxattr) {
86 error = inode->i_op->setxattr(dentry, name, value, size, flags); 92 error = inode->i_op->setxattr(dentry, name, value, size, flags);
87 if (!error) { 93 if (!error) {
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
97 if (!error) 103 if (!error)
98 fsnotify_xattr(dentry); 104 fsnotify_xattr(dentry);
99 } 105 }
106
107 return error;
108}
109
110
111int
112vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
113 size_t size, int flags)
114{
115 struct inode *inode = dentry->d_inode;
116 int error;
117
118 error = xattr_permission(inode, name, MAY_WRITE);
119 if (error)
120 return error;
121
122 mutex_lock(&inode->i_mutex);
123 error = security_inode_setxattr(dentry, name, value, size, flags);
124 if (error)
125 goto out;
126
127 error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
128
100out: 129out:
101 mutex_unlock(&inode->i_mutex); 130 mutex_unlock(&inode->i_mutex);
102 return error; 131 return error;
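The split lets an in-kernel caller that already holds i_mutex and has made its own security decision (setting security labels on behalf of an LSM is the intended user) reuse the filesystem plumbing without re-running the permission hooks. A hypothetical caller sketch, not code from this patch:

static int set_security_label(struct dentry *dentry, const char *name,
			      const void *ctx, size_t len)
{
	struct inode *inode = dentry->d_inode;
	int err;

	mutex_lock(&inode->i_mutex);
	/* Trusted caller: skip xattr_permission() and the
	 * security_inode_setxattr() hook, but keep the ->setxattr
	 * call and fsnotify handling inside __vfs_setxattr_noperm(). */
	err = __vfs_setxattr_noperm(dentry, name, ctx, len, 0);
	mutex_unlock(&inode->i_mutex);
	return err;
}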
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..381854461b28 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
216 if (ip->i_d.di_size < isize) { 216 if (ip->i_d.di_size < isize) {
217 ip->i_d.di_size = isize; 217 ip->i_d.di_size = isize;
218 ip->i_update_core = 1; 218 ip->i_update_core = 1;
219 ip->i_update_size = 1;
220 xfs_mark_inode_dirty_sync(ip); 219 xfs_mark_inode_dirty_sync(ip);
221 } 220 }
222 221
@@ -1636,4 +1635,5 @@ const struct address_space_operations xfs_address_space_operations = {
1636 .direct_IO = xfs_vm_direct_IO, 1635 .direct_IO = xfs_vm_direct_IO,
1637 .migratepage = buffer_migrate_page, 1636 .migratepage = buffer_migrate_page,
1638 .is_partially_uptodate = block_is_partially_uptodate, 1637 .is_partially_uptodate = block_is_partially_uptodate,
1638 .error_remove_page = generic_error_remove_page,
1639}; 1639};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..629370974e57 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -42,7 +42,7 @@
42 42
43#include <linux/dcache.h> 43#include <linux/dcache.h>
44 44
45static struct vm_operations_struct xfs_file_vm_ops; 45static const struct vm_operations_struct xfs_file_vm_ops;
46 46
47STATIC ssize_t 47STATIC ssize_t
48xfs_file_aio_read( 48xfs_file_aio_read(
@@ -172,12 +172,21 @@ xfs_file_release(
172 */ 172 */
173STATIC int 173STATIC int
174xfs_file_fsync( 174xfs_file_fsync(
175 struct file *filp, 175 struct file *file,
176 struct dentry *dentry, 176 struct dentry *dentry,
177 int datasync) 177 int datasync)
178{ 178{
179 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); 179 struct inode *inode = dentry->d_inode;
180 return -xfs_fsync(XFS_I(dentry->d_inode)); 180 struct xfs_inode *ip = XFS_I(inode);
181 int error;
182
183 /* capture size updates in I/O completion before writing the inode. */
184 error = filemap_fdatawait(inode->i_mapping);
185 if (error)
186 return error;
187
188 xfs_iflags_clear(ip, XFS_ITRUNCATED);
189 return -xfs_fsync(ip);
181} 190}
182 191
183STATIC int 192STATIC int
@@ -271,7 +280,7 @@ const struct file_operations xfs_dir_file_operations = {
271 .fsync = xfs_file_fsync, 280 .fsync = xfs_file_fsync,
272}; 281};
273 282
274static struct vm_operations_struct xfs_file_vm_ops = { 283static const struct vm_operations_struct xfs_file_vm_ops = {
275 .fault = filemap_fault, 284 .fault = filemap_fault,
276 .page_mkwrite = xfs_vm_page_mkwrite, 285 .page_mkwrite = xfs_vm_page_mkwrite,
277}; 286};
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
619 case XFS_IOC_GETVERSION_32: 619 case XFS_IOC_GETVERSION_32:
620 cmd = _NATIVE_IOC(cmd, long); 620 cmd = _NATIVE_IOC(cmd, long);
621 return xfs_file_ioctl(filp, cmd, p); 621 return xfs_file_ioctl(filp, cmd, p);
622 case XFS_IOC_SWAPEXT: { 622 case XFS_IOC_SWAPEXT_32: {
623 struct xfs_swapext sxp; 623 struct xfs_swapext sxp;
624 struct compat_xfs_swapext __user *sxu = arg; 624 struct compat_xfs_swapext __user *sxu = arg;
625 625
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..da0159d99f82 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_itable.h" 44#include "xfs_itable.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_acl.h"
47#include "xfs_attr.h" 46#include "xfs_attr.h"
48#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
49#include "xfs_utils.h" 48#include "xfs_utils.h"
@@ -485,14 +484,6 @@ xfs_vn_put_link(
485} 484}
486 485
487STATIC int 486STATIC int
488xfs_vn_permission(
489 struct inode *inode,
490 int mask)
491{
492 return generic_permission(inode, mask, xfs_check_acl);
493}
494
495STATIC int
496xfs_vn_getattr( 487xfs_vn_getattr(
497 struct vfsmount *mnt, 488 struct vfsmount *mnt,
498 struct dentry *dentry, 489 struct dentry *dentry,
@@ -696,7 +687,7 @@ xfs_vn_fiemap(
696} 687}
697 688
698static const struct inode_operations xfs_inode_operations = { 689static const struct inode_operations xfs_inode_operations = {
699 .permission = xfs_vn_permission, 690 .check_acl = xfs_check_acl,
700 .truncate = xfs_vn_truncate, 691 .truncate = xfs_vn_truncate,
701 .getattr = xfs_vn_getattr, 692 .getattr = xfs_vn_getattr,
702 .setattr = xfs_vn_setattr, 693 .setattr = xfs_vn_setattr,
@@ -724,7 +715,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
724 .rmdir = xfs_vn_unlink, 715 .rmdir = xfs_vn_unlink,
725 .mknod = xfs_vn_mknod, 716 .mknod = xfs_vn_mknod,
726 .rename = xfs_vn_rename, 717 .rename = xfs_vn_rename,
727 .permission = xfs_vn_permission, 718 .check_acl = xfs_check_acl,
728 .getattr = xfs_vn_getattr, 719 .getattr = xfs_vn_getattr,
729 .setattr = xfs_vn_setattr, 720 .setattr = xfs_vn_setattr,
730 .setxattr = generic_setxattr, 721 .setxattr = generic_setxattr,
@@ -749,7 +740,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
749 .rmdir = xfs_vn_unlink, 740 .rmdir = xfs_vn_unlink,
750 .mknod = xfs_vn_mknod, 741 .mknod = xfs_vn_mknod,
751 .rename = xfs_vn_rename, 742 .rename = xfs_vn_rename,
752 .permission = xfs_vn_permission, 743 .check_acl = xfs_check_acl,
753 .getattr = xfs_vn_getattr, 744 .getattr = xfs_vn_getattr,
754 .setattr = xfs_vn_setattr, 745 .setattr = xfs_vn_setattr,
755 .setxattr = generic_setxattr, 746 .setxattr = generic_setxattr,
@@ -762,7 +753,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
762 .readlink = generic_readlink, 753 .readlink = generic_readlink,
763 .follow_link = xfs_vn_follow_link, 754 .follow_link = xfs_vn_follow_link,
764 .put_link = xfs_vn_put_link, 755 .put_link = xfs_vn_put_link,
765 .permission = xfs_vn_permission, 756 .check_acl = xfs_check_acl,
766 .getattr = xfs_vn_getattr, 757 .getattr = xfs_vn_getattr,
767 .setattr = xfs_vn_setattr, 758 .setattr = xfs_vn_setattr,
768 .setxattr = generic_setxattr, 759 .setxattr = generic_setxattr,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,18 +812,21 @@ write_retry:
812 812
813 /* Handle various SYNC-type writes */ 813 /* Handle various SYNC-type writes */
814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
815 loff_t end = pos + ret - 1;
815 int error2; 816 int error2;
816 817
817 xfs_iunlock(xip, iolock); 818 xfs_iunlock(xip, iolock);
818 if (need_i_mutex) 819 if (need_i_mutex)
819 mutex_unlock(&inode->i_mutex); 820 mutex_unlock(&inode->i_mutex);
820 error2 = sync_page_range(inode, mapping, pos, ret); 821
822 error2 = filemap_write_and_wait_range(mapping, pos, end);
821 if (!error) 823 if (!error)
822 error = error2; 824 error = error2;
823 if (need_i_mutex) 825 if (need_i_mutex)
824 mutex_lock(&inode->i_mutex); 826 mutex_lock(&inode->i_mutex);
825 xfs_ilock(xip, iolock); 827 xfs_ilock(xip, iolock);
826 error2 = xfs_write_sync_logforce(mp, xip); 828
829 error2 = xfs_fsync(xip);
827 if (!error) 830 if (!error)
828 error = error2; 831 error = error2;
829 } 832 }
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index cb6e2cca214f..9e41f91aa269 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -150,7 +150,7 @@ xfs_fs_set_xquota(
150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
151} 151}
152 152
153struct quotactl_ops xfs_quotactl_operations = { 153const struct quotactl_ops xfs_quotactl_operations = {
154 .quota_sync = xfs_fs_quota_sync, 154 .quota_sync = xfs_fs_quota_sync,
155 .get_xstate = xfs_fs_get_xstate, 155 .get_xstate = xfs_fs_get_xstate,
156 .set_xstate = xfs_fs_set_xstate, 156 .set_xstate = xfs_fs_set_xstate,
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
20 20
21DEFINE_PER_CPU(struct xfsstats, xfsstats); 21DEFINE_PER_CPU(struct xfsstats, xfsstats);
22 22
23STATIC int 23static int xfs_stat_proc_show(struct seq_file *m, void *v)
24xfs_read_xfsstats(
25 char *buffer,
26 char **start,
27 off_t offset,
28 int count,
29 int *eof,
30 void *data)
31{ 24{
32 int c, i, j, len, val; 25 int c, i, j, val;
33 __uint64_t xs_xstrat_bytes = 0; 26 __uint64_t xs_xstrat_bytes = 0;
34 __uint64_t xs_write_bytes = 0; 27 __uint64_t xs_write_bytes = 0;
35 __uint64_t xs_read_bytes = 0; 28 __uint64_t xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
60 }; 53 };
61 54
62 /* Loop over all stats groups */ 55 /* Loop over all stats groups */
 63 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { 56 for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
64 len += sprintf(buffer + len, "%s", xstats[i].desc); 57 seq_printf(m, "%s", xstats[i].desc);
65 /* inner loop does each group */ 58 /* inner loop does each group */
66 while (j < xstats[i].endpoint) { 59 while (j < xstats[i].endpoint) {
67 val = 0; 60 val = 0;
68 /* sum over all cpus */ 61 /* sum over all cpus */
69 for_each_possible_cpu(c) 62 for_each_possible_cpu(c)
70 val += *(((__u32*)&per_cpu(xfsstats, c) + j)); 63 val += *(((__u32*)&per_cpu(xfsstats, c) + j));
71 len += sprintf(buffer + len, " %u", val); 64 seq_printf(m, " %u", val);
72 j++; 65 j++;
73 } 66 }
74 buffer[len++] = '\n'; 67 seq_putc(m, '\n');
75 } 68 }
76 /* extra precision counters */ 69 /* extra precision counters */
77 for_each_possible_cpu(i) { 70 for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
80 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; 73 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
81 } 74 }
82 75
83 len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", 76 seq_printf(m, "xpc %Lu %Lu %Lu\n",
84 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); 77 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
85 len += sprintf(buffer + len, "debug %u\n", 78 seq_printf(m, "debug %u\n",
86#if defined(DEBUG) 79#if defined(DEBUG)
87 1); 80 1);
88#else 81#else
89 0); 82 0);
90#endif 83#endif
84 return 0;
85}
91 86
92 if (offset >= len) { 87static int xfs_stat_proc_open(struct inode *inode, struct file *file)
93 *start = buffer; 88{
94 *eof = 1; 89 return single_open(file, xfs_stat_proc_show, NULL);
95 return 0;
96 }
97 *start = buffer + offset;
98 if ((len -= offset) > count)
99 return count;
100 *eof = 1;
101
102 return len;
103} 90}
104 91
92static const struct file_operations xfs_stat_proc_fops = {
93 .owner = THIS_MODULE,
94 .open = xfs_stat_proc_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
105int 100int
106xfs_init_procfs(void) 101xfs_init_procfs(void)
107{ 102{
108 if (!proc_mkdir("fs/xfs", NULL)) 103 if (!proc_mkdir("fs/xfs", NULL))
109 goto out; 104 goto out;
110 105
111 if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, 106 if (!proc_create("fs/xfs/stat", 0, NULL,
112 xfs_read_xfsstats, NULL)) 107 &xfs_stat_proc_fops))
113 goto out_remove_entry; 108 goto out_remove_entry;
114 return 0; 109 return 0;
115 110
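This is the standard migration from create_proc_read_entry(), with its manual offset/eof bookkeeping, to seq_file's single_open(). The generic shape of the pattern, reduced to a minimal module sketch (not the XFS code):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "counter %u\n", 42);	/* seq_file does the buffering */
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
	.owner	 = THIS_MODULE,
	.open	 = demo_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};

static int __init demo_init(void)
{
	return proc_create("demo", 0, NULL, &demo_fops) ? 0 : -ENOMEM;
}
module_init(demo_init);
MODULE_LICENSE("GPL");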
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..bdd41c8c342f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -67,7 +67,7 @@
67#include <linux/freezer.h> 67#include <linux/freezer.h>
68#include <linux/parser.h> 68#include <linux/parser.h>
69 69
70static struct super_operations xfs_super_operations; 70static const struct super_operations xfs_super_operations;
71static kmem_zone_t *xfs_ioend_zone; 71static kmem_zone_t *xfs_ioend_zone;
72mempool_t *xfs_ioend_pool; 72mempool_t *xfs_ioend_pool;
73 73
@@ -579,15 +579,19 @@ xfs_showargs(
579 else if (mp->m_qflags & XFS_UQUOTA_ACCT) 579 else if (mp->m_qflags & XFS_UQUOTA_ACCT)
580 seq_puts(m, "," MNTOPT_UQUOTANOENF); 580 seq_puts(m, "," MNTOPT_UQUOTANOENF);
581 581
582 if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 582 /* Either project or group quotas can be active, not both */
583 seq_puts(m, "," MNTOPT_PRJQUOTA); 583
584 else if (mp->m_qflags & XFS_PQUOTA_ACCT) 584 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
585 seq_puts(m, "," MNTOPT_PQUOTANOENF); 585 if (mp->m_qflags & XFS_OQUOTA_ENFD)
586 586 seq_puts(m, "," MNTOPT_PRJQUOTA);
587 if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD)) 587 else
588 seq_puts(m, "," MNTOPT_GRPQUOTA); 588 seq_puts(m, "," MNTOPT_PQUOTANOENF);
589 else if (mp->m_qflags & XFS_GQUOTA_ACCT) 589 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
590 seq_puts(m, "," MNTOPT_GQUOTANOENF); 590 if (mp->m_qflags & XFS_OQUOTA_ENFD)
591 seq_puts(m, "," MNTOPT_GRPQUOTA);
592 else
593 seq_puts(m, "," MNTOPT_GQUOTANOENF);
594 }
591 595
592 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) 596 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
593 seq_puts(m, "," MNTOPT_NOQUOTA); 597 seq_puts(m, "," MNTOPT_NOQUOTA);
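The rewritten xfs_showargs() block is easier to read in isolation: project and group quota accounting are mutually exclusive, and XFS_OQUOTA_ENFD only qualifies whichever one is active (the old code could emit a group option purely because OQUOTA_ENFD was set for project quota). A stand-alone restatement, with invented flag values and the MNTOPT_* strings written out:

#include <stdio.h>

#define XFS_PQUOTA_ACCT 0x1
#define XFS_GQUOTA_ACCT 0x2
#define XFS_OQUOTA_ENFD 0x4

static const char *quota_opt(unsigned int qflags)
{
	if (qflags & XFS_PQUOTA_ACCT)
		return (qflags & XFS_OQUOTA_ENFD) ? "prjquota" : "pqnoenforce";
	if (qflags & XFS_GQUOTA_ACCT)
		return (qflags & XFS_OQUOTA_ENFD) ? "grpquota" : "gqnoenforce";
	return "";
}

int main(void)
{
	printf("%s\n", quota_opt(XFS_PQUOTA_ACCT | XFS_OQUOTA_ENFD));	/* prjquota */
	printf("%s\n", quota_opt(XFS_GQUOTA_ACCT));			/* gqnoenforce */
	return 0;
}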
@@ -687,7 +691,7 @@ xfs_barrier_test(
687 return error; 691 return error;
688} 692}
689 693
690void 694STATIC void
691xfs_mountfs_check_barriers(xfs_mount_t *mp) 695xfs_mountfs_check_barriers(xfs_mount_t *mp)
692{ 696{
693 int error; 697 int error;
@@ -1532,7 +1536,7 @@ xfs_fs_get_sb(
1532 mnt); 1536 mnt);
1533} 1537}
1534 1538
1535static struct super_operations xfs_super_operations = { 1539static const struct super_operations xfs_super_operations = {
1536 .alloc_inode = xfs_fs_alloc_inode, 1540 .alloc_inode = xfs_fs_alloc_inode,
1537 .destroy_inode = xfs_fs_destroy_inode, 1541 .destroy_inode = xfs_fs_destroy_inode,
1538 .write_inode = xfs_fs_write_inode, 1542 .write_inode = xfs_fs_write_inode,
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 5a2ea3a21781..18175ebd58ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
93 93
94extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
95extern struct xattr_handler *xfs_xattr_handlers[]; 95extern struct xattr_handler *xfs_xattr_handlers[];
96extern struct quotactl_ops xfs_quotactl_operations; 96extern const struct quotactl_ops xfs_quotactl_operations;
97 97
98#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 98#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
99 99
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 98ef624d9baf..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -749,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
749 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 749 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
750} 750}
751 751
752void
753xfs_inode_clear_reclaim_tag(
754 xfs_inode_t *ip)
755{
756 xfs_mount_t *mp = ip->i_mount;
757 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
758
759 read_lock(&pag->pag_ici_lock);
760 spin_lock(&ip->i_flags_lock);
761 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
762 spin_unlock(&ip->i_flags_lock);
763 read_unlock(&pag->pag_ici_lock);
764 xfs_put_perag(mp, pag);
765}
766
767STATIC int 752STATIC int
768xfs_reclaim_inode_now( 753xfs_reclaim_inode_now(
769 struct xfs_inode *ip, 754 struct xfs_inode *ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 59120602588a..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -49,7 +49,6 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
49 49
50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 52void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54 struct xfs_inode *ip); 53 struct xfs_inode *ip);
55 54
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 916c0ffb6083..c5bc67c4e3bb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -26,7 +26,6 @@ STATIC int
26xfs_stats_clear_proc_handler( 26xfs_stats_clear_proc_handler(
27 ctl_table *ctl, 27 ctl_table *ctl,
28 int write, 28 int write,
29 struct file *filp,
30 void __user *buffer, 29 void __user *buffer,
31 size_t *lenp, 30 size_t *lenp,
32 loff_t *ppos) 31 loff_t *ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
34 int c, ret, *valp = ctl->data; 33 int c, ret, *valp = ctl->data;
35 __uint32_t vn_active; 34 __uint32_t vn_active;
36 35
37 ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); 36 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
38 37
39 if (!ret && write && *valp) { 38 if (!ret && write && *valp) {
40 printk("XFS Clearing xfsstats\n"); 39 printk("XFS Clearing xfsstats\n");
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
48 48
49struct xqmstats xqmstats; 49struct xqmstats xqmstats;
50 50
51STATIC int 51static int xqm_proc_show(struct seq_file *m, void *v)
52xfs_qm_read_xfsquota(
53 char *buffer,
54 char **start,
55 off_t offset,
56 int count,
57 int *eof,
58 void *data)
59{ 52{
60 int len;
61
62 /* maximum; incore; ratio free to inuse; freelist */ 53 /* maximum; incore; ratio free to inuse; freelist */
63 len = sprintf(buffer, "%d\t%d\t%d\t%u\n", 54 seq_printf(m, "%d\t%d\t%d\t%u\n",
64 ndquot, 55 ndquot,
65 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
66 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
67 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
68 59 return 0;
69 if (offset >= len) {
70 *start = buffer;
71 *eof = 1;
72 return 0;
73 }
74 *start = buffer + offset;
75 if ((len -= offset) > count)
76 return count;
77 *eof = 1;
78
79 return len;
80} 60}
81 61
82STATIC int 62static int xqm_proc_open(struct inode *inode, struct file *file)
83xfs_qm_read_stats(
84 char *buffer,
85 char **start,
86 off_t offset,
87 int count,
88 int *eof,
89 void *data)
90{ 63{
91 int len; 64 return single_open(file, xqm_proc_show, NULL);
65}
66
67static const struct file_operations xqm_proc_fops = {
68 .owner = THIS_MODULE,
69 .open = xqm_proc_open,
70 .read = seq_read,
71 .llseek = seq_lseek,
72 .release = single_release,
73};
92 74
75static int xqmstat_proc_show(struct seq_file *m, void *v)
76{
93 /* quota performance statistics */ 77 /* quota performance statistics */
94 len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n", 78 seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
95 xqmstats.xs_qm_dqreclaims, 79 xqmstats.xs_qm_dqreclaims,
96 xqmstats.xs_qm_dqreclaim_misses, 80 xqmstats.xs_qm_dqreclaim_misses,
97 xqmstats.xs_qm_dquot_dups, 81 xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
100 xqmstats.xs_qm_dqwants, 84 xqmstats.xs_qm_dqwants,
101 xqmstats.xs_qm_dqshake_reclaims, 85 xqmstats.xs_qm_dqshake_reclaims,
102 xqmstats.xs_qm_dqinact_reclaims); 86 xqmstats.xs_qm_dqinact_reclaims);
87 return 0;
88}
103 89
104 if (offset >= len) { 90static int xqmstat_proc_open(struct inode *inode, struct file *file)
105 *start = buffer; 91{
106 *eof = 1; 92 return single_open(file, xqmstat_proc_show, NULL);
107 return 0;
108 }
109 *start = buffer + offset;
110 if ((len -= offset) > count)
111 return count;
112 *eof = 1;
113
114 return len;
115} 93}
116 94
95static const struct file_operations xqmstat_proc_fops = {
96 .owner = THIS_MODULE,
97 .open = xqmstat_proc_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = single_release,
101};
102
117void 103void
118xfs_qm_init_procfs(void) 104xfs_qm_init_procfs(void)
119{ 105{
120 create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL); 106 proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
121 create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL); 107 proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
122} 108}
123 109
124void 110void
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
198 xfs_agino_t pagi_count; /* number of allocated inodes */ 198 xfs_agino_t pagi_count; /* number of allocated inodes */
199 int pagb_count; /* pagb slots in use */ 199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */ 200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
201
202 /*
 203 * Inode allocation search optimisation: if the pagino matches,
 204 * the search for new inodes does not need to re-examine the
 205 * nearby records straight away.
206 */
207 xfs_agino_t pagl_pagino;
208 xfs_agino_t pagl_leftrec;
209 xfs_agino_t pagl_rightrec;
201#ifdef __KERNEL__ 210#ifdef __KERNEL__
202 spinlock_t pagb_lock; /* lock for pagb_list */ 211 spinlock_t pagb_lock; /* lock for pagb_list */
203 212
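The new pagl_* fields cache the outcome of the last left/right neighbour search, keyed by the parent inode, so a later allocation near the same parent can skip re-walking the inode btree. A hypothetical sketch of how such a cache would be consulted (types simplified; this helper is not in the patch):

struct perag_cache {
	unsigned int pagl_pagino;	/* parent ino the cache is valid for */
	unsigned int pagl_leftrec;	/* chunk startino left of the parent */
	unsigned int pagl_rightrec;	/* chunk startino right of the parent */
};

/* On a hit, seed the left/right search cursors from the cached records
 * instead of performing a fresh lookup around pagino. */
static int perag_cache_hit(const struct perag_cache *pag, unsigned int pagino)
{
	return pag->pagl_pagino == pagino;
}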
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8ee5b5a76a2a..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
3713 * entry (null if none). Else, *lastxp will be set to the index 3713 * entry (null if none). Else, *lastxp will be set to the index
3714 * of the found entry; *gotp will contain the entry. 3714 * of the found entry; *gotp will contain the entry.
3715 */ 3715 */
3716xfs_bmbt_rec_host_t * /* pointer to found extent entry */ 3716STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
3717xfs_bmap_search_multi_extents( 3717xfs_bmap_search_multi_extents(
3718 xfs_ifork_t *ifp, /* inode fork pointer */ 3718 xfs_ifork_t *ifp, /* inode fork pointer */
3719 xfs_fileoff_t bno, /* block number searched for */ 3719 xfs_fileoff_t bno, /* block number searched for */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
392 int whichfork, 392 int whichfork,
393 int *count); 393 int *count);
394 394
395/*
396 * Search the extent records for the entry containing block bno.
397 * If bno lies in a hole, point to the next entry. If bno lies
398 * past eof, *eofp will be set, and *prevp will contain the last
399 * entry (null if none). Else, *lastxp will be set to the index
400 * of the found entry; *gotp will contain the entry.
401 */
402xfs_bmbt_rec_host_t *
403xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
404 xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
405
406#endif /* __KERNEL__ */ 395#endif /* __KERNEL__ */
407 396
408#endif /* __XFS_BMAP_H__ */ 397#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
202 ext_flag); 202 ext_flag);
203} 203}
204 204
205/* Endian flipping versions of the bmbt extraction functions */
206void
207xfs_bmbt_disk_get_all(
208 xfs_bmbt_rec_t *r,
209 xfs_bmbt_irec_t *s)
210{
211 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
212 get_unaligned_be64(&r->l1), s);
213}
214
215/* 205/*
216 * Extract the blockcount field from an on disk bmap extent record. 206 * Extract the blockcount field from an on disk bmap extent record.
217 */ 207 */
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
816 *l1 = 0; 806 *l1 = 0;
817} 807}
818 808
809/* Endian flipping versions of the bmbt extraction functions */
810STATIC void
811xfs_bmbt_disk_get_all(
812 xfs_bmbt_rec_t *r,
813 xfs_bmbt_irec_t *s)
814{
815 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
816 get_unaligned_be64(&r->l1), s);
817}
818
819STATIC void 819STATIC void
820xfs_bmbt_trace_record( 820xfs_bmbt_trace_record(
821 struct xfs_btree_cur *cur, 821 struct xfs_btree_cur *cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
220extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); 220extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
221extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); 221extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
222 222
223extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
224extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); 223extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
225extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); 224extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
226 225
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 26717388acf5..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -646,46 +646,6 @@ xfs_btree_read_bufl(
646} 646}
647 647
648/* 648/*
649 * Get a buffer for the block, return it read in.
650 * Short-form addressing.
651 */
652int /* error */
653xfs_btree_read_bufs(
654 xfs_mount_t *mp, /* file system mount point */
655 xfs_trans_t *tp, /* transaction pointer */
656 xfs_agnumber_t agno, /* allocation group number */
657 xfs_agblock_t agbno, /* allocation group block number */
658 uint lock, /* lock flags for read_buf */
659 xfs_buf_t **bpp, /* buffer for agno/agbno */
660 int refval) /* ref count value for buffer */
661{
662 xfs_buf_t *bp; /* return value */
663 xfs_daddr_t d; /* real disk block address */
664 int error;
665
666 ASSERT(agno != NULLAGNUMBER);
667 ASSERT(agbno != NULLAGBLOCK);
668 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
669 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
670 mp->m_bsize, lock, &bp))) {
671 return error;
672 }
673 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
674 if (bp != NULL) {
675 switch (refval) {
676 case XFS_ALLOC_BTREE_REF:
677 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
678 break;
679 case XFS_INO_BTREE_REF:
680 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
681 break;
682 }
683 }
684 *bpp = bp;
685 return 0;
686}
687
688/*
689 * Read-ahead the block, don't wait for it, don't return a buffer. 649 * Read-ahead the block, don't wait for it, don't return a buffer.
690 * Long-form addressing. 650 * Long-form addressing.
691 */ 651 */
@@ -2951,7 +2911,7 @@ error0:
2951 * inode we have to copy the single block it was pointing to into the 2911 * inode we have to copy the single block it was pointing to into the
2952 * inode. 2912 * inode.
2953 */ 2913 */
2954int 2914STATIC int
2955xfs_btree_kill_iroot( 2915xfs_btree_kill_iroot(
2956 struct xfs_btree_cur *cur) 2916 struct xfs_btree_cur *cur)
2957{ 2917{
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -379,20 +379,6 @@ xfs_btree_read_bufl(
379 int refval);/* ref count value for buffer */ 379 int refval);/* ref count value for buffer */
380 380
381/* 381/*
382 * Get a buffer for the block, return it read in.
383 * Short-form addressing.
384 */
385int /* error */
386xfs_btree_read_bufs(
387 struct xfs_mount *mp, /* file system mount point */
388 struct xfs_trans *tp, /* transaction pointer */
389 xfs_agnumber_t agno, /* allocation group number */
390 xfs_agblock_t agbno, /* allocation group block number */
391 uint lock, /* lock flags for read_buf */
392 struct xfs_buf **bpp, /* buffer for agno/agbno */
393 int refval);/* ref count value for buffer */
394
395/*
396 * Read-ahead the block, don't wait for it, don't return a buffer. 382 * Read-ahead the block, don't wait for it, don't return a buffer.
397 * Long-form addressing. 383 * Long-form addressing.
398 */ 384 */
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
432int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); 418int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
433int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); 419int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
434int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); 420int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
435int xfs_btree_kill_iroot(struct xfs_btree_cur *);
436int xfs_btree_insert(struct xfs_btree_cur *, int *); 421int xfs_btree_insert(struct xfs_btree_cur *, int *);
437int xfs_btree_delete(struct xfs_btree_cur *, int *); 422int xfs_btree_delete(struct xfs_btree_cur *, int *);
438int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); 423int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c4ea51b55dce..f52ac276277e 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -117,7 +117,7 @@ struct getbmapx {
117#define BMV_IF_VALID \ 117#define BMV_IF_VALID \
118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) 118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
119 119
120/* bmv_oflags values - returned for for each non-header segment */ 120/* bmv_oflags values - returned for each non-header segment */
121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ 122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
123#define BMV_OF_LAST 0x4 /* segment is the last in the file */ 123#define BMV_OF_LAST 0x4 /* segment is the last in the file */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment(
57} 57}
58 58
59/* 59/*
60 * Lookup the record equal to ino in the btree given by cur. 60 * Lookup a record by ino in the btree given by cur.
61 */
62STATIC int /* error */
63xfs_inobt_lookup_eq(
64 struct xfs_btree_cur *cur, /* btree cursor */
65 xfs_agino_t ino, /* starting inode of chunk */
66 __int32_t fcnt, /* free inode count */
67 xfs_inofree_t free, /* free inode mask */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = fcnt;
72 cur->bc_rec.i.ir_free = free;
73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
74}
75
76/*
77 * Lookup the first record greater than or equal to ino
78 * in the btree given by cur.
79 */ 61 */
80int /* error */ 62int /* error */
81xfs_inobt_lookup_ge( 63xfs_inobt_lookup(
82 struct xfs_btree_cur *cur, /* btree cursor */ 64 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_agino_t ino, /* starting inode of chunk */ 65 xfs_agino_t ino, /* starting inode of chunk */
84 __int32_t fcnt, /* free inode count */ 66 xfs_lookup_t dir, /* <=, >=, == */
85 xfs_inofree_t free, /* free inode mask */
86 int *stat) /* success/failure */ 67 int *stat) /* success/failure */
87{ 68{
88 cur->bc_rec.i.ir_startino = ino; 69 cur->bc_rec.i.ir_startino = ino;
89 cur->bc_rec.i.ir_freecount = fcnt; 70 cur->bc_rec.i.ir_freecount = 0;
90 cur->bc_rec.i.ir_free = free; 71 cur->bc_rec.i.ir_free = 0;
91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); 72 return xfs_btree_lookup(cur, dir, stat);
92} 73}
93 74
94/* 75/*
95 * Lookup the first record less than or equal to ino 76 * Update the record referred to by cur to the value given.
96 * in the btree given by cur.
97 */
98int /* error */
99xfs_inobt_lookup_le(
100 struct xfs_btree_cur *cur, /* btree cursor */
101 xfs_agino_t ino, /* starting inode of chunk */
102 __int32_t fcnt, /* free inode count */
103 xfs_inofree_t free, /* free inode mask */
104 int *stat) /* success/failure */
105{
106 cur->bc_rec.i.ir_startino = ino;
107 cur->bc_rec.i.ir_freecount = fcnt;
108 cur->bc_rec.i.ir_free = free;
109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
110}
111
112/*
113 * Update the record referred to by cur to the value given
114 * by [ino, fcnt, free].
115 * This either works (return 0) or gets an EFSCORRUPTED error. 77 * This either works (return 0) or gets an EFSCORRUPTED error.
116 */ 78 */
117STATIC int /* error */ 79STATIC int /* error */
118xfs_inobt_update( 80xfs_inobt_update(
119 struct xfs_btree_cur *cur, /* btree cursor */ 81 struct xfs_btree_cur *cur, /* btree cursor */
120 xfs_agino_t ino, /* starting inode of chunk */ 82 xfs_inobt_rec_incore_t *irec) /* btree record */
121 __int32_t fcnt, /* free inode count */
122 xfs_inofree_t free) /* free inode mask */
123{ 83{
124 union xfs_btree_rec rec; 84 union xfs_btree_rec rec;
125 85
126 rec.inobt.ir_startino = cpu_to_be32(ino); 86 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
127 rec.inobt.ir_freecount = cpu_to_be32(fcnt); 87 rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
128 rec.inobt.ir_free = cpu_to_be64(free); 88 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
129 return xfs_btree_update(cur, &rec); 89 return xfs_btree_update(cur, &rec);
130} 90}
131 91
@@ -135,9 +95,7 @@ xfs_inobt_update(
135int /* error */ 95int /* error */
136xfs_inobt_get_rec( 96xfs_inobt_get_rec(
137 struct xfs_btree_cur *cur, /* btree cursor */ 97 struct xfs_btree_cur *cur, /* btree cursor */
138 xfs_agino_t *ino, /* output: starting inode of chunk */ 98 xfs_inobt_rec_incore_t *irec, /* btree record */
139 __int32_t *fcnt, /* output: number of free inodes */
140 xfs_inofree_t *free, /* output: free inode mask */
141 int *stat) /* output: success/failure */ 99 int *stat) /* output: success/failure */
142{ 100{
143 union xfs_btree_rec *rec; 101 union xfs_btree_rec *rec;
@@ -145,14 +103,136 @@ xfs_inobt_get_rec(
145 103
146 error = xfs_btree_get_rec(cur, &rec, stat); 104 error = xfs_btree_get_rec(cur, &rec, stat);
147 if (!error && *stat == 1) { 105 if (!error && *stat == 1) {
148 *ino = be32_to_cpu(rec->inobt.ir_startino); 106 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount); 107 irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
150 *free = be64_to_cpu(rec->inobt.ir_free); 108 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
151 } 109 }
152 return error; 110 return error;
153} 111}
154 112
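With the record returned as a whole, the lookup/get_rec pairing used
throughout the callers below takes this shape (sketch; cursor setup and
teardown elided):

	xfs_inobt_rec_incore_t	rec;
	int			i;
	int			error;

	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
	if (error)
		return error;
	if (i) {
		error = xfs_inobt_get_rec(cur, &rec, &i);
		if (error)
			return error;
		/* rec.ir_startino/ir_freecount/ir_free are now valid */
	}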
155/* 113/*
114 * Verify that the number of free inodes in the AGI is correct.
115 */
116#ifdef DEBUG
117STATIC int
118xfs_check_agi_freecount(
119 struct xfs_btree_cur *cur,
120 struct xfs_agi *agi)
121{
122 if (cur->bc_nlevels == 1) {
123 xfs_inobt_rec_incore_t rec;
124 int freecount = 0;
125 int error;
126 int i;
127
128 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
129 if (error)
130 return error;
131
132 do {
133 error = xfs_inobt_get_rec(cur, &rec, &i);
134 if (error)
135 return error;
136
137 if (i) {
138 freecount += rec.ir_freecount;
139 error = xfs_btree_increment(cur, 0, &i);
140 if (error)
141 return error;
142 }
143 } while (i == 1);
144
145 if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
146 ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
147 }
148 return 0;
149}
150#else
151#define xfs_check_agi_freecount(cur, agi) 0
152#endif
153
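Because the non-DEBUG stub is the constant 0, every call site keeps one
uniform shape and the compiler discards the dead branch; a sketch of what
a caller compiles to:

	/* DEBUG: walks the whole inobt and asserts the AGI freecount */
	/* !DEBUG: the line below becomes "error = 0;" */
	error = xfs_check_agi_freecount(cur, agi);
	if (error)
		goto error0;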
154/*
155 * Initialise a new set of inodes.
156 */
157STATIC void
158xfs_ialloc_inode_init(
159 struct xfs_mount *mp,
160 struct xfs_trans *tp,
161 xfs_agnumber_t agno,
162 xfs_agblock_t agbno,
163 xfs_agblock_t length,
164 unsigned int gen)
165{
166 struct xfs_buf *fbuf;
167 struct xfs_dinode *free;
168 int blks_per_cluster, nbufs, ninodes;
169 int version;
170 int i, j;
171 xfs_daddr_t d;
172
173 /*
174 * Loop over the new block(s), filling in the inodes.
175 * For small block sizes, manipulate the inodes in buffers
176	 * which are multiples of the block size.
177 */
178 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
179 blks_per_cluster = 1;
180 nbufs = length;
181 ninodes = mp->m_sb.sb_inopblock;
182 } else {
183 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
184 mp->m_sb.sb_blocksize;
185 nbufs = length / blks_per_cluster;
186 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
187 }
188
189 /*
190 * Figure out what version number to use in the inodes we create.
191 * If the superblock version has caught up to the one that supports
192 * the new inode format, then use the new inode version. Otherwise
193 * use the old version so that old kernels will continue to be
194 * able to use the file system.
195 */
196 if (xfs_sb_version_hasnlink(&mp->m_sb))
197 version = 2;
198 else
199 version = 1;
200
201 for (j = 0; j < nbufs; j++) {
202 /*
203 * Get the block.
204 */
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK);
209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211
212 /*
213 * Initialize all inodes in this buffer and then log them.
214 *
215 * XXX: It would be much better if we had just one transaction
216 * to log a whole cluster of inodes instead of all the
217 * individual transactions causing a lot of log traffic.
218 */
219 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
220 for (i = 0; i < ninodes; i++) {
221 int ioffset = i << mp->m_sb.sb_inodelog;
222 uint isize = sizeof(struct xfs_dinode);
223
224 free = xfs_make_iptr(mp, fbuf, i);
225 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
226 free->di_version = version;
227 free->di_gen = cpu_to_be32(gen);
228 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
229 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
230 }
231 xfs_trans_inode_alloc_buf(tp, fbuf);
232 }
233}
234
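For a feel of the geometry, assume 4096-byte blocks, 512-byte inodes
(sb_inopblock = 8), an 8192-byte inode cluster and length = 64; these are
hypothetical numbers, not from the patch. The else branch then gives:

	blks_per_cluster = 8192 / 4096;		/* 2 blocks per buffer  */
	nbufs = 64 / blks_per_cluster;		/* 32 buffers to stamp  */
	ninodes = blks_per_cluster * 8;		/* 16 inodes per buffer */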
235/*
156 * Allocate new inodes in the allocation group specified by agbp. 236 * Allocate new inodes in the allocation group specified by agbp.
157 * Return 0 for success, else error code. 237 * Return 0 for success, else error code.
158 */ 238 */
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc(
164{ 244{
165 xfs_agi_t *agi; /* allocation group header */ 245 xfs_agi_t *agi; /* allocation group header */
166 xfs_alloc_arg_t args; /* allocation argument structure */ 246 xfs_alloc_arg_t args; /* allocation argument structure */
167 int blks_per_cluster; /* fs blocks per inode cluster */
168 xfs_btree_cur_t *cur; /* inode btree cursor */ 247 xfs_btree_cur_t *cur; /* inode btree cursor */
169 xfs_daddr_t d; /* disk addr of buffer */
170 xfs_agnumber_t agno; 248 xfs_agnumber_t agno;
171 int error; 249 int error;
172 xfs_buf_t *fbuf; /* new free inodes' buffer */ 250 int i;
173 xfs_dinode_t *free; /* new free inode structure */
174 int i; /* inode counter */
175 int j; /* block counter */
176 int nbufs; /* num bufs of new inodes */
177 xfs_agino_t newino; /* new first inode's number */ 251 xfs_agino_t newino; /* new first inode's number */
178 xfs_agino_t newlen; /* new number of inodes */ 252 xfs_agino_t newlen; /* new number of inodes */
179 int ninodes; /* num inodes per buf */
180 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
181 int version; /* inode version number to use */
182 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
183 /* boundary */ 255 /* boundary */
184 unsigned int gen;
185 256
186 args.tp = tp; 257 args.tp = tp;
187 args.mp = tp->t_mountp; 258 args.mp = tp->t_mountp;
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc(
202 */ 273 */
203 agi = XFS_BUF_TO_AGI(agbp); 274 agi = XFS_BUF_TO_AGI(agbp);
204 newino = be32_to_cpu(agi->agi_newino); 275 newino = be32_to_cpu(agi->agi_newino);
276 agno = be32_to_cpu(agi->agi_seqno);
205 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 277 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
206 XFS_IALLOC_BLOCKS(args.mp); 278 XFS_IALLOC_BLOCKS(args.mp);
207 if (likely(newino != NULLAGINO && 279 if (likely(newino != NULLAGINO &&
208 (args.agbno < be32_to_cpu(agi->agi_length)))) { 280 (args.agbno < be32_to_cpu(agi->agi_length)))) {
209 args.fsbno = XFS_AGB_TO_FSB(args.mp, 281 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
210 be32_to_cpu(agi->agi_seqno), args.agbno);
211 args.type = XFS_ALLOCTYPE_THIS_BNO; 282 args.type = XFS_ALLOCTYPE_THIS_BNO;
212 args.mod = args.total = args.wasdel = args.isfl = 283 args.mod = args.total = args.wasdel = args.isfl =
213 args.userdata = args.minalignslop = 0; 284 args.userdata = args.minalignslop = 0;
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc(
258 * For now, just allocate blocks up front. 329 * For now, just allocate blocks up front.
259 */ 330 */
260 args.agbno = be32_to_cpu(agi->agi_root); 331 args.agbno = be32_to_cpu(agi->agi_root);
261 args.fsbno = XFS_AGB_TO_FSB(args.mp, 332 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
262 be32_to_cpu(agi->agi_seqno), args.agbno);
263 /* 333 /*
264 * Allocate a fixed-size extent of inodes. 334 * Allocate a fixed-size extent of inodes.
265 */ 335 */
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc(
282 if (isaligned && args.fsbno == NULLFSBLOCK) { 352 if (isaligned && args.fsbno == NULLFSBLOCK) {
283 args.type = XFS_ALLOCTYPE_NEAR_BNO; 353 args.type = XFS_ALLOCTYPE_NEAR_BNO;
284 args.agbno = be32_to_cpu(agi->agi_root); 354 args.agbno = be32_to_cpu(agi->agi_root);
285 args.fsbno = XFS_AGB_TO_FSB(args.mp, 355 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
286 be32_to_cpu(agi->agi_seqno), args.agbno);
287 args.alignment = xfs_ialloc_cluster_alignment(&args); 356 args.alignment = xfs_ialloc_cluster_alignment(&args);
288 if ((error = xfs_alloc_vextent(&args))) 357 if ((error = xfs_alloc_vextent(&args)))
289 return error; 358 return error;
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc(
294 return 0; 363 return 0;
295 } 364 }
296 ASSERT(args.len == args.minlen); 365 ASSERT(args.len == args.minlen);
297 /*
298 * Convert the results.
299 */
300 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
301 /*
302 * Loop over the new block(s), filling in the inodes.
303 * For small block sizes, manipulate the inodes in buffers
304 * which are multiples of the blocks size.
305 */
306 if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
307 blks_per_cluster = 1;
308 nbufs = (int)args.len;
309 ninodes = args.mp->m_sb.sb_inopblock;
310 } else {
311 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
312 args.mp->m_sb.sb_blocksize;
313 nbufs = (int)args.len / blks_per_cluster;
314 ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
315 }
316 /*
317 * Figure out what version number to use in the inodes we create.
318 * If the superblock version has caught up to the one that supports
319 * the new inode format, then use the new inode version. Otherwise
320 * use the old version so that old kernels will continue to be
321 * able to use the file system.
322 */
323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
324 version = 2;
325 else
326 version = 1;
327 366
328 /* 367 /*
368 * Stamp and write the inode buffers.
369 *
329 * Seed the new inode cluster with a random generation number. This 370 * Seed the new inode cluster with a random generation number. This
330 * prevents short-term reuse of generation numbers if a chunk is 371 * prevents short-term reuse of generation numbers if a chunk is
331 * freed and then immediately reallocated. We use random numbers 372 * freed and then immediately reallocated. We use random numbers
332 * rather than a linear progression to prevent the next generation 373 * rather than a linear progression to prevent the next generation
333 * number from being easily guessable. 374 * number from being easily guessable.
334 */ 375 */
335 gen = random32(); 376 xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
336 for (j = 0; j < nbufs; j++) { 377 random32());
337 /*
338 * Get the block.
339 */
340 d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
341 args.agbno + (j * blks_per_cluster));
342 fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
343 args.mp->m_bsize * blks_per_cluster,
344 XFS_BUF_LOCK);
345 ASSERT(fbuf);
346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347 378
348 /* 379 /*
349 * Initialize all inodes in this buffer and then log them. 380 * Convert the results.
350 * 381 */
351 * XXX: It would be much better if we had just one transaction to 382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic.
354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
360 free = xfs_make_iptr(args.mp, fbuf, i);
361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
362 free->di_version = version;
363 free->di_gen = cpu_to_be32(gen);
364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
366 }
367 xfs_trans_inode_alloc_buf(tp, fbuf);
368 }
369 be32_add_cpu(&agi->agi_count, newlen); 383 be32_add_cpu(&agi->agi_count, newlen);
370 be32_add_cpu(&agi->agi_freecount, newlen); 384 be32_add_cpu(&agi->agi_freecount, newlen);
371 agno = be32_to_cpu(agi->agi_seqno);
372 down_read(&args.mp->m_peraglock); 385 down_read(&args.mp->m_peraglock);
373 args.mp->m_perag[agno].pagi_freecount += newlen; 386 args.mp->m_perag[agno].pagi_freecount += newlen;
374 up_read(&args.mp->m_peraglock); 387 up_read(&args.mp->m_peraglock);
375 agi->agi_newino = cpu_to_be32(newino); 388 agi->agi_newino = cpu_to_be32(newino);
389
376 /* 390 /*
377 * Insert records describing the new inode chunk into the btree. 391 * Insert records describing the new inode chunk into the btree.
378 */ 392 */
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc(
380 for (thisino = newino; 394 for (thisino = newino;
381 thisino < newino + newlen; 395 thisino < newino + newlen;
382 thisino += XFS_INODES_PER_CHUNK) { 396 thisino += XFS_INODES_PER_CHUNK) {
383 if ((error = xfs_inobt_lookup_eq(cur, thisino, 397 cur->bc_rec.i.ir_startino = thisino;
384 XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { 398 cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
399 cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
400 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
401 if (error) {
385 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 402 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
386 return error; 403 return error;
387 } 404 }
388 ASSERT(i == 0); 405 ASSERT(i == 0);
389 if ((error = xfs_btree_insert(cur, &i))) { 406 error = xfs_btree_insert(cur, &i);
407 if (error) {
390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 408 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
391 return error; 409 return error;
392 } 410 }
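Note that xfs_inobt_lookup() zeroes ir_freecount/ir_free in the cursor,
while xfs_btree_insert() takes the new record out of cur->bc_rec, so the
loop above seeds all three fields by hand before the XFS_LOOKUP_EQ probe.
A hypothetical wrapper (not in this patch) for the insert half would read:

	STATIC int
	xfs_inobt_insert_chunk(
		struct xfs_btree_cur	*cur,	/* inobt cursor */
		xfs_agino_t		thisino,/* chunk's first inode */
		int			*stat)	/* insert result */
	{
		cur->bc_rec.i.ir_startino = thisino;
		cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
		cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
		return xfs_btree_insert(cur, stat);
	}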
@@ -539,6 +557,62 @@ nextag:
539} 557}
540 558
541/* 559/*
560 * Try to retrieve the next record to the left/right from the current one.
561 */
562STATIC int
563xfs_ialloc_next_rec(
564 struct xfs_btree_cur *cur,
565 xfs_inobt_rec_incore_t *rec,
566 int *done,
567 int left)
568{
569 int error;
570 int i;
571
572 if (left)
573 error = xfs_btree_decrement(cur, 0, &i);
574 else
575 error = xfs_btree_increment(cur, 0, &i);
576
577 if (error)
578 return error;
579 *done = !i;
580 if (i) {
581 error = xfs_inobt_get_rec(cur, rec, &i);
582 if (error)
583 return error;
584 XFS_WANT_CORRUPTED_RETURN(i == 1);
585 }
586
587 return 0;
588}
589
590STATIC int
591xfs_ialloc_get_rec(
592 struct xfs_btree_cur *cur,
593 xfs_agino_t agino,
594 xfs_inobt_rec_incore_t *rec,
595 int *done,
596 int left)
597{
598 int error;
599 int i;
600
601 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
602 if (error)
603 return error;
604 *done = !i;
605 if (i) {
606 error = xfs_inobt_get_rec(cur, rec, &i);
607 if (error)
608 return error;
609 XFS_WANT_CORRUPTED_RETURN(i == 1);
610 }
611
612 return 0;
613}
614
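Both helpers share the same contract: *done is set when the cursor falls
off the btree, otherwise the record is fetched and validated. A sketch of
a single step, assuming an already positioned cursor:

	int	done;

	error = xfs_ialloc_next_rec(cur, &rec, &done, 0);	/* step right */
	if (error)
		return error;
	if (!done) {
		/* rec now holds the neighbouring chunk record */
	}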
615/*
542 * Visible inode allocation functions. 616 * Visible inode allocation functions.
543 */ 617 */
544 618
@@ -592,8 +666,8 @@ xfs_dialloc(
592 int j; /* result code */ 666 int j; /* result code */
593 xfs_mount_t *mp; /* file system mount structure */ 667 xfs_mount_t *mp; /* file system mount structure */
594 int offset; /* index of inode in chunk */ 668 int offset; /* index of inode in chunk */
595 xfs_agino_t pagino; /* parent's a.g. relative inode # */ 669 xfs_agino_t pagino; /* parent's AG relative inode # */
596 xfs_agnumber_t pagno; /* parent's allocation group number */ 670 xfs_agnumber_t pagno; /* parent's AG number */
597 xfs_inobt_rec_incore_t rec; /* inode allocation record */ 671 xfs_inobt_rec_incore_t rec; /* inode allocation record */
598 xfs_agnumber_t tagno; /* testing allocation group number */ 672 xfs_agnumber_t tagno; /* testing allocation group number */
599 xfs_btree_cur_t *tcur; /* temp cursor */ 673 xfs_btree_cur_t *tcur; /* temp cursor */
@@ -716,6 +790,8 @@ nextag:
716 */ 790 */
717 agno = tagno; 791 agno = tagno;
718 *IO_agbp = NULL; 792 *IO_agbp = NULL;
793
794 restart_pagno:
719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
720 /* 796 /*
721 * If pagino is 0 (this is the root inode allocation) use newino. 797 * If pagino is 0 (this is the root inode allocation) use newino.
@@ -723,220 +799,199 @@ nextag:
723 */ 799 */
724 if (!pagino) 800 if (!pagino)
725 pagino = be32_to_cpu(agi->agi_newino); 801 pagino = be32_to_cpu(agi->agi_newino);
726#ifdef DEBUG
727 if (cur->bc_nlevels == 1) {
728 int freecount = 0;
729 802
730 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 803 error = xfs_check_agi_freecount(cur, agi);
731 goto error0; 804 if (error)
732 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 805 goto error0;
733 do {
734 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
735 &rec.ir_freecount, &rec.ir_free, &i)))
736 goto error0;
737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
738 freecount += rec.ir_freecount;
739 if ((error = xfs_btree_increment(cur, 0, &i)))
740 goto error0;
741 } while (i == 1);
742 806
743 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
744 XFS_FORCED_SHUTDOWN(mp));
745 }
746#endif
747 /* 807 /*
748 * If in the same a.g. as the parent, try to get near the parent. 808 * If in the same AG as the parent, try to get near the parent.
749 */ 809 */
750 if (pagno == agno) { 810 if (pagno == agno) {
751 if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) 811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */
814 int searchdistance = 10;
815
816 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
817 if (error)
818 goto error0;
819 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
820
821 error = xfs_inobt_get_rec(cur, &rec, &j);
822 if (error)
752 goto error0; 823 goto error0;
753 if (i != 0 && 824 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
754 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 825
755 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 826 if (rec.ir_freecount > 0) {
756 j == 1 &&
757 rec.ir_freecount > 0) {
758 /* 827 /*
759 * Found a free inode in the same chunk 828 * Found a free inode in the same chunk
760 * as parent, done. 829 * as the parent, done.
761 */ 830 */
831 goto alloc_inode;
762 } 832 }
833
834
835 /*
836 * In the same AG as parent, but parent's chunk is full.
837 */
838
839 /* duplicate the cursor, search left & right simultaneously */
840 error = xfs_btree_dup_cursor(cur, &tcur);
841 if (error)
842 goto error0;
843
763 /* 844 /*
764 * In the same a.g. as parent, but parent's chunk is full. 845 * Skip to last blocks looked up if same parent inode.
765 */ 846 */
766 else { 847 if (pagino != NULLAGINO &&
767 int doneleft; /* done, to the left */ 848 pag->pagl_pagino == pagino &&
768 int doneright; /* done, to the right */ 849 pag->pagl_leftrec != NULLAGINO &&
850 pag->pagl_rightrec != NULLAGINO) {
851 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
852 &trec, &doneleft, 1);
853 if (error)
854 goto error1;
769 855
856 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
857 &rec, &doneright, 0);
770 if (error) 858 if (error)
771 goto error0;
772 ASSERT(i == 1);
773 ASSERT(j == 1);
774 /*
775 * Duplicate the cursor, search left & right
776 * simultaneously.
777 */
778 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
779 goto error0;
780 /*
781 * Search left with tcur, back up 1 record.
782 */
783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
784 goto error1; 859 goto error1;
785 doneleft = !i; 860 } else {
786 if (!doneleft) { 861 /* search left with tcur, back up 1 record */
787 if ((error = xfs_inobt_get_rec(tcur, 862 error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
788 &trec.ir_startino, 863 if (error)
789 &trec.ir_freecount,
790 &trec.ir_free, &i)))
791 goto error1;
792 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
793 }
794 /*
795 * Search right with cur, go forward 1 record.
796 */
797 if ((error = xfs_btree_increment(cur, 0, &i)))
798 goto error1; 864 goto error1;
799 doneright = !i;
800 if (!doneright) {
801 if ((error = xfs_inobt_get_rec(cur,
802 &rec.ir_startino,
803 &rec.ir_freecount,
804 &rec.ir_free, &i)))
805 goto error1;
806 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
807 }
808 /*
809 * Loop until we find the closest inode chunk
810 * with a free one.
811 */
812 while (!doneleft || !doneright) {
813 int useleft; /* using left inode
814 chunk this time */
815 865
866 /* search right with cur, go forward 1 record. */
867 error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
868 if (error)
869 goto error1;
870 }
871
872 /*
873 * Loop until we find an inode chunk with a free inode.
874 */
875 while (!doneleft || !doneright) {
876 int useleft; /* using left inode chunk this time */
877
878 if (!--searchdistance) {
816 /* 879 /*
817 * Figure out which block is closer, 880 * Not in range - save last search
818 * if both are valid. 881 * location and allocate a new inode
819 */
820 if (!doneleft && !doneright)
821 useleft =
822 pagino -
823 (trec.ir_startino +
824 XFS_INODES_PER_CHUNK - 1) <
825 rec.ir_startino - pagino;
826 else
827 useleft = !doneleft;
828 /*
829 * If checking the left, does it have
830 * free inodes?
831 */
832 if (useleft && trec.ir_freecount) {
833 /*
834 * Yes, set it up as the chunk to use.
835 */
836 rec = trec;
837 xfs_btree_del_cursor(cur,
838 XFS_BTREE_NOERROR);
839 cur = tcur;
840 break;
841 }
842 /*
843 * If checking the right, does it have
844 * free inodes?
845 */
846 if (!useleft && rec.ir_freecount) {
847 /*
848 * Yes, it's already set up.
849 */
850 xfs_btree_del_cursor(tcur,
851 XFS_BTREE_NOERROR);
852 break;
853 }
854 /*
855 * If used the left, get another one
856 * further left.
857 */
858 if (useleft) {
859 if ((error = xfs_btree_decrement(tcur, 0,
860 &i)))
861 goto error1;
862 doneleft = !i;
863 if (!doneleft) {
864 if ((error = xfs_inobt_get_rec(
865 tcur,
866 &trec.ir_startino,
867 &trec.ir_freecount,
868 &trec.ir_free, &i)))
869 goto error1;
870 XFS_WANT_CORRUPTED_GOTO(i == 1,
871 error1);
872 }
873 }
874 /*
875 * If used the right, get another one
876 * further right.
877 */ 882 */
878 else { 883 pag->pagl_leftrec = trec.ir_startino;
879 if ((error = xfs_btree_increment(cur, 0, 884 pag->pagl_rightrec = rec.ir_startino;
880 &i))) 885 pag->pagl_pagino = pagino;
881 goto error1; 886 goto newino;
882 doneright = !i; 887 }
883 if (!doneright) { 888
884 if ((error = xfs_inobt_get_rec( 889 /* figure out the closer block if both are valid. */
885 cur, 890 if (!doneleft && !doneright) {
886 &rec.ir_startino, 891 useleft = pagino -
887 &rec.ir_freecount, 892 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
888 &rec.ir_free, &i))) 893 rec.ir_startino - pagino;
889 goto error1; 894 } else {
890 XFS_WANT_CORRUPTED_GOTO(i == 1, 895 useleft = !doneleft;
891 error1);
892 }
893 }
894 } 896 }
895 ASSERT(!doneleft || !doneright); 897
898 /* free inodes to the left? */
899 if (useleft && trec.ir_freecount) {
900 rec = trec;
901 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
902 cur = tcur;
903
904 pag->pagl_leftrec = trec.ir_startino;
905 pag->pagl_rightrec = rec.ir_startino;
906 pag->pagl_pagino = pagino;
907 goto alloc_inode;
908 }
909
910 /* free inodes to the right? */
911 if (!useleft && rec.ir_freecount) {
912 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
913
914 pag->pagl_leftrec = trec.ir_startino;
915 pag->pagl_rightrec = rec.ir_startino;
916 pag->pagl_pagino = pagino;
917 goto alloc_inode;
918 }
919
920 /* get next record to check */
921 if (useleft) {
922 error = xfs_ialloc_next_rec(tcur, &trec,
923 &doneleft, 1);
924 } else {
925 error = xfs_ialloc_next_rec(cur, &rec,
926 &doneright, 0);
927 }
928 if (error)
929 goto error1;
896 } 930 }
931
932 /*
 933 * We've reached the end of the btree. Because
 934 * we only search a small chunk of the btree
 935 * on each pass, there may well be free inodes
 936 * closer to the parent inode than where we
 937 * are now. Restart the search.
938 */
939 pag->pagl_pagino = NULLAGINO;
940 pag->pagl_leftrec = NULLAGINO;
941 pag->pagl_rightrec = NULLAGINO;
942 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
943 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
944 goto restart_pagno;
897 } 945 }
946
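The pagl_leftrec/pagl_rightrec/pagl_pagino fields cache the search window
per AG, so the next allocation for the same parent repositions both
cursors with xfs_ialloc_get_rec() instead of rescanning from the parent.
The save/restore pair, as used above (sketch):

	/* save: where the bounded search stopped */
	pag->pagl_leftrec = trec.ir_startino;
	pag->pagl_rightrec = rec.ir_startino;
	pag->pagl_pagino = pagino;

	/* restore on a later call with the same pagino */
	error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, &trec, &doneleft, 1);
	if (!error)
		error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, &rec, &doneright, 0);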
898 /* 947 /*
899 * In a different a.g. from the parent. 948 * In a different AG from the parent.
900 * See if the most recently allocated block has any free. 949 * See if the most recently allocated block has any free.
901 */ 950 */
902 else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { 951newino:
903 if ((error = xfs_inobt_lookup_eq(cur, 952 if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
904 be32_to_cpu(agi->agi_newino), 0, 0, &i))) 953 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
954 XFS_LOOKUP_EQ, &i);
955 if (error)
905 goto error0; 956 goto error0;
906 if (i == 1 && 957
907 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 958 if (i == 1) {
908 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 959 error = xfs_inobt_get_rec(cur, &rec, &j);
909 j == 1 &&
910 rec.ir_freecount > 0) {
911 /*
912 * The last chunk allocated in the group still has
913 * a free inode.
914 */
915 }
916 /*
917 * None left in the last group, search the whole a.g.
918 */
919 else {
920 if (error) 960 if (error)
921 goto error0; 961 goto error0;
922 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 962
923 goto error0; 963 if (j == 1 && rec.ir_freecount > 0) {
924 ASSERT(i == 1); 964 /*
925 for (;;) { 965 * The last chunk allocated in the group
926 if ((error = xfs_inobt_get_rec(cur, 966 * still has a free inode.
927 &rec.ir_startino, 967 */
928 &rec.ir_freecount, &rec.ir_free, 968 goto alloc_inode;
929 &i)))
930 goto error0;
931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
932 if (rec.ir_freecount > 0)
933 break;
934 if ((error = xfs_btree_increment(cur, 0, &i)))
935 goto error0;
936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
937 } 969 }
938 } 970 }
939 } 971 }
972
973 /*
974 * None left in the last group, search the whole AG
975 */
976 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
977 if (error)
978 goto error0;
979 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
980
981 for (;;) {
982 error = xfs_inobt_get_rec(cur, &rec, &i);
983 if (error)
984 goto error0;
985 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
986 if (rec.ir_freecount > 0)
987 break;
988 error = xfs_btree_increment(cur, 0, &i);
989 if (error)
990 goto error0;
991 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
992 }
993
994alloc_inode:
940 offset = xfs_ialloc_find_free(&rec.ir_free); 995 offset = xfs_ialloc_find_free(&rec.ir_free);
941 ASSERT(offset >= 0); 996 ASSERT(offset >= 0);
942 ASSERT(offset < XFS_INODES_PER_CHUNK); 997 ASSERT(offset < XFS_INODES_PER_CHUNK);
@@ -945,33 +1000,19 @@ nextag:
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 1000 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 rec.ir_free &= ~XFS_INOBT_MASK(offset); 1001 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 1002 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 1003 error = xfs_inobt_update(cur, &rec);
949 rec.ir_free))) 1004 if (error)
950 goto error0; 1005 goto error0;
951 be32_add_cpu(&agi->agi_freecount, -1); 1006 be32_add_cpu(&agi->agi_freecount, -1);
952 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1007 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
953 down_read(&mp->m_peraglock); 1008 down_read(&mp->m_peraglock);
954 mp->m_perag[tagno].pagi_freecount--; 1009 mp->m_perag[tagno].pagi_freecount--;
955 up_read(&mp->m_peraglock); 1010 up_read(&mp->m_peraglock);
956#ifdef DEBUG
957 if (cur->bc_nlevels == 1) {
958 int freecount = 0;
959 1011
960 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1012 error = xfs_check_agi_freecount(cur, agi);
961 goto error0; 1013 if (error)
962 do { 1014 goto error0;
963 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1015
964 &rec.ir_freecount, &rec.ir_free, &i)))
965 goto error0;
966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
967 freecount += rec.ir_freecount;
968 if ((error = xfs_btree_increment(cur, 0, &i)))
969 goto error0;
970 } while (i == 1);
971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
972 XFS_FORCED_SHUTDOWN(mp));
973 }
974#endif
975 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1016 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
976 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1017 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
977 *inop = ino; 1018 *inop = ino;
@@ -1062,38 +1103,23 @@ xfs_difree(
1062 * Initialize the cursor. 1103 * Initialize the cursor.
1063 */ 1104 */
1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1105 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1065#ifdef DEBUG
1066 if (cur->bc_nlevels == 1) {
1067 int freecount = 0;
1068 1106
1069 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1107 error = xfs_check_agi_freecount(cur, agi);
1070 goto error0; 1108 if (error)
1071 do { 1109 goto error0;
1072 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1110
1073 &rec.ir_freecount, &rec.ir_free, &i)))
1074 goto error0;
1075 if (i) {
1076 freecount += rec.ir_freecount;
1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1078 goto error0;
1079 }
1080 } while (i == 1);
1081 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1082 XFS_FORCED_SHUTDOWN(mp));
1083 }
1084#endif
1085 /* 1111 /*
1086 * Look for the entry describing this inode. 1112 * Look for the entry describing this inode.
1087 */ 1113 */
1088 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1114 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1089 cmn_err(CE_WARN, 1115 cmn_err(CE_WARN,
1090 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", 1116 "xfs_difree: xfs_inobt_lookup() returned an error %d on %s. Returning error.",
1091 error, mp->m_fsname); 1117 error, mp->m_fsname);
1092 goto error0; 1118 goto error0;
1093 } 1119 }
1094 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1120 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1095 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, 1121 error = xfs_inobt_get_rec(cur, &rec, &i);
1096 &rec.ir_free, &i))) { 1122 if (error) {
1097 cmn_err(CE_WARN, 1123 cmn_err(CE_WARN,
1098 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1124 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
1099 error, mp->m_fsname); 1125 error, mp->m_fsname);
@@ -1148,12 +1174,14 @@ xfs_difree(
1148 } else { 1174 } else {
1149 *delete = 0; 1175 *delete = 0;
1150 1176
1151 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { 1177 error = xfs_inobt_update(cur, &rec);
1178 if (error) {
1152 cmn_err(CE_WARN, 1179 cmn_err(CE_WARN,
1153 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", 1180 "xfs_difree: xfs_inobt_update() returned an error %d on %s.",
1154 error, mp->m_fsname); 1181 error, mp->m_fsname);
1155 goto error0; 1182 goto error0;
1156 } 1183 }
1184
1157 /* 1185 /*
1158 * Change the inode free counts and log the ag/sb changes. 1186 * Change the inode free counts and log the ag/sb changes.
1159 */ 1187 */
@@ -1165,28 +1193,10 @@ xfs_difree(
1165 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1193 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1166 } 1194 }
1167 1195
1168#ifdef DEBUG 1196 error = xfs_check_agi_freecount(cur, agi);
1169 if (cur->bc_nlevels == 1) { 1197 if (error)
1170 int freecount = 0; 1198 goto error0;
1171 1199
1172 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1173 goto error0;
1174 do {
1175 if ((error = xfs_inobt_get_rec(cur,
1176 &rec.ir_startino,
1177 &rec.ir_freecount,
1178 &rec.ir_free, &i)))
1179 goto error0;
1180 if (i) {
1181 freecount += rec.ir_freecount;
1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1183 goto error0;
1184 }
1185 } while (i == 1);
1186 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1187 XFS_FORCED_SHUTDOWN(mp));
1188 }
1189#endif
1190 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1200 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1191 return 0; 1201 return 0;
1192 1202
@@ -1297,9 +1307,7 @@ xfs_imap(
1297 chunk_agbno = agbno - offset_agbno; 1307 chunk_agbno = agbno - offset_agbno;
1298 } else { 1308 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */ 1309 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */ 1310 xfs_inobt_rec_incore_t chunk_rec;
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */ 1311 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */ 1312 int i; /* temp state */
1305 1313
@@ -1315,15 +1323,14 @@ xfs_imap(
1315 } 1323 }
1316 1324
1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1325 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); 1326 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1319 if (error) { 1327 if (error) {
1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1321 "xfs_inobt_lookup_le() failed"); 1329 "xfs_inobt_lookup() failed");
1322 goto error0; 1330 goto error0;
1323 } 1331 }
1324 1332
1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1333 error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
1326 &chunk_free, &i);
1327 if (error) { 1334 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1335 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1329 "xfs_inobt_get_rec() failed"); 1336 "xfs_inobt_get_rec() failed");
@@ -1341,7 +1348,7 @@ xfs_imap(
1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1348 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1342 if (error) 1349 if (error)
1343 return error; 1350 return error;
1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1351 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
1345 offset_agbno = agbno - chunk_agbno; 1352 offset_agbno = agbno - chunk_agbno;
1346 } 1353 }
1347 1354
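xfs_imap() only needs ir_startino out of the chunk record; the mapping
after the lookup reduces to (sketch, error handling elided):

	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
	if (!error)
		error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
	chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
	offset_agbno = agbno - chunk_agbno;	/* ino's block within the chunk */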
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init(
150 xfs_agnumber_t agno); /* allocation group number */ 150 xfs_agnumber_t agno); /* allocation group number */
151 151
152/* 152/*
153 * Lookup the first record greater than or equal to ino 153 * Lookup a record by ino in the btree given by cur.
154 * in the btree given by cur.
155 */ 154 */
156int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, 155int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
157 __int32_t fcnt, xfs_inofree_t free, int *stat); 156 xfs_lookup_t dir, int *stat);
158
159/*
160 * Lookup the first record less than or equal to ino
161 * in the btree given by cur.
162 */
163int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
164 __int32_t fcnt, xfs_inofree_t free, int *stat);
165 157
166/* 158/*
167 * Get the data from the pointed-to record. 159 * Get the data from the pointed-to record.
168 */ 160 */
169extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, 161extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
170 __int32_t *fcnt, xfs_inofree_t *free, int *stat); 162 xfs_inobt_rec_incore_t *rec, int *stat);
171 163
172#endif /* __XFS_IALLOC_H__ */ 164#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0; 83 ip->i_flags = 0;
84 ip->i_update_core = 0; 84 ip->i_update_core = 0;
85 ip->i_update_size = 0;
86 ip->i_delayed_blks = 0; 85 ip->i_delayed_blks = 0;
87 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 86 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
88 ip->i_size = 0; 87 ip->i_size = 0;
@@ -456,32 +455,6 @@ out_error_or_again:
456 return error; 455 return error;
457} 456}
458 457
459
460/*
461 * Look for the inode corresponding to the given ino in the hash table.
462 * If it is there and its i_transp pointer matches tp, return it.
463 * Otherwise, return NULL.
464 */
465xfs_inode_t *
466xfs_inode_incore(xfs_mount_t *mp,
467 xfs_ino_t ino,
468 xfs_trans_t *tp)
469{
470 xfs_inode_t *ip;
471 xfs_perag_t *pag;
472
473 pag = xfs_get_perag(mp, ino);
474 read_lock(&pag->pag_ici_lock);
475 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
476 read_unlock(&pag->pag_ici_lock);
477 xfs_put_perag(mp, pag);
478
479 /* the returned inode must match the transaction */
480 if (ip && (ip->i_transp != tp))
481 return NULL;
482 return ip;
483}
484
485/* 458/*
486 * Decrement reference count of an inode structure and unlock it. 459 * Decrement reference count of an inode structure and unlock it.
487 * 460 *
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da428b3fe0f5..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -651,7 +651,7 @@ xfs_iformat_btree(
651 return 0; 651 return 0;
652} 652}
653 653
654void 654STATIC void
655xfs_dinode_from_disk( 655xfs_dinode_from_disk(
656 xfs_icdinode_t *to, 656 xfs_icdinode_t *to,
657 xfs_dinode_t *from) 657 xfs_dinode_t *from)
@@ -1247,7 +1247,7 @@ xfs_isize_check(
1247 * In that case the pages will still be in memory, but the inode size 1247 * In that case the pages will still be in memory, but the inode size
1248 * will never have been updated. 1248 * will never have been updated.
1249 */ 1249 */
1250xfs_fsize_t 1250STATIC xfs_fsize_t
1251xfs_file_last_byte( 1251xfs_file_last_byte(
1252 xfs_inode_t *ip) 1252 xfs_inode_t *ip)
1253{ 1253{
@@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct(
3837/* 3837/*
3838 * Resize an extent indirection array to new_size bytes. 3838 * Resize an extent indirection array to new_size bytes.
3839 */ 3839 */
3840void 3840STATIC void
3841xfs_iext_realloc_indirect( 3841xfs_iext_realloc_indirect(
3842 xfs_ifork_t *ifp, /* inode fork pointer */ 3842 xfs_ifork_t *ifp, /* inode fork pointer */
3843 int new_size) /* new indirection array size */ 3843 int new_size) /* new indirection array size */
@@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect(
3862/* 3862/*
3863 * Switch from indirection array to linear (direct) extent allocations. 3863 * Switch from indirection array to linear (direct) extent allocations.
3864 */ 3864 */
3865void 3865STATIC void
3866xfs_iext_indirect_to_direct( 3866xfs_iext_indirect_to_direct(
3867 xfs_ifork_t *ifp) /* inode fork pointer */ 3867 xfs_ifork_t *ifp) /* inode fork pointer */
3868{ 3868{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
261 /* Miscellaneous state. */ 261 /* Miscellaneous state. */
262 unsigned short i_flags; /* see defined flags below */ 262 unsigned short i_flags; /* see defined flags below */
263 unsigned char i_update_core; /* timestamps/size is dirty */ 263 unsigned char i_update_core; /* timestamps/size is dirty */
264 unsigned char i_update_size; /* di_size field is dirty */
265 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
266 265
267 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -468,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
468/* 467/*
469 * xfs_iget.c prototypes. 468 * xfs_iget.c prototypes.
470 */ 469 */
471xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
472 struct xfs_trans *);
473int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 470int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
474 uint, uint, xfs_inode_t **, xfs_daddr_t); 471 uint, uint, xfs_inode_t **, xfs_daddr_t);
475void xfs_iput(xfs_inode_t *, uint); 472void xfs_iput(xfs_inode_t *, uint);
@@ -504,7 +501,6 @@ void xfs_ipin(xfs_inode_t *);
504void xfs_iunpin(xfs_inode_t *); 501void xfs_iunpin(xfs_inode_t *);
505int xfs_iflush(xfs_inode_t *, uint); 502int xfs_iflush(xfs_inode_t *, uint);
506void xfs_ichgtime(xfs_inode_t *, int); 503void xfs_ichgtime(xfs_inode_t *, int);
507xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
508void xfs_lock_inodes(xfs_inode_t **, int, uint); 504void xfs_lock_inodes(xfs_inode_t **, int, uint);
509void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 505void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
510 506
@@ -572,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
572 struct xfs_buf **, uint); 568 struct xfs_buf **, uint);
573int xfs_iread(struct xfs_mount *, struct xfs_trans *, 569int xfs_iread(struct xfs_mount *, struct xfs_trans *,
574 struct xfs_inode *, xfs_daddr_t, uint); 570 struct xfs_inode *, xfs_daddr_t, uint);
575void xfs_dinode_from_disk(struct xfs_icdinode *,
576 struct xfs_dinode *);
577void xfs_dinode_to_disk(struct xfs_dinode *, 571void xfs_dinode_to_disk(struct xfs_dinode *,
578 struct xfs_icdinode *); 572 struct xfs_icdinode *);
579void xfs_idestroy_fork(struct xfs_inode *, int); 573void xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
592void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); 586void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
593void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); 587void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
594void xfs_iext_realloc_direct(xfs_ifork_t *, int); 588void xfs_iext_realloc_direct(xfs_ifork_t *, int);
595void xfs_iext_realloc_indirect(xfs_ifork_t *, int);
596void xfs_iext_indirect_to_direct(xfs_ifork_t *);
597void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); 589void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
598void xfs_iext_inline_to_direct(xfs_ifork_t *, int); 590void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
599void xfs_iext_destroy(xfs_ifork_t *); 591void xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -263,14 +263,6 @@ xfs_inode_item_format(
263 } 263 }
264 264
265 /* 265 /*
266 * We don't have to worry about re-ordering here because
267 * the update_size field is protected by the inode lock
268 * and we have that held in exclusive mode.
269 */
270 if (ip->i_update_size)
271 ip->i_update_size = 0;
272
273 /*
274 * Make sure to get the latest atime from the Linux inode. 266 * Make sure to get the latest atime from the Linux inode.
275 */ 267 */
276 xfs_synchronize_atime(ip); 268 xfs_synchronize_atime(ip);
@@ -712,8 +704,6 @@ xfs_inode_item_unlock(
712 * Clear out the fields of the inode log item particular 704 * Clear out the fields of the inode log item particular
713 * to the current transaction. 705 * to the current transaction.
714 */ 706 */
715 iip->ili_ilock_recur = 0;
716 iip->ili_iolock_recur = 0;
717 iip->ili_flags = 0; 707 iip->ili_flags = 0;
718 708
719 /* 709 /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
137 struct xfs_inode *ili_inode; /* inode ptr */ 137 struct xfs_inode *ili_inode; /* inode ptr */
138 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ 138 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
139 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ 139 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
140 unsigned short ili_ilock_recur; /* lock recursion count */
141 unsigned short ili_iolock_recur; /* lock recursion count */
142 unsigned short ili_flags; /* misc flags */ 140 unsigned short ili_flags; /* misc flags */
143 unsigned short ili_logged; /* flushed logged data */ 141 unsigned short ili_logged; /* flushed logged data */
144 unsigned int ili_last_fields; /* fields when flushed */ 142 unsigned int ili_last_fields; /* fields when flushed */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
72 72
73#if XFS_BIG_INUMS 73#if XFS_BIG_INUMS
74#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) 74#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
75#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
76#else 75#else
77#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) 76#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
78#endif 77#endif
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_btree.h" 40#include "xfs_btree.h"
41 41
42int 42STATIC int
43xfs_internal_inum( 43xfs_internal_inum(
44 xfs_mount_t *mp, 44 xfs_mount_t *mp,
45 xfs_ino_t ino) 45 xfs_ino_t ino)
@@ -353,9 +353,6 @@ xfs_bulkstat(
353 int end_of_ag; /* set if we've seen the ag end */ 353 int end_of_ag; /* set if we've seen the ag end */
354 int error; /* error code */ 354 int error; /* error code */
355 int fmterror;/* bulkstat formatter result */ 355 int fmterror;/* bulkstat formatter result */
356 __int32_t gcnt; /* current btree rec's count */
357 xfs_inofree_t gfree; /* current btree rec's free mask */
358 xfs_agino_t gino; /* current btree rec's start inode */
359 int i; /* loop index */ 356 int i; /* loop index */
360 int icount; /* count of inodes good in irbuf */ 357 int icount; /* count of inodes good in irbuf */
361 size_t irbsize; /* size of irec buffer in bytes */ 358 size_t irbsize; /* size of irec buffer in bytes */
@@ -442,40 +439,43 @@ xfs_bulkstat(
442 * we need to get the remainder of the chunk we're in. 439 * we need to get the remainder of the chunk we're in.
443 */ 440 */
444 if (agino > 0) { 441 if (agino > 0) {
442 xfs_inobt_rec_incore_t r;
443
445 /* 444 /*
446 * Lookup the inode chunk that this inode lives in. 445 * Lookup the inode chunk that this inode lives in.
447 */ 446 */
448 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); 447 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
448 &tmp);
449 if (!error && /* no I/O error */ 449 if (!error && /* no I/O error */
450 tmp && /* lookup succeeded */ 450 tmp && /* lookup succeeded */
451 /* got the record, should always work */ 451 /* got the record, should always work */
452 !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, 452 !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
453 &gfree, &i)) &&
454 i == 1 && 453 i == 1 &&
455 /* this is the right chunk */ 454 /* this is the right chunk */
456 agino < gino + XFS_INODES_PER_CHUNK && 455 agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
457 /* lastino was not last in chunk */ 456 /* lastino was not last in chunk */
458 (chunkidx = agino - gino + 1) < 457 (chunkidx = agino - r.ir_startino + 1) <
459 XFS_INODES_PER_CHUNK && 458 XFS_INODES_PER_CHUNK &&
460 /* there are some left allocated */ 459 /* there are some left allocated */
461 xfs_inobt_maskn(chunkidx, 460 xfs_inobt_maskn(chunkidx,
462 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { 461 XFS_INODES_PER_CHUNK - chunkidx) &
462 ~r.ir_free) {
463 /* 463 /*
464 * Grab the chunk record. Mark all the 464 * Grab the chunk record. Mark all the
465 * uninteresting inodes (because they're 465 * uninteresting inodes (because they're
466 * before our start point) free. 466 * before our start point) free.
467 */ 467 */
468 for (i = 0; i < chunkidx; i++) { 468 for (i = 0; i < chunkidx; i++) {
469 if (XFS_INOBT_MASK(i) & ~gfree) 469 if (XFS_INOBT_MASK(i) & ~r.ir_free)
470 gcnt++; 470 r.ir_freecount++;
471 } 471 }
472 gfree |= xfs_inobt_maskn(0, chunkidx); 472 r.ir_free |= xfs_inobt_maskn(0, chunkidx);
473 irbp->ir_startino = gino; 473 irbp->ir_startino = r.ir_startino;
474 irbp->ir_freecount = gcnt; 474 irbp->ir_freecount = r.ir_freecount;
475 irbp->ir_free = gfree; 475 irbp->ir_free = r.ir_free;
476 irbp++; 476 irbp++;
477 agino = gino + XFS_INODES_PER_CHUNK; 477 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
478 icount = XFS_INODES_PER_CHUNK - gcnt; 478 icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
479 } else { 479 } else {
480 /* 480 /*
481 * If any of those tests failed, bump the 481 * If any of those tests failed, bump the
@@ -493,7 +493,7 @@ xfs_bulkstat(
493 /* 493 /*
494 * Start of ag. Lookup the first inode chunk. 494 * Start of ag. Lookup the first inode chunk.
495 */ 495 */
496 error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); 496 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
497 icount = 0; 497 icount = 0;
498 } 498 }
499 /* 499 /*
@@ -501,6 +501,8 @@ xfs_bulkstat(
501 * until we run out of inodes or space in the buffer. 501 * until we run out of inodes or space in the buffer.
502 */ 502 */
503 while (irbp < irbufend && icount < ubcount) { 503 while (irbp < irbufend && icount < ubcount) {
504 xfs_inobt_rec_incore_t r;
505
504 /* 506 /*
505 * Loop as long as we're unable to read the 507 * Loop as long as we're unable to read the
506 * inode btree. 508 * inode btree.
@@ -510,51 +512,55 @@ xfs_bulkstat(
510 if (XFS_AGINO_TO_AGBNO(mp, agino) >= 512 if (XFS_AGINO_TO_AGBNO(mp, agino) >=
511 be32_to_cpu(agi->agi_length)) 513 be32_to_cpu(agi->agi_length))
512 break; 514 break;
513 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, 515 error = xfs_inobt_lookup(cur, agino,
514 &tmp); 516 XFS_LOOKUP_GE, &tmp);
515 cond_resched(); 517 cond_resched();
516 } 518 }
517 /* 519 /*
518 * If we ran off the end of the AG, either with an error 520 * If we ran off the end of the AG, either with an error
519 * or in the normal way, set end and stop collecting. 521 * or in the normal way, set end and stop collecting.
520 */ 522 */
521 if (error || 523 if (error) {
522 (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
523 &gfree, &i)) ||
524 i == 0) {
525 end_of_ag = 1; 524 end_of_ag = 1;
526 break; 525 break;
527 } 526 }
527
528 error = xfs_inobt_get_rec(cur, &r, &i);
529 if (error || i == 0) {
530 end_of_ag = 1;
531 break;
532 }
533
528 /* 534 /*
529 * If this chunk has any allocated inodes, save it. 535 * If this chunk has any allocated inodes, save it.
530 * Also start read-ahead now for this chunk. 536 * Also start read-ahead now for this chunk.
531 */ 537 */
532 if (gcnt < XFS_INODES_PER_CHUNK) { 538 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
533 /* 539 /*
534 * Loop over all clusters in the next chunk. 540 * Loop over all clusters in the next chunk.
535 * Do a readahead if there are any allocated 541 * Do a readahead if there are any allocated
536 * inodes in that cluster. 542 * inodes in that cluster.
537 */ 543 */
538 for (agbno = XFS_AGINO_TO_AGBNO(mp, gino), 544 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
539 chunkidx = 0; 545 for (chunkidx = 0;
540 chunkidx < XFS_INODES_PER_CHUNK; 546 chunkidx < XFS_INODES_PER_CHUNK;
541 chunkidx += nicluster, 547 chunkidx += nicluster,
542 agbno += nbcluster) { 548 agbno += nbcluster) {
543 if (xfs_inobt_maskn(chunkidx, 549 if (xfs_inobt_maskn(chunkidx, nicluster)
544 nicluster) & ~gfree) 550 & ~r.ir_free)
545 xfs_btree_reada_bufs(mp, agno, 551 xfs_btree_reada_bufs(mp, agno,
546 agbno, nbcluster); 552 agbno, nbcluster);
547 } 553 }
548 irbp->ir_startino = gino; 554 irbp->ir_startino = r.ir_startino;
549 irbp->ir_freecount = gcnt; 555 irbp->ir_freecount = r.ir_freecount;
550 irbp->ir_free = gfree; 556 irbp->ir_free = r.ir_free;
551 irbp++; 557 irbp++;
552 icount += XFS_INODES_PER_CHUNK - gcnt; 558 icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
553 } 559 }
554 /* 560 /*
555 * Set agino to after this chunk and bump the cursor. 561 * Set agino to after this chunk and bump the cursor.
556 */ 562 */
557 agino = gino + XFS_INODES_PER_CHUNK; 563 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
558 error = xfs_btree_increment(cur, 0, &tmp); 564 error = xfs_btree_increment(cur, 0, &tmp);
559 cond_resched(); 565 cond_resched();
560 } 566 }
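Stripped of readahead and buffer bookkeeping, the collection loop above is
a plain ascending scan of the inobt; a sketch, assuming irbp points at
incore records as in xfs_bulkstat():

	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, &tmp);
	while (!error) {
		error = xfs_inobt_get_rec(cur, &r, &i);
		if (error || i == 0)
			break;			/* off the end of the AG */
		if (r.ir_freecount < XFS_INODES_PER_CHUNK)
			*irbp++ = r;		/* chunk has allocated inodes */
		agino = r.ir_startino + XFS_INODES_PER_CHUNK;
		error = xfs_btree_increment(cur, 0, &i);
	}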
@@ -820,9 +826,7 @@ xfs_inumbers(
820 int bufidx; 826 int bufidx;
821 xfs_btree_cur_t *cur; 827 xfs_btree_cur_t *cur;
822 int error; 828 int error;
823 __int32_t gcnt; 829 xfs_inobt_rec_incore_t r;
824 xfs_inofree_t gfree;
825 xfs_agino_t gino;
826 int i; 830 int i;
827 xfs_ino_t ino; 831 xfs_ino_t ino;
828 int left; 832 int left;
@@ -855,7 +859,8 @@ xfs_inumbers(
855 continue; 859 continue;
856 } 860 }
857 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); 861 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
858 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); 862 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
863 &tmp);
859 if (error) { 864 if (error) {
860 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 865 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
861 cur = NULL; 866 cur = NULL;
@@ -870,9 +875,8 @@ xfs_inumbers(
870 continue; 875 continue;
871 } 876 }
872 } 877 }
873 if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree, 878 error = xfs_inobt_get_rec(cur, &r, &i);
874 &i)) || 879 if (error || i == 0) {
875 i == 0) {
876 xfs_buf_relse(agbp); 880 xfs_buf_relse(agbp);
877 agbp = NULL; 881 agbp = NULL;
878 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 882 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +885,12 @@ xfs_inumbers(
881 agino = 0; 885 agino = 0;
882 continue; 886 continue;
883 } 887 }
884 agino = gino + XFS_INODES_PER_CHUNK - 1; 888 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
885 buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino); 889 buffer[bufidx].xi_startino =
886 buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt; 890 XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
887 buffer[bufidx].xi_allocmask = ~gfree; 891 buffer[bufidx].xi_alloccount =
892 XFS_INODES_PER_CHUNK - r.ir_freecount;
893 buffer[bufidx].xi_allocmask = ~r.ir_free;
888 bufidx++; 894 bufidx++;
889 left--; 895 left--;
890 if (bufidx == bcount) { 896 if (bufidx == bcount) {
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
99 void *dibuff, 99 void *dibuff,
100 int *stat); 100 int *stat);
101 101
102int
103xfs_internal_inum(
104 xfs_mount_t *mp,
105 xfs_ino_t ino);
106
107typedef int (*inumbers_fmt_pf)( 102typedef int (*inumbers_fmt_pf)(
108 void __user *ubuffer, /* buffer to write to */ 103 void __user *ubuffer, /* buffer to write to */
109 const xfs_inogrp_t *buffer, /* buffer to read from */ 104 const xfs_inogrp_t *buffer, /* buffer to read from */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log,
451extern int xlog_recover(xlog_t *log); 451extern int xlog_recover(xlog_t *log);
452extern int xlog_recover_finish(xlog_t *log); 452extern int xlog_recover_finish(xlog_t *log);
453extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 453extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
454extern void xlog_recover_process_iunlinks(xlog_t *log);
455
456extern struct xfs_buf *xlog_get_bp(xlog_t *, int); 454extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
457extern void xlog_put_bp(struct xfs_buf *); 455extern void xlog_put_bp(struct xfs_buf *);
458 456
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
3263 * freeing of the inode and its removal from the list must be 3263 * freeing of the inode and its removal from the list must be
3264 * atomic. 3264 * atomic.
3265 */ 3265 */
3266void 3266STATIC void
3267xlog_recover_process_iunlinks( 3267xlog_recover_process_iunlinks(
3268 xlog_t *log) 3268 xlog_t *log)
3269{ 3269{
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1568 * 1568 *
1569 * The m_sb_lock must be held when this routine is called. 1569 * The m_sb_lock must be held when this routine is called.
1570 */ 1570 */
1571int 1571STATIC int
1572xfs_mod_incore_sb_unlocked( 1572xfs_mod_incore_sb_unlocked(
1573 xfs_mount_t *mp, 1573 xfs_mount_t *mp,
1574 xfs_sb_field_t field, 1574 xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
414 414
415extern int xfs_log_sbcount(xfs_mount_t *, uint); 415extern int xfs_log_sbcount(xfs_mount_t *, uint);
416extern int xfs_mountfs(xfs_mount_t *mp); 416extern int xfs_mountfs(xfs_mount_t *mp);
417extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
418 417
419extern void xfs_unmountfs(xfs_mount_t *); 418extern void xfs_unmountfs(xfs_mount_t *);
420extern int xfs_unmountfs_writesb(xfs_mount_t *); 419extern int xfs_unmountfs_writesb(xfs_mount_t *);
421extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 420extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
422extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
423 int64_t, int);
424extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 421extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
425 uint, int); 422 uint, int);
426extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); 423extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup(
 }
 
 /*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek().  If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
-	xfs_mru_cache_t	*mru,
-	unsigned long	key)
-{
-	xfs_mru_cache_elem_t *elem;
-
-	ASSERT(mru && mru->lists);
-	if (!mru || !mru->lists)
-		return NULL;
-
-	spin_lock(&mru->lock);
-	elem = radix_tree_lookup(&mru->store, key);
-	if (!elem)
-		spin_unlock(&mru->lock);
-	else
-		__release(mru_lock); /* help sparse not be stupid */
-
-	return elem ? elem->value : NULL;
-}
-
-/*
  * To release the internal data structure spinlock after having performed an
  * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
  * with the data store pointer.
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
 void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
 void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
 void xfs_mru_cache_done(struct xfs_mru_cache *mru);
 
 #endif /* __XFS_MRU_CACHE_H__ */
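
Note: with the peek() variant gone, the surviving pair keeps the contract the xfs_mru_cache.c comment above describes: xfs_mru_cache_lookup() returns with the cache's internal spinlock held on a hit, and the caller must drop it with xfs_mru_cache_done(). A hypothetical caller, just to show the shape (example_use_cached() is not a real XFS function):

STATIC int
example_use_cached(
	struct xfs_mru_cache	*mru,
	unsigned long		key)
{
	void	*item;

	item = xfs_mru_cache_lookup(mru, key);	/* lock held if non-NULL */
	if (!item)
		return ENOENT;
	/* ... use item; the held lock keeps it from being reaped ... */
	xfs_mru_cache_done(mru);	/* releases the internal spinlock */
	return 0;
}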
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
 }
 
 /*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	int		error = 0;
-
-	/*
-	 * If we're treating this as O_DSYNC and we have not updated the
-	 * size, force the log.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-	    !(ip->i_update_size)) {
-		xfs_inode_log_item_t	*iip = ip->i_itemp;
-
-		/*
-		 * If an allocation transaction occurred
-		 * without extending the size, then we have to force
-		 * the log up the proper point to ensure that the
-		 * allocation is permanent.  We can't count on
-		 * the fact that buffered writes lock out direct I/O
-		 * writes - the direct I/O write could have extended
-		 * the size nontransactionally, then finished before
-		 * we started.  xfs_write_file will think that the file
-		 * didn't grow but the update isn't safe unless the
-		 * size change is logged.
-		 *
-		 * Force the log if we've committed a transaction
-		 * against the inode or if someone else has and
-		 * the commit record hasn't gone to disk (e.g.
-		 * the inode is pinned).  This guarantees that
-		 * all changes affecting the inode are permanent
-		 * when we return.
-		 */
-		if (iip && iip->ili_last_lsn) {
-			error = _xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		} else if (xfs_ipincount(ip) > 0) {
-			error = _xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		}
-
-	} else {
-		xfs_trans_t	*tp;
-
-		/*
-		 * O_SYNC or O_DSYNC _with_ a size update are handled
-		 * the same way.
-		 *
-		 * If the write was synchronous then we need to make
-		 * sure that the inode modification time is permanent.
-		 * We'll have updated the timestamp above, so here
-		 * we use a synchronous transaction to log the inode.
-		 * It's not fast, but it's necessary.
-		 *
-		 * If this a dsync write and the size got changed
-		 * non-transactionally, then we need to ensure that
-		 * the size change gets logged in a synchronous
-		 * transaction.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-		if ((error = xfs_trans_reserve(tp, 0,
-					      XFS_SWRITE_LOG_RES(mp),
-					      0, 0, 0))) {
-			/* Transaction reserve failed */
-			xfs_trans_cancel(tp, 0);
-		} else {
-			/* Transaction reserve successful */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-			xfs_trans_set_sync(tp);
-			error = xfs_trans_commit(tp, 0);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-	}
-
-	return error;
-}
-
-/*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
  * the shop, make sure that absolutely nothing persistent happens to
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 			xfs_buf_t *bp, xfs_daddr_t blkno);
 
-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
-			int flags);
-
 #endif /* __XFS_RW_H__ */
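
Note: the bulk of the xfs_rw.c removal above is xfs_write_sync_logforce(), whose O_DSYNC branch is the part worth remembering: force the log up to the inode's last commit LSN if this inode has committed a transaction, or force the whole log if the inode is pinned by someone else's in-flight commit. Restated compactly from the deleted code (force_inode_commits() is a made-up name, not a new helper):

STATIC int
force_inode_commits(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip)
{
	xfs_inode_log_item_t	*iip = ip->i_itemp;

	if (iip && iip->ili_last_lsn)	/* we committed against the inode */
		return _xfs_log_force(mp, iip->ili_last_lsn,
				XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
	if (xfs_ipincount(ip) > 0)	/* a commit record is still in flight */
		return _xfs_log_force(mp, (xfs_lsn_t)0,
				XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
	return 0;
}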
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFS	14
 #define XFS_TRANS_STRAT_WRITE	15
 #define XFS_TRANS_DIOSTRAT	16
-#define XFS_TRANS_WRITE_SYNC	17
+/* 17 was XFS_TRANS_WRITE_SYNC */
 #define XFS_TRANS_WRITEID	18
 #define XFS_TRANS_ADDAFORK	19
 #define XFS_TRANS_ATTRINVAL	20
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
 		return (flags & XFS_BUF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
 
-	if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+	if (XFS_BUF_GETERROR(bp) != 0) {
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
 				  bp, blkno);
 		error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
 		return error;
 	}
 #ifdef DEBUG
-	if (xfs_do_error && (bp != NULL)) {
+	if (xfs_do_error) {
 		if (xfs_error_target == target) {
 			if (((xfs_req_num++) % xfs_error_mod) == 0) {
 				xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
 
 
 /*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction. If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively. This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength. That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa. Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
  */
 int
 xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
 	xfs_inode_t	**ipp)
 {
 	int		error;
-	xfs_inode_t	*ip;
-
-	/*
-	 * If the transaction pointer is NULL, just call the normal
-	 * xfs_iget().
-	 */
-	if (tp == NULL)
-		return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
-	/*
-	 * If we find the inode in core with this transaction
-	 * pointer in its i_transp field, then we know we already
-	 * have it locked. In this case we just increment the lock
-	 * recursion count and return the inode to the caller.
-	 * Assert that the inode is already locked in the mode requested
-	 * by the caller. We cannot do lock promotions yet, so
-	 * die if someone gets this wrong.
-	 */
-	if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
-		/*
-		 * Make sure that the inode lock is held EXCL and
-		 * that the io lock is never upgraded when the inode
-		 * is already a part of the transaction.
-		 */
-		ASSERT(ip->i_itemp != NULL);
-		ASSERT(lock_flags & XFS_ILOCK_EXCL);
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
-		if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
-			ip->i_itemp->ili_iolock_recur++;
-		}
-		if (lock_flags & XFS_ILOCK_EXCL) {
-			ip->i_itemp->ili_ilock_recur++;
-		}
-		*ipp = ip;
-		return 0;
-	}
-
-	ASSERT(lock_flags & XFS_ILOCK_EXCL);
-	error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
-	if (error) {
-		return error;
-	}
-	ASSERT(ip != NULL);
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	*ipp = ip;
-	return 0;
+	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+	if (!error && tp)
+		xfs_trans_ijoin(tp, *ipp, lock_flags);
+	return error;
 }
 
 /*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
 	xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
 	ASSERT(iip->ili_flags == 0);
-	ASSERT(iip->ili_ilock_recur == 0);
-	ASSERT(iip->ili_iolock_recur == 0);
 
 	/*
 	 * Get a log_item_desc to point at the new item.
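
Note: the rewritten xfs_trans_iget() preserves the one behaviour the deleted comment used to spell out at length: a NULL transaction pointer degenerates to a plain xfs_iget(), so callers can handle both cases with a single call. A hypothetical caller (example_grab_inode() is illustrative only):

STATIC int
example_grab_inode(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,	/* may be NULL */
	xfs_ino_t	ino,
	xfs_inode_t	**ipp)
{
	/* joins *ipp to tp only when tp is non-NULL */
	return xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, ipp);
}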
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 492d75bae2bf..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
 	xfs_inode_t	*ip)
 {
 	xfs_trans_t	*tp;
-	int		error;
+	int		error = 0;
 	int		log_flushed = 0, changed = 1;
 
 	xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
 
-	/* capture size updates in I/O completion before writing the inode. */
-	error = xfs_wait_on_pages(ip, 0, -1);
-	if (error)
-		return XFS_ERROR(error);
-
 	/*
 	 * We always need to make sure that the required inode state is safe on
-	 * disk. The vnode might be clean but we still might need to force the
+	 * disk. The inode might be clean but we still might need to force the
 	 * log because of committed transactions that haven't hit the disk yet.
 	 * Likewise, there could be unflushed non-transactional changes to the
 	 * inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	if (!(ip->i_update_size || ip->i_update_core)) {
+	if (!ip->i_update_core) {
 		/*
 		 * Timestamps/size haven't changed since last inode flush or
 		 * inode transaction commit. That means either nothing got
@@ -718,7 +713,7 @@ xfs_fsync(
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
  */
-int
+STATIC int
 xfs_free_eofblocks(
 	xfs_mount_t	*mp,
 	xfs_inode_t	*ip,
@@ -1476,8 +1471,8 @@ xfs_create(
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
 		xfs_flush_inodes(dp);
-		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, resblks, log_res, 0,
+			XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error == ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
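
Note: the hunk above only swaps the hard-coded CREATE reservation constants for the log_res/log_count values computed earlier in the function (xfs_create() now also serves the mkdir path, and the two cases reserve differently), but the ladder it sits in is the interesting pattern: on ENOSPC, flush outstanding delalloc blocks and retry, then fall back to a reservation that allocates nothing. Sketched from the surrounding function, with setup elided:

	error = xfs_trans_reserve(tp, resblks, log_res, 0,
			XFS_TRANS_PERM_LOG_RES, log_count);
	if (error == ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(dp);
		error = xfs_trans_reserve(tp, resblks, log_res, 0,
				XFS_TRANS_PERM_LOG_RES, log_count);
	}
	if (error == ENOSPC) {
		/* No space at all so try a "no-allocation" reservation */
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, log_res, 0,
				XFS_TRANS_PERM_LOG_RES, log_count);
	}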